From 20bb9c984c6a261ac4fb225407579f4ec901a44b Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Thu, 15 Aug 2024 13:22:13 -0500 Subject: [PATCH 01/20] Rename key tuning functions --- tuning/autotune.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tuning/autotune.py b/tuning/autotune.py index 4b3f7a6..4a1c1a5 100755 --- a/tuning/autotune.py +++ b/tuning/autotune.py @@ -777,7 +777,7 @@ def collision_handler(index_hash_list: list[tuple[int, str]]) -> tuple[bool, lis return collision_detected, unique_indexes -def compile_candidates( +def compile_dispatches( args: argparse.Namespace, path_config: PathConfig, candidates: list[int], @@ -905,7 +905,7 @@ def generate_dryrun_dispatch_benchmark_results( return task_results -def benchmark_compiled_candidates( +def benchmark_dispatches( args: argparse.Namespace, path_config: PathConfig, compiled_candidates: list[int], @@ -981,7 +981,7 @@ def benchmark_compiled_candidates( return top_candidates -def compile_unet_candidates( +def compile_models( args: argparse.Namespace, path_config: PathConfig, candidates: list[int], @@ -1205,7 +1205,7 @@ def dryrun_benchmark_unet( ) -def benchmark_unet( +def benchmark_model( args: argparse.Namespace, path_config: PathConfig, unet_candidates: list[int], @@ -1338,7 +1338,7 @@ def autotune(args: argparse.Namespace) -> None: return print("Compiling candidates...") - compiled_candidates = compile_candidates( + compiled_candidates = compile_dispatches( args, path_config, candidates, candidate_trackers ) print(f"Compiled files are stored in {path_config.compiled_dir}\n") @@ -1346,7 +1346,7 @@ def autotune(args: argparse.Namespace) -> None: return print("Benchmarking compiled candidates...") - top_candidates = benchmark_compiled_candidates( + top_candidates = benchmark_dispatches( args, path_config, compiled_candidates, candidate_trackers ) print(f"Stored results in {path_config.output_unilog}\n") @@ -1355,7 +1355,7 @@ def autotune(args: argparse.Namespace) -> None: return print(f"Compiling top unet candidates...") - unet_candidates = compile_unet_candidates( + unet_candidates = compile_models( args, path_config, top_candidates, candidate_trackers ) print(f"Unet candidates compiled in {path_config.base_dir}\n") @@ -1363,7 +1363,7 @@ def autotune(args: argparse.Namespace) -> None: return print("Benchmarking unet candidates...") - benchmark_unet(args, path_config, unet_candidates, candidate_trackers) + benchmark_model(args, path_config, unet_candidates, candidate_trackers) print(f"Stored results in {path_config.output_unilog}") if stop_after_phase == ExecutionPhases.benchmark_unet_candidates: return From 994aa4f3ed4e83f2c78a58031cc08735508ced6f Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Thu, 15 Aug 2024 16:07:39 -0500 Subject: [PATCH 02/20] Edit compile_dispatches() --- tuning/autotune.py | 45 +++++++++++++++++++-------- tuning/punet_autotune.py | 66 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 13 deletions(-) create mode 100644 tuning/punet_autotune.py diff --git a/tuning/autotune.py b/tuning/autotune.py index 4a1c1a5..4ee2b58 100755 --- a/tuning/autotune.py +++ b/tuning/autotune.py @@ -24,6 +24,8 @@ import pickle import iree.runtime as ireert import random +from abc import ABC, abstractmethod + """ Sample Usage: @@ -60,7 +62,7 @@ @dataclass -class CandidateTracker: +class CandidateTracker(ABC): candidate_id: int mlir_path: Optional[Path] = None mlir_config_path: Optional[Path] = None @@ -146,6 +148,25 @@ def get_candidate_spec_mlir_path(self, candidate_id: 
int) -> Path: def get_exe_format(self, path: Path) -> str: return f"./{path.as_posix()}" + + +@dataclass +class TuningClient(ABC): + @abstractmethod + def get_dispatch_compile_command(self, candidate_tracker: CandidateTracker) -> list[str]: + pass + + @abstractmethod + def get_dispatch_benchmark_command(self, candidate_tracker) -> list[str]: + pass + + @abstractmethod + def get_model_compile_command(self, candidate_tracker) -> list[str]: + pass + + @abstractmethod + def get_model_benchmark_command(self, candidate_tracker) -> list[str]: + pass @dataclass @@ -756,6 +777,8 @@ def generate_candidates( condition=(len(candidates) == 0), msg="Failed to generate any candidates" ) + logging.critical(f"Generated [{len(candidates)}] candidates") + return candidates @@ -782,22 +805,17 @@ def compile_dispatches( path_config: PathConfig, candidates: list[int], candidate_trackers: list[CandidateTracker], + tuning_client: TuningClient ) -> list[int]: """Compile candidate files for tuning and record in candidate_vmfbs.txt. Returns the list of compiled candidate indexes.""" logging.info("compile_candidates()") - task_list = [] - for candidate_index in candidates: - mlir_path = candidate_trackers[candidate_index].mlir_path - assert mlir_path is not None - command = [ - path_config.get_exe_format(path_config.compile_candidate_sh), - args.mode, - mlir_path.as_posix(), - ] - task_list.append(TaskTuple(args, command, check=False)) + if not candidates: + logging.info("No candidates to compile.") + return [] - num_worker = max(min(args.max_cpu_workers, len(task_list)), 1) # at least 1 worker + task_list = [TaskTuple(args, tuning_client.get_dispatch_compile_command(candidate_trackers[i]), check=False) for i in candidates] + num_worker = min(args.max_cpu_workers, len(task_list)) multiprocess_progress_wrapper( num_worker=num_worker, task_list=task_list, function=run_command_wrapper ) @@ -1321,6 +1339,7 @@ def autotune(args: argparse.Namespace) -> None: path_config.output_unilog.touch() candidate_trackers: list[CandidateTracker] = [] + tuning_client = TuningClient() stop_after_phase: str = args.stop_after print("Setup logging") @@ -1339,7 +1358,7 @@ def autotune(args: argparse.Namespace) -> None: print("Compiling candidates...") compiled_candidates = compile_dispatches( - args, path_config, candidates, candidate_trackers + args, path_config, candidates, candidate_trackers, tuning_client ) print(f"Compiled files are stored in {path_config.compiled_dir}\n") if stop_after_phase == ExecutionPhases.compile_candidates: diff --git a/tuning/punet_autotune.py b/tuning/punet_autotune.py new file mode 100644 index 0000000..8b9ae0d --- /dev/null +++ b/tuning/punet_autotune.py @@ -0,0 +1,66 @@ +import autotune +import subprocess +from dataclasses import dataclass +from pathlib import Path + + +@dataclass +class PunetClient(autotune.TuningClient): + def get_dispatch_compile_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: + mlir_path = candidate_tracker.mlir_path + assert mlir_path is not None + command = [ + "./compile_candidate.sh", + "winograd", + mlir_path.as_posix(), + ] + return command + + def get_dispatch_benchmark_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: + # Implement as needed + pass + + def get_model_compile_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: + # Implement as needed + pass + + def get_model_benchmark_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: + # Implement as needed + pass + +# command = 
TuningClient.get_candidate_compile_command(candidate_trackers[index]) + + +def main(): + args = autotune.parse_arguments() + path_config = autotune.PathConfig() + path_config.base_dir.mkdir(parents=True, exist_ok=True) + path_config.output_unilog.touch() + candidate_trackers: list[autotune.CandidateTracker] = [] + punet_candidates = PunetClient() + + # path_config.compile_model_sh = Path("./compile_unet_candidate.sh") + # path_config.benchmark_model_sh = Path("./benchmark_unet_candidate.sh") + # path_config.model_baseline_vmfb = Path("./unet_baseline.vmfb") + + autotune.setup_logging(args, path_config) + + candidates = autotune.generate_candidates(args, path_config, candidate_trackers) + + + compiled_candidates = autotune.compile_dispatches( + args, path_config, candidates, candidate_trackers, punet_candidates + ) + exit() + top_candidates = autotune.benchmark_dispatches( + args, path_config, compiled_candidates, candidate_trackers + ) + + punet_candidates = autotune.compile_models( + args, path_config, top_candidates, candidate_trackers + ) + + autotune.benchmark_models(args, path_config, punet_candidates, candidate_trackers) + +if __name__ == "__main__": + main() From 897ff8858d0aef6b3fe2f24072e48ec3fe5e7179 Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Fri, 16 Aug 2024 10:22:43 -0500 Subject: [PATCH 03/20] Edit CandidateTracker updates in compile_dispatches() --- tuning/autotune.py | 11 +++++++---- tuning/punet_autotune.py | 11 +++++------ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/tuning/autotune.py b/tuning/autotune.py index 4ee2b58..0b8bda9 100755 --- a/tuning/autotune.py +++ b/tuning/autotune.py @@ -100,7 +100,7 @@ class PathConfig: candidates_dir: Path = field(init=False) candidate_configs_pkl: Path = field(init=False) compiled_dir: Path = field(init=False) - compilefailed_dir: Path = field(init=False) + compile_failed_dir: Path = field(init=False) output_unilog: Path = field(init=False) result_summary_log: Path = field(init=False) @@ -168,6 +168,9 @@ def get_model_compile_command(self, candidate_tracker) -> list[str]: def get_model_benchmark_command(self, candidate_tracker) -> list[str]: pass + def get_compiled_file_index(self, file_name: Path) -> int: + pass + @dataclass class TaskTuple: @@ -824,7 +827,7 @@ def compile_dispatches( path_config.compiled_dir.glob("*.vmfb"), key=numerical_sort_key ) failed_files = sorted( - path_config.compilefailed_dir.glob("*.mlir"), key=numerical_sort_key + path_config.compile_failed_dir.glob("*.mlir"), key=numerical_sort_key ) total, good, bad = len(task_list), len(compiled_files), len(failed_files) @@ -835,12 +838,12 @@ def compile_dispatches( # Update candidate tracker for failed_file in failed_files: - index = int(failed_file.stem) + index = tuning_client.get_compiled_file_index(failed_file) candidate_trackers[index].compilation_successful = False compiled_candidates = [] compiled_candidates_hash_list = [] for compiled_file in compiled_files: - index = int(compiled_file.stem) + index = tuning_client.get_compiled_file_index(failed_file) compiled_candidates.append(index) candidate_trackers[index].compilation_successful = True candidate_trackers[index].compiled_vmfb_path = compiled_file diff --git a/tuning/punet_autotune.py b/tuning/punet_autotune.py index 8b9ae0d..2452245 100644 --- a/tuning/punet_autotune.py +++ b/tuning/punet_autotune.py @@ -6,6 +6,7 @@ @dataclass class PunetClient(autotune.TuningClient): + def get_dispatch_compile_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: mlir_path = 
candidate_tracker.mlir_path assert mlir_path is not None @@ -17,18 +18,16 @@ def get_dispatch_compile_command(self, candidate_tracker: autotune.CandidateTrac return command def get_dispatch_benchmark_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: - # Implement as needed pass def get_model_compile_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: - # Implement as needed pass def get_model_benchmark_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: - # Implement as needed pass - -# command = TuningClient.get_candidate_compile_command(candidate_trackers[index]) + + def get_compiled_file_index(self, file_name: Path) -> int: + return int(file_name.stem) def main(): @@ -47,10 +46,10 @@ def main(): candidates = autotune.generate_candidates(args, path_config, candidate_trackers) - compiled_candidates = autotune.compile_dispatches( args, path_config, candidates, candidate_trackers, punet_candidates ) + exit() top_candidates = autotune.benchmark_dispatches( args, path_config, compiled_candidates, candidate_trackers From 81100112babbb6a419e522d8cd35b82cc5c1421b Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Fri, 16 Aug 2024 13:00:40 -0500 Subject: [PATCH 04/20] Edit compile and benchmark model function --- tuning/autotune.py | 119 ++++++++++++++++------------------- tuning/benchmark_dispatch.sh | 7 ++- tuning/punet_autotune.py | 66 ++++++++++++++----- 3 files changed, 108 insertions(+), 84 deletions(-) diff --git a/tuning/autotune.py b/tuning/autotune.py index 0b8bda9..49e2787 100755 --- a/tuning/autotune.py +++ b/tuning/autotune.py @@ -81,16 +81,17 @@ class CandidateTracker(ABC): calibrated_benchmark_diff: Optional[float] = None -@dataclass(frozen=True) +# @dataclass(frozen=True) +@dataclass class PathConfig: # Preset constants global_config_prolog_mlir: Path = Path("./config_prolog.mlir") global_config_epilog_mlir: Path = Path("./config_epilog.mlir") - compile_candidate_sh: Path = Path("./compile_candidate.sh") - benchmark_dispatch_sh: Path = Path("./benchmark_dispatch.sh") - compile_unet_candidate_sh: Path = Path("./compile_unet_candidate.sh") - benchmark_unet_candidate_sh: Path = Path("./benchmark_unet_candidate.sh") - unet_baseline_vmfb: Path = Path("./unet_baseline.vmfb") + # compile_candidate_sh: Path = Path("./compile_candidate.sh") + # benchmark_dispatch_sh: Path = Path("./benchmark_dispatch.sh") + # compile_unet_candidate_sh: Path = Path("./compile_unet_candidate.sh") + # benchmark_unet_candidate_sh: Path = Path("./benchmark_unet_candidate.sh") + model_baseline_vmfb: Path = Path("./unet_baseline.vmfb") # Dynamic paths base_dir: Path = field(init=False) @@ -101,6 +102,7 @@ class PathConfig: candidate_configs_pkl: Path = field(init=False) compiled_dir: Path = field(init=False) compile_failed_dir: Path = field(init=False) + spec_dir: Path = field(init=False) output_unilog: Path = field(init=False) result_summary_log: Path = field(init=False) @@ -123,7 +125,8 @@ def __post_init__(self): self, "candidate_configs_pkl", self.candidates_dir / "configs.pkl" ) object.__setattr__(self, "compiled_dir", self.candidates_dir / "compiled") - object.__setattr__(self, "compilefailed_dir", self.candidates_dir / "failed") + object.__setattr__(self, "compile_failed_dir", self.candidates_dir / "failed") + object.__setattr__(self, "spec_dir", self.candidates_dir / "specs") object.__setattr__(self, "output_unilog", self.base_dir / "output.log") object.__setattr__( self, "result_summary_log", self.base_dir / "result_summary.log" @@ -168,7 
+171,13 @@ def get_model_compile_command(self, candidate_tracker) -> list[str]:
     def get_model_benchmark_command(self, candidate_tracker) -> list[str]:
         pass
 
-    def get_compiled_file_index(self, file_name: Path) -> int:
+    def get_compiled_dispatch_index(self, file_path: Path) -> int:
+        pass
+
+    def get_candidate_spec_filename(self, candidate_id: int) -> Path:
+        pass
+
+    def get_compiled_model_index(self, file_path: Path) -> int:
         pass
 
 
@@ -710,6 +719,7 @@ def generate_candidates(
     args: argparse.Namespace,
     path_config: PathConfig,
     candidate_trackers: list[CandidateTracker],
+    tuning_client: TuningClient
 ) -> list[int]:
     """Generate candidate files for tuning. Returns the list of candidate indexes"""
     logging.info("generate_candidates()")
@@ -838,12 +848,12 @@ def compile_dispatches(
 
     # Update candidate tracker
     for failed_file in failed_files:
-        index = tuning_client.get_compiled_file_index(failed_file)
+        index = tuning_client.get_compiled_dispatch_index(failed_file)
         candidate_trackers[index].compilation_successful = False
     compiled_candidates = []
     compiled_candidates_hash_list = []
     for compiled_file in compiled_files:
-        index = tuning_client.get_compiled_file_index(failed_file)
+        index = tuning_client.get_compiled_dispatch_index(compiled_file)
         compiled_candidates.append(index)
         candidate_trackers[index].compilation_successful = True
         candidate_trackers[index].compiled_vmfb_path = compiled_file
@@ -875,6 +885,7 @@ def parse_dispatch_benchmark_results(
     path_config: PathConfig,
     benchmark_results: list[TaskResult],
     candidate_trackers: list[CandidateTracker],
+    tuning_client: TuningClient
 ) -> tuple[list[ParsedDisptachBenchmarkResult], list[str]]:
     benchmark_result_configs = []
     dump_list = []
@@ -888,9 +899,7 @@ def parse_dispatch_benchmark_results(
         benchmark_time = res.get_benchmark_time()
         assert candidate_id is not None and benchmark_time is not None
         candidate_trackers[candidate_id].first_benchmark_time = benchmark_time
-        candidate_trackers[candidate_id].mlir_spec_path = (
-            path_config.get_candidate_spec_mlir_path(candidate_id)
-        )
+        candidate_trackers[candidate_id].mlir_spec_path = path_config.spec_dir / tuning_client.get_candidate_spec_filename(candidate_id)
         mlir_path = candidate_trackers[candidate_id].mlir_path
         mlir_spec_path = candidate_trackers[candidate_id].mlir_spec_path
         assert mlir_path is not None and mlir_spec_path is not None
@@ -931,6 +940,7 @@ def benchmark_dispatches(
     path_config: PathConfig,
     compiled_candidates: list[int],
     candidate_trackers: list[CandidateTracker],
+    tuning_client: TuningClient
 ):
     """Benchmark the candidate files and store the topN results in file (best.log)."""
     logging.info("benchmark_top_candidates()")
@@ -942,18 +952,7 @@ def benchmark_dispatches(
         )
     else:
         # Benchmarking dispatch candidates
-        task_list = []
-        for index in compiled_candidates:
-            compiled_vmfb_path = candidate_trackers[index].compiled_vmfb_path
-            assert compiled_vmfb_path is not None
-            command = [
-                path_config.get_exe_format(path_config.benchmark_dispatch_sh),
-                compiled_vmfb_path.as_posix(),
-            ]
-            task_list.append(
-                TaskTuple(args, command, check=False, command_need_device_id=True)
-            )
-
+        task_list = [TaskTuple(args, tuning_client.get_dispatch_benchmark_command(candidate_trackers[i]), check=False, command_need_device_id=True) for i in compiled_candidates]
         worker_context_queue = create_worker_context_queue(args.devices)
         benchmark_results = multiprocess_progress_wrapper(
             num_worker=len(args.devices),
@@ -967,7 +966,7 @@ def benchmark_dispatches(
         parsed_benchmark_results,
         dispatch_benchmark_dump_list,
     ) = 
parse_dispatch_benchmark_results( - path_config, benchmark_results, candidate_trackers + path_config, benchmark_results, candidate_trackers, tuning_client ) append_to_file( dispatch_benchmark_dump_list, @@ -1007,6 +1006,7 @@ def compile_models( path_config: PathConfig, candidates: list[int], candidate_trackers: list[CandidateTracker], + tuning_client: TuningClient ) -> list[int]: """Compile U-Net candidates stored in best.log. Return the list of U-Net candidate files.""" logging.info("compile_unet_candidates()") @@ -1014,20 +1014,15 @@ def compile_models( if args.dry_run: return candidates - task_list = [] - for index in candidates: - if index == 0: - continue - mlir_spec_path = candidate_trackers[index].mlir_spec_path - assert mlir_spec_path is not None - command = [ - path_config.get_exe_format(path_config.compile_unet_candidate_sh), - args.mode, - mlir_spec_path.as_posix(), - ] - task_list.append(TaskTuple(args, command)) - - num_worker = max(min(args.max_cpu_workers, len(task_list)), 1) # at least 1 worker + if not candidates: + logging.info("No model candidates to compile.") + return [] + + task_list = [ + TaskTuple(args, tuning_client.get_model_compile_command(candidate_trackers[i])) + for i in candidates if i != 0 + ] + num_worker = min(args.max_cpu_workers, len(task_list)) multiprocess_progress_wrapper( num_worker=num_worker, task_list=task_list, function=run_command_wrapper ) @@ -1040,7 +1035,7 @@ def compile_models( # Update candidate tracker for unet_candidate in unet_candidates_files: assert unet_candidate is not None - index = int(unet_candidate.stem.split("_")[-1]) + index = tuning_client.get_compiled_model_index(unet_candidate) candidate_trackers[index].unet_candidate_path = unet_candidate hash_val = calculate_md5(unet_candidate) candidate_trackers[index].unet_vmfb_hash = hash_val @@ -1131,7 +1126,7 @@ def parse_grouped_benchmark_results( unet_candidate_path = res.get_unet_candidate_path() if ( unet_candidate_path is not None - and str(path_config.unet_baseline_vmfb) in unet_candidate_path + and str(path_config.model_baseline_vmfb) in unet_candidate_path ): baseline_time = res.get_benchmark_time() if baseline_time is None: @@ -1171,7 +1166,7 @@ def parse_grouped_benchmark_results( # Store incomplete .vmfb file at the end of dump_list. 
for index, device_id in incomplete_list: index_to_path = lambda index: ( - f"{path_config.unet_baseline_vmfb.as_posix()}" + f"{path_config.model_baseline_vmfb.as_posix()}" if index == 0 else f"{candidate_trackers[index].unet_candidate_path}" ) @@ -1211,7 +1206,7 @@ def dryrun_benchmark_unet( candidate_trackers: list[CandidateTracker], ): - unet_vmfb_paths = [path_config.unet_baseline_vmfb] + [ + unet_vmfb_paths = [path_config.model_baseline_vmfb] + [ Path(f"unet_candidate_{index}.vmfb") for index in unet_candidates ] benchmark_results = generate_dryrun_unet_benchmark_results(unet_vmfb_paths) @@ -1231,6 +1226,7 @@ def benchmark_model( path_config: PathConfig, unet_candidates: list[int], candidate_trackers: list[CandidateTracker], + tuning_client: TuningClient ): """Benchmark U-Net candidate files and log the results.""" logging.info("benchmark_unet()") @@ -1241,24 +1237,17 @@ def benchmark_model( # Benchmarking unet candidates worker_context_queue = create_worker_context_queue(args.devices) - benchmark_task_list = [] - for index in unet_candidates: - unet_candidate_path = candidate_trackers[index].unet_candidate_path - assert unet_candidate_path is not None - command = [ - path_config.get_exe_format(path_config.benchmark_unet_candidate_sh), - unet_candidate_path.as_posix(), - ] - benchmark_task_list.append( - TaskTuple( - args, - command, - check=False, - command_need_device_id=True, - cooling_time=10, - result_need_device_id=True, - ) + benchmark_task_list = [ + TaskTuple( + args, + tuning_client.get_model_benchmark_command(candidate_trackers[i]), + check=False, + command_need_device_id=True, + cooling_time=10, + result_need_device_id=True, ) + for i in unet_candidates + ] benchmark_results = multiprocess_progress_wrapper( num_worker=len(args.devices), task_list=benchmark_task_list, @@ -1270,14 +1259,12 @@ def benchmark_model( grouped_benchmark_results = group_benchmark_results_by_device_id(benchmark_results) # Benchmarking baselines on each involved device + candidate_trackers[0].unet_candidate_path = path_config.model_baseline_vmfb worker_context_queue = create_worker_context_queue(args.devices) baseline_task_list = [ TaskTuple( args, - command=[ - path_config.get_exe_format(path_config.benchmark_unet_candidate_sh), - path_config.unet_baseline_vmfb.as_posix(), - ], + tuning_client.get_model_benchmark_command(candidate_trackers[0]), check=False, command_need_device_id=True, result_need_device_id=True, @@ -1354,7 +1341,7 @@ def autotune(args: argparse.Namespace) -> None: print("Validation successful!\n") print("Generating candidates...") - candidates = generate_candidates(args, path_config, candidate_trackers) + candidates = generate_candidates(args, path_config, candidate_trackers, tuning_client) print(f"Generated [{len(candidates)}] candidates in {path_config.candidates_dir}\n") if stop_after_phase == ExecutionPhases.generate_candidates: return diff --git a/tuning/benchmark_dispatch.sh b/tuning/benchmark_dispatch.sh index dd8f29c..73a2fbd 100755 --- a/tuning/benchmark_dispatch.sh +++ b/tuning/benchmark_dispatch.sh @@ -12,9 +12,12 @@ readonly NAME="$(basename "$INPUT" .mlir)" # printf "Benchmarking $(basename ${INPUT}) on ${DEVICE}\n" +# Replace invalid characters in DEVICE variable +SANITIZED_DEVICE=$(echo "${DEVICE}" | sed 's/[^a-zA-Z0-9._-]/_/g') + timeout 16s ./tools/iree-benchmark-module --device="${DEVICE}" --module="${INPUT}" \ --hip_use_streams=true --hip_allow_inline_execution=true \ - --batch_size=1000 --benchmark_repetitions=3 > "${DIR}/benchmark_log_${DEVICE}.out" 2>&1 || (mv 
"$INPUT" "${DIR}/benchmark_failed" && exit 0) + --batch_size=1000 --benchmark_repetitions=3 > "${DIR}/benchmark_log_${SANITIZED_DEVICE}.out" 2>&1 || (mv "$INPUT" "${DIR}/benchmark_failed" && exit 0) -MEAN_TIME="$(grep --text real_time_mean "${DIR}/benchmark_log_${DEVICE}.out" | awk '{print $2}')" +MEAN_TIME="$(grep --text real_time_mean "${DIR}/benchmark_log_${SANITIZED_DEVICE}.out" | awk '{print $2}')" printf "%s\tMean Time: %.1f\n" "$(basename "$INPUT" .vmfb)" "$MEAN_TIME" diff --git a/tuning/punet_autotune.py b/tuning/punet_autotune.py index 2452245..9767d95 100644 --- a/tuning/punet_autotune.py +++ b/tuning/punet_autotune.py @@ -18,16 +18,51 @@ def get_dispatch_compile_command(self, candidate_tracker: autotune.CandidateTrac return command def get_dispatch_benchmark_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: - pass + compiled_vmfb_path = candidate_tracker.compiled_vmfb_path + assert compiled_vmfb_path is not None + command = [ + "./benchmark_dispatch.sh", + compiled_vmfb_path.as_posix(), + ] + return command def get_model_compile_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: - pass + mlir_spec_path = candidate_tracker.mlir_spec_path + assert mlir_spec_path is not None + command = [ + "./compile_unet_candidate.sh", + "winograd", + mlir_spec_path.as_posix(), + ] + return command + def get_model_benchmark_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: - pass + unet_candidate_path = candidate_tracker.unet_candidate_path + assert unet_candidate_path is not None + command = [ + "./benchmark_unet_candidate.sh", + unet_candidate_path.as_posix(), + ] + return command + + def get_compiled_dispatch_index(self, file_path: Path) -> int: + return int(file_path.stem) + + def get_candidate_spec_filename(self, candidate_id: int) -> Path: + return f"{candidate_id}_spec.mlir" + + def get_compiled_model_index(self, file_path: Path) -> int: + return int(file_path.stem.split("_")[-1]) + - def get_compiled_file_index(self, file_name: Path) -> int: - return int(file_name.stem) +def set_path_config(path_config: autotune.PathConfig) -> None: + path_config.model_baseline_vmfb = Path("./unet_baseline.vmfb") + path_config.candidates_dir = path_config.base_dir / "candidates" + path_config.candidate_configs_pkl = path_config.candidates_dir / "configs.pkl" + path_config.compiled_dir = path_config.candidates_dir / "compiled" + path_config.compile_failed_dir = path_config.candidates_dir / "failed" + path_config.spec_dir = path_config.candidates_dir / "configs" def main(): @@ -36,30 +71,29 @@ def main(): path_config.base_dir.mkdir(parents=True, exist_ok=True) path_config.output_unilog.touch() candidate_trackers: list[autotune.CandidateTracker] = [] - punet_candidates = PunetClient() + punet_client = PunetClient() - # path_config.compile_model_sh = Path("./compile_unet_candidate.sh") - # path_config.benchmark_model_sh = Path("./benchmark_unet_candidate.sh") - # path_config.model_baseline_vmfb = Path("./unet_baseline.vmfb") + set_path_config(path_config) autotune.setup_logging(args, path_config) - candidates = autotune.generate_candidates(args, path_config, candidate_trackers) + candidates = autotune.generate_candidates(args, path_config, candidate_trackers, punet_client) compiled_candidates = autotune.compile_dispatches( - args, path_config, candidates, candidate_trackers, punet_candidates + args, path_config, candidates, candidate_trackers, punet_client ) - - exit() + top_candidates = autotune.benchmark_dispatches( - args, path_config, 
compiled_candidates, candidate_trackers + args, path_config, compiled_candidates, candidate_trackers, punet_client ) punet_candidates = autotune.compile_models( - args, path_config, top_candidates, candidate_trackers + args, path_config, top_candidates, candidate_trackers, punet_client ) - autotune.benchmark_models(args, path_config, punet_candidates, candidate_trackers) + exit() + + autotune.benchmark_models(args, path_config, punet_candidates, candidate_trackers, punet_client) if __name__ == "__main__": main() From a58e823f0d887f71e681d050821f4a43712c19b9 Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Fri, 16 Aug 2024 13:23:39 -0500 Subject: [PATCH 05/20] Update pytest --- tuning/autotune.py | 2 +- tuning/punet_autotune.py | 2 -- tuning/test_autotune.py | 4 +++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tuning/autotune.py b/tuning/autotune.py index 49e2787..01ed63b 100755 --- a/tuning/autotune.py +++ b/tuning/autotune.py @@ -147,7 +147,7 @@ def get_candidate_mlir_path(self, candidate_id: int) -> Path: return self.candidates_dir / f"{candidate_id}.mlir" def get_candidate_spec_mlir_path(self, candidate_id: int) -> Path: - return self.candidates_dir / "configs" / f"{candidate_id}_spec.mlir" + return self.candidates_dir / "specs" / f"{candidate_id}_spec.mlir" def get_exe_format(self, path: Path) -> str: return f"./{path.as_posix()}" diff --git a/tuning/punet_autotune.py b/tuning/punet_autotune.py index 9767d95..bd9733e 100644 --- a/tuning/punet_autotune.py +++ b/tuning/punet_autotune.py @@ -91,8 +91,6 @@ def main(): args, path_config, top_candidates, candidate_trackers, punet_client ) - exit() - autotune.benchmark_models(args, path_config, punet_candidates, candidate_trackers, punet_client) if __name__ == "__main__": diff --git a/tuning/test_autotune.py b/tuning/test_autotune.py index 4548456..e215b5a 100644 --- a/tuning/test_autotune.py +++ b/tuning/test_autotune.py @@ -264,8 +264,10 @@ def generate_parsed_disptach_benchmark_result( f"{test_list[i][0]} Mean Time: {test_list[i][1]}" for i in random_order ] + mock_tuning_client = MagicMock() + mock_tuning_client.get_candidate_spec_filename.side_effect = lambda i: f'{i}_spec.mlir' parsed_results, dump_list = autotune.parse_dispatch_benchmark_results( - path_config, benchmark_results, candidate_trackers + path_config, benchmark_results, candidate_trackers, mock_tuning_client ) assert parsed_results == expect_parsed_results From 8f26965be359272b30699f0648acd5d71a0f7fb6 Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Fri, 16 Aug 2024 13:36:49 -0500 Subject: [PATCH 06/20] Fix pre-commit err --- tuning/autotune.py | 100 +++++++++++++++++++++++++++++++-------- tuning/punet_autotune.py | 30 ++++++++---- tuning/test_autotune.py | 4 +- 3 files changed, 103 insertions(+), 31 deletions(-) diff --git a/tuning/autotune.py b/tuning/autotune.py index 01ed63b..d9b82b3 100755 --- a/tuning/autotune.py +++ b/tuning/autotune.py @@ -151,18 +151,20 @@ def get_candidate_spec_mlir_path(self, candidate_id: int) -> Path: def get_exe_format(self, path: Path) -> str: return f"./{path.as_posix()}" - + @dataclass class TuningClient(ABC): @abstractmethod - def get_dispatch_compile_command(self, candidate_tracker: CandidateTracker) -> list[str]: + def get_dispatch_compile_command( + self, candidate_tracker: CandidateTracker + ) -> list[str]: pass @abstractmethod def get_dispatch_benchmark_command(self, candidate_tracker) -> list[str]: pass - + @abstractmethod def get_model_compile_command(self, candidate_tracker) -> list[str]: pass @@ -171,16 +173,49 @@ def 
get_model_compile_command(self, candidate_tracker) -> list[str]: def get_model_benchmark_command(self, candidate_tracker) -> list[str]: pass + @abstractmethod def get_compiled_dispatch_index(self, file_path: Path) -> int: pass - def get_candidate_spec_filename(self, candidate_id: int) -> Path: + @abstractmethod + def get_candidate_spec_filename(self, candidate_id: int) -> str: pass + @abstractmethod def get_compiled_model_index(self, file_path: Path) -> int: pass +@dataclass +class DefaultTuningClient(TuningClient): + def get_dispatch_compile_command( + self, candidate_tracker: CandidateTracker + ) -> list[str]: + command = [""] + return command + + def get_dispatch_benchmark_command(self, candidate_tracker) -> list[str]: + command = [""] + return command + + def get_model_compile_command(self, candidate_tracker) -> list[str]: + command = [""] + return command + + def get_model_benchmark_command(self, candidate_tracker) -> list[str]: + command = [""] + return command + + def get_compiled_dispatch_index(self, file_path: Path) -> int: + return 0 + + def get_candidate_spec_filename(self, candidate_id: int) -> str: + return "" + + def get_compiled_model_index(self, file_path: Path) -> int: + return 0 + + @dataclass class TaskTuple: args: argparse.Namespace @@ -719,7 +754,7 @@ def generate_candidates( args: argparse.Namespace, path_config: PathConfig, candidate_trackers: list[CandidateTracker], - tuning_client: TuningClient + tuning_client: TuningClient, ) -> list[int]: """Generate candidate files for tuning. Returns the list of candidate indexes""" logging.info("generate_candidates()") @@ -818,7 +853,7 @@ def compile_dispatches( path_config: PathConfig, candidates: list[int], candidate_trackers: list[CandidateTracker], - tuning_client: TuningClient + tuning_client: TuningClient, ) -> list[int]: """Compile candidate files for tuning and record in candidate_vmfbs.txt. 
Returns the list of compiled candidate indexes.""" logging.info("compile_candidates()") @@ -827,7 +862,14 @@ def compile_dispatches( logging.info("No candidates to compile.") return [] - task_list = [TaskTuple(args, tuning_client.get_dispatch_compile_command(candidate_trackers[i]), check=False) for i in candidates] + task_list = [ + TaskTuple( + args, + tuning_client.get_dispatch_compile_command(candidate_trackers[i]), + check=False, + ) + for i in candidates + ] num_worker = min(args.max_cpu_workers, len(task_list)) multiprocess_progress_wrapper( num_worker=num_worker, task_list=task_list, function=run_command_wrapper @@ -885,7 +927,7 @@ def parse_dispatch_benchmark_results( path_config: PathConfig, benchmark_results: list[TaskResult], candidate_trackers: list[CandidateTracker], - tuning_client: TuningClient + tuning_client: TuningClient, ) -> tuple[list[ParsedDisptachBenchmarkResult], list[str]]: benchmark_result_configs = [] dump_list = [] @@ -899,7 +941,10 @@ def parse_dispatch_benchmark_results( benchmark_time = res.get_benchmark_time() assert candidate_id is not None and benchmark_time is not None candidate_trackers[candidate_id].first_benchmark_time = benchmark_time - candidate_trackers[candidate_id].mlir_spec_path = path_config.spec_dir / tuning_client.get_candidate_spec_filename(candidate_id) + candidate_trackers[candidate_id].mlir_spec_path = ( + path_config.spec_dir + / tuning_client.get_candidate_spec_filename(candidate_id) + ) mlir_path = candidate_trackers[candidate_id].mlir_path mlir_spec_path = candidate_trackers[candidate_id].mlir_spec_path assert mlir_path is not None and mlir_spec_path is not None @@ -940,7 +985,7 @@ def benchmark_dispatches( path_config: PathConfig, compiled_candidates: list[int], candidate_trackers: list[CandidateTracker], - tuning_client: TuningClient + tuning_client: TuningClient, ): """Benchmark the candidate files and store the topN results in file (best.log).""" logging.info("benchmark_top_candidates()") @@ -952,7 +997,15 @@ def benchmark_dispatches( ) else: # Benchmarking dispatch candidates - task_list = [TaskTuple(args, tuning_client.get_dispatch_benchmark_command(candidate_trackers[i]), check=False, command_need_device_id=True) for i in compiled_candidates] + task_list = [ + TaskTuple( + args, + tuning_client.get_dispatch_benchmark_command(candidate_trackers[i]), + check=False, + command_need_device_id=True, + ) + for i in compiled_candidates + ] worker_context_queue = create_worker_context_queue(args.devices) benchmark_results = multiprocess_progress_wrapper( num_worker=len(args.devices), @@ -1006,7 +1059,7 @@ def compile_models( path_config: PathConfig, candidates: list[int], candidate_trackers: list[CandidateTracker], - tuning_client: TuningClient + tuning_client: TuningClient, ) -> list[int]: """Compile U-Net candidates stored in best.log. 
Return the list of U-Net candidate files.""" logging.info("compile_unet_candidates()") @@ -1017,10 +1070,11 @@ def compile_models( if not candidates: logging.info("No model candidates to compile.") return [] - + task_list = [ - TaskTuple(args, tuning_client.get_model_compile_command(candidate_trackers[i])) - for i in candidates if i != 0 + TaskTuple(args, tuning_client.get_model_compile_command(candidate_trackers[i])) + for i in candidates + if i != 0 ] num_worker = min(args.max_cpu_workers, len(task_list)) multiprocess_progress_wrapper( @@ -1226,7 +1280,7 @@ def benchmark_model( path_config: PathConfig, unet_candidates: list[int], candidate_trackers: list[CandidateTracker], - tuning_client: TuningClient + tuning_client: TuningClient, ): """Benchmark U-Net candidate files and log the results.""" logging.info("benchmark_unet()") @@ -1329,7 +1383,7 @@ def autotune(args: argparse.Namespace) -> None: path_config.output_unilog.touch() candidate_trackers: list[CandidateTracker] = [] - tuning_client = TuningClient() + tuning_client = DefaultTuningClient() stop_after_phase: str = args.stop_after print("Setup logging") @@ -1341,7 +1395,9 @@ def autotune(args: argparse.Namespace) -> None: print("Validation successful!\n") print("Generating candidates...") - candidates = generate_candidates(args, path_config, candidate_trackers, tuning_client) + candidates = generate_candidates( + args, path_config, candidate_trackers, tuning_client + ) print(f"Generated [{len(candidates)}] candidates in {path_config.candidates_dir}\n") if stop_after_phase == ExecutionPhases.generate_candidates: return @@ -1356,7 +1412,7 @@ def autotune(args: argparse.Namespace) -> None: print("Benchmarking compiled candidates...") top_candidates = benchmark_dispatches( - args, path_config, compiled_candidates, candidate_trackers + args, path_config, compiled_candidates, candidate_trackers, tuning_client ) print(f"Stored results in {path_config.output_unilog}\n") @@ -1365,14 +1421,16 @@ def autotune(args: argparse.Namespace) -> None: print(f"Compiling top unet candidates...") unet_candidates = compile_models( - args, path_config, top_candidates, candidate_trackers + args, path_config, top_candidates, candidate_trackers, tuning_client ) print(f"Unet candidates compiled in {path_config.base_dir}\n") if stop_after_phase == ExecutionPhases.compile_unet_candidates: return print("Benchmarking unet candidates...") - benchmark_model(args, path_config, unet_candidates, candidate_trackers) + benchmark_model( + args, path_config, unet_candidates, candidate_trackers, tuning_client + ) print(f"Stored results in {path_config.output_unilog}") if stop_after_phase == ExecutionPhases.benchmark_unet_candidates: return diff --git a/tuning/punet_autotune.py b/tuning/punet_autotune.py index bd9733e..c72e5f7 100644 --- a/tuning/punet_autotune.py +++ b/tuning/punet_autotune.py @@ -7,7 +7,9 @@ @dataclass class PunetClient(autotune.TuningClient): - def get_dispatch_compile_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: + def get_dispatch_compile_command( + self, candidate_tracker: autotune.CandidateTracker + ) -> list[str]: mlir_path = candidate_tracker.mlir_path assert mlir_path is not None command = [ @@ -17,7 +19,9 @@ def get_dispatch_compile_command(self, candidate_tracker: autotune.CandidateTrac ] return command - def get_dispatch_benchmark_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: + def get_dispatch_benchmark_command( + self, candidate_tracker: autotune.CandidateTracker + ) -> list[str]: 
compiled_vmfb_path = candidate_tracker.compiled_vmfb_path assert compiled_vmfb_path is not None command = [ @@ -26,7 +30,9 @@ def get_dispatch_benchmark_command(self, candidate_tracker: autotune.CandidateTr ] return command - def get_model_compile_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: + def get_model_compile_command( + self, candidate_tracker: autotune.CandidateTracker + ) -> list[str]: mlir_spec_path = candidate_tracker.mlir_spec_path assert mlir_spec_path is not None command = [ @@ -35,9 +41,10 @@ def get_model_compile_command(self, candidate_tracker: autotune.CandidateTracker mlir_spec_path.as_posix(), ] return command - - def get_model_benchmark_command(self, candidate_tracker: autotune.CandidateTracker) -> list[str]: + def get_model_benchmark_command( + self, candidate_tracker: autotune.CandidateTracker + ) -> list[str]: unet_candidate_path = candidate_tracker.unet_candidate_path assert unet_candidate_path is not None command = [ @@ -47,9 +54,9 @@ def get_model_benchmark_command(self, candidate_tracker: autotune.CandidateTrack return command def get_compiled_dispatch_index(self, file_path: Path) -> int: - return int(file_path.stem) + return int(file_path.stem) - def get_candidate_spec_filename(self, candidate_id: int) -> Path: + def get_candidate_spec_filename(self, candidate_id: int) -> str: return f"{candidate_id}_spec.mlir" def get_compiled_model_index(self, file_path: Path) -> int: @@ -77,7 +84,9 @@ def main(): autotune.setup_logging(args, path_config) - candidates = autotune.generate_candidates(args, path_config, candidate_trackers, punet_client) + candidates = autotune.generate_candidates( + args, path_config, candidate_trackers, punet_client + ) compiled_candidates = autotune.compile_dispatches( args, path_config, candidates, candidate_trackers, punet_client @@ -91,7 +100,10 @@ def main(): args, path_config, top_candidates, candidate_trackers, punet_client ) - autotune.benchmark_models(args, path_config, punet_candidates, candidate_trackers, punet_client) + autotune.benchmark_models( + args, path_config, punet_candidates, candidate_trackers, punet_client + ) + if __name__ == "__main__": main() diff --git a/tuning/test_autotune.py b/tuning/test_autotune.py index e215b5a..bd0ea46 100644 --- a/tuning/test_autotune.py +++ b/tuning/test_autotune.py @@ -265,7 +265,9 @@ def generate_parsed_disptach_benchmark_result( ] mock_tuning_client = MagicMock() - mock_tuning_client.get_candidate_spec_filename.side_effect = lambda i: f'{i}_spec.mlir' + mock_tuning_client.get_candidate_spec_filename.side_effect = ( + lambda i: f"{i}_spec.mlir" + ) parsed_results, dump_list = autotune.parse_dispatch_benchmark_results( path_config, benchmark_results, candidate_trackers, mock_tuning_client ) From d98a2a9b4524de1acc1b18ad3524ab7a5abad919 Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Fri, 16 Aug 2024 13:40:51 -0500 Subject: [PATCH 07/20] Fix function name --- tuning/autotune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tuning/autotune.py b/tuning/autotune.py index d9b82b3..6552c5e 100755 --- a/tuning/autotune.py +++ b/tuning/autotune.py @@ -1275,7 +1275,7 @@ def dryrun_benchmark_unet( ) -def benchmark_model( +def benchmark_models( args: argparse.Namespace, path_config: PathConfig, unet_candidates: list[int], @@ -1428,7 +1428,7 @@ def autotune(args: argparse.Namespace) -> None: return print("Benchmarking unet candidates...") - benchmark_model( + benchmark_models( args, path_config, unet_candidates, candidate_trackers, tuning_client ) 
print(f"Stored results in {path_config.output_unilog}") From 4c9fe83d02a61d297bec0d2689319de6c53fac10 Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Fri, 16 Aug 2024 14:07:07 -0500 Subject: [PATCH 08/20] Rename variable name --- tuning/autotune.py | 166 +++++++++++++++++++-------------------- tuning/punet_autotune.py | 8 +- tuning/test_autotune.py | 28 ++++--- 3 files changed, 101 insertions(+), 101 deletions(-) diff --git a/tuning/autotune.py b/tuning/autotune.py index 6552c5e..f114387 100755 --- a/tuning/autotune.py +++ b/tuning/autotune.py @@ -40,7 +40,7 @@ Dry Run Test (no gpu requried): -python autotune.py winograd 1286.mlir --num-candidates=64 --num-unet-candidates=10 --dry-run +python autotune.py winograd 1286.mlir --num-candidates=64 --num-model-candidates=10 --dry-run """ @@ -64,19 +64,19 @@ @dataclass class CandidateTracker(ABC): candidate_id: int - mlir_path: Optional[Path] = None - mlir_config_path: Optional[Path] = None + dispatch_mlir_path: Optional[Path] = None + dispatch_config_path: Optional[Path] = None configuration: Optional[tune.Configuration] = None compilation_successful: Optional[bool] = None - compiled_vmfb_path: Optional[Path] = None - compiled_vmfb_hash: Optional[str] = None + compiled_dispatch_path: Optional[Path] = None + compiled_dispatch_hash: Optional[str] = None first_benchmark_time: Optional[float] = None first_benchmark_device_id: Optional[int] = None - mlir_spec_path: Optional[Path] = None - unet_candidate_path: Optional[Path] = None - unet_vmfb_hash: Optional[str] = None - unet_benchmark_time: Optional[float] = None - unet_benchmark_device_id: Optional[int] = None + spec_path: Optional[Path] = None + model_path: Optional[Path] = None + compiled_model_hash: Optional[str] = None + model_benchmark_time: Optional[float] = None + model_benchmark_device_id: Optional[int] = None baseline_benchmark_time: Optional[float] = None calibrated_benchmark_diff: Optional[float] = None @@ -87,10 +87,6 @@ class PathConfig: # Preset constants global_config_prolog_mlir: Path = Path("./config_prolog.mlir") global_config_epilog_mlir: Path = Path("./config_epilog.mlir") - # compile_candidate_sh: Path = Path("./compile_candidate.sh") - # benchmark_dispatch_sh: Path = Path("./benchmark_dispatch.sh") - # compile_unet_candidate_sh: Path = Path("./compile_unet_candidate.sh") - # benchmark_unet_candidate_sh: Path = Path("./benchmark_unet_candidate.sh") model_baseline_vmfb: Path = Path("./unet_baseline.vmfb") # Dynamic paths @@ -289,15 +285,15 @@ def get_tokens(self) -> list[str]: except: return [] - def get_unet_candidate_path(self) -> Optional[str]: + def get_model_candidate_path(self) -> Optional[str]: if len(self.get_tokens()) < 2: return None return self.get_tokens()[1] def get_candidate_id(self) -> Optional[int]: - if self.get_unet_candidate_path(): + if self.get_model_candidate_path(): try: - path_str = self.get_unet_candidate_path() + path_str = self.get_model_candidate_path() return int(path_str.split("_")[-1].split(".")[0]) if path_str else None except ValueError: return None @@ -414,10 +410,10 @@ def validate_devices(user_devices: list[str]) -> None: class ExecutionPhases(str, Enum): dont_stop = "" generate_candidates = "generate-candidates" - compile_candidates = "compile-candidates" - benchmark_candidates = "benchmark-candidates" - compile_unet_candidates = "compile-unet-candidates" - benchmark_unet_candidates = "benchmark-unet-candidates" + compile_dispatches = "compile-dispatches" + benchmark_dispatches = "benchmark-dispatches" + compile_models = "compile-models" + 
benchmark_models = "benchmark-models" def parse_arguments() -> argparse.Namespace: @@ -454,7 +450,7 @@ def parse_arguments() -> argparse.Namespace: help="Stop execution after specified phase", ) parser.add_argument( - "--num-unet-candidates", + "--num-model-candidates", help="Maximum number of stage 2 candidates", type=int, default=50, @@ -812,14 +808,14 @@ def generate_candidates( candidates.append(int(mlir.stem)) new_candidate = CandidateTracker( candidate_id=int(mlir.stem), - mlir_path=mlir, + dispatch_mlir_path=mlir, configuration=candidate_configs[int(mlir.stem)], ) candidate_trackers.append(new_candidate) else: - candidate_trackers[int(mlir.stem.split("_config")[0])].mlir_config_path = ( - mlir - ) + candidate_trackers[ + int(mlir.stem.split("_config")[0]) + ].dispatch_config_path = mlir handle_error( condition=(len(candidates) == 0), msg="Failed to generate any candidates" @@ -898,11 +894,11 @@ def compile_dispatches( index = tuning_client.get_compiled_dispatch_index(failed_file) compiled_candidates.append(index) candidate_trackers[index].compilation_successful = True - candidate_trackers[index].compiled_vmfb_path = compiled_file - compiled_vmfb_path = candidate_trackers[index].compiled_vmfb_path + candidate_trackers[index].compiled_dispatch_path = compiled_file + compiled_vmfb_path = candidate_trackers[index].compiled_dispatch_path assert compiled_vmfb_path is not None hash_val = calculate_md5(compiled_vmfb_path) - candidate_trackers[index].compiled_vmfb_hash = hash_val + candidate_trackers[index].compiled_dispatch_hash = hash_val compiled_candidates_hash_list.append((index, hash_val)) handle_error( @@ -941,13 +937,13 @@ def parse_dispatch_benchmark_results( benchmark_time = res.get_benchmark_time() assert candidate_id is not None and benchmark_time is not None candidate_trackers[candidate_id].first_benchmark_time = benchmark_time - candidate_trackers[candidate_id].mlir_spec_path = ( + candidate_trackers[candidate_id].spec_path = ( path_config.spec_dir / tuning_client.get_candidate_spec_filename(candidate_id) ) - mlir_path = candidate_trackers[candidate_id].mlir_path - mlir_spec_path = candidate_trackers[candidate_id].mlir_spec_path - assert mlir_path is not None and mlir_spec_path is not None + mlir_path = candidate_trackers[candidate_id].dispatch_mlir_path + spec_path = candidate_trackers[candidate_id].spec_path + assert mlir_path is not None and spec_path is not None dump_list.append(res_str) benchmark_result_configs.append( @@ -956,7 +952,7 @@ def parse_dispatch_benchmark_results( candidate_id, benchmark_time, mlir_path, - mlir_spec_path, + spec_path, ) ) ) @@ -1039,7 +1035,7 @@ def benchmark_dispatches( # Select top candidates best_results = sorted( parsed_benchmark_results, key=lambda x: float(x.benchmark_time_in_seconds) - )[: args.num_unet_candidates] + )[: args.num_model_candidates] logging.critical(f"Selected top[{len(best_results)}]") dump_list = [ @@ -1062,7 +1058,7 @@ def compile_models( tuning_client: TuningClient, ) -> list[int]: """Compile U-Net candidates stored in best.log. 
Return the list of U-Net candidate files.""" - logging.info("compile_unet_candidates()") + logging.info("compile_models()") if args.dry_run: return candidates @@ -1081,35 +1077,35 @@ def compile_models( num_worker=num_worker, task_list=task_list, function=run_command_wrapper ) - unet_candidates_files = list(path_config.base_dir.glob("*.vmfb")) + model_candidates_files = list(path_config.base_dir.glob("*.vmfb")) - unet_candidates_indexes = [] - unet_candidates_hash_list = [] + model_candidates_indexes = [] + model_candidates_hash_list = [] # Update candidate tracker - for unet_candidate in unet_candidates_files: - assert unet_candidate is not None - index = tuning_client.get_compiled_model_index(unet_candidate) - candidate_trackers[index].unet_candidate_path = unet_candidate - hash_val = calculate_md5(unet_candidate) - candidate_trackers[index].unet_vmfb_hash = hash_val - unet_candidates_hash_list.append((index, hash_val)) - unet_candidates_indexes.append(index) - - # Check if unet candidate produces tbe same .vmfb - collision_detected, unique_unet_candidates_indexes = collision_handler( - unet_candidates_hash_list + for model_candidate in model_candidates_files: + assert model_candidate is not None + index = tuning_client.get_compiled_model_index(model_candidate) + candidate_trackers[index].model_path = model_candidate + hash_val = calculate_md5(model_candidate) + candidate_trackers[index].compiled_model_hash = hash_val + model_candidates_hash_list.append((index, hash_val)) + model_candidates_indexes.append(index) + + # Check if model candidate produces tbe same .vmfb + collision_detected, unique_model_candidates_indexes = collision_handler( + model_candidates_hash_list ) if collision_detected: logging.critical( - f"Remains [{len(unique_unet_candidates_indexes)}] unique candidate indexes" + f"Remains [{len(unique_model_candidates_indexes)}] unique candidate indexes" ) return ( - unique_unet_candidates_indexes + unique_model_candidates_indexes if collision_detected - else unet_candidates_indexes + else model_candidates_indexes ) @@ -1167,9 +1163,9 @@ def parse_grouped_benchmark_results( for same_device_results in grouped_benchmark_results: dump_unsort_list: list[tuple[float, str]] = [] - for unet_candidate_result in same_device_results: + for model_candidate_result in same_device_results: # Skip if benchmark failed. - result_str = unet_candidate_result.result.stdout + result_str = model_candidate_result.result.stdout if result_str is None: continue @@ -1177,10 +1173,10 @@ def parse_grouped_benchmark_results( device_id = res.get_device_id() # Record baseline benchmarking result. - unet_candidate_path = res.get_unet_candidate_path() + model_candidate_path = res.get_model_candidate_path() if ( - unet_candidate_path is not None - and str(path_config.model_baseline_vmfb) in unet_candidate_path + model_candidate_path is not None + and str(path_config.model_baseline_vmfb) in model_candidate_path ): baseline_time = res.get_benchmark_time() if baseline_time is None: @@ -1196,8 +1192,8 @@ def parse_grouped_benchmark_results( if candidate_time is None: incomplete_list.append((c_id, device_id)) continue - candidate_trackers[c_id].unet_benchmark_time = candidate_time - candidate_trackers[c_id].unet_benchmark_device_id = device_id + candidate_trackers[c_id].model_benchmark_time = candidate_time + candidate_trackers[c_id].model_benchmark_device_id = device_id # Skip improvement calculation if no baseline data. 
if baseline_time is None: dump_unsort_list.append((candidate_time, result_str)) @@ -1212,7 +1208,7 @@ def parse_grouped_benchmark_results( assert dump_str is not None dump_unsort_list.append((candidate_time, dump_str)) - # Sort unet candidate benchmarking result str in ascending time order. + # Sort model candidate benchmarking result str in ascending time order. dump_list = dump_list + [ dump_str for _, dump_str in sorted(dump_unsort_list, key=lambda x: x[0]) ] @@ -1222,7 +1218,7 @@ def parse_grouped_benchmark_results( index_to_path = lambda index: ( f"{path_config.model_baseline_vmfb.as_posix()}" if index == 0 - else f"{candidate_trackers[index].unet_candidate_path}" + else f"{candidate_trackers[index].model_path}" ) error_msg = f"Benchmarking result of {index_to_path(index)} on deivce {device_id} is incomplete" handle_error(condition=True, msg=error_msg, level=logging.WARNING) @@ -1278,18 +1274,18 @@ def dryrun_benchmark_unet( def benchmark_models( args: argparse.Namespace, path_config: PathConfig, - unet_candidates: list[int], + model_candidates: list[int], candidate_trackers: list[CandidateTracker], tuning_client: TuningClient, ): """Benchmark U-Net candidate files and log the results.""" - logging.info("benchmark_unet()") + logging.info("benchmark_models()") if args.dry_run: - dryrun_benchmark_unet(path_config, unet_candidates, candidate_trackers) + dryrun_benchmark_unet(path_config, model_candidates, candidate_trackers) return - # Benchmarking unet candidates + # Benchmarking model candidates worker_context_queue = create_worker_context_queue(args.devices) benchmark_task_list = [ TaskTuple( @@ -1300,7 +1296,7 @@ def benchmark_models( cooling_time=10, result_need_device_id=True, ) - for i in unet_candidates + for i in model_candidates ] benchmark_results = multiprocess_progress_wrapper( num_worker=len(args.devices), @@ -1313,7 +1309,7 @@ def benchmark_models( grouped_benchmark_results = group_benchmark_results_by_device_id(benchmark_results) # Benchmarking baselines on each involved device - candidate_trackers[0].unet_candidate_path = path_config.model_baseline_vmfb + candidate_trackers[0].model_path = path_config.model_baseline_vmfb worker_context_queue = create_worker_context_queue(args.devices) baseline_task_list = [ TaskTuple( @@ -1338,13 +1334,13 @@ def benchmark_models( [x] + y for x, y in zip(baseline_results, grouped_benchmark_results) ] - # Update candidate_tracker and extract strings which will be stored in unet_result_log + # Update candidate_tracker and extract strings which will be stored later dump_list = parse_grouped_benchmark_results( path_config, grouped_benchmark_results, candidate_trackers ) append_to_file( - dump_list, filepath=path_config.output_unilog, title="Unet Benchmark Results" + dump_list, filepath=path_config.output_unilog, title="Model Benchmark Results" ) @@ -1354,10 +1350,10 @@ def summerize_top_candidates( dump_list = [] top_candidates = [] for candidate in candidate_trackers: - if candidate.candidate_id == 0 or candidate.unet_benchmark_time is None: + if candidate.candidate_id == 0 or candidate.model_benchmark_time is None: continue top_candidates.append( - (candidate.candidate_id, candidate.unet_benchmark_time) + (candidate.candidate_id, candidate.model_benchmark_time) ) # collect (id, time) top_candidates = sorted( @@ -1367,10 +1363,10 @@ def summerize_top_candidates( for candidate_id in top_candidate_ids: candidate = candidate_trackers[candidate_id] - assert candidate.mlir_config_path is not None - with open(candidate.mlir_config_path, "r") as 
file: + assert candidate.dispatch_config_path is not None + with open(candidate.dispatch_config_path, "r") as file: config_file_contents = file.read() - final_str = f"Candidate {candidate.candidate_id}:\nUnet benchmark time: {candidate.unet_benchmark_time} on device {candidate.unet_benchmark_device_id}\nDispatch benchmark time: {candidate.first_benchmark_time} on device {candidate.unet_benchmark_device_id}\nSpec file path: {candidate.mlir_spec_path}\nSpec contents:{config_file_contents}\n\n" + final_str = f"Candidate {candidate.candidate_id}:\nModel benchmark time: {candidate.model_benchmark_time} on device {candidate.model_benchmark_device_id}\nDispatch benchmark time: {candidate.first_benchmark_time} on device {candidate.model_benchmark_device_id}\nSpec file path: {candidate.spec_path}\nSpec contents:{config_file_contents}\n\n" dump_list.append(final_str) with open(path_config.result_summary_log, "w") as file: @@ -1407,7 +1403,7 @@ def autotune(args: argparse.Namespace) -> None: args, path_config, candidates, candidate_trackers, tuning_client ) print(f"Compiled files are stored in {path_config.compiled_dir}\n") - if stop_after_phase == ExecutionPhases.compile_candidates: + if stop_after_phase == ExecutionPhases.compile_dispatches: return print("Benchmarking compiled candidates...") @@ -1416,23 +1412,23 @@ def autotune(args: argparse.Namespace) -> None: ) print(f"Stored results in {path_config.output_unilog}\n") - if stop_after_phase == ExecutionPhases.benchmark_candidates: + if stop_after_phase == ExecutionPhases.benchmark_dispatches: return - print(f"Compiling top unet candidates...") - unet_candidates = compile_models( + print(f"Compiling top model candidates...") + model_candidates = compile_models( args, path_config, top_candidates, candidate_trackers, tuning_client ) - print(f"Unet candidates compiled in {path_config.base_dir}\n") - if stop_after_phase == ExecutionPhases.compile_unet_candidates: + print(f"Model candidates compiled in {path_config.base_dir}\n") + if stop_after_phase == ExecutionPhases.compile_models: return - print("Benchmarking unet candidates...") + print("Benchmarking model candidates...") benchmark_models( - args, path_config, unet_candidates, candidate_trackers, tuning_client + args, path_config, model_candidates, candidate_trackers, tuning_client ) print(f"Stored results in {path_config.output_unilog}") - if stop_after_phase == ExecutionPhases.benchmark_unet_candidates: + if stop_after_phase == ExecutionPhases.benchmark_models: return summerize_top_candidates(path_config, candidate_trackers) diff --git a/tuning/punet_autotune.py b/tuning/punet_autotune.py index c72e5f7..a505ec8 100644 --- a/tuning/punet_autotune.py +++ b/tuning/punet_autotune.py @@ -10,7 +10,7 @@ class PunetClient(autotune.TuningClient): def get_dispatch_compile_command( self, candidate_tracker: autotune.CandidateTracker ) -> list[str]: - mlir_path = candidate_tracker.mlir_path + mlir_path = candidate_tracker.dispatch_mlir_path assert mlir_path is not None command = [ "./compile_candidate.sh", @@ -22,7 +22,7 @@ def get_dispatch_compile_command( def get_dispatch_benchmark_command( self, candidate_tracker: autotune.CandidateTracker ) -> list[str]: - compiled_vmfb_path = candidate_tracker.compiled_vmfb_path + compiled_vmfb_path = candidate_tracker.compiled_dispatch_path assert compiled_vmfb_path is not None command = [ "./benchmark_dispatch.sh", @@ -33,7 +33,7 @@ def get_dispatch_benchmark_command( def get_model_compile_command( self, candidate_tracker: autotune.CandidateTracker ) -> list[str]: - 
mlir_spec_path = candidate_tracker.mlir_spec_path + mlir_spec_path = candidate_tracker.spec_path assert mlir_spec_path is not None command = [ "./compile_unet_candidate.sh", @@ -45,7 +45,7 @@ def get_model_compile_command( def get_model_benchmark_command( self, candidate_tracker: autotune.CandidateTracker ) -> list[str]: - unet_candidate_path = candidate_tracker.unet_candidate_path + unet_candidate_path = candidate_tracker.model_path assert unet_candidate_path is not None command = [ "./benchmark_unet_candidate.sh", diff --git a/tuning/test_autotune.py b/tuning/test_autotune.py index bd0ea46..36597cc 100644 --- a/tuning/test_autotune.py +++ b/tuning/test_autotune.py @@ -125,7 +125,7 @@ def test_UnetBenchmarkResult_get(): "5", "items_per_second=5.50302/s", ] - assert res.get_unet_candidate_path() == "unet_candidate_12.vmfb" + assert res.get_model_candidate_path() == "unet_candidate_12.vmfb" assert res.get_candidate_id() == 12 assert res.get_device_id() == 24 assert res.get_benchmark_time() == 182.0 @@ -139,14 +139,14 @@ def test_UnetBenchmarkResult_get(): "device", "24", ] - assert res.get_unet_candidate_path() == "unet_baseline.vmfb" + assert res.get_model_candidate_path() == "unet_baseline.vmfb" assert res.get_candidate_id() == None assert res.get_device_id() == 24 assert res.get_benchmark_time() == None incomplete_str = "" res = autotune.UnetBenchmarkResult(incomplete_str) assert res.get_tokens() == [] - assert res.get_unet_candidate_path() == None + assert res.get_model_candidate_path() == None assert res.get_candidate_id() == None assert res.get_device_id() == None assert res.get_benchmark_time() == None @@ -154,7 +154,7 @@ def test_UnetBenchmarkResult_get(): bad_str = 12345 res = autotune.UnetBenchmarkResult(bad_str) assert res.get_tokens() == [] - assert res.get_unet_candidate_path() == None + assert res.get_model_candidate_path() == None assert res.get_candidate_id() == None assert res.get_device_id() == None assert res.get_benchmark_time() == None @@ -237,19 +237,23 @@ def generate_parsed_disptach_benchmark_result( path_config = autotune.PathConfig() candidate_trackers = [ - autotune.CandidateTracker(i, mlir_path=path_config.get_candidate_mlir_path(i)) + autotune.CandidateTracker( + i, dispatch_mlir_path=path_config.get_candidate_mlir_path(i) + ) for i in range(total) ] candidate_trackers_before = [ - autotune.CandidateTracker(i, mlir_path=path_config.get_candidate_mlir_path(i)) + autotune.CandidateTracker( + i, dispatch_mlir_path=path_config.get_candidate_mlir_path(i) + ) for i in range(total) ] expect_candidate_trackers = [ autotune.CandidateTracker( i, - mlir_path=path_config.get_candidate_mlir_path(i), - mlir_spec_path=path_config.get_candidate_spec_mlir_path(i), + dispatch_mlir_path=path_config.get_candidate_mlir_path(i), + spec_path=path_config.get_candidate_spec_mlir_path(i), ) for i in range(total) ] @@ -294,8 +298,8 @@ def set_tracker( baseline_benchmark_time: float, calibrated_benchmark_diff=float, ): - tracker.unet_benchmark_time = unet_benchmark_time - tracker.unet_benchmark_device_id = unet_benchmark_device_id + tracker.model_benchmark_time = unet_benchmark_time + tracker.model_benchmark_device_id = unet_benchmark_device_id tracker.baseline_benchmark_time = baseline_benchmark_time tracker.calibrated_benchmark_diff = calibrated_benchmark_diff @@ -366,7 +370,7 @@ def set_tracker( b1 = "Benchmarking: some_dir/unet_baseline.vmfb on device 0 BM_main/process_time/real_time_median 60.7 ms 13.5 ms 5 items_per_second=16.4733/s" s1 = "Benchmarking: unet_candidate_1.vmfb on 
device 0" grouped_benchmark_results = [[generate_res(b1, 0), generate_res(s1, 0)]] - candidate_trackers[1].unet_candidate_path = "unet_candidate_1.vmfb" + candidate_trackers[1].model_path = "unet_candidate_1.vmfb" dump_list = autotune.parse_grouped_benchmark_results( path_config, grouped_benchmark_results, candidate_trackers ) @@ -380,7 +384,7 @@ def set_tracker( b1 = "Benchmarking: unet_baseline.vmfb on device 0" s1 = "Benchmarking: unet_candidate_1.vmfb on device 0" grouped_benchmark_results = [[generate_res(b1, 0), generate_res(s1, 0)]] - candidate_trackers[1].unet_candidate_path = "unet_candidate_1.vmfb" + candidate_trackers[1].model_path = "unet_candidate_1.vmfb" dump_list = autotune.parse_grouped_benchmark_results( path_config, grouped_benchmark_results, candidate_trackers ) From baf519fb14b882ed84904bc7c4119b7a977cf5d0 Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Mon, 19 Aug 2024 10:14:09 -0500 Subject: [PATCH 09/20] Move index extraction to autotune --- tuning/autotune.py | 43 +++++++++++++------------------------ tuning/compile_candidate.sh | 4 ++-- tuning/punet_autotune.py | 20 ----------------- 3 files changed, 17 insertions(+), 50 deletions(-) diff --git a/tuning/autotune.py b/tuning/autotune.py index f114387..2952a5f 100755 --- a/tuning/autotune.py +++ b/tuning/autotune.py @@ -62,7 +62,7 @@ @dataclass -class CandidateTracker(ABC): +class CandidateTracker: candidate_id: int dispatch_mlir_path: Optional[Path] = None dispatch_config_path: Optional[Path] = None @@ -81,7 +81,7 @@ class CandidateTracker(ABC): calibrated_benchmark_diff: Optional[float] = None -# @dataclass(frozen=True) +@dataclass(frozen=True) @dataclass class PathConfig: # Preset constants @@ -148,6 +148,15 @@ def get_candidate_spec_mlir_path(self, candidate_id: int) -> Path: def get_exe_format(self, path: Path) -> str: return f"./{path.as_posix()}" + def get_compiled_dispatch_index(self, file_path: Path) -> int: + return int(file_path.stem) + + def get_candidate_spec_filename(self, candidate_id: int) -> str: + return f"{candidate_id}_spec.mlir" + + def get_compiled_model_index(self, file_path: Path) -> int: + return int(file_path.stem.split("_")[-1]) + @dataclass class TuningClient(ABC): @@ -169,18 +178,6 @@ def get_model_compile_command(self, candidate_tracker) -> list[str]: def get_model_benchmark_command(self, candidate_tracker) -> list[str]: pass - @abstractmethod - def get_compiled_dispatch_index(self, file_path: Path) -> int: - pass - - @abstractmethod - def get_candidate_spec_filename(self, candidate_id: int) -> str: - pass - - @abstractmethod - def get_compiled_model_index(self, file_path: Path) -> int: - pass - @dataclass class DefaultTuningClient(TuningClient): @@ -202,15 +199,6 @@ def get_model_benchmark_command(self, candidate_tracker) -> list[str]: command = [""] return command - def get_compiled_dispatch_index(self, file_path: Path) -> int: - return 0 - - def get_candidate_spec_filename(self, candidate_id: int) -> str: - return "" - - def get_compiled_model_index(self, file_path: Path) -> int: - return 0 - @dataclass class TaskTuple: @@ -886,12 +874,12 @@ def compile_dispatches( # Update candidate tracker for failed_file in failed_files: - index = tuning_client.get_compiled_dispatch_index(failed_file) + index = path_config.get_compiled_dispatch_index(failed_file) candidate_trackers[index].compilation_successful = False compiled_candidates = [] compiled_candidates_hash_list = [] for compiled_file in compiled_files: - index = tuning_client.get_compiled_dispatch_index(failed_file) + index = 
path_config.get_compiled_dispatch_index(failed_file) compiled_candidates.append(index) candidate_trackers[index].compilation_successful = True candidate_trackers[index].compiled_dispatch_path = compiled_file @@ -938,8 +926,7 @@ def parse_dispatch_benchmark_results( assert candidate_id is not None and benchmark_time is not None candidate_trackers[candidate_id].first_benchmark_time = benchmark_time candidate_trackers[candidate_id].spec_path = ( - path_config.spec_dir - / tuning_client.get_candidate_spec_filename(candidate_id) + path_config.spec_dir / path_config.get_candidate_spec_filename(candidate_id) ) mlir_path = candidate_trackers[candidate_id].dispatch_mlir_path spec_path = candidate_trackers[candidate_id].spec_path @@ -1085,7 +1072,7 @@ def compile_models( # Update candidate tracker for model_candidate in model_candidates_files: assert model_candidate is not None - index = tuning_client.get_compiled_model_index(model_candidate) + index = path_config.get_compiled_model_index(model_candidate) candidate_trackers[index].model_path = model_candidate hash_val = calculate_md5(model_candidate) candidate_trackers[index].compiled_model_hash = hash_val diff --git a/tuning/compile_candidate.sh b/tuning/compile_candidate.sh index a50a1c6..9b9b74c 100755 --- a/tuning/compile_candidate.sh +++ b/tuning/compile_candidate.sh @@ -8,11 +8,11 @@ readonly DIR="$(dirname "$INPUT")" readonly BASENAME="$(basename "$INPUT" .mlir)" readonly OUT="${DIR}/compiled/${BASENAME}.vmfb" -mkdir -p "${DIR}/compiled" "${DIR}/failed" "${DIR}/configs" +mkdir -p "${DIR}/compiled" "${DIR}/failed" "${DIR}/specs" timeout 4s ./punet.sh "$INPUT" -o "$OUT" --compile-from=executable-sources 2>/dev/null || (mv "$INPUT" "$DIR/failed" && exit 1) tools/iree-dump-module "$OUT" | grep -q 'rocm-hsaco-fb' || (mv "$INPUT" "$DIR/failed" && rm -f "$OUT" && exit 1) if [ -f "${DIR}/${BASENAME}_config.mlir" ]; then - cat "${DIR}/../config_prolog.mlir" "${DIR}/${BASENAME}_config.mlir" "${DIR}/../config_epilog.mlir" > "${DIR}/configs/${BASENAME}_spec.mlir" + cat "${DIR}/../config_prolog.mlir" "${DIR}/${BASENAME}_config.mlir" "${DIR}/../config_epilog.mlir" > "${DIR}/specs/${BASENAME}_spec.mlir" fi echo "Compiling ${INPUT}: success" diff --git a/tuning/punet_autotune.py b/tuning/punet_autotune.py index a505ec8..6bec650 100644 --- a/tuning/punet_autotune.py +++ b/tuning/punet_autotune.py @@ -53,24 +53,6 @@ def get_model_benchmark_command( ] return command - def get_compiled_dispatch_index(self, file_path: Path) -> int: - return int(file_path.stem) - - def get_candidate_spec_filename(self, candidate_id: int) -> str: - return f"{candidate_id}_spec.mlir" - - def get_compiled_model_index(self, file_path: Path) -> int: - return int(file_path.stem.split("_")[-1]) - - -def set_path_config(path_config: autotune.PathConfig) -> None: - path_config.model_baseline_vmfb = Path("./unet_baseline.vmfb") - path_config.candidates_dir = path_config.base_dir / "candidates" - path_config.candidate_configs_pkl = path_config.candidates_dir / "configs.pkl" - path_config.compiled_dir = path_config.candidates_dir / "compiled" - path_config.compile_failed_dir = path_config.candidates_dir / "failed" - path_config.spec_dir = path_config.candidates_dir / "configs" - def main(): args = autotune.parse_arguments() @@ -80,8 +62,6 @@ def main(): candidate_trackers: list[autotune.CandidateTracker] = [] punet_client = PunetClient() - set_path_config(path_config) - autotune.setup_logging(args, path_config) candidates = autotune.generate_candidates( From 
a1f68e6f6fba6a543f7cb7433b36c6561dea7ade Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Mon, 19 Aug 2024 10:15:34 -0500 Subject: [PATCH 10/20] Add amd header. Remove unwanted import --- tuning/punet_autotune.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tuning/punet_autotune.py b/tuning/punet_autotune.py index 6bec650..4de85ff 100644 --- a/tuning/punet_autotune.py +++ b/tuning/punet_autotune.py @@ -1,5 +1,10 @@ +# Copyright 2024 Advanced Micro Devices, Inc +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + import autotune -import subprocess from dataclasses import dataclass from pathlib import Path From 109f39f6788ec184b602fc632ebc19467ae799e9 Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Mon, 19 Aug 2024 10:24:34 -0500 Subject: [PATCH 11/20] Fix dataclass label bug --- tuning/autotune.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tuning/autotune.py b/tuning/autotune.py index 2952a5f..93d2012 100755 --- a/tuning/autotune.py +++ b/tuning/autotune.py @@ -82,7 +82,6 @@ class CandidateTracker: @dataclass(frozen=True) -@dataclass class PathConfig: # Preset constants global_config_prolog_mlir: Path = Path("./config_prolog.mlir") From 21f49767b1ea4afd92fbf94bc7f38ed619e0d51b Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Mon, 19 Aug 2024 11:19:28 -0500 Subject: [PATCH 12/20] Remove unet strings --- tuning/autotune.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tuning/autotune.py b/tuning/autotune.py index 93d2012..3b97eae 100755 --- a/tuning/autotune.py +++ b/tuning/autotune.py @@ -86,7 +86,7 @@ class PathConfig: # Preset constants global_config_prolog_mlir: Path = Path("./config_prolog.mlir") global_config_epilog_mlir: Path = Path("./config_epilog.mlir") - model_baseline_vmfb: Path = Path("./unet_baseline.vmfb") + model_baseline_vmfb: Path = Path("./baseline.vmfb") # Dynamic paths base_dir: Path = field(init=False) @@ -260,7 +260,7 @@ def generate_sample_result( @dataclass -class UnetBenchmarkResult: +class ModelBenchmarkResult: result_str: Optional[str] = None def get_tokens(self) -> list[str]: @@ -1155,7 +1155,7 @@ def parse_grouped_benchmark_results( if result_str is None: continue - res = UnetBenchmarkResult(result_str) + res = ModelBenchmarkResult(result_str) device_id = res.get_device_id() # Record baseline benchmarking result. 
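The ModelBenchmarkResult class renamed in this patch is pinned down by the assertions in tuning/test_autotune.py (updated in the patches that follow). As a rough sketch of the parsing contract those assertions imply -- whitespace tokenization of the benchmark output line and a "<name>_<id>.vmfb" naming scheme -- the class and method bodies below are illustrative assumptions only, not the actual implementation in autotune.py:

from dataclasses import dataclass
from pathlib import Path
from typing import Optional


@dataclass
class ModelBenchmarkResultSketch:
    # Illustrative stand-in for ModelBenchmarkResult; not the autotune.py code.
    result_str: Optional[str] = None

    def get_tokens(self) -> list[str]:
        # Non-string or empty benchmark output yields no tokens.
        if not isinstance(self.result_str, str):
            return []
        return self.result_str.split()

    def get_model_candidate_path(self) -> Optional[str]:
        # Expected shape: "Benchmarking: <path>.vmfb on device <id>
        #                  BM_.../real_time_median <t> ms <cpu> ms <n> items_per_second=..."
        tokens = self.get_tokens()
        return tokens[1] if len(tokens) > 1 else None

    def get_candidate_id(self) -> Optional[int]:
        path = self.get_model_candidate_path()
        if path is None:
            return None
        suffix = Path(path).stem.split("_")[-1]
        # baseline.vmfb carries no trailing numeric id, so it maps to None.
        return int(suffix) if suffix.isdigit() else None

    def get_device_id(self) -> Optional[int]:
        tokens = self.get_tokens()
        return int(tokens[4]) if len(tokens) > 4 else None

    def get_benchmark_time(self) -> Optional[float]:
        # The median time immediately follows the BM_* counter name.
        tokens = self.get_tokens()
        return float(tokens[6]) if len(tokens) > 6 else None


if __name__ == "__main__":
    # Usage mirroring the expectations in the test suite.
    res = ModelBenchmarkResultSketch(
        "Benchmarking: unet_candidate_12.vmfb on device 24\n"
        "BM_main/process_time/real_time_median 182 ms 183 ms 5 items_per_second=5.50302/s"
    )
    assert res.get_model_candidate_path() == "unet_candidate_12.vmfb"
    assert res.get_candidate_id() == 12
    assert res.get_device_id() == 24
    assert res.get_benchmark_time() == 182.0
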
@@ -1224,7 +1224,7 @@ def generate_dryrun_unet_benchmark_results( task_result = subprocess.CompletedProcess( args=[""], returncode=0, - stdout=UnetBenchmarkResult().generate_sample_result( + stdout=ModelBenchmarkResult().generate_sample_result( candidate_vmfb_path_str=candidate_vmfb_path.as_posix(), device_id=device_id, t1=start, From 9e5843fc36b6aa077c2cf120b26b39c5a09301e5 Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Mon, 19 Aug 2024 11:21:29 -0500 Subject: [PATCH 13/20] Apply change in test --- tuning/test_autotune.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tuning/test_autotune.py b/tuning/test_autotune.py index 36597cc..9fe4320 100644 --- a/tuning/test_autotune.py +++ b/tuning/test_autotune.py @@ -109,7 +109,7 @@ def test_DispatchBenchmarkResult_get(): def test_UnetBenchmarkResult_get(): normal_str = "Benchmarking: unet_candidate_12.vmfb on device 24\nBM_main/process_time/real_time_median 182 ms 183 ms 5 items_per_second=5.50302/s" - res = autotune.UnetBenchmarkResult(normal_str) + res = autotune.ModelBenchmarkResult(normal_str) assert res.result_str == normal_str assert res.get_tokens() == [ "Benchmarking:", @@ -131,7 +131,7 @@ def test_UnetBenchmarkResult_get(): assert res.get_benchmark_time() == 182.0 incomplete_str = "Benchmarking: unet_baseline.vmfb on device 24\n" - res = autotune.UnetBenchmarkResult(incomplete_str) + res = autotune.ModelBenchmarkResult(incomplete_str) assert res.get_tokens() == [ "Benchmarking:", "unet_baseline.vmfb", @@ -144,7 +144,7 @@ def test_UnetBenchmarkResult_get(): assert res.get_device_id() == 24 assert res.get_benchmark_time() == None incomplete_str = "" - res = autotune.UnetBenchmarkResult(incomplete_str) + res = autotune.ModelBenchmarkResult(incomplete_str) assert res.get_tokens() == [] assert res.get_model_candidate_path() == None assert res.get_candidate_id() == None @@ -152,7 +152,7 @@ def test_UnetBenchmarkResult_get(): assert res.get_benchmark_time() == None bad_str = 12345 - res = autotune.UnetBenchmarkResult(bad_str) + res = autotune.ModelBenchmarkResult(bad_str) assert res.get_tokens() == [] assert res.get_model_candidate_path() == None assert res.get_candidate_id() == None @@ -166,7 +166,7 @@ def test_generate_sample_result(): expected = f"1\tMean Time: 3.1\n" assert output == expected, "DispatchBenchmarkResult generates invalid sample string" - res = autotune.UnetBenchmarkResult() + res = autotune.ModelBenchmarkResult() output = res.generate_sample_result( 1, "some_dir/tuning_2024_07_24_20_06/unet_candidate_60.vmfb.vmfb", 576.89 ) @@ -179,7 +179,7 @@ def test_UnetBenchmarkResult_get_calibrated_result_str(): res_time = 304 result_str = f"Benchmarking: tuning_2024_07_22_16_29/unet_candidate_16.vmfb on device 0\nBM_run_forward/process_time/real_time_median {float(res_time)} ms 305 ms 5 items_per_second=1.520000/s" change = (res_time - baseline_time) / baseline_time - output_str = autotune.UnetBenchmarkResult(result_str).get_calibrated_result_str( + output_str = autotune.ModelBenchmarkResult(result_str).get_calibrated_result_str( change ) expect_str = f"Benchmarking: tuning_2024_07_22_16_29/unet_candidate_16.vmfb on device 0\nBM_run_forward/process_time/real_time_median\t {float(res_time)} ms (-28.132%)\t 305 ms\t 5 items_per_second=1.520000/s" @@ -189,7 +189,7 @@ def test_UnetBenchmarkResult_get_calibrated_result_str(): res_time = 218 result_str = f"Benchmarking: tuning_2024_07_22_16_29/unet_candidate_16.vmfb on device 0\nBM_run_forward/process_time/real_time_median {float(res_time)} ms 305 ms 5 
items_per_second=1.520000/s" change = (res_time - baseline_time) / baseline_time - output_str = autotune.UnetBenchmarkResult(result_str).get_calibrated_result_str( + output_str = autotune.ModelBenchmarkResult(result_str).get_calibrated_result_str( change ) expect_str = f"Benchmarking: tuning_2024_07_22_16_29/unet_candidate_16.vmfb on device 0\nBM_run_forward/process_time/real_time_median\t {float(res_time)} ms (+0.000%)\t 305 ms\t 5 items_per_second=1.520000/s" @@ -199,7 +199,7 @@ def test_UnetBenchmarkResult_get_calibrated_result_str(): res_time = 345 result_str = f"Benchmarking: tuning_2024_07_22_16_29/unet_candidate_16.vmfb on device 0\nBM_run_forward/process_time/real_time_median {float(res_time)} ms 305 ms 5 items_per_second=1.520000/s" change = (res_time - baseline_time) / baseline_time - output_str = autotune.UnetBenchmarkResult(result_str).get_calibrated_result_str( + output_str = autotune.ModelBenchmarkResult(result_str).get_calibrated_result_str( change ) expect_str = f"Benchmarking: tuning_2024_07_22_16_29/unet_candidate_16.vmfb on device 0\nBM_run_forward/process_time/real_time_median\t {float(res_time)} ms (+180.488%)\t 305 ms\t 5 items_per_second=1.520000/s" From f1a8c6c5f1df30e01bcae6f116b49e2f7bf56573 Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Mon, 19 Aug 2024 11:25:13 -0500 Subject: [PATCH 14/20] Fix name strings --- tuning/test_autotune.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tuning/test_autotune.py b/tuning/test_autotune.py index 9fe4320..6f6ab69 100644 --- a/tuning/test_autotune.py +++ b/tuning/test_autotune.py @@ -130,16 +130,16 @@ def test_UnetBenchmarkResult_get(): assert res.get_device_id() == 24 assert res.get_benchmark_time() == 182.0 - incomplete_str = "Benchmarking: unet_baseline.vmfb on device 24\n" + incomplete_str = "Benchmarking: baseline.vmfb on device 24\n" res = autotune.ModelBenchmarkResult(incomplete_str) assert res.get_tokens() == [ "Benchmarking:", - "unet_baseline.vmfb", + "baseline.vmfb", "on", "device", "24", ] - assert res.get_model_candidate_path() == "unet_baseline.vmfb" + assert res.get_model_candidate_path() == "baseline.vmfb" assert res.get_candidate_id() == None assert res.get_device_id() == 24 assert res.get_benchmark_time() == None @@ -174,7 +174,7 @@ def test_generate_sample_result(): assert output == expected, "UnetBenchmarkResult generates invalid sample string" -def test_UnetBenchmarkResult_get_calibrated_result_str(): +def test_ModelBenchmarkResult_get_calibrated_result_str(): baseline_time = 423 res_time = 304 result_str = f"Benchmarking: tuning_2024_07_22_16_29/unet_candidate_16.vmfb on device 0\nBM_run_forward/process_time/real_time_median {float(res_time)} ms 305 ms 5 items_per_second=1.520000/s" @@ -303,8 +303,8 @@ def set_tracker( tracker.baseline_benchmark_time = baseline_benchmark_time tracker.calibrated_benchmark_diff = calibrated_benchmark_diff - b1 = "Benchmarking: some_dir/unet_baseline.vmfb on device 0 BM_main/process_time/real_time_median 60.7 ms 13.5 ms 5 items_per_second=16.4733/s" - b2 = "Benchmarking: unet_baseline.vmfb on device 1 BM_main/process_time/real_time_median 59.8 ms 15.1 ms 5 items_per_second=16.7114/s" + b1 = "Benchmarking: some_dir/baseline.vmfb on device 0 BM_main/process_time/real_time_median 60.7 ms 13.5 ms 5 items_per_second=16.4733/s" + b2 = "Benchmarking: baseline.vmfb on device 1 BM_main/process_time/real_time_median 59.8 ms 15.1 ms 5 items_per_second=16.7114/s" s1 = "Benchmarking: unet_candidate_1.vmfb on device 0 
BM_main/process_time/real_time_median 62.4 ms 15.4 ms 5 items_per_second=16.0223/s" s2 = "Benchmarking: some_dir/unet_candidate_2.vmfb on device 1 BM_main/process_time/real_time_median 61.4 ms 11.0 ms 5 items_per_second=16.2958/s" s3 = "Benchmarking: unet_candidate_4.vmfb on device 1 BM_main/process_time/real_time_median 57.4 ms 11.0 ms 5 items_per_second=16.2958/s" @@ -330,11 +330,11 @@ def set_tracker( set_tracker(expect_candidate_trackers[4], 57.4, 1, 59.8, -0.04013377926421403) expect_dump_list = [ - "Benchmarking: some_dir/unet_baseline.vmfb on device 0 " + "Benchmarking: some_dir/baseline.vmfb on device 0 " "BM_main/process_time/real_time_median 60.7 ms 13.5 ms 5 items_per_second=16.4733/s", "Benchmarking: unet_candidate_1.vmfb on device 0 " "BM_main/process_time/real_time_median 62.4 ms (+2.801%) 15.4 ms 5 items_per_second=16.0223/s", - "Benchmarking: unet_baseline.vmfb on device 1 " + "Benchmarking: baseline.vmfb on device 1 " "BM_main/process_time/real_time_median 59.8 ms 15.1 ms 5 items_per_second=16.7114/s", "Benchmarking: unet_candidate_4.vmfb on device 1 " "BM_main/process_time/real_time_median 57.4 ms (-4.013%) 11.0 ms 5 items_per_second=16.2958/s", @@ -354,7 +354,7 @@ def set_tracker( candidate_trackers == expect_candidate_trackers ), "candidate_trackers did not change as expected" - b1 = "Benchmarking: unet_baseline.vmfb on device 0" + b1 = "Benchmarking: baseline.vmfb on device 0" s1 = "Benchmarking: unet_candidate_1.vmfb on device 0 BM_main/process_time/real_time_median 62.4 ms 15.4 ms 5 items_per_second=16.0223/s" grouped_benchmark_results = [[generate_res(b1, 0), generate_res(s1, 0)]] dump_list = autotune.parse_grouped_benchmark_results( @@ -363,11 +363,11 @@ def set_tracker( expect_dump_list = [ "Benchmarking: unet_candidate_1.vmfb on device 0 " "BM_main/process_time/real_time_median 62.4 ms 15.4 ms 5 items_per_second=16.0223/s", - "Benchmarking result of unet_baseline.vmfb on deivce 0 is incomplete\n", + "Benchmarking result of baseline.vmfb on deivce 0 is incomplete\n", ] assert dump_list == expect_dump_list, "fail to parse incomplete baselines" - b1 = "Benchmarking: some_dir/unet_baseline.vmfb on device 0 BM_main/process_time/real_time_median 60.7 ms 13.5 ms 5 items_per_second=16.4733/s" + b1 = "Benchmarking: some_dir/baseline.vmfb on device 0 BM_main/process_time/real_time_median 60.7 ms 13.5 ms 5 items_per_second=16.4733/s" s1 = "Benchmarking: unet_candidate_1.vmfb on device 0" grouped_benchmark_results = [[generate_res(b1, 0), generate_res(s1, 0)]] candidate_trackers[1].model_path = "unet_candidate_1.vmfb" @@ -375,13 +375,13 @@ def set_tracker( path_config, grouped_benchmark_results, candidate_trackers ) expect_dump_list = [ - "Benchmarking: some_dir/unet_baseline.vmfb on device 0 " + "Benchmarking: some_dir/baseline.vmfb on device 0 " "BM_main/process_time/real_time_median 60.7 ms 13.5 ms 5 items_per_second=16.4733/s", "Benchmarking result of unet_candidate_1.vmfb on deivce 0 is incomplete\n", ] assert dump_list == expect_dump_list, "fail to parse incomplete candidates" - b1 = "Benchmarking: unet_baseline.vmfb on device 0" + b1 = "Benchmarking: baseline.vmfb on device 0" s1 = "Benchmarking: unet_candidate_1.vmfb on device 0" grouped_benchmark_results = [[generate_res(b1, 0), generate_res(s1, 0)]] candidate_trackers[1].model_path = "unet_candidate_1.vmfb" @@ -389,7 +389,7 @@ def set_tracker( path_config, grouped_benchmark_results, candidate_trackers ) expect_dump_list = [ - "Benchmarking result of unet_baseline.vmfb on deivce 0 is incomplete\n", + "Benchmarking 
result of baseline.vmfb on deivce 0 is incomplete\n", "Benchmarking result of unet_candidate_1.vmfb on deivce 0 is incomplete\n", ] assert ( From ae05cd3f22534a31860901f421e7d13123b9f1f7 Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Mon, 19 Aug 2024 11:29:33 -0500 Subject: [PATCH 15/20] Rename 'unet' var --- tuning/test_autotune.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tuning/test_autotune.py b/tuning/test_autotune.py index 6f6ab69..2a7adfe 100644 --- a/tuning/test_autotune.py +++ b/tuning/test_autotune.py @@ -107,7 +107,7 @@ def test_DispatchBenchmarkResult_get(): assert res.get_benchmark_time() == None -def test_UnetBenchmarkResult_get(): +def test_ModelBenchmarkResult_get(): normal_str = "Benchmarking: unet_candidate_12.vmfb on device 24\nBM_main/process_time/real_time_median 182 ms 183 ms 5 items_per_second=5.50302/s" res = autotune.ModelBenchmarkResult(normal_str) assert res.result_str == normal_str @@ -293,13 +293,13 @@ def generate_res(stdout: str, device_id: int) -> autotune.TaskResult: def set_tracker( tracker: autotune.CandidateTracker, - unet_benchmark_time: float, - unet_benchmark_device_id: int, + model_benchmark_time: float, + model_benchmark_device_id: int, baseline_benchmark_time: float, calibrated_benchmark_diff=float, ): - tracker.model_benchmark_time = unet_benchmark_time - tracker.model_benchmark_device_id = unet_benchmark_device_id + tracker.model_benchmark_time = model_benchmark_time + tracker.model_benchmark_device_id = model_benchmark_device_id tracker.baseline_benchmark_time = baseline_benchmark_time tracker.calibrated_benchmark_diff = calibrated_benchmark_diff From d420c46c0c02b8ce8d6a5f70ba7f95537124f963 Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Mon, 19 Aug 2024 12:08:03 -0500 Subject: [PATCH 16/20] Complete tuning process in punet_autotune.py --- tuning/autotune.py | 33 +++++++------------------------- tuning/punet_autotune.py | 41 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 26 deletions(-) diff --git a/tuning/autotune.py b/tuning/autotune.py index 3b97eae..7a94d5d 100755 --- a/tuning/autotune.py +++ b/tuning/autotune.py @@ -178,27 +178,6 @@ def get_model_benchmark_command(self, candidate_tracker) -> list[str]: pass -@dataclass -class DefaultTuningClient(TuningClient): - def get_dispatch_compile_command( - self, candidate_tracker: CandidateTracker - ) -> list[str]: - command = [""] - return command - - def get_dispatch_benchmark_command(self, candidate_tracker) -> list[str]: - command = [""] - return command - - def get_model_compile_command(self, candidate_tracker) -> list[str]: - command = [""] - return command - - def get_model_benchmark_command(self, candidate_tracker) -> list[str]: - command = [""] - return command - - @dataclass class TaskTuple: args: argparse.Namespace @@ -724,6 +703,11 @@ def load_pickle(file_path: Path) -> list[Any]: return loaded_array +def save_pickle(file_path: Path, input_list: list[Any]) -> None: + with open(file_path, "wb") as file: + pickle.dump(input_list, file) + + def append_to_file(lines: list[str], filepath: Path, title: str = "") -> None: """Appends new content to the end of the output.log.""" title_str = "=" * 5 + f" {title} " + "=" * 5 + "\n" if title != "" else "" @@ -1359,13 +1343,12 @@ def summerize_top_candidates( file.writelines(dump_list) -def autotune(args: argparse.Namespace) -> None: +def autotune(args: argparse.Namespace, tuning_client: TuningClient) -> None: path_config = PathConfig() path_config.base_dir.mkdir(parents=True, 
exist_ok=True) path_config.output_unilog.touch() candidate_trackers: list[CandidateTracker] = [] - tuning_client = DefaultTuningClient() stop_after_phase: str = args.stop_after print("Setup logging") @@ -1397,7 +1380,6 @@ def autotune(args: argparse.Namespace) -> None: args, path_config, compiled_candidates, candidate_trackers, tuning_client ) print(f"Stored results in {path_config.output_unilog}\n") - if stop_after_phase == ExecutionPhases.benchmark_dispatches: return @@ -1420,8 +1402,7 @@ def autotune(args: argparse.Namespace) -> None: summerize_top_candidates(path_config, candidate_trackers) print(f"Stored top candidates info in {path_config.result_summary_log}\n") - with open(path_config.candidate_trackers_pkl, "wb") as file: - pickle.dump(candidate_trackers, file) + save_pickle(path_config.candidate_trackers_pkl, candidate_trackers) print(f"Candidate trackers are saved in {path_config.candidate_trackers_pkl}\n") print("Check the detailed execution logs in:") diff --git a/tuning/punet_autotune.py b/tuning/punet_autotune.py index 4de85ff..ede9758 100644 --- a/tuning/punet_autotune.py +++ b/tuning/punet_autotune.py @@ -66,28 +66,69 @@ def main(): path_config.output_unilog.touch() candidate_trackers: list[autotune.CandidateTracker] = [] punet_client = PunetClient() + stop_after_phase: str = args.stop_after + print("Setup logging") autotune.setup_logging(args, path_config) + print(path_config.run_log, end="\n\n") + print("Validating devices") + autotune.validate_devices(args.devices) + print("Validation successful!\n") + + print("Generating candidates...") candidates = autotune.generate_candidates( args, path_config, candidate_trackers, punet_client ) + print(f"Generated [{len(candidates)}] candidates in {path_config.candidates_dir}\n") + if stop_after_phase == autotune.ExecutionPhases.generate_candidates: + return + print("Compiling candidates...") compiled_candidates = autotune.compile_dispatches( args, path_config, candidates, candidate_trackers, punet_client ) + print(f"Compiled files are stored in {path_config.compiled_dir}\n") + if stop_after_phase == autotune.ExecutionPhases.compile_dispatches: + return + print("Benchmarking compiled candidates...") top_candidates = autotune.benchmark_dispatches( args, path_config, compiled_candidates, candidate_trackers, punet_client ) + print(f"Stored results in {path_config.output_unilog}\n") + if stop_after_phase == ExecutionPhases.benchmark_dispatches: + return + print(f"Compiling top model candidates...") punet_candidates = autotune.compile_models( args, path_config, top_candidates, candidate_trackers, punet_client ) + print(f"Model candidates compiled in {path_config.base_dir}\n") + if stop_after_phase == autotune.ExecutionPhases.compile_models: + return + print("Benchmarking model candidates...") autotune.benchmark_models( args, path_config, punet_candidates, candidate_trackers, punet_client ) + print(f"Stored results in {path_config.output_unilog}") + if stop_after_phase == autotune.ExecutionPhases.benchmark_models: + return + + autotune.summerize_top_candidates(path_config, candidate_trackers) + print(f"Stored top candidates info in {path_config.result_summary_log}\n") + + autotune.save_pickle(path_config.candidate_trackers_pkl, candidate_trackers) + print(f"Candidate trackers are saved in {path_config.candidate_trackers_pkl}\n") + + print("Check the detailed execution logs in:") + print(path_config.run_log) + + for candidate in candidate_trackers: + autotune.logging.debug(candidate) + if args.verbose: + print(candidate) if __name__ == 
"__main__": From cd5ae2ab7bddaeca8a795b0510787996229bf5eb Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Mon, 19 Aug 2024 12:43:52 -0500 Subject: [PATCH 17/20] Remove extra dataclass labels --- tuning/autotune.py | 1 - tuning/punet_autotune.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tuning/autotune.py b/tuning/autotune.py index 7a94d5d..f7b4b0b 100755 --- a/tuning/autotune.py +++ b/tuning/autotune.py @@ -157,7 +157,6 @@ def get_compiled_model_index(self, file_path: Path) -> int: return int(file_path.stem.split("_")[-1]) -@dataclass class TuningClient(ABC): @abstractmethod def get_dispatch_compile_command( diff --git a/tuning/punet_autotune.py b/tuning/punet_autotune.py index ede9758..ca3b878 100644 --- a/tuning/punet_autotune.py +++ b/tuning/punet_autotune.py @@ -5,11 +5,9 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import autotune -from dataclasses import dataclass from pathlib import Path -@dataclass class PunetClient(autotune.TuningClient): def get_dispatch_compile_command( @@ -97,7 +95,7 @@ def main(): args, path_config, compiled_candidates, candidate_trackers, punet_client ) print(f"Stored results in {path_config.output_unilog}\n") - if stop_after_phase == ExecutionPhases.benchmark_dispatches: + if stop_after_phase == autotune.ExecutionPhases.benchmark_dispatches: return print(f"Compiling top model candidates...") From 3553a8909133c402381f5bb8513c9e1aec9acc5e Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Mon, 19 Aug 2024 12:50:45 -0500 Subject: [PATCH 18/20] Remove main in autotune. Add sample usage in punet_autotune.py --- tuning/autotune.py | 97 ---------------------------------------- tuning/punet_autotune.py | 18 ++++++++ 2 files changed, 18 insertions(+), 97 deletions(-) diff --git a/tuning/autotune.py b/tuning/autotune.py index f7b4b0b..b4823d7 100755 --- a/tuning/autotune.py +++ b/tuning/autotune.py @@ -27,24 +27,6 @@ from abc import ABC, abstractmethod -""" -Sample Usage: - -python autotune.py winograd 1286.mlir --lhs-dims=bmk --rhs-dims=bkn --tile-dims=*mnk --devices=1,3,5 --num-candidates=64 - - -Recommended Trial Run: - -python autotune.py winograd 1286.mlir --num-candidates=1 - - -Dry Run Test (no gpu requried): - -python autotune.py winograd 1286.mlir --num-candidates=64 --num-model-candidates=10 --dry-run - -""" - - # Default values for num_candidates and devices, change it as needed DEFAULT_NUM_CANDIDATES = 2048 DEFAULT_DEVICE_LIST = ["hip://0"] @@ -1340,82 +1322,3 @@ def summerize_top_candidates( with open(path_config.result_summary_log, "w") as file: file.writelines(dump_list) - - -def autotune(args: argparse.Namespace, tuning_client: TuningClient) -> None: - path_config = PathConfig() - path_config.base_dir.mkdir(parents=True, exist_ok=True) - path_config.output_unilog.touch() - - candidate_trackers: list[CandidateTracker] = [] - stop_after_phase: str = args.stop_after - - print("Setup logging") - setup_logging(args, path_config) - print(path_config.run_log, end="\n\n") - - print("Validating devices") - validate_devices(args.devices) - print("Validation successful!\n") - - print("Generating candidates...") - candidates = generate_candidates( - args, path_config, candidate_trackers, tuning_client - ) - print(f"Generated [{len(candidates)}] candidates in {path_config.candidates_dir}\n") - if stop_after_phase == ExecutionPhases.generate_candidates: - return - - print("Compiling candidates...") - compiled_candidates = compile_dispatches( - args, path_config, candidates, candidate_trackers, tuning_client - ) - print(f"Compiled 
files are stored in {path_config.compiled_dir}\n") - if stop_after_phase == ExecutionPhases.compile_dispatches: - return - - print("Benchmarking compiled candidates...") - top_candidates = benchmark_dispatches( - args, path_config, compiled_candidates, candidate_trackers, tuning_client - ) - print(f"Stored results in {path_config.output_unilog}\n") - if stop_after_phase == ExecutionPhases.benchmark_dispatches: - return - - print(f"Compiling top model candidates...") - model_candidates = compile_models( - args, path_config, top_candidates, candidate_trackers, tuning_client - ) - print(f"Model candidates compiled in {path_config.base_dir}\n") - if stop_after_phase == ExecutionPhases.compile_models: - return - - print("Benchmarking model candidates...") - benchmark_models( - args, path_config, model_candidates, candidate_trackers, tuning_client - ) - print(f"Stored results in {path_config.output_unilog}") - if stop_after_phase == ExecutionPhases.benchmark_models: - return - - summerize_top_candidates(path_config, candidate_trackers) - print(f"Stored top candidates info in {path_config.result_summary_log}\n") - - save_pickle(path_config.candidate_trackers_pkl, candidate_trackers) - print(f"Candidate trackers are saved in {path_config.candidate_trackers_pkl}\n") - - print("Check the detailed execution logs in:") - print(path_config.run_log) - - for candidate in candidate_trackers: - logging.debug(candidate) - if args.verbose: - print(candidate) - - -def main(): - autotune(parse_arguments()) - - -if __name__ == "__main__": - main() diff --git a/tuning/punet_autotune.py b/tuning/punet_autotune.py index ca3b878..495ecbc 100644 --- a/tuning/punet_autotune.py +++ b/tuning/punet_autotune.py @@ -8,6 +8,24 @@ from pathlib import Path +""" +Sample Usage: + +python punet_autotune.py winograd 1286.mlir --lhs-dims=bmk --rhs-dims=bkn --tile-dims=*mnk --devices=hip://0,hip://1 --num-candidates=64 + + +Recommended Trial Run: + +python punet_autotune.py winograd 1286.mlir --num-candidates=1 + + +Dry Run Test (no gpu requried): + +python punet_autotune.py winograd 1286.mlir --num-candidates=64 --num-model-candidates=10 --dry-run + +""" + + class PunetClient(autotune.TuningClient): def get_dispatch_compile_command( From 41f8b7d2338c9f20548888d72299cd3f9a3e82f1 Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Mon, 19 Aug 2024 13:04:10 -0500 Subject: [PATCH 19/20] Change filenames --- tuning/{tune.py => candidate_gen.py} | 2 +- tuning/{autotune.py => libtuner.py} | 20 +- tuning/punet_autotune.py | 48 +- .../{test_tune.py => test_candidate_gen.py} | 424 +++++++++--------- tuning/{test_autotune.py => test_libtuner.py} | 132 +++--- 5 files changed, 313 insertions(+), 313 deletions(-) rename tuning/{tune.py => candidate_gen.py} (99%) rename tuning/{autotune.py => libtuner.py} (98%) rename tuning/{test_tune.py => test_candidate_gen.py} (54%) rename tuning/{test_autotune.py => test_libtuner.py} (82%) diff --git a/tuning/tune.py b/tuning/candidate_gen.py similarity index 99% rename from tuning/tune.py rename to tuning/candidate_gen.py index 5ae981b..02f8eb9 100755 --- a/tuning/tune.py +++ b/tuning/candidate_gen.py @@ -24,7 +24,7 @@ from iree.compiler.dialects import _linalg_ops_gen, _util_ops_gen """ -Usage: ./tune.py 121.mlir -o "tuning/candidates" -l 1024 --lhs-dims=mk --rhs-dims=nk --tile-dims=mnk +Usage: ./candidate_gen.py 121.mlir -o "tuning/candidates" -l 1024 --lhs-dims=mk --rhs-dims=nk --tile-dims=mnk """ tune_logger = logging.getLogger("tune") diff --git a/tuning/autotune.py b/tuning/libtuner.py similarity index 
98% rename from tuning/autotune.py rename to tuning/libtuner.py index b4823d7..8661354 100755 --- a/tuning/autotune.py +++ b/tuning/libtuner.py @@ -15,7 +15,7 @@ import time import multiprocessing import queue -import tune +import candidate_gen from tqdm import tqdm import re import hashlib @@ -48,7 +48,7 @@ class CandidateTracker: candidate_id: int dispatch_mlir_path: Optional[Path] = None dispatch_config_path: Optional[Path] = None - configuration: Optional[tune.Configuration] = None + configuration: Optional[candidate_gen.Configuration] = None compilation_successful: Optional[bool] = None compiled_dispatch_path: Optional[Path] = None compiled_dispatch_hash: Optional[str] = None @@ -408,12 +408,12 @@ def parse_arguments() -> argparse.Namespace: help="Do not attempt to run any modules or initialize the IREE runtime", ) - # tune.tune() options + # candidate_gen.tune() options parser.add_argument( "--num-candidates", type=int, default=DEFAULT_NUM_CANDIDATES, - help=f"Number of candidates to be generated by tune.py (default: {DEFAULT_NUM_CANDIDATES})", + help=f"Number of candidates to be generated by candidate_gen.py (default: {DEFAULT_NUM_CANDIDATES})", ) parser.add_argument( "--num-subgroups", @@ -477,7 +477,7 @@ def format(self, record): verbose_console_handler.setFormatter(file_formatter) logging.getLogger().addHandler(verbose_console_handler) - # config logger in tune.py + # config logger in candidate_gen.py tune_logger = logging.getLogger("tune") tune_logger.setLevel(logging.DEBUG) @@ -725,8 +725,8 @@ def generate_candidates( mlirs = [] try: - logging.debug("Captured messages from tune.py:") - tune.tune( + logging.debug("Captured messages from candidate_gen.py:") + candidate_gen.tune( input=str(path_config.template_mlir), output=str(path_config.candidates_dir), limit=args.num_candidates, @@ -740,14 +740,14 @@ def generate_candidates( ) except Exception as e: logging.error("An error occurred during candidates generation: %s", str(e)) - # Capture and log debug messages from tune.py + # Capture and log debug messages from candidate_gen.py tune_logger = logging.getLogger("tune") for handler in logging.getLogger().handlers: if isinstance(handler, logging.FileHandler): tune_logger.handlers.append(handler) - tune_logger.exception("Error in tune.py:") + tune_logger.exception("Error in candidate_gen.py:") raise - logging.debug("tune.py ends") + logging.debug("candidate_gen.py ends") candidate_configs = load_pickle(path_config.candidate_configs_pkl) candidate_configs.insert(0, None) # No Configuration class for 0.mlir diff --git a/tuning/punet_autotune.py b/tuning/punet_autotune.py index 495ecbc..014c76d 100644 --- a/tuning/punet_autotune.py +++ b/tuning/punet_autotune.py @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import autotune +import libtuner from pathlib import Path @@ -26,10 +26,10 @@ """ -class PunetClient(autotune.TuningClient): +class PunetClient(libtuner.TuningClient): def get_dispatch_compile_command( - self, candidate_tracker: autotune.CandidateTracker + self, candidate_tracker: libtuner.CandidateTracker ) -> list[str]: mlir_path = candidate_tracker.dispatch_mlir_path assert mlir_path is not None @@ -41,7 +41,7 @@ def get_dispatch_compile_command( return command def get_dispatch_benchmark_command( - self, candidate_tracker: autotune.CandidateTracker + self, candidate_tracker: libtuner.CandidateTracker ) -> list[str]: compiled_vmfb_path = candidate_tracker.compiled_dispatch_path assert compiled_vmfb_path is not None @@ -52,7 +52,7 @@ def get_dispatch_benchmark_command( return command def get_model_compile_command( - self, candidate_tracker: autotune.CandidateTracker + self, candidate_tracker: libtuner.CandidateTracker ) -> list[str]: mlir_spec_path = candidate_tracker.spec_path assert mlir_spec_path is not None @@ -64,7 +64,7 @@ def get_model_compile_command( return command def get_model_benchmark_command( - self, candidate_tracker: autotune.CandidateTracker + self, candidate_tracker: libtuner.CandidateTracker ) -> list[str]: unet_candidate_path = candidate_tracker.model_path assert unet_candidate_path is not None @@ -76,73 +76,73 @@ def get_model_benchmark_command( def main(): - args = autotune.parse_arguments() - path_config = autotune.PathConfig() + args = libtuner.parse_arguments() + path_config = libtuner.PathConfig() path_config.base_dir.mkdir(parents=True, exist_ok=True) path_config.output_unilog.touch() - candidate_trackers: list[autotune.CandidateTracker] = [] + candidate_trackers: list[libtuner.CandidateTracker] = [] punet_client = PunetClient() stop_after_phase: str = args.stop_after print("Setup logging") - autotune.setup_logging(args, path_config) + libtuner.setup_logging(args, path_config) print(path_config.run_log, end="\n\n") print("Validating devices") - autotune.validate_devices(args.devices) + libtuner.validate_devices(args.devices) print("Validation successful!\n") print("Generating candidates...") - candidates = autotune.generate_candidates( + candidates = libtuner.generate_candidates( args, path_config, candidate_trackers, punet_client ) print(f"Generated [{len(candidates)}] candidates in {path_config.candidates_dir}\n") - if stop_after_phase == autotune.ExecutionPhases.generate_candidates: + if stop_after_phase == libtuner.ExecutionPhases.generate_candidates: return print("Compiling candidates...") - compiled_candidates = autotune.compile_dispatches( + compiled_candidates = libtuner.compile_dispatches( args, path_config, candidates, candidate_trackers, punet_client ) print(f"Compiled files are stored in {path_config.compiled_dir}\n") - if stop_after_phase == autotune.ExecutionPhases.compile_dispatches: + if stop_after_phase == libtuner.ExecutionPhases.compile_dispatches: return print("Benchmarking compiled candidates...") - top_candidates = autotune.benchmark_dispatches( + top_candidates = libtuner.benchmark_dispatches( args, path_config, compiled_candidates, candidate_trackers, punet_client ) print(f"Stored results in {path_config.output_unilog}\n") - if stop_after_phase == autotune.ExecutionPhases.benchmark_dispatches: + if stop_after_phase == libtuner.ExecutionPhases.benchmark_dispatches: return print(f"Compiling top model candidates...") - punet_candidates = autotune.compile_models( + punet_candidates = 
libtuner.compile_models( args, path_config, top_candidates, candidate_trackers, punet_client ) print(f"Model candidates compiled in {path_config.base_dir}\n") - if stop_after_phase == autotune.ExecutionPhases.compile_models: + if stop_after_phase == libtuner.ExecutionPhases.compile_models: return print("Benchmarking model candidates...") - autotune.benchmark_models( + libtuner.benchmark_models( args, path_config, punet_candidates, candidate_trackers, punet_client ) print(f"Stored results in {path_config.output_unilog}") - if stop_after_phase == autotune.ExecutionPhases.benchmark_models: + if stop_after_phase == libtuner.ExecutionPhases.benchmark_models: return - autotune.summerize_top_candidates(path_config, candidate_trackers) + libtuner.summerize_top_candidates(path_config, candidate_trackers) print(f"Stored top candidates info in {path_config.result_summary_log}\n") - autotune.save_pickle(path_config.candidate_trackers_pkl, candidate_trackers) + libtuner.save_pickle(path_config.candidate_trackers_pkl, candidate_trackers) print(f"Candidate trackers are saved in {path_config.candidate_trackers_pkl}\n") print("Check the detailed execution logs in:") print(path_config.run_log) for candidate in candidate_trackers: - autotune.logging.debug(candidate) + libtuner.logging.debug(candidate) if args.verbose: print(candidate) diff --git a/tuning/test_tune.py b/tuning/test_candidate_gen.py similarity index 54% rename from tuning/test_tune.py rename to tuning/test_candidate_gen.py index 3e52863..5adca21 100644 --- a/tuning/test_tune.py +++ b/tuning/test_candidate_gen.py @@ -5,7 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import pytest -import tune +import candidate_gen """ Usage: python -m pytest test_tune.py @@ -13,30 +13,30 @@ def test_get_shaped_type_element_bitwidth(): - assert tune.ShapedType([1024, 2048], tune.ElementType.i8).bitwidth == 8 - assert tune.ShapedType([2048], tune.ElementType.i32).bitwidth == 32 - assert tune.ShapedType([2048, 512, 384], tune.ElementType.f8).bitwidth == 8 - assert tune.ShapedType([1, 1], tune.ElementType.f16).bitwidth == 16 + assert candidate_gen.ShapedType([1024, 2048], candidate_gen.ElementType.i8).bitwidth == 8 + assert candidate_gen.ShapedType([2048], candidate_gen.ElementType.i32).bitwidth == 32 + assert candidate_gen.ShapedType([2048, 512, 384], candidate_gen.ElementType.f8).bitwidth == 8 + assert candidate_gen.ShapedType([1, 1], candidate_gen.ElementType.f16).bitwidth == 16 def test_get_shaped_type_to_str(): - assert str(tune.ShapedType([1024, 2048], tune.ElementType.i8)) == "1024x2048xi8" - assert str(tune.ShapedType([1024], tune.ElementType.f32)) == "1024xf32" - assert str(tune.ShapedType([1, 2, 3], tune.ElementType.f16)) == "1x2x3xf16" - assert str(tune.ShapedType([-1, 2, 3], tune.ElementType.f16)) == "?x2x3xf16" + assert str(candidate_gen.ShapedType([1024, 2048], candidate_gen.ElementType.i8)) == "1024x2048xi8" + assert str(candidate_gen.ShapedType([1024], candidate_gen.ElementType.f32)) == "1024xf32" + assert str(candidate_gen.ShapedType([1, 2, 3], candidate_gen.ElementType.f16)) == "1x2x3xf16" + assert str(candidate_gen.ShapedType([-1, 2, 3], candidate_gen.ElementType.f16)) == "?x2x3xf16" def test_parse_tensor_type(): - assert tune.parse_tensor_type("tensor<1x2x3xf32>") == tune.ShapedType( - [1, 2, 3], tune.ElementType.f32 + assert candidate_gen.parse_tensor_type("tensor<1x2x3xf32>") == candidate_gen.ShapedType( + [1, 2, 3], candidate_gen.ElementType.f32 ) - assert tune.parse_tensor_type("tensor<123xi8>") == tune.ShapedType( - 
[123], tune.ElementType.i8 + assert candidate_gen.parse_tensor_type("tensor<123xi8>") == candidate_gen.ShapedType( + [123], candidate_gen.ElementType.i8 ) def test_get_mmt_tile_sizes(): - config = tune.Configuration( + config = candidate_gen.Configuration( subgroup_size=0, workgroup_size=[], intrinsic="", @@ -45,11 +45,11 @@ def test_get_mmt_tile_sizes(): subgroup_n_count=0, waves_per_eu=0, ) - assert tune.get_mmt_tile_sizes(config) == [128, 320, 32] + assert candidate_gen.get_mmt_tile_sizes(config) == [128, 320, 32] def test_get_conv_tile_sizes(): - config = tune.Configuration( + config = candidate_gen.Configuration( subgroup_size=64, workgroup_size=[256, 1, 1], intrinsic="#iree_gpu.mma_layout", @@ -58,11 +58,11 @@ def test_get_conv_tile_sizes(): subgroup_n_count=4, waves_per_eu=1, ) - assert tune.get_conv_tile_sizes(config) == [1, 1, 464, 320, 1, 1, 16] + assert candidate_gen.get_conv_tile_sizes(config) == [1, 1, 464, 320, 1, 1, 16] def test_get_contract_tile_sizes(): - config = tune.Configuration( + config = candidate_gen.Configuration( subgroup_size=32, workgroup_size=[16, 16, 1], intrinsic="", @@ -71,14 +71,14 @@ def test_get_contract_tile_sizes(): subgroup_n_count=1, waves_per_eu=2, ) - assert tune.get_contract_tile_sizes(config, ["m", "n", "k"]) == [4, 8, 16] - assert tune.get_contract_tile_sizes(config, ["n", "m", "k"]) == [8, 4, 16] - assert tune.get_contract_tile_sizes(config, ["k", "n", "m"]) == [16, 8, 4] - assert tune.get_contract_tile_sizes(config, ["k", "k", "k"]) == [16, 16, 16] + assert candidate_gen.get_contract_tile_sizes(config, ["m", "n", "k"]) == [4, 8, 16] + assert candidate_gen.get_contract_tile_sizes(config, ["n", "m", "k"]) == [8, 4, 16] + assert candidate_gen.get_contract_tile_sizes(config, ["k", "n", "m"]) == [16, 8, 4] + assert candidate_gen.get_contract_tile_sizes(config, ["k", "k", "k"]) == [16, 16, 16] def test_get_pipeline_config(): - config1 = tune.Configuration( + config1 = candidate_gen.Configuration( subgroup_size=32, workgroup_size=[16, 16, 1], intrinsic="", @@ -87,7 +87,7 @@ def test_get_pipeline_config(): subgroup_n_count=1, waves_per_eu=2, ) - config2 = tune.Configuration( + config2 = candidate_gen.Configuration( subgroup_size=32, workgroup_size=[16, 16, 1], intrinsic="", @@ -96,9 +96,9 @@ def test_get_pipeline_config(): subgroup_n_count=1, waves_per_eu=4, ) - assert tune.get_pipeline_config(config1) == ", prefetch_shared_memory" + assert candidate_gen.get_pipeline_config(config1) == ", prefetch_shared_memory" assert ( - tune.get_pipeline_config(config2) + candidate_gen.get_pipeline_config(config2) == ', prefetch_shared_memory, llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}' ) @@ -110,12 +110,12 @@ def test_get_shapes_mmt(): r'%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%13, %14 : tensor<2048x1280xf16>, tensor<1280x1280xf16>) outs(%19 : tensor<2048x1280xf32>) attrs = {lowering_config = #iree_codegen.lowering_config} {', r"^bb0(%in: f16, %in_0: f16, %out: f32):", ] - assert tune.get_shapes_mmt(template) == tune.ProblemSize( - tune.MatmulSize(2048, 1280, 1280), - tune.ShapedType([2048, 1280], tune.ElementType.f16), - tune.ShapedType([1280, 1280], tune.ElementType.f16), - tune.ShapedType([2048, 1280], tune.ElementType.f32), - tune.DispatchKind.mmt, + assert candidate_gen.get_shapes_mmt(template) == candidate_gen.ProblemSize( + candidate_gen.MatmulSize(2048, 1280, 1280), + 
candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([1280, 1280], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f32), + candidate_gen.DispatchKind.mmt, ) @@ -125,12 +125,12 @@ def test_get_shapes_conv(): r"%8 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%5, %6 : tensor<1x3x34x1280xf16>, tensor<3x3x1280x256xf16>) outs(%7 : tensor<1x1x32x256xf32>) -> tensor<1x1x32x256xf32>", r"flow.dispatch.tensor.store %8, %2, offsets = [%workgroup_id_z, %workgroup_id_y, 0, %3], sizes = [1, 1, 32, 256], strides = [1, 1, 1, 1] : tensor<1x1x32x256xf32> -> !flow.dispatch.tensor>", ] - assert tune.get_shapes_conv(template) == tune.ProblemSize( - tune.MatmulSize(32, 256, 11520), - tune.ShapedType([1, 3, 34, 1280], tune.ElementType.f16), - tune.ShapedType([3, 3, 1280, 256], tune.ElementType.f16), - tune.ShapedType([1, 1, 32, 256], tune.ElementType.f32), - tune.DispatchKind.conv, + assert candidate_gen.get_shapes_conv(template) == candidate_gen.ProblemSize( + candidate_gen.MatmulSize(32, 256, 11520), + candidate_gen.ShapedType([1, 3, 34, 1280], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([3, 3, 1280, 256], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([1, 1, 32, 256], candidate_gen.ElementType.f32), + candidate_gen.DispatchKind.conv, ) @@ -141,12 +141,12 @@ def test_get_shapes_contract(): r'%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%13, %14 : tensor<2048x1280xf16>, tensor<1280x1280xf16>) outs(%19 : tensor<2048x1280xf32>) attrs = {lowering_config = #iree_codegen.lowering_config} {', r"^bb0(%in: f16, %in_0: f16, %out: f32):", ] - assert tune.get_shapes_contract(template, "mk", "nk") == tune.ProblemSize( - tune.MatmulSize(2048, 1280, 1280), - tune.ShapedType([2048, 1280], tune.ElementType.f16), - tune.ShapedType([1280, 1280], tune.ElementType.f16), - tune.ShapedType([2048, 1280], tune.ElementType.f32), - tune.DispatchKind.contraction, + assert candidate_gen.get_shapes_contract(template, "mk", "nk") == candidate_gen.ProblemSize( + candidate_gen.MatmulSize(2048, 1280, 1280), + candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([1280, 1280], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f32), + candidate_gen.DispatchKind.contraction, ) @@ -156,12 +156,12 @@ def test_get_shapes_batch_matmul(): "%11 = linalg.batch_matmul ins(%8, %9 : tensor<1x32x1024xf32>, tensor<1x1024x32xf32>) outs(%10 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>", "flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 32, 32], strides = [1, 1, 1] : tensor<1x32x32xf32> -> !flow.dispatch.tensor>", ] - assert tune.get_shapes_batch_matmul(template, "bmk", "bkn") == tune.ProblemSize( - tune.MatmulSize(32, 32, 1024, 1), - tune.ShapedType([1, 32, 1024], tune.ElementType.f32), - tune.ShapedType([1, 1024, 32], tune.ElementType.f32), - tune.ShapedType([1, 32, 32], tune.ElementType.f32), - tune.DispatchKind.batch_matmul, + assert candidate_gen.get_shapes_batch_matmul(template, "bmk", "bkn") == candidate_gen.ProblemSize( + candidate_gen.MatmulSize(32, 32, 1024, 1), + candidate_gen.ShapedType([1, 32, 1024], candidate_gen.ElementType.f32), 
+ candidate_gen.ShapedType([1, 1024, 32], candidate_gen.ElementType.f32), + candidate_gen.ShapedType([1, 32, 32], candidate_gen.ElementType.f32), + candidate_gen.DispatchKind.batch_matmul, ) @@ -171,122 +171,122 @@ def test_get_shapes_batch_mmt(): r'%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<2x4096x640xi8>, tensor<2x640x640xi8>) outs(%19 : tensor<2x4096x640xi32>) attrs = {lowering_config = #iree_codegen.lowering_config} {', r"flow.dispatch.tensor.store %21, %10, offsets = [0, 0, 0], sizes = [2, 4096, 640], strides = [1, 1, 1] : tensor<2x4096x640xf16> -> !flow.dispatch.tensor>", ] - assert tune.get_shapes_batch_mmt(template) == tune.ProblemSize( - tune.MatmulSize(4096, 640, 640, 2), - tune.ShapedType([2, 4096, 640], tune.ElementType.i8), - tune.ShapedType([2, 640, 640], tune.ElementType.i8), - tune.ShapedType([2, 4096, 640], tune.ElementType.i32), - tune.DispatchKind.batch_mmt, + assert candidate_gen.get_shapes_batch_mmt(template) == candidate_gen.ProblemSize( + candidate_gen.MatmulSize(4096, 640, 640, 2), + candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.i8), + candidate_gen.ShapedType([2, 640, 640], candidate_gen.ElementType.i8), + candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.i32), + candidate_gen.DispatchKind.batch_mmt, ) def test_mfma_intrinsic_to_str(): - assert str(tune.MfmaIntrinsic.mfma_f16_16x16x16_f32()) == "MFMA_F16_16x16x16_F32" - assert str(tune.MfmaIntrinsic.mfma_i8_32x32x16_i32()) == "MFMA_I8_32x32x16_I32" + assert str(candidate_gen.MfmaIntrinsic.mfma_f16_16x16x16_f32()) == "MFMA_F16_16x16x16_F32" + assert str(candidate_gen.MfmaIntrinsic.mfma_i8_32x32x16_i32()) == "MFMA_I8_32x32x16_I32" def test_get_compatible_mfma_intrinsics(): - assert tune.get_compatible_mfma_intrinsics( - tune.ProblemSize( - tune.MatmulSize(2048, 1280, 1280), - tune.ShapedType([2048, 1280], tune.ElementType.f16), - tune.ShapedType([1280, 1280], tune.ElementType.f16), - tune.ShapedType([2048, 1280], tune.ElementType.f32), - tune.DispatchKind.mmt, + assert candidate_gen.get_compatible_mfma_intrinsics( + candidate_gen.ProblemSize( + candidate_gen.MatmulSize(2048, 1280, 1280), + candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([1280, 1280], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f32), + candidate_gen.DispatchKind.mmt, ) ) == [ - tune.MfmaIntrinsic.mfma_f16_16x16x16_f32(), - tune.MfmaIntrinsic.mfma_f16_32x32x8_f32(), + candidate_gen.MfmaIntrinsic.mfma_f16_16x16x16_f32(), + candidate_gen.MfmaIntrinsic.mfma_f16_32x32x8_f32(), ] - assert tune.get_compatible_mfma_intrinsics( - tune.ProblemSize( - tune.MatmulSize(2048, 1280, 1280), - tune.ShapedType([2048, 1280], tune.ElementType.i8), - tune.ShapedType([1280, 1280], tune.ElementType.i8), - tune.ShapedType([2048, 1280], tune.ElementType.i32), - tune.DispatchKind.mmt, + assert candidate_gen.get_compatible_mfma_intrinsics( + candidate_gen.ProblemSize( + candidate_gen.MatmulSize(2048, 1280, 1280), + candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.i8), + candidate_gen.ShapedType([1280, 1280], candidate_gen.ElementType.i8), + candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.i32), + candidate_gen.DispatchKind.mmt, ) ) == [ - tune.MfmaIntrinsic.mfma_i8_16x16x32_i32(), - 
tune.MfmaIntrinsic.mfma_i8_32x32x16_i32(), + candidate_gen.MfmaIntrinsic.mfma_i8_16x16x32_i32(), + candidate_gen.MfmaIntrinsic.mfma_i8_32x32x16_i32(), ] - assert tune.get_compatible_mfma_intrinsics( - tune.ProblemSize( - tune.MatmulSize(968, 320, 640, 64), - tune.ShapedType([64, 968, 640], tune.ElementType.f32), - tune.ShapedType([64, 640, 320], tune.ElementType.f32), - tune.ShapedType([64, 968, 320], tune.ElementType.f32), - tune.DispatchKind.batch_matmul, + assert candidate_gen.get_compatible_mfma_intrinsics( + candidate_gen.ProblemSize( + candidate_gen.MatmulSize(968, 320, 640, 64), + candidate_gen.ShapedType([64, 968, 640], candidate_gen.ElementType.f32), + candidate_gen.ShapedType([64, 640, 320], candidate_gen.ElementType.f32), + candidate_gen.ShapedType([64, 968, 320], candidate_gen.ElementType.f32), + candidate_gen.DispatchKind.batch_matmul, ) ) == [ - tune.MfmaIntrinsic.mfma_f16_16x16x16_f32(), - tune.MfmaIntrinsic.mfma_f16_32x32x8_f32(), + candidate_gen.MfmaIntrinsic.mfma_f16_16x16x16_f32(), + candidate_gen.MfmaIntrinsic.mfma_f16_32x32x8_f32(), ] def test_generate_solutions(): - matmul_size = tune.MatmulSize(2048, 3840, 1280) - lhs_type = tune.ShapedType([2048, 1280], tune.ElementType.f16) - rhs_type = tune.ShapedType([3840, 1280], tune.ElementType.f16) - res_type = tune.ShapedType([2048, 3840], tune.ElementType.f32) - problem_size = tune.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, tune.DispatchKind.mmt - ) - configs = tune.generate_solutions(problem_size, 4) + matmul_size = candidate_gen.MatmulSize(2048, 3840, 1280) + lhs_type = candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f16) + rhs_type = candidate_gen.ShapedType([3840, 1280], candidate_gen.ElementType.f16) + res_type = candidate_gen.ShapedType([2048, 3840], candidate_gen.ElementType.f32) + problem_size = candidate_gen.ProblemSize( + matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt + ) + configs = candidate_gen.generate_solutions(problem_size, 4) assert configs is not None def test_calculate_shared_memory_usage_in_bytes(): - matmul_size = tune.MatmulSize(1024, 1024, 1024) - lhs_type = tune.ShapedType([1024, 1024], tune.ElementType.f16) - rhs_type = tune.ShapedType([1024, 1024], tune.ElementType.f16) - res_type = tune.ShapedType([1024, 1024], tune.ElementType.f32) - problem_size = tune.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, tune.DispatchKind.mmt + matmul_size = candidate_gen.MatmulSize(1024, 1024, 1024) + lhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f16) + rhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f16) + res_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f32) + problem_size = candidate_gen.ProblemSize( + matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt ) assert ( - tune.calculate_shared_memory_usage_in_bytes(problem_size, 512, 64, 128) + candidate_gen.calculate_shared_memory_usage_in_bytes(problem_size, 512, 64, 128) == 147456 ) - lhs_type = tune.ShapedType([1024, 1024], tune.ElementType.i8) - problem_size = tune.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, tune.DispatchKind.mmt + lhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.i8) + problem_size = candidate_gen.ProblemSize( + matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt ) assert ( - tune.calculate_shared_memory_usage_in_bytes(problem_size, 512, 64, 128) == 81920 + 
candidate_gen.calculate_shared_memory_usage_in_bytes(problem_size, 512, 64, 128) == 81920 ) - rhs_type = tune.ShapedType([1024, 1024], tune.ElementType.i32) - problem_size = tune.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, tune.DispatchKind.mmt + rhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.i32) + problem_size = candidate_gen.ProblemSize( + matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt ) assert ( - tune.calculate_shared_memory_usage_in_bytes(problem_size, 128, 64, 32) == 12288 + candidate_gen.calculate_shared_memory_usage_in_bytes(problem_size, 128, 64, 32) == 12288 ) def test_generate_constraints_valid_input(): - matmul_size = tune.MatmulSize(1024, 1024, 1024) - lhs_type = tune.ShapedType([1024, 1024], tune.ElementType.f16) - rhs_type = tune.ShapedType([1024, 1024], tune.ElementType.f16) - res_type = tune.ShapedType([1024, 1024], tune.ElementType.f32) - problem_size = tune.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, tune.DispatchKind.mmt + matmul_size = candidate_gen.MatmulSize(1024, 1024, 1024) + lhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f16) + rhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f16) + res_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f32) + problem_size = candidate_gen.ProblemSize( + matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt ) # Define input parameters as z3 Ints - m, n, k = tune.z3.Int("m"), tune.z3.Int("n"), tune.z3.Int("k") - subgroup_size = tune.z3.Int("subgroup_size") - intrinsic_mn = tune.z3.Int("intrinsic_mn") - intrinsic_k = tune.z3.Int("intrinsic_k") - wg_x, wg_y, wg_z = tune.z3.Int("wg_x"), tune.z3.Int("wg_y"), tune.z3.Int("wg_z") - sg_m_cnt = tune.z3.Int("sg_m_cnt") - sg_n_cnt = tune.z3.Int("sg_n_cnt") - waves_per_eu = tune.z3.Int("waves_per_eu") - - constraints = tune.generate_constraints( + m, n, k = candidate_gen.z3.Int("m"), candidate_gen.z3.Int("n"), candidate_gen.z3.Int("k") + subgroup_size = candidate_gen.z3.Int("subgroup_size") + intrinsic_mn = candidate_gen.z3.Int("intrinsic_mn") + intrinsic_k = candidate_gen.z3.Int("intrinsic_k") + wg_x, wg_y, wg_z = candidate_gen.z3.Int("wg_x"), candidate_gen.z3.Int("wg_y"), candidate_gen.z3.Int("wg_z") + sg_m_cnt = candidate_gen.z3.Int("sg_m_cnt") + sg_n_cnt = candidate_gen.z3.Int("sg_n_cnt") + waves_per_eu = candidate_gen.z3.Int("waves_per_eu") + + constraints = candidate_gen.generate_constraints( problem_size, [m, n, k], 4, @@ -298,32 +298,32 @@ def test_generate_constraints_valid_input(): waves_per_eu, ) - solver = tune.z3.Solver() + solver = candidate_gen.z3.Solver() solver.add(constraints) # Check if the constraints are satisfiable - assert solver.check() == tune.z3.sat + assert solver.check() == candidate_gen.z3.sat def test_generate_constraints_invalid_input(): # Define input parameters that should lead to unsatisfiable constraints - matmul_size = tune.MatmulSize(1024, 1024, 1024) - lhs_type = tune.ShapedType([1024, 1024], tune.ElementType.f16) - rhs_type = tune.ShapedType([1024, 1024], tune.ElementType.f16) - res_type = tune.ShapedType([1024, 1024], tune.ElementType.f32) - problem_size = tune.ProblemSize( - matmul_size, lhs_type, rhs_type, res_type, tune.DispatchKind.mmt - ) - m, n, k = tune.z3.Int("m"), tune.z3.Int("n"), tune.z3.Int("k") - subgroup_size = tune.z3.Int("subgroup_size") - intrinsic_mn = tune.z3.Int("intrinsic_mn") - intrinsic_k = tune.z3.Int("intrinsic_k") - wg_x, wg_y, wg_z = 
tune.z3.Int("wg_x"), tune.z3.Int("wg_y"), tune.z3.Int("wg_z") - sg_m_cnt = tune.z3.Int("sg_m_cnt") - sg_n_cnt = tune.z3.Int("sg_n_cnt") - waves_per_eu = tune.z3.Int("waves_per_eu") - - constraints = tune.generate_constraints( + matmul_size = candidate_gen.MatmulSize(1024, 1024, 1024) + lhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f16) + rhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f16) + res_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.f32) + problem_size = candidate_gen.ProblemSize( + matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt + ) + m, n, k = candidate_gen.z3.Int("m"), candidate_gen.z3.Int("n"), candidate_gen.z3.Int("k") + subgroup_size = candidate_gen.z3.Int("subgroup_size") + intrinsic_mn = candidate_gen.z3.Int("intrinsic_mn") + intrinsic_k = candidate_gen.z3.Int("intrinsic_k") + wg_x, wg_y, wg_z = candidate_gen.z3.Int("wg_x"), candidate_gen.z3.Int("wg_y"), candidate_gen.z3.Int("wg_z") + sg_m_cnt = candidate_gen.z3.Int("sg_m_cnt") + sg_n_cnt = candidate_gen.z3.Int("sg_n_cnt") + waves_per_eu = candidate_gen.z3.Int("waves_per_eu") + + constraints = candidate_gen.generate_constraints( problem_size, [m, n, k], 4, @@ -336,11 +336,11 @@ def test_generate_constraints_invalid_input(): ) constraints.append(m > 1000) # Adding an additional unsatisfiable constraint - solver = tune.z3.Solver() + solver = candidate_gen.z3.Solver() solver.add(constraints) # Check if the constraints are unsatisfiable - assert solver.check() == tune.z3.unsat + assert solver.check() == candidate_gen.z3.unsat def test_apply_params_mmt(): @@ -353,24 +353,24 @@ def test_apply_params_mmt(): M, N, K = 2048, 1280, 1280 - config = tune.Configuration( + config = candidate_gen.Configuration( subgroup_size=16, workgroup_size=[16, 16, 1], - intrinsic=tune.MfmaIntrinsic.mfma_f16_16x16x16_f32(), + intrinsic=candidate_gen.MfmaIntrinsic.mfma_f16_16x16x16_f32(), tile_sizes=[8, 8, 8], subgroup_m_count=16, subgroup_n_count=16, waves_per_eu=8, ) - problem_size = tune.ProblemSize( - tune.MatmulSize(M, N, K), - tune.ShapedType([M, K], tune.ElementType.f16), - tune.ShapedType([N, K], tune.ElementType.f16), - tune.ShapedType([M, N], tune.ElementType.f32), - tune.DispatchKind.mmt, + problem_size = candidate_gen.ProblemSize( + candidate_gen.MatmulSize(M, N, K), + candidate_gen.ShapedType([M, K], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([N, K], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([M, N], candidate_gen.ElementType.f32), + candidate_gen.DispatchKind.mmt, ) - modified, embeddable = tune.apply_params_mmt(problem_size, mlir_template, config) + modified, embeddable = candidate_gen.apply_params_mmt(problem_size, mlir_template, config) assert modified assert embeddable @@ -396,24 +396,24 @@ def test_apply_params_conv(): n, oh, ow, oc, fh, fw, ic = 2, 64, 64, 640, 3, 3, 640 - config = tune.Configuration( + config = candidate_gen.Configuration( subgroup_size=64, workgroup_size=[256, 1, 1], - intrinsic=tune.MfmaIntrinsic.mfma_f16_16x16x16_f32(), + intrinsic=candidate_gen.MfmaIntrinsic.mfma_f16_16x16x16_f32(), tile_sizes=[464, 320, 16], subgroup_m_count=1, subgroup_n_count=4, waves_per_eu=2, ) - problem_size = tune.ProblemSize( - tune.MatmulSize(oh * ow, oc, fh * fw * ic), - tune.ShapedType([n, oh + 2, ow + 2, oc], tune.ElementType.f16), - tune.ShapedType([fh, fw, ic, oc], tune.ElementType.f16), - tune.ShapedType([n, oh, ow, oc], tune.ElementType.f32), - tune.DispatchKind.conv, + problem_size = 
candidate_gen.ProblemSize( + candidate_gen.MatmulSize(oh * ow, oc, fh * fw * ic), + candidate_gen.ShapedType([n, oh + 2, ow + 2, oc], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([fh, fw, ic, oc], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([n, oh, ow, oc], candidate_gen.ElementType.f32), + candidate_gen.DispatchKind.conv, ) - modified, embeddable = tune.apply_params_conv(problem_size, mlir_template, config) + modified, embeddable = candidate_gen.apply_params_conv(problem_size, mlir_template, config) assert modified assert embeddable @@ -438,25 +438,25 @@ def test_apply_params_contract(): ] tile_dims = "*mnk" - problem_size = tune.ProblemSize( - tune.MatmulSize(2048, 3840, 1280), - tune.ShapedType([2, 1024, 1280], tune.ElementType.f16), - tune.ShapedType([3, 20, 64, 1280], tune.ElementType.f16), - tune.ShapedType([3, 2, 20, 1024, 64], tune.ElementType.f32), - tune.DispatchKind.contraction, + problem_size = candidate_gen.ProblemSize( + candidate_gen.MatmulSize(2048, 3840, 1280), + candidate_gen.ShapedType([2, 1024, 1280], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([3, 20, 64, 1280], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([3, 2, 20, 1024, 64], candidate_gen.ElementType.f32), + candidate_gen.DispatchKind.contraction, ) - config = tune.Configuration( + config = candidate_gen.Configuration( subgroup_size=64, workgroup_size=[256, 1, 1], - intrinsic=tune.MfmaIntrinsic.mfma_f16_32x32x8_f32(), + intrinsic=candidate_gen.MfmaIntrinsic.mfma_f16_32x32x8_f32(), tile_sizes=[480, 384, 32], subgroup_m_count=1, subgroup_n_count=4, waves_per_eu=2, ) - new_mlir, _embeddable = tune.apply_params_contract( + new_mlir, _embeddable = candidate_gen.apply_params_contract( problem_size, tile_dims, mlir_template, config ) @@ -482,25 +482,25 @@ def test_apply_params_batch_matmul(): ] tile_dims = "bmnk" - problem_size = tune.ProblemSize( - tune.MatmulSize(968, 320, 640, 64), - tune.ShapedType([64, 968, 640], tune.ElementType.f16), - tune.ShapedType([64, 640, 320], tune.ElementType.f16), - tune.ShapedType([64, 968, 320], tune.ElementType.f32), - tune.DispatchKind.batch_matmul, + problem_size = candidate_gen.ProblemSize( + candidate_gen.MatmulSize(968, 320, 640, 64), + candidate_gen.ShapedType([64, 968, 640], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([64, 640, 320], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([64, 968, 320], candidate_gen.ElementType.f32), + candidate_gen.DispatchKind.batch_matmul, ) - config = tune.Configuration( + config = candidate_gen.Configuration( subgroup_size=64, workgroup_size=[128, 2, 1], - intrinsic=tune.MfmaIntrinsic.mfma_f16_32x32x8_f32(), + intrinsic=candidate_gen.MfmaIntrinsic.mfma_f16_32x32x8_f32(), tile_sizes=[416, 320, 128], subgroup_m_count=2, subgroup_n_count=2, waves_per_eu=2, ) - modified, embeddable = tune.apply_params_batch_matmul( + modified, embeddable = candidate_gen.apply_params_batch_matmul( problem_size, tile_dims, mlir_template, config ) @@ -526,25 +526,25 @@ def test_apply_params_batch_mmt_float(): '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}', ] - problem_size = tune.ProblemSize( - tune.MatmulSize(4096, 640, 640, 2), - tune.ShapedType([2, 4096, 640], tune.ElementType.f16), - tune.ShapedType([2, 640, 640], tune.ElementType.f16), - tune.ShapedType([2, 4096, 640], tune.ElementType.f32), - tune.DispatchKind.batch_mmt, + problem_size = candidate_gen.ProblemSize( + candidate_gen.MatmulSize(4096, 640, 640, 2), + candidate_gen.ShapedType([2, 4096, 640], 
candidate_gen.ElementType.f16), + candidate_gen.ShapedType([2, 640, 640], candidate_gen.ElementType.f16), + candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.f32), + candidate_gen.DispatchKind.batch_mmt, ) - config = tune.Configuration( + config = candidate_gen.Configuration( subgroup_size=64, workgroup_size=[128, 2, 1], - intrinsic=tune.MfmaIntrinsic.mfma_f16_16x16x16_f32(), + intrinsic=candidate_gen.MfmaIntrinsic.mfma_f16_16x16x16_f32(), tile_sizes=[128, 64, 128], subgroup_m_count=2, subgroup_n_count=2, waves_per_eu=2, ) - modified, embeddable = tune.apply_params_batch_mmt( + modified, embeddable = candidate_gen.apply_params_batch_mmt( problem_size, mlir_template, config ) @@ -570,25 +570,25 @@ def test_apply_params_batch_mmt_int(): '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}', ] - problem_size = tune.ProblemSize( - tune.MatmulSize(4096, 640, 640, 2), - tune.ShapedType([2, 4096, 640], tune.ElementType.i8), - tune.ShapedType([2, 640, 640], tune.ElementType.i8), - tune.ShapedType([2, 4096, 640], tune.ElementType.i32), - tune.DispatchKind.batch_mmt, + problem_size = candidate_gen.ProblemSize( + candidate_gen.MatmulSize(4096, 640, 640, 2), + candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.i8), + candidate_gen.ShapedType([2, 640, 640], candidate_gen.ElementType.i8), + candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.i32), + candidate_gen.DispatchKind.batch_mmt, ) - config = tune.Configuration( + config = candidate_gen.Configuration( subgroup_size=64, workgroup_size=[128, 2, 1], - intrinsic=tune.MfmaIntrinsic.mfma_i8_32x32x16_i32(), + intrinsic=candidate_gen.MfmaIntrinsic.mfma_i8_32x32x16_i32(), tile_sizes=[128, 64, 128], subgroup_m_count=2, subgroup_n_count=2, waves_per_eu=4, ) - modified, embeddable = tune.apply_params_batch_mmt( + modified, embeddable = candidate_gen.apply_params_batch_mmt( problem_size, mlir_template, config ) @@ -635,25 +635,25 @@ def test_apply_params_broadcast_rhs_mmt(): '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}', ] - problem_size = tune.ProblemSize( - tune.MatmulSize(4096, 640, 640, 2), - tune.ShapedType([2, 4096, 640], tune.ElementType.i8), - tune.ShapedType([640, 640], tune.ElementType.i8), - tune.ShapedType([2, 4096, 640], tune.ElementType.i32), - tune.DispatchKind.broadcast_rhs_mmt, + problem_size = candidate_gen.ProblemSize( + candidate_gen.MatmulSize(4096, 640, 640, 2), + candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.i8), + candidate_gen.ShapedType([640, 640], candidate_gen.ElementType.i8), + candidate_gen.ShapedType([2, 4096, 640], candidate_gen.ElementType.i32), + candidate_gen.DispatchKind.broadcast_rhs_mmt, ) - config = tune.Configuration( + config = candidate_gen.Configuration( subgroup_size=64, workgroup_size=[128, 2, 1], - intrinsic=tune.MfmaIntrinsic.mfma_i8_32x32x16_i32(), + intrinsic=candidate_gen.MfmaIntrinsic.mfma_i8_32x32x16_i32(), tile_sizes=[128, 64, 128], subgroup_m_count=2, subgroup_n_count=2, waves_per_eu=4, ) - modified, embeddable = tune.apply_params_broadcast_rhs_mmt( + modified, embeddable = candidate_gen.apply_params_broadcast_rhs_mmt( problem_size, mlir_template, config ) @@ -702,7 +702,7 @@ def test_detect_broadcast_rhs_mmt(): r"%19 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%c0_i32 : i32) outs(%18 : tensor<2x1024x10240xi32>) -> tensor<2x1024x10240xi32>", r'%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 
d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<2x1024x1280xi8>, tensor<10240x1280xi8>) outs(%19 : tensor<2x1024x10240xi32>) attrs = {lowering_config = #iree_codegen.lowering_config} {', ] - assert tune.is_broadcast_rhs_mmt(mlir_lines) + assert candidate_gen.is_broadcast_rhs_mmt(mlir_lines) def test_parse_mlir(): @@ -714,7 +714,7 @@ def test_parse_mlir(): } } """ - mlir_module = tune.parse_mlir(mlir_str) + mlir_module = candidate_gen.parse_mlir(mlir_str) assert mlir_module != None - assert isinstance(mlir_module, tune.ireec._mlir_libs._mlir.ir.Module) - assert isinstance(mlir_module.body.operations[0], tune.ireec.dialects.func.FuncOp) + assert isinstance(mlir_module, candidate_gen.ireec._mlir_libs._mlir.ir.Module) + assert isinstance(mlir_module.body.operations[0], candidate_gen.ireec.dialects.func.FuncOp) diff --git a/tuning/test_autotune.py b/tuning/test_libtuner.py similarity index 82% rename from tuning/test_autotune.py rename to tuning/test_libtuner.py index 2a7adfe..394208c 100644 --- a/tuning/test_autotune.py +++ b/tuning/test_libtuner.py @@ -7,22 +7,22 @@ import argparse import pytest from unittest.mock import call, patch, MagicMock -import autotune +import libtuner """ -Usage: python -m pytest test_autotune.py +Usage: python -m pytest test_libtuner.py """ def test_group_benchmark_results_by_device_id(): - def generate_res(res_arg: str, device_id: int) -> autotune.TaskResult: - result: autotune.subprocess.CompletedProcess = ( - autotune.subprocess.CompletedProcess( + def generate_res(res_arg: str, device_id: int) -> libtuner.TaskResult: + result: libtuner.subprocess.CompletedProcess = ( + libtuner.subprocess.CompletedProcess( args=[res_arg], returncode=0, ) ) - return autotune.TaskResult(result=result, device_id=device_id) + return libtuner.TaskResult(result=result, device_id=device_id) test_input = [ generate_res("str1", 3), @@ -39,7 +39,7 @@ def generate_res(res_arg: str, device_id: int) -> autotune.TaskResult: [generate_res("str5", 7)], ] - actual_output = autotune.group_benchmark_results_by_device_id(test_input) + actual_output = libtuner.group_benchmark_results_by_device_id(test_input) for a, e in zip(actual_output, expect_output): for res1, res2 in zip(a, e): @@ -48,7 +48,7 @@ def generate_res(res_arg: str, device_id: int) -> autotune.TaskResult: def test_sort_candidates_by_first_benchmark_times(): - candidate_trackers = [autotune.CandidateTracker(i) for i in range(5)] + candidate_trackers = [libtuner.CandidateTracker(i) for i in range(5)] candidate_trackers[0].first_benchmark_time = 35 candidate_trackers[1].first_benchmark_time = 2141 candidate_trackers[2].first_benchmark_time = 231 @@ -57,7 +57,7 @@ def test_sort_candidates_by_first_benchmark_times(): test_input = [i for i in range(5)] expect_output = [0, 4, 2, 3, 1] assert ( - autotune.sort_candidates_by_first_benchmark_times( + libtuner.sort_candidates_by_first_benchmark_times( test_input, candidate_trackers ) == expect_output @@ -66,9 +66,9 @@ def test_sort_candidates_by_first_benchmark_times(): def test_find_collisions(): input = [(1, "abc"), (2, "def"), (3, "abc")] - assert autotune.find_collisions(input) == (True, [("abc", [1, 3]), ("def", [2])]) + assert libtuner.find_collisions(input) == (True, [("abc", [1, 3]), ("def", [2])]) input = [(1, "abc"), (2, "def"), (3, "hig")] - assert autotune.find_collisions(input) == ( + assert libtuner.find_collisions(input) == ( False, [("abc", [1]), ("def", [2]), ("hig", [3])], ) @@ -76,32 +76,32 @@ def test_find_collisions(): def 
test_collision_handler(): input = [(1, "abc"), (2, "def"), (3, "abc"), (4, "def"), (5, "hig")] - assert autotune.collision_handler(input) == (True, [1, 2, 5]) + assert libtuner.collision_handler(input) == (True, [1, 2, 5]) input = [(1, "abc"), (2, "def"), (3, "hig")] - assert autotune.collision_handler(input) == (False, []) + assert libtuner.collision_handler(input) == (False, []) def test_DispatchBenchmarkResult_get(): normal_str = "2 Mean Time: 586.0" - res = autotune.DispatchBenchmarkResult(normal_str) + res = libtuner.DispatchBenchmarkResult(normal_str) assert res.result_str == normal_str assert res.get_tokens() == ["2", "Mean", "Time:", "586.0"] assert res.get_candidate_id() == 2 assert res.get_benchmark_time() == 586.0 incomplete_str = "2 Mean Time:" - res = autotune.DispatchBenchmarkResult(incomplete_str) + res = libtuner.DispatchBenchmarkResult(incomplete_str) assert res.get_tokens() == ["2", "Mean", "Time:"] assert res.get_candidate_id() == 2 assert res.get_benchmark_time() == None incomplete_str = "" - res = autotune.DispatchBenchmarkResult(incomplete_str) + res = libtuner.DispatchBenchmarkResult(incomplete_str) assert res.get_tokens() == [] assert res.get_candidate_id() == None assert res.get_benchmark_time() == None bad_str = 12345 - res = autotune.DispatchBenchmarkResult(bad_str) + res = libtuner.DispatchBenchmarkResult(bad_str) assert res.get_tokens() == [] assert res.get_candidate_id() == None assert res.get_benchmark_time() == None @@ -109,7 +109,7 @@ def test_DispatchBenchmarkResult_get(): def test_ModelBenchmarkResult_get(): normal_str = "Benchmarking: unet_candidate_12.vmfb on device 24\nBM_main/process_time/real_time_median 182 ms 183 ms 5 items_per_second=5.50302/s" - res = autotune.ModelBenchmarkResult(normal_str) + res = libtuner.ModelBenchmarkResult(normal_str) assert res.result_str == normal_str assert res.get_tokens() == [ "Benchmarking:", @@ -131,7 +131,7 @@ def test_ModelBenchmarkResult_get(): assert res.get_benchmark_time() == 182.0 incomplete_str = "Benchmarking: baseline.vmfb on device 24\n" - res = autotune.ModelBenchmarkResult(incomplete_str) + res = libtuner.ModelBenchmarkResult(incomplete_str) assert res.get_tokens() == [ "Benchmarking:", "baseline.vmfb", @@ -144,7 +144,7 @@ def test_ModelBenchmarkResult_get(): assert res.get_device_id() == 24 assert res.get_benchmark_time() == None incomplete_str = "" - res = autotune.ModelBenchmarkResult(incomplete_str) + res = libtuner.ModelBenchmarkResult(incomplete_str) assert res.get_tokens() == [] assert res.get_model_candidate_path() == None assert res.get_candidate_id() == None @@ -152,7 +152,7 @@ def test_ModelBenchmarkResult_get(): assert res.get_benchmark_time() == None bad_str = 12345 - res = autotune.ModelBenchmarkResult(bad_str) + res = libtuner.ModelBenchmarkResult(bad_str) assert res.get_tokens() == [] assert res.get_model_candidate_path() == None assert res.get_candidate_id() == None @@ -161,12 +161,12 @@ def test_ModelBenchmarkResult_get(): def test_generate_sample_result(): - res = autotune.DispatchBenchmarkResult() + res = libtuner.DispatchBenchmarkResult() output = res.generate_sample_result(1, 3.14) expected = f"1\tMean Time: 3.1\n" assert output == expected, "DispatchBenchmarkResult generates invalid sample string" - res = autotune.ModelBenchmarkResult() + res = libtuner.ModelBenchmarkResult() output = res.generate_sample_result( 1, "some_dir/tuning_2024_07_24_20_06/unet_candidate_60.vmfb.vmfb", 576.89 ) @@ -179,7 +179,7 @@ def test_ModelBenchmarkResult_get_calibrated_result_str(): res_time = 304 
result_str = f"Benchmarking: tuning_2024_07_22_16_29/unet_candidate_16.vmfb on device 0\nBM_run_forward/process_time/real_time_median {float(res_time)} ms 305 ms 5 items_per_second=1.520000/s" change = (res_time - baseline_time) / baseline_time - output_str = autotune.ModelBenchmarkResult(result_str).get_calibrated_result_str( + output_str = libtuner.ModelBenchmarkResult(result_str).get_calibrated_result_str( change ) expect_str = f"Benchmarking: tuning_2024_07_22_16_29/unet_candidate_16.vmfb on device 0\nBM_run_forward/process_time/real_time_median\t {float(res_time)} ms (-28.132%)\t 305 ms\t 5 items_per_second=1.520000/s" @@ -189,7 +189,7 @@ def test_ModelBenchmarkResult_get_calibrated_result_str(): res_time = 218 result_str = f"Benchmarking: tuning_2024_07_22_16_29/unet_candidate_16.vmfb on device 0\nBM_run_forward/process_time/real_time_median {float(res_time)} ms 305 ms 5 items_per_second=1.520000/s" change = (res_time - baseline_time) / baseline_time - output_str = autotune.ModelBenchmarkResult(result_str).get_calibrated_result_str( + output_str = libtuner.ModelBenchmarkResult(result_str).get_calibrated_result_str( change ) expect_str = f"Benchmarking: tuning_2024_07_22_16_29/unet_candidate_16.vmfb on device 0\nBM_run_forward/process_time/real_time_median\t {float(res_time)} ms (+0.000%)\t 305 ms\t 5 items_per_second=1.520000/s" @@ -199,7 +199,7 @@ def test_ModelBenchmarkResult_get_calibrated_result_str(): res_time = 345 result_str = f"Benchmarking: tuning_2024_07_22_16_29/unet_candidate_16.vmfb on device 0\nBM_run_forward/process_time/real_time_median {float(res_time)} ms 305 ms 5 items_per_second=1.520000/s" change = (res_time - baseline_time) / baseline_time - output_str = autotune.ModelBenchmarkResult(result_str).get_calibrated_result_str( + output_str = libtuner.ModelBenchmarkResult(result_str).get_calibrated_result_str( change ) expect_str = f"Benchmarking: tuning_2024_07_22_16_29/unet_candidate_16.vmfb on device 0\nBM_run_forward/process_time/real_time_median\t {float(res_time)} ms (+180.488%)\t 305 ms\t 5 items_per_second=1.520000/s" @@ -207,18 +207,18 @@ def test_ModelBenchmarkResult_get_calibrated_result_str(): def test_parse_dispatch_benchmark_results(): - def generate_res(stdout: str) -> autotune.TaskResult: - result = autotune.subprocess.CompletedProcess( + def generate_res(stdout: str) -> libtuner.TaskResult: + result = libtuner.subprocess.CompletedProcess( args=[""], stdout=stdout, returncode=0, ) - return autotune.TaskResult(result) + return libtuner.TaskResult(result) def generate_parsed_disptach_benchmark_result( time: float, i: int - ) -> autotune.ParsedDisptachBenchmarkResult: - return autotune.ParsedDisptachBenchmarkResult( + ) -> libtuner.ParsedDisptachBenchmarkResult: + return libtuner.ParsedDisptachBenchmarkResult( i, time, path_config.get_candidate_mlir_path(i), @@ -234,23 +234,23 @@ def generate_parsed_disptach_benchmark_result( for i in random_order ] - path_config = autotune.PathConfig() + path_config = libtuner.PathConfig() candidate_trackers = [ - autotune.CandidateTracker( + libtuner.CandidateTracker( i, dispatch_mlir_path=path_config.get_candidate_mlir_path(i) ) for i in range(total) ] candidate_trackers_before = [ - autotune.CandidateTracker( + libtuner.CandidateTracker( i, dispatch_mlir_path=path_config.get_candidate_mlir_path(i) ) for i in range(total) ] expect_candidate_trackers = [ - autotune.CandidateTracker( + libtuner.CandidateTracker( i, dispatch_mlir_path=path_config.get_candidate_mlir_path(i), 
spec_path=path_config.get_candidate_spec_mlir_path(i), @@ -272,7 +272,7 @@ def generate_parsed_disptach_benchmark_result( mock_tuning_client.get_candidate_spec_filename.side_effect = ( lambda i: f"{i}_spec.mlir" ) - parsed_results, dump_list = autotune.parse_dispatch_benchmark_results( + parsed_results, dump_list = libtuner.parse_dispatch_benchmark_results( path_config, benchmark_results, candidate_trackers, mock_tuning_client ) @@ -283,16 +283,16 @@ def generate_parsed_disptach_benchmark_result( def test_parse_grouped_benchmark_results(): - def generate_res(stdout: str, device_id: int) -> autotune.TaskResult: - result = autotune.subprocess.CompletedProcess( + def generate_res(stdout: str, device_id: int) -> libtuner.TaskResult: + result = libtuner.subprocess.CompletedProcess( args=[""], stdout=stdout, returncode=0, ) - return autotune.TaskResult(result=result, device_id=device_id) + return libtuner.TaskResult(result=result, device_id=device_id) def set_tracker( - tracker: autotune.CandidateTracker, + tracker: libtuner.CandidateTracker, model_benchmark_time: float, model_benchmark_device_id: int, baseline_benchmark_time: float, @@ -319,12 +319,12 @@ def set_tracker( ], ] - path_config = autotune.PathConfig() + path_config = libtuner.PathConfig() - candidate_trackers = [autotune.CandidateTracker(i) for i in range(5)] + candidate_trackers = [libtuner.CandidateTracker(i) for i in range(5)] - candidate_trackers_before = [autotune.CandidateTracker(i) for i in range(5)] - expect_candidate_trackers = [autotune.CandidateTracker(i) for i in range(5)] + candidate_trackers_before = [libtuner.CandidateTracker(i) for i in range(5)] + expect_candidate_trackers = [libtuner.CandidateTracker(i) for i in range(5)] set_tracker(expect_candidate_trackers[1], 62.4, 0, 60.7, 0.028006589785831888) set_tracker(expect_candidate_trackers[2], 61.4, 1, 59.8, 0.02675585284280939) set_tracker(expect_candidate_trackers[4], 57.4, 1, 59.8, -0.04013377926421403) @@ -342,7 +342,7 @@ def set_tracker( "BM_main/process_time/real_time_median 61.4 ms (+2.676%) 11.0 ms 5 items_per_second=16.2958/s", ] - dump_list = autotune.parse_grouped_benchmark_results( + dump_list = libtuner.parse_grouped_benchmark_results( path_config, grouped_benchmark_results, candidate_trackers ) @@ -357,7 +357,7 @@ def set_tracker( b1 = "Benchmarking: baseline.vmfb on device 0" s1 = "Benchmarking: unet_candidate_1.vmfb on device 0 BM_main/process_time/real_time_median 62.4 ms 15.4 ms 5 items_per_second=16.0223/s" grouped_benchmark_results = [[generate_res(b1, 0), generate_res(s1, 0)]] - dump_list = autotune.parse_grouped_benchmark_results( + dump_list = libtuner.parse_grouped_benchmark_results( path_config, grouped_benchmark_results, candidate_trackers ) expect_dump_list = [ @@ -371,7 +371,7 @@ def set_tracker( s1 = "Benchmarking: unet_candidate_1.vmfb on device 0" grouped_benchmark_results = [[generate_res(b1, 0), generate_res(s1, 0)]] candidate_trackers[1].model_path = "unet_candidate_1.vmfb" - dump_list = autotune.parse_grouped_benchmark_results( + dump_list = libtuner.parse_grouped_benchmark_results( path_config, grouped_benchmark_results, candidate_trackers ) expect_dump_list = [ @@ -385,7 +385,7 @@ def set_tracker( s1 = "Benchmarking: unet_candidate_1.vmfb on device 0" grouped_benchmark_results = [[generate_res(b1, 0), generate_res(s1, 0)]] candidate_trackers[1].model_path = "unet_candidate_1.vmfb" - dump_list = autotune.parse_grouped_benchmark_results( + dump_list = libtuner.parse_grouped_benchmark_results( path_config, grouped_benchmark_results, 
candidate_trackers ) expect_dump_list = [ @@ -401,7 +401,7 @@ def test_extract_driver_names(): user_devices = ["hip://0", "local-sync://default", "cuda://default"] expected_output = {"hip", "local-sync", "cuda"} - assert autotune.extract_driver_names(user_devices) == expected_output + assert libtuner.extract_driver_names(user_devices) == expected_output def test_fetch_available_devices_success(): @@ -412,7 +412,7 @@ def test_fetch_available_devices_success(): "cuda": [{"path": "default"}], } - with patch("autotune.ireert.get_driver") as mock_get_driver: + with patch("libtuner.ireert.get_driver") as mock_get_driver: mock_driver = MagicMock() def get_mock_driver(name): @@ -421,7 +421,7 @@ def get_mock_driver(name): mock_get_driver.side_effect = get_mock_driver - actual_output = autotune.fetch_available_devices(drivers) + actual_output = libtuner.fetch_available_devices(drivers) expected_output = ["hip://0", "local-sync://default", "cuda://default"] assert actual_output == expected_output @@ -435,8 +435,8 @@ def test_fetch_available_devices_failure(): "cuda": [{"path": "default"}], } - with patch("autotune.ireert.get_driver") as mock_get_driver: - with patch("autotune.handle_error") as mock_handle_error: + with patch("libtuner.ireert.get_driver") as mock_get_driver: + with patch("libtuner.handle_error") as mock_handle_error: mock_driver = MagicMock() def get_mock_driver(name): @@ -452,7 +452,7 @@ def get_mock_driver(name): mock_get_driver.side_effect = get_mock_driver - actual_output = autotune.fetch_available_devices(drivers) + actual_output = libtuner.fetch_available_devices(drivers) expected_output = ["hip://0", "cuda://default"] assert actual_output == expected_output @@ -468,8 +468,8 @@ def test_parse_devices(): user_devices_str = "hip://0, local-sync://default, cuda://default" expected_output = ["hip://0", "local-sync://default", "cuda://default"] - with patch("autotune.handle_error") as mock_handle_error: - actual_output = autotune.parse_devices(user_devices_str) + with patch("libtuner.handle_error") as mock_handle_error: + actual_output = libtuner.parse_devices(user_devices_str) assert actual_output == expected_output mock_handle_error.assert_not_called() @@ -484,8 +484,8 @@ def test_parse_devices_with_invalid_input(): "cuda://default", ] - with patch("autotune.handle_error") as mock_handle_error: - actual_output = autotune.parse_devices(user_devices_str) + with patch("libtuner.handle_error") as mock_handle_error: + actual_output = libtuner.parse_devices(user_devices_str) assert actual_output == expected_output mock_handle_error.assert_called_once_with( @@ -499,13 +499,13 @@ def test_validate_devices(): user_devices = ["hip://0", "local-sync://default"] user_drivers = {"hip", "local-sync"} - with patch("autotune.extract_driver_names", return_value=user_drivers): + with patch("libtuner.extract_driver_names", return_value=user_drivers): with patch( - "autotune.fetch_available_devices", + "libtuner.fetch_available_devices", return_value=["hip://0", "local-sync://default"], ): - with patch("autotune.handle_error") as mock_handle_error: - autotune.validate_devices(user_devices) + with patch("libtuner.handle_error") as mock_handle_error: + libtuner.validate_devices(user_devices) assert all( call[1]["condition"] is False for call in mock_handle_error.call_args_list @@ -516,13 +516,13 @@ def test_validate_devices_with_invalid_device(): user_devices = ["hip://0", "local-sync://default", "cuda://default"] user_drivers = {"hip", "local-sync", "cuda"} - with patch("autotune.extract_driver_names", 
return_value=user_drivers): + with patch("libtuner.extract_driver_names", return_value=user_drivers): with patch( - "autotune.fetch_available_devices", + "libtuner.fetch_available_devices", return_value=["hip://0", "local-sync://default"], ): - with patch("autotune.handle_error") as mock_handle_error: - autotune.validate_devices(user_devices) + with patch("libtuner.handle_error") as mock_handle_error: + libtuner.validate_devices(user_devices) expected_call = call( condition=True, msg=f"Invalid device specified: cuda://default\nFetched available devices: ['hip://0', 'local-sync://default']", From 65c37313855f88d5254e871335c1e422649076cc Mon Sep 17 00:00:00 2001 From: Amily Wu Date: Mon, 19 Aug 2024 13:08:02 -0500 Subject: [PATCH 20/20] Fix lint err --- tuning/test_candidate_gen.py | 122 ++++++++++++++++++++++++++--------- 1 file changed, 93 insertions(+), 29 deletions(-) diff --git a/tuning/test_candidate_gen.py b/tuning/test_candidate_gen.py index 5adca21..ad9b97e 100644 --- a/tuning/test_candidate_gen.py +++ b/tuning/test_candidate_gen.py @@ -13,26 +13,50 @@ def test_get_shaped_type_element_bitwidth(): - assert candidate_gen.ShapedType([1024, 2048], candidate_gen.ElementType.i8).bitwidth == 8 - assert candidate_gen.ShapedType([2048], candidate_gen.ElementType.i32).bitwidth == 32 - assert candidate_gen.ShapedType([2048, 512, 384], candidate_gen.ElementType.f8).bitwidth == 8 - assert candidate_gen.ShapedType([1, 1], candidate_gen.ElementType.f16).bitwidth == 16 + assert ( + candidate_gen.ShapedType([1024, 2048], candidate_gen.ElementType.i8).bitwidth + == 8 + ) + assert ( + candidate_gen.ShapedType([2048], candidate_gen.ElementType.i32).bitwidth == 32 + ) + assert ( + candidate_gen.ShapedType( + [2048, 512, 384], candidate_gen.ElementType.f8 + ).bitwidth + == 8 + ) + assert ( + candidate_gen.ShapedType([1, 1], candidate_gen.ElementType.f16).bitwidth == 16 + ) def test_get_shaped_type_to_str(): - assert str(candidate_gen.ShapedType([1024, 2048], candidate_gen.ElementType.i8)) == "1024x2048xi8" - assert str(candidate_gen.ShapedType([1024], candidate_gen.ElementType.f32)) == "1024xf32" - assert str(candidate_gen.ShapedType([1, 2, 3], candidate_gen.ElementType.f16)) == "1x2x3xf16" - assert str(candidate_gen.ShapedType([-1, 2, 3], candidate_gen.ElementType.f16)) == "?x2x3xf16" + assert ( + str(candidate_gen.ShapedType([1024, 2048], candidate_gen.ElementType.i8)) + == "1024x2048xi8" + ) + assert ( + str(candidate_gen.ShapedType([1024], candidate_gen.ElementType.f32)) + == "1024xf32" + ) + assert ( + str(candidate_gen.ShapedType([1, 2, 3], candidate_gen.ElementType.f16)) + == "1x2x3xf16" + ) + assert ( + str(candidate_gen.ShapedType([-1, 2, 3], candidate_gen.ElementType.f16)) + == "?x2x3xf16" + ) def test_parse_tensor_type(): - assert candidate_gen.parse_tensor_type("tensor<1x2x3xf32>") == candidate_gen.ShapedType( - [1, 2, 3], candidate_gen.ElementType.f32 - ) - assert candidate_gen.parse_tensor_type("tensor<123xi8>") == candidate_gen.ShapedType( - [123], candidate_gen.ElementType.i8 - ) + assert candidate_gen.parse_tensor_type( + "tensor<1x2x3xf32>" + ) == candidate_gen.ShapedType([1, 2, 3], candidate_gen.ElementType.f32) + assert candidate_gen.parse_tensor_type( + "tensor<123xi8>" + ) == candidate_gen.ShapedType([123], candidate_gen.ElementType.i8) def test_get_mmt_tile_sizes(): @@ -74,7 +98,11 @@ def test_get_contract_tile_sizes(): assert candidate_gen.get_contract_tile_sizes(config, ["m", "n", "k"]) == [4, 8, 16] assert candidate_gen.get_contract_tile_sizes(config, ["n", "m", "k"]) == [8, 4, 
16] assert candidate_gen.get_contract_tile_sizes(config, ["k", "n", "m"]) == [16, 8, 4] - assert candidate_gen.get_contract_tile_sizes(config, ["k", "k", "k"]) == [16, 16, 16] + assert candidate_gen.get_contract_tile_sizes(config, ["k", "k", "k"]) == [ + 16, + 16, + 16, + ] def test_get_pipeline_config(): @@ -141,7 +169,9 @@ def test_get_shapes_contract(): r'%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%13, %14 : tensor<2048x1280xf16>, tensor<1280x1280xf16>) outs(%19 : tensor<2048x1280xf32>) attrs = {lowering_config = #iree_codegen.lowering_config} {', r"^bb0(%in: f16, %in_0: f16, %out: f32):", ] - assert candidate_gen.get_shapes_contract(template, "mk", "nk") == candidate_gen.ProblemSize( + assert candidate_gen.get_shapes_contract( + template, "mk", "nk" + ) == candidate_gen.ProblemSize( candidate_gen.MatmulSize(2048, 1280, 1280), candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f16), candidate_gen.ShapedType([1280, 1280], candidate_gen.ElementType.f16), @@ -156,7 +186,9 @@ def test_get_shapes_batch_matmul(): "%11 = linalg.batch_matmul ins(%8, %9 : tensor<1x32x1024xf32>, tensor<1x1024x32xf32>) outs(%10 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>", "flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 32, 32], strides = [1, 1, 1] : tensor<1x32x32xf32> -> !flow.dispatch.tensor>", ] - assert candidate_gen.get_shapes_batch_matmul(template, "bmk", "bkn") == candidate_gen.ProblemSize( + assert candidate_gen.get_shapes_batch_matmul( + template, "bmk", "bkn" + ) == candidate_gen.ProblemSize( candidate_gen.MatmulSize(32, 32, 1024, 1), candidate_gen.ShapedType([1, 32, 1024], candidate_gen.ElementType.f32), candidate_gen.ShapedType([1, 1024, 32], candidate_gen.ElementType.f32), @@ -181,8 +213,14 @@ def test_get_shapes_batch_mmt(): def test_mfma_intrinsic_to_str(): - assert str(candidate_gen.MfmaIntrinsic.mfma_f16_16x16x16_f32()) == "MFMA_F16_16x16x16_F32" - assert str(candidate_gen.MfmaIntrinsic.mfma_i8_32x32x16_i32()) == "MFMA_I8_32x32x16_I32" + assert ( + str(candidate_gen.MfmaIntrinsic.mfma_f16_16x16x16_f32()) + == "MFMA_F16_16x16x16_F32" + ) + assert ( + str(candidate_gen.MfmaIntrinsic.mfma_i8_32x32x16_i32()) + == "MFMA_I8_32x32x16_I32" + ) def test_get_compatible_mfma_intrinsics(): @@ -256,7 +294,8 @@ def test_calculate_shared_memory_usage_in_bytes(): matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt ) assert ( - candidate_gen.calculate_shared_memory_usage_in_bytes(problem_size, 512, 64, 128) == 81920 + candidate_gen.calculate_shared_memory_usage_in_bytes(problem_size, 512, 64, 128) + == 81920 ) rhs_type = candidate_gen.ShapedType([1024, 1024], candidate_gen.ElementType.i32) @@ -264,7 +303,8 @@ def test_calculate_shared_memory_usage_in_bytes(): matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt ) assert ( - candidate_gen.calculate_shared_memory_usage_in_bytes(problem_size, 128, 64, 32) == 12288 + candidate_gen.calculate_shared_memory_usage_in_bytes(problem_size, 128, 64, 32) + == 12288 ) @@ -277,11 +317,19 @@ def test_generate_constraints_valid_input(): matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt ) # Define input parameters as z3 Ints - m, n, k = candidate_gen.z3.Int("m"), candidate_gen.z3.Int("n"), candidate_gen.z3.Int("k") + m, n, k = ( + candidate_gen.z3.Int("m"), + candidate_gen.z3.Int("n"), + 
candidate_gen.z3.Int("k"), + ) subgroup_size = candidate_gen.z3.Int("subgroup_size") intrinsic_mn = candidate_gen.z3.Int("intrinsic_mn") intrinsic_k = candidate_gen.z3.Int("intrinsic_k") - wg_x, wg_y, wg_z = candidate_gen.z3.Int("wg_x"), candidate_gen.z3.Int("wg_y"), candidate_gen.z3.Int("wg_z") + wg_x, wg_y, wg_z = ( + candidate_gen.z3.Int("wg_x"), + candidate_gen.z3.Int("wg_y"), + candidate_gen.z3.Int("wg_z"), + ) sg_m_cnt = candidate_gen.z3.Int("sg_m_cnt") sg_n_cnt = candidate_gen.z3.Int("sg_n_cnt") waves_per_eu = candidate_gen.z3.Int("waves_per_eu") @@ -314,11 +362,19 @@ def test_generate_constraints_invalid_input(): problem_size = candidate_gen.ProblemSize( matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt ) - m, n, k = candidate_gen.z3.Int("m"), candidate_gen.z3.Int("n"), candidate_gen.z3.Int("k") + m, n, k = ( + candidate_gen.z3.Int("m"), + candidate_gen.z3.Int("n"), + candidate_gen.z3.Int("k"), + ) subgroup_size = candidate_gen.z3.Int("subgroup_size") intrinsic_mn = candidate_gen.z3.Int("intrinsic_mn") intrinsic_k = candidate_gen.z3.Int("intrinsic_k") - wg_x, wg_y, wg_z = candidate_gen.z3.Int("wg_x"), candidate_gen.z3.Int("wg_y"), candidate_gen.z3.Int("wg_z") + wg_x, wg_y, wg_z = ( + candidate_gen.z3.Int("wg_x"), + candidate_gen.z3.Int("wg_y"), + candidate_gen.z3.Int("wg_z"), + ) sg_m_cnt = candidate_gen.z3.Int("sg_m_cnt") sg_n_cnt = candidate_gen.z3.Int("sg_n_cnt") waves_per_eu = candidate_gen.z3.Int("waves_per_eu") @@ -370,7 +426,9 @@ def test_apply_params_mmt(): candidate_gen.ShapedType([M, N], candidate_gen.ElementType.f32), candidate_gen.DispatchKind.mmt, ) - modified, embeddable = candidate_gen.apply_params_mmt(problem_size, mlir_template, config) + modified, embeddable = candidate_gen.apply_params_mmt( + problem_size, mlir_template, config + ) assert modified assert embeddable @@ -408,12 +466,16 @@ def test_apply_params_conv(): problem_size = candidate_gen.ProblemSize( candidate_gen.MatmulSize(oh * ow, oc, fh * fw * ic), - candidate_gen.ShapedType([n, oh + 2, ow + 2, oc], candidate_gen.ElementType.f16), + candidate_gen.ShapedType( + [n, oh + 2, ow + 2, oc], candidate_gen.ElementType.f16 + ), candidate_gen.ShapedType([fh, fw, ic, oc], candidate_gen.ElementType.f16), candidate_gen.ShapedType([n, oh, ow, oc], candidate_gen.ElementType.f32), candidate_gen.DispatchKind.conv, ) - modified, embeddable = candidate_gen.apply_params_conv(problem_size, mlir_template, config) + modified, embeddable = candidate_gen.apply_params_conv( + problem_size, mlir_template, config + ) assert modified assert embeddable @@ -717,4 +779,6 @@ def test_parse_mlir(): mlir_module = candidate_gen.parse_mlir(mlir_str) assert mlir_module != None assert isinstance(mlir_module, candidate_gen.ireec._mlir_libs._mlir.ir.Module) - assert isinstance(mlir_module.body.operations[0], candidate_gen.ireec.dialects.func.FuncOp) + assert isinstance( + mlir_module.body.operations[0], candidate_gen.ireec.dialects.func.FuncOp + )
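For context on the renamed API that these tests exercise, the snippet below is a minimal usage sketch assembled from test_generate_solutions above. It is illustrative only: it assumes candidate_gen is importable (e.g. when run from the tuning/ directory, as the tests do), and the meaning of the final integer argument to generate_solutions is taken as-is from the test rather than from any documentation in this series.

    import candidate_gen

    # Describe an mmt problem, mirroring test_generate_solutions above.
    matmul_size = candidate_gen.MatmulSize(2048, 3840, 1280)
    lhs_type = candidate_gen.ShapedType([2048, 1280], candidate_gen.ElementType.f16)
    rhs_type = candidate_gen.ShapedType([3840, 1280], candidate_gen.ElementType.f16)
    res_type = candidate_gen.ShapedType([2048, 3840], candidate_gen.ElementType.f32)
    problem_size = candidate_gen.ProblemSize(
        matmul_size, lhs_type, rhs_type, res_type, candidate_gen.DispatchKind.mmt
    )

    # Generate candidate configurations; the second argument (4) is passed
    # exactly as in the test, its semantics are not defined in this patch series.
    configs = candidate_gen.generate_solutions(problem_size, 4)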