From 0046184026ab4bf607ddf67f84ff383b3101e72e Mon Sep 17 00:00:00 2001 From: aekul Date: Tue, 5 Apr 2022 00:56:16 -0400 Subject: [PATCH 01/63] Add GPU autoscheduler --- apps/cuda_mat_mul/mat_mul_generator.cpp | 52 +- src/autoschedulers/CMakeLists.txt | 1 + src/autoschedulers/anderson2021/ASLog.cpp | 52 + src/autoschedulers/anderson2021/ASLog.h | 37 + .../anderson2021/AutoSchedule.cpp | 715 +++ .../anderson2021/AutoSchedule.h | 102 + src/autoschedulers/anderson2021/CostModel.h | 43 + .../anderson2021/DefaultCostModel.cpp | 401 ++ .../anderson2021/DefaultCostModel.h | 78 + src/autoschedulers/anderson2021/Errors.h | 26 + .../anderson2021/Featurization.h | 507 +++ .../anderson2021/FunctionDAG.cpp | 1283 ++++++ src/autoschedulers/anderson2021/FunctionDAG.h | 674 +++ .../anderson2021/GPULoopInfo.cpp | 108 + src/autoschedulers/anderson2021/GPULoopInfo.h | 55 + src/autoschedulers/anderson2021/GPUMemInfo.h | 412 ++ src/autoschedulers/anderson2021/LoopNest.cpp | 4024 +++++++++++++++++ src/autoschedulers/anderson2021/LoopNest.h | 582 +++ .../anderson2021/LoopNestParser.h | 212 + src/autoschedulers/anderson2021/Makefile | 292 ++ src/autoschedulers/anderson2021/NetworkSize.h | 12 + .../anderson2021/PerfectHashMap.h | 415 ++ .../anderson2021/SearchSpace.cpp | 685 +++ src/autoschedulers/anderson2021/SearchSpace.h | 111 + .../anderson2021/SearchSpaceOptions.h | 68 + src/autoschedulers/anderson2021/State.cpp | 1291 ++++++ src/autoschedulers/anderson2021/State.h | 253 ++ src/autoschedulers/anderson2021/Statistics.h | 133 + src/autoschedulers/anderson2021/ThreadInfo.h | 275 ++ src/autoschedulers/anderson2021/Tiling.cpp | 331 ++ src/autoschedulers/anderson2021/Tiling.h | 58 + src/autoschedulers/anderson2021/Weights.cpp | 170 + src/autoschedulers/anderson2021/Weights.h | 54 + .../anderson2021/autotune_loop.sh | 575 +++ .../anderson2021/baseline.weights | Bin 0 -> 20400 bytes .../anderson2021/check_weights.cpp | 59 + .../anderson2021/cost_model_generator.cpp | 676 +++ 
.../anderson2021/cost_model_schedule.h | 620 +++ .../anderson2021/demo_generator.cpp | 51 + .../anderson2021/featurization_to_sample.cpp | 42 + .../anderson2021/generate_data.sh | 211 + .../anderson2021/get_host_target.cpp | 21 + .../included_schedule_file.schedule.h | 60 + .../included_schedule_file_generator.cpp | 54 + .../anderson2021/retrain_cost_model.cpp | 728 +++ .../anderson2021/scripts/average_times.sh | 67 + .../anderson2021/scripts/predict_all.sh | 37 + .../anderson2021/scripts/utils.sh | 410 ++ src/autoschedulers/anderson2021/test.cpp | 435 ++ .../anderson2021/test/bounds.cpp | 319 ++ .../anderson2021/test/parser.cpp | 20 + .../anderson2021/test/state.cpp | 51 + .../anderson2021/test/storage_strides.cpp | 462 ++ src/autoschedulers/anderson2021/test/test.h | 33 + .../anderson2021/test/thread_info.cpp | 108 + .../anderson2021/test/tiling.cpp | 192 + .../anderson2021/test_function_dag.cpp | 174 + .../anderson2021/test_perfect_hash_map.cpp | 71 + .../anderson2021/weights/README.md | 6 + .../anderson2021/weights/bgu.weights | Bin 0 -> 20400 bytes .../weights/bilateral_grid.weights | Bin 0 -> 20400 bytes .../anderson2021/weights/camera_pipe.weights | Bin 0 -> 20400 bytes .../anderson2021/weights/conv_layer.weights | Bin 0 -> 20400 bytes .../anderson2021/weights/cuda_mat_mul.weights | Bin 0 -> 20400 bytes .../weights/depthwise_separable_conv.weights | Bin 0 -> 20400 bytes .../anderson2021/weights/gpu.weights | Bin 0 -> 20400 bytes .../anderson2021/weights/harris.weights | Bin 0 -> 20400 bytes .../anderson2021/weights/hist.weights | Bin 0 -> 20400 bytes .../anderson2021/weights/iir_blur.weights | Bin 0 -> 20400 bytes .../anderson2021/weights/interpolate.weights | Bin 0 -> 20400 bytes .../anderson2021/weights/lens_blur.weights | Bin 0 -> 20400 bytes .../weights/local_laplacian.weights | Bin 0 -> 20400 bytes .../anderson2021/weights/max_filter.weights | Bin 0 -> 20400 bytes .../anderson2021/weights/nl_means.weights | Bin 0 -> 20400 bytes 
.../weights/stencil_chain.weights | Bin 0 -> 20400 bytes .../anderson2021/weights/unsharp.weights | Bin 0 -> 20400 bytes .../weightsdir_to_weightsfile.cpp | 29 + 77 files changed, 19002 insertions(+), 21 deletions(-) create mode 100644 src/autoschedulers/anderson2021/ASLog.cpp create mode 100644 src/autoschedulers/anderson2021/ASLog.h create mode 100644 src/autoschedulers/anderson2021/AutoSchedule.cpp create mode 100644 src/autoschedulers/anderson2021/AutoSchedule.h create mode 100644 src/autoschedulers/anderson2021/CostModel.h create mode 100644 src/autoschedulers/anderson2021/DefaultCostModel.cpp create mode 100644 src/autoschedulers/anderson2021/DefaultCostModel.h create mode 100644 src/autoschedulers/anderson2021/Errors.h create mode 100644 src/autoschedulers/anderson2021/Featurization.h create mode 100644 src/autoschedulers/anderson2021/FunctionDAG.cpp create mode 100644 src/autoschedulers/anderson2021/FunctionDAG.h create mode 100644 src/autoschedulers/anderson2021/GPULoopInfo.cpp create mode 100644 src/autoschedulers/anderson2021/GPULoopInfo.h create mode 100644 src/autoschedulers/anderson2021/GPUMemInfo.h create mode 100644 src/autoschedulers/anderson2021/LoopNest.cpp create mode 100644 src/autoschedulers/anderson2021/LoopNest.h create mode 100644 src/autoschedulers/anderson2021/LoopNestParser.h create mode 100644 src/autoschedulers/anderson2021/Makefile create mode 100644 src/autoschedulers/anderson2021/NetworkSize.h create mode 100644 src/autoschedulers/anderson2021/PerfectHashMap.h create mode 100644 src/autoschedulers/anderson2021/SearchSpace.cpp create mode 100644 src/autoschedulers/anderson2021/SearchSpace.h create mode 100644 src/autoschedulers/anderson2021/SearchSpaceOptions.h create mode 100644 src/autoschedulers/anderson2021/State.cpp create mode 100644 src/autoschedulers/anderson2021/State.h create mode 100644 src/autoschedulers/anderson2021/Statistics.h create mode 100644 src/autoschedulers/anderson2021/ThreadInfo.h create mode 100644 
src/autoschedulers/anderson2021/Tiling.cpp create mode 100644 src/autoschedulers/anderson2021/Tiling.h create mode 100644 src/autoschedulers/anderson2021/Weights.cpp create mode 100644 src/autoschedulers/anderson2021/Weights.h create mode 100644 src/autoschedulers/anderson2021/autotune_loop.sh create mode 100644 src/autoschedulers/anderson2021/baseline.weights create mode 100644 src/autoschedulers/anderson2021/check_weights.cpp create mode 100644 src/autoschedulers/anderson2021/cost_model_generator.cpp create mode 100644 src/autoschedulers/anderson2021/cost_model_schedule.h create mode 100644 src/autoschedulers/anderson2021/demo_generator.cpp create mode 100644 src/autoschedulers/anderson2021/featurization_to_sample.cpp create mode 100644 src/autoschedulers/anderson2021/generate_data.sh create mode 100644 src/autoschedulers/anderson2021/get_host_target.cpp create mode 100644 src/autoschedulers/anderson2021/included_schedule_file.schedule.h create mode 100644 src/autoschedulers/anderson2021/included_schedule_file_generator.cpp create mode 100644 src/autoschedulers/anderson2021/retrain_cost_model.cpp create mode 100644 src/autoschedulers/anderson2021/scripts/average_times.sh create mode 100644 src/autoschedulers/anderson2021/scripts/predict_all.sh create mode 100644 src/autoschedulers/anderson2021/scripts/utils.sh create mode 100644 src/autoschedulers/anderson2021/test.cpp create mode 100644 src/autoschedulers/anderson2021/test/bounds.cpp create mode 100644 src/autoschedulers/anderson2021/test/parser.cpp create mode 100644 src/autoschedulers/anderson2021/test/state.cpp create mode 100644 src/autoschedulers/anderson2021/test/storage_strides.cpp create mode 100644 src/autoschedulers/anderson2021/test/test.h create mode 100644 src/autoschedulers/anderson2021/test/thread_info.cpp create mode 100644 src/autoschedulers/anderson2021/test/tiling.cpp create mode 100644 src/autoschedulers/anderson2021/test_function_dag.cpp create mode 100644 
src/autoschedulers/anderson2021/test_perfect_hash_map.cpp create mode 100644 src/autoschedulers/anderson2021/weights/README.md create mode 100644 src/autoschedulers/anderson2021/weights/bgu.weights create mode 100644 src/autoschedulers/anderson2021/weights/bilateral_grid.weights create mode 100644 src/autoschedulers/anderson2021/weights/camera_pipe.weights create mode 100644 src/autoschedulers/anderson2021/weights/conv_layer.weights create mode 100644 src/autoschedulers/anderson2021/weights/cuda_mat_mul.weights create mode 100644 src/autoschedulers/anderson2021/weights/depthwise_separable_conv.weights create mode 100644 src/autoschedulers/anderson2021/weights/gpu.weights create mode 100644 src/autoschedulers/anderson2021/weights/harris.weights create mode 100644 src/autoschedulers/anderson2021/weights/hist.weights create mode 100644 src/autoschedulers/anderson2021/weights/iir_blur.weights create mode 100644 src/autoschedulers/anderson2021/weights/interpolate.weights create mode 100644 src/autoschedulers/anderson2021/weights/lens_blur.weights create mode 100644 src/autoschedulers/anderson2021/weights/local_laplacian.weights create mode 100644 src/autoschedulers/anderson2021/weights/max_filter.weights create mode 100644 src/autoschedulers/anderson2021/weights/nl_means.weights create mode 100644 src/autoschedulers/anderson2021/weights/stencil_chain.weights create mode 100644 src/autoschedulers/anderson2021/weights/unsharp.weights create mode 100644 src/autoschedulers/anderson2021/weightsdir_to_weightsfile.cpp diff --git a/apps/cuda_mat_mul/mat_mul_generator.cpp b/apps/cuda_mat_mul/mat_mul_generator.cpp index 1214ffbbbe8f..68001ce7a135 100644 --- a/apps/cuda_mat_mul/mat_mul_generator.cpp +++ b/apps/cuda_mat_mul/mat_mul_generator.cpp @@ -34,28 +34,38 @@ class MatMul : public Halide::Generator { Var xi, yi, xio, xii, yii, xo, yo, x_pair, xiio, ty; RVar rxo, rxi; - out.bound(x, 0, size) - .bound(y, 0, size) - .tile(x, y, xi, yi, 64, 16) - .tile(xi, yi, xii, yii, 4, 8) - 
.gpu_blocks(x, y) - .gpu_threads(xi, yi) - .unroll(xii) - .unroll(yii); - prod.compute_at(out, xi) - .vectorize(x) - .unroll(y) - .update() - .reorder(x, y, r) - .vectorize(x) - .unroll(y) - .unroll(r, 8); - A.in().compute_at(prod, r).vectorize(_0).unroll(_1); - B.in().compute_at(prod, r).vectorize(_0).unroll(_1); + if (!auto_schedule) { + out.bound(x, 0, size) + .bound(y, 0, size) + .tile(x, y, xi, yi, 64, 16) + .tile(xi, yi, xii, yii, 4, 8) + .gpu_blocks(x, y) + .gpu_threads(xi, yi) + .unroll(xii) + .unroll(yii); + prod.compute_at(out, xi) + .vectorize(x) + .unroll(y) + .update() + .reorder(x, y, r) + .vectorize(x) + .unroll(y) + .unroll(r, 8); + A.in().compute_at(prod, r).vectorize(_0).unroll(_1); + B.in().compute_at(prod, r).vectorize(_0).unroll(_1); - set_alignment_and_bounds(A, size); - set_alignment_and_bounds(B, size); - set_alignment_and_bounds(out, size); + set_alignment_and_bounds(A, size); + set_alignment_and_bounds(B, size); + set_alignment_and_bounds(out, size); + } else { + A.dim(0).set_estimate(0, size).dim(1).set_estimate(0, size); + B.dim(0).set_estimate(0, size).dim(1).set_estimate(0, size); + } + + // Always specify bounds for outputs, whether autoscheduled or not + out + .bound(x, 0, size) + .bound(y, 0, size); } }; diff --git a/src/autoschedulers/CMakeLists.txt b/src/autoschedulers/CMakeLists.txt index 9b88f0a664a1..6d2cb033ea7b 100644 --- a/src/autoschedulers/CMakeLists.txt +++ b/src/autoschedulers/CMakeLists.txt @@ -27,3 +27,4 @@ add_subdirectory(common) add_subdirectory(adams2019) add_subdirectory(li2018) add_subdirectory(mullapudi2016) +add_subdirectory(anderson2021) diff --git a/src/autoschedulers/anderson2021/ASLog.cpp b/src/autoschedulers/anderson2021/ASLog.cpp new file mode 100644 index 000000000000..601ceabd3c71 --- /dev/null +++ b/src/autoschedulers/anderson2021/ASLog.cpp @@ -0,0 +1,52 @@ +#include "ASLog.h" + +namespace Halide { +namespace Internal { + +namespace { + +std::string get_env_variable(char const *env_var_name) { + if 
(!env_var_name) { + return ""; + } + +#ifdef _MSC_VER + // call getenv_s without a buffer to determine the correct string length: + size_t length = 0; + if ((getenv_s(&length, NULL, 0, env_var_name) != 0) || (length == 0)) { + return ""; + } + // call it again to retrieve the value of the environment variable; + // note that 'length' already accounts for the null-terminator + std::string lvl(length - 1, '@'); + size_t read = 0; + if ((getenv_s(&read, &lvl[0], length, env_var_name) != 0) || (read != length)) { + return ""; + } + return lvl; +#else + char *lvl = getenv(env_var_name); + if (lvl) return std::string(lvl); +#endif + + return ""; +} + +} // namespace + +int aslog::aslog_level() { + static int cached_aslog_level = ([]() -> int { + // If HL_DEBUG_AUTOSCHEDULE is defined, use that value. + std::string lvl = get_env_variable("HL_DEBUG_AUTOSCHEDULE"); + if (!lvl.empty()) { + return atoi(lvl.c_str()); + } + // Otherwise, use HL_DEBUG_CODEGEN. + lvl = get_env_variable("HL_DEBUG_CODEGEN"); + return !lvl.empty() ? 
atoi(lvl.c_str()) : 0; + })(); + return cached_aslog_level; +} + +} // namespace Internal +} // namespace Halide diff --git a/src/autoschedulers/anderson2021/ASLog.h b/src/autoschedulers/anderson2021/ASLog.h new file mode 100644 index 000000000000..9ba9844ce342 --- /dev/null +++ b/src/autoschedulers/anderson2021/ASLog.h @@ -0,0 +1,37 @@ +#ifndef ASLOG_H +#define ASLOG_H + +// This class is used by train_cost_model, which doesn't link to +// libHalide, so (despite the namespace) we are better off not +// including Halide.h, lest we reference something we won't have available + +#include +#include +#include + +namespace Halide { +namespace Internal { + +class aslog { + const bool logging; + +public: + aslog(int verbosity) + : logging(verbosity <= aslog_level()) { + } + + template + aslog &operator<<(T &&x) { + if (logging) { + std::cerr << std::forward(x); + } + return *this; + } + + static int aslog_level(); +}; + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/autoschedulers/anderson2021/AutoSchedule.cpp b/src/autoschedulers/anderson2021/AutoSchedule.cpp new file mode 100644 index 000000000000..70b84757bfd2 --- /dev/null +++ b/src/autoschedulers/anderson2021/AutoSchedule.cpp @@ -0,0 +1,715 @@ +/* + This file is the core of the autoscheduler. Most of the code here is + about navigating the search space and computing the + featurization. This also contains the top-level interface into the + autoscheduler. + + The most interesting classes to look at are: + + LoopNest Represents one node in our tree representation of loop nests. + State A state in the beam search. Holds a root loop nest. 
+ + Interesting functions below are: + + generate_schedule The top-level entrypoint, which computes and applies a schedule to a Halide pipeline + optimal_schedule Runs the passes of the coarse-to-fine beam search + optimal_schedule_pass Runs a single pass of beam search + LoopNest::compute_features Recursively walks over a loop nest tree, computing our featurization using Halide's analysis tools. + LoopNest::apply Actually apply a computed schedule to a Halide pipeline + State::generate_children Generates successor states to a state in the beam search + + Environment variables used (directly or indirectly): + + HL_BEAM_SIZE + Beam size to use in the beam search. Defaults to 32. Use 1 to get a greedy search instead. + + HL_CYOS + "Choose-your-own-schedule". If set to 1, lets you navigate the search tree by hand in the terminal. Whee! This is for debugging the autoscheduler. + + HL_FEATURE_FILE -> output + *** DEPRECATED *** use the 'featurization' output from Generator instead + Write out a training featurization for the selected schedule into this file. + Needs to be converted to a sample file with the runtime using featurization_to_sample before it can be used to train. + + HL_MACHINE_PARAMS + An architecture description string. Used by Halide master to configure the cost model. We only use the first term. Set it to the number of cores to target. + + HL_PERMIT_FAILED_UNROLL + Set to 1 to tell Halide not to freak out if we try to unroll a loop that doesn't have a constant extent. Should generally not be necessary, but sometimes the autoscheduler's model for what will and will not turn into a constant during lowering is inaccurate, because Halide isn't perfect at constant-folding. + + HL_SCHEDULE_FILE + *** DEPRECATED *** use the 'schedule' output from Generator instead + Write out a human-and-machine readable block of scheduling source code for the selected schedule into this file. + + HL_RANDOM_DROPOUT + percent chance of accepting each state in the beam. 
Normalized by the number of decisions made, so a value of 5 means there is a 5 percent chance of never rejecting any states. + + HL_SEED + Random seed used by the random dropout. + + HL_WEIGHTS_DIR + When training or scheduling, read weights from this directory or file + (if the path ends in `.weights` it is treated as a single file, otherwise as a directory of files) + + HL_NO_SUBTILING + If set to 1, limits the search space to that of Mullapudi et al. + + HL_DEBUG_AUTOSCHEDULE + If set, is used for the debug log level for auto-schedule generation (overriding the + value of HL_DEBUG_CODEGEN, if any). + + HL_SEARCH_SPACE_OPTIONS + Allow/disallow search space options to be considered by the autoscheduler. + Expects a string of four 0/1 values that allow/disallow the following options: compute root, inline, compute at the block level, compute at the thread level, e.g. 1000 would allow compute root only + + HL_RANDOMIZE_TILINGS + If set, only a random subset of the generated tilings for each stage will be accepted into the beam + + HL_FREEZE_INLINE_COMPUTE_ROOT + If set, run a pre-pass where only compute_root and inline scheduling options are considered. The cheapest stages (according to the cost model) have these decisions 'frozen' for the remaining autoscheduling passes + + TODO: expose these settings by adding some means to pass args to + generator plugins instead of environment vars. 
+*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ASLog.h" +#include "AutoSchedule.h" +#include "CostModel.h" +#include "DefaultCostModel.h" +#include "Errors.h" +#include "Featurization.h" +#include "FunctionDAG.h" +#include "Halide.h" +#include "LoopNest.h" +#include "LoopNestParser.h" +#include "NetworkSize.h" +#include "PerfectHashMap.h" +#include "State.h" + +#ifdef _WIN32 +#include +#define _isatty isatty; +#endif + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +using std::string; +using std::vector; +using std::map; +using std::pair; +using std::set; +using std::string; +using std::vector; + +// Get the HL_RANDOM_DROPOUT environment variable. Purpose of this is described above. +double get_dropout_threshold() { + string random_dropout_str = get_env_variable("HL_RANDOM_DROPOUT"); + if (!random_dropout_str.empty()) { + return atof(random_dropout_str.c_str()); + } else { + return 100; + } +} + +// Decide whether or not to drop a beam search state. Used for +// randomly exploring the search tree for autotuning and to generate +// training data. +bool random_dropout(std::mt19937 &rng, size_t num_decisions) { + static double random_dropout_threshold = std::max(0.0, get_dropout_threshold()); + if (random_dropout_threshold >= 100) return false; + + // The random dropout threshold is the chance that we operate + // entirely greedily and never discard anything. + double t = random_dropout_threshold; + t /= 100; + t = std::pow(t, 1.0f / num_decisions); + t *= 100; + + double r = rng() % 100; + bool drop_it = r >= t; + return drop_it; +} + +// Get the HL_SEARCH_SPACE_OPTIONS environment variable. Described above +std::string get_search_space_options() { + std::string options = get_env_variable("HL_SEARCH_SPACE_OPTIONS"); + if (options.empty()) { + return "1111"; + } + return options; +} + +// Configure a cost model to process a specific pipeline. 
+void configure_pipeline_features(const FunctionDAG &dag,
+                                 const MachineParams &params,
+                                 CostModel *cost_model) {
+    // cost_model is dereferenced unconditionally: callers must pass a
+    // non-null model.
+    cost_model->reset();
+    cost_model->set_pipeline_features(dag, params);
+}
+
+// Stores references/pointers to the collaborators used by the search and
+// primes the cost model with the pipeline features of 'dag'. Nothing is
+// copied or owned here; 'cost_model' must be non-null because
+// configure_pipeline_features dereferences it.
+AutoSchedule::AutoSchedule(const FunctionDAG &dag,
+                           const MachineParams &params,
+                           const Target &target,
+                           const std::vector<Function> &outputs,
+                           std::mt19937 &rng,
+                           CostModel *cost_model,
+                           Statistics &stats,
+                           SearchSpace &search_space,
+                           const LoopNestParser *partial_schedule)
+    : dag{dag},
+      params{params},
+      target{target},
+      outputs{outputs},
+      rng{rng},
+      cost_model{cost_model},
+      stats{stats},
+      search_space{search_space},
+      partial_schedule{partial_schedule} {
+    configure_pipeline_features(dag, params, cost_model);
+}
+
+// A single pass of coarse-to-fine beam search.
+//
+// beam_size        maximum number of states expanded per decision step
+// pass_idx         index of this pass; -1 denotes the pre-pass (passed on to
+//                  generate_children as its final flag)
+// num_passes       total number of passes; hashing penalties are only applied
+//                  when there is more than one
+// tick             progress bar updated as children are enqueued
+// permitted_hashes structural hashes blessed by earlier passes; updated with
+//                  the hashes blessed by this pass
+//
+// Returns the first completed state popped from the priority queue. Raises an
+// internal error if the beam runs out of legal states.
+IntrusivePtr<State> AutoSchedule::optimal_schedule_pass(int beam_size,
+                                                        int pass_idx,
+                                                        int num_passes,
+                                                        ProgressBar &tick,
+                                                        std::unordered_set<uint64_t> &permitted_hashes) {
+    StateQueue q, pending;
+
+    // The initial state, with no decisions made
+    {
+        IntrusivePtr<State> initial{new State};
+        initial->root = new LoopNest;
+        q.emplace(std::move(initial));
+    }
+
+    int expanded = 0;
+
+    std::function<void(IntrusivePtr<State> &&)> enqueue_new_children =
+        [&](IntrusivePtr<State> &&s) {
+            // aslog(0) << "\n** Generated child: ";
+            // s->dump();
+            // s->calculate_cost(dag, params, nullptr, true);
+
+            // Each child should have one more decision made than its parent state.
+            internal_assert(s->num_decisions_made == s->parent->num_decisions_made + 1);
+
+            int progress = s->num_decisions_made * beam_size + expanded;
+            size_t max_progress = dag.nodes.size() * beam_size * 2;
+
+            // Update the progress bar
+            tick.set(double(progress) / max_progress);
+            s->penalized = false;
+
+            ++stats.num_states_added;
+
+            // Add the state to the list of states to evaluate
+            q.emplace(std::move(s));
+        };
+
+    string cyos_str = get_env_variable("HL_CYOS");
+    string cyos_from_file_str = get_env_variable("HL_CYOS_FROM_FILE");
+    bool cyos_from_file = !cyos_from_file_str.empty();
+    bool cyos_is_enabled = cyos_from_file || cyos_str == "1";
+
+    std::unique_ptr<LoopNestParser> target_loop_nest;
+    if (cyos_from_file) {
+        target_loop_nest = LoopNestParser::from_file(cyos_from_file_str);
+    }
+
+    // This loop is beam search over the sequence of decisions to make.
+    for (;;) {
+        std::unordered_map<uint64_t, int> hashes;
+        q.swap(pending);
+
+        if (pending.empty()) {
+            if ((false) && beam_size < 1000) {  // Intentional dead code. Extra parens to pacify clang-tidy.
+                // Total mortality. Double the beam size and
+                // restart. Disabled for now because total mortality
+                // may indicate a bug.
+                return optimal_schedule_pass(beam_size * 2,
+                                             pass_idx,
+                                             num_passes,
+                                             tick,
+                                             permitted_hashes);
+            } else {
+                internal_error << "Ran out of legal states with beam size " << beam_size << "\n";
+            }
+        }
+
+        if ((int)pending.size() > beam_size * 10000) {
+            aslog(0) << "Warning: Huge number of states generated (" << pending.size() << ").\n";
+        }
+
+        expanded = 0;
+        while (expanded < beam_size && !pending.empty()) {
+
+            IntrusivePtr<State> state{pending.pop()};
+
+            if (beam_size > 1 && num_passes > 1 && pass_idx >= 0) {
+                // We are doing coarse-to-fine beam search using the
+                // hashing strategy mentioned in the paper.
+                //
+                // We will lazily apply cost penalties to the queue
+                // according to structural uniqueness.
+                if (!state->penalized) {
+                    uint64_t h1 = state->structural_hash(pass_idx + 1);
+                    uint64_t h0 = state->structural_hash(pass_idx - 1);
+                    // We penalize the cost of a state proportionately
+                    // to how many states we've already seen with that
+                    // hash.
+                    int penalty = ++hashes[h1];
+                    if (pass_idx > 0 && !permitted_hashes.count(h0)) {
+                        // It's possible to get yourself into a state
+                        // where the only things in the beam that match
+                        // the hash were quick-rejected due to details not
+                        // captured in the hash, so we apply a huge
+                        // penalty, but leave the impermissible state in
+                        // the beam.
+                        penalty += 10;
+                    }
+                    if (penalty > 1) {
+                        state->penalized = true;
+                        state->cost *= penalty;
+                        for (auto &c : state->cost_per_stage) {
+                            c *= penalty;
+                        }
+                        // After penalizing this state, if it's no
+                        // longer the best, defer it. We set the
+                        // 'penalized' flag so that we know not to
+                        // penalize and defer it again.
+                        if (!pending.empty() && state->cost > pending.top()->cost) {
+                            pending.emplace(std::move(state));
+                            continue;
+                        }
+                    }
+                }
+            }
+
+            // Random dropout
+            if (pending.size() > 1 && random_dropout(rng, dag.nodes.size() * 2)) {
+                continue;
+            }
+
+            if (state->num_decisions_made == 2 * (int)dag.nodes.size()) {
+                // We've reached the end of the pass. The first state
+                // must be the best, because we're pulling off a
+                // priority queue.
+                auto best = state;
+
+                // Bless the reasonable stuff in the beam as
+                // permissible states to visit again. We define
+                // reasonable as having a cost no more than 20% higher
+                // than the cost of the best thing. Only do this if
+                // there are more coarse-to-fine passes yet to come.
+                if (pass_idx >= 0 && pass_idx + 1 < num_passes) {
+                    int blessed = 0;
+                    while (state->cost <= 1.2 * best->cost && blessed < beam_size) {
+                        const State *s = state.get();
+                        while (s) {
+                            uint64_t h1 = s->structural_hash(pass_idx);
+                            permitted_hashes.insert(h1);
+                            s = s->parent.get();
+                        }
+                        if (pending.empty()) {
+                            break;
+                        }
+                        state = pending.pop();
+                        blessed++;
+                    }
+                }
+
+                return best;
+            }
+
+            Timer timer;
+            search_space.generate_children(state, enqueue_new_children, pass_idx, pass_idx == -1);
+            stats.generate_children_time += timer.elapsed();
+            expanded++;
+        }
+
+        // Drop the other states unconsidered.
+        pending.clear();
+
+        if (q.empty()) {
+            // No child state was produced this round. Everything below
+            // indexes q[0], so go straight back to the top of the loop,
+            // where the empty queue is reported as "Ran out of legal states".
+            continue;
+        }
+
+        int cur_node = (q[0]->num_decisions_made - 1) / 2;
+        const FunctionDAG::Node *node = &dag.nodes[cur_node];
+        if (partial_schedule && partial_schedule->is_in_partial_schedule(node)) {
+            // The node just scheduled is pinned by the partial schedule:
+            // keep only a candidate that matches it.
+            bool found = false;
+            for (int idx = (int)q.size() - 1; idx >= 0; idx--) {
+                auto state = q[idx];
+                LoopNestParser option = LoopNestParser::from_string(state->root->to_string());
+
+                if (partial_schedule->contains_sub_loop_nest_for_shared_stages(option)) {
+                    if (cost_model) {
+                        cost_model->evaluate_costs();
+                    }
+
+                    q.clear();
+                    q.emplace(std::move(state));
+                    found = true;
+                    break;
+                }
+            }
+
+            if (found) {
+                continue;
+            }
+
+            aslog(0) << "Options:\n";
+            for (int idx = (int)q.size() - 1; idx >= 0; idx--) {
+                auto state = q[idx];
+                LoopNestParser option = LoopNestParser::from_string(state->root->to_string());
+                aslog(0) << "Option " << idx << ":\n";
+                option.dump();
+            }
+            aslog(0) << "\nTarget partial schedule:\n";
+            partial_schedule->dump();
+            internal_assert(false) << "Partial schedule not found";
+        }
+
+        if (cost_model) {
+            // Now evaluate all the costs and re-sort them in the priority queue
+            Timer timer;
+            cost_model->evaluate_costs();
+            stats.cost_model_evaluation_time += timer.elapsed();
+            q.resort();
+        }
+
+        if (cyos_is_enabled) {
+            int selection = -1;
+            bool found = false;
+            if (cyos_from_file) {
+                for (int choice_label = (int)q.size() - 1; choice_label >= 0; choice_label--) {
+                    auto state = q[choice_label];
+                    LoopNestParser option = LoopNestParser::from_string(state->root->to_string());
+
+                    if (target_loop_nest->contains_sub_loop_nest(option)) {
+                        found = true;
+                        selection = choice_label;
+                        aslog(0) << "\nFound matching option\n";
+                        break;
+                    }
+                }
+            }
+
+            if (!cyos_from_file || !found) {
+                // The user has set HL_CYOS, and wants to navigate the
+                // search space manually. Discard everything in the queue
+                // except for the user-chosen option.
+                aslog(0) << "\n--------------------\n";
+                aslog(0) << "Select a schedule:\n";
+                for (int choice_label = (int)q.size() - 1; choice_label >= 0; choice_label--) {
+                    auto state = q[choice_label];
+                    aslog(0) << "\n[" << choice_label << "]:\n";
+                    state->dump();
+                }
+
+                int next_node = q[0]->num_decisions_made / 2;
+                if (next_node < (int)dag.nodes.size()) {
+                    const FunctionDAG::Node *next = &dag.nodes[next_node];
+                    aslog(0) << "\nNext node to be scheduled: " << next->func.name() << "\n";
+                }
+            }
+
+            // Guarded like every other use of cost_model in this function.
+            if (cost_model) {
+                cost_model->evaluate_costs();
+            }
+
+            if (cyos_from_file && !found) {
+                aslog(0) << "\nTarget loop nest was not found.\n";
+            }
+
+            if (!cyos_from_file || !found) {
+                // Select next partial schedule to expand.
+                while (selection < 0 || selection >= (int)q.size()) {
+                    aslog(0) << "\nEnter selection: ";
+                    if (!(std::cin >> selection)) {
+                        // Once the stream has failed, later extractions leave
+                        // 'selection' untouched, so without this the loop
+                        // would spin forever. Give up at end of input;
+                        // otherwise discard the bad line and ask again.
+                        internal_assert(!std::cin.eof()) << "Reached end of input while waiting for a schedule selection\n";
+                        std::cin.clear();
+                        string discarded;
+                        std::getline(std::cin, discarded);
+                        selection = -1;
+                    }
+                }
+            }
+
+            auto selected = q[selection];
+            selected->dump();
+            q.clear();
+            q.emplace(std::move(selected));
+        }
+    }
+}
+
+// Perform coarse-to-fine beam search and return the best state found.
+//
+// Runs optimal_schedule_pass once per pass and returns the lowest-cost result
+// among the passes with a non-negative index. HL_CYOS / HL_CYOS_FROM_FILE
+// force a single pass, HL_NUM_PASSES overrides the pass count, and
+// HL_FREEZE_INLINE_COMPUTE_ROOT=1 turns the first of several passes into a
+// pre-pass (index -1) whose result is handed to
+// search_space.freeze_lowest_cost_stages and is never returned.
+IntrusivePtr<State> AutoSchedule::optimal_schedule(int beam_size) {
+    IntrusivePtr<State> best;
+
+    std::unordered_set<uint64_t> permitted_hashes;
+
+    // If the beam size is one, it's pointless doing multiple passes.
+    int num_passes = (beam_size == 1) ? 1 : 5;
+
+    string cyos_str = get_env_variable("HL_CYOS");
+    string cyos_from_file_str = get_env_variable("HL_CYOS_FROM_FILE");
+    if (!cyos_from_file_str.empty()) {
+        cyos_str = "1";
+    }
+    if (cyos_str == "1") {
+        // If the user is manually navigating the search space, don't
+        // ask them to do more than one pass.
+        num_passes = 1;
+    }
+
+    string num_passes_str = get_env_variable("HL_NUM_PASSES");
+    if (!num_passes_str.empty()) {
+        // The user has requested a non-standard number of passes.
+        num_passes = std::atoi(num_passes_str.c_str());
+    }
+    // With no pass to run, 'best' would stay null and be dereferenced below.
+    user_assert(num_passes >= 1) << "HL_NUM_PASSES must be at least 1, got \"" << num_passes_str << "\"\n";
+
+    bool use_pre_pass = get_env_variable("HL_FREEZE_INLINE_COMPUTE_ROOT") == "1";
+    int pass_idx = 0;
+
+    if (use_pre_pass && num_passes > 1) {
+        pass_idx = -1;
+        --num_passes;
+    }
+
+    for (; pass_idx < num_passes; pass_idx++) {
+        ProgressBar tick;
+
+        auto pass = optimal_schedule_pass(beam_size, pass_idx, num_passes, tick, permitted_hashes);
+
+        tick.clear();
+
+        if (aslog::aslog_level() == 0) {
+            aslog(0) << "Pass " << pass_idx + 1 << " of " << num_passes << ", cost: " << pass->cost << "\n";
+        } else {
+            aslog(0) << "Pass " << pass_idx + 1 << " result: ";
+            pass->dump();
+        }
+
+        if (pass_idx == -1) {
+            search_space.freeze_lowest_cost_stages(pass);
+        }
+
+        if (pass_idx >= 0 && (!best.get() || pass->cost < best->cost)) {
+            // Track which pass produced the lowest-cost state. It's
+            // not necessarily the final one.
+            best = pass;
+        }
+    }
+
+    internal_assert(best.get() != nullptr) << "Beam search finished without producing a schedule\n";
+    aslog(0) << "Best cost: " << best->cost << "\n";
+
+    return best;
+}
+
+// The main entrypoint to generate a schedule for a pipeline.
+// Builds a FunctionDAG for 'outputs', runs the beam search, applies the
+// winning schedule to the pipeline and, when 'auto_scheduler_results' is
+// non-null, stores the scheduler name, schedule source and featurization in
+// it. Asserts that 'target' has a GPU feature.
+//
+// Environment variables read here: HL_SEED, HL_BEAM_SIZE, HL_WEIGHTS_DIR,
+// HL_RANDOMIZE_WEIGHTS, PARTIAL_SCHEDULE, HL_SCHEDULE_FILE, HL_FEATURE_FILE.
+void generate_schedule(const std::vector<Function> &outputs,
+                       const Target &target,
+                       const MachineParams &params,
+                       AutoSchedulerResults *auto_scheduler_results) {
+    internal_assert(target.has_gpu_feature()) << "Specified target (" << target.to_string() << ") does not support GPU";
+
+    Timer timer;
+    aslog(0) << "generate_schedule for target=" << target.to_string() << "\n";
+
+    // Start a timer
+    HALIDE_TIC;
+
+    // Get the seed for random dropout
+    string seed_str = get_env_variable("HL_SEED");
+    // Or use the time, if not set.
+    int seed = (int)time(nullptr);
+    if (!seed_str.empty()) {
+        seed = atoi(seed_str.c_str());
+    }
+
+    aslog(1) << "Dropout seed = " << seed << "\n";
+
+    // Get the beam size
+    string beam_size_str = get_env_variable("HL_BEAM_SIZE");
+    // Defaults to 32
+    int beam_size = 32;
+    if (!beam_size_str.empty()) {
+        beam_size = atoi(beam_size_str.c_str());
+        // atoi yields 0 for non-numeric text; a beam smaller than 1 would
+        // never expand a state.
+        user_assert(beam_size >= 1) << "HL_BEAM_SIZE must be a positive integer, got \"" << beam_size_str << "\"\n";
+    }
+
+    string weights_in_path = get_env_variable("HL_WEIGHTS_DIR");
+    string weights_out_path;  // deliberately empty
+
+    string randomize_weights_str = get_env_variable("HL_RANDOMIZE_WEIGHTS");
+    bool randomize_weights = randomize_weights_str == "1";
+
+    // Analyse the Halide algorithm and construct our abstract representation of it
+    FunctionDAG dag(outputs, params, target);
+    if (aslog::aslog_level() > 0) {
+        dag.dump();
+    }
+
+    Statistics stats;
+
+    // Construct a cost model to use to evaluate states. Currently we
+    // just have the one, but it's an abstract interface, so others
+    // can be slotted in for experimentation.
+    std::unique_ptr<CostModel> cost_model = make_default_cost_model(stats, weights_in_path, weights_out_path, randomize_weights);
+    internal_assert(cost_model != nullptr);
+
+    IntrusivePtr<State> optimal;
+
+    string partial_schedule_filename = get_env_variable("PARTIAL_SCHEDULE");
+    std::unique_ptr<LoopNestParser> partial_schedule;
+    if (!partial_schedule_filename.empty()) {
+        aslog(0) << "Loading partial schedule from " << partial_schedule_filename << "\n";
+        partial_schedule = LoopNestParser::from_file(partial_schedule_filename);
+        aslog(0) << "Partial schedule:\n";
+        partial_schedule->dump();
+        aslog(0) << "\n";
+    }
+
+    std::mt19937 rng{(uint32_t)seed};
+    SearchSpace search_space{dag, params, target, get_search_space_options(), rng, cost_model.get(), stats, partial_schedule.get()};
+
+    AutoSchedule autoschedule{dag, params, target, outputs, rng, cost_model.get(), stats, search_space, partial_schedule.get()};
+
+    // Run beam search
+    optimal = autoschedule.optimal_schedule(beam_size);
+
+    HALIDE_TOC;
+
+    // Dump the schedule found
+    aslog(1) << "** Optimal schedule:\n";
+
+    // Just to get the debugging prints to fire
+    optimal->calculate_cost(dag, params, target, cost_model.get(), stats, aslog::aslog_level() > 0);
+
+    // Apply the schedules to the pipeline
+    optimal->apply_schedule(dag, params, target);
+
+    // Print out the schedule
+    if (aslog::aslog_level() > 0) {
+        aslog(0) << "BEGIN Final generated loop nest and schedule:\n";
+        optimal->dump();
+        aslog(0) << "END Final generated loop nest and schedule\n";
+        optimal->print_compute_locations();
+    }
+
+    string schedule_file = get_env_variable("HL_SCHEDULE_FILE");
+    if (!schedule_file.empty()) {
+        user_warning << "HL_SCHEDULE_FILE is deprecated; use the schedule output from Generator instead\n";
+        aslog(1) << "Writing schedule to " << schedule_file << "...\n";
+        std::ofstream f(schedule_file);
+        f << "// --- BEGIN machine-generated schedule\n"
+          << optimal->schedule_source
+          << "// --- END machine-generated schedule\n";
+        f.close();
+        internal_assert(!f.fail()) << "Failed to write " << schedule_file;
+    }
+
+    // Save the featurization, so that we can use this schedule as
+    // training data (once we've benchmarked it).
+    string feature_file = get_env_variable("HL_FEATURE_FILE");
+    if (!feature_file.empty()) {
+        user_warning << "HL_FEATURE_FILE is deprecated; use the featurization output from Generator instead\n";
+        std::ofstream binfile(feature_file, std::ios::binary | std::ios_base::trunc);
+        optimal->save_featurization(dag, params, target, binfile);
+        binfile.close();
+        internal_assert(!binfile.fail()) << "Failed to write " << feature_file;
+    }
+
+    if (auto_scheduler_results) {
+        auto_scheduler_results->scheduler_name = "Anderson2021";
+        auto_scheduler_results->schedule_source = optimal->schedule_source;
+        {
+            std::ostringstream out;
+            optimal->save_featurization(dag, params, target, out);
+            // str() returns a copy of the buffer, so take it once.
+            const std::string serialized = out.str();
+            auto_scheduler_results->featurization.assign(serialized.begin(), serialized.end());
+        }
+    }
+
+    aslog(1) << "Number of states added: " << stats.num_states_added << '\n';
+    aslog(1) << "Number of featurizations computed: " << stats.num_featurizations << '\n';
+    aslog(1) << "Number of memoization hits: " << stats.num_memoization_hits << '\n';
+    aslog(1) << "Number of memoization misses: " << stats.num_memoization_misses << '\n';
+    aslog(1) << "Number of block memoization hits: " << stats.num_block_memoization_hits << '\n';
+    aslog(1) << "Number of block memoization misses: " << stats.num_block_memoization_misses << '\n';
+    aslog(1) << "Total featurization time (ms): " << stats.total_featurization_time() << "\n";
+    aslog(1) << "Average featurization time (ms): " << stats.average_featurization_time() << "\n";
+    aslog(1) << "Total enqueue time (ms): " << stats.total_enqueue_time() << "\n";
+    aslog(1) << "Total calculate cost time (ms): " << stats.total_calculate_cost_time() << "\n";
+    aslog(1) << "Total feature write time (ms): " << stats.total_feature_write_time() << "\n";
+    aslog(1) << "Total generate children time (ms): " << stats.total_generate_children_time() << "\n";
+    aslog(1) << "Total compute in tiles time (ms): " << stats.total_compute_in_tiles_time() << "\n";
+    aslog(1) << "Total filter thread tiles time (ms): " << stats.total_filter_thread_tiles_time() << "\n";
+    aslog(1) << "Total filter parallel tiles time (ms): " << stats.total_filter_parallel_tiles_time() << "\n";
+
+    aslog(1) << "Number of schedules evaluated by cost model: " << stats.num_schedules_enqueued << '\n';
+    aslog(1) << "Number of tilings generated: " << stats.num_tilings_generated << '\n';
+    aslog(1) << "Number of tilings accepted: " << stats.num_tilings_accepted << '\n';
+    aslog(1) << "Total cost model evaluation time (ms): " << stats.total_cost_model_evaluation_time() << "\n";
+    aslog(1) << "Average cost model evaluation time (ms): " << stats.average_cost_model_evaluation_time() << "\n";
+    std::chrono::duration<double> total_time = timer.elapsed();
+    // Milliseconds divided by 1000.0 prints seconds with a fractional part.
+    aslog(1) << "Time taken for autoscheduler (s): " << std::chrono::duration_cast<std::chrono::milliseconds>(total_time).count() / 1000.0 << '\n';
+}
+
+// Halide uses a plugin architecture for registering custom
+// autoschedulers. We register our autoscheduler using a static
+// constructor.
+struct RegisterAutoscheduler { + RegisterAutoscheduler() { + aslog(1) << "Registering autoscheduler 'Anderson2021'...\n"; + Pipeline::add_autoscheduler("Anderson2021", *this); + } + + void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms, AutoSchedulerResults *results) { + std::vector outputs; + for (Func f : p.outputs()) { + outputs.push_back(f.function()); + } + Autoscheduler::generate_schedule(outputs, target, params, results); + } +} register_auto_scheduler; + +// An alternative entrypoint for other uses +void find_and_apply_schedule(FunctionDAG &dag, + const std::vector &outputs, + const MachineParams ¶ms, + const Target &target, + CostModel* cost_model, + int beam_size, + StageMap *schedule_features) { + + Statistics stats; + std::mt19937 rng{(uint32_t)12345}; + + string partial_schedule_filename = get_env_variable("PARTIAL_SCHEDULE"); + std::unique_ptr partial_schedule; + if (!partial_schedule_filename.empty()) { + aslog(0) << "Loading partial schedule from " << partial_schedule_filename << "\n"; + partial_schedule = LoopNestParser::from_file(partial_schedule_filename); + aslog(0) << "Partial schedule:\n"; + partial_schedule->dump(); + aslog(0) << "\n"; + } + + SearchSpace search_space{dag, params, target, get_env_variable("HL_SEARCH_SPACE_OPTIONS"), rng, cost_model, stats, partial_schedule.get()}; + AutoSchedule autoschedule{dag, params, target, outputs, rng, cost_model, stats, search_space, partial_schedule.get()}; + + IntrusivePtr optimal = autoschedule.optimal_schedule(beam_size); + + // Apply the schedules + optimal->apply_schedule(dag, params, target); + + if (schedule_features) { + optimal->compute_featurization(dag, params, target, schedule_features, stats); + } +} + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide diff --git a/src/autoschedulers/anderson2021/AutoSchedule.h b/src/autoschedulers/anderson2021/AutoSchedule.h new file mode 100644 index 000000000000..a068265a8db1 --- /dev/null +++ 
b/src/autoschedulers/anderson2021/AutoSchedule.h @@ -0,0 +1,102 @@ +#ifndef AUTO_SCHEDULE_H +#define AUTO_SCHEDULE_H + +#include +#include + +#include "CostModel.h" +#include "Featurization.h" +#include "FunctionDAG.h" +#include "Halide.h" +#include "PerfectHashMap.h" +#include "SearchSpace.h" +#include "State.h" + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +struct ProgressBar { + void set(double progress) { + if (!draw_progress_bar) return; + counter++; + const int bits = 11; + if (counter & ((1 << bits) - 1)) return; + const int pos = (int)(progress * 78); + aslog(0) << '['; + for (int j = 0; j < 78; j++) { + if (j < pos) { + aslog(0) << '.'; + } else if (j - 1 < pos) { + aslog(0) << "/-\\|"[(counter >> bits) % 4]; + } else { + aslog(0) << ' '; + } + } + aslog(0) << ']'; + for (int j = 0; j < 80; j++) { + aslog(0) << '\b'; + } + } + + void clear() { + if (counter) { + for (int j = 0; j < 80; j++) { + aslog(0) << ' '; + } + for (int j = 0; j < 80; j++) { + aslog(0) << '\b'; + } + } + } + +private: + uint32_t counter = 0; + const bool draw_progress_bar = isatty(2); +}; + + +typedef PerfectHashMap StageMapOfScheduleFeatures; + +struct AutoSchedule { + const FunctionDAG &dag; + const MachineParams ¶ms; + const Target ⌖ + const std::vector& outputs; + std::mt19937 &rng; + CostModel *cost_model; + Statistics &stats; + SearchSpace &search_space; + const LoopNestParser* partial_schedule; + + AutoSchedule(const FunctionDAG &dag, + const MachineParams ¶ms, + const Target &target, + const std::vector& outputs, + std::mt19937 &rng, + CostModel *cost_model, + Statistics &stats, + SearchSpace &search_space, + const LoopNestParser* partial_schedule); + + bool use_partial_schedule() const { + return partial_schedule; + } + + IntrusivePtr optimal_schedule_pass(int beam_size, + int pass_idx, + int num_passes, + ProgressBar &tick, + std::unordered_set &permitted_hashes); + + // Performance coarse-to-fine beam search and return the best state found. 
+ IntrusivePtr optimal_schedule(int beam_size); +}; + +void find_and_apply_schedule(FunctionDAG& dag, const std::vector &outputs, const MachineParams ¶ms, const Target &target, CostModel* cost_model, int beam_size, StageMapOfScheduleFeatures* schedule_features); + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide + +#endif // AUTO_SCHEDULE_H diff --git a/src/autoschedulers/anderson2021/CostModel.h b/src/autoschedulers/anderson2021/CostModel.h new file mode 100644 index 000000000000..784c292a9ccc --- /dev/null +++ b/src/autoschedulers/anderson2021/CostModel.h @@ -0,0 +1,43 @@ +#ifndef COST_MODEL_H +#define COST_MODEL_H + +#include + +#include "FunctionDAG.h" +#include "HalideBuffer.h" +#include "PerfectHashMap.h" + +// An abstract base class for a cost model. +namespace Halide { + +namespace Internal { +namespace Autoscheduler { +typedef PerfectHashMap StageMapOfScheduleFeatures; +} // namespace Autoscheduler +} // namespace Internal + +class CostModel { +public: + virtual ~CostModel() = default; + + // Configure the cost model for the algorithm to be scheduled. + virtual void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, + const MachineParams ¶ms) = 0; + + // Enqueue a schedule to be evaluated. Will annotate the value located at cost_ptr when the evaluation takes place. + // Note that the dag argument should correspond to the dag specified previously when calling set_pipeline_features. + virtual void enqueue(const Internal::Autoscheduler::FunctionDAG &dag, + const Halide::Internal::Autoscheduler::StageMapOfScheduleFeatures &schedule_feats, + double *cost_ptr, + std::vector* cost_per_stage_ptr) = 0; + + // Evaluate all schedules in the queue. + virtual void evaluate_costs() = 0; + + // Discard all schedules in the queue. 
+ virtual void reset() = 0; +}; + +} // namespace Halide + +#endif // COST_MODEL_H diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.cpp b/src/autoschedulers/anderson2021/DefaultCostModel.cpp new file mode 100644 index 000000000000..e68f4ecca7aa --- /dev/null +++ b/src/autoschedulers/anderson2021/DefaultCostModel.cpp @@ -0,0 +1,401 @@ +// This file is a wrapper around the cost model that loads and saves +// weights, and maintains state of various kinds. For the actual cost +// model, see cost_model_generator.cpp + +#include +#include +#include +#include +#include +#include +#include + +#include "ASLog.h" +#include "DefaultCostModel.h" +#include "HalideBuffer.h" +#include "NetworkSize.h" +#include "cost_model.h" +#include "train_cost_model.h" + +// This is an embedded version of `baseline.weights`. +// The embedding is done using binary2cpp. + +extern "C" unsigned char baseline_weights[]; +extern "C" int baseline_weights_length; + +namespace Halide { +namespace { + +using Halide::Internal::aslog; +using Halide::Internal::PipelineFeatures; +using Halide::Internal::ScheduleFeatures; +using Halide::Internal::Weights; +using Halide::Runtime::Buffer; + +bool ends_with(const std::string &str, const std::string &suffix) { + if (str.size() < suffix.size()) return false; + size_t off = str.size() - suffix.size(); + for (size_t i = 0; i < suffix.size(); i++) { + if (str[off + i] != suffix[i]) return false; + } + return true; +} + +} // namespace + +void DefaultCostModel::set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, + const MachineParams ¶ms) { + + const int pipeline_feat_size = head1_w * head1_h; + // We ignore the first seven pipeline features in the cost + // model. It's just a mask of which types are in use. 
+ static_assert(sizeof(PipelineFeatures) - 7 * sizeof(int) == + sizeof(int) * pipeline_feat_size, + "Incorrect size for pipeline features"); + int num_stages = 0; + for (const auto &n : dag.nodes) { + if (!n.is_input) num_stages += (int)n.stages.size(); + } + Runtime::Buffer pipeline_features(head1_w, head1_h, num_stages); + int stage = 0; + for (const auto &n : dag.nodes) { + if (n.is_input) continue; + for (auto it = n.stages.rbegin(); it != n.stages.rend(); it++) { + const auto &s = *it; + const int *pipeline_feats = (const int *)(&(s.features)) + 7; + // skip the first 7 features + for (int i = 0; i < pipeline_feat_size; i++) { + int x = i / 7; + int y = i % 7; + pipeline_features(x, y, stage) = pipeline_feats[i]; + } + stage += 1; + } + } + internal_assert(stage == num_stages); + pipeline_feat_queue = pipeline_features; + internal_assert(params.parallelism > 0); + num_cores = params.parallelism; +} + +void DefaultCostModel::set_pipeline_features(const Runtime::Buffer &pipeline_feats, int n) { + pipeline_feat_queue = pipeline_feats; + internal_assert(n > 0); + num_cores = n; +} + +void DefaultCostModel::enqueue(const Internal::Autoscheduler::FunctionDAG &dag, + const Halide::Internal::Autoscheduler::StageMapOfScheduleFeatures &schedule_feats, + double *cost_ptr, + std::vector *cost_per_stage_ptr) { + num_stages = (int)schedule_feats.size(); + + Runtime::Buffer schedule_features; + + // Tell the cost model about this state. It won't actually + // evaluate it until we call evaluate_costs (or if it runs out + // of internal buffer space), so that the evaluations can be + // batched. 
+ Internal::Autoscheduler::Timer timer; + enqueue(num_stages, &schedule_features, cost_ptr, cost_per_stage_ptr); + stats.enqueue_time += timer.elapsed(); + ++stats.num_schedules_enqueued; + + timer.restart(); + // index of current stage whose features we are reading + int stage = 0; + // load schedule features into input buffer + for (const auto &n : dag.nodes) { + + // Inputs are computed outside of the pipeline and don't count. + if (n.is_input) continue; + + // The remaining stages are not yet + // scheduled. Optimistically assume their internal costs + // will not depend on the decisions made already, so + // there's no point adding it on to the total because it's + // the same across all states. An underestimate of the + // cost for loading from these unscheduled stages is + // already baked into the scheduled stages that consume + // them. + if (stage >= num_stages) break; + + // Load up the schedule features for all stages of this Func. + for (auto it = n.stages.rbegin(); it != n.stages.rend(); it++) { + internal_assert(schedule_feats.contains(&*it)) << n.func.name() << "\n"; + const auto &feat = schedule_feats.get(&*it); + for (size_t i = 0; i < ScheduleFeatures::num_features(); i++) { + schedule_features(i, stage) = feat[i]; + } + stage += 1; + } + } + stats.feature_write_time += timer.elapsed(); + // Check we considered everything we were supposed to. 
+ internal_assert(stage == num_stages); +} + +void DefaultCostModel::enqueue(int ns, Runtime::Buffer *schedule_feats, double *cost_ptr, std::vector *cost_per_stage_ptr) { + num_stages = ns; + + // We know the most stages that will ever be enqueued from the schedule features + internal_assert(pipeline_feat_queue.data() && "Call set_schedule_features before calling enqueue\n"); + const int max_num_stages = pipeline_feat_queue.dim(2).extent(); + internal_assert(num_stages <= max_num_stages) + << "schedule features has more stages (" << num_stages + << ") than pipeline features (" << max_num_stages << ")\n"; + + const int batch_size = 1024; + if (!schedule_feat_queue.data() || + schedule_feat_queue.dim(2).extent() < max_num_stages) { + internal_assert(cursor == 0); + schedule_feat_queue = Runtime::Buffer(batch_size, head2_w, max_num_stages); + costs_per_stage = Runtime::Buffer(batch_size, max_num_stages); + if (!costs.data()) { + internal_assert(!cost_ptrs.data()); + costs = Runtime::Buffer(batch_size); + cost_ptrs = Runtime::Buffer(batch_size); + cost_per_stage_ptrs.resize(batch_size); + } + } + + if (cursor == batch_size) { + evaluate_costs(); + } + + *schedule_feats = schedule_feat_queue.sliced(0, cursor); + cost_ptrs(cursor) = cost_ptr; + cost_per_stage_ptr->resize(num_stages, 0); + cost_per_stage_ptrs[cursor] = cost_per_stage_ptr; + + cursor++; +} // namespace Halide + +// Backprop state. To run ADAM we need a running average of the +// gradients and gradients squared. We add an outer dimension of +// size 3 to the new weight outputs to track this state. So buf(_, +// 0) is the new weight, buf(_, 1) is the ADAM running average of +// the first moment, and buf(_, 2) is the ADAM running average of +// the second moment. 
+float DefaultCostModel::backprop(const Runtime::Buffer &true_runtimes, float learning_rate) { + internal_assert(cursor != 0); + internal_assert(pipeline_feat_queue.data()); + internal_assert(schedule_feat_queue.data()); + + auto loss = Runtime::Buffer::make_scalar(); + + if (!head1_filter_update.data()) { + auto weight_update_buffer = [](const Runtime::Buffer &w) { + std::vector size; + for (int i = 0; i < w.dimensions(); i++) { + size.push_back(w.dim(i).extent()); + } + size.push_back(4); + auto buf = Runtime::Buffer(size); + buf.fill(0.0f); + return buf; + }; + + head1_filter_update = weight_update_buffer(weights.head1_filter); + head1_bias_update = weight_update_buffer(weights.head1_bias); + head2_filter_update = weight_update_buffer(weights.head2_filter); + head2_bias_update = weight_update_buffer(weights.head2_bias); + conv1_filter_update = weight_update_buffer(weights.conv1_filter); + conv1_bias_update = weight_update_buffer(weights.conv1_bias); + timestep = 0; + } + + Runtime::Buffer dst = costs.cropped(0, 0, cursor); + Runtime::Buffer dst_costs_per_stage = costs_per_stage.cropped({{0, cursor}, {0, num_stages}}); + + int fastest_idx = 0; + for (int i = 0; i < cursor; i++) { + if (true_runtimes(i) < true_runtimes(fastest_idx)) { + fastest_idx = i; + } + } + + int result = train_cost_model(num_stages, + cursor, + num_cores, + batch_id, + pipeline_feat_queue, + schedule_feat_queue, + weights.head1_filter, weights.head1_bias, + weights.head2_filter, weights.head2_bias, + weights.conv1_filter, weights.conv1_bias, + learning_rate, timestep++, + fastest_idx, + true_runtimes.alias(), + head1_filter_update, head1_bias_update, + head2_filter_update, head2_bias_update, + conv1_filter_update, conv1_bias_update, + dst, + dst_costs_per_stage, + loss); + (void)result; + internal_assert(result == 0); + + bool any_nans = false; + for (int i = 0; i < cursor; i++) { + internal_assert(cost_ptrs(i)); + *(cost_ptrs(i)) = dst(i); + if (std::isnan(dst(i))) { + any_nans = true; + 
aslog(0) << "Prediction " << i << " is NaN. True runtime is " << true_runtimes(i) << "\n"; + aslog(0) << "Checking pipeline features for NaNs...\n"; + pipeline_feat_queue.for_each_value([&](float f) { if (std::isnan(f)) abort(); }); + aslog(0) << "None found\n"; + aslog(0) << "Checking schedule features for NaNs...\n"; + schedule_feat_queue.for_each_value([&](float f) { if (std::isnan(f)) abort(); }); + aslog(0) << "None found\n"; + aslog(0) << "Checking network weights for NaNs...\n"; + weights.for_each_buffer([&](const Runtime::Buffer &buf) { + buf.for_each_value([&](float f) { if (std::isnan(f)) abort(); }); + }); + aslog(0) << "None found\n"; + } + internal_assert(true_runtimes(i) > 0); + } + if (any_nans) abort(); + + // Update weights locally + auto update_weight = [](const Runtime::Buffer &src, Runtime::Buffer &dst) { + dst.copy_from(src.sliced(src.dimensions() - 1, 0)); + }; + update_weight(head1_filter_update, weights.head1_filter); + update_weight(head1_bias_update, weights.head1_bias); + update_weight(head2_filter_update, weights.head2_filter); + update_weight(head2_bias_update, weights.head2_bias); + update_weight(conv1_filter_update, weights.conv1_filter); + update_weight(conv1_bias_update, weights.conv1_bias); + + internal_assert(cursor != 0); + + return loss(); +} + +void DefaultCostModel::evaluate_costs() { + if (cursor == 0 || !schedule_feat_queue.data()) return; + + internal_assert(pipeline_feat_queue.data()); + internal_assert(schedule_feat_queue.data()); + + Runtime::Buffer dst = costs.cropped(0, 0, cursor); + Runtime::Buffer dst_costs_per_stage = costs_per_stage.cropped({{0, cursor}, {0, num_stages}}); + + auto loss = Runtime::Buffer::make_scalar(); + + int result = cost_model(num_stages, + cursor, + num_cores, + batch_id++, + pipeline_feat_queue, + schedule_feat_queue, + weights.head1_filter, weights.head1_bias, + weights.head2_filter, weights.head2_bias, + weights.conv1_filter, weights.conv1_bias, + 0.0f, 0, 0, nullptr, + dst, 
dst_costs_per_stage, loss); + (void)result; + internal_assert(result == 0); + + for (int i = 0; i < cursor; i++) { + internal_assert(cost_ptrs(i)); + *(cost_ptrs(i)) = dst(i); + for (int s = 0; s < num_stages; ++s) { + (*cost_per_stage_ptrs[i])[s] = dst_costs_per_stage(i, s); + } + } + + cursor = 0; + cost_per_stage_ptrs.clear(); +} + +void DefaultCostModel::load_weights() { + bool need_randomize = randomize_weights; + + if (weights_in_path.empty()) { + aslog(1) << "Loading weights from built-in data...\n"; + // This copy shouldn't be necessary, but std::istream in C++ doesn't seem + // to have a convenient wrap-around-constant-data variant... and since + // this isn't much data, just copy it. + const std::string baseline_weights_data((const char *)&baseline_weights[0], baseline_weights_length); + std::istringstream i(baseline_weights_data); + if (!weights.load(i)) { + std::cerr << "The built-in baseline weights should never fail to load\n"; + internal_assert(0); + } + } else if (ends_with(weights_in_path, ".weights")) { + aslog(1) << "Loading weights from " << weights_in_path << " ...\n"; + if (!weights.load_from_file(weights_in_path)) { + // Emit to cout (rather than cerr) because the latter is hidden during the autotune loop, + // and we want this to be seen. 
+ std::cout << "WARNING, error in reading weights from " << weights_in_path << ", randomizing...\n"; + need_randomize = true; + } + } else { + aslog(1) << "Loading weights from directory " << weights_in_path << " ...\n"; + std::cerr << "Loading weights from a directory is deprecated; please convert to a .weights file\n"; + if (!weights.load_from_dir(weights_in_path)) { + std::cout << "WARNING, error in reading weights from " << weights_in_path << ", randomizing...\n"; + need_randomize = true; + } + } + + if (!need_randomize && weights.pipeline_features_version != PipelineFeatures::version()) { + // Emit to cout (rather than cerr) because the latter is hidden during the autotune loop, + // and we want this to be seen. + std::cout << "WARNING: loaded weights have pipeline_version = " + << weights.pipeline_features_version + << " but current pipeline_version is " << PipelineFeatures::version() + << "; the weights may be invalid. Using anyway.\n"; + } + + if (!need_randomize && weights.schedule_features_version != ScheduleFeatures::version()) { + // Emit to cout (rather than cerr) because the latter is hidden during the autotune loop, + // and we want this to be seen. + std::cout << "WARNING: loaded weights have schedule_features_version = " + << weights.schedule_features_version + << " but current schedule_features_version is " << ScheduleFeatures::version() + << "; the weights may be invalid. 
Using anyway.\n"; + } + + if (need_randomize) { + auto seed = time(NULL); + std::cout << "Randomizing weights using seed = " << seed << "\n"; + weights.randomize((uint32_t)seed); + } + + // Update so that any version of this we save will have the current version + weights.pipeline_features_version = PipelineFeatures::version(); + weights.schedule_features_version = ScheduleFeatures::version(); +} + +void DefaultCostModel::save_weights() { + internal_assert(!weights_out_path.empty()) + << "Unable to save weights: no output path specified\n"; + + if (ends_with(weights_out_path, ".weights")) { + internal_assert(weights.save_to_file(weights_out_path)) + << "Unable to save weights to file: " << weights_out_path << "\n"; + } else { + std::cerr << "Saving weights to a directory is deprecated; please convert to a .weights file\n"; + internal_assert(weights.save_to_dir(weights_out_path)) + << "Unable to save weights to file: " << weights_out_path << "\n"; + } +} + +// Discard any enqueued but unevaluated schedules +void DefaultCostModel::reset() { + cursor = 0; +} + +std::unique_ptr make_default_cost_model(Internal::Autoscheduler::Statistics& stats, + const std::string &weights_in_path, + const std::string &weights_out_path, + bool randomize_weights) { + return std::unique_ptr(new DefaultCostModel(weights_in_path, weights_out_path, randomize_weights, stats)); +} + +} // namespace Halide diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.h b/src/autoschedulers/anderson2021/DefaultCostModel.h new file mode 100644 index 000000000000..7f5b56327568 --- /dev/null +++ b/src/autoschedulers/anderson2021/DefaultCostModel.h @@ -0,0 +1,78 @@ +#ifndef DEFAULT_COST_MODEL_H +#define DEFAULT_COST_MODEL_H + +#include "CostModel.h" +#include "Statistics.h" +#include "Weights.h" +#include + +namespace Halide { + +class DefaultCostModel : public CostModel { +private: + Internal::Weights weights; + Runtime::Buffer schedule_feat_queue, pipeline_feat_queue, costs, costs_per_stage; + 
Runtime::Buffer cost_ptrs; + std::vector*> cost_per_stage_ptrs; + int cursor, num_stages, num_cores; + int batch_id{0}; + + const std::string weights_in_path, weights_out_path; + const bool randomize_weights; + + Runtime::Buffer + head1_filter_update, head1_bias_update, + head2_filter_update, head2_bias_update, + conv1_filter_update, conv1_bias_update; + int timestep = 0; + + Internal::Autoscheduler::Statistics &stats; + +public: + DefaultCostModel(const std::string &weights_in_path, + const std::string &weights_out_path, + bool randomize_weights, + Internal::Autoscheduler::Statistics& stats) + : weights_in_path(weights_in_path), + weights_out_path(weights_out_path), + randomize_weights(randomize_weights), + stats{stats} + { + load_weights(); + } + virtual ~DefaultCostModel() = default; + + // Configure the cost model for the algorithm to be scheduled. + void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, + const MachineParams ¶ms) override; + void set_pipeline_features(const Runtime::Buffer &, int n); + + // Enqueue a schedule to be evaluated. The second version of this method returns a buffer of + // schedule_features that should be filled in by the caller. + void enqueue(const Internal::Autoscheduler::FunctionDAG &dag, + const Halide::Internal::Autoscheduler::StageMapOfScheduleFeatures &schedule_feats, + double *cost_ptr, + std::vector *cost_per_stage_ptr) override; + void enqueue(int ns, Runtime::Buffer *schedule_feats, double *cost_ptr, std::vector *cost_per_stage_ptr); + + // Evaluate all schedules in the queue. + void evaluate_costs() override; + + // Discard all schedules in the queue. + void reset() override; + + // Update model weights using true measured runtimes. + float backprop(const Runtime::Buffer &true_runtimes, float learning_rate); + + // Save/Load the model weights to/from disk. 
+ void save_weights(); + void load_weights(); +}; + +std::unique_ptr make_default_cost_model(Internal::Autoscheduler::Statistics& stats, + const std::string &weights_in_dir = "", + const std::string &weights_out_dir = "", + bool randomize_weights = false); +} // namespace Halide + +#endif // DEFAULT_COST_MODEL_H diff --git a/src/autoschedulers/anderson2021/Errors.h b/src/autoschedulers/anderson2021/Errors.h new file mode 100644 index 000000000000..0057b2fbc3a9 --- /dev/null +++ b/src/autoschedulers/anderson2021/Errors.h @@ -0,0 +1,26 @@ +#ifndef ERRORS_H +#define ERRORS_H + +#include "Halide.h" + +#ifndef user_error +#define user_error Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, Halide::Internal::ErrorReport::User) +#endif + +#ifndef user_warning +#define user_warning Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, Halide::Internal::ErrorReport::User | Halide::Internal::ErrorReport::Warning) +#endif + +#ifndef user_assert +#define user_assert(c) _halide_internal_assertion(c, Halide::Internal::ErrorReport::User) +#endif + +#ifndef internal_assert +#define internal_assert(c) _halide_internal_assertion(c, 0) +#endif + +#ifndef internal_error +#define internal_error Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, 0) +#endif + +#endif diff --git a/src/autoschedulers/anderson2021/Featurization.h b/src/autoschedulers/anderson2021/Featurization.h new file mode 100644 index 000000000000..94c4cb00044a --- /dev/null +++ b/src/autoschedulers/anderson2021/Featurization.h @@ -0,0 +1,507 @@ +#ifndef FEATURIZATION_H +#define FEATURIZATION_H + +#include +#include +#include + +#include "ASLog.h" + +namespace Halide { +namespace Internal { + +// The algorithm-specific features. For legacy reasons these are +// called PipelineFeatures in the code. 
+struct PipelineFeatures { + static constexpr size_t num_features() { + return sizeof(PipelineFeatures) / sizeof(int); + } + + static constexpr uint32_t version() { + return 3; + } + + // Access them by index. + int &operator[](int idx) { + return ((int *)(this))[idx]; + } + + int operator[](int idx) const { + return ((const int *)(this))[idx]; + } + + enum class OpType { + Const, + Cast, + Variable, + Param, + Add, + Sub, + Mod, + Mul, + Div, + Min, + Max, + EQ, + NE, + LT, + LE, + And, + Or, + Not, + Select, + ImageCall, // Loads to an input buffer + FuncCall, // Calls to another pipeline stage + SelfCall, // Recursive calls from a Func to itself + ExternCall, // Math intrinsics, typically + Let, + NumOpTypes + }; + + enum class ScalarType { + Bool, + UInt8, // or Int8 + UInt16, // or Int16 + UInt32, // or Int32 + UInt64, // or Int64 + Float, + Double, + NumScalarTypes + }; + + // Not fed into the network, but helps avoid printing huge numbers of zeros while debugging things + int types_in_use[(int)ScalarType::NumScalarTypes] = {}; + + int op_histogram[(int)OpType::NumOpTypes][(int)ScalarType::NumScalarTypes] = {}; + + enum class AccessType { + LoadFunc, + LoadSelf, + LoadImage, + Store, + NumAccessTypes + }; + + // Finer granularity call/store node properties. These are a + // function of the matrix of derivatives of each arg to a + // call w.r.t the loop variables of the Stage. Each row of + // the matrix corresponds to one of the call arguments. In + // each case we illustrate such a call, assuming that the + // variables of this Func are x, y, z, and that the + // dimension vectorized over is the first (x). + + // Square identity matrix. f(x - 2, y + 8, z + param) + int pointwise_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {}; + // Square permutation matrix. f(y + 1, z - 3, x) + int transpose_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {}; + // Each row sums to 1. Each column sums to 1 or 0. 
f(y, x) + int broadcast_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {}; + // Each row sums to 1 or 0. Each column sums to 1. f(z, y, x, 4) + int slice_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {}; + + template + void dump(OS &os) const { + for (int i = 0; i < (int)ScalarType::NumScalarTypes; i++) { + const char *type_names[] = {"Bool", "UInt8", "UInt16", "UInt32", "UInt64", "Float", "Double"}; + // Skip printing for types not used + if (!types_in_use[i]) continue; + + os << " Featurization for type " << type_names[i] << "\n" + << " Op histogram:\n" + << " Constant: " << op_histogram[(int)OpType::Const][i] << "\n" + << " Cast: " << op_histogram[(int)OpType::Cast][i] << "\n" + << " Variable: " << op_histogram[(int)OpType::Variable][i] << "\n" + << " Param: " << op_histogram[(int)OpType::Param][i] << "\n" + << " Add: " << op_histogram[(int)OpType::Add][i] << "\n" + << " Sub: " << op_histogram[(int)OpType::Sub][i] << "\n" + << " Mod: " << op_histogram[(int)OpType::Mod][i] << "\n" + << " Mul: " << op_histogram[(int)OpType::Mul][i] << "\n" + << " Div: " << op_histogram[(int)OpType::Div][i] << "\n" + << " Min: " << op_histogram[(int)OpType::Min][i] << "\n" + << " Max: " << op_histogram[(int)OpType::Max][i] << "\n" + << " EQ: " << op_histogram[(int)OpType::EQ][i] << "\n" + << " NE: " << op_histogram[(int)OpType::NE][i] << "\n" + << " LT: " << op_histogram[(int)OpType::LT][i] << "\n" + << " LE: " << op_histogram[(int)OpType::LE][i] << "\n" + << " And: " << op_histogram[(int)OpType::And][i] << "\n" + << " Or: " << op_histogram[(int)OpType::Or][i] << "\n" + << " Not: " << op_histogram[(int)OpType::Not][i] << "\n" + << " Select: " << op_histogram[(int)OpType::Select][i] << "\n" + << " ImageCall: " << op_histogram[(int)OpType::ImageCall][i] << "\n" + << " FuncCall: " << op_histogram[(int)OpType::FuncCall][i] << "\n" + << " SelfCall: " << op_histogram[(int)OpType::SelfCall][i] << "\n" + << " ExternCall: " << 
op_histogram[(int)OpType::ExternCall][i] << "\n" + << " Let: " << op_histogram[(int)OpType::Let][i] << "\n" + << " Memory access patterns. Columns are calls to other Funcs, self-calls, input image access, and stores\n" + << " Pointwise: " + << pointwise_accesses[0][i] << " " + << pointwise_accesses[1][i] << " " + << pointwise_accesses[2][i] << " " + << pointwise_accesses[3][i] << "\n" + << " Transpose: " + << transpose_accesses[0][i] << " " + << transpose_accesses[1][i] << " " + << transpose_accesses[2][i] << " " + << transpose_accesses[3][i] << "\n" + << " Broadcast: " + << broadcast_accesses[0][i] << " " + << broadcast_accesses[1][i] << " " + << broadcast_accesses[2][i] << " " + << broadcast_accesses[3][i] << "\n" + << " Slice: " + << slice_accesses[0][i] << " " + << slice_accesses[1][i] << " " + << slice_accesses[2][i] << " " + << slice_accesses[3][i] << "\n"; + } + } + void dump() const { + auto os = aslog(0); + dump(os); + } +}; + +// The schedule-dependent portion of the featurization of a stage +struct ScheduleFeatures { + static constexpr size_t num_features() { + return sizeof(ScheduleFeatures) / sizeof(double); + } + + static constexpr uint32_t version() { + return 3; + } + + double &operator[](int idx) { + return ((double *)(this))[idx]; + } + + double operator[](int idx) const { + return ((const double *)(this))[idx]; + } + + // The number of times storage for this stage is allocated. The + // product of outer loops at store_at site + double num_realizations = 0; + + // The number of times a tile of the stage is computed. The + // pProduct of outer loops at compute_at site. Always at least as + // large as num_realizations. + double num_productions = 0; + + // Number of times the innermost loop happens per allocation. + double points_computed_per_realization = 0; + + // Number of times the innermost stmt happens per tile computed. 
+ double points_computed_per_production = 0; + + double points_computed_per_thread = 0; + + // The total trip count of the innermost loop over the entire program. + // == num_realizations * points_computed_per_realization + // ~= num_productions * points_computed_per_production + // Only approximately equal because of the simplifications made + // regarding the modeling of sliding window + double points_computed_total = 0; + + // The minimum number of points that are actually required to be + // computed to produce a correct output. Not actually a function + // of the schedule, but a useful reference point to see if a + // schedule has gone off the rails. + double points_computed_minimum = 0; + + // Trip count of innermost loop nest. + double innermost_loop_extent = 0; + + // Trip count of just the pure loops in the innermost loop + // (i.e. excludes loops representing reductions). + double innermost_pure_loop_extent = 0; + + // If this is to be unrolled, what is the product of the unrolling + // factors. + double unrolled_loop_extent = 0; + + // The number of parallel jobs launched in the production of this + // stage. Always 1 unless the Func is compute_root, because we + // place all parallelism at the outermost level. + double inner_parallelism = 0; + + // The number of times this Func could be realized in parallel. 1 + // when the Func is compute_root. Product of the containing + // parallel loops for other stages. + double outer_parallelism = 0; + + // Size of the region computed at the store_at site, measured in + // bytes. Does not take storage-folding optimizations into account. + double bytes_at_realization = 0; + + // Size of the region computed per tile (at the compute_at site), + // measured in bytes. This includes the effect of storage-folding, + // so it's a better number to look at to estimate memory usage. + double bytes_at_production = 0; + + // If the stage were hypothetically scheduled at root, how much + // memory would it consume.
Doesn't vary w.r.t. the schedule, but + // a useful reference. + double bytes_at_root = 0; + + // Same as the above, but only measuring the extent along the + // innermost dimension, so that we can reason about spatial + // locality, cache lines, prefetchers, etc. + double innermost_bytes_at_realization = 0; + double innermost_bytes_at_production = 0; + double innermost_bytes_at_root = 0; + + // For inlined Funcs, how many calls are made to this Func total. + double inlined_calls = 0; + + // Number of unique bytes and unique contiguous segments of + // memory loaded from all inputs over a single trip of the loop + // containing the allocation site. + double unique_global_bytes_read_per_realization = 0; + double unique_shared_bytes_read_per_realization = 0; + double unique_register_bytes_read_per_realization = 0; + double unique_global_lines_read_per_realization = 0; + double unique_shared_lines_read_per_realization = 0; + double unique_register_lines_read_per_realization = 0; + + double unique_global_bytes_read_per_thread = 0; + double unique_shared_bytes_read_per_thread = 0; + double unique_register_bytes_read_per_thread = 0; + double unique_global_lines_read_per_thread = 0; + double unique_shared_lines_read_per_thread = 0; + double unique_register_lines_read_per_thread = 0; + + // The sum of the sizes of the allocations accessed at this + // site. Gives a hint as to the likely locality of it. + double global_allocation_bytes_read_per_realization = 0; + double shared_allocation_bytes_read_per_realization = 0; + double register_allocation_bytes_read_per_realization = 0; + + // The sum of the sizes of the temporary allocations while + // computing one tile of this Func. Probably a good thing if it + // fits in cache. + double working_set = 0; + + // Number of scalars computed (e.g. from tails of loops) + double num_scalars = 0; + + // The memory footprint written over one parallel task.
The + // union of the regions if the stage is computed at finer + // granularity than one parallel task of some consumer. + double global_bytes_at_task = 0; + double shared_bytes_at_task = 0; + double register_bytes_at_task = 0; + double global_innermost_bytes_at_task = 0; + double shared_innermost_bytes_at_task = 0; + double register_innermost_bytes_at_task = 0; + + // The memory footprint accessed while computing a single point + double unique_bytes_read_per_point = 0; + double unique_lines_read_per_point = 0; + + // The memory footprint accessed per parallel task. Only counts + // loads from things computed outside of that parallel task (to + // measure the amount of traffic coming from another core). + double unique_bytes_read_per_task = 0; + double unique_lines_read_per_task = 0; + + // The sum of the sizes of all live allocations at various sites. + double working_set_at_task = 0; + double working_set_at_production = 0; + double working_set_at_realization = 0; + double working_set_at_root = 0; + + double num_blocks = 1; + double num_warps_per_block = 0; + double block_occupancy = 1.0 / 1024.0; + + double warp_lane_utilization = 1.0 / 32.0; + double num_active_warps_per_block = 0; + double warp_lane_utilization_at_block_y = 1; + double warp_lane_utilization_at_block_z = 1; + double idle_lane_wastage = 0; + + double num_shared_mem_loads_per_block = 0; + double num_global_mem_loads_per_block = 0; + double num_shared_mem_stores_per_block = 0; + double num_global_mem_stores_per_block = 0; + + double shared_mem_store_efficiency = 1; + double shared_mem_load_efficiency = 1; + + double global_mem_store_efficiency = 1; + double global_mem_load_efficiency = 1; + + double working_set_at_thread = 0; + + double shared_mem_occupancy = 0; + double shared_mem_block_limit_factor = 1; + double max_warp_occupancy = 0; + double max_block_occupancy = 0; + + double num_threads_per_block = 0; + double expr_branching = 0; + + template + void dump(OS &os) const { + os << "
num_realizations: " << num_realizations << "\n" + << " num_productions: " << num_productions << "\n" + << " points_computed_per_realization: " << points_computed_per_realization << "\n" + << " points_computed_per_production: " << points_computed_per_production << "\n" + << " points_computed_per_thread: " << points_computed_per_thread << "\n" + << " points_computed_total: " << points_computed_total << "\n" + << " points_computed_minimum: " << points_computed_minimum << "\n" + << " innermost_loop_extent: " << innermost_loop_extent << "\n" + << " innermost_pure_loop_extent: " << innermost_pure_loop_extent << "\n" + << " unrolled_loop_extent: " << unrolled_loop_extent << "\n" + << " inner_parallelism: " << inner_parallelism << "\n" + << " outer_parallelism: " << outer_parallelism << "\n" + << " bytes_at_realization: " << bytes_at_realization << "\n" + << " bytes_at_production: " << bytes_at_production << "\n" + << " bytes_at_root: " << bytes_at_root << "\n" + << " innermost_bytes_at_realization: " << innermost_bytes_at_realization << "\n" + << " innermost_bytes_at_production: " << innermost_bytes_at_production << "\n" + << " innermost_bytes_at_root: " << innermost_bytes_at_root << "\n" + << " inlined_calls: " << inlined_calls << "\n" + << " unique_global_bytes_read_per_realization: " << unique_global_bytes_read_per_realization << "\n" + << " unique_shared_bytes_read_per_realization: " << unique_shared_bytes_read_per_realization << "\n" + << " unique_register_bytes_read_per_realization: " << unique_register_bytes_read_per_realization << "\n" + << " unique_global_lines_read_per_realization: " << unique_global_lines_read_per_realization << "\n" + << " unique_shared_lines_read_per_realization: " << unique_shared_lines_read_per_realization << "\n" + << " unique_register_lines_read_per_realization: " << unique_register_lines_read_per_realization << "\n" + << " unique_global_bytes_read_per_thread: " << unique_global_bytes_read_per_thread << "\n" + << " 
unique_shared_bytes_read_per_thread: " << unique_shared_bytes_read_per_thread << "\n" + << " unique_register_bytes_read_per_thread: " << unique_register_bytes_read_per_thread << "\n" + << " unique_global_lines_read_per_thread: " << unique_global_lines_read_per_thread << "\n" + << " unique_shared_lines_read_per_thread: " << unique_shared_lines_read_per_thread << "\n" + << " unique_register_lines_read_per_thread: " << unique_register_lines_read_per_thread << "\n" + << " global_allocation_bytes_read_per_realization: " << global_allocation_bytes_read_per_realization << "\n" + << " shared_allocation_bytes_read_per_realization: " << shared_allocation_bytes_read_per_realization << "\n" + << " register_allocation_bytes_read_per_realization: " << register_allocation_bytes_read_per_realization << "\n" + << " working_set: " << working_set << "\n" + << " num_scalars: " << num_scalars << "\n" + << " global_bytes_at_task: " << global_bytes_at_task << "\n" + << " shared_bytes_at_task: " << shared_bytes_at_task << "\n" + << " register_bytes_at_task: " << register_bytes_at_task << "\n" + << " global_innermost_bytes_at_task: " << global_innermost_bytes_at_task << "\n" + << " shared_innermost_bytes_at_task: " << shared_innermost_bytes_at_task << "\n" + << " register_innermost_bytes_at_task: " << register_innermost_bytes_at_task << "\n" + << " unique_bytes_read_per_point: " << unique_bytes_read_per_point << "\n" + << " unique_lines_read_per_point: " << unique_lines_read_per_point << "\n" + << " unique_bytes_read_per_task: " << unique_bytes_read_per_task << "\n" + << " unique_lines_read_per_task: " << unique_lines_read_per_task << "\n" + << " working_set_at_task: " << working_set_at_task << "\n" + << " working_set_at_production: " << working_set_at_production << "\n" + << " working_set_at_realization: " << working_set_at_realization << "\n" + << " working_set_at_root: " << working_set_at_root << "\n" + << " num_blocks: " << num_blocks << "\n" + << " num_warps_per_block: " << 
num_warps_per_block << "\n" + << " block_occupancy: " << block_occupancy << "\n" + << " warp_lane_utilization: " << warp_lane_utilization << "\n" + << " num_active_warps_per_block: " << num_active_warps_per_block << "\n" + << " warp_lane_utilization_at_block_y: " << warp_lane_utilization_at_block_y << "\n" + << " warp_lane_utilization_at_block_z: " << warp_lane_utilization_at_block_z << "\n" + << " idle_lane_wastage: " << idle_lane_wastage << "\n" + << " num_shared_mem_loads_per_block: " << num_shared_mem_loads_per_block << "\n" + << " num_global_mem_loads_per_block: " << num_global_mem_loads_per_block << "\n" + << " num_shared_mem_stores_per_block: " << num_shared_mem_stores_per_block << "\n" + << " num_global_mem_stores_per_block: " << num_global_mem_stores_per_block << "\n" + << " shared_mem_store_efficiency: " << shared_mem_store_efficiency << "\n" + << " shared_mem_load_efficiency: " << shared_mem_load_efficiency << "\n" + << " global_mem_store_efficiency: " << global_mem_store_efficiency << "\n" + << " global_mem_load_efficiency: " << global_mem_load_efficiency << "\n" + << " working_set_at_thread: " << working_set_at_thread << "\n" + << " shared_mem_occupancy: " << shared_mem_occupancy << "\n" + << " shared_mem_block_limit_factor: " << shared_mem_block_limit_factor << "\n" + << " max_warp_occupancy: " << max_warp_occupancy << "\n" + << " max_block_occupancy: " << max_block_occupancy << "\n" + << " num_threads_per_block: " << num_threads_per_block << "\n" + << " expr_branching: " << expr_branching << "\n"; + } + + void dump() const { + auto os = aslog(0); + dump(os); + } + + bool equal(const ScheduleFeatures& other) const { + return num_realizations == other.num_realizations + && num_productions == other.num_productions + && points_computed_per_realization == other.points_computed_per_realization + && points_computed_per_production == other.points_computed_per_production + && points_computed_per_thread == other.points_computed_per_thread + && 
points_computed_total == other.points_computed_total + && points_computed_minimum == other.points_computed_minimum + && innermost_loop_extent == other.innermost_loop_extent + && innermost_pure_loop_extent == other.innermost_pure_loop_extent + && unrolled_loop_extent == other.unrolled_loop_extent + && inner_parallelism == other.inner_parallelism + && outer_parallelism == other.outer_parallelism + && bytes_at_realization == other.bytes_at_realization + && bytes_at_production == other.bytes_at_production + && bytes_at_root == other.bytes_at_root + && innermost_bytes_at_realization == other.innermost_bytes_at_realization + && innermost_bytes_at_production == other.innermost_bytes_at_production + && innermost_bytes_at_root == other.innermost_bytes_at_root + && inlined_calls == other.inlined_calls + && unique_global_bytes_read_per_realization == other.unique_global_bytes_read_per_realization + && unique_shared_bytes_read_per_realization == other.unique_shared_bytes_read_per_realization + && unique_register_bytes_read_per_realization == other.unique_register_bytes_read_per_realization + && unique_global_lines_read_per_realization == other.unique_global_lines_read_per_realization + && unique_shared_lines_read_per_realization == other.unique_shared_lines_read_per_realization + && unique_register_lines_read_per_realization == other.unique_register_lines_read_per_realization + && unique_global_bytes_read_per_thread == other.unique_global_bytes_read_per_thread + && unique_shared_bytes_read_per_thread == other.unique_shared_bytes_read_per_thread + && unique_register_bytes_read_per_thread == other.unique_register_bytes_read_per_thread + && unique_global_lines_read_per_thread == other.unique_global_lines_read_per_thread + && unique_shared_lines_read_per_thread == other.unique_shared_lines_read_per_thread + && unique_register_lines_read_per_thread == other.unique_register_lines_read_per_thread + && global_allocation_bytes_read_per_realization == 
other.global_allocation_bytes_read_per_realization + && shared_allocation_bytes_read_per_realization == other.shared_allocation_bytes_read_per_realization + && register_allocation_bytes_read_per_realization == other.register_allocation_bytes_read_per_realization + && working_set == other.working_set + && num_scalars == other.num_scalars + && global_bytes_at_task == other.global_bytes_at_task + && shared_bytes_at_task == other.shared_bytes_at_task + && register_bytes_at_task == other.register_bytes_at_task + && global_innermost_bytes_at_task == other.global_innermost_bytes_at_task + && shared_innermost_bytes_at_task == other.shared_innermost_bytes_at_task + && register_innermost_bytes_at_task == other.register_innermost_bytes_at_task + && unique_bytes_read_per_point == other.unique_bytes_read_per_point + && unique_lines_read_per_point == other.unique_lines_read_per_point + && unique_bytes_read_per_task == other.unique_bytes_read_per_task + && unique_lines_read_per_task == other.unique_lines_read_per_task + && working_set_at_task == other.working_set_at_task + && working_set_at_production == other.working_set_at_production + && working_set_at_realization == other.working_set_at_realization + && working_set_at_root == other.working_set_at_root + && num_blocks == other.num_blocks + && num_warps_per_block == other.num_warps_per_block + && block_occupancy == other.block_occupancy + && warp_lane_utilization == other.warp_lane_utilization + && num_active_warps_per_block == other.num_active_warps_per_block + && warp_lane_utilization_at_block_y == other.warp_lane_utilization_at_block_y + && warp_lane_utilization_at_block_z == other.warp_lane_utilization_at_block_z + && idle_lane_wastage == other.idle_lane_wastage + && num_shared_mem_loads_per_block == other.num_shared_mem_loads_per_block + && num_global_mem_loads_per_block == other.num_global_mem_loads_per_block + && num_shared_mem_stores_per_block == other.num_shared_mem_stores_per_block + && num_global_mem_stores_per_block 
== other.num_global_mem_stores_per_block + && shared_mem_store_efficiency == other.shared_mem_store_efficiency + && shared_mem_load_efficiency == other.shared_mem_load_efficiency + && global_mem_store_efficiency == other.global_mem_store_efficiency + && global_mem_load_efficiency == other.global_mem_load_efficiency + && working_set_at_thread == other.working_set_at_thread + && shared_mem_occupancy == other.shared_mem_occupancy + && shared_mem_block_limit_factor == other.shared_mem_block_limit_factor + && max_warp_occupancy == other.max_warp_occupancy + && max_block_occupancy == other.max_block_occupancy + && num_threads_per_block == other.num_threads_per_block + && expr_branching == other.expr_branching; + } +}; + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/autoschedulers/anderson2021/FunctionDAG.cpp b/src/autoschedulers/anderson2021/FunctionDAG.cpp new file mode 100644 index 000000000000..b942d0cdc283 --- /dev/null +++ b/src/autoschedulers/anderson2021/FunctionDAG.cpp @@ -0,0 +1,1283 @@ +#include "FunctionDAG.h" + +#include "ASLog.h" + +namespace Halide { +namespace Internal { + +template<> +RefCount &ref_count(const Autoscheduler::BoundContents *t) noexcept { + return t->ref_count; +} + +template<> +void destroy(const Autoscheduler::BoundContents *t) { + // Release it back into the memory pool to be reused + t->layout->release(t); +} + +namespace Autoscheduler { + +namespace { + +class Featurizer : public IRVisitor { + using IRVisitor::visit; + + Function &func; + FunctionDAG::Node::Stage &stage; + + int &op_bucket(PipelineFeatures::OpType op_type, Type scalar_type) { + int type_bucket = (int)classify_type(scalar_type); + stage.features.types_in_use[type_bucket] = true; + return stage.features.op_histogram[(int)op_type][type_bucket]; + } + + PipelineFeatures::ScalarType classify_type(Type t) { + if (t.is_float() && t.bits() > 32) { + return PipelineFeatures::ScalarType::Double; + } else if (t.is_float()) { + return 
PipelineFeatures::ScalarType::Float; + } else if (t.bits() == 1) { + return PipelineFeatures::ScalarType::Bool; + } else if (t.bits() <= 8) { + return PipelineFeatures::ScalarType::UInt8; + } else if (t.bits() <= 16) { + return PipelineFeatures::ScalarType::UInt16; + } else if (t.bits() <= 32) { + return PipelineFeatures::ScalarType::UInt32; + } else { + return PipelineFeatures::ScalarType::UInt64; + } + } + void visit(const Variable *op) override { + if (op->param.defined()) { + op_bucket(PipelineFeatures::OpType::Param, op->type)++; + } else { + op_bucket(PipelineFeatures::OpType::Variable, op->type)++; + } + } + void visit(const IntImm *op) override { + op_bucket(PipelineFeatures::OpType::Const, op->type)++; + } + void visit(const UIntImm *op) override { + op_bucket(PipelineFeatures::OpType::Const, op->type)++; + } + void visit(const FloatImm *op) override { + op_bucket(PipelineFeatures::OpType::Const, op->type)++; + } + void visit(const Add *op) override { + op_bucket(PipelineFeatures::OpType::Add, op->type)++; + IRVisitor::visit(op); + } + void visit(const Sub *op) override { + op_bucket(PipelineFeatures::OpType::Sub, op->type)++; + IRVisitor::visit(op); + } + void visit(const Mul *op) override { + op_bucket(PipelineFeatures::OpType::Mul, op->type)++; + IRVisitor::visit(op); + } + void visit(const Mod *op) override { + op_bucket(PipelineFeatures::OpType::Mod, op->type)++; + IRVisitor::visit(op); + } + void visit(const Div *op) override { + op_bucket(PipelineFeatures::OpType::Div, op->type)++; + IRVisitor::visit(op); + } + void visit(const Min *op) override { + op_bucket(PipelineFeatures::OpType::Min, op->type)++; + IRVisitor::visit(op); + } + void visit(const Max *op) override { + op_bucket(PipelineFeatures::OpType::Max, op->type)++; + IRVisitor::visit(op); + } + void visit(const EQ *op) override { + op_bucket(PipelineFeatures::OpType::EQ, op->type)++; + IRVisitor::visit(op); + } + void visit(const NE *op) override { + op_bucket(PipelineFeatures::OpType::NE, 
op->type)++; + IRVisitor::visit(op); + } + void visit(const LT *op) override { + op_bucket(PipelineFeatures::OpType::LT, op->type)++; + IRVisitor::visit(op); + } + void visit(const LE *op) override { + op_bucket(PipelineFeatures::OpType::LE, op->type)++; + IRVisitor::visit(op); + } + void visit(const GT *op) override { + // Treat as a flipped LT + op_bucket(PipelineFeatures::OpType::LT, op->type)++; + IRVisitor::visit(op); + } + void visit(const GE *op) override { + op_bucket(PipelineFeatures::OpType::LE, op->type)++; + IRVisitor::visit(op); + } + void visit(const And *op) override { + op_bucket(PipelineFeatures::OpType::And, op->type)++; + IRVisitor::visit(op); + } + void visit(const Or *op) override { + op_bucket(PipelineFeatures::OpType::Or, op->type)++; + IRVisitor::visit(op); + } + void visit(const Not *op) override { + op_bucket(PipelineFeatures::OpType::Not, op->type)++; + IRVisitor::visit(op); + } + void visit(const Select *op) override { + op_bucket(PipelineFeatures::OpType::Select, op->type)++; + IRVisitor::visit(op); + } + Scope lets; + + void visit(const Let *op) override { + ScopedBinding bind(lets, op->name, op->value); + op_bucket(PipelineFeatures::OpType::Let, op->type)++; + IRVisitor::visit(op); + } + void visit(const Call *op) override { + IRVisitor::visit(op); + if (op->call_type == Call::Halide) { + if (op->name == func.name()) { + visit_memory_access(op->name, op->type, op->args, PipelineFeatures::AccessType::LoadSelf); + op_bucket(PipelineFeatures::OpType::SelfCall, op->type)++; + } else { + visit_memory_access(op->name, op->type, op->args, PipelineFeatures::AccessType::LoadFunc); + op_bucket(PipelineFeatures::OpType::FuncCall, op->type)++; + } + } else if (op->call_type == Call::Extern || op->call_type == Call::PureExtern || + op->call_type == Call::Intrinsic || op->call_type == Call::PureIntrinsic) { + op_bucket(PipelineFeatures::OpType::ExternCall, op->type)++; + } else if (op->call_type == Call::Image) { + visit_memory_access(op->name, 
op->type, op->args, PipelineFeatures::AccessType::LoadImage); + op_bucket(PipelineFeatures::OpType::ImageCall, op->type)++; + } // TODO: separate out different math calls a little better (sqrt vs sin vs lerp) + } + + // Take the derivative of an integer index expression. If it's + // a rational constant, return it, otherwise return a sentinel + // value. + + // The derivative of each let w.r.t each var. The keys are + // just the var names separated by a space. + Scope dlets; + + OptionalRational differentiate(const Expr &e, const string &v) { + if (!expr_uses_var(e, v, lets)) { + return {0, 1}; + } else if (const Variable *var = e.as()) { + if (var->name == v) { + return {1, 1}; + } + for (const auto &l : stage.loop) { + if (var->name == l.var) { + // Some other loop variable + return {0, 1}; + } + } + if (var->param.defined()) { + // An argument + return {0, 1}; + } else if (lets.contains(var->name)) { + string key = v + " " + var->name; + if (dlets.contains(key)) { + return dlets.get(key); + } + auto a = differentiate(lets.get(var->name), v); + dlets.push(key, a); + return a; + } + // Some mystery variable. Who knows what it depends on. + internal_error << "Encountered unbound variable in call args: " << var->name << "\n"; + return {0, 0}; + } else if (const Add *op = e.as()) { + auto a = differentiate(op->a, v); + a += differentiate(op->b, v); + return a; + } else if (const Sub *op = e.as()) { + auto a = differentiate(op->a, v); + auto b = differentiate(op->b, v); + b.numerator = -b.numerator; + a += b; + return a; + } else if (const Mul *op = e.as()) { + auto a = differentiate(op->a, v); + if (const int64_t *ib = as_const_int(op->b)) { + a.numerator *= *ib; + return a; + } else { + return {0, 0}; + } + } else if (const Div *op = e.as
()) { + auto a = differentiate(op->a, v); + if (const int64_t *ib = as_const_int(op->b)) { + if (a.numerator != 0) { + a.denominator *= *ib; + } + return a; + } else { + return {0, 0}; + } + } else if (const Call *op = e.as()) { + if (op->is_intrinsic(Call::likely)) { + // TODO: Should a likely on one side of a min/max dominate? + return differentiate(op->args[0], v); + } + } + + return {0, 0}; + } + + void visit_memory_access(const std::string &name, Type t, const vector &args, PipelineFeatures::AccessType type) { + // Compute matrix of partial derivatives of args w.r.t. loop params + LoadJacobian matrix(args.size(), stage.loop.size(), 1); + vector ones_per_row(args.size(), 0), + zeros_per_row(args.size(), 0), + ones_per_col(stage.loop.size(), 0), + zeros_per_col(stage.loop.size(), 0); + bool is_pointwise = args.size() == stage.loop.size(); + for (size_t i = 0; i < args.size(); i++) { + for (size_t j = 0; j < stage.loop.size(); j++) { + auto deriv = differentiate(args[i], stage.loop[j].var); + zeros_per_row[i] += deriv == 0; + ones_per_row[i] += deriv == 1; + zeros_per_col[j] += deriv == 0; + ones_per_col[j] += deriv == 1; + is_pointwise &= (i == j ? 
deriv == 1 : deriv == 0); + matrix(i, j) = deriv; + } + } + bool is_transpose = (args.size() == stage.loop.size()); + bool is_broadcast = true, is_slice = true; + for (size_t i = 0; i < args.size(); i++) { + bool single_one = (ones_per_row[i] == 1) && (zeros_per_row[i] == stage.loop.size() - 1); + bool all_zero = (zeros_per_row[i] == stage.loop.size()); + is_transpose &= single_one; + is_broadcast &= single_one; + is_slice &= single_one || all_zero; + } + for (size_t j = 0; j < stage.loop.size(); j++) { + bool single_one = (ones_per_col[j] == 1) && (zeros_per_col[j] == args.size() - 1); + bool all_zero = (zeros_per_col[j] == args.size()); + is_transpose &= single_one || all_zero; + is_broadcast &= single_one; + is_slice &= single_one; + } + + auto type_class = classify_type(t); + + stage.features.pointwise_accesses[(int)type][(int)type_class] += is_pointwise; + stage.features.transpose_accesses[(int)type][(int)type_class] += is_transpose; + stage.features.broadcast_accesses[(int)type][(int)type_class] += is_broadcast; + stage.features.slice_accesses[(int)type][(int)type_class] += is_slice; + + if (type == PipelineFeatures::AccessType::Store) { + stage.store_jacobian = std::make_unique(matrix); + } + + for (auto *e : stage.incoming_edges) { + if (e->producer->func.name() == name) { + // The same name can be encountered multiple times + // (e.g. 
a+a, where a is a trivial function), + // so we can't use std::move(matrix) here without making a copy + e->add_load_jacobian(matrix); + } + } + } + +public: + Featurizer(Function &func, FunctionDAG::Node::Stage &stage) + : func(func), stage(stage) { + } + + void visit_store_args(const std::string &name, Type t, vector args) { + for (auto &e : args) { + e = common_subexpression_elimination(simplify(e)); // Get things into canonical form + } + visit_memory_access(name, t, args, PipelineFeatures::AccessType::Store); + } +}; + +} // namespace + +void LoadJacobian::dump(const char *prefix) const { + if (count() > 1) { + aslog(0) << prefix << count() << " x\n"; + } + for (size_t i = 0; i < producer_storage_dims(); i++) { + aslog(0) << prefix << " ["; + + for (size_t j = 0; j < consumer_loop_dims(); j++) { + const auto &c = (*this)(i, j); + if (!c.exists()) { + aslog(0) << " _ "; + } else if (c.denominator == 1) { + aslog(0) << " " << c.numerator << " "; + } else { + aslog(0) << c.numerator << "/" << c.denominator << " "; + } + } + aslog(0) << "]\n"; + } + aslog(0) << "\n"; +} + +void BoundContents::validate() const { + for (int i = 0; i < layout->total_size; i++) { + auto p = data()[i]; + if (p.max() < p.min()) { + aslog(0) << "Bad bounds object:\n"; + for (int j = 0; j < layout->total_size; j++) { + if (i == j) { + aslog(0) << "=> "; + } else { + aslog(0) << " "; + } + aslog(0) << j << ": " << data()[j].min() << ", " << data()[j].max() << "\n"; + } + internal_error << "Aborting"; + } + } +} + +BoundContents::Layout::~Layout() { + internal_assert(num_live == 0) + << "Destroying a Layout without returning all the BoundContents. 
" + << num_live << " are still live\n"; + for (auto *b : pool) { + b->~BoundContents(); + } + for (auto b : blocks) { + free(b); + } +} + +void BoundContents::Layout::allocate_some_more() const { + size_t size_of_one = sizeof(BoundContents) + total_size * sizeof(Span); + const size_t number_per_block = std::max((size_t)8, 4096 / size_of_one); // Make a page of them, or 8, whichever is larger. + const size_t bytes_to_allocate = std::max(size_of_one * number_per_block, (size_t)4096); + unsigned char *mem = (unsigned char *)malloc(bytes_to_allocate); + + blocks.push_back(mem); + static_assert((sizeof(BoundContents) & 7) == 0, "BoundContents header is not aligned"); + for (size_t i = 0; i < number_per_block; i++) { + BoundContents *b = (BoundContents *)(mem + i * size_of_one); + new (b) BoundContents; + b->layout = this; + pool.push_back(b); + } + internal_assert(((unsigned char *)(pool[0]) + size_of_one) == (unsigned char *)(pool[1])); +} + +BoundContents *BoundContents::Layout::make() const { + if (pool.empty()) { + allocate_some_more(); + } + BoundContents *b = pool.back(); + pool.pop_back(); + num_live++; + return b; +} + +void BoundContents::Layout::release(const BoundContents *b) const { + internal_assert(b->layout == this) << "Releasing BoundContents onto the wrong pool!"; + b->~BoundContents(); + pool.push_back(const_cast(b)); + num_live--; +} + +void FunctionDAG::Node::loop_nest_for_region(int stage_idx, const Span *computed, Span *loop) const { + const auto &s = stages[stage_idx]; + map computed_map; + if (!s.loop_nest_all_common_cases) { + for (int i = 0; i < func.dimensions(); i++) { + computed_map[region_required[i].min.name()] = (int)computed[i].min(); + computed_map[region_required[i].max.name()] = (int)computed[i].max(); + } + } + + for (size_t i = 0; i < s.loop.size(); i++) { + const auto &l = s.loop[i]; + if (l.equals_region_computed) { + loop[i] = computed[l.region_computed_dim]; + } else if (l.bounds_are_constant) { + loop[i] = Span(l.c_min, 
l.c_max, true); + } else { + Expr min = simplify(substitute(computed_map, l.min)); + Expr max = simplify(substitute(computed_map, l.max)); + const int64_t *imin = as_const_int(min); + const int64_t *imax = as_const_int(max); + internal_assert(imin && imax) << min << ", " << max << "\n"; + loop[i] = Span(*imin, *imax, false); + } + } +} + +void FunctionDAG::Node::required_to_computed(const Span *required, Span *computed) const { + map required_map; + if (!region_computed_all_common_cases) { + // Make a binding for the value of each symbolic variable + for (int i = 0; i < func.dimensions(); i++) { + required_map[region_required[i].min.name()] = (int)required[i].min(); + required_map[region_required[i].max.name()] = (int)required[i].max(); + } + } + for (int i = 0; i < func.dimensions(); i++) { + const auto &comp = region_computed[i]; + if (comp.equals_required) { + computed[i] = required[i]; + } else if (comp.equals_union_of_required_with_constants) { + computed[i] = Span(std::min(required[i].min(), comp.c_min), + std::max(required[i].max(), comp.c_max), + false); + } else { + Expr min = simplify(substitute(required_map, comp.in.min)); + Expr max = simplify(substitute(required_map, comp.in.max)); + const int64_t *imin = as_const_int(min); + const int64_t *imax = as_const_int(max); + internal_assert(imin && imax) << min << ", " << max << "\n"; + computed[i] = Span(*imin, *imax, false); + } + } +} + +FunctionDAG::Edge::BoundInfo::BoundInfo(const Expr &e, const Node::Stage &consumer) + : expr(e) { + // Do the analysis to detect if this is a simple case + // that can be evaluated more cheaply. Currently this + // acceleration recognises affine expressions. In the + // future we may consider quasi-affine, or even + // piecewise-quasi-affine. If the bounds are + // non-affine, we use the symbolic expression. + const Add *add = expr.as(); + const Mul *mul = add ? add->a.as() : expr.as(); + const IntImm *coeff_imm = mul ? 
mul->b.as() : nullptr; + const IntImm *constant_imm = add ? add->b.as() : nullptr; + // clang-format off + Expr v = (mul ? mul->a : + add ? add->a : + expr); + // clang-format on + const Variable *var = v.as(); + + if (const IntImm *c = e.as()) { + affine = true; + coeff = 0; + constant = c->value; + } else if (var && (!mul || coeff_imm) && (!add || constant_imm)) { + affine = true; + coeff = mul ? coeff_imm->value : 1; + constant = add ? constant_imm->value : 0; + consumer_dim = -1; + for (int i = 0; i < (int)consumer.loop.size(); i++) { + const auto &in = consumer.loop[i]; + if (var->name == consumer.node->func.name() + "." + in.var + ".min") { + consumer_dim = i; + uses_max = false; + break; + } else if (var->name == consumer.node->func.name() + "." + in.var + ".max") { + consumer_dim = i; + uses_max = true; + break; + } + } + internal_assert(consumer_dim >= 0) << "Could not find consumer loop variable: " << var->name << "\n"; + aslog(2) << "Bound is affine: " << e << " == " << var->name << " * " << coeff << " + " << constant << "\n"; + } else { + affine = false; + aslog(2) << "Bound is non-affine: " << e << "\n"; + } +} + +bool FunctionDAG::Edge::all_load_jacobian_coeffs_exist() const { + for (const auto& jac : load_jacobians) { + if (!jac.all_coeffs_exist()) { + return false; + } + } + return true; +} + +void FunctionDAG::Edge::add_load_jacobian(LoadJacobian j1) { + for (auto &j2 : load_jacobians) { + if (j2.merge(j1)) return; + } + load_jacobians.emplace_back(std::move(j1)); +} + +void FunctionDAG::Edge::expand_footprint(const Span *consumer_loop, Span *producer_required) const { + // Create a map from the symbolic loop variables to the actual loop size + const auto &symbolic_loop = consumer->loop; + map s; + if (!all_bounds_affine) { + for (size_t i = 0; i < symbolic_loop.size(); i++) { + auto p = consumer_loop[i]; + const string &var = symbolic_loop[i].var; + s[consumer->node->func.name() + "." 
+ var + ".min"] = (int)p.min(); + s[consumer->node->func.name() + "." + var + ".max"] = (int)p.max(); + } + } + // Apply that map to the bounds relationship encoded + // in the edge to expand the bounds of the producer to + // satisfy the consumer + for (int i = 0; i < producer->func.dimensions(); i++) { + // Get bounds required of this dimension of the + // producer in terms of a symbolic region of the + // consumer. + bool bounds_are_constant = true; + auto eval_bound = [&](const BoundInfo &b) { + if (b.affine) { + // Common-case performance optimization + if (b.coeff == 0) { + return b.constant; + } else { + const auto &src_pair = consumer_loop[b.consumer_dim]; + int64_t src = b.uses_max ? src_pair.max() : src_pair.min(); + bounds_are_constant &= src_pair.constant_extent(); + return src * b.coeff + b.constant; + } + } else { + Expr substituted = substitute(s, b.expr); + Expr e = simplify(substituted); + const int64_t *i = as_const_int(e); + internal_assert(i) << "Should be constant: " << b.expr << " -> " << substituted << " -> " << e << "\n"; + bounds_are_constant = false; + return *i; + } + }; + int64_t a = eval_bound(bounds[i].first); + int64_t b = eval_bound(bounds[i].second); + producer_required[i].union_with(Span(a, b, bounds_are_constant)); + } +} + +FunctionDAG::FunctionDAG(const vector &outputs, const MachineParams ¶ms, const Target &target) { + map env = build_environment(outputs); + + // A mutator to apply parameter estimates to the expressions + // we encounter while constructing the graph. + class ApplyParamEstimates : public IRMutator { + using IRMutator::visit; + + Expr visit(const Variable *op) override { + Expr expr; + if (op->param.defined()) { + if (!op->param.is_buffer()) { + expr = op->param.estimate(); + } else { + for (int i = 0; i < op->param.dimensions(); i++) { + if (op->name == op->param.name() + ".min." + std::to_string(i)) { + expr = op->param.min_constraint_estimate(i); + } else if (op->name == op->param.name() + ".extent." 
+ std::to_string(i)) { + expr = op->param.extent_constraint_estimate(i); + } + } + } + internal_assert(expr.defined()) << "Missing estimate for " << op->name << "\n"; + return expr; + } else { + return op; + } + } + } apply_param_estimates; + + // Compute a realization order + vector order = topological_order(outputs, env); + + // Construct the mapping from Funcs to Nodes + nodes.resize(order.size()); + map node_map; + for (size_t i = 0; i < order.size(); i++) { + Function f = env[order[order.size() - i - 1]]; + nodes[i].func = f; + nodes[i].id = (int)i; + nodes[i].max_id = (int)order.size(); + nodes[i].dag = this; + node_map[f] = &nodes[i]; + } + + int stage_count = 0; + + for (size_t i = order.size(); i > 0; i--) { + Node &node = nodes[order.size() - i]; + Function consumer = node.func; + Scope scope; + + // Create a symbolic region for this Func. + for (int j = 0; j < consumer.dimensions(); j++) { + Halide::Var min_var(consumer.name() + "." + consumer.args()[j] + ".min"); + Halide::Var max_var(consumer.name() + "." + consumer.args()[j] + ".max"); + Interval interval(min_var, max_var); + scope.push(consumer.args()[j], interval); + node.region_required.emplace_back(SymbolicInterval{min_var, max_var}); + } + + auto pure_args = node.func.args(); + + for (int s = 0; s <= (int)consumer.updates().size(); s++) { + stage_count++; + if (s == 0) { + node.stages.emplace_back(Stage(consumer, consumer.definition(), 0)); + } else { + node.stages.emplace_back(Stage(consumer, consumer.update(s - 1), s)); + } + } + + for (int s = 0; s <= (int)consumer.updates().size(); s++) { + auto &stage = node.stages[s]; + stage.node = &node; + stage.name = consumer.name(); + if (s > 0) { + stage.name += ".update(" + std::to_string(s - 1) + ")"; + } + + stage.sanitized_name = node.func.name(); + sanitize_names(stage.sanitized_name); + stage.sanitized_name += "_s" + std::to_string(s); + + const Definition &def = (s == 0) ? 
consumer.definition() : consumer.update(s - 1); + const StageSchedule &sched = def.schedule(); + + Scope stage_scope_with_concrete_rvar_bounds, stage_scope_with_symbolic_rvar_bounds; + stage_scope_with_concrete_rvar_bounds.set_containing_scope(&scope); + stage_scope_with_symbolic_rvar_bounds.set_containing_scope(&scope); + for (const auto &rv : sched.rvars()) { + Expr min = simplify(apply_param_estimates.mutate(rv.min)); + Expr max = simplify(apply_param_estimates.mutate(rv.min + rv.extent - 1)); + stage_scope_with_concrete_rvar_bounds.push(rv.var, Interval(min, max)); + min = Variable::make(Int(32), consumer.name() + "." + rv.var + ".min"); + max = Variable::make(Int(32), consumer.name() + "." + rv.var + ".max"); + stage_scope_with_symbolic_rvar_bounds.push(rv.var, Interval(min, max)); + } + + // Figure out the region computed of the stage by taking bounds of the LHS Exprs + if (s == 0) { + node.region_computed.resize(consumer.dimensions()); + } + + FuncValueBounds func_value_bounds = compute_function_value_bounds(order, env); + for (int j = 0; j < consumer.dimensions(); j++) { + // The region computed always uses the full extent of the rvars + Interval in = bounds_of_expr_in_scope(def.args()[j], stage_scope_with_concrete_rvar_bounds, func_value_bounds); + internal_assert(in.is_bounded()) + << "Region computed of " << consumer.name() + << " is unbounded: [" << in.min << " " << in.max << "]\n"; + if (s == 0) { + node.region_computed[j].in = in; + } else { + node.region_computed[j].in.include(in); + } + } + if (s == (int)consumer.updates().size()) { + // Simplify region computed and perform additional + // special-case analysis to make it faster to evaluate. 
+ node.region_computed_all_common_cases = true; + for (int j = 0; j < consumer.dimensions(); j++) { + const auto &req = node.region_required[j]; + auto &comp = node.region_computed[j]; + comp.in.min = simplify(apply_param_estimates.mutate(comp.in.min)); + comp.in.max = simplify(apply_param_estimates.mutate(comp.in.max)); + if (equal(comp.in.min, req.min) && equal(comp.in.max, req.max)) { + comp.equals_required = true; + } else { + const Min *min = comp.in.min.as(); + const Max *max = comp.in.max.as(); + const int64_t *min_b = min ? as_const_int(min->b) : nullptr; + const int64_t *max_b = max ? as_const_int(max->b) : nullptr; + if (min_b && max_b && equal(min->a, req.min) && equal(max->a, req.max)) { + comp.equals_union_of_required_with_constants = true; + comp.c_min = *min_b; + comp.c_max = *max_b; + } else { + node.region_computed_all_common_cases = false; + } + } + } + } + + // We'll take any existing reordering, but won't handle existing splits + user_assert(sched.splits().empty()) + << "The Func \"" << consumer.name() << "\" has scheduling directive(s) " + << "applied to it; you must remove these, or conditionalize them " + << "using `if (!auto_schedule)`, to use the autoscheduler on this pipeline."; + stage.loop_nest_all_common_cases = true; + for (size_t i = 0; i < sched.dims().size(); i++) { + const auto &d = sched.dims()[i]; + // Skip synthetic loops like "__outermost" + if (!stage_scope_with_symbolic_rvar_bounds.contains(d.var)) continue; + + Node::Loop l; + l.var = d.var; + l.accessor = stage.name + ".get_schedule().dims()[" + std::to_string(i) + "].var"; + + // We already have the right variable names in the stage scope + Interval in = stage_scope_with_concrete_rvar_bounds.get(l.var); + l.min = in.min; + l.max = in.max; + l.pure = d.is_pure(); + l.rvar = d.is_rvar(); + l.pure_dim = -1; + + // Additional analysis to speed up evaluation of + // common cases. 
Loop bounds that are just one of + // the dimensions of the symbolic region computed + // are common, as are constant bounds. + l.equals_region_computed = false; + for (int j = 0; j < consumer.dimensions(); j++) { + if (l.var == pure_args[j]) { + l.pure_dim = j; + } + if (equal(l.min, node.region_computed[j].in.min) && + equal(l.max, node.region_computed[j].in.max)) { + l.equals_region_computed = true; + l.region_computed_dim = j; + break; + } + } + + if (!l.equals_region_computed) { + const int64_t *c_min = as_const_int(l.min), *c_max = as_const_int(l.max); + if (c_min && c_max) { + l.bounds_are_constant = true; + l.c_min = *c_min; + l.c_max = *c_max; + } else { + l.bounds_are_constant = false; + } + } + + stage.loop_nest_all_common_cases &= (l.bounds_are_constant || l.equals_region_computed); + stage.loop.emplace_back(std::move(l)); + } + + // Bundle all expressions associated with the definition into a single dummy call node + vector exprs_vector = def.args(); + exprs_vector.insert(exprs_vector.end(), def.values().begin(), def.values().end()); + if (def.predicate().defined()) { + exprs_vector.push_back(def.predicate()); + } + Expr exprs = Call::make(Int(32), "dummy", exprs_vector, Call::Extern); + + // Walk over the expressions involved sniffing types + class CheckTypes : public IRVisitor { + using IRVisitor::visit; + + void visit(const IntImm *op) override { + check_type(op->type); + } + + void visit(const UIntImm *op) override { + check_type(op->type); + } + + void visit(const FloatImm *op) override { + check_type(op->type); + } + + void visit(const Variable *op) override { + check_type(op->type); + } + + void visit(const Call *op) override { + calls[op->name]++; + IRVisitor::visit(op); + check_type(op->type); + if (op->call_type == Call::Halide || op->call_type == Call::Image) { + is_pointwise &= op->args.size() == func.args().size(); + if (is_pointwise) { + for (size_t i = 0; i < op->args.size(); i++) { + const Variable *v = op->args[i].as(); + is_pointwise 
&= (v != nullptr) && (v->name == func.args()[i]); + } + } + } + } + + void visit(const Cast *op) override { + IRVisitor::visit(op); + check_type(op->type); + } + + void check_type(Type t) { + if (t.bits() > 1 && + (!narrowest_type.bits() || + t.bits() < narrowest_type.bits())) { + narrowest_type = t; + } + } + Function func; + + public: + bool is_pointwise = true; + int leaves = 0; + Type narrowest_type; + map calls; + CheckTypes(Function f) + : func(f) { + } + }; + CheckTypes checker(consumer); + exprs.accept(&checker); + + Type widest_output_type = def.values()[0].type(); + + int bytes_per_point = 0; + for (const auto &e : def.values()) { + bytes_per_point += e.type().bytes(); + if (e.type().bytes() > widest_output_type.bytes()) { + widest_output_type = e.type(); + } + } + if (s == 0) { + node.bytes_per_point = bytes_per_point; + } + + if (target.has_gpu_feature()) { + stage.vector_size = 1; + } else { + stage.vector_size = target.natural_vector_size(checker.narrowest_type); + } + + if (s == 0) { + node.vector_size = stage.vector_size; + } else { + node.vector_size = std::max(node.vector_size, stage.vector_size); + } + + node.is_output = false; + for (const auto &o : outputs) { + node.is_output |= o.same_as(node.func); + } + + if (node.is_output) { + // Get the bounds estimate + map estimates; + for (auto b : consumer.schedule().estimates()) { + int64_t i_min = *as_const_int(b.min); + int64_t i_extent = *as_const_int(b.extent); + + if ((false)) { // Intentional dead code. Extra parens to pacify clang-tidy. + // Some methods we compare to compile for + // statically known input/output sizes. We + // don't need to - we take estimates but + // the compiled code doesn't enforce + // them. If you want to make a comparison + // fair and target a fixed size, use this + // branch of the if. In practice we don't + // see a runtime difference, so we left it + // disabled. 
In theory, Sizes being + // constant makes it possible to do things + // like unroll across color channels, so + // it affects the scheduling space. + Func(node.func).bound(b.var, b.min, b.extent); + estimates[b.var] = Span(i_min, i_min + i_extent - 1, true); + } else { + estimates[b.var] = Span(i_min, i_min + i_extent - 1, false); + } + } + for (auto b : consumer.schedule().bounds()) { + const int64_t *i_min = as_const_int(b.min); + const int64_t *i_extent = as_const_int(b.extent); + if (i_min && i_extent) { + // It's a true bound, not just an estimate + estimates[b.var] = Span(*i_min, *i_min + *i_extent - 1, true); + } + } + // Set the bounds using the estimates + for (int i = 0; i < consumer.dimensions(); i++) { + auto it = estimates.find(consumer.args()[i]); + user_assert(it != estimates.end()) + << "Need an estimate on dimension " << i << " of \"" << consumer.name() << "\""; + node.estimated_region_required.push_back(it->second); + } + } + + stage.index = s; + + exprs = apply_param_estimates.mutate(exprs); + + for (auto &p : func_value_bounds) { + p.second.min = apply_param_estimates.mutate(p.second.min); + p.second.max = apply_param_estimates.mutate(p.second.max); + } + + // For this stage scope we want symbolic bounds for the rvars + + // Now create the edges that lead to this func + bool any_incoming_edges = false; + node.is_pointwise = !node.func.has_update_definition(); + + // TODO: peephole the boundary condition call pattern instead of assuming the user used the builtin + node.is_boundary_condition = node.is_pointwise && starts_with(node.func.name(), "repeat_edge"); + + auto boxes = boxes_required(exprs, stage_scope_with_symbolic_rvar_bounds, func_value_bounds); + for (auto &p : boxes) { + auto it = env.find(p.first); + if (it != env.end() && p.first != consumer.name()) { + // Discard loads from input images and self-loads + Edge edge; + edge.consumer = &stage; + edge.producer = node_map.at(env[p.first]); + edge.all_bounds_affine = true; + + for 
(Interval &in : p.second.bounds) { + // Whenever a relationship is unbounded, we must inline + internal_assert(in.is_bounded()) + << "Unbounded producer->consumer relationship: " + << edge.producer->func.name() << " -> " << edge.consumer->name << "\n"; + Edge::BoundInfo min(simplify(in.min), *edge.consumer); + Edge::BoundInfo max(simplify(in.max), *edge.consumer); + edge.bounds.emplace_back(std::move(min), std::move(max)); + edge.all_bounds_affine &= edge.bounds.back().first.affine; + edge.all_bounds_affine &= edge.bounds.back().second.affine; + } + edge.calls = checker.calls[edge.producer->func.name()]; + any_incoming_edges = true; + node.is_pointwise &= checker.is_pointwise; + edges.emplace_back(std::move(edge)); + } + } + + node.is_wrapper = node.func.is_wrapper(); + node.is_input = !node.func.has_update_definition() && node.is_wrapper && !any_incoming_edges; + node.dimensions = node.func.dimensions(); + } + } + + // Initialize the memory layouts for the bounds structs + for (auto &n : nodes) { + n.bounds_memory_layout.reset(new BoundContents::Layout); + auto &l = *(n.bounds_memory_layout); + l.computed_offset = n.func.dimensions(); + l.total_size = l.computed_offset + n.func.dimensions(); + for (const auto &s : n.stages) { + l.loop_offset.push_back(l.total_size); + l.total_size += (int)s.loop.size(); + } + } + + // Give all the stages unique ids to support perfect hashing of them + { + int i = 0; + for (auto &n : nodes) { + for (auto &s : n.stages) { + s.id = i; + s.max_id = stage_count; + stage_id_to_node_map[s.id] = &n; + i++; + } + } + } + + for (size_t i = 0; i < edges.size(); i++) { + edges[i].producer->outgoing_edges.push_back(&(edges[i])); + edges[i].consumer->incoming_edges.push_back(&(edges[i])); + } + + // Compute transitive dependencies + for (size_t i = nodes.size(); i > 0; i--) { + auto &n = nodes[i - 1]; + for (auto &s : n.stages) { + s.dependencies.resize(nodes.size(), false); + for (auto *e : s.incoming_edges) { + s.dependencies[e->producer->id] 
= true; + for (auto &s2 : e->producer->stages) { + for (size_t j = 0; j < nodes.size(); j++) { + s.dependencies[j] = s.dependencies[j] || s2.dependencies[j]; + } + } + } + } + } + + // Compute the algorithm-specific features for the neural net + featurize(); + + for (Node &node : nodes) { + if (node.is_input) { + continue; + } + ++num_non_input_nodes; + } +} + +void FunctionDAG::featurize() { + for (Node &node : nodes) { + for (size_t stage_idx = 0; stage_idx < node.stages.size(); stage_idx++) { + Node::Stage &stage = node.stages[stage_idx]; + + Featurizer featurizer(node.func, stage); + + if (node.func.extern_definition_proxy_expr().get()) { + // Extern function call with a proxy implementation specified: generate the featurization from the proxy + Expr v = simplify(node.func.extern_definition_proxy_expr()); + v = common_subexpression_elimination(v); + v.accept(&featurizer); + } else { + Definition def = node.func.definition(); + if (stage_idx > 0) { + def = node.func.updates()[stage_idx - 1]; + } + stage.features = PipelineFeatures(); + + for (auto v : def.values()) { + featurizer.visit_store_args(node.func.name(), v.type(), def.args()); + v = common_subexpression_elimination(simplify(v)); // Get things into canonical form + v.accept(&featurizer); + } + for (auto v : def.args()) { + v = common_subexpression_elimination(simplify(v)); // Get things into canonical form + v.accept(&featurizer); + } + } + } + } +} + +template +void FunctionDAG::dump_internal(OS &os) const { + for (const Node &n : nodes) { + os << "Node: " << n.func.name() << "\n" + << " Symbolic region required: \n"; + for (const SymbolicInterval &i : n.region_required) { + os << " " << i.min << ", " << i.max << "\n"; + } + os << " Region computed: \n"; + for (const auto &i : n.region_computed) { + os << " " << i.in.min << ", " << i.in.max << "\n"; + } + for (size_t i = 0; i < n.stages.size(); i++) { + os << " Stage " << i << ":\n"; + for (const auto &l : n.stages[i].loop) { + os << " " << l.var << " 
" << l.min << " " << l.max << "\n"; + } + n.stages[i].features.dump(os); + } + os << " pointwise: " << n.is_pointwise + << " boundary condition: " << n.is_boundary_condition + << " wrapper: " << n.is_wrapper + << " input: " << n.is_input + << " output: " << n.is_output << "\n"; + } + for (const Edge &e : edges) { + os << "Edge: " << e.producer->func.name() << " -> " << e.consumer->name << "\n" + << " Footprint: \n"; + int j = 0; + for (const auto &i : e.bounds) { + os << " Min " << j << ": " << i.first.expr << "\n"; + os << " Max " << j << ": " << i.second.expr << "\n"; + j++; + } + + os << " Load Jacobians:\n"; + for (const auto &jac : e.load_jacobians) { + jac.dump(" "); + } + } +} + +void FunctionDAG::dump() const { + auto os = aslog(0); + dump_internal(os); +} + +std::ostream &FunctionDAG::dump(std::ostream &os) const { + dump_internal(os); + return os; +} + +int ExprBranching::visit(const IntImm *op) { + return 1; +} + +int ExprBranching::visit(const UIntImm *op) { + return 1; +} + +int ExprBranching::visit(const FloatImm *op) { + return 1; +} + +int ExprBranching::visit(const StringImm *op) { + return 1; +} + +int ExprBranching::visit(const Broadcast *op) { + return Super::dispatch(op->value); +} + +int ExprBranching::visit(const Cast *op) { + return Super::dispatch(op->value); +} + +int ExprBranching::visit(const Variable *op) { + return 1; +} + +int ExprBranching::visit_binary(const Expr &a, const Expr &b) { + int branching_a = Super::dispatch(a); + int branching_b = Super::dispatch(b); + + if (branching_a == branching_b) { + return branching_a + 1; + } + + return std::max(branching_a, branching_b); +} + +int ExprBranching::visit(const Add *op) { + return visit_binary(op->a, op->b); +} + +int ExprBranching::visit(const Sub *op) { + return visit_binary(op->a, op->b); +} + +int ExprBranching::visit(const Mul *op) { + return visit_binary(op->a, op->b); +} + +int ExprBranching::visit(const Div *op) { + return visit_binary(op->a, op->b); +} + +int 
ExprBranching::visit(const Mod *op) { + return visit_binary(op->a, op->b); +} + +int ExprBranching::visit(const Min *op) { + return visit_binary(op->a, op->b); +} + +int ExprBranching::visit(const Max *op) { + return visit_binary(op->a, op->b); +} + +int ExprBranching::visit(const EQ *op) { + return visit_binary(op->a, op->b); +} + +int ExprBranching::visit(const NE *op) { + return visit_binary(op->a, op->b); +} + +int ExprBranching::visit(const LT *op) { + return visit_binary(op->a, op->b); +} + +int ExprBranching::visit(const LE *op) { + return visit_binary(op->a, op->b); +} + +int ExprBranching::visit(const GT *op) { + return visit_binary(op->a, op->b); +} + +int ExprBranching::visit(const GE *op) { + return visit_binary(op->a, op->b); +} + +int ExprBranching::visit(const And *op) { + return visit_binary(op->a, op->b); +} + +int ExprBranching::visit(const Or *op) { + return visit_binary(op->a, op->b); +} + +int ExprBranching::visit(const Not *op) { + return Super::dispatch(op->a); +} + +int ExprBranching::visit(const Select *op) { + int branching_true = visit_binary(op->condition, op->true_value); + int branching_false = visit_binary(op->condition, op->false_value); + return std::max(branching_true, branching_false); +} + +int ExprBranching::visit(const Ramp *op) { + return visit_binary(op->base, op->stride); +} + +int ExprBranching::visit(const Load *op) { + return visit_binary(op->predicate, op->index); +} + +int ExprBranching::visit_nary(const std::vector& exprs) { + int total_branching = 0; + + for (Expr e : exprs) { + int branching = Super::dispatch(e); + if (branching == 0) { + continue; + } + + if (branching == total_branching) { + ++total_branching; + } else { + total_branching = std::max(total_branching, branching); + } + } + + return total_branching; +} + +int ExprBranching::visit(const Call *op) { + for (const auto& i : inlined) { + if (op->name == i.first->func.name()) { + return compute(i.first->func); + } + } + + return visit_nary(op->args); +} + 
+int ExprBranching::visit(const Shuffle *op) { + return visit_nary(op->vectors); +} + +int ExprBranching::visit(const Let *op) { + return visit_binary(op->value, op->body); +} + +int ExprBranching::visit(const VectorReduce *op) { + return Super::dispatch(op->value); +} + +int ExprBranching::compute(const Function& f) { + Definition def = f.definition(); + + std::vector values; + values.reserve(def.values().size()); + for (auto v : def.values()) { + values.push_back(common_subexpression_elimination(simplify(v))); // Get things into canonical form + } + + int branching = visit_nary(values); + + std::vector args; + args.reserve(def.args().size()); + for (auto v : def.args()) { + args.push_back(common_subexpression_elimination(simplify(v))); // Get things into canonical form + } + + return std::max(branching, visit_nary(args)); +} + +void sanitize_names(std::string& str) { + bool in_quotes = false; + for (auto &c : str) { + in_quotes ^= (c == '"'); + if (!in_quotes && c == '$') c = '_'; + } +} + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide diff --git a/src/autoschedulers/anderson2021/FunctionDAG.h b/src/autoschedulers/anderson2021/FunctionDAG.h new file mode 100644 index 000000000000..00d6313d8d17 --- /dev/null +++ b/src/autoschedulers/anderson2021/FunctionDAG.h @@ -0,0 +1,674 @@ +/** This file defines the class FunctionDAG, which is our + * representation of a Halide pipeline, and contains methods to using + * Halide's bounds tools to query properties of it. */ + +#ifndef FUNCTION_DAG_H +#define FUNCTION_DAG_H + +#include +#include +#include +#include +#include + +#include "Errors.h" +#include "Featurization.h" +#include "Halide.h" +#include "PerfectHashMap.h" + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +using std::map; +using std::pair; +using std::string; +using std::unique_ptr; +using std::vector; + +// First we have various utility classes. 
+ +// An optional rational type used when analyzing memory dependencies. +struct OptionalRational { + int32_t numerator = 0, denominator = 0; + + bool exists() const { + return denominator != 0; + } + + OptionalRational() = default; + OptionalRational(int64_t n, int64_t d) + : numerator(n), denominator(d) { + } + + void operator+=(const OptionalRational &other) { + if ((denominator & other.denominator) == 0) { + numerator = denominator = 0; + return; + } + if (denominator == other.denominator) { + numerator += other.numerator; + return; + } + + int64_t l = lcm(denominator, other.denominator); + numerator *= l / denominator; + denominator = l; + numerator += other.numerator * (l / other.denominator); + int64_t g = gcd(numerator, denominator); + numerator /= g; + denominator /= g; + } + + OptionalRational operator*(int64_t factor) const { + if ((*this) == 0) return *this; + int64_t num = numerator * factor; + return OptionalRational{num, denominator}; + } + + OptionalRational operator*(const OptionalRational &other) const { + if ((*this) == 0) return *this; + if (other == 0) return other; + int64_t num = numerator * other.numerator; + int64_t den = denominator * other.denominator; + return OptionalRational{num, den}; + } + + // Because this type is optional (exists may be false), we don't + // have a total ordering. These methods all return false when the + // operators are not comparable, so a < b is not the same as !(a + // >= b). 
+ bool operator<(int x) const { + if (denominator == 0) { + return false; + } else if (denominator > 0) { + return numerator < x * denominator; + } else { + return numerator > x * denominator; + } + } + + bool operator<=(int x) const { + if (denominator == 0) { + return false; + } else if (denominator > 0) { + return numerator <= x * denominator; + } else { + return numerator >= x * denominator; + } + } + + bool operator>(int x) const { + if (!exists()) return false; + return !((*this) <= x); + } + + bool operator>=(int x) const { + if (!exists()) return false; + return !((*this) < x); + } + + bool operator==(int x) const { + return exists() && (numerator == (x * denominator)); + } + + bool operator==(const OptionalRational &other) const { + return (exists() == other.exists()) && (numerator * other.denominator == denominator * other.numerator); + } +}; + +// A LoadJacobian records the derivative of the coordinate accessed in +// some producer w.r.t the loops of the consumer. +class LoadJacobian { + std::vector coeffs; + int64_t c; + size_t rows, cols; + +public: + LoadJacobian(size_t producer_storage_dims, size_t consumer_loop_dims, int64_t count) + : c(count), rows(producer_storage_dims), cols(consumer_loop_dims) { + coeffs.resize(rows * cols); + } + + bool all_coeffs_exist() const { + for (const auto& coeff : coeffs) { + if (!coeff.exists()) { + return false; + } + } + return true; + } + + bool empty() const { + return rows == 0; + } + + size_t producer_storage_dims() const { + return rows; + } + + size_t consumer_loop_dims() const { + return cols; + } + + bool is_constant() const { + for (const auto& c : coeffs) { + if (!c.exists() || !(c == 0)) { + return false; + } + } + + return true; + } + + OptionalRational operator()(int producer_storage_dim, int consumer_loop_dim) const { + if (producer_storage_dims() == 0 || consumer_loop_dims() == 0) { + // The producer or consumer is scalar, so all strides are zero. 
+ return {0, 1}; + } + return coeffs[producer_storage_dim * cols + consumer_loop_dim]; + } + + OptionalRational &operator()(int producer_storage_dim, int consumer_loop_dim) { + return coeffs[producer_storage_dim * cols + consumer_loop_dim]; + } + + // To avoid redundantly re-recording copies of the same + // load Jacobian, we keep a count of how many times a + // load with this Jacobian occurs. + int64_t count() const { + return c; + } + + // Try to merge another LoadJacobian into this one, increasing the + // count if the coefficients match. + bool merge(const LoadJacobian &other) { + if (other.rows != rows || other.cols != cols) return false; + for (size_t i = 0; i < rows * cols; i++) { + if (!(other.coeffs[i] == coeffs[i])) return false; + } + c += other.count(); + return true; + } + + // Scale the matrix coefficients by the given factors + LoadJacobian operator*(const std::vector &factors) const { + LoadJacobian result(rows, cols, c); + for (size_t i = 0; i < producer_storage_dims(); i++) { + for (size_t j = 0; j < consumer_loop_dims(); j++) { + result(i, j) = (*this)(i, j) * factors[j]; + } + } + return result; + } + + // Multiply Jacobians, used to look at memory dependencies through + // inlined functions. + LoadJacobian operator*(const LoadJacobian &other) const { + LoadJacobian result(producer_storage_dims(), other.consumer_loop_dims(), count() * other.count()); + for (size_t i = 0; i < producer_storage_dims(); i++) { + for (size_t j = 0; j < other.consumer_loop_dims(); j++) { + result(i, j) = OptionalRational{0, 1}; + for (size_t k = 0; k < consumer_loop_dims(); k++) { + result(i, j) += (*this)(i, k) * other(k, j); + } + } + } + return result; + } + + void dump(const char *prefix) const; +}; + +// Classes to represent a concrete set of bounds for a Func. A Span is +// single-dimensional, and a Bound is a multi-dimensional box. For +// each dimension we track the estimated size, and also whether or not +// the size is known to be constant at compile-time. 
For each Func we +// track three different types of bounds: + +// 1) The region required by consumers of the Func, which determines +// 2) The region actually computed, which in turn determines +// 3) The min and max of all loops in the loop next. + +// 3 in turn determines the region required of the inputs to a Func, +// which determines their region computed, and hence their loop nest, +// and so on back up the Function DAG from outputs back to inputs. + +class Span { + int64_t min_, max_; + bool constant_extent_; + +public: + int64_t min() const { + return min_; + } + int64_t max() const { + return max_; + } + int64_t extent() const { + return max_ - min_ + 1; + } + bool constant_extent() const { + return constant_extent_; + } + + void union_with(const Span &other) { + min_ = std::min(min_, other.min()); + max_ = std::max(max_, other.max()); + constant_extent_ = constant_extent_ && other.constant_extent(); + } + + void set_extent(int64_t e) { + max_ = min_ + e - 1; + } + + void translate(int64_t x) { + min_ += x; + max_ += x; + } + + Span(int64_t a, int64_t b, bool c) + : min_(a), max_(b), constant_extent_(c) { + } + Span() = default; + Span(const Span &other) = default; + static Span empty_span() { + return Span(INT64_MAX, INT64_MIN, true); + } +}; + +// Bounds objects are created and destroyed very frequently while +// exploring scheduling options, so we have a custom allocator and +// memory pool. Much like IR nodes, we treat them as immutable once +// created and wrapped in a Bound object so that they can be shared +// safely across scheduling alternatives. 
+struct BoundContents { + mutable RefCount ref_count; + + class Layout; + const Layout *layout = nullptr; + + Span *data() const { + // This struct is a header + return (Span *)(const_cast(this) + 1); + } + + Span ®ion_required(int i) { + return data()[i]; + } + + Span ®ion_computed(int i) { + return data()[i + layout->computed_offset]; + } + + Span &loops(int i, int j) { + return data()[j + layout->loop_offset[i]]; + } + + const Span ®ion_required(int i) const { + return data()[i]; + } + + const Span ®ion_computed(int i) const { + return data()[i + layout->computed_offset]; + } + + const Span &loops(int i, int j) const { + return data()[j + layout->loop_offset[i]]; + } + + BoundContents *make_copy() const { + auto b = layout->make(); + size_t bytes = sizeof(data()[0]) * layout->total_size; + memcpy(b->data(), data(), bytes); + return b; + } + + void validate() const; + + // We're frequently going to need to make these concrete bounds + // arrays. It makes things more efficient if we figure out the + // memory layout of those data structures once ahead of time, and + // make each individual instance just use that. Note that this is + // not thread-safe. 
+ class Layout { + // A memory pool of free BoundContent objects with this layout + mutable std::vector pool; + + // All the blocks of memory allocated + mutable std::vector blocks; + + mutable size_t num_live = 0; + + void allocate_some_more() const; + + public: + // number of Span to allocate + int total_size; + + // region_computed comes next at the following index + int computed_offset; + + // the loop for each stage starts at the following index + std::vector loop_offset; + + Layout() = default; + ~Layout(); + + Layout(const Layout &) = delete; + void operator=(const Layout &) = delete; + Layout(Layout &&) = delete; + void operator=(Layout &&) = delete; + + // Make a BoundContents object with this layout + BoundContents *make() const; + + // Release a BoundContents object with this layout back to the pool + void release(const BoundContents *b) const; + }; +}; + +using Bound = IntrusivePtr; + +// A representation of the function DAG. The nodes and edges are both +// in reverse realization order, so if you want to walk backwards up +// the DAG, just iterate the nodes or edges in-order. +struct FunctionDAG { + + // An edge is a producer-consumer relationship + struct Edge; + + struct SymbolicInterval { + Halide::Var min; + Halide::Var max; + }; + + // A Node represents a single Func + struct Node { + // A pointer back to the owning DAG + FunctionDAG *dag; + + // The Halide Func this represents + Function func; + + // The number of bytes per point stored. + double bytes_per_point; + + // The min/max variables used to denote a symbolic region of + // this Func. Used in the cost above, and in the Edges below. + vector region_required; + + // A concrete region required from a bounds estimate. Only + // defined for outputs. + vector estimated_region_required; + + // The region computed of a Func, in terms of the region + // required. For simple Funcs this is identical to the + // region_required. 
However, in some Funcs computing one + // output requires computing other outputs too. You can't + // really ask for a single output pixel from something blurred + // with an IIR without computing the others, for example. + struct RegionComputedInfo { + // The min and max in their full symbolic glory. We use + // these in the general case. + Interval in; + + // Analysis used to accelerate common cases + bool equals_required = false, equals_union_of_required_with_constants = false; + int64_t c_min = 0, c_max = 0; + }; + vector region_computed; + bool region_computed_all_common_cases = false; + + // Expand a region required into a region computed, using the + // symbolic intervals above. + void required_to_computed(const Span *required, Span *computed) const; + + // Metadata about one symbolic loop in a Func's default loop nest. + struct Loop { + string var; + bool pure, rvar; + Expr min, max; + + // Which pure dimension does this loop correspond to? Invalid if it's an rvar + int pure_dim; + + // Precomputed metadata to accelerate common cases: + + // If true, the loop bounds are just the region computed in the given dimension + bool equals_region_computed = false; + int region_computed_dim = 0; + + // If true, the loop bounds are a constant with the given min and max + bool bounds_are_constant = false; + int64_t c_min = 0, c_max = 0; + + // A persistent fragment of source for getting this Var + // from its owner Func. Used for printing source code + // equivalent to a computed schedule. + string accessor; + }; + + // Get the loop nest shape as a function of the region computed + void loop_nest_for_region(int stage_idx, const Span *computed, Span *loop) const; + + // One stage of a Func + struct Stage { + // The owning Node + Node *node; + + // Which stage of the Func is this. 0 = pure. + int index; + + // The loop nest that computes this stage, from innermost out. 
+ vector loop; + bool loop_nest_all_common_cases = false; + + // The vectorization width that will be used for + // compute. Corresponds to the natural width for the + // narrowest type used. + int vector_size; + + // The featurization of the compute done + PipelineFeatures features; + + // The actual Halide front-end stage object + Halide::Stage stage; + + // The name for scheduling (e.g. "foo.update(3)") + string name; + + string sanitized_name; + + // Ids for perfect hashing on stages. + int id, max_id; + + std::unique_ptr store_jacobian; + + vector incoming_edges; + + vector dependencies; + bool downstream_of(const Node &n) const { + return dependencies[n.id]; + }; + + Stage(Halide::Stage s) + : stage(s) { + } + + int get_loop_index_from_var(const std::string& var) const { + int i = 0; + for (const auto& l : loop) { + if (l.var == var) { + return i; + } + + ++i; + } + + return -1; + } + }; + vector stages; + + vector outgoing_edges; + + // Max vector size across the stages + int vector_size; + + // A unique ID for this node, allocated consecutively starting + // at zero for each pipeline. + int id, max_id; + + // Just func->dimensions(), but we ask for it so many times + // that's it's worth avoiding the function call into + // libHalide. + int dimensions; + + // Is a single pointwise call to another Func + bool is_wrapper; + + // We represent the input buffers as node, though we do not attempt to schedule them. 
+ bool is_input; + + // Is one of the pipeline outputs + bool is_output; + + // Only uses pointwise calls + bool is_pointwise; + + // Only uses pointwise calls + clamping on all indices + bool is_boundary_condition; + + std::unique_ptr bounds_memory_layout; + + BoundContents *make_bound() const { + return bounds_memory_layout->make(); + } + }; + + // A representation of a producer-consumer relationship + struct Edge { + struct BoundInfo { + // The symbolic expression for the bound in this dimension + Expr expr; + + // Fields below are the results of additional analysis + // used to evaluate this bound more quickly. + int64_t coeff, constant; + int64_t consumer_dim; + bool affine, uses_max; + + BoundInfo(const Expr &e, const Node::Stage &consumer); + }; + + // Memory footprint on producer required by consumer. + vector> bounds; + + FunctionDAG::Node *producer; + FunctionDAG::Node::Stage *consumer; + + // The number of calls the consumer makes to the producer, per + // point in the loop nest of the consumer. + int calls; + + bool all_bounds_affine; + + vector load_jacobians; + + bool all_load_jacobian_coeffs_exist() const; + + void add_load_jacobian(LoadJacobian j1); + + // Given a loop nest of the consumer stage, expand a region + // required of the producer to be large enough to include all + // points required. + void expand_footprint(const Span *consumer_loop, Span *producer_required) const; + }; + + vector nodes; + vector edges; + + int num_non_input_nodes{0}; + + // We're going to be querying this DAG a lot while searching for + // an optimal schedule, so we'll also create a variety of + // auxiliary data structures. + map stage_id_to_node_map; + + // Create the function DAG, and do all the dependency and cost + // analysis. This is done once up-front before the tree search. 
+ FunctionDAG(const vector &outputs, const MachineParams ¶ms, const Target &target); + + void dump() const; + std::ostream &dump(std::ostream &os) const; + +private: + // Compute the featurization for the entire DAG + void featurize(); + + // This class uses a lot of internal pointers, so we'll hide the copy constructor. + FunctionDAG(const FunctionDAG &other) = delete; + void operator=(const FunctionDAG &other) = delete; + + template + void dump_internal(OS &os) const; +}; + +template +using NodeMap = PerfectHashMap; + +class ExprBranching : public VariadicVisitor { + using Super = VariadicVisitor; + +private: + const NodeMap& inlined; + +public: + int visit(const IntImm *op); + int visit(const UIntImm *op); + int visit(const FloatImm *op); + int visit(const StringImm *op); + int visit(const Broadcast *op); + int visit(const Cast *op); + int visit(const Variable *op); + int visit(const Add *op); + int visit(const Sub *op); + int visit(const Mul *op); + int visit(const Div *op); + int visit(const Mod *op); + int visit(const Min *op); + int visit(const Max *op); + int visit(const EQ *op); + int visit(const NE *op); + int visit(const LT *op); + int visit(const LE *op); + int visit(const GT *op); + int visit(const GE *op); + int visit(const And *op); + int visit(const Or *op); + int visit(const Not *op); + int visit(const Select *op); + int visit(const Ramp *op); + int visit(const Load *op); + int visit(const Call *op); + int visit(const Shuffle *op); + int visit(const Let *op); + int visit(const VectorReduce *op); + int visit_binary(const Expr &a, const Expr &b); + int visit_nary(const std::vector& exprs); + + ExprBranching(const NodeMap& inlined) + : inlined{inlined} + {} + + int compute(const Function& f); +}; + +void sanitize_names(std::string& str); + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide + +#endif // FUNCTION_DAG_H diff --git a/src/autoschedulers/anderson2021/GPULoopInfo.cpp 
b/src/autoschedulers/anderson2021/GPULoopInfo.cpp new file mode 100644 index 000000000000..f1c53ec8b9b0 --- /dev/null +++ b/src/autoschedulers/anderson2021/GPULoopInfo.cpp @@ -0,0 +1,108 @@ +#include "GPULoopInfo.h" +#include "Errors.h" +#include "LoopNest.h" + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +void GPULoopInfo::update(const Target& target, const LoopNest* loop) { + if (loop->is_gpu_block(target)) { + current_block_loop = loop; + num_blocks = loop->get_block_and_serial_extents(loop).first; + return; + } + + if (loop->is_gpu_thread(target)) { + current_thread_loop = loop; + return; + } + + if (loop->is_gpu_serial(target) && at_or_inside_block()) { + int64_t serial_loop_extents = 1; + for (auto c : loop->size) { + serial_loop_extents *= c; + } + + if (at_or_inside_thread()) { + total_inner_serial_extents *= serial_loop_extents; + inner_loop_stack.push_back(loop); + } else { + total_outer_serial_extents *= serial_loop_extents; + } + } +} + +int64_t GPULoopInfo::total_serial_extents() const { + return total_outer_serial_extents * total_inner_serial_extents; +} + +bool GPULoopInfo::at_or_inside_block() const { + return current_block_loop != nullptr; +} + +bool GPULoopInfo::at_or_inside_thread() const { + return current_thread_loop != nullptr; +} + +std::vector GPULoopInfo::get_inner_serial_loop_extents(const LoopNest* loop_nest) const { + internal_assert(at_or_inside_thread()); + + std::vector extents; + std::size_t N = loop_nest->stage->loop.size(); + extents.reserve(N); + + const auto &bounds = current_thread_loop->get_bounds(loop_nest->stage->node); + + for (std::size_t i = 0; i < N; i++) { + auto extent = bounds->loops(loop_nest->stage->index, i).extent(); + extents.push_back(extent); + } + + return extents; +} + +// If you have a realization inside a serial loop e.g. 
+// f 80 gpu_block +// f 32 gpu_thread +// f 8 gpu_serial +// realize: g +// g 1 gpu_serial +// g 1 gpu_simd +// f 1 gpu_simd +// This method will give the extents of the loops inside the thread level but +// outside the given loop_nest's realization e.g. 8 for g above. +int64_t GPULoopInfo::get_total_inner_serial_extents_outside_realization(const LoopNest* loop_nest) const { + int64_t extents = 1; + + for (const auto* loop : inner_loop_stack) { + if (loop->node == loop_nest->node) { + break; + } + + for (auto c : loop->size) { + extents *= c; + } + } + + return extents; +} + +std::unique_ptr GPULoopInfo::create_thread_info() { + internal_assert(at_or_inside_block()); + internal_assert(at_or_inside_thread()); + + auto max_thread_counts = current_block_loop->get_union_thread_counts(nullptr); + std::unique_ptr new_thread_info = std::make_unique( + current_thread_loop->vectorized_loop_index, + current_thread_loop->size, + current_thread_loop->stage->loop, + max_thread_counts + ); + thread_info = new_thread_info.get(); + return new_thread_info; +} + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide diff --git a/src/autoschedulers/anderson2021/GPULoopInfo.h b/src/autoschedulers/anderson2021/GPULoopInfo.h new file mode 100644 index 000000000000..62e669eb500f --- /dev/null +++ b/src/autoschedulers/anderson2021/GPULoopInfo.h @@ -0,0 +1,55 @@ +#ifndef GPU_LOOP_INFO_H +#define GPU_LOOP_INFO_H + +/** \file + * + * Data structure containing information about the current GPU loop nest + * hierarchy of blocks, threads, etc. 
Useful when computing GPU features + */ + +#include + +#include "Halide.h" +#include "ThreadInfo.h" + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +struct LoopNest; + +struct GPULoopInfo { + GPULoopInfo(const LoopNest* root) + : root{root} + {} + + const LoopNest* root = nullptr; + const LoopNest* current_block_loop = nullptr; + const LoopNest* current_thread_loop = nullptr; + std::vector inner_loop_stack; + int64_t num_blocks = 1; + int64_t total_outer_serial_extents = 1; + int64_t total_inner_serial_extents = 1; + const ThreadInfo* thread_info = nullptr; + + void update(const Target& target, const LoopNest* loop); + + int64_t total_serial_extents() const; + + bool at_or_inside_block() const; + + bool at_or_inside_thread() const; + + std::vector get_inner_serial_loop_extents(const LoopNest* loop_nest) const; + + std::unique_ptr create_thread_info(); + + int64_t get_total_inner_serial_extents_outside_realization(const LoopNest* loop_nest) const; + +}; + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide + +#endif // GPU_LOOP_INFO_H diff --git a/src/autoschedulers/anderson2021/GPUMemInfo.h b/src/autoschedulers/anderson2021/GPUMemInfo.h new file mode 100644 index 000000000000..03acff9301e9 --- /dev/null +++ b/src/autoschedulers/anderson2021/GPUMemInfo.h @@ -0,0 +1,412 @@ +#ifndef GPU_MEM_INFO_H +#define GPU_MEM_INFO_H + +#include +#include +#include + +#include "ASLog.h" + +/** \file + * + * Data structures that help track memory access information. 
Useful when + * computing GPU features + */ + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +struct GlobalMem; +struct GlobalAccessAccumulator; +struct SharedMem; +struct SharedAccessAccumulator; +struct LocalMem; +struct LocalAccessAccumulator; + +template +struct MemTraits; + +template <> +struct MemTraits { + static constexpr double bytes_per_transaction = 32; + using MemInfoType = GlobalMem; + using Accumulator = GlobalAccessAccumulator; +}; + +template <> +struct MemTraits { + static constexpr double bytes_per_transaction = 128; + using MemInfoType = SharedMem; + using Accumulator = SharedAccessAccumulator; +}; + +template <> +struct MemTraits { + static constexpr double bytes_per_transaction = 32; + using MemInfoType = GlobalMem; // Local mem behaves similarly to global mem + using Accumulator = LocalAccessAccumulator; +}; + +template +using Accumulator = typename MemTraits::Accumulator; + +template +struct MemInfo { + static constexpr double bytes_per_transaction = MemTraits::bytes_per_transaction; + + double num_transactions() const { + return total_num_transactions; + } + + void add_access_info(double num_requests, double num_transactions_per_request, double num_bytes_used_per_request) { + internal_assert(num_bytes_used_per_request > 0); + + double total_transactions = num_requests * num_transactions_per_request; + double total_bytes = total_transactions * bytes_per_transaction; + double total_bytes_used = num_requests * num_bytes_used_per_request; + + internal_assert(total_bytes_used <= total_bytes) + << "\ntotal_bytes_used = " << total_bytes_used + << "\ntotal_bytes = " << total_bytes + << "\ntotal_transactions = " << total_transactions + << "\nnum_transactions_per_request = " << num_transactions_per_request + << "\nnum_requests = " << num_requests; + + update_totals(total_transactions, total_bytes_used, total_bytes); + } + + void add(const MemInfo& other) { + total_num_transactions += other.total_num_transactions; + 
total_num_bytes_used += other.total_num_bytes_used; + total_num_bytes += other.total_num_bytes; + } + + double efficiency() const { + if (total_num_bytes == 0) { + return 1; + } + + double result = total_num_bytes_used / total_num_bytes; + internal_assert(result <= 1); + return result; + } + +private: + void update_totals(double num_transactions, double num_bytes_used, double num_bytes) { + total_num_transactions += num_transactions; + total_num_bytes_used += num_bytes_used; + total_num_bytes += num_bytes; + } + + double total_num_transactions = 0; + double total_num_bytes_used = 0; + double total_num_bytes = 0; +}; + +template +using MemInfoType = MemInfo::MemInfoType>; + +using GlobalMemInfo = MemInfoType; +using SharedMemInfo = MemInfoType; +using LocalMemInfo = MemInfoType; + +struct Strides { +public: + Strides(const std::vector& storage_strides) + : storage_strides{storage_strides} + {} + + void add_valid(const std::vector& strides) { + add(strides, true); + } + + void add_invalid() { + add({}, false); + } + + bool valid(size_t loop_index) const { + return is_valid[loop_index]; + } + + int64_t offset(size_t loop_index, int64_t point) const { + internal_assert(loop_index < is_valid.size() && valid(loop_index)); + internal_assert(index_strides[loop_index].size() == storage_strides.size()); + + int64_t result = 0; + for (size_t i = 0; i < storage_strides.size(); ++i) { + result += (int64_t)(point * index_strides[loop_index][i]) * storage_strides[i]; + } + return std::abs(result); + } + + void dump(bool verbose=false) { + if (!verbose) { + return; + } + + for (size_t i = 0; i < storage_strides.size(); ++i) { + if (!valid(i)) { + aslog(2) << "stride " << i << ": invalid\n"; + continue; + } + aslog(2) << "storage_stride " << i << ": " << storage_strides[i] << "\n"; + } + + for (size_t i = 0; i < index_strides.size(); ++i) { + for (size_t j = 0; j < index_strides[i].size(); ++j) { + aslog(2) << "index_stride " << i << ", storage_stride " << j << ": " << 
index_strides[i][j] << " "; + } + aslog(2) << "\n"; + } + } + +private: + void add(const std::vector& strides, bool e) { + index_strides.push_back(strides); + is_valid.push_back(e); + } + + std::vector storage_strides; + std::vector> index_strides; + std::vector is_valid; +}; + +struct GlobalAccessAccumulator { + GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides& strides, bool verbose) + : bytes_per_access{bytes_per_access} + , dimensions{dimensions} + , strides{strides} + , verbose{verbose} + {} + + void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) { + if (!active) { + return; + } + + if (verbose) { + aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n"; + } + + int thread_ids[3] = {x, y, z}; + int64_t byte = 0; + for (size_t i = 0; i < dimensions; ++i) { + if (!strides.valid(i)) { + ++unknown_sectors; + return; + } + byte += bytes_per_access * strides.offset(i, thread_ids[i]); + } + + if (verbose) { + aslog(2) << "byte accessed: " << byte << "\n"; + } + + int64_t sector = byte / 32; + if (verbose) { + aslog(2) << "sectors accessed: "; + } + for (int i = 0; i < bytes_per_access; ++i) { + if (verbose) { + aslog(2) << sector << " "; + } + sectors_accessed[sector].insert(byte + i); + } + if (verbose) { + aslog(2) << "\n\n"; + } + } + + void add_access_info(int num_requests, GlobalMemInfo& global_mem_info, bool is_tail_warp) const { + int num_transactions_per_request = sectors_accessed.size() + unknown_sectors; + + if (verbose) { + if (is_tail_warp) { + aslog(2) << "tail_"; + } + aslog(2) << "num_transactions_per_request = " << num_transactions_per_request << "\n"; + } + + int num_bytes_used_per_request = 0; + for (const auto& sector : sectors_accessed) { + num_bytes_used_per_request += sector.second.size(); + } + + num_bytes_used_per_request += unknown_sectors * bytes_per_access; + + if (verbose) { + if (is_tail_warp) { + aslog(2) << "tail_"; + } + aslog(2) << 
"num_requests_per_block = " << num_requests << "\n"; + } + + global_mem_info.add_access_info( + num_requests, + num_transactions_per_request, + num_bytes_used_per_request + ); + } + +private: + int bytes_per_access; + size_t dimensions; + Strides strides; + bool verbose; + int unknown_sectors = 0; + std::unordered_map> sectors_accessed; +}; + +struct SharedAccessAccumulator { + SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides& strides, bool verbose) + : bytes_per_access{bytes_per_access} + , dimensions{dimensions} + , strides{strides} + , verbose{verbose} + {} + + void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) { + if (!active) { + return; + } + + if (verbose) { + aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n"; + } + + int thread_ids[3] = {x, y, z}; + int64_t byte = 0; + for (size_t i = 0; i < dimensions; ++i) { + if (!strides.valid(i)) { + ++unknown_banks; + return; + } + byte += bytes_per_access * strides.offset(i, thread_ids[i]); + } + + if (verbose) { + aslog(2) << "bytes accessed: "; + for (int i = 0; i < bytes_per_access; ++i) { + aslog(2) << byte + i << " "; + } + aslog(2) << "\n"; + } + + if (verbose) { + aslog(2) << "banks accessed: "; + } + for (int i = 0; i < bytes_per_access; ++i) { + int64_t word = (byte + i) / 4; + int64_t bank = word % 32; + if (verbose) { + aslog(2) << bank << " "; + } + bytes_accessed.insert(byte + i); + bank_to_words_accessed[bank].insert(word); + } + if (verbose) { + aslog(2) << "\n\n"; + } + } + + void add_access_info(int num_requests, SharedMemInfo& shared_mem_info, bool is_tail_warp) const { + int num_transactions_per_request = 0; + for (const auto& bank : bank_to_words_accessed) { + num_transactions_per_request = std::max(num_transactions_per_request, (int)bank.size()); + } + + num_transactions_per_request += unknown_banks; + + if (verbose) { + if (is_tail_warp) { + aslog(2) << "tail_"; + } + aslog(2) << 
"num_transactions_per_request = " << num_transactions_per_request << "\n"; + } + + int num_bytes_used_per_request = bytes_accessed.size(); + + num_bytes_used_per_request += unknown_banks * bytes_per_access; + + if (verbose) { + if (is_tail_warp) { + aslog(2) << "tail_"; + } + aslog(2) << "num_requests_per_block = " << num_requests << "\n"; + } + + shared_mem_info.add_access_info( + num_requests, + num_transactions_per_request, + num_bytes_used_per_request + ); + } + +private: + int bytes_per_access; + size_t dimensions; + Strides strides; + bool verbose; + int unknown_banks = 0; + std::unordered_set bytes_accessed; + std::array, 32> bank_to_words_accessed; +}; + +struct LocalAccessAccumulator { + LocalAccessAccumulator(int bytes_per_access, bool verbose) + : bytes_per_access{bytes_per_access} + , verbose{verbose} + {} + + void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) { + if (!active) { + return; + } + + ++thread_count; + + if (verbose) { + aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n"; + } + } + + void add_access_info(int num_requests, LocalMemInfo& local_mem_info, bool is_tail_warp) const { + int num_bytes_used_per_request = thread_count * bytes_per_access; + int sectors_accessed = std::ceil((float)num_bytes_used_per_request / (float)LocalMemInfo::bytes_per_transaction); + int num_transactions_per_request = sectors_accessed; + + if (verbose) { + if (is_tail_warp) { + aslog(2) << "tail_"; + } + aslog(2) << "num_transactions_per_request = " << num_transactions_per_request << "\n"; + } + + if (verbose) { + if (is_tail_warp) { + aslog(2) << "tail_"; + } + aslog(2) << "num_requests_per_block = " << num_requests << "\n"; + } + + local_mem_info.add_access_info( + num_requests, + num_transactions_per_request, + num_bytes_used_per_request + ); + } + +private: + int bytes_per_access; + bool verbose; + int thread_count = 0; + std::unordered_map> sectors_accessed; +}; + + +} // namespace 
Autoscheduler +} // namespace Internal +} // namespace Halide + +#endif // GPU_MEM_INFO_H diff --git a/src/autoschedulers/anderson2021/LoopNest.cpp b/src/autoschedulers/anderson2021/LoopNest.cpp new file mode 100644 index 000000000000..b13fedb282ab --- /dev/null +++ b/src/autoschedulers/anderson2021/LoopNest.cpp @@ -0,0 +1,4024 @@ +#include "LoopNest.h" + +#include +#include +#include + +using std::set; +using std::vector; + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +// How small should an innermost loop cluster be before you just +// entirely unroll the thing +const int kUnrollLimitGPU = 16; + +// Get the HL_NO_SUBTILING environment variable. Purpose described above. +bool get_may_subtile() { + string no_subtiling_str = get_env_variable("HL_NO_SUBTILING"); + if (no_subtiling_str == "1") { + return false; + } else { + return true; + } +} + +bool may_subtile() { + static bool b = get_may_subtile(); + return b; +} + +// Shared memory limit per block for the target GPU +int64_t get_shared_memory_limit() { + // HL_SHARED_MEMORY_LIMIT is in KB + std::string limit = get_env_variable("HL_SHARED_MEMORY_LIMIT"); + if (limit.empty()) { + return 48 * 1024; + } + return atoi(limit.c_str()) * 1024; // Convert to bytes +} + +// Shared memory limit per SM for the target GPU +int64_t get_shared_memory_sm_limit_helper() { + // HL_SHARED_MEMORY_SM_LIMIT is in KB + std::string limit = get_env_variable("HL_SHARED_MEMORY_SM_LIMIT"); + if (limit.empty()) { + return 96 * 1024; + } + return atoi(limit.c_str()) * 1024; // Convert to bytes +} + +int64_t get_shared_memory_sm_limit() { + static int64_t limit = get_shared_memory_sm_limit_helper(); + return limit; +} + +// Maximum number of active blocks for the target GPU +int64_t get_active_block_hardware_limit() { + std::string limit = get_env_variable("HL_ACTIVE_BLOCK_LIMIT"); + if (limit.empty()) { + return 32; + } + return atoi(limit.c_str()); +} + +// Maximum number of active warps for the target GPU +int64_t 
get_active_warp_hardware_limit() { + std::string limit = get_env_variable("HL_ACTIVE_WARP_LIMIT"); + if (limit.empty()) { + return 64; + } + return atoi(limit.c_str()); +} + +int get_unroll_limit(const Target &target) { + return kUnrollLimitGPU; +} + +bool in_range_zero_one(double x) { + return x > 0 && x <= 1; +} + +bool are_valid_thread_extents(const vector &counts) { + int num_thread_loops = 0; + int num_threads = 1; + + for (auto c : counts) { + if (c == 1) { + continue; + } + + if (num_thread_loops >= 3 || num_threads * c > MAX_THREADS_PER_BLOCK) { + return false; + } + + num_threads *= c; + ++num_thread_loops; + } + + return true; +} + +bool all(const vector& v) { + for (auto x : v) { + if (!x) { + return false; + } + } + return true; +} + +// given a newly inserted node f into this LoopNest, get union of thread counts in each dimension +// across all siblings of f. +vector LoopNest::get_union_thread_counts(const FunctionDAG::Node *f) const { + vector max_size{1, 1, 1}; + // find the loop nests we just created and get max gpu_thread extents of other children + for (auto &c : children) { + if (c->node != f) { + if (c->gpu_label == thread) { + vector lowered_size; + lowered_dims(c->size, c->vectorized_loop_index, lowered_size); + for (int dim = 0; dim < (int)(lowered_size.size()); dim++) { + if (dim >= (int)(max_size.size())) { + max_size.push_back(lowered_size[dim]); + } else { + max_size[dim] = std::max(max_size[dim], lowered_size[dim]); + } + } + } else if (c->children.size() > 0) { // descend into children for thread blocks in serial loops + vector child_max_sizes = c->get_union_thread_counts(f); + for (int dim = 0; dim < (int)(child_max_sizes.size()); dim++) { + if (dim >= (int)(max_size.size())) { + max_size.push_back(child_max_sizes[dim]); + } else { + max_size[dim] = std::max(max_size[dim], child_max_sizes[dim]); + } + } + } // otherwise this a serial loop with no threaded descendants + } + } + return max_size; +} + +// given a newly inserted node f 
into this LoopNest, gets the size of +// all of f's stages and their pure_dim indices +void LoopNest::get_stage_sizes(const FunctionDAG::Node *f, + vector> &stage_sizes, + vector> &pure_dims, + vector &vectorized_indices) const { + stage_sizes.resize(f->stages.size()); + pure_dims.resize(f->stages.size()); + vectorized_indices.resize(f->stages.size()); + for (auto &c : children) { + if (c->node == f && f->dimensions > 0) { + vectorized_indices[c->stage->index] = c->vectorized_loop_index; + stage_sizes[c->stage->index] = c->size; + for (size_t i = 0; i < c->stage->loop.size(); i++) { + pure_dims[c->stage->index].push_back(c->stage->loop[i].pure_dim); + } + } + } +} + +// given the loop nest of a stage to parallelize at root, figure out if using odd tile sizes +// for the vectorized dimension will allow the resulting thread tiles to be multiples of 32 +// if so, we will include these in the serial loop sizes +void LoopNest::generate_vec_dim_serial_tilings(vector &serial_sizes) const { + // generate suggested tilings for vectorized dimension + int warp_width = 32; + if (size[vectorized_loop_index] % warp_width == 0) { + int remaining_ext = size[vectorized_loop_index] / warp_width; + for (int s = 3; s < 8; s += 2) { + if (remaining_ext % s == 0) { + serial_sizes.push_back(s); + } + } + } +} + +// get the loop nests of a newly inserted node, f, that is marked GPU threads. Tiles +// the newly inserted loop nests of f into a threads loop outside a serial loop. +// V is the vectorized dimension of f. Adds loopnests created from each tiling option in result. 
+bool LoopNest::add_gpu_thread_tilings(const FunctionDAG::Node *f, + const MachineParams ¶ms, + const Target &target, + int v, + vector> &result, + vector max_size) { + vector> stage_sizes; + vector> pure_dims; + vector vectorized_indices; + this->get_stage_sizes(f, stage_sizes, pure_dims, vectorized_indices); + internal_assert(stage_sizes.size() != 0); + auto tilings = generate_gpu_tilings(stage_sizes, pure_dims, max_size, (int)(stage_sizes[0].size() - 1), vectorized_indices, true, false); + bool made_child = false; + for (const auto &t : tilings) { + LoopNest *new_parent = new LoopNest; + new_parent->copy_from(*(this)); + for (auto &c : new_parent->children) { + if (c->node == f) { + c = c->parallelize_in_tiles(params, t, new_parent, target, false, false); + } + } + result.emplace_back(new_parent); + made_child = true; + } + if (!made_child) { // if we can't tile into gpu threads the inserted node, make it serial + for (auto &c : children) { + if (c->node == f) + c->gpu_label = serial; + } + } + return made_child; +} + +void LoopNest::copy_from(const LoopNest &n) { + size = n.size; + children = n.children; + inlined = n.inlined; + store_at = n.store_at; + bounds = n.bounds; + node = n.node; + stage = n.stage; + innermost = n.innermost; + tileable = n.tileable; + parallel = n.parallel; + vector_dim = n.vector_dim; + vectorized_loop_index = n.vectorized_loop_index; + gpu_label = n.gpu_label; + features.clear(); +}; + +void LoopNest::copy_from_including_features(const LoopNest &n) { + size = n.size; + children = n.children; + inlined = n.inlined; + store_at = n.store_at; + bounds = n.bounds; + node = n.node; + stage = n.stage; + innermost = n.innermost; + tileable = n.tileable; + parallel = n.parallel; + vector_dim = n.vector_dim; + vectorized_loop_index = n.vectorized_loop_index; + gpu_label = n.gpu_label; + features = n.features; + feature_intermediates = n.feature_intermediates; +}; + +// Hash the loop structure and sizes up to a fixed depth. 
This is +// used as the hash function for the coarse-to-fine beam search in +// the paper. +void LoopNest::structural_hash(uint64_t &h, int depth) const { + if (depth < 0) return; + + // Which Funcs are store_at this level? + for (const auto *n : store_at) { + hash_combine(h, n->id); + } + + hash_combine(h, -1); + + // Which Funcs are compute_at this level? + for (const auto &c : children) { + hash_combine(h, c->stage->id); + } + + // Add a barrier to ensure that moving something from the last + // compute_at to the first inlined doesn't result in the same + // hash. + hash_combine(h, -1); + + // Which Funcs are inlined at this level? + for (auto it = inlined.begin(); it != inlined.end(); it++) { + hash_combine(h, it.key()->id); + } + + hash_combine(h, -1); + + if (depth > 0) { + // What are the loop sizes of the children? + for (const auto &c : children) { + for (int64_t s : c->size) { + if (depth == 1) { + // Just take the most significant bit: is it one or not? + s = (s > 1) ? 1 : 0; + } + hash_combine(h, s); + } + } + + // Which dimension are we vectorized over? 
+ hash_combine(h, vectorized_loop_index); + + hash_combine(h, vector_dim); + } + + if (depth > 1) { + // Descend into children + for (const auto &c : children) { + c->structural_hash(h, depth - 2); + } + } +} + +GPUMemoryType LoopNest::get_gpu_memory_type(bool in_block, bool in_thread, bool is_inlined) const { + if (is_inlined) { + return GPUMemoryType::inlined; + } + + if (in_thread) { + internal_assert(in_block); + return GPUMemoryType::local; + } + + if (in_block) { + return GPUMemoryType::shared; + } + + return GPUMemoryType::global; +} + +std::vector LoopNest::unrolled_loops(const Target& target, const LoopNest* parent, const LoopNest* grandparent) const { + internal_assert(innermost); + const auto &grandparent_bounds = grandparent->get_bounds(node); + std::vector unrolled(parent->size.size(), 0); + + if (parent->node != node) { + return unrolled; + } + + int64_t total_extent = 1; + for (size_t i = 0; i < parent->size.size(); i++) { + if (!stage->loop[i].rvar) { + const auto &l = grandparent_bounds->loops(parent->stage->index, i); + unrolled[i] = l.constant_extent(); + total_extent *= l.extent(); + } + } + + if (total_extent <= get_unroll_limit(target)) { + return unrolled; + } + + std::fill(unrolled.begin(), unrolled.end(), 0); + return unrolled; +} + +bool accessed_at_constant_indices(const std::vector& unrolled, const FunctionDAG::Edge* e) { + for (const auto& jac : e->load_jacobians) { + for (size_t loop_index = 0; loop_index < unrolled.size(); ++loop_index) { + for (int i = 0; i < e->producer->dimensions; ++i) { + // There are two ways for an index to be constant: + // 1. It's an actual constant i.e. the jac entry = 0 + // 2. 
It has a known stride and the loop accessing it is + // unrolled + if (!(jac(i, loop_index) == 0) && (!jac(i, loop_index).exists() || !unrolled[loop_index])) { + return false; + } + } + } + } + + return true; +} + +void LoopNest::get_allocs_that_can_be_promoted_to_registers(const Target &target, + StageMap &sites, + NodeMap &can_be_promoted_to_registers, + const LoopNest *grandparent, + const LoopNest *parent) const { + + for (const auto* alloc_node : store_at) { + const auto& store_site = sites.get(&alloc_node->stages[0]); + if (store_site.gpu_store_memory_type != GPUMemoryType::local) { + continue; + } + + can_be_promoted_to_registers.get_or_create(alloc_node) = store_site.is_constant_allocation && store_site.allocation_size <= get_register_mem_alloc_limit(); + } + + for (const auto &c : children) { + c->get_allocs_that_can_be_promoted_to_registers(target, sites, can_be_promoted_to_registers, parent, this); + } + + if (innermost) { + auto unrolled = unrolled_loops(target, parent, grandparent); + + for (const auto* e : stage->incoming_edges) { + if (sites.get(&e->producer->stages[0]).gpu_store_memory_type != GPUMemoryType::local) { + continue; + } + + can_be_promoted_to_registers.get(e->producer) = can_be_promoted_to_registers.get(e->producer) && accessed_at_constant_indices(unrolled, e); + } + } +} + +// Compute all the sites of interest for each pipeline stage +void LoopNest::get_sites(const Target &target, + StageMap &sites, + StageMap &total_shared_mem_alloc_sizes, + const LoopNest *task, + const LoopNest *parent, + const LoopNest *current_thread_loop) const { + if (is_gpu_thread(target)) { + current_thread_loop = this; + } + + if (!task && !is_root()) { + task = this; + } + + for (const auto &c : children) { + c->get_sites(target, sites, total_shared_mem_alloc_sizes, task, this, current_thread_loop); + } + if (parent && node != parent->node) { + auto &s = sites.get_or_create(stage); + s.compute = parent; + s.produce = this; + s.task = task; + } + + bool 
in_block = task != nullptr; + bool in_thread = current_thread_loop != nullptr; + + for (auto f : store_at) { + auto store_gpu_memory_type = get_gpu_memory_type(in_block, in_thread); + + for (const auto &s : f->stages) { + sites.get_or_create(&s).store = this; + sites.get_or_create(&s).gpu_store_memory_type = store_gpu_memory_type; + auto alloc = sites.get_or_create(&s).store->compute_alloc_size_of_node_here(f); + sites.get_or_create(&s).allocation_size = alloc.first; + sites.get_or_create(&s).is_constant_allocation = alloc.second; + + const LoopNest* store_site = sites.get_or_create(&s).store; + if (store_site->gpu_label == block && s.index == 0) { + total_shared_mem_alloc_sizes.get_or_create(store_site->stage) += alloc.first; + } + } + } + for (auto it = inlined.begin(); it != inlined.end(); it++) { + auto &s = sites.get_or_create(&(it.key()->stages[0])); + s.inlined = true; + // These values will be unreliable for inlined Funcs that are located + // at multiple different locations + s.compute = s.store = s.produce = s.innermost = this; + + // Accumulate all the innermost loop nests into which this func is + // inlined + s.inlined_innermosts.push_back(this); + s.gpu_store_memory_type = GPUMemoryType::inlined; + s.task = task; + } + if (innermost) { + sites.get_or_create(stage).innermost = this; + sites.get_or_create(stage).thread = current_thread_loop; + } +} + +bool LoopNest::promote_allocs_to_registers(const Target &target, StageMap &sites) const { + NodeMap can_be_promoted_to_registers; + get_allocs_that_can_be_promoted_to_registers(target, sites, can_be_promoted_to_registers, nullptr, nullptr); + + + for (auto& node : can_be_promoted_to_registers) { + if (!node.second) { + return false; + } + + for (auto& stage : node.first->stages) { + internal_assert(sites.get(&stage).gpu_store_memory_type == GPUMemoryType::local); + sites.get(&stage).gpu_store_memory_type = GPUMemoryType::registers; + } + } + + return true; +} + +bool 
LoopNest::exceeds_serial_extents_limit(const Target &target, const LoopNest *parent, bool in_threads_loop) const { + bool parent_of_innermost = false; + for (const auto &c : children) { + if (c->node == node && c->innermost) { + parent_of_innermost = true; + } + } + + if (gpu_label == serial && stage->index == 0) { + int64_t serial_loop_extents = 1; + for (size_t i = 0; i < stage->loop.size(); i++) { + if (!stage->loop[i].pure) { + continue; + } + + serial_loop_extents *= size[stage->loop[i].pure_dim]; + } + + if (parent_of_innermost) { + return serial_loop_extents > get_unroll_limit(target); + } + + if (serial_loop_extents > 64) { + return true; + } + } + + for (const auto &c : children) { + if (c->exceeds_serial_extents_limit(target, this, in_threads_loop || c->gpu_label == thread)) { + return true; + } + } + + return false; +} + +bool LoopNest::node_has_dynamic_region_computed(const FunctionDAG::Node *f) const { + for (int i = 0; i < f->dimensions; i++) { + const auto ®ion = get_bounds(f)->region_computed(i); + + if (!region.constant_extent()) { + return true; + } + } + + return false; +} + +bool LoopNest::has_dynamic_allocation_inside_thread(bool in_thread_loop) const { + in_thread_loop = in_thread_loop || (gpu_label == thread); + + if (in_thread_loop) { + for (const auto &f : store_at) { + if (node_has_dynamic_region_computed(f)) { + return true; + } + } + } + + for (const auto &child : children) { + if (child->has_dynamic_allocation_inside_thread(in_thread_loop)) { + return true; + } + } + + return false; +} + +const LoopNest *LoopNest::find_pure_stage_loop_nest(const FunctionDAG::Node *node) const { + const LoopNest *pure; + for (const auto &c : children) { + if (node == c->node) { + if (c->stage->index == 0) { + return c.get(); + } + } else { + pure = c->find_pure_stage_loop_nest(node); + if (pure) { + return pure; + } + } + } + + return nullptr; +} + +int LoopNest::get_pure_stage_vectorized_loop_index(const FunctionDAG::Node *node) const { + const auto 
*pure = find_pure_stage_loop_nest(node); + internal_assert(pure) << "No pure stage found for " << node->func.name() << "\n"; + return pure->vectorized_loop_index; +} + +int LoopNest::get_vectorized_loop_index_from_pure_stage(const LoopNest &root) const { + int v = vectorized_loop_index; + if (v < 0) { + v = root.get_pure_stage_vectorized_loop_index(node); + } + + // For update stages, it's possible that the pure stage's vectorized + // loop index is larger than the dimensions of the update stage e.g. + // the pure stage's vectorized loop index is 3, but the update stage + // has 3 or fewer dimensions. In this case, the vectorized loop + // index should just be its innermost dimension i.e. 0 + if ((size_t)v >= stage->loop.size()) { + v = 0; + } + + return v; +} + +// Get the stride over "node's" storage for a unit increment in the vectorized loop's +// index +double LoopNest::storage_stride(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const LoopNest &root) const { + internal_assert(innermost_storage_dim >= 0); + + // The node's storage dimensions (from innermost outward) + std::vector storage_dims; + storage_dims.push_back(innermost_storage_dim); + for (int i = 0; i < storage_node->dimensions; i++) { + if (i == storage_dims[0]) { + continue; + } + + storage_dims.push_back(i); + } + + std::vector storage_strides; + int64_t storage_stride = 1; + for (std::size_t i = 0; i < storage_dims.size(); i++) { + storage_strides.push_back(storage_stride); + storage_stride *= store_bounds->region_required(storage_dims[i]).extent(); + } + + int v = get_vectorized_loop_index_from_pure_stage(root); + + double stride = 0; + for (std::size_t i = 0; i < storage_dims.size(); i++) { + auto jac_stride = jac(storage_dims[i], v); + + float s = (float)jac_stride.numerator / (float)jac_stride.denominator; + stride += s * storage_strides[i]; + } + + return std::abs(stride); +} + +// Shared mem accesses with stride 1 will 
likely be vectorized +bool LoopNest::can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, const FunctionDAG::Node *accessed, int innermost_dim, int loop_index) const { + for (int i = 0; i < accessed->dimensions; i++) { + auto stride = jac(i, loop_index); + if (i == innermost_dim) { + if (!(stride == 1)) { + return false; + } + } else if (!(stride == 0)) { + return false; + } + } + + return true; +} + +bool LoopNest::can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType& mem_type) const { + if (loop_index < 0 || mem_type != GPUMemoryType::shared) { + return false; + } + + internal_assert(innermost_dim >= 0); + return can_vectorize_access_for_innermost_dim(jac, accessed, innermost_dim, loop_index); +} + +int LoopNest::vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType& mem_type, bool verbose) const { + int vector_size = 1; + if (mem_type != GPUMemoryType::shared) { + return vector_size; + } + + if (accessed_has_been_scheduled) { + // Loads can potentially be vectorized in any loop dimension, not just + // the vectorized_loop dimension. 
It's possible that some of the loop + // dimensions will be removed by LICM but those indices won't conflict with + // any potential vectorized indices because the Jacobian entry for them + // must be 0 in all storage dimensions, whereas for vectorization it + // must be 1 for the innermost_dim and 0 for all others + for (size_t loop_index = 0; loop_index < size.size(); ++loop_index) { + if (!can_vectorize_access_for_innermost_dim(jac, accessed, innermost_dim, loop_index)) { + continue; + } + + vector_size = std::max(vector_size, vectorized_access_size(loop_index, verbose)); + } + + if (verbose) { + aslog(2) << "vector_size = " << vector_size << "\n"; + } + + return vector_size; + } + + // If the producer has not been scheduled, try all of its dimensions as the + // innermost storage dim to see if any can be vectorized + for (int i = 0; i < accessed->dimensions; i++) { + for (size_t loop_index = 0; loop_index < size.size(); ++loop_index) { + if (!can_vectorize_access_for_innermost_dim(jac, accessed, i, loop_index)) { + continue; + } + + vector_size = std::max(vector_size, vectorized_access_size(loop_index, verbose)); + } + } + + if (verbose) { + aslog(2) << "vector_size = " << vector_size << "\n"; + } + return vector_size; +} + +int LoopNest::vectorized_access_size(size_t loop_index, bool verbose) const { + int64_t extent = size[loop_index]; + constexpr int max_vector_size_in_bytes = 16; + int64_t max_points_per_vector = std::min(4, max_vector_size_in_bytes / (int)node->bytes_per_point); + + if (verbose) { + aslog(2) << "\nextent = " << extent; + aslog(2) << "\nbytes_per_point = " << node->bytes_per_point; + aslog(2) << "\nmax_points_per_vector = " << max_points_per_vector; + } + + if (extent >= max_points_per_vector && extent % max_points_per_vector == 0) { + return max_points_per_vector; + } + + if (extent < max_points_per_vector && max_points_per_vector % extent == 0) { + return extent; + } + + return 1; +} +double LoopNest::compute_local_mem_stride(double 
stride, double bytes) const { + // Each word is 4 bytes so adjust the stride based + // on width of data being accessed + double word_stride = (bytes / 4); + int words_per_access = std::max(1.0, word_stride); + stride *= words_per_access; + + stride = std::min(8.0, std::max(1.0, stride)); + + return stride; +} + +// Get the stride over "node's" storage and its element-wise stride for a unit +// increment in the given thread loops +Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo& thread_info, bool verbose) const { + internal_assert(innermost_storage_dim >= 0); + + if (verbose) { + aslog(2) << "\nstrides: " << node->func.name() << " (stage = " << stage->index << ") loading from " << storage_node->func.name() << " ->\n"; + if (aslog::aslog_level() >= 2) { + jac.dump(""); + } + } + + // The node's storage dimensions (from innermost outward) + std::vector storage_dims; + storage_dims.push_back(innermost_storage_dim); + for (int i = 0; i < storage_node->dimensions; i++) { + if (i == storage_dims[0]) { + continue; + } + + storage_dims.push_back(i); + } + + std::vector storage_strides; + int64_t storage_stride = 1; + if (verbose) { + aslog(2) << "Storage stride: "; + } + for (std::size_t i = 0; i < storage_dims.size(); i++) { + storage_strides.push_back(storage_stride); + if (verbose) { + aslog(2) << storage_stride << " "; + } + storage_stride *= store_bounds->region_required(storage_dims[i]).extent(); + } + if (verbose) { + aslog(2) << "\n"; + } + + Strides strides{storage_strides}; + for (const auto& thread_loop_var : thread_info.loop_vars) { + int loop_index = stage->get_loop_index_from_var(thread_loop_var); + bool loop_index_exists = loop_index >= 0; + + std::vector index_strides; + bool exists = true; + for (std::size_t i = 0; i < storage_dims.size(); i++) { + if (verbose) { + aslog(2) << "loop_index for this stage = " << loop_index; + aslog(2) << 
"; loop_var = " << thread_loop_var; + aslog(2) << "; storage_dim = " << i; + } + + if (loop_index_exists) { + auto jac_stride = jac(storage_dims[i], loop_index); + if (!jac_stride.exists()) { + if (verbose) { + aslog(2) << "; stride does not exist\n"; + jac.dump(""); + } + exists = false; + break; + } + + float s = (float)jac_stride.numerator / (float)jac_stride.denominator; + index_strides.push_back(s); + } else { + index_strides.push_back(0); + } + + if (verbose) { + aslog(2) << "; index_stride = " << index_strides.back() << "\n"; + } + } + + if (exists) { + strides.add_valid(index_strides); + if (verbose) { + aslog(2) << "adding valid stride\n"; + } + } else { + strides.add_invalid(); + if (verbose) { + aslog(2) << "adding invalid stride\n"; + } + } + } + + if (verbose) { + aslog(2) << "<- strides\n\n"; + } + + return strides; +} + +bool LoopNest::all_strides_exist(const LoadJacobian &jac, const FunctionDAG::Node *storage_node, const LoopNest &root) const { + int v = get_vectorized_loop_index_from_pure_stage(root); + + for (int i = 0; i < storage_node->dimensions; i++) { + auto stride = jac(i, v); + + if (!stride.exists()) { + return false; + } + } + return true; +} + +int LoopNest::get_actual_vector_dim(const Bound &store_bounds) const { + if (store_bounds->region_computed(vector_dim).extent() > 1) { + return vector_dim; + } + + for (int i = 0; i < node->dimensions; ++i) { + if (store_bounds->region_computed(i).extent() > 1) { + return i; + } + } + + return vector_dim; +} + +void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo& global_mem_loads, SharedMemInfo& shared_mem_loads, LocalMemInfo& local_mem_loads, bool verbose) const { + if 
(consumer_site.is_stored_in_registers()) { + return; + } + + const ThreadInfo &thread_info = *gpu_loop_info.thread_info; + bool is_shared_mem = consumer_site.gpu_store_memory_type == GPUMemoryType::shared; + + size_t actual_vector_dim = get_actual_vector_dim(consumer_store_bounds); + + // If any of the store dimensions are constant over all the loop dimensions, + // then the value to be stored will likely be held in a register and stored + // once instead of on every iteration + double total_serial_loop_extents = gpu_loop_info.total_serial_extents(); + int vector_size = 1; + for (size_t loop_index = 0; loop_index < stage->loop.size(); ++loop_index) { + bool constant = true; + for (int i = 0; i < node->dimensions; ++i) { + if (!(jac(i, loop_index) == 0)) { + constant = false; + break; + } + } + + if (constant) { + total_serial_loop_extents /= parent->size[loop_index]; + } else if (can_vectorize_store_access(jac, node, true, actual_vector_dim, loop_index, consumer_site.gpu_store_memory_type)) { + vector_size = std::max(vector_size, parent->vectorized_access_size(loop_index)); + } + } + total_serial_loop_extents /= vector_size; + + if (verbose) { + std::string type = stage->index == 0 ? "store" : "load_and_store"; + std::string consumer_name = node->func.name(); + sanitize_names(consumer_name); + std::string mem_type = "global"; + if (consumer_site.gpu_store_memory_type == GPUMemoryType::shared) { + mem_type = "shared"; + } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::local) { + mem_type = "local"; + } + aslog(2) << "BEGIN MEM ACCESS " << mem_type << "_mem_" << type; + aslog(2) << ". 
consumer: " << consumer_name << "_s" << stage->index << "; producer: " << consumer_name << "\n"; + aslog(2) << "total_serial_loop_extents = " << total_serial_loop_extents << "\n"; + } + + if (is_shared_mem) { + if (verbose) { + aslog(2) << "vector_size = " << vector_size << "\n"; + } + auto store_jac = jac * inner_serial_loop_extents; + auto shared_mem_info = compute_mem_store_info( + store_jac, + consumer_innermost_dim, + node, + consumer_store_bounds, + thread_info, + total_serial_loop_extents, + verbose + ); + + feat.num_shared_mem_stores_per_block = shared_mem_info.num_transactions(); + if (stage->index > 0) { + shared_mem_loads.add(shared_mem_info); + } + feat.shared_mem_store_efficiency = shared_mem_info.efficiency(); + + internal_assert(in_range_zero_one(feat.shared_mem_store_efficiency)) << "Invalid shared mem store efficiency: " << feat.shared_mem_store_efficiency << " for " << node->func.name(); + + } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::global) { + if (verbose) { + aslog(2) << "vector_size = " << vector_size << "\n"; + } + auto store_jac = jac * inner_serial_loop_extents; + auto global_mem_info = compute_mem_store_info( + store_jac, + consumer_innermost_dim, + node, + consumer_store_bounds, + thread_info, + total_serial_loop_extents, + verbose + ); + + feat.num_global_mem_stores_per_block = global_mem_info.num_transactions(); + if (stage->index > 0) { + global_mem_loads.add(global_mem_info); + } + feat.global_mem_store_efficiency = global_mem_info.efficiency(); + + internal_assert(in_range_zero_one(feat.global_mem_store_efficiency)) << "Invalid global mem store efficiency: " << feat.global_mem_store_efficiency << " for " << node->func.name(); + + } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::local) { + auto local_mem_info = compute_mem_store_info( + jac, + consumer_innermost_dim, + node, + consumer_store_bounds, + thread_info, + total_serial_loop_extents, + verbose + ); + //feat.num_local_mem_stores_per_block 
= local_mem_info.num_transactions(); + if (stage->index > 0) { + local_mem_loads.add(local_mem_info); + } + //feat.local_mem_store_efficiency = local_mem_info.efficiency(); + + //internal_assert(in_range_zero_one(feat.local_mem_store_efficiency)) << "Invalid local mem store coalesce efficiency: " << feat.local_mem_store_efficiency << " for " << node->func.name(); + } + + if (verbose) { + aslog(2) << "num_blocks = " << gpu_loop_info.num_blocks << "\n"; + std::string type = stage->index == 0 ? "store" : "load_and_store"; + std::string consumer_name = node->func.name(); + sanitize_names(consumer_name); + std::string mem_type = "global"; + if (consumer_site.gpu_store_memory_type == GPUMemoryType::shared) { + mem_type = "shared"; + } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::local) { + mem_type = "local"; + } + aslog(2) << "END MEM ACCESS " << mem_type << "_mem_" << type << ". consumer: " << consumer_name << "_s" << stage->index << "; producer: " << consumer_name; + if (!jac.all_coeffs_exist()) { + aslog(2) << " (not all coeffs exist)"; + + } + aslog(2) << "\n\n"; + } +} + +template +void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const { + Strides strides = compute_strides(jac, innermost_dim, node, store_bounds, thread_info, verbose); + + size_t dimensions = thread_info.loop_indices.size(); + strides.dump(verbose); + + int bytes_per_access = node->bytes_per_point; + + { + int num_requests = thread_info.num_regular_active_warps_per_block * num_requests_per_warp; + Accumulator accumulator(bytes_per_access, dimensions, strides, verbose); + thread_info.for_each_thread_id_in_first_warp(accumulator); + + accumulator.add_access_info( + num_requests, + mem_info, + false + ); + + if (verbose) { + aslog(2) << "num_requests_per_warp = " << num_requests_per_warp 
<< "\n"; + aslog(2) << "num_regular_warps = " << thread_info.num_regular_active_warps_per_block << "\n"; + } + } + + if (!thread_info.has_tail_warp) { + return; + } + + if (verbose) { + aslog(2) << "\nBEGIN tail warp\n"; + aslog(2) << "# threads in tail warp: " << thread_info.num_threads_in_final_warp << "\n"; + } + + Accumulator accumulator(bytes_per_access, dimensions, strides, verbose); + thread_info.for_each_thread_id_in_tail_warp(accumulator); + + accumulator.add_access_info( + num_requests_per_warp, + mem_info, + true + ); + + if (verbose) { + aslog(2) << "END tail warp\n\n"; + } +} + +template +void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const; + +template +void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const; + +template <> +void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const { + int bytes_per_access = node->bytes_per_point; + + { + int num_requests = thread_info.num_regular_active_warps_per_block * num_requests_per_warp; + LocalAccessAccumulator accumulator(bytes_per_access, verbose); + thread_info.for_each_thread_id_in_first_warp(accumulator); + + accumulator.add_access_info( + num_requests, + mem_info, + false + ); + + if (verbose) { + aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n"; + aslog(2) << "num_regular_warps = " << thread_info.num_regular_active_warps_per_block << "\n"; + } + } + + if (!thread_info.has_tail_warp) { + 
return; + } + + if (verbose) { + aslog(2) << "\nBEGIN tail warp\n"; + aslog(2) << "# threads in tail warp: " << thread_info.num_threads_in_final_warp << "\n"; + } + + LocalAccessAccumulator accumulator(bytes_per_access, verbose); + thread_info.for_each_thread_id_in_tail_warp(accumulator); + + accumulator.add_access_info( + num_requests_per_warp, + mem_info, + true + ); + + if (verbose) { + aslog(2) << "END tail warp\n\n"; + } +} + +std::pair LoopNest::compute_local_mem_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const LoopNest &root, double serial_loop_extents) const { + // Assume worst case serialized loads if the stride is unknown + if (!all_strides_exist(jac, node, root)) { + double stride = compute_local_mem_stride(32.0, node->bytes_per_point); + double accesses = jac.count() * std::ceil((stride * serial_loop_extents) / 8.0); + return {accesses, 1.0 / stride}; + } + + double stride = storage_stride(jac, consumer_innermost_dim, node, consumer_store_bounds, root); + stride = compute_local_mem_stride(stride, node->bytes_per_point); + double accesses = jac.count() * std::ceil((stride * serial_loop_extents) / 8.0); + return {accesses, 1.0 / stride}; +} + +template +MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian& jac, int consumer_innermost_dim, const FunctionDAG::Node* node, const Bound& consumer_store_bounds, const ThreadInfo& thread_info, double serial_loop_extents, bool verbose) const { + MemInfoType mem_info; + + compute_num_mem_accesses_per_block(jac, node, consumer_store_bounds, thread_info, consumer_innermost_dim, serial_loop_extents, mem_info, verbose); + return mem_info; +} + +template +MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian& jac, int consumer_innermost_dim, const FunctionDAG::Node* node, const Bound& consumer_store_bounds, const ThreadInfo& thread_info, double serial_loop_extents, bool verbose) const; + +template +MemInfoType 
LoopNest::compute_mem_store_info(const LoadJacobian& jac, int consumer_innermost_dim, const FunctionDAG::Node* node, const Bound& consumer_store_bounds, const ThreadInfo& thread_info, double serial_loop_extents, bool verbose) const; + +template +void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const { + if (producer_has_been_scheduled) { + compute_num_mem_accesses_per_block(jac, node, producer_store_bounds, thread_info, producer_innermost_dim, points_accessed_per_thread, mem_info, verbose); + + return; + } + + // Assume best case if producer has not been scheduled: try all the + // possible innermost dimensions and take the best + int min_required_accesses = 0; + MemInfoType min_info; + + for (int i = 0; i < node->dimensions; i++) { + MemInfoType info; + compute_num_mem_accesses_per_block(jac, node, producer_store_bounds, thread_info, i, points_accessed_per_thread, info, verbose); + if (i == 0 || info.num_transactions() < min_required_accesses) { + min_info = info; + min_required_accesses = info.num_transactions(); + } + } + + mem_info.add(min_info); +} + +template +void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const; + +template +void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const; + +template <> +void 
LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const { + compute_num_mem_accesses_per_block(jac, node, producer_store_bounds, thread_info, producer_innermost_dim, points_accessed_per_thread, mem_info, verbose); +} + +// Assumes block, serial, thread or block, thread nesting +const LoopNest *LoopNest::get_enclosing_block(const LoopNest *parent, const LoopNest *grandparent) const { + internal_assert(gpu_label == thread); + + if (parent->gpu_label == block && grandparent->is_root()) { + return parent; + } + + if (parent->gpu_label == serial && grandparent->gpu_label == block) { + return grandparent; + } + + internal_error << "Invalid nesting: " << parent->gpu_label << ", " << grandparent->gpu_label << "\n"; + return nullptr; +} + +std::pair LoopNest::get_block_and_serial_extents(const LoopNest *block) const { + constexpr int max_blocks[3] = {2147483647, 65535, 65535}; + int block_extents[3] = {1, 1, 1}; + + std::vector lowered_size; + lowered_dims(block->size, block->vectorized_loop_index, lowered_size); + + int64_t total_block_extents = 1; + + size_t i = 0; + size_t block_i = 0; + for (size_t N = lowered_size.size(); i < N && block_i < 3; ++i) { + if (lowered_size[i] * block_extents[block_i] > max_blocks[block_i]) { + ++block_i; + continue; + } + + block_extents[block_i] *= lowered_size[i]; + total_block_extents *= lowered_size[i]; + } + + int64_t serial_extents = 1; + for (; i < lowered_size.size(); ++i) { + serial_extents *= lowered_size[i]; + } + + internal_assert(serial_extents == 1); + return {total_block_extents, serial_extents}; +} + +bool LoopNest::all_paths_to_leaves_have_thread_loop() const { + if (gpu_label == thread) { + return true; + } + + if (children.size() == 0) { + return false; + } + + for (const auto &c : 
children) { + if (!c->all_paths_to_leaves_have_thread_loop()) { + return false; + } + } + + return true; +} + +bool LoopNest::has_thread_loop_descendant() const { + if (gpu_label == thread) { + return true; + } + + for (const auto &c : children) { + if (c->has_thread_loop_descendant()) { + return true; + } + } + + return false; +} + +void LoopNest::compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const { + const ThreadInfo *thread_info = gpu_loop_info.thread_info; + features.warp_lane_utilization = thread_info->warp_lane_utilization(); + features.num_active_warps_per_block = thread_info->num_active_warps_per_block; + features.idle_lane_wastage = thread_info->idle_lane_wastage(); + features.num_warps_per_block = thread_info->num_warps_per_block; + features.num_blocks = gpu_loop_info.num_blocks; + features.block_occupancy = thread_info->block_occupancy(); + features.num_threads_per_block = thread_info->num_threads; + + internal_assert(in_range_zero_one(features.block_occupancy)) << "Invalid block occupancy: " << features.block_occupancy; + internal_assert(in_range_zero_one(features.warp_lane_utilization)) << "Invalid warp utilization: " << features.warp_lane_utilization; +} + +// Assume that when a block is active, all its warps are active +void LoopNest::compute_warp_and_block_occupancy(const MachineParams& params, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const { + // Only compute these features for stage's that actually have a block + // loop + if (node != gpu_loop_info.current_block_loop->node) { + return; + } + + auto active_block_hardware_limit = get_active_block_hardware_limit(); + auto active_warp_hardware_limit = get_active_warp_hardware_limit(); + + int64_t num_warps_per_block = gpu_loop_info.thread_info->num_warps_per_block; + + int64_t num_blocks = std::ceil(gpu_loop_info.num_blocks / (double)params.parallelism); + + auto max_theoretical_active_blocks = std::min(active_block_hardware_limit, num_blocks); + 
auto max_active_warps = std::min(active_warp_hardware_limit, max_theoretical_active_blocks * num_warps_per_block); + + auto max_active_blocks = max_active_warps / num_warps_per_block; + + feat.max_warp_occupancy = (double)max_active_warps / (double)active_warp_hardware_limit; + feat.max_block_occupancy = (double)max_active_blocks / (double)active_block_hardware_limit; +} + +void LoopNest::compute_shared_mem_occupancy(const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const { + if (!is_gpu_block(target)) { + return; + } + + static auto shared_mem_limit = get_shared_memory_limit(); + static auto shared_mem_sm_limit = get_shared_memory_sm_limit(); + static auto active_block_hardware_limit = get_active_block_hardware_limit(); + + feat.shared_mem_occupancy = (double)total_shared_mem_alloc_size / (double)shared_mem_limit; + internal_assert(feat.shared_mem_occupancy <= 1) << "Invalid shared mem occupancy: " << feat.shared_mem_occupancy; + + if (total_shared_mem_alloc_size > 0) { + auto shared_mem_max_active_blocks = std::min(active_block_hardware_limit, shared_mem_sm_limit / total_shared_mem_alloc_size); + feat.shared_mem_block_limit_factor = (double)shared_mem_max_active_blocks / (double)active_block_hardware_limit; + + internal_assert(feat.shared_mem_block_limit_factor <= 1) << "Invalid shared mem block limit factor: " << feat.shared_mem_block_limit_factor; + } +} + +std::pair LoopNest::find_innermost_and_parent() const { + internal_assert(!innermost); + + const LoopNest *parent = this; + const LoopNest *child = nullptr; + + while (true) { + for (const auto &c : parent->children) { + if (c->node != node) { + continue; + } + + child = c.get(); + } + + internal_assert(child); + + if (child->innermost) { + break; + } + + parent = child; + } + + return {child, parent}; +} + +int64_t LoopNest::points_accessed_per_thread(const MachineParams& params, const Target& target, const GPULoopInfo &gpu_loop_info, const std::vector& edge_chain, const 
LoadJacobian& jac, const LoopNest* parent, const LoopNest* grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian& serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType& mem_type, bool verbose) const { + + std::unique_ptr innermost_parent_clone = std::make_unique(); + innermost_parent_clone->copy_from(*parent); + int64_t unrolled_loop_extent = feat.unrolled_loop_extent; + vector tiling(node->dimensions, 1); + vector rvars_to_move_inward(parent->size.size(), 0); + + // There are 3 cases to consider when computing the number of unique points + // accessed: + // 1. If LICM can be applied, then accessed points can be reused across + // the loop's iterations so its extents are not counted + // 2. If LICM cannot be applied to a loop but it is unrolled, then accessed + // points can potentially be reused across the unrolled block and the number + // of unique points accessed is equal to the region_required + // 3. If LICM cannot be applied to a loop and it is not unrolled, then + // points accessed cannot be reused across iterations and the number of + // unique points accessed in 2. 
is multiplied by the loop's extents + + int64_t product_of_non_licm_non_unrolled_extents = 1; + int64_t product_of_non_licm_extents = 1; + int num_pure_loops = 0; + const FunctionDAG::Node* producer = edge_chain.back()->producer; + for (size_t idx = 0; idx < parent->size.size(); idx++) { + bool can_apply_licm = true; + for (int i = 0; i < producer->dimensions; i++) { + if (!(jac(i, idx) == 0)) { + can_apply_licm = false; + break; + } + } + + bool pure = stage->loop[idx].pure; + bool pure_and_unrolled = pure && unrolled_loop_extent > 1; + + if (pure) { + ++num_pure_loops; + } + + if (!can_apply_licm) { + product_of_non_licm_extents *= parent->size[idx]; + if (pure_and_unrolled) { + // Case 2 + if (stage->loop[idx].pure_dim >= 0) { + tiling[stage->loop[idx].pure_dim] = parent->size[idx]; + } else { + rvars_to_move_inward[idx] = 1; + } + if (verbose) { + aslog(2) << "loop idx = " << idx << ": non_licm_unrolled = " << parent->size[idx] << "\n"; + } + } else { + // Case 3 + product_of_non_licm_non_unrolled_extents *= parent->size[idx]; + if (verbose) { + aslog(2) << "loop idx = " << idx << ": non_licm_non_unrolled = " << parent->size[idx] << "\n"; + } + } + } else if (verbose) { + // Case 1 + aslog(2) << "loop idx = " << idx << ": apply licm = " << parent->size[idx] << "\n"; + } + } + + IntrusivePtr innermost_parent = innermost_parent_clone->parallelize_in_tiles(params, tiling, grandparent, target, true, false, false, rvars_to_move_inward); + + const auto& bounds = innermost_parent->get_bounds_along_edge_chain(producer, edge_chain); + int64_t num_points = 1; + for (int i = 0; i < producer->dimensions; i++) { + num_points *= bounds->region_required(i).extent(); + + // If the min is >= 100000, there's a good chance that the bounds are + // uninitialized, indicating a bug + internal_assert(std::abs(bounds->region_required(i).min()) < 100000) + << "region_required min = " << std::abs(bounds->region_required(i).min()) + << "; region_required max = " << 
std::abs(bounds->region_required(i).max()); + if (verbose) { + aslog(2) << "region_required(" << i << ") = " << bounds->region_required(i).extent() << "; "; + } + } + + // There are 2 ways to calculate the number of points accessed: + // 1. The region_required of the producer in the non-LICM unrolled loops * the loop extents of the non-LICM loops that cannot be unrolled + int64_t points_accessed_by_region_required = num_points * product_of_non_licm_non_unrolled_extents; + + // 2. The number of points computed according to 'n' (the number of + // entries in the LoadJacobian i.e. the number of loads, ignoring any reuse + // of points) * the loops extents of all the non-LICM loops. This value is + // an upper bound + int64_t points_accessed_by_loop_extents = n * product_of_non_licm_extents; + + // In some cases, the region_required is larger than the actual number of + // points that need to be loaded e.g. if f(x) = g(x) + g(x + 100), the + // region_required of g will be the range [x, x + 100] but really only 2 + // points need to be loaded. In cases like this, option 1. will + // over-estimate and we instead use the upper bound from option 2. 
+ int64_t points_accessed = points_accessed_by_region_required; + if (points_accessed_by_loop_extents <= points_accessed_by_region_required) { + points_accessed = points_accessed_by_loop_extents; + + if (mem_type == GPUMemoryType::shared) { + int vector_size = parent->vectorized_load_access_size( + serial_jac, + producer, + producer_has_been_scheduled, + producer_innermost_dim, + mem_type, + verbose + ); + + if (verbose) { + aslog(2) << "\n"; + aslog(2) << "vector_size = " << vector_size << "\n"; + } + + if (points_accessed % vector_size == 0) { + points_accessed /= vector_size; + if (verbose) { + aslog(2) << "vectorization applied\n"; + } + } + } + } + + points_accessed *= gpu_loop_info.total_outer_serial_extents; + + int64_t total_inner_serial_extents_outside_realization = gpu_loop_info.get_total_inner_serial_extents_outside_realization(this); + + // If you have a realization inside a serial loop e.g. + // f 80 gpu_block + // f 32 gpu_thread + // f 8 gpu_serial + // realize: g + // g 1 gpu_serial + // g 1 gpu_simd + // f 1 gpu_simd + // LICM won't be able to hoist g's loads/stores above its realization level + // so 'f 8' will contribute a factor of 8 to the total + points_accessed *= total_inner_serial_extents_outside_realization; + + if (verbose) { + aslog(2) << "\n"; + aslog(2) << "region_required = " << num_points << "\n"; + aslog(2) << "total_inner_serial_extents = " << gpu_loop_info.total_inner_serial_extents << "\n"; + aslog(2) << "total_outer_serial_extents = " << gpu_loop_info.total_outer_serial_extents << "\n"; + aslog(2) << "total_inner_serial_extents_outside_realization = " << total_inner_serial_extents_outside_realization << "\n"; + aslog(2) << "product_of_non_licm_non_unrolled_extents = " << product_of_non_licm_non_unrolled_extents << "\n"; + aslog(2) << "n = " << n << "\n"; + aslog(2) << "points_accessed_by_region_required = " << points_accessed_by_region_required << "\n"; + aslog(2) << "points_accessed_by_loop_extents = " << 
points_accessed_by_loop_extents << "\n"; + aslog(2) << "final points_accessed_per_thread = " << points_accessed << "\n"; + } + + return points_accessed; +} + +int64_t LoopNest::compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const { + // Is this load loop-invariant over an + // unrolled block? If so, we amortize the + // number of loads to account for LICM. + int64_t amortization = 1; + if (feat.unrolled_loop_extent <= 1) { + return amortization; + } + + for (size_t idx = 0; idx < innermost->stage->loop.size(); idx++) { + if (!innermost->stage->loop[idx].rvar) { + bool loop_invariant = true; + for (int i = 0; i < producer_dims; i++) { + if (!(jac(i, idx) == 0)) { + loop_invariant = false; + break; + } + } + if (loop_invariant) { + amortization *= parent->size[idx]; + } + } + } + + // TODO: LICM still acts for the innermost loop of non-unrolled things + + return amortization; +} + +void LoopNest::memoize_points_computed_minimum(StageMap& memoized_features, const StageMap *features) const { + for (auto it = inlined.begin(); it != inlined.end(); it++) { + const auto *f = it.key(); + const auto &inlined_feat = features->get(&(f->stages[0])); + memoized_features.get(&(f->stages[0])).points_computed_minimum = inlined_feat.points_computed_minimum; + } + + memoized_features.get(stage).points_computed_minimum = features->get(stage).points_computed_minimum; + + for (const auto &c : children) { + c->memoize_points_computed_minimum(memoized_features, features); + } +} + +vector> LoopNest::collect_producers(const StageMap &sites) const { + set stages; + collect_stages(stages); + + vector pending; + + for (const auto *stage : stages) { + for (const auto *e : stage->incoming_edges) { + pending.push_back(e); + } + } + + set done; + vector> producers; + + // Collect all producers of the funcs within this LoopNest + while (!pending.empty()) { + const auto *e = pending.back(); + 
pending.pop_back(); + if (done.count(e->producer)) continue; + done.insert(e->producer); + const auto &site = sites.get(&(e->producer->stages[0])); + if (site.store->is_root()) { + int vector_dim = (e->producer->is_input ? 0 : + site.produce != nullptr ? site.produce->vector_dim : + -1); + producers.push_back({e->producer->id, vector_dim}); + } else if (site.produce != nullptr) { + // Computation must be nested inside this task or inlined into it. + for (const auto &s : e->producer->stages) { + for (const auto *e2 : s.incoming_edges) { + pending.push_back(e2); + } + } + } + } + + return producers; +} + +uint64_t LoopNest::compute_hash_of_producers_stored_at_root(const StageMap &sites) const { + vector> producers = collect_producers(sites); + + // Sort them according to node id + std::sort(producers.begin(), producers.end(), [](const pair& a, const pair& b) { + return a.first < b.first; + }); + + uint64_t store_root_hash = 0; + for (const auto& p : producers) { + hash_combine(store_root_hash, p.first); + hash_combine(store_root_hash, p.second); + } + + return store_root_hash; +} + +void LoopNest::collect_stages(std::set& stages) const { + stages.insert(stage); + + for (const auto &c : children) { + c->collect_stages(stages); + } +} + +void LoopNest::memoize_features(StageMap& memoized_features, const StageMap *features) const { + for (auto it = inlined.begin(); it != inlined.end(); it++) { + const auto *f = it.key(); + if (memoized_features.contains(&(f->stages[0]))) { + continue; + } + + const auto &inlined_feat = features->get(&(f->stages[0])); + memoized_features.insert(&(f->stages[0]), inlined_feat); + } + + if (!memoized_features.contains(stage)) { + memoized_features.insert(stage, features->get(stage)); + } + + for (const auto &c : children) { + c->memoize_features(memoized_features, features); + } +} + +void LoopNest::compute_working_set_from_features(int64_t *working_set, + const StageMap *features) const { + int64_t working_set_here = 0; + + for (const auto 
&c : children) { + c->compute_working_set_from_features(&working_set_here, features); + } + + for (const auto *node : store_at) { + auto &feat = features->get(&(node->stages[0])); + working_set_here += feat.bytes_at_production; + } + + *working_set += working_set_here; +} + +void LoopNest::recompute_inlined_features(const StageMap &sites, StageMap *features) const { + for (const auto &c : children) { + c->recompute_inlined_features(sites, features); + } + + for (auto it = inlined.begin(); it != inlined.end(); it++) { + const auto *f = it.key(); + internal_assert(f); + + const auto &block = sites.get(stage).task; + + internal_assert(sites.contains(block->stage)); + uint64_t hash_of_producers = sites.get(block->stage).hash_of_producers_stored_at_root; + + internal_assert(block->feature_intermediates.count(hash_of_producers) > 0); + auto& intermediate_map = block->feature_intermediates[hash_of_producers].get(&(f->stages[0])); + auto& intermediate = intermediate_map.get(stage); + + auto &inlined_feat = features->get(&(f->stages[0])); + inlined_feat.inlined_calls += intermediate.inlined_calls; + inlined_feat.num_scalars += intermediate.num_scalars; + if (inlined_feat.innermost_pure_loop_extent > 0) { + inlined_feat.innermost_pure_loop_extent = + std::min(inlined_feat.innermost_pure_loop_extent, + intermediate.innermost_pure_loop_extent); + } else { + inlined_feat.innermost_pure_loop_extent = intermediate.innermost_pure_loop_extent; + } + inlined_feat.outer_parallelism = intermediate.outer_parallelism; + inlined_feat.num_blocks = intermediate.outer_parallelism; + inlined_feat.num_warps_per_block += intermediate.num_warps_per_block; + + inlined_feat.num_threads_per_block += intermediate.num_threads_per_block; + inlined_feat.points_computed_per_thread += intermediate.points_computed_per_thread; + } +} + +std::pair LoopNest::compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const { + const auto &bounds = get_bounds(f); + + auto bytes = f->bytes_per_point; + bool 
is_constant = true; + for (int i = 0; i < f->dimensions; i++) { + const auto &p = bounds->region_computed(i); + bytes *= p.extent(); + is_constant = is_constant && p.constant_extent(); + } + + return {bytes, is_constant}; +} + +// Do a recursive walk over the loop nest computing features to feed the cost model. +void LoopNest::compute_features(const FunctionDAG &dag, + const MachineParams ¶ms, + const Target &target, + const StageMap &sites, + int64_t instances, + int64_t parallelism, + const LoopNest *parent, + const LoopNest *grandparent, + const LoopNest &root, + int64_t *working_set, + int64_t *working_set_local_constant, + int64_t *working_set_local_dynamic, + StageMap *features, + GPULoopInfo gpu_loop_info, + bool use_memoized_features, + const StageMap &total_shared_mem_alloc_sizes, + Statistics& stats, + bool verbose) const { + + gpu_loop_info.update(target, this); + std::unique_ptr thread_info; + + if (is_gpu_thread(target)) { + thread_info = gpu_loop_info.create_thread_info(); + } + + int64_t working_set_here = 0; + int64_t working_set_here_local_constant = 0; + int64_t working_set_here_local_dynamic = 0; + + int64_t loop_instances = 1, parallel_tasks = 1; + bool in_impure = false; + for (int idx = (int)size.size() - 1; idx >= 0; idx--) { + size_t i = size[idx]; + loop_instances *= i; + if (stage->loop[idx].pure && !in_impure) { + if (params.parallelism > 1 && + (parallel || (parent->is_root() && parallel_tasks < params.parallelism))) { + // Either we've picked our parallel tiling, or + // it's not yet determined. Assume we'll not split + // any loops and just stop after we hit the + // required number of cores + parallel_tasks *= i; + // If we haven't picked out parallel tiling yet, + // assume that we'll target 8*cores when we do, + // which is a common rule of thumb. 
+ if (!parallel && parallel_tasks > params.parallelism * 8) { + // We would split this loop + parallel_tasks = params.parallelism * 8; + } + } + } else if (i != 1) { + in_impure = true; + } + } + + int64_t subinstances = instances * loop_instances; + + for (const auto *node : store_at) { + // Figure out the features at the store_at level + const auto &bounds = get_bounds(node); + + for (size_t s = 0; s < node->stages.size(); s++) { + // TODO: Lift invariants from this loop. Most of it's the same for every stage. + internal_assert(!node->is_input); + ScheduleFeatures &feat = features->get_or_create(&(node->stages[s])); + + feat.num_realizations = subinstances; + + feat.points_computed_per_realization = 1; + feat.num_scalars = subinstances; + for (int i = 0; i < (int)node->stages[s].loop.size(); i++) { + const auto &p = bounds->loops(s, i); + int64_t extent = p.extent(); + feat.points_computed_per_realization *= extent; + if (i == sites.get(&(node->stages[s])).produce->vectorized_loop_index) { + // Assumes that we're not going to split + // things such that non-native-width + // vectorization is a problem, except for the + // tail. 
+ feat.num_scalars *= extent % node->stages[s].vector_size; + } else { + feat.num_scalars *= extent; + } + } + feat.points_computed_total = feat.points_computed_per_realization * feat.num_realizations; + + feat.bytes_at_realization = node->bytes_per_point; + for (int i = 0; i < node->dimensions; i++) { + const auto &p = bounds->region_computed(i); + feat.bytes_at_realization *= p.extent(); + } + int64_t innermost_storage_extent = 1; + int v = sites.get(&(node->stages[s])).produce->vector_dim; + if (v >= 0 && node->dimensions > 0) { + innermost_storage_extent = bounds->region_computed(v).extent(); + } + feat.innermost_bytes_at_realization = node->bytes_per_point * innermost_storage_extent; + + if (!is_root()) { + auto site = sites.get(&(node->stages[0])); + if (site.is_stored_in_global_mem()) { + feat.global_bytes_at_task = feat.bytes_at_realization; + feat.global_innermost_bytes_at_task = feat.innermost_bytes_at_realization; + } else if (site.is_stored_in_shared_mem()) { + feat.shared_bytes_at_task = feat.bytes_at_realization; + feat.shared_innermost_bytes_at_task = feat.innermost_bytes_at_realization; + } else if (site.is_stored_in_local_mem()) { + //feat.local_bytes_at_task = feat.bytes_at_realization; + //feat.local_innermost_bytes_at_task = feat.innermost_bytes_at_realization; + } else if (site.is_stored_in_registers()) { + feat.register_bytes_at_task = feat.bytes_at_realization; + feat.register_innermost_bytes_at_task = feat.innermost_bytes_at_realization; + } else { + internal_assert(false); + } + } + } + } + + if (is_root()) { + // TODO: This block of code is repeated below. 
Refactor + for (const auto &c : children) { + uint64_t hash_of_producers = sites.get(c->stage).hash_of_producers_stored_at_root; + if (use_memoized_features) { + if (c->features.count(hash_of_producers) > 0) { + ++stats.num_memoization_hits; + + const auto& entry = c->features.at(hash_of_producers); + for (auto it = entry.begin(); it != entry.end(); it++) { + auto &stage = *(it.key()); + const auto &feat = it.value(); + + features->insert(&stage, feat); + } + + // 'working_set_here' is required below for computing the + // root-level features so we compute the value that it + // would have had if the current loop nest had not been + // memoized + int64_t working_set_c{0}; + c->compute_working_set_from_features(&working_set_c, features); + working_set_here += working_set_c; + continue; + } + + ++stats.num_memoization_misses; + } + + c->compute_features(dag, params, target, sites, subinstances, parallelism, this, parent, root, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, stats, verbose); + + if (use_memoized_features) { + c->features[hash_of_producers].make_large(dag.nodes[0].stages[0].max_id); + c->memoize_features(c->features[hash_of_producers], features); + } + } + + for (const auto *node : store_at) { + auto &feat = features->get(&(node->stages[0])); + working_set_here += feat.bytes_at_production; + } + for (const auto *node : store_at) { + for (const auto &s : node->stages) { + auto &feat = features->get(&s); + feat.working_set_at_realization = working_set_here; + } + } + for (const auto &c : children) { + if (c->node != node) { + auto &feat = features->get(c->stage); + feat.working_set_at_production = working_set_here; + } + } + + // Figure out the root-level features for every Func + for (auto it = features->begin(); it != features->end(); it++) { + const auto *stage = it.key(); + const auto *node = stage->node; + auto &feat = it.value(); + 
const auto &root_bounds = root.get_bounds(node); + + feat.bytes_at_root = node->bytes_per_point; + for (int i = 0; i < node->dimensions; i++) { + const auto &p = root_bounds->region_computed(i); + feat.bytes_at_root *= p.extent(); + } + + feat.working_set_at_root = working_set_here; + + auto *p = sites.get(stage).produce; + if (p) { + // Extent of the innermost dimension in the storage layout + int64_t innermost_storage_extent = 1; + int v = p->vector_dim; + if (v >= 0 && v < node->dimensions) { + innermost_storage_extent = root_bounds->region_computed(v).extent(); + } + feat.innermost_bytes_at_root = node->bytes_per_point * innermost_storage_extent; + } else { + feat.innermost_bytes_at_root = 0; + } + + feat.points_computed_minimum = 1; + for (int i = 0; i < (int)stage->loop.size(); i++) { + const auto &p = root_bounds->loops(stage->index, i); + feat.points_computed_minimum *= p.extent(); + } + + if (node->stages.size() == 1 && !node->is_output) { + int64_t points_computed_minimum_if_inlined = 0; + for (auto *e : node->outgoing_edges) { + points_computed_minimum_if_inlined += features->get(e->consumer).points_computed_minimum * e->calls; + } + feat.points_computed_minimum = std::min(feat.points_computed_minimum, (double)points_computed_minimum_if_inlined); + } + + // When memoizing, we need to recompute features for inlined Funcs + // so we reset them here + if (use_memoized_features && sites.get(stage).inlined) { + feat.inlined_calls = 0; + feat.num_scalars = 0; + feat.innermost_pure_loop_extent = 0; + feat.outer_parallelism = 0; + feat.num_warps_per_block = 0; + feat.num_threads_per_block = 0; + feat.points_computed_per_thread = 0; + } + } + + if (use_memoized_features) { + for (const auto &c : children) { + uint64_t hash_of_producers = sites.get(c->stage).hash_of_producers_stored_at_root; + + // When computing feat.points_computed_minimum above, the order + // of nodes considered is possibly different from the loop nest + // traversal order so 
'features->get(e->consumer).points_computed_minimum' + // may not have been computed when it is accessed as a memoized + // feature. We memoize 'points_computed_minimum' here to ensure + // its value is always available + if (c->features.count(hash_of_producers) > 0) { + c->memoize_points_computed_minimum(c->features[hash_of_producers], features); + } + } + + recompute_inlined_features(sites, features); + } + + return; + } + + int64_t subparallelism = parallel_tasks * parallelism; + + // Figure out the features at the compute_at level + internal_assert(!stage->node->is_input); + ScheduleFeatures &feat = features->get_or_create(stage); + + if (innermost) { + } else { + // We want these features just outside the innermost loop, + // so just set them at every level and let them get + // progressively overwritten as we descend the loop nest + // tree. + size_t idx = 0; + feat.innermost_loop_extent = 1; + feat.innermost_pure_loop_extent = 1; + for (const auto &l : stage->loop) { + feat.innermost_loop_extent *= size[idx]; + if (!l.rvar) { + feat.innermost_pure_loop_extent *= size[idx]; + } + idx++; + } + } + + const bool at_task = parent->is_root(); + const bool at_production = parent->node != node; + const bool at_pure_production = at_production && stage->index == 0; + + if (at_task) { + double bytes_at_task = 0; + double innermost_bytes_at_task = 0; + if (parallel) { + const auto &bounds = get_bounds(node); + bytes_at_task = node->bytes_per_point; + int64_t innermost_storage_extent = 1; + for (int i = 0; i < node->dimensions; i++) { + int64_t outer = 1; + for (size_t l = 0; l < stage->loop.size(); l++) { + if (stage->loop[l].var == node->func.args()[i]) { + outer = size[l]; + break; + } + } + const auto &p = bounds->region_computed(i); + int64_t extent = p.extent(); + extent /= outer; + bytes_at_task *= extent; + if (i == vector_dim) { + innermost_storage_extent = extent; + } + } + innermost_bytes_at_task = node->bytes_per_point * innermost_storage_extent; + } else { + 
// How this loop will be parallelized is not yet + // determined. Use optimistic values for the features. + bytes_at_task = (feat.bytes_at_realization + params.parallelism - 1) / params.parallelism; + innermost_bytes_at_task = std::min(bytes_at_task, feat.innermost_bytes_at_realization); + } + + const auto &site = sites.get(&node->stages[0]); + if (site.is_stored_in_global_mem()) { + feat.global_bytes_at_task = bytes_at_task; + feat.global_innermost_bytes_at_task = innermost_bytes_at_task; + } else if (site.is_stored_in_shared_mem()) { + feat.shared_bytes_at_task = bytes_at_task; + feat.shared_innermost_bytes_at_task = innermost_bytes_at_task; + } else if (site.is_stored_in_local_mem()) { + //feat.local_bytes_at_task = bytes_at_task; + //feat.local_innermost_bytes_at_task = innermost_bytes_at_task; + } else { + internal_assert(false); + } + + feat.unique_bytes_read_per_task = 0; + feat.unique_lines_read_per_task = 0; + + // We're at a parallel for loop. Check all the accesses + // done by Funcs inside this loop to values computed + // outside of it to figure out how much data we'll be + // streaming onto the core. + vector pending; + set done; + for (const auto *e : stage->incoming_edges) { + pending.push_back(e); + } + while (!pending.empty()) { + const auto *e = pending.back(); + pending.pop_back(); + if (done.count(e->producer)) continue; + done.insert(e->producer); + const auto &site = sites.get(&(e->producer->stages[0])); + if (site.store->is_root()) { + const auto &b = get_bounds(e->producer); + int64_t bytes = e->producer->bytes_per_point, lines = 1; + int64_t max_extent = 1; + // clang-format off + int vector_dim = (e->producer->is_input ? 0 : + site.produce != nullptr ? 
site.produce->vector_dim : + -1); + // clang-format on + for (int i = 0; i < e->producer->dimensions; i++) { + int64_t extent = b->region_required(i).extent(); + max_extent = std::max(extent, max_extent); + bytes *= extent; + if (i != vector_dim) { + lines *= extent; + } + } + if (!e->producer->is_input && site.produce == nullptr) { + // We haven't scheduled the producer so we + // don't know the memory layout yet. Assume + // the best case. + lines /= max_extent; + } + feat.unique_bytes_read_per_task += bytes; + feat.unique_lines_read_per_task += lines; + + } else if (site.produce != nullptr) { + // Computation must be nested inside this task or inlined into it. + for (const auto &s : e->producer->stages) { + for (const auto *e2 : s.incoming_edges) { + pending.push_back(e2); + } + } + } + } + } + + if (at_production) { + feat.num_productions = instances; + feat.inner_parallelism = parallel_tasks; + feat.outer_parallelism = parallelism; + + const auto &bounds = parent->get_bounds(node); + + feat.bytes_at_production = node->bytes_per_point; + for (int i = 0; i < node->dimensions; i++) { + const auto &p = bounds->region_computed(i); + feat.bytes_at_production *= p.extent(); + } + int64_t innermost_storage_extent = 1; + if (vector_dim >= 0 && node->dimensions > 0) { + innermost_storage_extent = bounds->region_computed(vector_dim).extent(); + } + feat.innermost_bytes_at_production = node->bytes_per_point * innermost_storage_extent; + } + + // Recurse inwards + for (const auto &c : children) { + c->compute_features(dag, params, target, sites, subinstances, subparallelism, this, parent, root, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, stats, verbose); + } + for (const auto *node : store_at) { + auto &feat = features->get(&(node->stages[0])); + working_set_here += feat.bytes_at_production; + } + for (const auto *node : store_at) { + for (const auto &s : 
node->stages) { + auto &feat = features->get(&s); + feat.working_set_at_realization = working_set_here; + } + } + for (const auto &c : children) { + if (c->node != node) { + auto &feat = features->get(c->stage); + feat.working_set_at_production = working_set_here; + } + } + + if (is_gpu_thread(target)) { + feat.working_set_at_thread = working_set_here; + } + + if (at_task) { + set_working_set_at_task_feature(working_set_here, features); + } + + if (at_production) { + feat.working_set = working_set_here; + } + + if (innermost) { + bool parent_unrolled = + (feat.innermost_pure_loop_extent <= get_unroll_limit(target) && + parent->node == node); + + if (parent_unrolled) { + parent_unrolled = all(unrolled_loops(target, parent, grandparent)); + } + + if (parent_unrolled) { + feat.unrolled_loop_extent = feat.innermost_pure_loop_extent; + } else { + feat.unrolled_loop_extent = 1; + } + + ExprBranching branching{inlined}; + feat.expr_branching = branching.compute(node->func); + } + + *working_set += working_set_here; + *working_set_local_constant += working_set_here_local_constant; + *working_set_local_dynamic += working_set_here_local_dynamic; + + // Analyze all memory dependencies of this stage, looking + // through any Funcs inlined into it. This is where we track + // things like vector gathers. 
+ int64_t global_bytes_loaded = 0, shared_bytes_loaded = 0, local_bytes_loaded = 0, register_bytes_loaded = 0; + int64_t global_lines_loaded = 0, shared_lines_loaded = 0, local_lines_loaded = 0, register_lines_loaded = 0; + int64_t global_bytes_loaded_per_thread = 0, shared_bytes_loaded_per_thread = 0, register_bytes_loaded_per_thread = 0; + int64_t global_lines_loaded_per_thread = 0, shared_lines_loaded_per_thread = 0, register_lines_loaded_per_thread = 0;; + int64_t global_allocation_bytes_loaded = 0, shared_allocation_bytes_loaded = 0; + GlobalMemInfo global_mem_loads; + SharedMemInfo shared_mem_loads; + LocalMemInfo local_mem_loads; + + if (innermost || at_production) { // These are the sites at which we compute load footprints + // Pick the site at which we will compute the footprint relationship + const auto &consumer_site = sites.get(stage); + + // The store_at location of the consumer + const auto *consumer_store_site = innermost ? parent : consumer_site.store; + + std::vector inner_serial_loop_extents; + + if (innermost && !stage->store_jacobian->empty()) { + const auto &bounds = consumer_site.store->get_bounds(stage->node); + inner_serial_loop_extents = gpu_loop_info.get_inner_serial_loop_extents(this); + auto store_jac = *stage->store_jacobian; + + compute_gpu_store_features( + store_jac, + vector_dim, + stage->node, + bounds, + gpu_loop_info, + inner_serial_loop_extents, + consumer_site, + feat, + parent, + root, + global_mem_loads, + shared_mem_loads, + local_mem_loads, + verbose + ); + } + + // The parallel loop of the consumer + const auto *consumer_task_site = consumer_site.task; + + int64_t consumer_instances = innermost ? 
instances : feat.num_realizations; + internal_assert(consumer_instances != 0); + + vector>> pending; + vector edge_chain; + pending.emplace_back(stage, edge_chain); + vector> jacobians; + vector> thread_jacobians; + set done; + + while (!pending.empty()) { + auto p_pair = pending.back(); + pending.pop_back(); + + auto p = p_pair.first; + + const auto &next_edges = p->incoming_edges; + for (const auto *e : next_edges) { + internal_assert(sites.contains(&(e->producer->stages[0]))) + << "No site found for " << e->producer->func.name() << "\n"; + + const auto &site = sites.get(&(e->producer->stages[0])); + + bool producer_has_been_scheduled = e->producer->is_input || (site.produce != nullptr); + + std::vector edge_chain = p_pair.second; + edge_chain.push_back(e); + + if (innermost) { + if (e->consumer == stage) { + for (auto &j : e->load_jacobians) { + jacobians.emplace_back(j, e->producer); + + // Thread loops may not be innermost so in the + // Jacobians we need to account for the stride + // of the inner loops + thread_jacobians.emplace_back(j * inner_serial_loop_extents, e->producer); + } + } else { + // Consumer was inlined. Multiply the Jacobians to look through it. + decltype(jacobians) new_jacobians; + for (auto &j1 : jacobians) { + if (e->consumer->node == j1.second) { + for (auto &j2 : e->load_jacobians) { + LoadJacobian j = j2 * j1.first; + new_jacobians.emplace_back(j, e->producer); + } + } + + new_jacobians.emplace_back(std::move(j1)); + } + jacobians.swap(new_jacobians); + + // Consumer was inlined. Concat the jacobians to look through it. 
+ decltype(jacobians) new_thread_jacobians; + for (auto &j1 : thread_jacobians) { + if (e->consumer->node == j1.second) { + for (auto &j2 : e->load_jacobians) { + LoadJacobian j = j2 * j1.first; + new_thread_jacobians.emplace_back(j, e->producer); + } + } + + new_thread_jacobians.emplace_back(std::move(j1)); + } + thread_jacobians.swap(new_thread_jacobians); + } + } + + if (site.inlined) { + // Recursively examine the inputs + pending.emplace_back(&(e->producer->stages[0]), edge_chain); + continue; + } + + // The producer's compute_at site + const auto *producer_compute_site = site.compute; + + // The producer's store_at site + const auto *producer_store_site = site.store; + + // The region required of the producer at various sites. + const auto &bounds = consumer_store_site->get_bounds(e->producer); + const auto &task_bounds = consumer_task_site->get_bounds(e->producer); + const auto &producer_compute_bounds = producer_compute_site->get_bounds(e->producer); + const auto &producer_store_bounds = producer_store_site->get_bounds(e->producer); + + // Compute memory footprints in terms of the + // number of contiguous lines, and the number of + // bytes. + int64_t footprint = e->producer->bytes_per_point; + int64_t thread_footprint = footprint; + int64_t compute_footprint = footprint; + int64_t store_footprint = footprint; + int64_t line_footprint = 1; + int64_t thread_line_footprint = 1; + int64_t compute_line_footprint = 1; + int64_t store_line_footprint = 1; + int64_t task_line_footprint = 1; + + if (e->producer->is_input) { + // This node represents an input. Its sites + // should be at the root level. + internal_assert(producer_store_site->is_root()); + internal_assert(producer_compute_site->is_root()); + } + + if (innermost) { + int producer_innermost_dim = + (e->producer->is_input ? 0 : // Assume default storage layout for inputs + !producer_has_been_scheduled ? -1 : + site.produce->vector_dim); + + // Shared, global, or local memory? 
+ bool is_global_mem = site.gpu_store_memory_type == GPUMemoryType::global; + bool is_shared_mem = site.gpu_store_memory_type == GPUMemoryType::shared; + + // Grab the jacobians that describe the memory dependence + for (size_t i = 0; i < thread_jacobians.size(); ++i) { + const auto &jac = thread_jacobians[i]; + const auto &serial_jac = jacobians[i]; + internal_assert(jac.second == serial_jac.second); + if (jac.second != e->producer) continue; + int64_t n = jac.first.count(); + + if (is_shared_mem) { + if (verbose) { + std::string consumer_name = node->func.name(); + sanitize_names(consumer_name); + std::string producer_name = e->producer->func.name(); + sanitize_names(producer_name); + aslog(2) << "BEGIN MEM ACCESS shared_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name <<"\n"; + } + + int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::shared, verbose); + + compute_mem_load_features( + jac.first, + producer_innermost_dim, + e->producer, + producer_store_bounds, + producer_has_been_scheduled, + *gpu_loop_info.thread_info, + shared_mem_loads, + points_accessed, + verbose + ); + if (verbose) { + aslog(2) << "num_blocks = " << gpu_loop_info.num_blocks << "\n"; + aslog(2) << "END MEM ACCESS shared_mem_load. consumer: " << node->func.name() << "; producer: " << e->producer->func.name(); + if (!jac.first.all_coeffs_exist()) { + aslog(0) << " (not all coeffs exist)"; + } + aslog(2) << "\n\n"; + } + + } else if (is_global_mem) { + + if (verbose) { + std::string consumer_name = node->func.name(); + sanitize_names(consumer_name); + std::string producer_name = e->producer->func.name(); + sanitize_names(producer_name); + aslog(2) << "BEGIN MEM ACCESS global_mem_load. 
consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name <<"\n"; + } + + int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::global, verbose); + + compute_mem_load_features( + jac.first, + producer_innermost_dim, + e->producer, + producer_store_bounds, + producer_has_been_scheduled, + *gpu_loop_info.thread_info, + global_mem_loads, + points_accessed, + verbose + ); + + if (verbose) { + aslog(2) << "num_blocks = " << gpu_loop_info.num_blocks << "\n"; + aslog(2) << "END MEM ACCESS global_mem_load. consumer: " << node->func.name() << "; producer: " << e->producer->func.name(); + if (!jac.first.all_coeffs_exist()) { + aslog(2) << " (not all coeffs exist)"; + } + aslog(2) << "\n\n"; + } + } + } + + if (site.gpu_store_memory_type == GPUMemoryType::local) { + internal_assert(false) << "Loop nest contains local_mem_load"; + for (const auto &jac : jacobians) { + if (jac.second != e->producer) continue; + int64_t n = jac.first.count(); + + if (verbose) { + std::string consumer_name = node->func.name(); + sanitize_names(consumer_name); + std::string producer_name = e->producer->func.name(); + sanitize_names(producer_name); + aslog(2) << "BEGIN MEM ACCESS local_mem_load. 
consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name <<"\n"; + } + + int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::local, verbose); + + compute_mem_load_features( + jac.first, + producer_innermost_dim, + e->producer, + producer_store_bounds, + producer_has_been_scheduled, + *gpu_loop_info.thread_info, + local_mem_loads, + points_accessed, + verbose + ); + + if (verbose) { + aslog(2) << "num_blocks = " << gpu_loop_info.num_blocks << "\n"; + aslog(2) << "END MEM ACCESS local_mem_load. consumer: " << node->func.name() << "; producer: " << e->producer->func.name(); + if (!jac.first.all_coeffs_exist()) { + aslog(2) << " (not all coeffs exist)"; + } + aslog(2) << "\n\n"; + } + } + } + } + + // Already dealt with the footprints for this producer via some other path + if (done.find(e->producer) != done.end()) { + continue; + } + + done.insert(e->producer); + + + // Now look at the shapes of the regions read from + // the producer at various sites. 
+ int64_t max_extent = 1, max_thread_extent = 1, max_compute_extent = 1, max_store_extent = 1, max_task_extent = 1; + for (int i = 0; i < e->producer->dimensions; i++) { + auto p = bounds->region_required(i); + auto compute_p = producer_compute_bounds->region_computed(i); + auto store_p = producer_store_bounds->region_required(i); + auto task_p = task_bounds->region_required(i); + + // Check some invariants + internal_assert(store_p.min() <= store_p.max()) << store_p.min() << " " << store_p.max() << "\n"; + internal_assert(compute_p.min() <= compute_p.max()) << compute_p.min() << " " << compute_p.max() << "\n"; + internal_assert(task_p.min() <= task_p.max()) << task_p.min() << " " << task_p.max() << "\n"; + + int64_t thread_extent = 1; + if (innermost) { + const auto &thread_bounds = gpu_loop_info.current_thread_loop->get_bounds(e->producer); + auto thread_p = thread_bounds->region_required(i); + thread_extent = thread_p.extent(); + } + + int64_t extent = p.extent(); + int64_t compute_extent = compute_p.extent(); + int64_t store_extent = store_p.extent(); + int64_t task_extent = task_p.extent(); + + max_extent = std::max(extent, max_extent); + max_thread_extent = std::max(thread_extent, max_thread_extent); + max_compute_extent = std::max(compute_extent, max_compute_extent); + max_store_extent = std::max(store_extent, max_store_extent); + max_task_extent = std::max(task_extent, max_task_extent); + + footprint *= extent; + thread_footprint *= thread_extent; + compute_footprint *= compute_extent; + store_footprint *= store_extent; + + bool dense = ((e->producer->is_input && i == 0) || + (site.produce != nullptr && i == site.produce->vector_dim)); + if (!dense) { + line_footprint *= extent; + thread_line_footprint *= thread_extent; + compute_line_footprint *= compute_extent; + store_line_footprint *= store_extent; + task_line_footprint *= task_extent; + } + } + + if (!producer_has_been_scheduled) { + // Optimistically assume it gets vectorized + // along whatever 
dimension makes these + // numbers the smallest. + line_footprint /= max_extent; + thread_line_footprint /= max_thread_extent; + compute_line_footprint /= max_compute_extent; + store_line_footprint /= max_store_extent; + task_line_footprint /= max_task_extent; + } + + int64_t store_instances_per_consumption = 1; + + if (!e->producer->is_input) { + const int64_t producer_store_instances = + producer_has_been_scheduled + ? features->get_or_create(&(e->producer->stages[0])).num_realizations + : site.num_realizations; + + internal_assert(producer_store_instances > 0); + + if (producer_store_instances) { + // The producer's realization is nested inside this Func's realization + if (producer_store_instances > consumer_instances) { + store_instances_per_consumption = producer_store_instances / consumer_instances; + } + } + } + + if (site.is_stored_in_global_mem()) { + global_allocation_bytes_loaded += compute_footprint; + } else if (site.is_stored_in_shared_mem()) { + shared_allocation_bytes_loaded += compute_footprint; + } else if (site.is_stored_in_local_mem()) { + } else if (site.is_stored_in_registers()) { + } else { + internal_assert(false); + } + + if (store_instances_per_consumption > 1) { + if (site.is_stored_in_global_mem()) { + // The producer is nested inside the consumer + global_bytes_loaded += store_footprint; + // Due to folding, the actual buffer size is smaller than the bounds at the store level + global_lines_loaded += store_line_footprint; + + global_bytes_loaded_per_thread += store_footprint; + global_lines_loaded_per_thread += store_line_footprint; + } else if (site.is_stored_in_shared_mem()) { + shared_bytes_loaded += store_footprint; + shared_lines_loaded += store_line_footprint; + + shared_bytes_loaded_per_thread += store_footprint; + shared_lines_loaded_per_thread += store_line_footprint; + } else if (site.is_stored_in_local_mem()) { + local_bytes_loaded += store_footprint; + local_lines_loaded += store_line_footprint; + } else if 
(site.is_stored_in_registers()) { + register_bytes_loaded += store_footprint; + register_lines_loaded += store_line_footprint; + + register_bytes_loaded_per_thread += store_footprint; + register_lines_loaded_per_thread += store_line_footprint; + } else { + internal_assert(false); + } + + } else { + // The consumer is consuming some portion of a larger producer computed earlier + if (site.is_stored_in_global_mem()) { + global_bytes_loaded += footprint; + global_lines_loaded += line_footprint; + + global_bytes_loaded_per_thread += thread_footprint; + global_lines_loaded_per_thread += thread_line_footprint; + } else if (site.is_stored_in_shared_mem()) { + shared_bytes_loaded += footprint; + shared_lines_loaded += line_footprint; + + shared_bytes_loaded_per_thread += thread_footprint; + shared_lines_loaded_per_thread += thread_line_footprint; + } else if (site.is_stored_in_local_mem()) { + local_bytes_loaded += footprint; + local_lines_loaded += line_footprint; + } else if (site.is_stored_in_registers()) { + register_bytes_loaded += footprint; + register_lines_loaded += line_footprint; + + if (producer_store_site == gpu_loop_info.current_thread_loop) { + register_bytes_loaded_per_thread += thread_footprint; + register_lines_loaded_per_thread += thread_line_footprint; + } else { + internal_assert(producer_store_site->gpu_label == GPU_parallelism::serial); + register_bytes_loaded_per_thread += store_footprint; + register_lines_loaded_per_thread += store_line_footprint; + } + } else { + internal_assert(false); + } + } + + // We compute (but never use) these; computing them is cheap, + // so let's leave in for future reference, but mark as 'ignore me' + // to avoid clang-tidy warnings. + (void)compute_line_footprint; + (void)task_line_footprint; + } + } + } + + if (at_production) { + // Properties of the realization, but the values are + // computable at the production site because that's where + // the consumers are. 
+ internal_assert(global_bytes_loaded >= 0) << "Negative global bytes loaded: " << global_bytes_loaded << "\n"; + internal_assert(shared_bytes_loaded >= 0) << "Negative shared bytes loaded: " << shared_bytes_loaded << "\n"; + internal_assert(local_bytes_loaded >= 0) << "Negative local bytes loaded: " << local_bytes_loaded << "\n"; + internal_assert(register_bytes_loaded >= 0) << "Negative register bytes loaded: " << register_bytes_loaded << "\n"; + + feat.global_allocation_bytes_read_per_realization = global_allocation_bytes_loaded; + feat.shared_allocation_bytes_read_per_realization = shared_allocation_bytes_loaded; + + feat.unique_global_bytes_read_per_realization = global_bytes_loaded; + feat.unique_shared_bytes_read_per_realization = shared_bytes_loaded; + feat.unique_register_bytes_read_per_realization = register_bytes_loaded; + + feat.unique_global_lines_read_per_realization = global_lines_loaded; + feat.unique_shared_lines_read_per_realization = shared_lines_loaded; + feat.unique_register_lines_read_per_realization = register_lines_loaded; + + if (!at_pure_production) { + // Also pessimistically assume this update definition relies on the entirety of the produced region so far. + // TODO: This overbills scatters, or writes to a sub-window. 
+ internal_assert(feat.bytes_at_production >= 0) << "Negative bytes at production: " << feat.bytes_at_production << "\n"; + + const auto &consumer_site = sites.get(&node->stages[0]); + if (consumer_site.is_stored_in_global_mem()) { + feat.unique_global_bytes_read_per_realization += feat.bytes_at_production; + feat.unique_global_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; + feat.global_allocation_bytes_read_per_realization += feat.bytes_at_production; + } else if (consumer_site.is_stored_in_shared_mem()) { + feat.unique_shared_bytes_read_per_realization += feat.bytes_at_production; + feat.unique_shared_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; + feat.shared_allocation_bytes_read_per_realization += feat.bytes_at_production; + } else if (consumer_site.is_stored_in_local_mem()) { + //feat.unique_local_bytes_read_per_realization += feat.bytes_at_production; + //feat.unique_local_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; + //feat.local_allocation_bytes_read_per_realization += feat.bytes_at_production; + } else if (consumer_site.is_stored_in_registers()) { + feat.unique_register_bytes_read_per_realization += feat.bytes_at_production; + feat.unique_register_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; + feat.register_allocation_bytes_read_per_realization += feat.bytes_at_production; + } else { + internal_assert(false); + } + } + } + + if (innermost) { + feat.points_computed_per_thread = gpu_loop_info.total_serial_extents(); + + feat.unique_global_bytes_read_per_thread = global_bytes_loaded_per_thread; + feat.unique_shared_bytes_read_per_thread = shared_bytes_loaded_per_thread; + feat.unique_register_bytes_read_per_thread = register_bytes_loaded_per_thread; + + feat.unique_global_lines_read_per_thread = global_lines_loaded_per_thread; + feat.unique_shared_lines_read_per_thread = 
shared_lines_loaded_per_thread; + feat.unique_register_lines_read_per_thread = register_lines_loaded_per_thread; + + feat.points_computed_per_production = subinstances / feat.num_productions; + + feat.unique_bytes_read_per_point = global_bytes_loaded + shared_bytes_loaded + local_bytes_loaded + register_bytes_loaded; + feat.unique_lines_read_per_point = global_lines_loaded + shared_lines_loaded + local_lines_loaded + register_bytes_loaded; + + feat.num_global_mem_loads_per_block = global_mem_loads.num_transactions(); + feat.global_mem_load_efficiency = global_mem_loads.efficiency(); + + feat.num_shared_mem_loads_per_block = shared_mem_loads.num_transactions(); + feat.shared_mem_load_efficiency = shared_mem_loads.efficiency(); + + internal_assert(in_range_zero_one(feat.global_mem_load_efficiency)) << "Invalid global mem load efficiency: " << feat.global_mem_load_efficiency; + + internal_assert(in_range_zero_one(feat.shared_mem_load_efficiency)) << "Invalid shared mem load efficiency: " << feat.shared_mem_load_efficiency; + } + + // Track features for inlined Funcs + for (auto it = inlined.begin(); it != inlined.end(); it++) { + const auto *f = it.key(); + internal_assert(f); + auto &inlined_feat = features->get_or_create(&(f->stages[0])); + inlined_feat.inlined_calls += it.value() * subinstances; + inlined_feat.num_scalars += it.value() * feat.num_scalars; + if (inlined_feat.innermost_pure_loop_extent > 0) { + inlined_feat.innermost_pure_loop_extent = + std::min(inlined_feat.innermost_pure_loop_extent, + feat.innermost_pure_loop_extent); + } else { + inlined_feat.innermost_pure_loop_extent = feat.innermost_pure_loop_extent; + } + inlined_feat.inner_parallelism = 1; + inlined_feat.outer_parallelism = parallelism; + inlined_feat.num_blocks = parallelism; + + internal_assert(gpu_loop_info.thread_info); + auto num_warps = it.value() * gpu_loop_info.total_serial_extents() * gpu_loop_info.thread_info->num_warps_per_block * inlined_feat.num_blocks; + 
inlined_feat.num_warps_per_block += num_warps; + inlined_feat.num_threads_per_block += gpu_loop_info.thread_info->num_threads; + double points_computed_per_thread = it.value() * feat.points_computed_per_thread; + inlined_feat.points_computed_per_thread += points_computed_per_thread; + + if (use_memoized_features) { + const auto &block = sites.get(stage).task; + uint64_t hash_of_producers = sites.get(block->stage).hash_of_producers_stored_at_root; + auto& intermediate_map = block->feature_intermediates[hash_of_producers].get_or_create(&(f->stages[0])); + auto& intermediate = intermediate_map.get_or_create(stage); + intermediate.inlined_calls = it.value() * subinstances; + intermediate.num_scalars = it.value() * feat.num_scalars; + + intermediate.innermost_pure_loop_extent = feat.innermost_pure_loop_extent; + intermediate.outer_parallelism = parallelism; + intermediate.num_warps_per_block = num_warps; + + intermediate.num_threads_per_block = gpu_loop_info.thread_info->num_threads; + intermediate.points_computed_per_thread = points_computed_per_thread; + } + } + + compute_shared_mem_occupancy(target, total_shared_mem_alloc_sizes.get(stage), feat); + + if (innermost && !is_scalar()) { + compute_warp_features(feat, gpu_loop_info); + + compute_warp_and_block_occupancy(params, feat, gpu_loop_info); + } +} + +// Get the region required of a Func at this site (but only to satisfy the +// consumers along the given edge chain), from which we know what region +// would be computed if it were scheduled here and what its loop nest +// would be. +// This is useful for computing load memory features along a particular edge +// e.g. if out(x) = f(x) + g(x) +// and f(x) = g(x - 100) + g(x + 100) +// and g(x) = x +// we want to be able to compute load memory features by 'out' loading from 'g'. 
+// For this we need the region required of 'g', but it should only include the +// region required by the edge from 'g' -> 'out' and ignore the region required by the +// edge 'g' -> 'f' (which is what get_bounds() would compute i.e. the region +// required of 'g' should be 1 point for each point of 'out' but get_bounds() +// will also include the edge 'g' -> 'f' and give the result 201 points for every point +// of 'out') +const Bound LoopNest::get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector& edge_chain) const { + internal_assert(edge_chain.size() >= 1); + + internal_assert(edge_chain[0]->consumer == stage) + << "get_bounds_along_edge_chain must be called with an edge chain that begins from the current loop nest's node. But the given edge chain begins with " << edge_chain[0]->consumer->node->func.name() + << " not " << node->func.name(); + + internal_assert(edge_chain.back()->producer == f) + << "get_bounds_along_edge_chain must be called with an edge chain that ends with the given node. But the given edge chain ends with " << edge_chain.back()->producer->func.name() + << " not " << f->func.name(); + + vector bounds; + BoundContents* bound; + + // For the final consumer, we rely on get_bounds() (i.e. on the bounds for it to + // satisfy all of its downstream consumers instead of just along a single edge). This should be + // okay because it is computed in the current loop nest so its bounds need + // to account for all its downstream consumers. 
+ const auto& c_bounds = get_bounds(edge_chain[0]->consumer->node); + Bound cur_consumer_bounds = c_bounds; + + for (const auto* e : edge_chain) { + const auto* producer = e->producer; + + bound = producer->make_bound(); + auto init = Span::empty_span(); + for (int i = 0; i < producer->dimensions; i++) { + bound->region_required(i) = init; + } + + // Get the concrete sizes of the consuming loop + const auto *consumer_loop = &(cur_consumer_bounds->loops(e->consumer->index, 0)); + + // Use the bounds relationship between the nodes to + // map from the consumer's loop to the required region + // of the producer. + e->expand_footprint(consumer_loop, &(bound->region_required(0))); + + // Given a required region of this producer, use the bounds + // analysis to figure out what region actually gets + // computed. For most funcs, these are the same. Some things, + // like histograms or scans, you can only really compute all + // of at once. + producer->required_to_computed(&(bound->region_required(0)), &(bound->region_computed(0))); + + // Finally, figure out what loop nests will be used to compute + // this region. + for (int i = 0; i < (int)producer->stages.size(); i++) { + producer->loop_nest_for_region(i, &(bound->region_computed(0)), &(bound->loops(i, 0))); + } + + bounds.push_back(bound); + cur_consumer_bounds = bound; + } + + return bounds.back(); +} + +// Get the region required of a Func at this site, from which we +// know what region would be computed if it were scheduled here, +// and what its loop nest would be. +const Bound &LoopNest::get_bounds(const FunctionDAG::Node *f) const { + if (bounds.contains(f)) { + const Bound &b = bounds.get(f); + // Expensive validation for debugging + // b->validate(); + return b; + } + auto bound = f->make_bound(); + + // Compute the region required + if (f->is_output && is_root()) { + // It's an output. Use the bounds estimate. 
+ for (int i = 0; i < f->dimensions; i++) { + bound->region_required(i) = f->estimated_region_required[i]; + } + } else { + internal_assert(!f->outgoing_edges.empty()) + << "No consumers of " << f->func.name() + << " at loop over " << (is_root() ? "root" : node->func.name()) << "\n"; + auto init = Span::empty_span(); + for (int i = 0; i < f->dimensions; i++) { + bound->region_required(i) = init; + } + + for (const auto *e : f->outgoing_edges) { + // Ignore consumers outside of this loop nest + if (!is_root() && + (stage != e->consumer) && + (!stage->downstream_of(*(e->consumer->node)))) { + continue; + } + const auto &c_bounds = get_bounds(e->consumer->node); + + // Get the concrete sizes of the consuming loop + const auto *consumer_loop = &(c_bounds->loops(e->consumer->index, 0)); + + // Use the bounds relationship between the nodes to + // map from the consumer's loop to the required region + // of the producer. + e->expand_footprint(consumer_loop, &(bound->region_required(0))); + } + } + + // Given a required region of this producer, use the bounds + // analysis to figure out what region actually gets + // computed. For most funcs, these are the same. Some things, + // like histograms or scans, you can only really compute all + // of at once. + f->required_to_computed(&(bound->region_required(0)), &(bound->region_computed(0))); + + // Finally, figure out what loop nests will be used to compute + // this region. 
+ for (int i = 0; i < (int)f->stages.size(); i++) { + f->loop_nest_for_region(i, &(bound->region_computed(0)), &(bound->loops(i, 0))); + } + + const Bound &b = set_bounds(f, bound); + // b->validate(); + return b; +} + +void LoopNest::dump() const { + auto stream = aslog(0); + dump(stream, "", nullptr); +} + +std::string LoopNest::to_string() const { + std::ostringstream stream; + dump(stream, "", nullptr); + return stream.str(); +} + +// Recursively print a loop nest representation to the given stream +template +void LoopNest::dump(T& stream, string prefix, const LoopNest *parent) const { + if (!is_root()) { + // Non-root nodes always have parents. + internal_assert(parent != nullptr); + + stream << prefix << node->func.name(); + prefix += " "; + + for (size_t i = 0; i < size.size(); i++) { + stream << " " << size[i]; + // The vectorized loop gets a 'v' suffix + if (innermost && i == (size_t)vectorized_loop_index) { + stream << "v"; + } + // Loops that have a known constant size get a + // 'c'. Useful for knowing what we can unroll. + if (parent->get_bounds(node)->loops(stage->index, i).constant_extent()) { + stream << "c"; + } + } + + // Uncomment when debugging the representative loop bounds selected. 
+ /* + const auto &bounds = get_bounds(node); + for (size_t i = 0; i < size.size(); i++) { + const auto &p = bounds->loops(stage->index, i); + stream << " [" << p.first << ", " << p.second << "]"; + } + */ + + stream << " (" << vectorized_loop_index << ", " << vector_dim << ")"; + } + + if (tileable) { + stream << " t"; + } + if (innermost) { + stream << " *"; + } + if (gpu_label == block) { + stream << " gpu_block\n"; + } else if (gpu_label == serial) { + stream << " gpu_serial\n"; + } else if (gpu_label == none) { + stream << " gpu_none\n"; + } else if (gpu_label == simd) { + stream << " gpu_simd\n"; + } else if (gpu_label == thread) { + stream << " gpu_thread\n"; + } else if (gpu_label == parallelized) { + stream << " gpu_parallelized\n"; + } else if (parallel) { + stream << " p\n"; + } else { + stream << "\n"; + } + for (auto p : store_at) { + stream << prefix << "realize: " << p->func.name() << " ["; + for (int i = 0; i < p->dimensions; i++) { + if (i > 0) { + stream << ", "; + } + const auto ®ion = get_bounds(p)->region_computed(i); + stream << region.extent(); + if (region.constant_extent()) { + stream << "c"; + } + } + stream << "] with " << p->stages.size() << " stages\n"; + } + for (size_t i = children.size(); i > 0; i--) { + children[i - 1]->dump(stream, prefix, this); + } + for (auto it = inlined.begin(); it != inlined.end(); it++) { + stream << prefix << "inlined: " << it.key()->func.name() << " " << it.value() << "\n"; + } +} + +template +void LoopNest::dump(aslog& stream, string prefix, const LoopNest *parent) const; + +template +void LoopNest::dump(std::ostringstream& stream, string prefix, const LoopNest *parent) const; + +// Does this loop nest access the given Func +bool LoopNest::calls(const FunctionDAG::Node *f) const { + for (const auto &c : children) { + if (c->calls(f)) return true; + } + for (const auto *e : f->outgoing_edges) { + if (e->consumer == stage) { + return true; + } + if (inlined.contains(e->consumer->node)) { + return true; + } 
+ } + return false; +} + +// What is the maximum number of inlined calls to a Func that +// occur within this loop. Used to prune states that would +// generate too much code. +int64_t LoopNest::max_inlined_calls() const { + int64_t result = 0; + for (auto it = inlined.begin(); it != inlined.end(); it++) { + result = std::max(result, it.value()); + } + for (const auto &c : children) { + result = std::max(result, c->max_inlined_calls()); + } + return result; +} + +// Does this loop nest access an input buffer? Used to select +// trail strategies when splitting loops. We don't want to read +// out of bounds on inputs, even if we don't intend to use the +// values read. It could create annoying assertion failures for +// the user. It's OK to read out of range of the values computed +// on internal Funcs though. Allocation bounds inference just pads +// out the bounds so that it won't fault. +bool LoopNest::accesses_input_buffer() const { + for (const auto &c : children) { + if (c->accesses_input_buffer()) return true; + } + if (is_root()) return false; + + auto check = [&](const FunctionDAG::Node::Stage *s) { + for (const auto *e : s->incoming_edges) { + if (e->producer->is_input) return true; + } + + for (int t = 0; t < (int)PipelineFeatures::ScalarType::NumScalarTypes; t++) { + if (s->features.op_histogram[(int)PipelineFeatures::OpType::ImageCall][t] > 0) return true; + } + return false; + }; + + if (check(stage)) return true; + for (auto it = inlined.begin(); it != inlined.end(); it++) { + if (check(&(it.key()->stages[0]))) return true; + } + return false; +} + +// Does this loop nest contain a computation of the given Func. +bool LoopNest::computes(const FunctionDAG::Node *f) const { + if (f == node) { + return true; + } + if (inlined.contains(f)) { + return true; + } + for (const auto &c : children) { + if (c->computes(f)) return true; + } + return false; +} + +// Above here most methods query the loop nest. Below we have +// methods that mutate the loop nest. 
+ +// Inline a Func into all consumers within this loop. +void LoopNest::inline_func(const FunctionDAG::Node *f) { + // Inline it into the children + for (size_t i = 0; i < children.size(); i++) { + if (children[i]->calls(f)) { + std::unique_ptr new_child{new LoopNest}; + new_child->copy_from(*children[i]); + new_child->inline_func(f); + children[i] = new_child.release(); + } + } + + // Inline it here if there are any direct calls + if (innermost) { + int64_t calls = 0; + for (const auto *e : f->outgoing_edges) { + if (inlined.contains(e->consumer->node)) { + calls += inlined.get(e->consumer->node) * e->calls; + } + if (e->consumer == stage) { + calls += e->calls; + } + } + if (calls) { + inlined.insert(f, calls); + } + } +} + +// Compute a Func at this site. +bool LoopNest::compute_here(const FunctionDAG::Node *f, + bool tileable, + int v, + bool in_threads_loop, + const Target &target) { + const auto &bounds = get_bounds(f); + + if (!may_subtile()) { + // If we are restricting ourselves to the Mullapudi et al + // scheduling space, then once something is computed here + // we may not subtile this loop. + this->tileable = false; + } + + bool skip_vector_dim = false; + + for (int s = (int)f->stages.size() - 1; s >= 0; s--) { + LoopNest *node = new LoopNest; + node->node = f; + node->stage = &f->stages[s]; + node->innermost = true; + node->vectorized_loop_index = -1; + node->tileable = tileable && (is_root() || may_subtile()); + + // always set gpu_label as thread if legal. + // if !in_threads_loop we are computing either at root level or inside a serial loop + // set gpu_label to none, then call parallelize_in_tiles to create a parallel, serial, SIMD loop + // if compute_root set gpu_label to none, parallelize_in_tiles creates block and thread loops later + // if computing at serial loop set gpu_label to thread. 
+ if (target.has_gpu_feature()) { + if (is_root()) { + node->gpu_label = none; + } else if (!in_threads_loop) { + node->gpu_label = thread; + } else { + node->gpu_label = serial; + } + } + // Set up a bound for the inside of the + // loop. computed/required is still the full region, but + // the loop nest will be a single representative point. + auto single_point = bounds->make_copy(); + size_t loop_dim = f->stages[s].loop.size(); + node->size.resize(loop_dim); + + int64_t vector_size = 1; + bool all_ones = true; + for (size_t i = 0; i < loop_dim; i++) { + const auto &l = bounds->loops(s, i); + // Initialize the loop nest + node->size[i] = l.extent(); + + // Use the first loop iteration to represent the inner + // loop. We'll shift it to a later one once we decide + // on vectorization. + single_point->loops(s, i) = Span(l.min(), l.min(), true); + + internal_assert(l.max() >= l.min()) << i << " " << l.max() << " " << l.min() << "\n"; + + if (f->dimensions && + node->size[i] >= 1 && + f->stages[s].loop[i].var == f->func.args()[v]) { + node->vectorized_loop_index = (int)i; + vector_size = (int64_t)(node->stage->vector_size); + single_point->loops(s, i).set_extent(vector_size); + node->size[i] += vector_size - 1; + node->size[i] /= vector_size; + + // Shift the loops along by some multiple of the + // vector size, to pick a more representative vector + // than the first. We use the middle-most. + int64_t shift = vector_size * (node->size[i] / 2); + single_point->loops(s, i).translate(shift); + } else { + int64_t shift = node->size[i] / 2; + single_point->loops(s, i).translate(shift); + } + + all_ones = all_ones && node->size[i] == 1; + } + + // Leave region required blank inside the computation of a Func + node->set_bounds(f, std::move(single_point)); + node->vector_dim = v; + + if (s == 0) { + skip_vector_dim = !all_ones && node->size[v] == 1; + } + + // Split off the single vector as an inner loop nest. 
+ node->innermost = false; + + LoopNest *one_vector = new LoopNest; + one_vector->node = node->node; + one_vector->stage = node->stage; + one_vector->tileable = false; + one_vector->vectorized_loop_index = node->vectorized_loop_index; + one_vector->vector_dim = v; + one_vector->size.resize(loop_dim, 1); + one_vector->innermost = true; + one_vector->gpu_label = simd; + auto b = node->get_bounds(f)->make_copy(); + // Set the region computed inside this node to be the first vector lane + if (node->vectorized_loop_index >= 0) { + b->loops(s, node->vectorized_loop_index).set_extent(1); + } else { + for (size_t i = 0; i < loop_dim; i++) { + internal_assert(b->loops(s, i).extent() == 1); + } + } + + one_vector->set_bounds(f, b); + if (node->vectorized_loop_index >= 0) { + one_vector->size[node->vectorized_loop_index] = vector_size; + } + + node->children.emplace_back(one_vector); + + children.emplace_back(node); + } + + return skip_vector_dim; +} + +// Parallelize this loop according to the given tiling. 
+IntrusivePtr LoopNest::parallelize_in_tiles(const MachineParams ¶ms, + const vector &tiling, + const LoopNest *parent, + const Target &target, + bool inner_tiling, + bool adjust_tiling, + bool move_all_rvars_inward, + const vector &rvars_to_move_inward) const { + + // Split this loop and move factors to the inner loop + LoopNest *inner = new LoopNest, *outer = new LoopNest; + inner->node = outer->node = node; + inner->stage = outer->stage = stage; + inner->tileable = outer->tileable = tileable && may_subtile(); + inner->vector_dim = outer->vector_dim = vector_dim; + inner->vectorized_loop_index = outer->vectorized_loop_index = vectorized_loop_index; + + if (target.has_gpu_feature()) { + if (gpu_label == none) { + inner->gpu_label = serial; + outer->gpu_label = parallelized; + outer->parallel = true; + } else if (gpu_label == parallelized) { + inner->gpu_label = thread; // compute root funcs always allowed to use GPU threads + outer->gpu_label = block; + outer->parallel = true; + } else if (gpu_label == thread) { + inner->gpu_label = serial; + outer->gpu_label = thread; + outer->parallel = false; + } else if (gpu_label == serial) { + inner->gpu_label = serial; + outer->gpu_label = serial; + outer->parallel = false; + } else { + internal_error << "invalid gpu label " << gpu_label << " for parallelized loop\n"; + } + } + + outer->size = size; + outer->innermost = false; + + if (!target.has_gpu_feature()) + outer->parallel = true; + + outer->tileable = may_subtile(); + + // First make an inner loop representing a 1x1x1... 
tile + inner->size.resize(size.size(), 1); + inner->innermost = innermost; + inner->children = children; + inner->inlined = inlined; + inner->bounds = bounds; + inner->store_at = store_at; + + auto b = inner->get_bounds(node)->make_copy(); + + // Then move factors from the outer loop to the inner loop + auto parent_bounds = parent->get_bounds(node); + + for (size_t i = 0; i < stage->loop.size(); i++) { + int l = stage->loop[i].pure_dim; + + int64_t outer_extent; + if (inner_tiling) { + if (l >= 0) { + internal_assert(l < (int)tiling.size()) << l << " " << tiling.size() << "\n"; + outer_extent = (outer->size[i] + tiling[l] - 1) / tiling[l]; + inner->size[i] = tiling[l]; + } else if (move_all_rvars_inward || (i < rvars_to_move_inward.size() && rvars_to_move_inward[i])) { + // RVars are moved inwards + outer_extent = 1; + inner->size[i] = outer->size[i]; + } else { + outer_extent = outer->size[i]; + inner->size[i] = 1; + } + if (adjust_tiling) { + inner->size[i] = (outer->size[i] + outer_extent - 1) / outer_extent; + } + } else { + if (l >= 0) { + internal_assert(l < (int)tiling.size()) << l << " " << tiling.size() << "\n"; + inner->size[i] = (outer->size[i] + tiling[l] - 1) / tiling[l]; + outer_extent = tiling[l]; + } else if (move_all_rvars_inward || (i < rvars_to_move_inward.size() && rvars_to_move_inward[i])) { + outer_extent = 1; + inner->size[i] = outer->size[i]; + } else { + outer_extent = outer->size[i]; + inner->size[i] = 1; + } + if (adjust_tiling) { + outer_extent = (outer->size[i] + inner->size[i] - 1) / inner->size[i]; + } + } + outer->size[i] = outer_extent; + const auto &p = parent_bounds->loops(stage->index, i); + int64_t min = p.min(); + int64_t extent = p.extent(); + extent = inner->product_of_self_and_descendants(i); + + // Pick a better representative loop iteration for the + // inner loops. 
+ min += (outer_extent / 2) * extent; + bool compile_time_constant_bounds = p.constant_extent() || stage->loop[i].pure; + b->loops(stage->index, i) = Span(min, min + extent - 1, compile_time_constant_bounds); + } + outer->set_bounds(node, b); + + outer->children.emplace_back(inner); + return outer; +} + +int64_t LoopNest::get_total_local_mem_alloc_size(bool constant_allocs_only, bool in_threads_loop) const { + int64_t result = 0; + + in_threads_loop = in_threads_loop || gpu_label == thread; + + if (in_threads_loop) { + for (const auto *store_node : store_at) { + const auto &bounds = get_bounds(store_node); + + int64_t alloc_size = store_node->bytes_per_point; + bool is_constant_alloc = true; + for (int i = 0; i < store_node->dimensions; i++) { + const auto &p = bounds->region_computed(i); + alloc_size *= p.extent(); + is_constant_alloc = is_constant_alloc && p.constant_extent(); + } + + if (store_node->dimensions > 0 && (!constant_allocs_only || is_constant_alloc)) { + result += alloc_size; + } + } + } + + for (const auto &c : children) { + result += c->get_total_local_mem_alloc_size(constant_allocs_only, in_threads_loop); + } + + return result; +} + +int64_t LoopNest::get_total_constant_local_mem_alloc_size() const { + return get_total_local_mem_alloc_size(true); +} + +// All store ats further in than the block level must be fixed +// sized allocations. This method checks if f will require a dynamic +// allocation +bool LoopNest::requires_dynamic_allocation(const FunctionDAG::Node *f, const Target &target, bool in_threads_loop) const { + if (!target.has_gpu_feature() || !in_threads_loop) { + return false; + } + + for (int i = 0; i < f->dimensions; i++) { + if (!get_bounds(f)->region_computed(i).constant_extent()) { + return true; + } + } + + return false; +} + +// Is the region_computed smaller here than at its parent? 
+bool LoopNest::region_computed_shrinks(const FunctionDAG::Node *f, const LoopNest *parent) const { + const auto &bounds_here = get_bounds(f); + const auto &bounds_at_parent = parent->get_bounds(f); + + int64_t total_here = 1, total_at_parent = 1; + for (int i = 0; i < f->dimensions; i++) { + const auto &range_here = bounds_here->region_computed(i); + const auto &range_at_parent = bounds_at_parent->region_computed(i); + total_here *= range_here.extent(); + total_at_parent *= range_at_parent.extent(); + } + + return total_here < total_at_parent; +} + +// Return all possible ways to compute f in tiles somewhere within +// this loop nest. +// in_threads_loop tracks whether or not function is going to be placed inside a +// loop marked gpu_threads, in which case f's loops cannot be gpu_threads +vector> LoopNest::compute_in_tiles(const FunctionDAG::Node *f, + const LoopNest *parent, + const MachineParams ¶ms, + const Target &target, + const SearchSpaceOptions &search_space_options, + int v, + bool in_realization, + bool in_threads_loop, + bool is_pre_pass, + vector union_counts) const { + internal_assert(f); + + vector> result; + + // Some pruning to not waste time on terrible states + if (parent) { + const auto &bounds_here = get_bounds(f); + const auto &bounds_at_parent = parent->get_bounds(f); + + // Don't descend into loops that break our ability to + // vectorize if we could have vectorized one level up. + const auto &p = bounds_here->region_computed(v); + const auto &p_parent = bounds_at_parent->region_computed(v); + int64_t e = p.extent(); + int64_t ep = p_parent.extent(); + if (ep >= f->vector_size && e < f->vector_size) return result; + + // Don't descend into loops if the bounds required don't + // shrink. 
+ if (!region_computed_shrinks(f, parent)) return result; + } + + // Figure out which child we can fuse this into + int child = -1; + bool called_by_multiple_children = false; + for (int i = 0; i < (int)children.size(); i++) { + if (children[i]->calls(f)) { + if (child != -1) { + called_by_multiple_children = true; + } + child = i; + } + } + + if (gpu_label == block) { + // once we enter a gpu block loop compute union thread counts to pass down + union_counts = get_union_thread_counts(f); + } + + bool is_block_level = !is_root() && !in_threads_loop; + bool can_compute_here = (is_root() && search_space_options.compute_root()) || f->is_output; + can_compute_here = can_compute_here || (is_block_level && search_space_options.compute_at_block()); + can_compute_here = can_compute_here || (in_threads_loop && search_space_options.compute_at_thread()); + + // Place the computation directly inside this loop (provided it's not a SIMD loop) + if (!innermost && + (!in_realization || + size.empty() || + vector_dim == -1 || + size[vector_dim] == 1) && can_compute_here) { + + std::unique_ptr r{new LoopNest}; + r->copy_from(*this); + r->compute_here(f, true, v, in_threads_loop, target); + if (!in_realization) { + r->store_at.insert(f); + } else { + r->tileable = false; + } + + // if GPU and creating a threads loop INSIDE a block loop, create child for each thread tiling + if (!is_root() && !in_threads_loop && target.has_gpu_feature()) { + bool made_child = r->add_gpu_thread_tilings(f, params, target, v, result, union_counts); + if (!made_child) { // no good thread tilings, just keep r with the untiled loop inserted as serial + result.emplace_back(r.release()); + } + } else { // computing at root or inside a threads loop + result.emplace_back(r.release()); + } + } + + bool stop_here = is_root() && !search_space_options.compute_at_block() && !search_space_options.compute_at_thread(); + stop_here = stop_here || (in_threads_loop && !search_space_options.compute_at_thread()); + if 
(stop_here || f->is_output || is_pre_pass) { + // Outputs must be compute_root, so we're done. + return result; + } + + if (child >= 0 && !called_by_multiple_children && !in_realization && (may_subtile() || is_root())) { + // Push the Func further inwards in the loop nest + + const auto &c = children[child]; + int num_ones = 0; + for (size_t i = 0; i < c->size.size(); i++) { + int64_t s = c->size[i]; + num_ones += (s == 1) ? 1 : 0; + } + + for (int store_here = 0; store_here < 1; store_here++) { + if (is_root() && num_ones == (int)c->size.size() && params.parallelism > 1) { + // Don't fuse into serial loops, or we could never parallelize this Func. + continue; + } + + in_threads_loop |= (children[child]->gpu_label == thread); + // we must pass down union thread count constraints computed at block level when computing further in + auto opts = children[child]->compute_in_tiles(f, this, params, target, search_space_options, v, store_here, in_threads_loop, false, union_counts); + for (IntrusivePtr &n : opts) { + // (Only valid if one child calls f) Push the + // computation into the child. Possibly leaving + // the storage out here. 
+ LoopNest *r = new LoopNest; + r->copy_from(*this); + r->store_at.insert(f); + r->children[child] = n; + result.emplace_back(r); + } + } + } + + return result; +} + +int64_t LoopNest::product_of_self_and_descendants(int loop_index) const { + return size[loop_index] * product_of_descendants(loop_index); +} + +int64_t LoopNest::product_of_descendants(int loop_index) const { + int64_t prod = 1; + const LoopNest* cur = this; + while (!cur->innermost) { + bool found = false; + for (const auto &c : cur->children) { + if (c->stage != stage) { + continue; + } + + prod *= c->size[loop_index]; + found = true; + cur = c.get(); + break; + } + + internal_assert(found); + } + + return prod; +} + +bool LoopNest::has_constant_region_computed(const FunctionDAG::Node* node) const { + const auto& bounds = get_bounds(node); + for (int i = 0; i < node->dimensions; i++) { + if (!bounds->region_computed(i).constant_extent()) { + return false; + } + } + return true; +} + +bool LoopNest::has_constant_region_required(const FunctionDAG::Node* node) const { + const auto& bounds = get_bounds(node); + for (int i = 0; i < node->dimensions; i++) { + if (!bounds->region_required(i).constant_extent()) { + return false; + } + } + return true; +} + +bool LoopNest::other_stage_has_same_producer(const FunctionDAG::Node* producer) const { + for (const auto& other_stage : node->stages) { + if (stage->index == other_stage.index) { + continue; + } + + for (const auto *e : other_stage.incoming_edges) { + if (producer == e->producer) { + return true; + } + } + } + return false; +} + +int LoopNest::num_serial_loops(const FunctionDAG::Node::Stage* stage) const { + int num_serial_loops = 0; + for (const auto &child : children) { + if (child->stage == stage) { + continue; + } + + for (auto s : child->size) { + if (s > 1) { + ++num_serial_loops; + break; + } + } + + num_serial_loops += child->num_serial_loops(stage); + } + + return num_serial_loops; +} + +int LoopNest::num_serial_loops() const { + return 
num_serial_loops(stage); +} + +bool LoopNest::producer_computed_here_or_further_in(const FunctionDAG::Node* producer) const { + for (const auto &child : children) { + if (child->node == producer) { + return true; + } + + if (child->producer_computed_here_or_further_in(producer)) { + return true; + } + } + + return false; +} + +void LoopNest::get_stages_computed_in_each_compute_root_loop(StageMap> &descendants, const LoopNest *compute_root_loop_nest) const { + if (is_root()) { + for (auto &c : children) { + descendants.emplace(c->stage, {}); + } + + for (auto &c : children) { + c->get_stages_computed_in_each_compute_root_loop(descendants, c.get()); + } + + return; + } + + descendants.get(compute_root_loop_nest->stage).emplace(stage, true); + + for (auto &c : children) { + c->get_stages_computed_in_each_compute_root_loop(descendants, compute_root_loop_nest); + } +} + +// Apply the schedule represented by this loop nest to a Halide pipeline. +void LoopNest::apply(LoopLevel here, + StageMap> &state_map, + double num_cores, + int depth, + const LoopNest *parent, + const LoopNest *compute_site, + const Target &target, + std::vector &ancestors, + const NodeMap& all_inlined) const { + if (is_root()) { + for (auto &c : children) { + Func(c->node->func).compute_root(); + c->apply(LoopLevel::root(), state_map, num_cores, 1, this, c.get(), target, ancestors, all_inlined); + if (c->stage->index == 0) { + auto &state = state_map.get(c->stage); + state->schedule_source << "\n .compute_root()"; + // TODO: Omitting logic for printing store_root() assumes everything store_root is also compute root + } + } + } else { + // Non-root nodes always have parents. 
+ internal_assert(parent != nullptr); + + if (parent->node != node) { + compute_site = this; + } + + const auto &symbolic_loop = stage->loop; + const auto &parent_bounds = parent->get_bounds(node); + if (!state_map.contains(stage)) { + StageScheduleState *state = new StageScheduleState; + state->node = node; + state->stage = stage; + state->num_cores = num_cores; + state->vector_dim = vector_dim; + state->vectorized_loop_index = vectorized_loop_index; + state->ancestors = ancestors; + for (size_t i = 0; i < symbolic_loop.size(); i++) { + StageScheduleState::FuncVar fv; + const auto &l = symbolic_loop[i]; + fv.var = VarOrRVar(l.var, !l.pure); + fv.orig = fv.var; + fv.accessor = l.accessor; + const auto &p = parent_bounds->loops(stage->index, i); + fv.extent = p.extent(); + fv.constant_extent = p.constant_extent(); + fv.outermost = true; + fv.parallel = l.pure && target.has_gpu_feature() ? gpu_label == block : parallel; + fv.exists = true; + fv.pure = l.pure; + fv.index = i; + fv.innermost_pure_dim = (i == (size_t)vectorized_loop_index); + state->vars.push_back(fv); + } + // Bubble the innermost pure dimension to the front of the pure dimensions + for (int i = vectorized_loop_index - 1; + i >= 0 && state->vars[i].pure; i--) { + std::swap(state->vars[i], state->vars[i + 1]); + } + state_map.emplace(stage, std::unique_ptr(state)); + } + auto &state = *(state_map.get(stage)); + + // The getter for grabbing Func handles is reverse topological order + Stage s = Func(node->func); + if (stage->index > 0) { + s = Func(node->func).update(stage->index - 1); + } + + if (stage->index == 0 && parent->node != node) { + // Pick a memory type + double bytes = node->bytes_per_point; + for (int i = 0; i < node->dimensions; i++) { + const auto &p = parent_bounds->region_computed(i); + bytes *= p.extent(); + } + if (bytes < 64000 && depth > 2) { + // If it's probably a small allocation, and it's + // made more than once, use stack-scoped + // storage. 
Otherwise let the compiler pick heap + // or stack as it likes. + if (!target.has_gpu_feature()) { + Func(node->func).store_in(MemoryType::Stack); + state.schedule_source << "\n .store_in(MemoryType::Stack)"; + } + } + } + + // Pick a tail strategy for any splits of pure vars. RVars always use guardwithif + auto pure_var_tail_strategy = TailStrategy::Auto; + if (!compute_site->accesses_input_buffer() && !node->is_output) { + // Roundup is lowest overhead, provided it doesn't + // expand the bounds read on the input or written on + // the output. However, you can only really use it on + // pure stages that don't access the input anywhere in + // their loop nest. + pure_var_tail_strategy = TailStrategy::RoundUp; + } else if (stage->index == 0) { + // Pure stages that access the input use shiftinwards + pure_var_tail_strategy = TailStrategy::ShiftInwards; + } else { + // For pure vars in update stages that access the + // input, it's not safe to round up or redundantly + // recompute + pure_var_tail_strategy = TailStrategy::GuardWithIf; + } + + if (!size.empty()) { + if (innermost) { + // In case the threads loop is innermost + for (size_t i = 0; i < symbolic_loop.size(); i++) { + StageScheduleState::FuncVar &v = state.vars[i]; + v.gpu_threads = gpu_label == thread && symbolic_loop[i].pure; + } + + if (vectorized_loop_index >= 0) { + size_t i = 0; + while (!state.vars[i].innermost_pure_dim) + i++; + auto &v = state.vars[i]; + internal_assert(v.innermost_pure_dim && v.exists) << v.var.name() << "\n"; + // Is the result of a split + + // The vector size for gpu depends on the width of the + // stage's types and will often be 1, in which case we + // don't want to vectorize the loop + if (!target.has_gpu_feature() || stage->vector_size > 1) { + state.schedule_source + << "\n .vectorize(" << v.var.name() << ")"; + s.vectorize(v.var); + v.vectorized = true; + state.vectorized = true; + state.vectorized_var = v; + } + } + } else { + // Grab the innermost loop for this node 
+ const LoopNest *innermost_loop = this, *child = nullptr; + while (!innermost_loop->innermost) { + for (const auto &c : innermost_loop->children) { + if (c->node == node) { + if (!child) { + child = c.get(); + } + innermost_loop = c.get(); + break; + } + } + } + + // Do the implied splits + vector new_inner; + for (size_t i = 0; i < symbolic_loop.size(); i++) { + StageScheduleState::FuncVar v; + StageScheduleState::FuncVar &parent = state.vars[i]; + + parent.gpu_threads = gpu_label == thread && symbolic_loop[i].pure; + + int64_t factor = product_of_descendants(parent.index); + + int64_t innermost_size = innermost_loop->size[parent.index]; + + if (child && innermost_size > factor) { + factor = innermost_size; + } + + if (!parent.exists || factor == 1) { + v.exists = false; + v.extent = 1; + } else if (size[parent.index] == 1 && parent.var.is_rvar) { + // Not split in this dimension + v = parent; + v.parallel = false; + v.gpu_threads = false; + + parent.exists = false; + parent.extent = 1; + } else { + VarOrRVar inner(Var(parent.var.name() + "i")); + if (parent.var.is_rvar) { + inner = RVar(parent.var.name() + "i"); + } + + auto tail_strategy = pure_var_tail_strategy; + // If it's an RVar, or not the outermost split and we're in an update, we need a guard with if instead. + + // If the factor evenly divides the parent extent, then + // no tail strategy is needed + if (parent.var.is_rvar || (stage->index != 0 && !parent.outermost)) { + tail_strategy = TailStrategy::GuardWithIf; + } + + if (factor > parent.extent && tail_strategy == TailStrategy::ShiftInwards) { + // Don't shift all the way off the image. 
+ tail_strategy = TailStrategy::GuardWithIf; + } + + s.split(parent.var, parent.var, inner, (int)factor, tail_strategy); + state.schedule_source + << "\n .split(" + << parent.var.name() << ", " + << parent.var.name() << ", " + << inner.name() << ", " + << factor << ", " + << "TailStrategy::" << tail_strategy << ")"; + v = parent; + parent.extent = size[parent.index]; + v.constant_extent = (!parent.var.is_rvar && parent.exists); + v.var = inner; + v.accessor.clear(); + v.extent = factor; + v.parallel = false; + v.gpu_threads = false; + v.outermost = false; + } + new_inner.push_back(v); + } + + if (child->innermost) { + // Maybe do some unrolling + + int64_t product_of_pure_loops = 1; + bool all_pure_loops_constant_size = true; + bool all_loops_are_pure = true; + for (size_t i = 0; i < symbolic_loop.size(); i++) { + if (state.vars[i].pure) { + product_of_pure_loops *= state.vars[i].extent; + all_pure_loops_constant_size &= state.vars[i].constant_extent; + } else if (state.vars[i].exists) { + all_loops_are_pure = false; + } + } + + if (product_of_pure_loops <= get_unroll_limit(target) && all_pure_loops_constant_size) { + state.all_innermost_unrolled = all_loops_are_pure; + // There's a hope we can fit anything compute-at this level into registers if we fully unroll + std::stable_sort(state.vars.begin(), state.vars.begin() + symbolic_loop.size(), + [](const StageScheduleState::FuncVar &a, const StageScheduleState::FuncVar &b) { + return a.pure && !b.pure; + }); + + for (size_t i = 0; i < symbolic_loop.size(); i++) { + if (state.vars[i].pure && state.vars[i].exists && state.vars[i].extent > 1) { + s.unroll(state.vars[i].var); + state.schedule_source << "\n .unroll(" << state.vars[i].var.name() << ")"; + } + } + } + } + + bool found = false; + for (const auto &v : state.vars) { + if (!v.exists) continue; + here = LoopLevel(node->func, v.var); + found = true; + break; + } + if (!found) { + here = LoopLevel(node->func, Var::outermost()); + } + // internal_assert(found) << 
"Could not find appropriate compute_at location for children of " << node->func.name() << "\n"; + state.vars.insert(state.vars.begin(), new_inner.begin(), new_inner.end()); + } + } + if (innermost) { + internal_assert(store_at.empty()); + internal_assert(children.empty()); + return; + } + + for (auto f : store_at) { + Func(f->func).store_at(here); + } + for (auto s : size) { + num_cores /= s; + } + here.lock(); + string loop_level; + if (here.is_root()) { + loop_level = "_root()"; + } else { + loop_level = "_at(" + here.func() + ", " + here.var().name() + ")"; + } + + for (auto &c : children) { + if (c->node != node) { + Func(c->node->func).compute_at(here); + } + ancestors.push_back(state_map.get(stage).get()); + c->apply(here, state_map, num_cores, depth + 1, this, compute_site, target, ancestors, all_inlined); + ancestors.pop_back(); + if (c->node != node && c->stage->index == 0) { + auto &state = *(state_map.get(c->stage)); + state.schedule_source << "\n .compute" << loop_level; + } + } + + if (gpu_label == thread && state.all_innermost_unrolled && num_serial_loops() <= 1) { + update_producers_to_be_staged(state, all_inlined); + } + + for (auto f : store_at) { + bool computed_here = false; + for (auto &c : children) { + if (c->node == f) { + computed_here = true; + break; + } + } + if (!computed_here) { + auto &state = *(state_map.get(&(f->stages[0]))); + state.schedule_source << "\n .store" << loop_level; + } + } + } +} + +void LoopNest::update_producers_to_be_staged(StageScheduleState& state, const NodeMap& all_inlined) const { + std::vector>> pending; + std::vector edge_chain; + pending.emplace_back(stage, edge_chain); + NodeMap done; + + while (!pending.empty()) { + auto cur_pair = pending.back(); + pending.pop_back(); + + auto* s = cur_pair.first; + + for (const auto *e : s->incoming_edges) { + std::vector edge_chain = cur_pair.second; + edge_chain.push_back(e); + + // If the producer is inlined, then its producers should potentially be + // staged + if 
(all_inlined.contains(e->producer) && all_inlined.get(e->producer)) { + pending.emplace_back(&e->producer->stages[0], edge_chain); + continue; + } + + if (done.contains(e->producer) && done.get(e->producer)) { + continue; + } + + done.get_or_create(e->producer) = true; + + if (e->producer->is_input || !has_constant_region_required(e->producer)) { + continue; + } + + if (other_stage_has_same_producer(e->producer) || producer_computed_here_or_further_in(e->producer) || !e->all_load_jacobian_coeffs_exist()) { + continue; + } + + state.producers_to_be_staged.get_or_create(e->producer).emplace_back(this, edge_chain); + } + } +} + +double LoopNest::max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loop_info) const { + gpu_loop_info.update(target, this); + std::unique_ptr thread_info; + + if (is_gpu_thread(target)) { + thread_info = gpu_loop_info.create_thread_info(); + + return thread_info->idle_lane_wastage(); + } + + double max_wastage = 0; + + for (const auto &c : children) { + max_wastage = std::max(max_wastage, c->max_idle_lane_wastage(target, gpu_loop_info)); + } + + return max_wastage; +} + +bool LoopNest::has_valid_thread_extents() const { + for (const auto& c : children) { + if (!are_valid_thread_extents(c->get_union_thread_counts(nullptr))) { + return false; + } + } + + return true; +} + +void LoopNest::collect_nodes_that_should_be_inlined(const NodeMap& nodes_to_freeze, NodeMap& inlined_nodes) const { + if (innermost) { + for (auto it = inlined.begin(); it != inlined.end(); it++) { + const auto *f = it.key(); + if (nodes_to_freeze.contains(f)) { + inlined_nodes.insert(f, true); + std::cerr << "Freezing as inlined: " << f->func.name() << "\n"; + } + } + } + + for (const auto& c : children) { + c->collect_nodes_that_should_be_inlined(nodes_to_freeze, inlined_nodes); + } +} + +void LoopNest::collect_all_inlined(NodeMap& all_inlined) const { + if (innermost) { + for (auto it = inlined.begin(); it != inlined.end(); it++) { + const auto *f = it.key(); + 
all_inlined.insert(f, true); + } + } + + for (const auto& c : children) { + c->collect_all_inlined(all_inlined); + } +} + +bool Filter::enable_filter_printing() { + static bool enabled = ([]() -> bool { + std::string var = get_env_variable("ENABLE_FILTER_PRINTING"); + if (!var.empty()) { + return var == "1"; + } + return false; + })(); + return enabled; +} + +} // namespace Autoscheduler + +template<> +RefCount &ref_count(const Autoscheduler::LoopNest *t) noexcept { + return t->ref_count; +} + +template<> +void destroy(const Autoscheduler::LoopNest *t) { + delete t; +} + +} // namespace Internal +} // namespace Halide diff --git a/src/autoschedulers/anderson2021/LoopNest.h b/src/autoschedulers/anderson2021/LoopNest.h new file mode 100644 index 000000000000..de670214dc3b --- /dev/null +++ b/src/autoschedulers/anderson2021/LoopNest.h @@ -0,0 +1,582 @@ +/** This file defines the LoopNest, which is our + * representation of a Halide schedule, and contains methods to + * generate candidates for scheduling as well as extract a + * featurization that can be used to cost each candidate. 
*/ + +#ifndef LOOP_NEST_H +#define LOOP_NEST_H + +#include "FunctionDAG.h" +#include "GPUMemInfo.h" +#include "GPULoopInfo.h" +#include "PerfectHashMap.h" +#include "SearchSpaceOptions.h" +#include "Statistics.h" +#include "ThreadInfo.h" +#include "ASLog.h" +#include "Tiling.h" +#include +#include + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +template +using NodeMap = PerfectHashMap; + +template +using StageMap = PerfectHashMap; + +enum GPU_parallelism { block, thread, serial, simd, parallelized, none }; + +// inlined => func is inlined so has no memory store location +enum class GPUMemoryType { global, shared, local, registers, inlined }; + +bool may_subtile(); + +int64_t get_shared_memory_limit(); + +int64_t get_active_block_hardware_limit(); + +int64_t get_active_warp_hardware_limit(); + +constexpr int64_t get_register_mem_alloc_limit() { + return 128; +} + +int get_unroll_limit(const Target& target); + +bool in_range_zero_one(double x); + +bool are_valid_thread_extents(const vector& counts); + +double get_idle_lane_wastage_limit_env_var(); +double get_idle_lane_wastage_limit(); + +bool all(const vector& v); +bool accessed_at_constant_indices(const std::vector& unrolled, const FunctionDAG::Edge* e); + +// We're going to do a tree search over possible schedules to find an +// optimal one. A tree search requires a state, and a function that +// gives you children of the state (with costs). The following struct +// represents the state, which is a partial schedule. +// +// A partial schedule is a tree. Each node is some portion of the for +// loop nest of some Func. If there are no children, it's the +// innermost set of loops. If there are children, it's a loop over +// tiles of that Func. +struct LoopNest { + mutable RefCount ref_count; + + // The extents of this loop. Put another way, the number of tiles, + // not the size of each tile. 
+ vector size; + + // The nodes inside the loop body + vector> children; + + // Funcs inlined into this inner loop, and the number of times + // each is called. Only valid if children is empty. + NodeMap inlined; + + // Funcs stored inside this loop + std::set store_at; + + // The total bounds required of any given Func over all iterations + // of this loop. In the paper, this is represented using the + // little boxes to the left of the loop nest tree figures. + mutable NodeMap bounds; + + // The Func this loop nest belongs to + const FunctionDAG::Node *node = nullptr; + + // The stage of the Func + const FunctionDAG::Node::Stage *stage = nullptr; + + // Is this the innermost loop of this func (the SIMD loop)? + bool innermost = false; + + // Are we permitted to tile this loop? + bool tileable = false; + + // Is this the parallel outer loop? + bool parallel = false; + + // What dimension is this Func vectorized over, in terms of the pure args of the Func? + int vector_dim = -1; + + // Which loop corresponds to the innermost storage dimension and will be vectorized. -1 means none of them. 
+ int vectorized_loop_index = -1; + + // Apply gpu threads to this loop nest + mutable GPU_parallelism gpu_label = none; + + struct FeatureIntermediates { + double inlined_calls; + double num_vectors; + double num_scalars; + double vector_size; + double innermost_pure_loop_extent; + double outer_parallelism; + double num_warps_per_block; + double num_threads_per_block; + double points_computed_per_thread; + }; + + mutable std::map>> feature_intermediates; + mutable std::map> features; + + bool is_gpu_serial(const Target& target) const { + return target.has_gpu_feature() && gpu_label == serial; + } + + bool is_gpu_thread(const Target& target) const { + return target.has_gpu_feature() && gpu_label == thread; + } + + bool is_gpu_block(const Target& target) const { + return target.has_gpu_feature() && gpu_label == block; + } + + bool is_scalar() const { + return size.size() == 0; + } + + // given a newly inserted node f into this LoopNest, get union of thread counts in each dimension + // across all siblings of f. + vector get_union_thread_counts(const FunctionDAG::Node *f) const; + + // given a newly inserted node f into this LoopNest, gets the size of + // all of f's stages and their pure_dim indices + void get_stage_sizes(const FunctionDAG::Node *f, + vector> &stage_sizes, + vector> &pure_dims, + vector &vectorized_indices) const; + + // given the loop nest of a stage to parallelize at root, figure out if using odd tile sizes + // for the vectorized dimension will allow the resulting thread tiles to be multiples of 32 + // if so, we will include these in the serial loop sizes + void generate_vec_dim_serial_tilings(vector &serial_sizes) const; + + // get the loop nests of a newly inserted node, f, that is marked GPU threads. Tiles + // the newly inserted loop nests of f into a threads loop outside a serial loop. + // V is the vectorized dimension of f. Adds loopnests created from each tiling option in result. 
+ bool add_gpu_thread_tilings(const FunctionDAG::Node *f, + const MachineParams ¶ms, + const Target &target, + int v, + vector> &result, + vector max_size); + + void copy_from(const LoopNest &n); + void copy_from_including_features(const LoopNest &n); + + static void hash_combine(uint64_t &h, uint64_t next) { + // From boost + h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2)); + } + + // Hash the loop structure and sizes up to a fixed depth. This is + // used as the hash function for the coarse-to-fine beam search in + // the paper. + void structural_hash(uint64_t &h, int depth) const; + + // How many funcs are scheduled inside this loop level. Used in + // the structural hash. + size_t funcs_realized_or_inlined() const { + size_t count = inlined.size() + store_at.size(); + for (const auto &c : children) { + count += c->funcs_realized_or_inlined(); + } + return count; + } + + // All of a stage's interesting locations in the loop nest. Used to help compute the featurization of a stage. + struct Sites { + const LoopNest *compute = nullptr; // Its containing compute_at site + const LoopNest *store = nullptr; // Its containing store_at site + const LoopNest *produce = nullptr; // Its own outermost node + const LoopNest *innermost = nullptr; // Its innermost node - usually a SIMD loop + const LoopNest *task = nullptr; // The parallel for loop it belongs to + const LoopNest *thread = nullptr; // Its containing gpu_thread loop + GPUMemoryType gpu_store_memory_type; // global, local, shared? + int64_t allocation_size = 0; // Allocation size in bytes + bool is_constant_allocation = false; // Does the allocation have constant size? + int64_t num_realizations = 0; // Number of times this stage is realized. Only valid for unscheduled producers + bool inlined = false; // Is the Func inlined? + std::vector inlined_innermosts; // Is the Func inlined? 
+ uint64_t hash_of_producers_stored_at_root; + + bool is_stored_in_global_mem() const { return gpu_store_memory_type == GPUMemoryType::global; } + bool is_stored_in_shared_mem() const { return gpu_store_memory_type == GPUMemoryType::shared; } + bool is_stored_in_local_mem() const { return gpu_store_memory_type == GPUMemoryType::local; } + bool is_stored_in_registers() const { return gpu_store_memory_type == GPUMemoryType::registers; } + }; + + GPUMemoryType get_gpu_memory_type(bool in_block, bool in_thread, bool is_inlined=false) const; + + std::vector unrolled_loops(const Target& target, const LoopNest* parent, const LoopNest* grandparent) const; + + void get_allocs_that_can_be_promoted_to_registers(const Target &target, + StageMap &sites, + NodeMap &can_be_promoted_to_registers, + const LoopNest *grandparent, + const LoopNest *parent) const; + + bool promote_allocs_to_registers(const Target &target, StageMap &sites) const; + + // Compute all the sites of interest for each pipeline stage + void get_sites(const Target& target, + StageMap &sites, + StageMap &shared_mem_alloc_sizes, + const LoopNest *task = nullptr, + const LoopNest *parent = nullptr, + const LoopNest *current_thread_loop = nullptr) const; + + // A helper for the working_set_at_task feature. Most features are + // computed in the recursive pass 'compute_features' below, but + // this one must be done in a second separate recursive pass. 
+ void set_working_set_at_task_feature(int64_t working_set, + StageMap *features) const { + for (const auto &c : children) { + c->set_working_set_at_task_feature(working_set, features); + features->get(c->stage).working_set_at_task = working_set; + } + } + + bool exceeds_serial_extents_limit(const Target& target, const LoopNest* parent, bool in_threads_loop) const; + + bool node_has_dynamic_region_computed(const FunctionDAG::Node* f) const; + + bool has_dynamic_allocation_inside_thread(bool in_thread_loop) const; + + const LoopNest* find_pure_stage_loop_nest(const FunctionDAG::Node* node) const; + + int get_pure_stage_vectorized_loop_index(const FunctionDAG::Node* node) const; + + int get_vectorized_loop_index_from_pure_stage(const LoopNest& root) const; + + // Get the stride over "node's" storage for a unit increment in the vectorized loop's + // index + double storage_stride(const LoadJacobian& jac, int innermost_storage_dim, const FunctionDAG::Node* storage_node, const Bound& store_bounds, const LoopNest& root) const; + + Strides compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo& thread_info, bool verbose=false) const; + + bool all_strides_exist(const LoadJacobian& jac, const FunctionDAG::Node* storage_node, const LoopNest& root) const; + + int get_actual_vector_dim(const Bound &store_bounds) const; + + void compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo& global_mem_loads, SharedMemInfo& shared_mem_loads, LocalMemInfo& local_mem_loads, bool verbose=false) const; + + bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, const FunctionDAG::Node *accessed, int 
innermost_dim, int loop_index) const; + + bool can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType& mem_type) const; + + int vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType& mem_type, bool verbose=false) const; + + int vectorized_access_size(size_t loop_index, bool verbose=false) const; + + template + void compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose=false) const; + + std::pair compute_local_mem_store_features(const LoadJacobian& jac, int consumer_innermost_dim, const FunctionDAG::Node* node, const Bound& consumer_store_bounds, const LoopNest& root, double serial_loop_extents) const; + + template + MemInfoType compute_mem_store_info(const LoadJacobian& jac, int consumer_innermost_dim, const FunctionDAG::Node* node, const Bound& consumer_store_bounds, const ThreadInfo& thread_info, double serial_loop_extents, bool verbose) const; + + template + void compute_mem_load_features(const LoadJacobian& jac, int producer_innermost_dim, const FunctionDAG::Node* node, const Bound& producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo& thread_info, MemInfoType& mem_info, double serial_loop_extents, bool verbose=false) const; + + double compute_local_mem_stride(double stride, double bytes) const; + + // Assumes block, serial, thread or block, thread nesting + const LoopNest* get_enclosing_block(const LoopNest *parent, const LoopNest *grandparent) const; + + std::pair get_block_and_serial_extents(const LoopNest* block) const; + + bool all_paths_to_leaves_have_thread_loop() const; + + bool has_thread_loop_descendant() const; + + void 
compute_warp_features(ScheduleFeatures& features, const GPULoopInfo& gpu_loop_info) const; + + // Assume that when a block is active, all its warps are active + void compute_warp_and_block_occupancy(const MachineParams& params, ScheduleFeatures &feat, const GPULoopInfo& gpu_loop_info) const; + + void compute_shared_mem_occupancy(const Target& target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const; + + std::pair find_innermost_and_parent() const; + + int64_t points_accessed_per_thread(const MachineParams& params, const Target& target, const GPULoopInfo &gpu_loop_info, const std::vector& edge_chain, const LoadJacobian& jac, const LoopNest* parent, const LoopNest* grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian& serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType& mem_type, bool verbose=false) const; + + int64_t compute_licm_amortization(const LoopNest* innermost, const LoopNest* parent, const ScheduleFeatures& feat, const LoadJacobian& jac, int producer_dims) const; + + void memoize_points_computed_minimum(StageMap& memoized_features, const StageMap *features) const; + + vector> collect_producers(const StageMap &sites) const; + + uint64_t compute_hash_of_producers_stored_at_root(const StageMap &sites) const; + + void collect_stages(std::set& stages) const; + + void memoize_features(StageMap& memoized_features, const StageMap *features) const; + + void compute_working_set_from_features(int64_t *working_set, + const StageMap *features) const; + + void recompute_inlined_features(const StageMap &sites, StageMap *features) const; + + std::pair compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const; + + // Do a recursive walk over the loop nest computing features to feed the cost model. 
+ void compute_features(const FunctionDAG &dag, + const MachineParams ¶ms, + const Target& target, + const StageMap &sites, + int64_t instances, + int64_t parallelism, + const LoopNest *parent, + const LoopNest *grandparent, + const LoopNest &root, + int64_t *working_set, + int64_t *working_set_local_constant, + int64_t *working_set_local_dynamic, + StageMap *features, + GPULoopInfo gpu_loop_info, + bool use_memoized_features, + const StageMap &total_shared_mem_alloc_sizes, + Statistics& stats, + bool verbose=false) const; + + bool is_root() const { + // The root is the sole node without a Func associated with + // it. + return node == nullptr; + } + + // Set the region required of a Func at this site. + const Bound &set_bounds(const FunctionDAG::Node *f, BoundContents *b) const { + return bounds.emplace(f, b); + } + + // Get the region required of a Func at this site, from which we + // know what region would be computed if it were scheduled here, + // and what its loop nest would be. + const Bound &get_bounds(const FunctionDAG::Node *f) const; + + // Get the region required of a Func at this site (but only to satisfy the + // consumers along the given edge chain), from which we know what region + // would be computed if it were scheduled here and what its loop nest + // would be. + const Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector& edge_chain) const; + + void dump() const; + + std::string to_string() const; + + // Recursively print a loop nest representation to stderr + template + void dump(T& stream, string prefix, const LoopNest *parent) const; + + // Does this loop nest access the given Func + bool calls(const FunctionDAG::Node *f) const; + + // What is the maximum number of inlined calls to a Func that + // occur within this loop. Used to prune states that would + // generate too much code. + int64_t max_inlined_calls() const; + + // Does this loop nest access an input buffer? 
Used to select + // tail strategies when splitting loops. We don't want to read + // out of bounds on inputs, even if we don't intend to use the + // values read. It could create annoying assertion failures for + // the user. It's OK to read out of range of the values computed + // on internal Funcs though. Allocation bounds inference just pads + // out the bounds so that it won't fault. + bool accesses_input_buffer() const; + + // Does this loop nest contain a computation of the given Func. + bool computes(const FunctionDAG::Node *f) const; + + // Above here most methods query the loop nest. Below we have + // methods that mutate the loop nest. + + // Inline a Func into all consumers within this loop. + void inline_func(const FunctionDAG::Node *f); + + // Compute a Func at this site. + bool compute_here(const FunctionDAG::Node *f, + bool tileable, + int v, + bool in_threads_loop, + const Target &target); + + // Parallelize this loop according to the given tiling. + IntrusivePtr parallelize_in_tiles(const MachineParams &params, + const vector &tiling, + const LoopNest *parent, + const Target& target, + bool inner_tiling, + bool adjust_tiling, + bool move_all_rvars_inward=true, + const vector &rvars_to_move_inward={}) const; + + int64_t get_total_local_mem_alloc_size(bool constant_allocs_only=false, bool in_threads_loop=false) const; + int64_t get_total_constant_local_mem_alloc_size() const; + + // All store ats further in than the block level must be fixed + // sized allocations. This method checks if f will require a dynamic + // allocation + bool requires_dynamic_allocation(const FunctionDAG::Node *f, const Target &target, bool in_threads_loop) const; + + // Return all possible ways to compute f in tiles somewhere within + // this loop nest.
+ // in_threads_loop tracks whether or not function is going to be placed inside a + // loop marked gpu_threads, in which case f's loops cannot be gpu_threads + vector> compute_in_tiles(const FunctionDAG::Node *f, + const LoopNest *parent, + const MachineParams ¶ms, + const Target &target, + const SearchSpaceOptions &search_space_options, + int v, + bool in_realization, + bool in_threads_loop, + bool is_pre_pass, + vector union_counts=vector()) const; + + // Below here we have methods that apply a schedule to a Halide pipeline. + + // A model of the state of the loop nest of a Func while applying + // Halide's scheduling directives. + + // Note that StageScheduleState is movable-but-not-copyable thanks to its ostringstream member. + struct StageScheduleState { + // How much parallelism do we need to exploit with this Func? + double num_cores = 0; + + // Which storage dimension is vectorized? We need to reorder it innermost + int vector_dim = -1; + int vectorized_loop_index = -1; + + // The various Vars and RVars used for scheduling a Func. + struct FuncVar { + // The top-level var or rvar this was split off from + VarOrRVar orig; + + // This var. + VarOrRVar var; + + // Source code to access this Var/RVar. Used for printing + // valid Halide source for this schedule. + string accessor; + + // Our estimate of the extent of this var. This is exact + // when constant_extent flag is true. + int64_t extent = 0; + + // Which index in the symbolic loop nest does this var + // belong to. + size_t index = 0; + + // Some flags. 
+ bool innermost_pure_dim = false, + outermost = false, + parallel = false, + exists = false, + pure = false, + constant_extent = false; + + bool vectorized = false; + bool gpu_threads = false; + + FuncVar() : orig(Var()), var(Var()) {} + }; + const FunctionDAG::Node* node; + const FunctionDAG::Node::Stage* stage; + bool parallel = false; + bool vectorized = false; + bool all_innermost_unrolled = false; + FuncVar vectorized_var; + + // In order from innermost to outermost. Each group of d is one tiling level. + vector vars; + + // In order from innermost to outermost. Each group of d is one tiling level. + vector ordered_vars; + vector gpu_thread_extents; + + NodeMap>>> producers_to_be_staged; + + // From outermost in + vector ancestors; + + std::ostringstream schedule_source; + }; + + bool has_constant_region_computed(const FunctionDAG::Node* node) const; + bool has_constant_region_required(const FunctionDAG::Node* node) const; + bool other_stage_has_same_producer(const FunctionDAG::Node* producer) const; + int num_serial_loops(const FunctionDAG::Node::Stage* stage) const; + int num_serial_loops() const; + bool producer_computed_here_or_further_in(const FunctionDAG::Node* producer) const; + + void update_producers_to_be_staged(StageScheduleState& state, const NodeMap& all_inlined) const; + bool region_computed_shrinks(const FunctionDAG::Node *f, const LoopNest *parent) const; + + // Apply the schedule represented by this loop nest to a Halide pipeline. 
+ void apply(LoopLevel here, + StageMap> &state_map, + double num_cores, + int depth, + const LoopNest *parent, + const LoopNest *compute_site, + const Target& target, + std::vector& ancestors, + const NodeMap& all_inlined) const; + + double max_idle_lane_wastage(const Target& target, GPULoopInfo gpu_loop_info) const; + + bool has_valid_thread_extents() const; + + void collect_nodes_that_should_be_inlined(const NodeMap& nodes_to_freeze, NodeMap& inlined_nodes) const; + + void collect_all_inlined(NodeMap& all_inlined) const; + + int64_t product_of_self_and_descendants(int loop_index) const; + int64_t product_of_descendants(int loop_index) const; + + void get_stages_computed_in_each_compute_root_loop(StageMap> &descendants, const LoopNest *compute_root_loop_nest=nullptr) const; +}; + +struct Filter { + const LoopNest* loop_nest; + bool logging = false; + + Filter(const LoopNest* loop_nest) + : loop_nest{loop_nest} + , logging{enable_filter_printing()} + { + if (logging) { + std::cerr << "\nState filtered: \n"; + loop_nest->dump(); + std::cerr << "Reason: "; + } + } + + template + Filter &operator<<(T &&x) { + if (logging) { + std::cerr << std::forward(x); + } + return *this; + } + + static bool enable_filter_printing(); +}; + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide + +#endif // LOOP_NEST_H diff --git a/src/autoschedulers/anderson2021/LoopNestParser.h b/src/autoschedulers/anderson2021/LoopNestParser.h new file mode 100644 index 000000000000..799012d6cf20 --- /dev/null +++ b/src/autoschedulers/anderson2021/LoopNestParser.h @@ -0,0 +1,212 @@
+#ifndef LOOP_NEST_PARSER_H
+#define LOOP_NEST_PARSER_H
+
+#include <fstream>
+#include <iterator>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "ASLog.h"
+#include "FunctionDAG.h"
+
+namespace Halide {
+namespace Internal {
+namespace Autoscheduler {
+
+// Parses a textual loop nest, given as one string per line, into per-stage
+// tables so that two loop nests can be compared stage by stage.
+// NOTE(review): the input looks like the text printed by LoopNest::dump;
+// confirm against the producer of these lines.
+class LoopNestParser {
+    // Fill the per-stage tables from the given lines.
+    //
+    // Line forms handled below:
+    //  - empty lines, whitespace-only lines and lines starting with '#' are
+    //    skipped;
+    //  - "realize: <stage>" and "inlined: <stage>" name the stage in their
+    //    second token; on any other line the first token is the stage name;
+    //  - a last token of "gpu_none" marks the stage as partially scheduled;
+    //  - a line that does not start with a space records the stage in
+    //    compute_root_stages (initially with vector dim -1);
+    //  - a last token of "gpu_simd" on such a stage supplies its vector
+    //    dim, read from the third-to-last token with its final character
+    //    dropped.
+    //
+    // Malformed lines surface as exceptions from the bounds-checked
+    // accessors (std::out_of_range) or from std::stoi.
+    void parse(const std::vector<std::string>& loop_nest) {
+        std::unordered_map<std::string, std::vector<std::string>> stage_to_loop_nest;
+        for (const auto& line : loop_nest) {
+            if (line.empty()) {
+                continue;
+            }
+
+            if (line.at(0) == '#') {
+                continue;
+            }
+
+            std::istringstream iss(line);
+            std::vector<std::string> tokens{
+                std::istream_iterator<std::string>(iss),
+                std::istream_iterator<std::string>()
+            };
+
+            // A whitespace-only line yields no tokens; treat it like an
+            // empty line instead of throwing from tokens.at(0).
+            if (tokens.empty()) {
+                continue;
+            }
+
+            std::string stage = tokens.at(0);
+            bool is_inlined = tokens.at(0) == "inlined:";
+
+            if (tokens.at(0) == "realize:" || is_inlined) {
+                stage = tokens.at(1);
+            }
+
+            if (stage == "gpu_none") {
+                continue;
+            }
+
+            all_stages.insert(stage);
+
+            if (is_inlined) {
+                inlined.insert(stage);
+                continue;
+            }
+
+            if (tokens.back() == "gpu_none") {
+                partially_scheduled.insert(stage);
+            }
+
+            if (line.at(0) != ' ' && compute_root_stages.count(stage) == 0) {
+                compute_root_stages[stage] = -1;
+            }
+
+            if (tokens.back() == "gpu_simd" && compute_root_stages.count(stage) == 1 && compute_root_stages[stage] == -1) {
+                // Bounds-checked on purpose: with fewer than three tokens
+                // the index underflows, and at() throws std::out_of_range
+                // where operator[] would read out of bounds.
+                std::string vector_dim = tokens.at(tokens.size() - 3);
+                compute_root_stages[stage] = std::stoi(vector_dim.substr(0, vector_dim.size() - 1));
+            }
+
+            if (partially_scheduled.count(stage) == 0) {
+                stage_to_loop_nest[stage].push_back(line);
+            }
+        }
+
+        // Join each stage's lines into a single newline-terminated string.
+        for (const auto& entry : stage_to_loop_nest) {
+            std::string joined;
+            for (const auto& line : entry.second) {
+                joined += line + "\n";
+            }
+
+            per_stage_loop_nests[entry.first] = joined;
+        }
+
+        // If a stage appears in a 'realize: ' line but nowhere else, remove it
+        std::vector<std::string> to_remove;
+        for (const auto& entry : compute_root_stages) {
+            if (entry.second == -1) {
+                to_remove.push_back(entry.first);
+            }
+        }
+
+        for (const auto& key : to_remove) {
+            compute_root_stages.erase(key);
+            partially_scheduled.erase(key);
+            all_stages.erase(key);
+            per_stage_loop_nests.erase(key);
+        }
+    }
+
+    // The unparsed input lines, kept for dump().
+    std::vector<std::string> loop_nest;
+    // Stage name -> that stage's lines joined with '\n'.
+    std::unordered_map<std::string, std::string> per_stage_loop_nests;
+    // Stages named on an "inlined:" line.
+    std::unordered_set<std::string> inlined;
+    // Stages that had a line ending in "gpu_none".
+    std::unordered_set<std::string> partially_scheduled;
+    // Stage name -> vector dim, for stages with a line not starting with a space.
+    std::unordered_map<std::string, int> compute_root_stages;
+    // Every stage name seen (minus those removed at the end of parse()).
+    std::unordered_set<std::string> all_stages;
+
+public:
+    LoopNestParser(const std::vector<std::string>& loop_nest)
+        : loop_nest{loop_nest}
+    {
+        parse(loop_nest);
+    }
+
+    // Print every parsed table, then the original lines, via aslog(0).
+    void dump() const {
+        aslog(0) << "All stages:\n";
+        for (const auto& s : all_stages) {
+            aslog(0) << s << "\n";
+        }
+
+        aslog(0) << "\ncompute_root stages:\n";
+        for (const auto& s : compute_root_stages) {
+            aslog(0) << s.first << " with vector_dim = " << s.second << "\n";
+        }
+
+        aslog(0) << "\nPartially scheduled stages:\n";
+        for (const auto& s : partially_scheduled) {
+            // A partially scheduled stage need not have a compute_root
+            // entry (parse() only records one for lines that do not start
+            // with a space), so look it up without throwing and print the
+            // -1 "unset" value when it is absent.
+            const auto it = compute_root_stages.find(s);
+            const int vector_dim = it == compute_root_stages.end() ? -1 : it->second;
+            aslog(0) << s << " with vector_dim = " << vector_dim << "\n";
+        }
+
+        aslog(0) << "\nInlined stages:\n";
+        for (const auto& s : inlined) {
+            aslog(0) << s << "\n";
+        }
+
+        aslog(0) << "\nFull loop nest:\n";
+        for (const auto& s : loop_nest) {
+            aslog(0) << s << "\n";
+        }
+        aslog(0) << "\n";
+    }
+
+    // True if node is non-null and its Func's name is one of the parsed stages.
+    bool is_in_partial_schedule(const FunctionDAG::Node* node) const {
+        return node && all_stages.count(node->func.name()) > 0;
+    }
+
+    // contains_sub_loop_nest() restricted to the stages both parsers know.
+    bool contains_sub_loop_nest_for_shared_stages(const LoopNestParser& other) const {
+        return contains_sub_loop_nest(other, true);
+    }
+
+    // Check that every stage of 'other' is scheduled the same way here.
+    //
+    // 'only_consider_shared_stages': check if 'other' is contained in this loop
+    // nest, but ignore stages that are present in 'other' but not present in
+    // this loop nest
+    bool contains_sub_loop_nest(const LoopNestParser& other, bool only_consider_shared_stages=false) const {
+        for (const auto& stage : other.all_stages) {
+            if (all_stages.count(stage) == 0) {
+                if (only_consider_shared_stages) {
+                    continue;
+                }
+                return false;
+            }
+
+            if (other.partially_scheduled.count(stage) == 1) {
+                if (compute_root_stages.count(stage) == 0) {
+                    return false;
+                }
+
+                // Only a mismatch decides the answer. On a match, keep
+                // checking the remaining stages: returning here would make
+                // the result depend on unordered_set iteration order.
+                if (other.compute_root_stages.at(stage) != compute_root_stages.at(stage)) {
+                    return false;
+                }
+                continue;
+            }
+
+            if (other.inlined.count(stage) > 0) {
+                if (inlined.count(stage) == 0) {
+                    return false;
+                }
+                continue;
+            } else if (inlined.count(stage) > 0) {
+                return false;
+            }
+
+            if (other.per_stage_loop_nests.at(stage) != per_stage_loop_nests.at(stage)) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    // Build a parser from a string holding one loop per line.
+    static LoopNestParser from_string(const std::string& str) {
+        std::istringstream in(str);
+        std::string line;
+        std::vector<std::string> loop_nest;
+
+        while (std::getline(in, line)) {
+            loop_nest.push_back(line);
+        }
+
+        return LoopNestParser(loop_nest);
+    }
+
+    // Build a parser from the lines of the named file.
+    // NOTE(review): a file that cannot be opened yields no lines and thus an
+    // empty parser rather than an error; confirm callers expect that.
+    static std::unique_ptr<LoopNestParser> from_file(const std::string& filename) {
+        std::ifstream file(filename);
+        std::string line;
+        std::vector<std::string> loop_nest;
+
+        while (std::getline(file, line)) {
+            loop_nest.push_back(line);
+        }
+
+        return std::make_unique<LoopNestParser>(loop_nest);
+    }
+};
+
+} // namespace Autoscheduler
+} // namespace Internal
+} // namespace Halide
+
+#endif
diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile new file mode 100644 index 000000000000..0e54bccbabca --- /dev/null +++ b/src/autoschedulers/anderson2021/Makefile @@ -0,0 +1,292 @@ +THIS_MAKEFILE = $(realpath $(filter %Makefile, $(MAKEFILE_LIST))) +SRC = $(strip $(shell dirname $(THIS_MAKEFILE))) +HALIDE_SRC_ROOT = $(realpath $(SRC)/../../../) +COMMON_DIR ?= $(realpath $(SRC)/../common/) + +HALIDE_DISTRIB_PATH ?= $(HALIDE_SRC_ROOT)/distrib +HL_TARGET ?= host-cuda + +$(info Looking for Halide distro at $(HALIDE_DISTRIB_PATH).
If this is incorrect, set the make variable HALIDE_DISTRIB_PATH) + +# Don't include an autoscheduler in the generator deps +AUTOSCHEDULER= +include $(HALIDE_SRC_ROOT)/apps/support/Makefile.inc + +# Add the relative location of libHalide.so in the rpath in a distro +ifeq ($(UNAME), Darwin) +HALIDE_RPATH_FOR_BIN = '-Wl,-rpath,@executable_path/../lib' +HALIDE_RPATH_FOR_LIB = '-Wl,-rpath,@loader_path' +else +HALIDE_RPATH_FOR_BIN = '-Wl,-rpath,$$ORIGIN/../lib' +HALIDE_RPATH_FOR_LIB = '-Wl,-rpath,$$ORIGIN' +endif + +CXXFLAGS += -I$(COMMON_DIR) + +ENABLE_DEBUG_OUTPUT ?= false + +AUTOSCHED_SAMPLES_OUT ?= $(SRC)/samples + +AUTOSCHED_WEIGHT_OBJECTS=$(BIN)/baseline_weights.o + +$(BIN)/binary2cpp: $(HALIDE_SRC_ROOT)/tools/binary2cpp.cpp + @mkdir -p $(@D) + $(CXX) $< -o $@ + +$(BIN)/baseline_weights.cpp: $(BIN)/binary2cpp $(SRC)/baseline.weights + @mkdir -p $(@D) + $(BIN)/binary2cpp baseline_weights < $(SRC)/baseline.weights > $@ + +$(BIN)/baseline_weights.o: $(BIN)/baseline_weights.cpp + $(CXX) -c $< -o $@ + +AUTOSCHED_COST_MODEL_LIBS=\ +$(BIN)/cost_model/cost_model.a \ +$(BIN)/cost_model/train_cost_model.a \ + +$(BIN)/cost_model.generator: $(SRC)/cost_model_generator.cpp \ + $(SRC)/cost_model_schedule.h \ + $(SRC)/NetworkSize.h \ + $(GENERATOR_DEPS) + @mkdir -p $(@D) + $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(USE_EXPORT_DYNAMIC) + +$(BIN)/auto_schedule_runtime.a: $(BIN)/cost_model.generator + @mkdir -p $(@D) + $^ -r auto_schedule_runtime -o $(BIN) target=$(HL_TARGET) + +$(BIN)/cost_model/%.a: $(BIN)/cost_model.generator + @mkdir -p $(@D) + $^ -g $* -o $(BIN)/cost_model -f $* target=$(HL_TARGET)-no_runtime auto_schedule=false enable_debug_output=$(ENABLE_DEBUG_OUTPUT) -e stmt,static_library,h,assembly + +# It's important to use dynamic lookups for undefined symbols here: all of libHalide +# is expected to be present (in the loading binary), so we explicitly make the symbols +# undefined rather than dependent on libHalide.so. 
# The autoscheduler plugin itself. Headers appear in the prerequisite list only
# so that edits to them trigger a rebuild; the filter-out in the recipe removes
# them (and anything in $(LIBHALIDE_LDFLAGS)) from the compile/link line.
$(BIN)/libautoschedule_anderson2021.$(SHARED_EXT): $(SRC)/AutoSchedule.cpp \
    $(SRC)/AutoSchedule.h \
    $(SRC)/ASLog.cpp \
    $(SRC)/DefaultCostModel.h \
    $(SRC)/DefaultCostModel.cpp \
    $(SRC)/Weights.h \
    $(SRC)/Weights.cpp \
    $(SRC)/FunctionDAG.h \
    $(SRC)/FunctionDAG.cpp \
    $(SRC)/LoopNest.h \
    $(SRC)/LoopNest.cpp \
    $(SRC)/LoopNestParser.h \
    $(SRC)/GPUMemInfo.h \
    $(SRC)/GPULoopInfo.h \
    $(SRC)/GPULoopInfo.cpp \
    $(SRC)/ThreadInfo.h \
    $(SRC)/Featurization.h \
    $(SRC)/CostModel.h \
    $(SRC)/PerfectHashMap.h \
    $(SRC)/SearchSpace.h \
    $(SRC)/SearchSpace.cpp \
    $(SRC)/SearchSpaceOptions.h \
    $(SRC)/State.h \
    $(SRC)/State.cpp \
    $(SRC)/Statistics.h \
    $(SRC)/Tiling.h \
    $(SRC)/Tiling.cpp \
    $(AUTOSCHED_WEIGHT_OBJECTS) \
    $(AUTOSCHED_COST_MODEL_LIBS) \
    $(GENERATOR_DEPS) \
    $(BIN)/auto_schedule_runtime.a
	@mkdir -p $(@D)
	$(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) -I $(BIN)/cost_model $(filter-out %.h $(LIBHALIDE_LDFLAGS),$^) -o $@ $(HALIDE_SYSTEM_LIBS)

# Stand-alone tool that retrains the cost model; links the same cost-model
# libraries and embedded weights as the plugin.
# NOTE(review): `-I ../support` is relative to the directory make runs in --
# confirm this is always the autoscheduler source directory.
$(BIN)/retrain_cost_model: $(SRC)/retrain_cost_model.cpp \
    $(SRC)/ASLog.cpp \
    $(SRC)/DefaultCostModel.h \
    $(SRC)/DefaultCostModel.cpp \
    $(SRC)/Weights.h \
    $(SRC)/Weights.cpp \
    $(SRC)/CostModel.h \
    $(SRC)/NetworkSize.h \
    $(AUTOSCHED_COST_MODEL_LIBS) \
    $(AUTOSCHED_WEIGHT_OBJECTS) \
    $(BIN)/auto_schedule_runtime.a
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) -frtti -Wall -I ../support -I $(BIN)/cost_model $(OPTIMIZE) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(USE_OPEN_MP)

# Small helper binaries; the autotune target below depends on the first two.
$(BIN)/featurization_to_sample: $(SRC)/featurization_to_sample.cpp
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) $< $(OPTIMIZE) -o $@

$(BIN)/get_host_target: $(SRC)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE_DISTRIB_PATH)/include/Halide.h
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) $(LIBHALIDE_LDFLAGS) $(OPTIMIZE) -o $@

$(BIN)/weightsdir_to_weightsfile: $(SRC)/weightsdir_to_weightsfile.cpp $(SRC)/Weights.cpp
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@

# This is the value that machine_params defaults to if no custom value is specified;
# see MachineParams::generic()
HL_MACHINE_PARAMS ?= 80,25165824,160

# A sample generator to autoschedule. Note that if it statically links
# to libHalide, then it must be built with $(USE_EXPORT_DYNAMIC), or the
# autoscheduler can't find the libHalide symbols that it needs.
$(GENERATOR_BIN)/demo.generator: demo_generator.cpp $(GENERATOR_DEPS)
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) -g $(filter %.cpp,$^) -o $@ $(LIBHALIDE_LDFLAGS)

# To use the autoscheduler, set a few environment variables and use the -p flag
# to the generator to load the autoscheduler as a plugin. HL_WEIGHTS_DIR is set
# only for the generator invocation (the two recipe lines are one shell command).
$(BIN)/%/demo.a: $(GENERATOR_BIN)/demo.generator $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT)
	@mkdir -p $(@D)
	HL_WEIGHTS_DIR=$(SRC)/baseline.weights \
	$(GENERATOR_BIN)/demo.generator -g demo -o $(@D) -f demo target=$* auto_schedule=true -p $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) -s Anderson2021

$(BIN)/%/demo.rungen: $(BIN)/%/RunGenMain.o $(BIN)/%/demo.registration.cpp $(BIN)/%/demo.a
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) -I$(BIN)/$* $^ -o $@ $(HALIDE_SYSTEM_LIBS) $(IMAGE_IO_FLAGS)

# demonstrates single-shot use of the autoscheduler
demo: $(BIN)/$(HL_TARGET)/demo.rungen $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT)
	$< --benchmarks=all --benchmark_min_time=1 --estimate_all

# demonstrates an autotuning loop
# (using $(BIN) and $(SRC) here seems overkill, but makes copy-n-paste elsewhere easier)
# Note that this run overrides HL_MACHINE_PARAMS for the script's environment.
autotune: $(GENERATOR_BIN)/demo.generator $(BIN)/featurization_to_sample $(BIN)/get_host_target $(BIN)/retrain_cost_model $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) $(SRC)/autotune_loop.sh
	HL_MACHINE_PARAMS=80,1,1 \
	SAMPLES_DIR=test_autotuned_samples \
	bash $(SRC)/autotune_loop.sh \
		$(GENERATOR_BIN)/demo.generator \
		demo \
		"" \
		$(SRC)/baseline.weights \
		$(BIN) \
		0

# Stand-alone unit test for PerfectHashMap; needs nothing from libHalide.
$(BIN)/test_perfect_hash_map: test_perfect_hash_map.cpp PerfectHashMap.h
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) $< -o $@

# Unit test for FunctionDAG; headers are dependencies only and are filtered out
# of the compile line.
$(BIN)/test_function_dag: test_function_dag.cpp FunctionDAG.h FunctionDAG.cpp ASLog.h ASLog.cpp
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS)

# Simple jit-based test
$(BIN)/%/test: test.cpp $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT)
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $^ -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS)

# Runner targets: each one simply executes the binary it depends on.
test_perfect_hash_map: $(BIN)/test_perfect_hash_map
	$^

test_function_dag: $(BIN)/test_function_dag
	$^

run_test: $(BIN)/$(HL_TARGET)/test
	HL_WEIGHTS_DIR=$(SRC)/baseline.weights LD_LIBRARY_PATH=$(BIN) $<

# Every target in this Makefile that names an action rather than a file it
# produces. Without this, a file or directory with the same name (there is a
# test/ directory next to this Makefile) makes the target look up to date and
# its recipe is silently skipped.
.PHONY: build test clean demo autotune run_test \
    test_perfect_hash_map test_function_dag \
    test_bounds test_tiling test_storage_strides test_parser test_state \
    included_schedule_file

# Note that when running the *test*, we want to ensure that we generate samples
# to a subdir of $(BIN), so that they don't get inadvertently generated into
# our source tree. (Normally we want samples/ to be retained, to avoid data loss;
# for the test target, however, it's imperative it go into a transitory directory,
# to avoid eventually consuming all disk space on the buildbot...)
# Target-specific variable: applies only while building `test` and its
# prerequisites.
test: AUTOSCHED_SAMPLES_OUT = $(BIN)/test_samples_out

# Note that 'make build' and 'make test' are used by Halide buildbots
# to spot-check changes, so it's important to try a little of each of
# the important paths here, including single-shot and autotune-loop
build: $(BIN)/$(HL_TARGET)/test \
    $(BIN)/test_perfect_hash_map \
    $(BIN)/test_function_dag \
    $(BIN)/$(HL_TARGET)/included_schedule_file.rungen \
    $(GENERATOR_BIN)/demo.generator \
    $(BIN)/featurization_to_sample \
    $(BIN)/get_host_target \
    $(BIN)/retrain_cost_model \
    $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT)

test: test_bounds test_tiling test_storage_strides test_parser test_state run_test test_perfect_hash_map test_function_dag demo included_schedule_file autotune

# Directory holding the unit tests built below.
TEST_DIR=$(SRC)/test

# Each unit test is compiled together with the scheduler sources it exercises;
# headers are listed for dependency tracking and filtered out of the compile
# line. The matching runner target just executes the resulting binary.
$(BIN)/test_bounds: $(TEST_DIR)/bounds.cpp LoopNest.h LoopNest.cpp FunctionDAG.cpp FunctionDAG.h ASLog.h ASLog.cpp GPULoopInfo.cpp GPULoopInfo.h GPUMemInfo.h Tiling.h Tiling.cpp
	@mkdir -p $(@D)
	$(CXX) $(OPTIMIZE) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) -I$(SRC)

test_bounds: $(BIN)/test_bounds
	$^

$(BIN)/test_tiling: $(TEST_DIR)/tiling.cpp Tiling.h Tiling.cpp
	@mkdir -p $(@D)
	$(CXX) $(OPTIMIZE) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) -I$(SRC)

test_tiling: $(BIN)/test_tiling
	$^

$(BIN)/test_storage_strides: $(TEST_DIR)/storage_strides.cpp LoopNest.h LoopNest.cpp FunctionDAG.cpp FunctionDAG.h ASLog.h ASLog.cpp GPULoopInfo.cpp GPULoopInfo.h GPUMemInfo.h Tiling.h Tiling.cpp
	@mkdir -p $(@D)
	$(CXX) $(OPTIMIZE) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) -I$(SRC)

test_storage_strides: $(BIN)/test_storage_strides
	$^

$(BIN)/test_parser: $(TEST_DIR)/parser.cpp LoopNestParser.h ASLog.h ASLog.cpp
	@mkdir -p $(@D)
	$(CXX) $(OPTIMIZE) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) -I$(SRC)

test_parser: $(BIN)/test_parser
	$^

$(BIN)/test_state: $(TEST_DIR)/state.cpp State.h State.cpp LoopNest.h LoopNest.cpp FunctionDAG.cpp FunctionDAG.h ASLog.h ASLog.cpp GPULoopInfo.cpp GPULoopInfo.h GPUMemInfo.h Tiling.h Tiling.cpp
	@mkdir -p $(@D)
	$(CXX) $(OPTIMIZE) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) -I$(SRC)

test_state: $(BIN)/test_state
	$^

# Removes the whole build output directory.
clean:
	rm -rf $(BIN)

# A sample generator to demonstrate including autogenerated .schedule.h
# files for scheduling purposes; the catch here is that we'll need
# to be able to compile the Generator two different ways:
#
#   - one that will be used to generate the .schedule.h
#   - one that will consume the .schedule.h generated above
#
# We'll use the preprocessor (GENERATING_SCHEDULE) to distinguish between these two.

$(GENERATOR_BIN)/included_schedule_file_none.generator: included_schedule_file_generator.cpp $(GENERATOR_DEPS)
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) -DGENERATING_SCHEDULE -g $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS)

# This is the target you build to (re)generate the schedule file.
# (Note that we only need the schedule output, so we pass `-e schedule` to
# the Generator so that it can skip producing other outputs.)
$(BIN)/%/included_schedule_file.schedule.h: $(GENERATOR_BIN)/included_schedule_file_none.generator $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT)
	@mkdir -p $(@D)
	HL_WEIGHTS_DIR=$(SRC)/baseline.weights \
	$< -g included_schedule_file -o $(@D) -f included_schedule_file target=$* auto_schedule=true -p $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) -s Anderson2021 -e schedule

# Note that this depends on included_schedule_file.schedule.h rather than $(BIN)/%/included_schedule_file.schedule.h --
# the former should be generated by something like
#
#    make bin/host/included_schedule_file.schedule.h
#    cp bin/host/included_schedule_file.schedule.h included_schedule_file.schedule.h
#
# (The header is a prerequisite only; the filter-out keeps it off the compile line.)
$(GENERATOR_BIN)/included_schedule_file.generator: included_schedule_file_generator.cpp included_schedule_file.schedule.h $(GENERATOR_DEPS)
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) -g $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS)

# Note that this does not depend on libautoschedule_anderson2021 nor does it call
# the autoscheduler at build time; it includes the generated schedule (included_schedule_file.schedule.h),
# which has been added to our local source control.
+$(BIN)/%/included_schedule_file.a: $(GENERATOR_BIN)/included_schedule_file.generator + @mkdir -p $(@D) + $< -g included_schedule_file -o $(@D) -f included_schedule_file target=$* + +$(BIN)/%/included_schedule_file.rungen: $(BIN)/%/RunGenMain.o $(BIN)/%/included_schedule_file.registration.cpp $(BIN)/%/included_schedule_file.a + @mkdir -p $(@D) + $(CXX) $(CXXFLAGS) -I$(BIN)/$* $^ -o $@ $(HALIDE_SYSTEM_LIBS) $(IMAGE_IO_FLAGS) + +included_schedule_file: $(BIN)/$(HL_TARGET)/included_schedule_file.rungen + $^ --benchmarks=all --benchmark_min_time=1 --estimate_all diff --git a/src/autoschedulers/anderson2021/NetworkSize.h b/src/autoschedulers/anderson2021/NetworkSize.h new file mode 100644 index 000000000000..fa8f5566110f --- /dev/null +++ b/src/autoschedulers/anderson2021/NetworkSize.h @@ -0,0 +1,12 @@ +#ifndef HALIDE_NETWORK_SIZE_H +#define HALIDE_NETWORK_SIZE_H + +namespace Halide { +// The size of the best cost model network found. Needed by the cost +// model and also the cost model training script. 
// NOTE: the source lines this block replaces also carry patch text that belongs
// to the neighbouring files. It is retained verbatim here, as comments:
//
//   +const int head1_channels = 8, head1_w = 40, head1_h = 7;
//   +const int head2_channels = 24, head2_w = 73;
//   +const int conv1_channels = 32;  // Only 30 are used (needs to be a multiple of 8 for vectorization in cost_model_generator.cpp)
//   +}  // namespace Halide
//   +
//   +#endif  // HALIDE_NETWORK_SIZE_H
//   diff --git a/src/autoschedulers/anderson2021/PerfectHashMap.h b/src/autoschedulers/anderson2021/PerfectHashMap.h
//   new file mode 100644
//   index 000000000000..c6315a7009e8
//   --- /dev/null
//   +++ b/src/autoschedulers/anderson2021/PerfectHashMap.h
//   @@ -0,0 +1,415 @@

#ifndef PERFECT_HASH_MAP_H
#define PERFECT_HASH_MAP_H

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <utility>
#include <vector>

// Avoid a dependence on libHalide by defining a local variant we can use.
//
// Usage: `PerfectHashMapAsserter(cond) << "message" << value;`
// When `cond` is false, everything streamed in is written to std::cerr and the
// process exits with status -1 when the temporary is destroyed (i.e. at the end
// of the full expression). When `cond` is true it does nothing.
struct PerfectHashMapAsserter {
    const bool c;

    PerfectHashMapAsserter(bool c)
        : c(c) {
    }

    template<typename T>
    PerfectHashMapAsserter &operator<<(T &&t) {
        if (!c) {
            std::cerr << t;
        }
        return *this;
    }

    ~PerfectHashMapAsserter() {
        if (!c) {
            std::exit(-1);
        }
    }
};

// A specialized hash map used in the autoscheduler. It can only grow,
// and it requires a perfect hash in the form of "id" and "max_id"
// fields on each key. If the keys don't all have a consistent max_id,
// or if you call make_large with the wrong max_id, you get UB. If you
// think that might be happening, uncomment the assertions below for
// some extra checking.
//
// Template parameters:
//   K              - key type; keys are passed as `const K *` and must expose
//                    integer-like `id` and `max_id` members.
//   T              - mapped type; must be default-constructible and movable.
//   max_small_size - number of entries held in the linear-scan "small" state
//                    before the map switches to id-indexed storage.
//   phm_assert     - asserter type constructed as `phm_assert(cond) << ...`.
template<typename K, typename T, int max_small_size = 4, typename phm_assert = PerfectHashMapAsserter>
class PerfectHashMap {

    using storage_type = std::vector<std::pair<const K *, T>>;

    storage_type storage;

    // Number of buckets whose key pointer is non-null.
    int occupied = 0;

    // Equivalent to storage[i], but broken out into a separate method
    // to allow for bounds checks when debugging this.
    std::pair<const K *, T> &storage_bucket(int i) {
        /*
        phm_assert(i >= 0 && i < (int)storage.size())
            << "Out of bounds access: " << i << " " << storage.size() << "\n";
        */
        return storage[i];
    }

    const std::pair<const K *, T> &storage_bucket(int i) const {
        /*
        phm_assert(i >= 0 && i < (int)storage.size())
            << "Out of bounds access: " << i << " " << storage.size() << "\n";
        */
        return storage[i];
    }

    enum {
        Empty = 0,  // No storage allocated
        Small = 1,  // Storage is just an array of key/value pairs
        Large = 2   // Storage is an array with empty slots, indexed by the 'id' field of each key
    } state = Empty;

    void upgrade_from_empty_to_small() {
        storage.resize(max_small_size);
        state = Small;
    }

    void upgrade_from_empty_to_large(int n) {
        storage.resize(n);
        state = Large;
    }

    // Re-homes every entry of the small array into a fresh id-indexed array of
    // n buckets.
    void upgrade_from_small_to_large(int n) {
        phm_assert(occupied <= max_small_size) << occupied << " " << max_small_size << "\n";
        storage_type tmp(n);
        state = Large;
        tmp.swap(storage);  // tmp now holds the old small array
        int o = occupied;
        for (int i = 0; i < o; i++) {
            emplace_large(tmp[i].first, std::move(tmp[i].second));
        }
        // emplace_large bumped `occupied` once per re-inserted entry; restore it.
        occupied = o;
    }

    // Methods when the map is in the empty state
    T &emplace_empty(const K *n, T &&t) {
        upgrade_from_empty_to_small();
        storage_bucket(0).first = n;
        storage_bucket(0).second = std::move(t);
        occupied = 1;
        return storage_bucket(0).second;
    }

    const T &get_empty(const K *n) const {
        phm_assert(0) << "Calling get on an empty PerfectHashMap";
        return unreachable_value();
    }

    T &get_empty(const K *n) {
        phm_assert(0) << "Calling get on an empty PerfectHashMap";
        return unreachable_value();
    }

    T &get_or_create_empty(const K *n) {
        // emplace_empty sets `occupied` itself.
        return emplace_empty(n, T());
    }

    bool contains_empty(const K *n) const {
        return false;
    }

    // Methods when the map is in the small state

    // Linear scan over the occupied prefix. Returns the index of n, or
    // `occupied` (the first free slot, possibly == max_small_size) if absent.
    int find_index_small(const K *n) const {
        int i;
        for (i = 0; i < (int)occupied; i++) {
            if (storage_bucket(i).first == n) {
                return i;
            }
        }
        return i;
    }

    T &emplace_small(const K *n, T &&t) {
        int idx = find_index_small(n);
        if (idx >= max_small_size) {
            // Small array is full and n is new: switch representation.
            upgrade_from_small_to_large((int)(n->max_id));
            return emplace_large(n, std::move(t));
        }
        auto &p = storage_bucket(idx);
        if (p.first == nullptr) {
            occupied++;
            p.first = n;
        }
        p.second = std::move(t);
        return p.second;
    }

    const T &get_small(const K *n) const {
        int idx = find_index_small(n);
        return storage_bucket(idx).second;
    }

    T &get_small(const K *n) {
        int idx = find_index_small(n);
        return storage_bucket(idx).second;
    }

    T &get_or_create_small(const K *n) {
        int idx = find_index_small(n);
        if (idx >= max_small_size) {
            upgrade_from_small_to_large((int)(n->max_id));
            return get_or_create_large(n);
        }
        auto &p = storage_bucket(idx);
        if (p.first == nullptr) {
            occupied++;
            p.first = n;
        }
        return p.second;
    }

    bool contains_small(const K *n) const {
        int idx = find_index_small(n);
        return (idx < max_small_size) && (storage_bucket(idx).first == n);
    }

    // Methods when the map is in the large state
    T &emplace_large(const K *n, T &&t) {
        auto &p = storage_bucket(n->id);
        if (!p.first) {
            occupied++;
        }
        p.first = n;
        p.second = std::move(t);
        return p.second;
    }

    const T &get_large(const K *n) const {
        return storage_bucket(n->id).second;
    }

    T &get_large(const K *n) {
        return storage_bucket(n->id).second;
    }

    T &get_or_create_large(const K *n) {
        auto &p = storage_bucket(n->id);
        if (p.first == nullptr) {
            occupied++;
            p.first = n;
        }
        return p.second;
    }

    bool contains_large(const K *n) const {
        return storage_bucket(n->id).first != nullptr;
    }

    void check_key(const K *n) const {
        /*
        phm_assert(n->id >= 0 && n->id < n->max_id)
            << "Invalid hash key: " << n->id << " " << n->max_id << "\n";
        phm_assert(state != Large || (int)storage.size() == n->max_id)
            << "Inconsistent key count: " << n->max_id << " vs " << storage.size() << "\n";
        */
    }

    // Helpers used to pacify compilers
    T &unreachable_value() {
        return storage_bucket(0).second;
    }

    const T &unreachable_value() const {
        return storage_bucket(0).second;
    }

public:
    // Jump straight to the large state, with n id-indexed buckets. Does nothing
    // if the map is already large.
    void make_large(int n) {
        if (state == Empty) {
            upgrade_from_empty_to_large(n);
        } else if (state == Small) {
            upgrade_from_small_to_large(n);
        }
    }

    // Stores t under key n (overwriting any existing value) and returns a
    // reference to the stored value.
    T &emplace(const K *n, T &&t) {
        check_key(n);
        switch (state) {
        case Empty:
            return emplace_empty(n, std::move(t));
        case Small:
            return emplace_small(n, std::move(t));
        case Large:
            return emplace_large(n, std::move(t));
        }
        return unreachable_value();
    }

    // Copying variant of emplace.
    T &insert(const K *n, const T &t) {
        check_key(n);
        T tmp(t);
        switch (state) {
        case Empty:
            return emplace_empty(n, std::move(tmp));
        case Small:
            return emplace_small(n, std::move(tmp));
        case Large:
            return emplace_large(n, std::move(tmp));
        }
        return unreachable_value();
    }

    // Returns the value stored under n. Calling this on an empty map trips
    // phm_assert; no other presence check is performed here.
    const T &get(const K *n) const {
        check_key(n);
        switch (state) {
        case Empty:
            return get_empty(n);
        case Small:
            return get_small(n);
        case Large:
            return get_large(n);
        }
        return unreachable_value();
    }

    T &get(const K *n) {
        check_key(n);
        switch (state) {
        case Empty:
            return get_empty(n);
        case Small:
            return get_small(n);
        case Large:
            return get_large(n);
        }
        return unreachable_value();
    }

    // Returns the value stored under n, default-constructing an entry first if
    // n is not present yet.
    T &get_or_create(const K *n) {
        check_key(n);
        switch (state) {
        case Empty:
            return get_or_create_empty(n);
        case Small:
            return get_or_create_small(n);
        case Large:
            return get_or_create_large(n);
        }
        return unreachable_value();
    }

    bool contains(const K *n) const {
        check_key(n);
        switch (state) {
        case Empty:
            return contains_empty(n);
        case Small:
            return contains_small(n);
        case Large:
            return contains_large(n);
        }
        return false;  // Unreachable
    }

    // Number of keys currently stored.
    size_t size() const {
        return occupied;
    }

    // Forward iterator over occupied buckets; empty buckets (null key) are
    // skipped when advancing.
    struct iterator {
        std::pair<const K *, T> *iter, *end;

        void operator++(int) {
            do {
                iter++;
            } while (iter != end && iter->first == nullptr);
        }

        void operator++() {
            (*this)++;
        }

        const K *key() const {
            return iter->first;
        }

        T &value() const {
            return iter->second;
        }

        bool operator!=(const iterator &other) const {
            return iter != other.iter;
        }

        bool operator==(const iterator &other) const {
            return iter == other.iter;
        }

        std::pair<const K *, T> &operator*() {
            return *iter;
        }
    };

    struct const_iterator {
        const std::pair<const K *, T> *iter, *end;

        void operator++(int) {
            do {
                iter++;
            } while (iter != end && iter->first == nullptr);
        }

        void operator++() {
            (*this)++;
        }

        const K *key() const {
            return iter->first;
        }

        const T &value() const {
            return iter->second;
        }

        bool operator!=(const const_iterator &other) const {
            return iter != other.iter;
        }

        bool operator==(const const_iterator &other) const {
            return iter == other.iter;
        }

        const std::pair<const K *, T> &operator*() const {
            return *iter;
        }
    };

    iterator begin() {
        // Guard on the storage itself rather than on `state`: make_large(0)
        // leaves the map in the Large state with no buckets at all, and
        // reading the first bucket's key would then dereference the end
        // pointer. This matches the const overload below.
        if (storage.empty()) {
            return end();
        }
        iterator it;
        it.iter = storage.data();
        it.end = it.iter + storage.size();
        if (it.key() == nullptr) {
            it++;
        }
        phm_assert(it.iter == it.end || it.key());
        return it;
    }

    iterator end() {
        iterator it;
        it.iter = it.end = storage.data() + storage.size();
        return it;
    }

    const_iterator begin() const {
        if (storage.empty()) {
            return end();
        }
        const_iterator it;
        it.iter = storage.data();
        it.end = it.iter + storage.size();
        if (it.key() == nullptr) {
            it++;
        }
        phm_assert(it.iter == it.end || it.key());
        return it;
    }

    const_iterator end() const {
        const_iterator it;
        it.iter = it.end = storage.data() + storage.size();
        return it;
    }
};

#endif

// Patch text for the next file that shared the last replaced source line,
// retained verbatim:
//
//   diff --git a/src/autoschedulers/anderson2021/SearchSpace.cpp b/src/autoschedulers/anderson2021/SearchSpace.cpp
//   new file mode 100644
//   index 000000000000..c5de0e8f48a2
//   --- /dev/null
//   +++ b/src/autoschedulers/anderson2021/SearchSpace.cpp
//   @@ -0,0 +1,685 @@
//   +#include "SearchSpace.h"
+ +using std::set; +using std::vector; + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +bool use_randomized_tilings() { + static std::string randomization_str = get_env_variable("HL_RANDOMIZE_TILINGS"); + return randomization_str == "1"; +} + +SearchSpace::SearchSpace(const FunctionDAG &dag, + const MachineParams ¶ms, + const Target &target, + const std::string &search_space_options, + std::mt19937 &rng, + CostModel *cost_model, + Statistics &stats, + const LoopNestParser* partial_schedule) + : dag{dag} + , params{params} + , target{target} + , search_space_options{search_space_options} + , rng{rng} + , cost_model{cost_model} + , stats{stats} + , randomize_tilings{use_randomized_tilings()} + , partial_schedule{partial_schedule} +{ + memoized_compute_root_blocks.make_large(dag.nodes.size()); +} + +void SearchSpace::memoize_blocks(const FunctionDAG::Node *node, LoopNest* new_root) { + int vector_dim = -1; + bool loop_nest_found = false; + for (auto &c : new_root->children) { + if (c->node == node && c->stage->index == 0) { + vector_dim = c->vector_dim; + loop_nest_found = true; + break; + } + } + + internal_assert(loop_nest_found); + + auto& blocks = memoized_compute_root_blocks.get_or_create(node)[vector_dim]; + + for (auto &c : new_root->children) { + if (c->node == node) { + LoopNest *new_block = new LoopNest; + new_block->copy_from_including_features(*c.get()); + blocks.push_back(new_block); + ++stats.num_block_memoization_misses; + } + } +} + +bool SearchSpace::add_states_from_memoized_blocks(IntrusivePtr state, + std::function &&)> &accept_child, + const FunctionDAG::Node *node, + int& num_children) const { + if (!memoized_compute_root_blocks.contains(node)) { + return false; + } + + int vector_dim = -1; + for (const auto& c : state->root->children) { + if (c->node == node && c->stage->index == 0) { + vector_dim = c->vector_dim; + break; + } + } + + if (memoized_compute_root_blocks.get(node).count(vector_dim) == 0) { + return false; + } 
+ + auto blocks = memoized_compute_root_blocks.get(node).at(vector_dim); + + size_t num_stages = node->stages.size(); + for (size_t i = 0; i < blocks.size(); i += num_stages) { + auto child = state->make_child(); + LoopNest *new_root = new LoopNest; + new_root->copy_from(*state->root); + child->root = new_root; + child->num_decisions_made++; + + int block_index = 0; + for (const auto& c : new_root->children) { + if (c->node == node) { + break; + } + ++block_index; + } + + for (size_t j = 0; j < num_stages; ++j) { + LoopNest* new_block = new LoopNest; + new_block->copy_from_including_features(*blocks[i + j]); + new_root->children[block_index++] = new_block; + } + + if (child->calculate_cost(dag, params, target, cost_model, stats)) { + num_children++; + accept_child(std::move(child)); + ++stats.num_block_memoization_hits; + } + } + + return true; +} + +vector SearchSpace::filter_parallel_tile_options(IntrusivePtr state, + const FunctionDAG::Node *node, + vector>& inner_tilings, + const vector& pure_size) const { + vector options; + vector insufficient_parallelism; + for (size_t i = 0; i < inner_tilings.size(); i++) { + auto &t = inner_tilings[i]; + SearchSpace::ParallelTileOption o; + o.inner_tiling = t; + + for (size_t j = 0; j < pure_size.size(); j++) { + t[j] = (pure_size[j] + t[j] - 1) / t[j]; + } + + t.swap(o.outer_tiling); + + // Compute max idle cores across the other stages of the Func + int64_t min_total = 0, max_total = 0; + o.idle_core_wastage = 1; + for (const auto &c : state->root->children) { + if (c->node == node) { + int64_t total = 1; + int64_t max_available = 1; + for (auto &l : c->stage->loop) { + if (!l.rvar) { + total *= o.outer_tiling[l.pure_dim]; + max_available *= c->size[l.pure_dim]; + } + } + max_total = std::max(max_total, total); + + // If a stage does not have enough parallelism regardless of the + // tiling (i.e. 
its size is < params.parallelism * 2 before + // splitting), then the only tiling worth considering is the + // one that retains the full extent in this dimension + // (outer_tiling == size). In that case, skip over updating + // min_total, otherwise it will be filtered out below + if (max_available >= params.parallelism * 2 || total != max_available) { + if (min_total != 0) { + min_total = std::min(min_total, total); + } else { + min_total = total; + } + const double tasks_per_core = ((double)total) / params.parallelism; + o.idle_core_wastage = std::max(o.idle_core_wastage, + std::ceil(tasks_per_core) / + tasks_per_core); + } + } + } + + o.min_parallelism = min_total; + o.max_parallelism = max_total; + + // Filter out the less useful options + bool ok = + (min_total >= params.parallelism * 2 && + (max_total <= params.parallelism * 16 || target.has_gpu_feature())); + + if (!ok) { + insufficient_parallelism.emplace_back(std::move(o)); + continue; + } + + options.emplace_back(std::move(o)); + } + + int64_t parallelism_limit = params.parallelism; + while (options.empty()) { + for (auto& o : insufficient_parallelism) { + if (o.min_parallelism >= parallelism_limit) { + options.emplace_back(std::move(o)); + } + } + + parallelism_limit /= 2; + } + + std::sort(options.begin(), options.end()); + + return options; +} + +vector SearchSpace::filter_thread_tile_options(vector>& loop_nests) const { + vector options; + for (const auto& loop_nest : loop_nests) { + if (!loop_nest->has_valid_thread_extents()) { + Filter(loop_nest.get()) << "Invalid thread extents\n"; + continue; + } + + ThreadTileOption o; + o.loop_nest = loop_nest; + o.max_idle_lane_wastage = loop_nest->max_idle_lane_wastage(target, {loop_nest.get()}); + options.emplace_back(std::move(o)); + } + + std::sort(options.begin(), options.end()); + + return options; +} + +void SearchSpace::process_pending_states(std::unordered_map& primary_options, + std::unordered_map& secondary_options, + int &num_children, + 
std::function &&)> &accept_child, + const FunctionDAG::Node* node) { + for (auto& entry : primary_options) { + size_t N = entry.second.size(); + if (N > 1 && !is_in_partial_schedule(node)) { + N = std::log2(entry.second.size()); + } + + std::shuffle(entry.second.begin(), entry.second.end(), rng); + + size_t accepted = 0; + for (size_t i = 0; i < entry.second.size() && accepted < N; ++i) { + if (entry.second[i]->calculate_cost(dag, params, target, cost_model, stats)) { + num_children++; + accept_child(std::move(entry.second[i])); + accepted++; + stats.num_tilings_accepted++; + } + } + } + + if (num_children > 0) { + return; + } + + for (auto& entry : secondary_options) { + for (size_t i = 0; i < entry.second.size(); ++i) { + if (entry.second[i]->calculate_cost(dag, params, target, cost_model, stats)) { + num_children++; + accept_child(std::move(entry.second[i])); + stats.num_tilings_accepted++; + break; + } + } + } +} + +void SearchSpace::generate_children(IntrusivePtr state, + std::function &&)> &accept_child, + int pass_idx, + bool is_pre_pass) { + const IntrusivePtr root = state->root; + + internal_assert(root.defined() && root->is_root()); + + if (state->num_decisions_made == 2 * (int)dag.nodes.size()) { + return; + } + + int next_node = state->num_decisions_made / 2; + int phase = state->num_decisions_made % 2; + + if (!may_subtile()) { + // When emulating the older search space, we do all + // parallelizing last, so that it is independent of the + // tiling decisions. 
+ next_node = state->num_decisions_made % dag.nodes.size(); + phase = state->num_decisions_made / dag.nodes.size(); + } + + // Enumerate all legal ways to schedule the next Func + const FunctionDAG::Node *node = &dag.nodes[next_node]; + for (const auto *e : node->outgoing_edges) { + internal_assert(root->computes(e->consumer->node)) + << "Partially scheduled code doesn't compute " << e->consumer->name + << ", which is one of the consumers of " << node->func.name(); + } + + //ScopedTimer scoped_timer{"generate_children() for " + node->func.name()}; + bool must_inline = inlined_nodes.contains(node); + bool must_compute_root = compute_root_nodes.contains(node); + + if (node->is_input || (phase == 1 && must_compute_root)) { + // We don't need to schedule nodes that represent inputs, + // and there are no other decisions to be made about them + // at this time. + // aslog(0) << "Skipping over scheduling input node: " << node->func.name() << "\n"; + auto child = state->make_child(); + child->num_decisions_made++; + accept_child(std::move(child)); + return; + } + + if (!node->outgoing_edges.empty() && !root->calls(node)) { + aslog(0) << "In state:\n"; + state->dump(); + aslog(0) << node->func.name() << " is consumed by:\n"; + for (const auto *e : node->outgoing_edges) { + aslog(0) << e->consumer->name << "\n"; + aslog(0) << "Which in turn consumes:\n"; + for (const auto *e2 : e->consumer->incoming_edges) { + aslog(0) << " " << e2->producer->func.name() << "\n"; + } + } + internal_error << "Pipeline so far doesn't use next Func: " << node->func.name() << '\n'; + } + + int num_children = 0; + + + if (phase == 0) { + // Injecting realizations + { + state->update_always_consider_inline_options(node); + + if (is_in_partial_schedule(node)) { + state->add_to_always_consider_inline_options(node); + } + + // 1) Inline it + if (search_space_options.compute_inline() && node->stages.size() == 1 && !node->is_output && !must_compute_root) { + LoopNest *new_root = new LoopNest; + 
new_root->copy_from(*root); + new_root->inline_func(node); + if (add_child(state, new_root, accept_child)) { + num_children++; + } + } + } + + if (must_inline && num_children > 0) { + std::cerr << "Must inline success: " << node->func.name() << "\n"; + return; + } + + if (must_inline) { + std::cerr << "Unable to inline: " << node->func.name() << "\n"; + } + + // Some search-space pruning. If a node is pointwise, and + // so are all its inputs and so is its sole output, and + // inlining it is legal, just inline it. This saves time + // on long chains of pointwise things. + must_inline = (node->is_pointwise && + (num_children > 0) && + (node->outgoing_edges.size() == 1)); + if (must_inline) { + for (const auto *e : node->stages[0].incoming_edges) { + must_inline &= e->producer->is_pointwise; + } + for (const auto *e : node->outgoing_edges) { + must_inline &= (e->consumer->node->is_pointwise || + e->consumer->node->is_boundary_condition); + } + if (must_inline) { + return; + } + } + + if (must_compute_root) { + LoopNest *new_root = new LoopNest; + new_root->copy_from(*root); + const auto &nodes = compute_root_nodes.get(node); + for (const auto &n : nodes) { + const auto* compute_root_loop = deep_copy_loop_nest(n.get(), NoOpMutator{}); + new_root->children.push_back(compute_root_loop); + } + new_root->store_at.insert(node); + + add_child(state, new_root, accept_child); + return; + } + + // Construct a list of plausible dimensions to vectorize + // over. Currently all of them. TODO: Pre-prune the list + // of sane dimensions to vectorize a Func over to reduce + // branching factor. 
+ vector vector_dims; + if (!node->is_input && !node->is_output) { + for (int v = 0; v < node->dimensions; v++) { + const auto &p = root->get_bounds(node)->region_computed(v); + if (p.extent() >= 16) { + vector_dims.push_back(v); + if (!is_in_partial_schedule(node)) { + break; + } + } + } + } + // Outputs must be vectorized over their innermost + // dimension, because we don't have control of the + // storage. TODO: Check which dimension has a stride==1 + // constraint instead of assuming 0. + if (vector_dims.empty()) { + vector_dims.push_back(0); + } + + // 2) Realize it somewhere + std::unordered_map primary_options; + std::unordered_map secondary_options; + for (int vector_dim : vector_dims) { + Timer timer; + auto tile_options = root->compute_in_tiles(node, nullptr, params, target, search_space_options, vector_dim, false, false, is_pre_pass); + stats.compute_in_tiles_time += timer.elapsed(); + + timer.restart(); + auto options = filter_thread_tile_options(tile_options); + stats.filter_thread_tiles_time += timer.elapsed(); + + for (const auto& o : options) { + if (!randomize_tilings && num_children >= 1 && o.max_idle_lane_wastage > 0.5) { + Filter(o.loop_nest.get()) << "Excess idle lane wastage\n" + << "max_idle_lane_wastage = " << o.max_idle_lane_wastage << "\n"; + break; + } + + ++stats.num_tilings_generated; + + if (!randomize_tilings) { + if (add_child(state, o.loop_nest, accept_child)) { + num_children++; + } + continue; + } + + auto child = state->make_child(); + child->root = std::move(o.loop_nest); + child->num_decisions_made++; + uint64_t h = child->structural_hash(pass_idx); + + if (o.max_idle_lane_wastage > 0.5) { + secondary_options[h].push_back(child); + continue; + } + + primary_options[h].push_back(child); + } + } + + if (randomize_tilings) { + process_pending_states(primary_options, secondary_options, num_children, accept_child, node); + } + } else { + // We are parallelizing the loops of the func we just injected a realization for. 
+ + bool should_parallelize = false; + IntrusivePtr pure_stage; + + if (params.parallelism > 1) { + for (auto &c : root->children) { + if (c->node == node && node->dimensions > 0) { + if (c->stage->index == 0) { + pure_stage = c; + } + should_parallelize = true; + } + } + } + + if (!should_parallelize) { + // The Func must be scalar, or not compute_root, or + // we're not asking to use multiple cores. Just + // return a copy of the parent state + num_children++; + auto child = state->make_child(); + child->num_decisions_made++; + accept_child(std::move(child)); + return; + } + + if (add_states_from_memoized_blocks(state, accept_child, node, num_children)) { + return; + } + + // When GPU scheduling we approach tiling in two steps. + // step 1) convert (none, SIMD) loops to (parallel, serial, SIMD) loops with specialized serial sizes + auto parallel_tilings = generate_compute_root_serial_tilings(pure_stage, node); + + internal_assert(parallel_tilings.size() > 0) << " zero parallel tilings\n"; + + std::unordered_map>> primary_options; + std::unordered_map>> secondary_options; + for (auto ¶llel_t: parallel_tilings) { + LoopNest parallel_root; + parallel_root.copy_from(*root); + + // step 1) parallelize all loop nests for this node into (parallel, serial) with given serial tiles + for (auto &c : parallel_root.children) { + if (c->node == node) { + c = c->parallelize_in_tiles(params, parallel_t, ¶llel_root, target, false, true); + } + } + + // step 2) split all parallel loops for this node into to (blocks, thread) loop + vector> stage_sizes; + vector> pure_dims; + vector vectorized_indices; + parallel_root.get_stage_sizes(node, stage_sizes, pure_dims, vectorized_indices); + // at root level sibling thread counts are in separate blocks, extents are irrelevant + vector max_size((int)(stage_sizes[0].size()), 1); + + auto block_tilings = generate_gpu_tilings(stage_sizes, pure_dims, max_size, node->dimensions-1, vectorized_indices, false, true); + + // If no options, create a 
thread tiling as large as possible with block size (1,1,1). + // This can happen if the loops are too small to generate desired gpu tiles. + if (block_tilings.empty()) { + LoopNest *new_root = new LoopNest; + new_root->copy_from(parallel_root); + for (auto &c : new_root->children) { + if (c->node == node) { + vector tiling((int)(c->size.size()), 1); + c = c->parallelize_in_tiles(params, tiling, new_root, target, false, true); + } + } + if (add_child(state, new_root, accept_child)) { + num_children++; + memoize_blocks(node, new_root); + } + internal_assert(false) << "block tilings empty"; + return; + } + + Timer timer; + auto options = filter_parallel_tile_options(state, node, block_tilings, stage_sizes[0]); + stats.filter_parallel_tiles_time += timer.elapsed(); + + double prev_idle_core_wastage = 0; + for (const auto &o : options) { + if (!randomize_tilings && num_children >= 1 && o.idle_core_wastage > 1.2 && o.idle_core_wastage != prev_idle_core_wastage) { + // We have considered several options, and the + // remaining ones leave lots of cores idle. 
+ break; + } + prev_idle_core_wastage = o.idle_core_wastage; + + ++stats.num_tilings_generated; + + LoopNest *new_root = new LoopNest; + new_root->copy_from(parallel_root); + + for (auto &c : new_root->children) { + if (c->node == node) { + c = c->parallelize_in_tiles(params, o.inner_tiling, new_root, target, true, false); + } + } + + if (!randomize_tilings) { + if (add_child(state, new_root, accept_child)) { + num_children++; + memoize_blocks(node, new_root); + } + continue; + } + + auto child = state->make_child(); + child->root = std::move(new_root); + child->num_decisions_made++; + uint64_t h = child->structural_hash(pass_idx); + + if (o.idle_core_wastage > 1.2) { + secondary_options[h].push_back(child); + continue; + } + + primary_options[h].push_back(child); + } + } + + if (randomize_tilings) { + process_pending_states(primary_options, secondary_options, num_children, accept_child, node); + } + } + + if (num_children == 0) { + aslog(0) << "Warning: Found no legal way to schedule " + << node->func.name() << " in the following State:\n"; + state->dump(); + // All our children died. Maybe other states have had + // children. Carry on. 
+ } +} + +struct ClearInlinedMutator { + void operator()(LoopNest* new_loop_nest) const { + new_loop_nest->inlined = {}; + } +}; + +void SearchSpace::freeze_lowest_cost_stages(const IntrusivePtr best) { + std::vector> node_ids_and_costs; + NodeMap node_costs; + size_t num_nodes = 0; + for (const auto& n : dag.nodes) { + if (n.is_input) { + continue; + } + + int i = 0; + for (const auto& s : n.stages) { + if (!node_costs.contains(dag.stage_id_to_node_map.at(s.id))) { + node_costs.get_or_create(dag.stage_id_to_node_map.at(s.id)) = 0; + } + + node_costs.get(dag.stage_id_to_node_map.at(s.id)) += best->cost_per_stage[i++]; + } + + ++num_nodes; + } + + for (auto it = node_costs.begin(); it != node_costs.end(); it++) { + node_ids_and_costs.push_back({it.key()->id, it.value()}); + } + + for (const auto& n : node_ids_and_costs) { + internal_assert(n.first >= 0); + } + + std::sort(node_ids_and_costs.begin(), node_ids_and_costs.end(), [](const std::pair& a, const std::pair& b) { + return a.second < b.second; + }); + + size_t num_to_freeze = num_nodes - std::log2(num_nodes); + NodeMap nodes_to_freeze; + for (size_t i = 0; i < num_to_freeze; ++i) { + auto id = node_ids_and_costs[i].first; + std::cerr << "Freezing " << dag.nodes[id].func.name() << " with cost = " << node_ids_and_costs[i].second << "\n"; + nodes_to_freeze.insert(&dag.nodes[id], true); + } + + best->root->collect_nodes_that_should_be_inlined(nodes_to_freeze, inlined_nodes); + + ClearInlinedMutator mutator{}; + + for (const auto& c : best->root->children) { + if (nodes_to_freeze.contains(c->node)) { + auto new_loop_nest = deep_copy_loop_nest(c, mutator); + compute_root_nodes.get_or_create(c->node).push_back(new_loop_nest); + std::cerr << "Freezing as compute_root: " << c->node->func.name() << "\n"; + } + } +} + +vector> SearchSpace::generate_compute_root_serial_tilings(const IntrusivePtr& pure_stage, const FunctionDAG::Node *node) const { + std::vector vec_dim_serial_sizes; + 
pure_stage->generate_vec_dim_serial_tilings(vec_dim_serial_sizes); + + return generate_serial_tilings(pure_stage->size, + node->dimensions - 1, + node->dimensions - 1, + pure_stage->vectorized_loop_index, + vec_dim_serial_sizes, + false, + true); +} + +bool SearchSpace::add_child(const IntrusivePtr& state, + const IntrusivePtr& new_root, + std::function &&)> &accept_child) const { + auto child = state->make_child(); + child->root = std::move(new_root); + child->num_decisions_made++; + if (child->calculate_cost(dag, params, target, cost_model, stats)) { + accept_child(std::move(child)); + return true; + } + return false; +} + +bool SearchSpace::is_in_partial_schedule(const FunctionDAG::Node *node) const { + return partial_schedule && partial_schedule->is_in_partial_schedule(node); +} + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide diff --git a/src/autoschedulers/anderson2021/SearchSpace.h b/src/autoschedulers/anderson2021/SearchSpace.h new file mode 100644 index 000000000000..538c441a42a2 --- /dev/null +++ b/src/autoschedulers/anderson2021/SearchSpace.h @@ -0,0 +1,111 @@ +#ifndef SEARCH_SPACE_H +#define SEARCH_SPACE_H + +#include "CostModel.h" +#include "DefaultCostModel.h" +#include "Featurization.h" +#include "FunctionDAG.h" +#include "LoopNest.h" +#include "LoopNestParser.h" +#include "PerfectHashMap.h" +#include "ASLog.h" +#include "SearchSpaceOptions.h" +#include "State.h" +#include +#include +#include +#include + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +struct SearchSpace { + using StateVector = std::vector>; + const FunctionDAG &dag; + const MachineParams ¶ms; + const Target ⌖ + SearchSpaceOptions search_space_options; + std::mt19937 &rng; + CostModel *cost_model; + Statistics &stats; + bool randomize_tilings; + const LoopNestParser* partial_schedule; + + NodeMap inlined_nodes; + NodeMap>> compute_root_nodes; + NodeMap>>> memoized_compute_root_blocks; + + SearchSpace(const FunctionDAG &dag, + 
const MachineParams ¶ms, + const Target &target, + const std::string &search_space_options, + std::mt19937 &rng, + CostModel *cost_model, + Statistics &stats, + const LoopNestParser* partial_schedule); + + // Sort / filter parallel tile options + struct ParallelTileOption { + vector outer_tiling; + vector inner_tiling; + double idle_core_wastage; + int64_t min_parallelism; + int64_t max_parallelism; + bool operator<(const ParallelTileOption &other) const { + return idle_core_wastage < other.idle_core_wastage; + } + + // Ensure we don't accidentally copy this type + ParallelTileOption() = default; + ParallelTileOption(ParallelTileOption &&) = default; + ParallelTileOption &operator=(ParallelTileOption &&) = default; + ParallelTileOption(const ParallelTileOption &) = delete; + ParallelTileOption &operator=(const ParallelTileOption &) = delete; + }; + + vector filter_parallel_tile_options(IntrusivePtr state, + const FunctionDAG::Node *node, + vector>& inner_tilings, + const vector& pure_size) const; + + vector filter_thread_tile_options(vector>& loop_nests) const; + + void memoize_blocks(const FunctionDAG::Node *node, LoopNest* new_root); + + bool add_states_from_memoized_blocks(IntrusivePtr state, + std::function &&)> &accept_child, + const FunctionDAG::Node *node, + int& num_children) const; + + + // Generate successor states for given 'state' + void generate_children(IntrusivePtr state, + std::function &&)> &accept_child, + int pass_idx, + bool is_pre_pass); + + void freeze_lowest_cost_stages(const IntrusivePtr best); + + vector> generate_compute_root_serial_tilings(const IntrusivePtr& pure_stage, const FunctionDAG::Node *node) const; + + bool add_child(const IntrusivePtr& state, + const IntrusivePtr& new_root, + std::function &&)> &accept_child) const; + + void process_pending_states(std::unordered_map& primary_options, + std::unordered_map& secondary_options, + int &num_children, + std::function &&)> &accept_child, + const FunctionDAG::Node* node); + + bool 
is_in_partial_schedule(const FunctionDAG::Node *node) const; +}; + + + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide + +#endif // SEARCH_SPACE_H diff --git a/src/autoschedulers/anderson2021/SearchSpaceOptions.h b/src/autoschedulers/anderson2021/SearchSpaceOptions.h new file mode 100644 index 000000000000..7d22e20b9072 --- /dev/null +++ b/src/autoschedulers/anderson2021/SearchSpaceOptions.h @@ -0,0 +1,68 @@ +#ifndef SEARCH_SPACE_OPTIONS_H +#define SEARCH_SPACE_OPTIONS_H + +#include + +#include "ASLog.h" + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +struct SearchSpaceOptions { + constexpr static size_t option_compute_root = 0; + constexpr static size_t option_compute_inline = 1; + constexpr static size_t option_compute_at_block = 2; + constexpr static size_t option_compute_at_thread = 3; + + std::bitset<4> options; + + SearchSpaceOptions(const std::string& bit_str) + : options{bit_str} + { + aslog(0) << "Search space options:\n"; + aslog(0) << "Input string: " << bit_str << "\n"; + aslog(0) << "Compute root: " << compute_root() << "\n"; + aslog(0) << "Compute inline: " << compute_inline() << "\n"; + aslog(0) << "Compute at block: " << compute_at_block() << "\n"; + aslog(0) << "Compute at thread: " << compute_at_thread() << "\n"; + } + + bool compute_root() const { + return options.test(SearchSpaceOptions::option_compute_root) || compute_at_block() || compute_at_thread(); + } + + bool compute_root_only() const { + return options.count() == 1 && compute_root(); + } + + bool compute_inline() const { + return options.test(SearchSpaceOptions::option_compute_inline); + } + + bool compute_inline_only() const { + return options.count() == 1 && compute_inline(); + } + + bool compute_at_block() const { + return options.test(SearchSpaceOptions::option_compute_at_block); + } + + bool compute_at_block_only() const { + return options.count() == 1 && compute_at_block(); + } + + bool compute_at_thread() const { + return 
options.test(SearchSpaceOptions::option_compute_at_thread); + } + + bool compute_at_thread_only() const { + return options.count() == 1 && compute_at_thread(); + } +}; + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide + +#endif // SEARCH_SPACE_OPTIONS_H diff --git a/src/autoschedulers/anderson2021/State.cpp b/src/autoschedulers/anderson2021/State.cpp new file mode 100644 index 000000000000..aa1cd5d8c683 --- /dev/null +++ b/src/autoschedulers/anderson2021/State.cpp @@ -0,0 +1,1291 @@ +#include "State.h" + +using std::set; +using std::vector; + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +double get_stack_memory_adjustment_factor() { + string stack_factor_str = get_env_variable("HL_STACK_FACTOR"); + if (stack_factor_str.empty()) { + return 0.95; + } + + return std::atof(stack_factor_str.c_str()); +} + +int64_t get_stack_memory_limit() { + static double stack_factor = get_stack_memory_adjustment_factor(); + return stack_factor * 103232; +} + +uint64_t State::structural_hash(int depth) const { + uint64_t h = num_decisions_made; + internal_assert(root.defined()); + root->structural_hash(h, depth); + return h; +} + +// Compute the parent and depth of every loop nest node +void State::compute_loop_nest_parents(map> &p, + const LoopNest *here, int depth) const { + for (const auto &c : here->children) { + p.emplace(c.get(), pair{here, depth}); + compute_loop_nest_parents(p, c.get(), depth + 1); + } +} + +const LoopNest *State::deepest_valid_compute_location(const map> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap& total_shared_mem_alloc_sizes) const { + std::vector ancestors; + + // Innermost loop nests are never considered as compute locations + if (!loop->innermost) { + ancestors.push_back(loop); + } + + const LoopNest *cur_loop = loop; + while (parent.count(cur_loop) > 0) { + ancestors.push_back(parent.at(cur_loop).first); + cur_loop = ancestors.back(); + } + + if 
(ancestors.size() == 0) { + return root; + } + + const LoopNest *candidate = ancestors.back(); + bool first = true; + + int64_t new_shared_mem_alloc_size = 0; + int64_t new_register_alloc_size = 0; + + for (auto it = ancestors.rbegin(); it != ancestors.rend(); it++) { + if (first) { + first = false; + continue; + } + + if ((*it)->gpu_label == block) { + new_shared_mem_alloc_size = node.bytes_per_point; + for (int i = 0; i < node.dimensions; ++i) { + new_shared_mem_alloc_size *= (*it)->get_bounds(&node)->region_computed(i).extent(); + } + + int64_t total = new_shared_mem_alloc_size + total_shared_mem_alloc_sizes.get((*it)->stage); + if (total > get_shared_memory_limit()) { + continue; + } + } + + if ((*it)->gpu_label == thread || (*it)->gpu_label == serial) { + int64_t total = node.bytes_per_point; + for (int i = 0; i < node.dimensions; ++i) { + total *= (*it)->get_bounds(&node)->region_computed(i).extent(); + } + + if (total > get_register_mem_alloc_limit()) { + continue; + } + + new_register_alloc_size = total; + } + + // If the region_computed does not shrink, ancestors.at(i) (the loop + // nest one level further in) will never be considered as a compute + // location + if (!(*it)->region_computed_shrinks(&node, candidate)) { + break; + } + + candidate = *it; + } + + if (candidate->gpu_label == block) { + total_shared_mem_alloc_sizes.get(candidate->stage) += new_shared_mem_alloc_size; + internal_assert(total_shared_mem_alloc_sizes.get(candidate->stage) <= get_shared_memory_limit()); + } + + internal_assert(new_register_alloc_size <= get_register_mem_alloc_limit()); + internal_assert(!candidate->innermost); + return candidate; +} + +int64_t State::total_loop_extents_of_ancestors(const map> &parent, const LoopNest *loop) const { + int64_t total = 1; + + if (loop->is_root()) { + return total; + } + + const LoopNest *cur_loop = loop; + while (true) { + for (size_t i = 0; i < cur_loop->size.size(); ++i) { + total *= cur_loop->size[i]; + } + + if 
(parent.count(cur_loop) == 0) { + break; + } + + cur_loop = parent.at(cur_loop).first; + } + + return total; +} + +const LoopNest *State::deepest_common_ancestor(const map> &parent, const LoopNest *a, const LoopNest *b) const { + if (a->is_root()) return a; + if (b->is_root()) return b; + if (a == b) return a; + + // Walk the deeper one up until they're at the same depth + auto it_a = parent.find(a); + auto it_b = parent.find(b); + internal_assert(it_a != parent.end() && it_b != parent.end()); + while (it_a->second.second > it_b->second.second) { + a = it_a->second.first; + it_a = parent.find(a); + } + while (it_b->second.second > it_a->second.second) { + b = it_b->second.first; + it_b = parent.find(b); + } + + while (1) { + // Walk each up one + a = it_a->second.first; + b = it_b->second.first; + if (a == b) return a; + it_a = parent.find(a); + it_b = parent.find(b); + internal_assert(it_a != parent.end() && it_b != parent.end()); + } + + // unreachable + return nullptr; +} + +bool State::has_loop_nest_without_thread_loops() const { + for (const auto& c : root->children) { + if (c->gpu_label != block) { + continue; + } + + for (const auto& block_c : c->children) { + if (!block_c->all_paths_to_leaves_have_thread_loop()) { + return true; + } + } + } + + return false; +} + +bool State::has_compute_root_loops_without_blocks() const { + for (const auto& c : root->children) { + if (c->gpu_label == none) { + return true; + } + } + + return false; +} + +void State::FeatureLoopNestMutator::operator()(LoopNest* new_loop_nest) const { + split_compute_root_loops(new_loop_nest); + add_outer_thread_loops(new_loop_nest); +} + +// In phase 2, any compute_root loop marked 'none' will be split into +// blocks, threads, and serial loops. 
To enable the cost model to make a +// meaningful prediction on these pre-split loops, we assume a split into +// blocks and threads with a single full warp (if possible) +void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest* loop_nest) const { + if (!loop_nest || !loop_nest->is_root()) { + return; + } + + for (auto it = loop_nest->children.rbegin(); it != loop_nest->children.rend(); ++it) { + auto& c = *it; + if (c->gpu_label != none) { + continue; + } + + int vectorized_loop_index = c->vectorized_loop_index; + + if (c->size.size() == 0) { + continue; + } + + // Make the vectorized dimension of the inner loop 32 (or as + // close as possible) + int64_t inner_extent = std::min(c->size[vectorized_loop_index], (int64_t)32); + + if (c->stage->index == 0) { + vector tiling(c->node->dimensions, 1); + + // Split into parallelized and serial + c = c->parallelize_in_tiles(params, tiling, loop_nest, target, true, false); + + if (vectorized_loop_index >= 0) { + tiling[vectorized_loop_index] = inner_extent; + } + // Split parallelized into blocks and threads + c = c->parallelize_in_tiles(params, tiling, loop_nest, target, true, false); + } else { + // An update stage may have more or fewer dimensions than + // the pure stage, but the tiling requires its dimensions to + // be equal to the number of dimensions in the pure stage + vector tiling(c->node->dimensions, 1); + for (size_t i = 0; i < c->stage->loop.size(); i++) { + int l = c->stage->loop[i].pure_dim; + if (l == -1) { + continue; + } + + tiling[l] = c->size[i]; + } + + // For update stages, split into parallelized and serial + // (parallelize_in_tiles will move any RVars inwards and + // make them serial) + c = c->parallelize_in_tiles(params, tiling, loop_nest, target, false, true); + + // If vectorized_loop_index < 0, then this update stage + // likely does not loop over the vectorized loop of the + // pure stage, so it should not be split by the + // outer_vec_extent and instead only have a single 
thread + vector thread_tiling(c->node->dimensions, 1); + if (vectorized_loop_index >= 0) { + thread_tiling[c->stage->loop[vectorized_loop_index].pure_dim] = inner_extent; + } + + // Now that the RVars have been moved inwards, we can + // split the outer loop into blocks and threads + c = c->parallelize_in_tiles(params, thread_tiling, loop_nest, target, true, false); + } + } +} + +// If a loop nest does not have thread loops, split the outermost serial +// loops to create thread loops with extents 1 +void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest* loop_nest) const { + if (!loop_nest) { + return; + } + + if (loop_nest->gpu_label == block) { + // Example: + // block + // serial (a) + // all serial descendants + // + // (a) should be surrounded by a thread loop + for (auto& c : loop_nest->children) { + if (c->has_thread_loop_descendant()) { + continue; + } + + internal_assert(c->gpu_label == serial); + + // We want outer thread loops with extents 1 + vector tiling(c->node->dimensions, 1); + + // Mark as 'thread' so this loop is split into threads and + // serial + c->gpu_label = thread; + c = c->parallelize_in_tiles(params, tiling, loop_nest, target, false, true); + } + return; + } + + if (loop_nest->gpu_label == serial) { + bool has_child_with_thread_descendant = false; + + for (const auto& c : loop_nest->children) { + if (c->has_thread_loop_descendant()) { + has_child_with_thread_descendant = true; + break; + } + } + + // If there are no children with thread descendants, then this must be an all + // serial hierarchy. 
This may require an outer thread loop to be + // added, but if so, this will occur when this method is called + // on the nodes higher in the loop nest + if (!has_child_with_thread_descendant) { + return; + } + + // Example: + // serial + // thread + // serial (a) + // + // (a) should be surrounded by a thread loop + for (auto& c : loop_nest->children) { + if (c->has_thread_loop_descendant()) { + continue; + } + + // We want outer thread loops with extents 1 + vector tiling(c->node->dimensions, 1); + + // Mark as 'thread' so this loop is split into threads and + // serial + c->gpu_label = thread; + c = c->parallelize_in_tiles(params, tiling, loop_nest, target, false, true); + } + + } +} + +IntrusivePtr State::get_root_for_features(const MachineParams ¶ms, const Target& target) const { + if (!has_compute_root_loops_without_blocks() && !has_loop_nest_without_thread_loops()) { + return root; + } + + FeatureLoopNestMutator mutator{params, target}; + + // We copy the loop nest in 2 cases: + // - If the current loop nest has compute root loops without blocks (it is + // in phase 1 and the outer loops are marked 'none'), we split the loop into blocks and threads so we can compute meaningful features + // - If there are serial loops inside blocks without a surrounding + // thread loop nest, we create a surrounding thread loop nest with + // extents 1 (which Halide will do when the schedule is compiled) so + // that we can more easily compute features + auto new_root = create_feature_root(mutator); + return new_root; +} + +void State::set_gpu_store_site(const map>& parent, const LoopNest* loop, LoopNest::Sites& site) const { + // If site.store is inside a block but outside a loop, the + // GPU store site should instead be the block because the shared + // mem allocation will be hoisted + bool type_has_been_set = false; + const LoopNest *candidate_block = loop; + while (candidate_block) { + if (candidate_block->gpu_label == thread) { + site.gpu_store_memory_type = 
GPUMemoryType::registers; + type_has_been_set = true; + break; + } + + if (candidate_block->is_root()) { + site.gpu_store_memory_type = GPUMemoryType::global; + type_has_been_set = true; + break; + } + + if (candidate_block->gpu_label == block) { + site.store = candidate_block; + site.gpu_store_memory_type = GPUMemoryType::shared; + type_has_been_set = true; + break; + } + + candidate_block = parent.at(candidate_block).first; + } + + internal_assert(type_has_been_set); +} + +bool State::compute_featurization(const FunctionDAG &dag, const MachineParams ¶ms, const Target& target, StageMap *features, Statistics& stats, bool verbose) const { + auto feature_root = get_root_for_features(params, target); + + StageMap sites; + sites.make_large(dag.nodes[0].stages[0].max_id); + features->make_large(dag.nodes[0].stages[0].max_id); + internal_assert(feature_root.defined()); + StageMap total_shared_mem_alloc_sizes; + total_shared_mem_alloc_sizes.make_large(dag.nodes[0].stages[0].max_id); + feature_root->get_sites(target, sites, total_shared_mem_alloc_sizes); + if (!feature_root->promote_allocs_to_registers(target, sites)) { + return false; + } + + // For the input nodes and unscheduled outputs, the compute + // and store sites are root, and the produce and innermost + // sites are unset (nullptr) + for (const auto &n : dag.nodes) { + if (n.is_input || n.is_output) { + for (const auto &stage : n.stages) { + auto &s = sites.get_or_create(&stage); + if (s.compute == nullptr) { + s.compute = feature_root.get(); + s.store = feature_root.get(); + s.gpu_store_memory_type = GPUMemoryType::global; + } + } + } + } + + // For the unscheduled nodes, give them sites as deep as they + // could possibly be. We'll ignore the possibility of inlining + // them for now. 
+ map> parent; + compute_loop_nest_parents(parent, feature_root.get(), 0); + for (const auto &n : dag.nodes) { + if (sites.contains(&(n.stages[0]))) { + continue; + } + const LoopNest *loop = nullptr; + for (const auto *e : n.outgoing_edges) { + const auto &consumer_site = sites.get(e->consumer); + const LoopNest *l = consumer_site.innermost; + if (!l) l = consumer_site.compute; + if (!l) { + if (aslog::aslog_level() > 0) { + dump(); + } + internal_error << e->producer->func.name() << " -> " << e->consumer->name << "\n"; + } + if (loop) { + if (consumer_site.inlined) { + // If this func is inlined, find the deepest common ancestor + // of all its inlined locations + for (const auto* innermost : consumer_site.inlined_innermosts) { + loop = deepest_common_ancestor(parent, innermost, loop); + } + } else { + loop = deepest_common_ancestor(parent, l, loop); + } + } else { + if (consumer_site.inlined) { + bool first = true; + // If this func is inlined, find the deepest common ancestor + // of all its inlined locations + for (const auto* innermost : consumer_site.inlined_innermosts) { + if (first) { + first = false; + loop = innermost; + continue; + } + + loop = deepest_common_ancestor(parent, innermost, loop); + } + } else { + loop = l; + } + } + } + internal_assert(loop) + << "Could not compute plausible site for unscheduled Func: " + << n.func.name() << "\n"; + + // If 'loop' would never be considered as a compute location (i.e. 
by + // LoopNest::compute_in_tiles()), walk up the loop nest until we reach a + // location that would be considered + loop = deepest_valid_compute_location(parent, n, loop, feature_root.get(), total_shared_mem_alloc_sizes); + int64_t num_realizations = total_loop_extents_of_ancestors(parent, loop); + + for (auto &stage : n.stages) { + auto &site = sites.get_or_create(&stage); + site.compute = loop; + site.store = loop; + site.num_realizations = num_realizations; + if (target.has_gpu_feature()) { + set_gpu_store_site(parent, loop, site); + } + } + } + + for (const auto& c : feature_root->children) { + sites.get(c->stage).hash_of_producers_stored_at_root = c->compute_hash_of_producers_stored_at_root(sites); + } + + Timer timer; + feature_root->compute_features(dag, params, target, sites, 1, 1, nullptr, nullptr, *feature_root, nullptr, nullptr, nullptr, features, {feature_root.get()}, true, total_shared_mem_alloc_sizes, stats, verbose); + + stats.featurization_time += timer.elapsed(); + ++stats.num_featurizations; + + for (const auto &n : dag.nodes) { + if (sites.get(&(n.stages[0])).produce == nullptr) { + internal_assert(!features->contains(&(n.stages[0]))) + << "Somehow an input or unscheduled node ended up in the featurization: " + << n.func.name() << "\n"; + } + } + + return true; +} + +void State::save_featurization(const FunctionDAG &dag, const MachineParams ¶ms, const Target& target, std::ostream &out) const { + StageMap features; + Statistics stats; + compute_featurization(dag, params, target, &features, stats); + + for (const auto &n : dag.nodes) { + if (n.is_input) continue; + for (size_t stage_idx = n.stages.size(); stage_idx > 0; stage_idx--) { + const auto &s = n.stages[stage_idx - 1]; + const size_t num_schedule_features = ScheduleFeatures::num_features(); + const size_t num_pipeline_features = PipelineFeatures::num_features(); + const auto &sched_feat = features.get(&s); + + float buf[num_schedule_features + num_pipeline_features]; + // Save them as 
floats + for (size_t i = 0; i < num_schedule_features; i++) { + buf[i] = sched_feat[i]; + } + + for (size_t i = 0; i < num_pipeline_features; i++) { + buf[i + num_schedule_features] = s.features[i]; + } + + out.write((const char *)buf, sizeof(buf)); + } + } +} + +bool State::contains_store_at(const set& outermost_store_at, const IntrusivePtr& parent) const { + for (const auto& c : parent->children) { + if (c->store_at.size() > 0) { + return true; + } + + // At production for c: if not store_at root or outermost, then it + // must implicitly be store_at parent's level, so reject it + bool at_production = c->node != parent->node; + if (at_production && root->store_at.count(c->node) == 0 && outermost_store_at.count(c->node) == 0) { + return true; + } + + if (contains_store_at(outermost_store_at, c)) { + return true; + } + } + + return false; +} + +// For GPU, only allow store_at root or inside the outermost loop nest. Any +// store_ats further in will be hoisted and expanded, increasing the +// amount of shared memory required. 
+bool State::contains_store_at_further_in_than_outermost() const { + for (const auto& child : root->children) { + for (const auto& grandchild : child->children) { + if (contains_store_at(child->store_at, grandchild)) { + return true; + } + } + } + return false; +} + + +bool State::has_dynamic_allocation_inside_thread() const { + return root->has_dynamic_allocation_inside_thread(false); +} + +bool State::exceeds_serial_extents_limit(const Target &target) const { + if (!target.has_gpu_feature()) { + return false; + } + + return root->exceeds_serial_extents_limit(target, nullptr, false); +} + +int64_t State::get_shared_mem_alloc_size(const LoopNest* block, const LoopNest* loop) const { + int64_t result = 0; + + if (loop->gpu_label == thread) { + return result; + } + + for (const auto *node : loop->store_at) { + const auto &bounds = block->get_bounds(node); + + int64_t alloc_size = node->bytes_per_point; + for (int i = 0; i < node->dimensions; i++) { + const auto &p = bounds->region_computed(i); + alloc_size *= p.extent(); + } + + if (node->dimensions > 0) { + result += alloc_size; + } + } + + for (const auto& c : loop->children) { + result += get_shared_mem_alloc_size(block, c.get()); + } + + return result; +} + +bool State::exceeds_shared_memory_limit(const Target &target) const { + if (!target.has_gpu_feature()) { + return false; + } + + static int64_t limit = get_shared_memory_limit(); + + if (limit == 0) { + return false; + } + + for (const auto& c : root->children) { + // If the working set is too large on the GPU, shared memory will be + // exhausted, so reject any such schedules + if (get_shared_mem_alloc_size(c.get(), c.get()) > limit) { + return true; + } + } + + return false; +} + +bool State::exceeds_local_memory_limit(const Target &target) const { + if (!target.has_gpu_feature()) { + return false; + } + + for (const auto& c : root->children) { + if (c->get_total_constant_local_mem_alloc_size() > get_stack_memory_limit()) { + return true; + } + + if 
(c->get_total_local_mem_alloc_size() > kLocalMemoryLimit) { + return true; + } + } + + return false; +} + +bool State::calculate_cost(const FunctionDAG &dag, const MachineParams ¶ms, const Target& target, CostModel *cost_model, Statistics& stats, bool verbose) { + Timer timer; + if (!root->has_valid_thread_extents()) { + Filter(root.get()) << "Invalid thread extents\n"; + return false; + } + + if (exceeds_shared_memory_limit(target)) { + Filter(root.get()) << "Exceeds shared memory limit\n"; + return false; + } + + if (exceeds_local_memory_limit(target)) { + Filter(root.get()) << "Exceeds local memory limit\n"; + return false; + } + + if (exceeds_serial_extents_limit(target)) { + Filter(root.get()) << "Exceeds serial loop extent limit\n"; + return false; + } + + stats.calculate_cost_time += timer.elapsed(); + + StageMap features; + + if (!compute_featurization(dag, params, target, &features, stats, verbose)) { + Filter(root.get()) << "Contains a local allocation that likely cannot be promoted to registers\n"; + return false; + } + + cost = 0; + + if (verbose) { + for (auto it = features.begin(); it != features.end(); it++) { + auto &stage = *(it.key()); + const auto &feat = it.value(); + std::string name = stage.node->func.name(); + sanitize_names(name); + aslog(0) << "Schedule features for " << name << "_s" << stage.index << "\n"; + feat.dump(); + } + } + + internal_assert(cost_model); + + // Perform some addition pruning before burdening the cost model with silly states + for (auto it = features.begin(); it != features.end(); it++) { + if (!it.key()->node->is_wrapper) { // It's OK to repeatedly stage data + auto &feat = it.value(); + if (should_always_consider_inline(it.key()->node)) { + continue; + } + + if (feat.points_computed_total + feat.inlined_calls > 10 * feat.points_computed_minimum) { + Filter(root.get()) << "Excess recompute for " << it.key()->node->func.name() << " stage " << it.key()->index << "\n" + << "points_computed_total = " << 
feat.points_computed_total << "\n" + << "inlined_calls = " << feat.inlined_calls << "\n" + << "points_computed_total + inlined_calls = " << feat.points_computed_total + feat.inlined_calls << "\n" + << "points_computed_minimum = " << feat.points_computed_minimum << "\n" + << "8 * points_computed_minimum = " << 8 * feat.points_computed_minimum << "\n"; + cost = 1e50; + return false; + } + } + } + + // Avoid code size explosion from recursive inlining. + if (root->max_inlined_calls() >= 256) { + cost = 1e50; + return false; + } + + cost_model->enqueue(dag, features, &cost, &cost_per_stage); + + return true; +} + +// Make a child copy of this state. The loop nest is const (we +// make mutated copies of it, rather than mutating it), so we can +// continue to point to the same one and so this is a cheap +// operation. +IntrusivePtr State::make_child() const { + State *s = new State; + s->parent = this; + s->root = root; + s->cost = cost; + s->cost_per_stage = cost_per_stage; + s->num_decisions_made = num_decisions_made; + s->always_consider_inline = always_consider_inline; + return s; +} + +void State::dump() const { + aslog(0) << "State with cost " << cost << ":\n"; + root->dump(); + aslog(0) << schedule_source; +} + +void State::print_compute_locations() const { + StageMap> descendants; + root->get_stages_computed_in_each_compute_root_loop(descendants); + + aslog(0) << "BEGIN compute locations\n"; + for (const auto& d : descendants) { + aslog(0) << d.first->sanitized_name << " -> "; + + for (const auto& descendant : d.second) { + aslog(0) << descendant.first->sanitized_name << " "; + } + + aslog(0) << "\n"; + } + aslog(0) << "END compute locations\n"; +} + +void State::fuse_gpu_blocks(LoopNest::StageScheduleState* state, Stage& stage, const vector& parallel_vars, const vector& parallel_extents, const vector& constant_extents) const { + if (parallel_vars.empty() || parallel_extents.empty()) { + return; + } + + constexpr int max_blocks[3] = {2147483647, 65535, 65535}; + 
int block_extents[3] = {1, 1, 1}; + + std::vector block_var_assignments[3]; + + // When parallel_vars/parallel_extents/constant_extents were created in apply_schedule, + // each entry was added in reverse order. Start from the end (the + // innermost dimension) and assign each var to a gpu_block. + int i = parallel_vars.size() - 1; + for (size_t block_i = 0; block_i < 3; ++block_i) { + for (; i >= 0 && parallel_extents[i] * block_extents[block_i] <= max_blocks[block_i]; --i) { + if (parallel_extents[i] > 1 || !constant_extents[i]) { + block_extents[block_i] *= parallel_extents[i]; + block_var_assignments[block_i].push_back(i); + + // Use a single block for the first 2 innermost dimensions. The + // remaining dimensions should all be assigned to the same block and + // fused + if (block_i < 2) { + --i; + break; + } + } + } + } + + bool marked = false; + for (size_t block_i = 0; block_i < 3; ++block_i) { + for (size_t i = 1; i < block_var_assignments[block_i].size(); ++i) { + auto inner_i = block_var_assignments[block_i][0]; + auto outer_i = block_var_assignments[block_i][i]; + state->schedule_source << "\n .fuse(" << parallel_vars[inner_i].name() + << ", " << parallel_vars[outer_i].name() + << ", " << parallel_vars[inner_i].name() << ")"; + stage.fuse(parallel_vars[inner_i], + parallel_vars[outer_i], + parallel_vars[inner_i]); + } + + if (block_var_assignments[block_i].size() > 0) { + auto inner_i = block_var_assignments[block_i][0]; + state->schedule_source << "\n .gpu_blocks(" << parallel_vars[inner_i].name() << ")"; + stage.gpu_blocks(parallel_vars[inner_i]); + state->parallel = true; + marked = true; + } + } + + if (!marked) { + bool all_one = true; + for (auto extent : parallel_extents) { + all_one = all_one && extent == 1; + } + + // If all the parallel extents = 1, just mark the innermost parallel_var + // as .gpu_block() + if (all_one) { + int i = parallel_vars.size() - 1; + state->schedule_source << "\n .gpu_blocks(" << parallel_vars[i].name() << ")"; + 
stage.gpu_blocks(parallel_vars[i]); + state->parallel = true; + } + } +} + +void State::mark_gpu_blocks(LoopNest::StageScheduleState* state, Stage& stage, const vector& parallel_vars, const vector& parallel_extents) const { + int max_blocks[3] = {2147483647, 65535, 65535}; + uint8_t n_loops_tagged_gpu_blocks = 0; + + for (auto& v : parallel_vars) { + if (n_loops_tagged_gpu_blocks >= 3 || parallel_extents[n_loops_tagged_gpu_blocks] > max_blocks[n_loops_tagged_gpu_blocks]) { + break; + } + + state->schedule_source << "\n .gpu_blocks(" << v.name() << ")"; + stage.gpu_blocks(v); + ++n_loops_tagged_gpu_blocks; + } + + if (n_loops_tagged_gpu_blocks > 0) { + state->parallel = true; + } +} + +bool State::mark_gpu_threads(LoopNest::StageScheduleState* state, Stage& stage, std::unordered_set& new_serial_vars, std::ostringstream& staged_funcs_schedule_source) const { + uint8_t num_loops_tagged_gpu_thread = 0; + int64_t total_threads = 1; + int max_threads[3] = {1024, 1024, 64}; + + bool first = true; + + for (const auto& v : state->vars) { + if (!v.exists || !v.gpu_threads || v.extent == 1) { + continue; + } + + if (num_loops_tagged_gpu_thread >= 3 || total_threads >= MAX_THREADS_PER_BLOCK || v.extent > max_threads[num_loops_tagged_gpu_thread]) { + break; + } + + Var new_outer(v.var.name() + "_serial_outer"); + new_serial_vars.insert(new_outer.name()); + stage.split(v.var, new_outer, v.var, (int)v.extent, TailStrategy::GuardWithIf); + stage.gpu_threads(v.var); + state->schedule_source << "\n .split(" << v.var.name() << ", " << new_outer.name() << ", " << v.var.name() << ", " << v.extent << ", TailStrategy::GuardWithIf)"; + state->schedule_source << "\n .gpu_threads(" << v.var.name() << ")"; + num_loops_tagged_gpu_thread++; + + if (first) { + first = false; + + Func func(state->node->func); + + for (const auto& to_be_staged : state->producers_to_be_staged) { + const auto* producer_node = to_be_staged.first; + + for (const auto& cur_pair : to_be_staged.second) { + const 
LoopNest* loop_nest = cur_pair.first; + const std::vector& edge_chain = cur_pair.second; + + internal_assert(edge_chain.at(0)->consumer == loop_nest->stage); + internal_assert(edge_chain.back()->producer == producer_node); + + if (edge_chain.size() > 1) { + std::string s = func.name(); + for (size_t i = 0; i < edge_chain.size() - 1; ++i) { + s = edge_chain.at(i)->producer->func.name() + ".clone_in(" + s + ")"; + } + aslog(0) << "Chain with length > 1: " << producer_node->func.name() << ".in(" << s << ")\n"; + continue; + } + + auto clone_in_chain = func; + auto clone_in_chain_source_str = func.name(); + + for (size_t i = 0; i < edge_chain.size() - 1; ++i) { + clone_in_chain = Func(edge_chain.at(i)->producer->func).clone_in(clone_in_chain); + clone_in_chain_source_str = edge_chain.at(i)->producer->func.name() + ".clone_in(" + clone_in_chain_source_str + ")"; + } + + Func producer(producer_node->func); + producer.in(clone_in_chain).store_in(MemoryType::Register).compute_at(func, v.var.var); + staged_funcs_schedule_source + << producer.name() + << ".in(" + << clone_in_chain_source_str + << ").store_in(MemoryType::Register).compute_at(" + << func.name() + << ", " + << v.var.var.name() + << ")"; + + const auto& bounds = loop_nest->get_bounds_along_edge_chain(producer_node, edge_chain); + + int i = 0; + for (const auto& l : producer_node->stages[0].loop) { + Var unrolled_var(l.var); + + int extent = bounds->region_required(i++).extent(); + producer.in(clone_in_chain).bound_extent(unrolled_var, extent); + staged_funcs_schedule_source + << "\n .bound_extent(" + << unrolled_var.name() + << ", " + << extent + << ")"; + + producer.in(clone_in_chain).unroll(unrolled_var); + staged_funcs_schedule_source << "\n .unroll(" << unrolled_var.name() << ")"; + } + staged_funcs_schedule_source << ";\n"; + } + } + } + } + + return num_loops_tagged_gpu_thread > 0; +} + +bool State::can_fuse_gpu(const vector& parallel_extents) const { + int64_t total = 1; + for (auto extent : 
parallel_extents) { + total *= extent; + } + + // Max grid size in x dimension + constexpr int64_t max_blocks = 2147483647; + return total < max_blocks; +} + +// Apply the schedule represented by this state to a Halide +// Pipeline. Also generate source code for the schedule for the +// user to copy-paste to freeze this schedule as permanent artifact. +void State::apply_schedule(const FunctionDAG &dag, const MachineParams ¶ms, const Target &target) { + StageMap> state_map; + std::vector ancestors; + + NodeMap all_inlined; + root->collect_all_inlined(all_inlined); + root->apply(LoopLevel::root(), state_map, params.parallelism, 0, nullptr, nullptr, target, ancestors, all_inlined); + + std::ostringstream src; + std::unordered_set new_serial_vars; + + src << "auto pipeline = get_pipeline();\n"; + + // Print handles for all the Funcs + int i = (int)(dag.nodes.size() - 1); + for (const auto &n : dag.nodes) { + if (!n.is_input) { + src << "Func " << n.func.name() << " = pipeline.get_func(" << i << ");\n"; + } + i--; + } + + // Gather all Vars and RVars so that we can declare them in the emitted source + map vars, rvars; + for (auto &p : state_map) { + for (auto &v : p.second->vars) { + if (v.exists) { + if (v.var.is_rvar) { + rvars.emplace(v.var.name(), v.accessor); + } else { + vars.emplace(v.var.name(), v.accessor); + } + } + } + } + if (!vars.empty()) { + for (const auto &p : vars) { + if (p.second.empty()) { + src << "Var " << p.first << "(\"" << p.first << "\");\n"; + } else { + src << "Var " << p.first << "(" << p.second << ");\n"; + } + } + } + if (!rvars.empty()) { + for (const auto &p : rvars) { + if (p.second.empty()) { + src << "RVar " << p.first << "(\"" << p.first << "\");\n"; + } else { + src << "RVar " << p.first << "(" << p.second << ");\n"; + } + } + } + + for (auto &p : state_map) { + if (p.first->node->is_input) continue; + + Stage stage(p.first->stage); + + // Do all the reorders and pick which vars to + // parallelize. 
+ vector vars; + vector parallel_vars; + vector parallel_extents; + vector constant_extents; + bool any_parallel_vars = false, any_parallel_rvars = false; + for (auto it = p.second->vars.rbegin(); it != p.second->vars.rend(); it++) { + if (!it->exists) continue; + if (!it->parallel) break; + any_parallel_rvars |= it->var.is_rvar; + any_parallel_vars |= !it->var.is_rvar; + parallel_extents.push_back(it->extent); + parallel_vars.push_back(it->var); + constant_extents.push_back(it->constant_extent); + } + + if (p.second->vars.size() > 1) { + p.second->schedule_source << "\n .reorder("; + bool first = true; + for (auto &v : p.second->vars) { + if (v.exists) { + vars.push_back(v.var); + p.second->ordered_vars.push_back(v); + if (!first) { + p.second->schedule_source << ", "; + } + first = false; + p.second->schedule_source << v.var.name(); + } + } + p.second->schedule_source << ")"; + stage.reorder(vars); + } + + // Halide doesn't let you fuse an RVar with a Var, even if + // they are both pure. 
+ bool can_fuse = !(any_parallel_vars && any_parallel_rvars); + if (can_fuse) { + fuse_gpu_blocks(p.second.get(), stage, parallel_vars, parallel_extents, constant_extents); + } else { + if (target.has_gpu_feature()) { + mark_gpu_blocks(p.second.get(), stage, parallel_vars, parallel_extents); + } else { + for (const auto &v : parallel_vars) { + p.second->schedule_source << "\n .parallel(" << v.name() << ")"; + stage.parallel(v); + } + } + } + + if (!parallel_vars.empty()) { + p.second->parallel = true; + } + + // Reorder the vector dimension innermost + if (p.first->index == 0 && p.second->vector_dim > 0) { + vector storage_vars = Func(p.first->node->func).args(); + for (int i = p.second->vector_dim; i > 0; i--) { + std::swap(storage_vars[i], storage_vars[i - 1]); + } + p.second->schedule_source << "\n .reorder_storage("; + bool first = true; + for (auto v : storage_vars) { + if (!first) { + p.second->schedule_source << ", "; + } + first = false; + p.second->schedule_source << v.name(); + } + p.second->schedule_source << ")"; + Func(p.first->node->func).reorder_storage(storage_vars); + } + } + + std::ostringstream staged_funcs_schedule_source; + + if (target.has_gpu_feature()) { + std::set invalid; + // Iterate from output backwards + for (const auto &n : dag.nodes) { + for (auto &p : state_map) { + if (&n != p.second->node) { + continue; + } + + if (p.first->node->is_input) continue; + + Stage stage(p.first->stage); + + // If at least one loop has been marked gpu_thread, we need to + // ensure that it is enclosed by a gpu_block loop. 
Check if this + // loop nest or one of its ancestors has been marked gpu_block + bool has_enclosing_parallel = p.second->parallel; + + if (!has_enclosing_parallel) { + for (auto* ancestor : p.second->ancestors) { + if (ancestor->parallel) { + has_enclosing_parallel = true; + break; + } + } + } + + bool thread_loop_exists = mark_gpu_threads(p.second.get(), stage, new_serial_vars, staged_funcs_schedule_source); + // The stage has no threads and no blocks. This is likely an update + // stage where the reduction is a serial loop + if (!thread_loop_exists && !has_enclosing_parallel) { + stage.gpu_single_thread(); + p.second->schedule_source << "\n .gpu_single_thread()"; + continue; + } + + if (!thread_loop_exists || has_enclosing_parallel) { + continue; + } + + // There is no outer loop marked as gpu_block. + // Split the outer loop to create a new outer var with + // extent = 1 and mark it gpu_blocks() + const auto& outer_var = p.second->ordered_vars.back(); + vector vars; + for (const auto& v : p.second->ordered_vars) { + vars.push_back(v.var); + } + + Var new_outer(outer_var.var.name() + "_outer"); + stage.split(outer_var.var, new_outer, outer_var.var, (int)outer_var.extent); + + new_serial_vars.insert(new_outer.name()); + p.second->schedule_source + << "\n .split(" + << outer_var.var.name() << ", " + << new_outer.name() << ", " + << outer_var.var.name() << ", " + << outer_var.extent << ")"; + + // If there are store_ats at Var::outermost(), we need to ensure + // that those store_ats are retained at the Var::outermost level + vars.push_back(new_outer); + vars.push_back(Var::outermost()); + + p.second->schedule_source << "\n .reorder("; + bool first = true; + + for (const auto& v : vars) { + if (!first) { + p.second->schedule_source << ", "; + } + if (v.name() == "__outermost") { + p.second->schedule_source << "Var::outermost()"; + } else { + p.second->schedule_source << v.name(); + } + first = false; + } + p.second->schedule_source << ")"; + + stage.reorder(vars); + 
stage.gpu_blocks(new_outer); + p.second->parallel = true; + p.second->schedule_source << "\n .gpu_blocks(" << new_outer.name() << ")"; + } + } + } + + for (const auto& v : new_serial_vars) { + src << "Var " << v << "(\"" << v << "\");\n"; + } + + for (auto &p : state_map) { + if (p.first->node->is_input) continue; + + // Dump the schedule source string + src << p.first->name + << p.second->schedule_source.str() + << ";\n"; + } + + src << staged_funcs_schedule_source.str(); + + // Sanitize the names of things to make them legal source code. + schedule_source = src.str(); + sanitize_names(schedule_source); +} + +bool State::should_always_consider_inline(const FunctionDAG::Node *node) const { + return always_consider_inline.contains(node) && always_consider_inline.get(node); +} + +void State::add_to_always_consider_inline_options(const FunctionDAG::Node *node) { + always_consider_inline.get_or_create(node) = true; +} + +void State::update_always_consider_inline_options(const FunctionDAG::Node *node) { + if (node->is_output) { + return; + } + + if (node->stages.size() > 1) { + return; + } + + if (is_func_trivial_to_inline(node->func)) { + always_consider_inline.get_or_create(node) = true; + return; + } + + if (node->is_pointwise) { + NodeMap currently_inlined; + root->collect_all_inlined(currently_inlined); + + std::unordered_set non_inlined_consumers; + std::unordered_set done; + std::vector pending; + pending.push_back(node); + + while (!pending.empty()) { + const auto *cur_node = pending.back(); + pending.pop_back(); + + if (done.count(cur_node)) { + continue; + } + done.insert(cur_node); + + for (const auto *e : cur_node->outgoing_edges) { + if (!currently_inlined.contains(e->consumer->node) || !currently_inlined.get(e->consumer->node)) { + non_inlined_consumers.insert(e->consumer->node); + continue; + } + + pending.push_back(e->consumer->node); + } + } + + if (non_inlined_consumers.size() > 1) { + return; + } + + internal_assert(non_inlined_consumers.size() == 1); 
+ always_consider_inline.get_or_create(node) = true; + } +} + +} // namespace Autoscheduler + +template<> +RefCount &ref_count(const Autoscheduler::State *t) noexcept { + return t->ref_count; +} + +template<> +void destroy(const Autoscheduler::State *t) { + delete t; +} + +} // namespace Internal +} // namespace Halide diff --git a/src/autoschedulers/anderson2021/State.h b/src/autoschedulers/anderson2021/State.h new file mode 100644 index 000000000000..33c05fa419f5 --- /dev/null +++ b/src/autoschedulers/anderson2021/State.h @@ -0,0 +1,253 @@ +#ifndef STATE_H +#define STATE_H + +#include "CostModel.h" +#include "DefaultCostModel.h" +#include "Featurization.h" +#include "FunctionDAG.h" +#include "LoopNest.h" +#include "PerfectHashMap.h" +#include "ASLog.h" +#include +#include +#include + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +using std::string; +using std::vector; +using std::map; +using std::pair; +using std::set; +using std::unordered_set; +using std::string; +using std::vector; + +bool verify_memoized_features(); + +bool is_memoize_blocks_enabled(); + +double get_stack_memory_adjustment_factor(); + +constexpr int kLocalMemoryLimit = 524288; // 512 KB + +// Stack memory limit = Total GPU Memory / (# of SMs * maximum threads per SM) +// = 103232 bytes +// Not all 103232 bytes will be free for allocations so reduce it by factor to +// allow a buffer +int64_t get_stack_memory_limit(); + +bool use_adjusted_tilings(); + +bool compute_root_and_inline_only(); + +struct NoOpMutator { + void operator()(LoopNest* new_loop_nest) const {} +}; + +template +void deep_copy_loop_nest(LoopNest* new_loop_nest, const LoopNest* new_loop_nest_parent, const IntrusivePtr& existing_loop_nest, const PostCreateMutator& post_create_mutator) { + new_loop_nest->copy_from(*existing_loop_nest); + + for (std::size_t i = 0, N = new_loop_nest->children.size(); i < N; ++i) { + LoopNest* new_child = new LoopNest; + new_loop_nest->children[i] = new_child; + 
deep_copy_loop_nest(new_child, new_loop_nest, existing_loop_nest->children[i], post_create_mutator); + } + + post_create_mutator(new_loop_nest); +} + +template +LoopNest* deep_copy_loop_nest(const IntrusivePtr& loop_nest, const PostCreateMutator& post_create_mutator) { + LoopNest* new_loop_nest = new LoopNest; + deep_copy_loop_nest(new_loop_nest, nullptr, loop_nest, post_create_mutator); + return new_loop_nest; +} + +struct State { + mutable RefCount ref_count; + IntrusivePtr root; + IntrusivePtr parent; + double cost = 0; + std::vector cost_per_stage; + NodeMap always_consider_inline; + int num_decisions_made = 0; + bool penalized = false; + string schedule_source; + + State() = default; + State(const State &) = delete; + State(State &&) = delete; + void operator=(const State &) = delete; + void operator=(State &&) = delete; + + uint64_t structural_hash(int depth) const; + + // Compute the parent and depth of every loop nest node + void compute_loop_nest_parents(map> &p, + const LoopNest *here, int depth) const; + + const LoopNest *deepest_common_ancestor(const map> &parent, + const LoopNest *a, const LoopNest *b) const; + + // We use the post_create_mutator so that the loop nests can be modified + // before they become IntrusivePtr as children and cannot be modified + template + LoopNest* create_feature_root(const PostCreateMutator& post_create_mutator) const { + LoopNest* new_root = new LoopNest; + deep_copy_loop_nest(new_root, nullptr, root, post_create_mutator); + return new_root; + } + + bool has_loop_nest_without_thread_loops() const; + + bool has_compute_root_loops_without_blocks() const; + + struct FeatureLoopNestMutator { + const MachineParams& params; + const Target& target; + + void operator()(LoopNest* new_loop_nest) const; + + // In phase 2, any compute_root loop marked 'none' will be split into + // blocks, threads, and serial loops. 
To enable the cost model to make a + // meaningful prediction on these pre-split loops, we assume a split into + // blocks and threads with a single full warp (if possible) + void split_compute_root_loops(LoopNest* loop_nest) const; + + // If a loop nest does not have thread loops, split the outermost serial + // loops to create thread loops with extents 1 + void add_outer_thread_loops(LoopNest* loop_nest) const; + }; + + IntrusivePtr get_root_for_features(const MachineParams ¶ms, const Target& target) const; + + void set_gpu_store_site(const map>& parent, const LoopNest* loop, LoopNest::Sites& site) const; + + bool compute_featurization(const FunctionDAG &dag, const MachineParams ¶ms, const Target& target, StageMap *features, Statistics& stats, bool verbose=false) const; + + void save_featurization(const FunctionDAG &dag, const MachineParams ¶ms, const Target& target, std::ostream &out) const; + + bool contains_store_at(const set& outermost_store_at, const IntrusivePtr& parent) const; + + // For GPU, only allow store_at root or inside the outermost loop nest. Any + // store_ats further in will be hoisted and expanded, increasing the + // amount of shared memory required. + bool contains_store_at_further_in_than_outermost() const; + + bool has_dynamic_allocation_inside_thread() const; + + bool exceeds_serial_extents_limit(const Target &target) const; + + int64_t get_shared_mem_alloc_size(const LoopNest* block, const LoopNest* loop) const; + + bool exceeds_shared_memory_limit(const Target &target) const; + + bool exceeds_local_memory_limit(const Target &target) const; + + bool calculate_cost(const FunctionDAG &dag, const MachineParams ¶ms, const Target& target, CostModel *cost_model, Statistics& stats, bool verbose = false); + + // Make a child copy of this state. The loop nest is const (we + // make mutated copies of it, rather than mutating it), so we can + // continue to point to the same one and so this is a cheap + // operation. 
+ IntrusivePtr make_child() const; + + void dump() const; + + void print_compute_locations() const; + + void fuse_gpu_blocks(LoopNest::StageScheduleState* state, Stage& stage, const vector& parallel_vars, const vector& parallel_extents, const vector& constant_extents) const; + + void mark_gpu_blocks(LoopNest::StageScheduleState* state, Stage& stage, const vector& parallel_vars, const vector& parallel_extents) const; + + bool mark_gpu_threads(LoopNest::StageScheduleState* state, Stage& stage, std::unordered_set& new_serial_vars, std::ostringstream& staged_funcs_schedule_source) const; + + bool can_fuse_gpu(const vector& parallel_extents) const; + + // Apply the schedule represented by this state to a Halide + // Pipeline. Also generate source code for the schedule for the + // user to copy-paste to freeze this schedule as permanent artifact. + void apply_schedule(const FunctionDAG &dag, const MachineParams ¶ms, const Target &target); + + bool should_always_consider_inline(const FunctionDAG::Node *node) const; + void add_to_always_consider_inline_options(const FunctionDAG::Node *node); + void update_always_consider_inline_options(const FunctionDAG::Node *node); + + const LoopNest *deepest_valid_compute_location(const map> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap& total_shared_mem_alloc_sizes) const; + int64_t total_loop_extents_of_ancestors(const map> &parent, const LoopNest *loop) const; +}; + +// A priority queue of states, sorted according to increasing +// cost. Never shrinks, to avoid reallocations. +// Can't use std::priority_queue because it doesn't support unique_ptr. 
+class StateQueue { +private: + struct CompareStates { + bool operator()(const IntrusivePtr &a, const IntrusivePtr &b) const { + return a->cost > b->cost; + } + }; + + std::vector> storage; + size_t sz = 0; + +public: + void emplace(IntrusivePtr &&s) { + if (sz >= storage.size()) { + storage.resize(std::max(sz * 2, (size_t)64)); + } + internal_assert(sz < storage.size()) << sz << " " << storage.size() << "\n"; + storage[sz] = std::move(s); + sz++; + std::push_heap(storage.begin(), storage.begin() + sz, CompareStates{}); + } + + IntrusivePtr pop() { + internal_assert(sz <= storage.size()) << sz << " " << storage.size() << "\n"; + std::pop_heap(storage.begin(), storage.begin() + sz, CompareStates{}); + sz--; + return std::move(storage[sz]); + } + + const IntrusivePtr &top() { + return storage[0]; + } + + bool empty() const { + return sz == 0; + } + + size_t size() const { + return sz; + } + + void swap(StateQueue &other) { + storage.swap(other.storage); + std::swap(sz, other.sz); + } + + IntrusivePtr operator[](int idx) const { + return storage[idx]; + } + + void resort() { + std::make_heap(storage.begin(), storage.begin() + sz, CompareStates{}); + } + + void clear() { + for (size_t i = 0; i < sz; i++) { + storage[i] = IntrusivePtr{}; + } + sz = 0; + } +}; + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide + +#endif // STATE_H diff --git a/src/autoschedulers/anderson2021/Statistics.h b/src/autoschedulers/anderson2021/Statistics.h new file mode 100644 index 000000000000..e8efee149fa2 --- /dev/null +++ b/src/autoschedulers/anderson2021/Statistics.h @@ -0,0 +1,133 @@ +#ifndef STATISTICS_H +#define STATISTICS_H + +#include +#include + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +using Clock = std::chrono::high_resolution_clock; + +template +struct ScopedStatistic { + const T& value; + std::string msg; + + ScopedStatistic(const T& value, const std::string& msg) + : value{value} + , msg{msg} + {} + + 
// Counters and accumulated timings gathered during the search.
// Durations are accumulated in seconds (as double); the total_*() accessors
// report them converted to whole milliseconds.
struct Statistics {
    int num_featurizations{0};
    int num_states_added{0};
    int num_block_memoization_hits{0};
    int num_block_memoization_misses{0};
    int num_memoized_featurizations{0};
    int num_memoization_hits{0};
    int num_memoization_misses{0};
    int num_tilings_accepted{0};
    int num_tilings_generated{0};
    std::chrono::duration<double> generate_children_time{0};
    std::chrono::duration<double> calculate_cost_time{0};
    std::chrono::duration<double> enqueue_time{0};
    std::chrono::duration<double> compute_in_tiles_time{0};
    std::chrono::duration<double> filter_thread_tiles_time{0};
    std::chrono::duration<double> filter_parallel_tiles_time{0};
    std::chrono::duration<double> feature_write_time{0};
    std::chrono::duration<double> featurization_time{0};
    int num_schedules_enqueued{0};
    std::chrono::duration<double> cost_model_evaluation_time{0};

    double total_generate_children_time() const {
        return std::chrono::duration_cast<std::chrono::milliseconds>(generate_children_time).count();
    }

    double total_compute_in_tiles_time() const {
        return std::chrono::duration_cast<std::chrono::milliseconds>(compute_in_tiles_time).count();
    }

    double total_filter_thread_tiles_time() const {
        return std::chrono::duration_cast<std::chrono::milliseconds>(filter_thread_tiles_time).count();
    }

    double total_filter_parallel_tiles_time() const {
        return std::chrono::duration_cast<std::chrono::milliseconds>(filter_parallel_tiles_time).count();
    }

    double total_feature_write_time() const {
        return std::chrono::duration_cast<std::chrono::milliseconds>(feature_write_time).count();
    }

    double total_calculate_cost_time() const {
        return std::chrono::duration_cast<std::chrono::milliseconds>(calculate_cost_time).count();
    }

    double total_featurization_time() const {
        return std::chrono::duration_cast<std::chrono::milliseconds>(featurization_time).count();
    }

    // Mean featurization time in ms; 0 when nothing has been featurized
    // (instead of dividing by zero and returning NaN).
    double average_featurization_time() const {
        if (num_featurizations == 0) {
            return 0.0;
        }
        return total_featurization_time() / (double)num_featurizations;
    }

    double total_enqueue_time() const {
        return std::chrono::duration_cast<std::chrono::milliseconds>(enqueue_time).count();
    }

    // Note: this total deliberately includes the time spent enqueuing
    // schedules in addition to the evaluation time itself.
    double total_cost_model_evaluation_time() const {
        return std::chrono::duration_cast<std::chrono::milliseconds>(enqueue_time + cost_model_evaluation_time).count();
    }

    // Mean cost-model time per enqueued schedule in ms; 0 when no schedule
    // has been enqueued (instead of dividing by zero).
    double average_cost_model_evaluation_time() const {
        if (num_schedules_enqueued == 0) {
            return 0.0;
        }
        return total_cost_model_evaluation_time() / (double)num_schedules_enqueued;
    }
};
Useful when computing + * GPU features + */ + +#include + +#include "Errors.h" +#include "FunctionDAG.h" + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +#define MAX_THREADS_PER_BLOCK 1024 + +struct LoopNest; + +// Sort / filter thread tile options +struct ThreadTileOption { + IntrusivePtr loop_nest; + double max_idle_lane_wastage; + bool operator<(const ThreadTileOption &other) const { + return max_idle_lane_wastage < other.max_idle_lane_wastage; + } + + // Ensure we don't accidentally copy this type + ThreadTileOption() = default; + ThreadTileOption(ThreadTileOption &&) = default; + ThreadTileOption &operator=(ThreadTileOption &&) = default; + ThreadTileOption(const ThreadTileOption &) = delete; + ThreadTileOption &operator=(const ThreadTileOption &) = delete; +}; + +struct ThreadInfo { + ThreadInfo(int vectorized_loop_index, const std::vector& size, const std::vector& loop, const std::vector& max_thread_counts) { + init_threads_in_this_block(max_thread_counts); + + std::size_t num_thread_loops = 0; + + if (vectorized_loop_index != -1 && size[vectorized_loop_index] != 1) { + threads[num_thread_loops] = size[vectorized_loop_index]; + num_threads *= size[vectorized_loop_index]; + num_thread_loops = 1; + loop_indices.push_back(vectorized_loop_index); + loop_vars.push_back(loop[vectorized_loop_index].var); + } + + for (std::size_t i = 0; i < size.size() && num_thread_loops < 3; i++) { + if (size[i] == 1 || (int)i == vectorized_loop_index) { + continue; + } + + if (num_threads * size[i] > MAX_THREADS_PER_BLOCK) { + break; + } + + threads[num_thread_loops] = size[i]; + num_threads *= size[i]; + ++num_thread_loops; + loop_indices.push_back(i); + loop_vars.push_back(loop[i].var); + } + + if (loop_indices.size() == 0) { + internal_assert(size.size() > 0); + ++num_thread_loops; + loop_indices.push_back(0); + loop_vars.push_back(loop[0].var); + } + + internal_assert(num_threads <= num_threads_in_this_block); + internal_assert(loop_indices.size() == 
num_thread_loops); + internal_assert(loop_vars.size() == num_thread_loops); + internal_assert(loop_indices.size() > 0 && loop_indices.size() <= 3); + internal_assert(loop_vars.size() > 0 && loop_vars.size() <= 3); + + count_num_active_warps_per_block(); + } + + template + void for_each_thread_id(const Fn& fn) const { + int thread_id = 0; + for (int z = 0; z < threads_in_this_block[2]; z++) { + for (int y = 0; y < threads_in_this_block[1]; y++) { + for (int x = 0; x < threads_in_this_block[0]; x++) { + // Skip any threads in this loop nest with extent less than the + // extents of the largest thread loops in this block + // for thread.x in [0, 10]: + // ... + // for thread.x in [0, 5]: + // ... + // For the 2nd loop, skip threads with x id >= 5 + bool active = x < threads[0] + && y < threads[1] + && z < threads[2]; + + fn(thread_id, active, thread_id == num_threads_in_this_block - 1); + ++thread_id; + } + } + } + } + + template + void for_each_thread_id_in_first_warp(Fn& fn) const { + int thread_id = 0; + for (int z = 0; z < threads_in_this_block[2]; z++) { + for (int y = 0; y < threads_in_this_block[1]; y++) { + for (int x = 0; x < threads_in_this_block[0]; x++) { + // Skip any threads in this loop nest with extent less than the + // extents of the largest thread loops in this block + // for thread.x in [0, 10]: + // ... + // for thread.x in [0, 5]: + // ... 
+ // For the 2nd loop, skip threads with x id >= 5 + bool active = x < threads[0] + && y < threads[1] + && z < threads[2]; + + bool last_thread = thread_id == 31; + fn(thread_id, x, y, z, active, last_thread); + ++thread_id; + + if (last_thread) { + return; + } + } + } + } + } + + template + void for_each_thread_id_in_tail_warp(Fn& fn) const { + int thread_id = final_warp_initial_thread_id; + int last_thread_id = thread_id + num_threads_in_final_warp - 1; + + for (; thread_id <= last_thread_id; ++thread_id) { + int z = thread_id / (threads_in_this_block[1] * threads_in_this_block[0]); + int y = (thread_id - z * threads_in_this_block[1] * threads_in_this_block[0]) / threads_in_this_block[0]; + int x = thread_id % threads_in_this_block[0]; + + internal_assert(z < threads_in_this_block[2]); + internal_assert(y < threads_in_this_block[1]); + internal_assert(x < threads_in_this_block[0]); + + bool active = x < threads[0] + && y < threads[1] + && z < threads[2]; + + fn(thread_id, x, y, z, active, thread_id == last_thread_id); + } + } + + template + void for_each_active_thread_id(const Fn& fn) const { + for_each_thread_id([&](int thread_id, bool is_active, bool is_last_thread) { + if (!is_active) { + return; + } + + fn(thread_id, is_last_thread); + }); + } + + double warp_lane_utilization() const { + return (double)num_active_threads / (double)(num_active_warps_per_block * 32); + } + + double idle_lane_wastage() const { + return ((double)(num_active_warps_per_block * 32) - (double)num_active_threads) / MAX_THREADS_PER_BLOCK; + } + + double block_occupancy() const { + return (double)num_threads / MAX_THREADS_PER_BLOCK; + } + + int num_warps_per_block = 0; + int num_active_warps_per_block = 0; + int num_regular_active_warps_per_block = 0; + bool has_tail_warp = false; + int final_warp_initial_thread_id = 0; + int num_threads_in_final_warp = 0; + + int threads_in_this_block[3] = {1, 1, 1}; + int64_t num_threads_in_this_block = 1; + + int threads[3] = {1, 1, 1}; + int64_t 
num_threads = 1; + int64_t num_active_threads = 0; + + std::vector loop_indices; + std::vector loop_vars; + +private: + void init_threads_in_this_block(const std::vector& max_thread_counts) { + int num_thread_loops = 0; + for (auto c : max_thread_counts) { + if (c == 1) { + continue; + } + + if (num_thread_loops >= 3 || num_threads_in_this_block * c > MAX_THREADS_PER_BLOCK) { + break; + } + + threads_in_this_block[num_thread_loops] = c; + num_threads_in_this_block *= c; + ++num_thread_loops; + } + + num_warps_per_block = num_threads_in_this_block / 32; + if (num_threads_in_this_block % 32 != 0) { + num_warps_per_block++; + } + } + + void count_num_active_warps_per_block() { + bool current_warp_is_active = false; + int num_active_threads_in_cur_warp = 0; + int num_active_threads_in_first_warp = 0; + int num_threads_in_cur_warp = 0; + bool first_warp = true; + + for_each_thread_id([&](int thread_id, bool is_active, bool is_last_thread) { + current_warp_is_active |= is_active; + + if (is_active) { + ++num_active_threads_in_cur_warp; + ++num_active_threads; + } + ++num_threads_in_cur_warp; + + if ((thread_id + 1) % 32 == 0 || is_last_thread) { + if (current_warp_is_active) { + ++num_active_warps_per_block; + + if (first_warp) { + first_warp = false; + num_active_threads_in_first_warp = num_active_threads_in_cur_warp; + } + + if (is_last_thread) { + num_threads_in_final_warp = num_threads_in_cur_warp; + has_tail_warp = num_active_threads_in_first_warp != num_active_threads_in_cur_warp; + final_warp_initial_thread_id = thread_id - num_threads_in_cur_warp + 1; + + internal_assert(num_threads_in_final_warp <= 32); + } + } + + current_warp_is_active = false; + num_threads_in_cur_warp = 0; + num_active_threads_in_cur_warp = 0; + } + }); + + num_regular_active_warps_per_block = num_active_warps_per_block; + if (has_tail_warp) { + --num_regular_active_warps_per_block; + } + } +}; + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide + +#endif // 
// Returns true when every entry of nums equals 1 (vacuously true for an
// empty vector).
bool all_ones(const std::vector<int64_t> &nums) {
    for (const auto &n : nums) {
        if (n != 1) {
            return false;
        }
    }
    return true;
}

// Returns true when the first s.size() entries of nums are equal to s.
// A nums that is shorter than s can never match; it is rejected up front
// instead of being indexed out of bounds.
bool equal_to_existing_size(const std::vector<int64_t> &s, const std::vector<int64_t> &nums) {
    if (nums.size() < s.size()) {
        return false;
    }
    for (size_t i = 0; i < s.size(); ++i) {
        if (s[i] != nums[i]) {
            return false;
        }
    }
    return true;
}

// Recursively enumerates candidate outer extents for dimensions [0, d] of
// the box s.
//
//   s                          extents of the box being tiled
//   d                          dimension handled by this call (-1 ends the recursion)
//   last_d                     dimension at which complete tilings are filtered
//   vectorized_index           dimension that additionally tries vec_dim_serial_sizes
//   vec_dim_serial_sizes       extra inner sizes tried for the vectorized dimension
//   filter_small_outer_extents drop vectorized-dimension candidates whose outer extent is < 16
//   allow_inner_ones           keep complete tilings whose outer extents equal s
//
// Each returned vector holds one outer extent per dimension in [0, d].
std::vector<std::vector<int64_t>> generate_serial_tilings(const std::vector<int64_t> &s, int d,
                                                          int last_d,
                                                          int vectorized_index,
                                                          const std::vector<int> &vec_dim_serial_sizes,
                                                          bool filter_small_outer_extents,
                                                          bool allow_inner_ones) {
    std::vector<std::vector<int64_t>> result;
    if (d == -1) {
        result.emplace_back();
        return result;
    }

    const std::vector<std::vector<int64_t>> v =
        generate_serial_tilings(s, d - 1, last_d, vectorized_index, vec_dim_serial_sizes,
                                filter_small_outer_extents, allow_inner_ones);

    // Copy each partial tiling: it is extended with several candidate
    // extents for dimension d, one result entry per candidate.
    for (auto t : v) {
        t.push_back(0);
        bool used_full_extent = false;

        // include odd serial sizes that encourage multiples of 16 as thread tile size
        if (!vec_dim_serial_sizes.empty() && d == vectorized_index) {
            for (int inner : vec_dim_serial_sizes) {
                // 64-bit so the extent is not narrowed before it is stored.
                const int64_t outer = (s[d] + inner - 1) / inner;
                if (filter_small_outer_extents && outer < 16) {
                    continue;
                }
                t.back() = outer;

                if (d == last_d && (equal_to_existing_size(s, t) || all_ones(t))) {
                    continue;
                }
                // NOTE(review): this is overwritten on every accepted
                // candidate, so it only reflects the last one accepted --
                // confirm that is intended.
                used_full_extent = inner == s[d];
                result.push_back(t);
            }
        }

        // Extents 3, 5 and 7 are tried as {1, s[d]}; everything else uses
        // the power-of-two inner sizes 1, 2, 4, 8.
        const bool small_odd = (s[d] == 3 || s[d] == 5 || s[d] == 7);
        const int64_t max_inner = small_odd ? s[d] : 8;
        const int64_t step = small_odd ? s[d] : 2;

        for (int64_t inner = 1; inner <= max_inner; inner *= step) {
            if (inner > s[d]) {
                break;
            }
            if (inner == s[d] && used_full_extent) {
                continue;
            }
            const int64_t outer = (s[d] + inner - 1) / inner;
            if (d == vectorized_index && filter_small_outer_extents && outer < 16) {
                continue;
            }
            t.back() = outer;
            if (d == last_d && ((!allow_inner_ones && equal_to_existing_size(s, t)) || all_ones(t))) {
                continue;
            }
            result.push_back(t);
        }
    }
    return result;
}

// Given a multi-dimensional box of dimensionality d, generate a list
// of candidate tile sizes for it, logarithmically spacing the sizes
// using the given factor. If 'allow_splits' is false, every dimension
// must either be one, or the full extent of the box. This function is
// used to generate candidate tilings when tiling for
// producer-consumer fusion, or tiling for parallelism.
// inner_sizes is an optional vector of fixed sizes to choose from for the
// inner loop. It is used for GPU schedules when we split a 'none' loop into
// a parallel loop and a serial loop.
std::vector<std::vector<int64_t>> generate_tilings(const std::vector<int64_t> &s, int d, int factor,
                                                   bool allow_splits,
                                                   const std::vector<int> &inner_sizes) {
    std::vector<std::vector<int64_t>> result;
    if (d == -1) {
        result.emplace_back();
        return result;
    }

    // NOTE(review): inner_sizes is not forwarded, so only the dimension
    // handled by the outermost call uses the fixed inner sizes. That matches
    // the previous behaviour (which relied on the defaulted parameter);
    // confirm it is intended.
    std::vector<std::vector<int64_t>> v =
        generate_tilings(s, d - 1, factor, allow_splits, std::vector<int>());

    // If we've already generated too many tiling configurations
    // for the inner loops, search the outer loops with coarser
    // granularity.
    while (v.size() > (size_t)factor * 100) {
        factor *= 2;
    }

    for (auto &t : v) {
        bool is_full = false, is_one = false;
        // Skip trivial tilings (all ones, or all full extents). This can
        // only be decided once the last dimension is being filled in.
        if ((size_t)d == s.size() - 1) {
            is_one = is_full = true;
            for (int i = 0; i < d; i++) {
                is_one &= (t[i] == 1);
                is_full &= (t[i] == s[i]);
            }
        }
        t.push_back(0);

        if (!allow_splits) {
            if (!is_one) {
                t.back() = 1;
                result.push_back(t);
            }
            if (s[d] != 1 && !is_full) {
                t.back() = s[d];
                result.push_back(t);
            }
        } else if (!inner_sizes.empty()) {
            // using fixed set of inner loop extents
            for (int inner : inner_sizes) {
                const int64_t outer = (s[d] + inner - 1) / inner;
                if (is_one && outer == 1) continue;
                if (is_full && outer == s[d]) continue;
                t.back() = outer;
                result.push_back(t);
            }
        } else {
            // All tile arithmetic below is 64-bit: the extents are int64_t
            // and inner * outer * 7 overflows int for large extents.
            int64_t max_inner = 0;
            for (int64_t inner = 1; inner < s[d]; inner *= factor) {
                const int64_t outer = (s[d] + inner - 1) / inner;
                if (is_one && outer == 1) continue;
                if (is_full && outer == s[d]) continue;
                // Stop when we hit inner sizes that would do too much recompute
                if (inner > 1 && inner * outer * 7 > s[d] * 8) break;
                max_inner = inner;
                t.back() = outer;
                result.push_back(t);
            }

            for (int64_t outer = 1; outer <= s[d]; outer *= factor) {
                const int64_t inner = (s[d] + outer - 1) / outer;
                if (is_one && outer == 1) continue;
                if (is_full && outer == s[d]) continue;
                // Stop when we get into the regime covered by the loop above.
                if (outer > 1 && inner < max_inner * 2) break;
                // Or when the wasted compute gets too bad.
                if (inner * outer * 7 > s[d] * 8) break;
                t.back() = outer;
                result.push_back(t);
            }

            // The sequence above (in terms of the inner loop)
            // goes 1 2 4 8 16 ... but 3 is an important inner
            // tiling factor for matrix multiply/gemm-type loops
            // which try to use 12 vector registers.
            const int64_t inner3 = 3;
            const int64_t outer3 = (s[d] + inner3 - 1) / inner3;
            if (factor == 2 && inner3 < s[d] && outer3 < s[d] && outer3 > 1) {
                if (inner3 * outer3 * 7 <= s[d] * 8) {
                    t.back() = outer3;
                    result.push_back(t);
                }
            }
        }
    }
    return result;
}

// Moves the vectorized dimension first and also removes dimensions with
// size 1 to reflect actual thread dimensions when loop nests are lowered.
// Results are appended to lowered_size; a negative vector_loop_i means
// there is no vectorized dimension to move.
void lowered_dims(const std::vector<int64_t> &size, int vector_loop_i, std::vector<int64_t> &lowered_size) {
    if (vector_loop_i >= 0 && size[vector_loop_i] > 1) {
        lowered_size.push_back(size[vector_loop_i]);
    }
    for (int dim = 0; dim < (int)(size.size()); dim++) {
        if (dim != vector_loop_i && size[dim] > 1) {
            lowered_size.push_back(size[dim]);
        }
    }
}
+vector> generate_gpu_tilings(const vector> &stage_sizes, + const vector> &pure_dims, + const vector &max_s, + int d, const vector &vectorized_indices, bool serial_inner, bool is_compute_root_stage) { + vector> result; + if (d == -1) { + result.push_back(vector()); + } else { + // set max thread count 64 for now in all dims + int64_t max_threads_extent = 64, total_threads_limit = 1024; // less than 1024 to limit states + int factor = 2, innermost_warp_extent = 16, max_serial_ext = 16; + + if (is_compute_root_stage && pure_dims[0].size() == 1) { + innermost_warp_extent = 1; + } + + vector> v; + v = generate_gpu_tilings(stage_sizes, pure_dims, max_s, d - 1, vectorized_indices, serial_inner, is_compute_root_stage); + + for (auto t : v) { + enum validity { serial_count_err, + thread_count_err, + valid_tiling }; + + // helper function detects whether tiling is legal: cannot exceed max thread count, + // have more than three dimensions with ext > 1, or result in large serial loops + std::function is_valid_tiling = [&]() { + if (d == ((int)(stage_sizes[0].size()) - 1)) { + vector lowered_size, thread_t; + thread_t = t; + lowered_dims(thread_t, vectorized_indices[0], lowered_size); + // see how tiling will be applied to other stages of this func and update max_s accordingly + vector new_max_s = max_s; + for (size_t stage = 0; stage < pure_dims.size(); stage++) { + vector stage_thread_t, stage_lowered_size; + for (size_t i = 0; i < pure_dims[stage].size(); i++) { + if (pure_dims[stage][i] >= 0) { + stage_thread_t.push_back(thread_t[pure_dims[stage][i]]); + } else { // impure dims have extent 1 + stage_thread_t.push_back(1); + } + } + lowered_dims(stage_thread_t, vectorized_indices[stage], stage_lowered_size); + // adjust max_size to account for other stages thread counts when we apply this tiling + for (size_t dim = 0; dim < stage_lowered_size.size(); dim++) { + if (dim >= new_max_s.size()) { + new_max_s.push_back(stage_lowered_size[dim]); + } else { + new_max_s[dim] = 
std::max(new_max_s[dim], stage_lowered_size[dim]); + } + } + } + int64_t union_threads; + int64_t total_threads_used = 1, not_ext1 = 0; + int max_dim = std::max((int)(new_max_s.size()), (int)(lowered_size.size())); + for (int dim = 0; dim < max_dim; dim++) { + if (dim >= (int)(new_max_s.size())) { + union_threads = lowered_size[dim]; + } else if (dim >= (int)(lowered_size.size())) { + union_threads = new_max_s[dim]; + } else { + union_threads = std::max(lowered_size[dim], new_max_s[dim]); + } + not_ext1 = not_ext1 + ((union_threads > 1) ? 1 : 0); + total_threads_used *= union_threads; + } + if (total_threads_used > total_threads_limit || not_ext1 > 3) { + return thread_count_err; + } + if (serial_inner) { + for (int dd = 0; dd < (int)(stage_sizes[0].size()); dd++) { + int64_t other_ext = (stage_sizes[0][dd] + t[dd] - 1) / t[dd]; + if (other_ext > max_serial_ext) { + return serial_count_err; + } + } + } + } + return valid_tiling; + }; + + t.push_back(0); + + // if the vector dimension has extent < innermost_warp_extent we use 1 warp for it + int64_t min_threads = (d == vectorized_indices[0]) ? 
innermost_warp_extent : 1; + bool full_extent_considered = false; + + for (int64_t threads_ext = min_threads; threads_ext <= max_threads_extent; threads_ext *= factor) { + full_extent_considered |= threads_ext == stage_sizes[0][d]; + if (threads_ext > stage_sizes[0][d]) { + break; + } + // reject if inner exceeds hardware thread limit + if ((d == vectorized_indices[0] && threads_ext > max_threads_extent) || (d != vectorized_indices[0] && threads_ext > 16)) { + break; + } + int64_t other_ext = (stage_sizes[0][d] + threads_ext - 1) / threads_ext; + if (d != vectorized_indices[0] && threads_ext > 1 && threads_ext * other_ext * 7 > stage_sizes[0][d] * 8) break; + t.back() = threads_ext; + validity valid_result = is_valid_tiling(); + if (valid_result == serial_count_err) { + continue; + } else if (valid_result == thread_count_err) { + break; + } else { + result.push_back(t); + } + + if (threads_ext >= stage_sizes[0][d]) { + break; + } + } + + if (!full_extent_considered && stage_sizes[0][d] < max_threads_extent) { + t.back() = stage_sizes[0][d]; + validity valid_result = is_valid_tiling(); + if (valid_result != serial_count_err && valid_result != thread_count_err) { + result.push_back(t); + } + } + } + } + return result; +} + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide + diff --git a/src/autoschedulers/anderson2021/Tiling.h b/src/autoschedulers/anderson2021/Tiling.h new file mode 100644 index 000000000000..031f91ab9049 --- /dev/null +++ b/src/autoschedulers/anderson2021/Tiling.h @@ -0,0 +1,58 @@ +#ifndef TILING_H +#define TILING_H + +#include +#include + +using std::vector; + +namespace Halide { +namespace Internal { +namespace Autoscheduler { + +bool all_ones(const std::vector& nums); + +bool equal_to_existing_size(const std::vector& s, const std::vector& nums); + +vector> generate_serial_tilings(const vector &s, int d, + int last_d, + int vectorized_index, + const vector &vec_dim_serial_sizes, + bool filter_small_outer_extents=false, 
+ bool allow_inner_ones=false); + +// Given a multi-dimensional box of dimensionality d, generate a list +// of candidate tile sizes for it, logarithmically spacing the sizes +// using the given factor. If 'allow_splits' is false, every dimension +// must either be one, or the full extent of the box. This function is +// used to generate candidate tilings when tiling for +// producer-consumer fusion, or tiling for parallelism. +// inner_sizes is optional vector of fixed sizes to choose from for inner loop. +// used for GPU schedules when we split a 'none' loop into a parallel loop and a serial loop +vector> generate_tilings(const vector &s, int d, int factor, + bool allow_splits, + const vector &inner_sizes = vector()); + + +/** moves vectorized dimension first and also removes dimensions with size 1 + to reflect actual thread dimensions when loop nests are lowered **/ +void lowered_dims(const vector &size, int vector_loop_i, vector &lowered_size); + +// creates tilings for gpu threads loops. +// Innermost thread loop is always the vectorized dim and its extent is a multiple of 32. +// Other loop extents are sized to be powers of 2 such that total extent is < 1024 +// called either when we are creating parallel -> (blocks, threads) loop when computing at root +// OR when we are creating none -> (threads, SIMD) loop when computing at a serial loop +// serial_inner = True when we're generating (thread, serial) tilings, False when generating (block,thread) tilings +// max_s hold max gpu_thread counts of all siblings in each dimension. Used to make sure union of +// thread counts is under 1024 threshold. 
+vector> generate_gpu_tilings(const vector> &stage_sizes, + const vector> &pure_dims, + const vector &max_s, + int d, const vector &vectorized_indices, bool serial_inner, bool is_compute_root_stage); + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide + +#endif // TILING_H diff --git a/src/autoschedulers/anderson2021/Weights.cpp b/src/autoschedulers/anderson2021/Weights.cpp new file mode 100644 index 000000000000..adf419f0f7ea --- /dev/null +++ b/src/autoschedulers/anderson2021/Weights.cpp @@ -0,0 +1,170 @@ +#include +#include +#include + +#include "Featurization.h" +#include "HalideBuffer.h" +#include "NetworkSize.h" +#include "Weights.h" + +namespace Halide { +namespace Internal { + +using Halide::Runtime::Buffer; + +constexpr uint32_t kSignature = 0x68776631; + +void Weights::randomize(uint32_t seed) { + std::mt19937 rng(seed); + // Fill the weights with random values + for_each_buffer([&rng](Buffer &w) { + w.for_each_value([&rng](float &f) { + f = ((float)rng()) / ((float)rng.max()) - 0.5f; + }); + }); +} + +/* + Structure of the .weights file format: + + uint32 signature always 0x68776631 ('hwf1') + uint32 PipelineFeatures::version + uint32 ScheduleFeatures::version + uint32 buffer-count + uint32 dimension-count + uint32x(dimension-count) dimension-extent + float32x(element-count) data + + (all values little-endian) +*/ + +bool Weights::load(std::istream &i) { + uint32_t signature; + i.read((char *)&signature, sizeof(signature)); + if (i.fail() || signature != kSignature) return false; + + i.read((char *)&pipeline_features_version, sizeof(pipeline_features_version)); + if (i.fail()) return false; + + i.read((char *)&schedule_features_version, sizeof(schedule_features_version)); + if (i.fail()) return false; + + uint32_t buffer_count; + i.read((char *)&buffer_count, sizeof(buffer_count)); + if (i.fail() || buffer_count != 6) return false; + + const auto load_one = [&i](Buffer &buf) -> bool { + uint32_t dimension_count; + i.read((char 
*)&dimension_count, sizeof(dimension_count)); + if (i.fail() || dimension_count != (uint32_t)buf.dimensions()) return false; + for (uint32_t d = 0; d < dimension_count; d++) { + uint32_t extent; + i.read((char *)&extent, sizeof(extent)); + if (i.fail() || (int)extent != (int)buf.extent(d)) return false; + } + i.read((char *)(buf.data()), buf.size_in_bytes()); + if (i.fail()) return false; + return true; + }; + + if (!load_one(head1_filter)) return false; + if (!load_one(head1_bias)) return false; + if (!load_one(head2_filter)) return false; + if (!load_one(head2_bias)) return false; + if (!load_one(conv1_filter)) return false; + if (!load_one(conv1_bias)) return false; + + return true; +} +bool Weights::load_from_file(const std::string &filename) { + std::ifstream i(filename, std::ios_base::binary); + return load(i); +} + +bool Weights::save(std::ostream &o) const { + const uint32_t signature = kSignature; + o.write((const char *)&signature, sizeof(signature)); + if (o.fail()) return false; + + o.write((const char *)&pipeline_features_version, sizeof(pipeline_features_version)); + if (o.fail()) return false; + + o.write((const char *)&schedule_features_version, sizeof(schedule_features_version)); + if (o.fail()) return false; + + const uint32_t buffer_count = 6; + o.write((const char *)&buffer_count, sizeof(buffer_count)); + if (o.fail()) return false; + + const auto save_one = [&o](const Buffer &buf) -> bool { + const uint32_t dimension_count = buf.dimensions(); + o.write((const char *)&dimension_count, sizeof(dimension_count)); + if (o.fail()) return false; + for (uint32_t d = 0; d < dimension_count; d++) { + uint32_t extent = buf.extent(d); + o.write((const char *)&extent, sizeof(extent)); + if (o.fail()) return false; + } + o.write((const char *)(buf.data()), buf.size_in_bytes()); + if (o.fail()) return false; + return true; + }; + + if (!save_one(head1_filter)) return false; + if (!save_one(head1_bias)) return false; + if (!save_one(head2_filter)) return 
false; + if (!save_one(head2_bias)) return false; + if (!save_one(conv1_filter)) return false; + if (!save_one(conv1_bias)) return false; + + return true; +} + +bool Weights::save_to_file(const std::string &filename) const { + std::ofstream o(filename, std::ios_base::trunc | std::ios_base::binary); + return save(o); +} + +bool Weights::load_from_dir(const std::string &dir) { + const auto buffer_from_file = [](const std::string &filename, Buffer &buf) -> bool { + std::ifstream i(filename, std::ios_base::binary); + i.read((char *)(buf.data()), buf.size_in_bytes()); + i.close(); + if (i.fail()) return false; + return true; + }; + + if (!buffer_from_file(dir + "/head1_conv1_weight.data", head1_filter)) return false; + if (!buffer_from_file(dir + "/head1_conv1_bias.data", head1_bias)) return false; + if (!buffer_from_file(dir + "/head2_conv1_weight.data", head2_filter)) return false; + if (!buffer_from_file(dir + "/head2_conv1_bias.data", head2_bias)) return false; + if (!buffer_from_file(dir + "/trunk_conv1_weight.data", conv1_filter)) return false; + if (!buffer_from_file(dir + "/trunk_conv1_bias.data", conv1_bias)) return false; + + // Old style data doesn't record the versions, so just assume they are current + pipeline_features_version = PipelineFeatures::version(); + schedule_features_version = ScheduleFeatures::version(); + + return true; +} + +bool Weights::save_to_dir(const std::string &dir) const { + const auto buffer_to_file = [](const Buffer &buf, const std::string &filename) -> bool { + std::ofstream o(filename, std::ios_base::trunc | std::ios_base::binary); + o.write((const char *)(buf.data()), buf.size_in_bytes()); + o.close(); + if (o.fail()) return false; + return true; + }; + + if (!buffer_to_file(head1_filter, dir + "/head1_conv1_weight.data")) return false; + if (!buffer_to_file(head1_bias, dir + "/head1_conv1_bias.data")) return false; + if (!buffer_to_file(head2_filter, dir + "/head2_conv1_weight.data")) return false; + if 
(!buffer_to_file(head2_bias, dir + "/head2_conv1_bias.data")) return false; + if (!buffer_to_file(conv1_filter, dir + "/trunk_conv1_weight.data")) return false; + if (!buffer_to_file(conv1_bias, dir + "/trunk_conv1_bias.data")) return false; + return true; +} + +} // namespace Internal +} // namespace Halide diff --git a/src/autoschedulers/anderson2021/Weights.h b/src/autoschedulers/anderson2021/Weights.h new file mode 100644 index 000000000000..c2d2220a03c2 --- /dev/null +++ b/src/autoschedulers/anderson2021/Weights.h @@ -0,0 +1,54 @@ +#ifndef _WEIGHTS +#define _WEIGHTS + +#include +#include +#include + +#include "Featurization.h" +#include "HalideBuffer.h" +#include "NetworkSize.h" + +namespace Halide { +namespace Internal { + +struct Weights { + uint32_t pipeline_features_version = PipelineFeatures::version(); + uint32_t schedule_features_version = ScheduleFeatures::version(); + + Halide::Runtime::Buffer head1_filter{head1_channels, head1_w, head1_h}; + Halide::Runtime::Buffer head1_bias{head1_channels}; + + Halide::Runtime::Buffer head2_filter{head2_channels, head2_w}; + Halide::Runtime::Buffer head2_bias{head2_channels}; + + Halide::Runtime::Buffer conv1_filter{conv1_channels, head1_channels + head2_channels}; + Halide::Runtime::Buffer conv1_bias{conv1_channels}; + + template + void for_each_buffer(F f) { + f(head1_filter); + f(head1_bias); + f(head2_filter); + f(head2_bias); + f(conv1_filter); + f(conv1_bias); + } + + void randomize(uint32_t seed); + + bool load(std::istream &i); + bool save(std::ostream &o) const; + + bool load_from_file(const std::string &filename); + bool save_to_file(const std::string &filename) const; + + // Load/save from the 'classic' form of six raw data files + bool load_from_dir(const std::string &dir); + bool save_to_dir(const std::string &dir) const; +}; + +} // namespace Internal +} // namespace Halide + +#endif // _WEIGHTS diff --git a/src/autoschedulers/anderson2021/autotune_loop.sh 
b/src/autoschedulers/anderson2021/autotune_loop.sh new file mode 100644 index 000000000000..a9e5c23fa886 --- /dev/null +++ b/src/autoschedulers/anderson2021/autotune_loop.sh @@ -0,0 +1,575 @@ +#!/bin/bash + +# Build the generator to autotune. This script will be autotuning the +# autoscheduler's cost model training pipeline, which is large enough +# to be interesting. +if [ $# -lt 6 -o $# -gt 7 ]; then + echo "Usage: $0 /path/to/some.generator generatorname halide_target weights_file autoschedule_bin_dir train_only [generator_args_sets]" + exit +fi + +set -eu + +source $(dirname $0)/scripts/utils.sh +find_halide HALIDE_ROOT + +#trap "exit" INT TERM +#trap "kill 0" EXIT + +GENERATOR=${1} +PIPELINE=${2} +HL_TARGET=${3} +START_WEIGHTS_FILE=${4} +AUTOSCHED_BIN=${5} +TRAIN_ONLY=${6} + +LEARNING_RATE=${LEARNING_RATE:-0.001} + +# Read the generator-arg sets into an array. Each set is delimited +# by space; multiple values within each set are are delimited with ; +# e.g. "set1arg1=1;set1arg2=foo set2=bar set3arg1=3.14;set4arg2=42" +if [ $# -ge 7 ]; then + IFS=' ' read -r -a GENERATOR_ARGS_SETS_ARRAY <<< "${7}" +else + declare -a GENERATOR_ARGS_SETS_ARRAY= +fi + +# Ensure the length is at least 1 +if [ ${#GENERATOR_ARGS_SETS_ARRAY[@]} -eq 0 ]; then + GENERATOR_ARGS_SETS_ARRAY=( '' ) +fi + +COMPILATION_TIMEOUT=600s +BENCHMARKING_TIMEOUT=10s + +if [ -z ${CXX+x} ]; then + echo The CXX environment variable must be set. Exiting... 
+ exit +fi + +if [ -z ${HL_TARGET} ]; then + get_host_target ${HALIDE_ROOT} HL_TARGET + HL_TARGET=${HL_TARGET}-cuda-cuda_capability_70 +fi +echo Training target is: ${HL_TARGET} + +if [ -z ${GENERATOR} ]; then +GENERATOR=./bin/demo.generator +fi + +if [ -z ${PIPELINE} ]; then +PIPELINE=demo +fi + +SEARCH_SPACE_OPTIONS=${SEARCH_SPACE_OPTIONS:-"1111"} + +SAMPLES=${SAMPLES_DIR} +mkdir -p ${SAMPLES} + +WEIGHTS=${SAMPLES}/updated.weights +if [[ -f ${WEIGHTS} ]]; then + echo Using existing weights "${WEIGHTS}" +else + # Only copy over the weights if we don't have any already, + # so that restarted jobs can continue from where they left off + cp ${START_WEIGHTS_FILE} ${WEIGHTS} + echo Copying starting weights from ${START_WEIGHTS_FILE} to ${WEIGHTS} +fi + +# We could add this unconditionally, but it's easier to wade thru +# results if we only add if needed +#for F in disable_llvm_loop_opt; do + #if [[ ! ${HL_TARGET} =~ .*${F}.* ]]; then + #HL_TARGET="${HL_TARGET}-${F}" + #fi +#done + +if [ $(uname -s) = "Darwin" ]; then + LOCAL_CORES=`sysctl -n hw.ncpu` +else + LOCAL_CORES=`nproc` +fi +LOCAL_CORES=80 +echo Local number of cores detected as ${LOCAL_CORES} + +# A batch of this many samples is built in parallel, and then +# benchmarked serially. 
+BATCH_SIZE=80 +NUM_CORES=80 +EPOCHS=200 +NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + +RANDOMIZE_TILINGS="${RANDOMIZE_TILINGS:-1}" +USE_FREEZE="${USE_FREEZE:-1}" + +echo "Randomize tilings = ${RANDOMIZE_TILINGS}" +echo "Use freeze = ${USE_FREEZE}" +echo "# GPUs = ${NUM_GPUS}" + +USE_BENCHMARK_QUEUE="${USE_BENCHMARK_QUEUE:-1}" +BENCHMARK_QUEUE_DIR=${SAMPLES}/benchmark_queue + +RETRAIN_AFTER_EACH_BATCH=${RETRAIN_AFTER_EACH_BATCH:-1} +COMPILE_ONLY=${COMPILE_ONLY:-0} + +if [[ $COMPILE_ONLY == 1 ]]; then + echo "Compile only: ON" + RETRAIN_AFTER_EACH_BATCH=0 + USE_BENCHMARK_QUEUE=0 +else + echo "Compile only: OFF" +fi + +ENABLE_BEAM_SEARCH=${ENABLE_BEAM_SEARCH:-1} +if [[ ${ENABLE_BEAM_SEARCH} == 1 ]]; then + echo "Beam search: ON" +else + echo "Beam search: OFF" +fi + +# Latest git hash +GIT_HASH=$(git rev-parse --verify HEAD) + +if [[ $TRAIN_ONLY != 1 ]]; then + get_timeout_cmd TIMEOUT_CMD +else + echo "Train only mode: ON" + EPOCHS=10000 +fi + +record_command() { + BATCH=${1} + SAMPLE_ID=${2} + CMD=${3} + TXT=${4} + FAILED=${5} + BATCH_DIR=${SAMPLES}/${BATCH} + + echo $CMD > ${BATCH_DIR}/${SAMPLE_ID}/${TXT}.txt + + if [[ ${FAILED} == 1 && -f ${BATCH_DIR}/${SAMPLE_ID}/sample.sample ]]; then + # Delete the .sample file so it doesn't get included in re-training + rm -f ${BATCH_DIR}/${SAMPLE_ID}/sample.sample + fi +} + +# Build a single featurization of the pipeline with a random schedule +make_featurization() { + D=${1} + RANDOM_DROPOUT_SEED=${2} + FNAME=${3} + EXTRA_GENERATOR_ARGS=${4} + BATCH=${5} + SAMPLE_ID=${6} + USED_WEIGHTS=${7} + + mkdir -p ${D} + rm -f "${D}/${FNAME}.featurization" + rm -f "${D}/${FNAME}.sample" + + if [[ $D == */0 && ${ENABLE_BEAM_SEARCH} == 1 ]]; then + # Sample 0 in each batch is best effort beam search, with no randomness + dropout=100 + beam=32 + else + # The other samples are random probes biased by the cost model + dropout=1 # 1% chance of operating entirely greedily + beam=1 + fi + + local -r 
shared_memory_limit=48 + local -r shared_memory_sm_limit=96 + + GPU=$((RANDOM % NUM_GPUS)) + CMD="CUDA_VISIBLE_DEVICES=${GPU} \ + HL_SEARCH_SPACE_OPTIONS=${SEARCH_SPACE_OPTIONS} + HL_SEED=${RANDOM_DROPOUT_SEED} \ + HL_WEIGHTS_DIR=${WEIGHTS} \ + HL_RANDOMIZE_TILINGS=${RANDOMIZE_TILINGS} \ + HL_FREEZE_INLINE_COMPUTE_ROOT=${USE_FREEZE} \ + HL_RANDOM_DROPOUT=${dropout} \ + HL_BEAM_SIZE=${beam} \ + HL_SHARED_MEMORY_LIMIT=${shared_memory_limit} \ + HL_SHARED_MEMORY_SM_LIMIT=${shared_memory_sm_limit} \ + HL_MACHINE_PARAMS=${HL_MACHINE_PARAMS} \ + HL_DEBUG_AUTOSCHEDULE=1 \ + HL_DEBUG_CODEGEN=1 \ + /bin/time -f 'Compile time (s): %e' ${TIMEOUT_CMD} -k ${COMPILATION_TIMEOUT} ${COMPILATION_TIMEOUT} \ + ${GENERATOR} \ + -g ${PIPELINE} \ + -f ${FNAME} \ + -o ${D} \ + -e stmt,assembly,static_library,c_header,registration,schedule,featurization \ + target=${HL_TARGET} \ + auto_schedule=true \ + ${EXTRA_GENERATOR_ARGS} \ + -p ${AUTOSCHED_BIN}/libautoschedule_anderson2021.so \ + -s Anderson2021 \ + 2> ${D}/compile_err.txt > ${D}/compile_log.txt" + + FAILED=0 + eval $CMD || FAILED=1 + + echo "git rev-parse --verify HEAD = ${GIT_HASH}" >> ${D}/compile_err.txt + + record_command $BATCH $SAMPLE_ID "${CMD/$WEIGHTS/$USED_WEIGHTS}" "autoschedule_command" $FAILED + if [[ $FAILED == 1 ]]; then + echo "Autoschedule failed or timed out for ${D}" | tee -a ${D}/compile_err.txt + if [[ $USE_BENCHMARK_QUEUE == 1 ]]; then + touch "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-failed" + fi + return + fi + + LIBPNG_CFLAGS=$(libpng-config --cflags) + LIBPNG_LIBS=$(libpng-config --ldflags) + CMD="${CXX} \ + -std=c++11 \ + -O3 + -I ../../include \ + ${LIBPNG_CFLAGS} \ + ${AUTOSCHED_BIN}/host-cuda/RunGenMain.o \ + ${D}/*.registration.cpp \ + ${D}/*.a \ + -o ${D}/bench \ + -ljpeg ${LIBPNG_LIBS} -ldl -lpthread" + + eval $CMD + FAILED=0 + if [[ $? 
!= 0 ]]; then + echo "Compile failed ${D}" | tee -a ${D}/compile_err.txt + FAILED=1 + if [[ $USE_BENCHMARK_QUEUE == 1 ]]; then + touch "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-failed" + fi + else + if [[ $USE_BENCHMARK_QUEUE == 1 ]]; then + touch "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}" + fi + fi + + rm ${D}/${FNAME}.a + rm ${D}/${FNAME}.s + rm ${D}/${FNAME}.h + rm ${D}/${FNAME}.registration.cpp + rm ${D}/compile_log.txt +} + +IMAGES_DIR="${HALIDE_ROOT}/apps/images" + +# Benchmark one of the random samples +benchmark_sample() { + D=${1} + BATCH=${3} + SAMPLE_ID=${4} + GPU_INDEX=${8} + + if [[ ! -f ${D}/bench ]]; then + if [[ $USE_BENCHMARK_QUEUE == 1 ]]; then + mv "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-benchmarking-gpu_${GPU_INDEX}" "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-completed" + fi + return + fi + + CMD="CUDA_VISIBLE_DEVICES=${GPU_INDEX} HL_NUM_THREADS=${NUM_CORES} \ + ${TIMEOUT_CMD} -k ${BENCHMARKING_TIMEOUT} ${BENCHMARKING_TIMEOUT} \ + ${D}/bench" + + get_bench_args ${IMAGES_DIR} ${PIPELINE} ${D} BENCH_ARGS + CMD="${CMD} \ + ${BENCH_ARGS} \ + --benchmarks=all" + + CMD="${CMD} 2> ${D}/bench_err.txt" + + eval $CMD | tee ${D}/bench.txt + + FAILED=0 + if [[ ! 
-s ${D}/bench.txt ]]; then + echo "Benchmarking failed or timed out for ${D}" + FAILED=1 + fi + + record_command $BATCH $SAMPLE_ID "$CMD" "benchmark_command" $FAILED + + if [[ ${FAILED} == 1 ]]; then + if [[ $USE_BENCHMARK_QUEUE == 1 ]]; then + mv "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-benchmarking-gpu_${GPU_INDEX}" "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-completed" + fi + return + fi + + # Add the runtime, pipeline id, and schedule id to the feature file + R=$(cut -d' ' -f8 < ${D}/bench.txt) + P=$5 + S=$2 + FNAME=$6 + + ${AUTOSCHED_BIN}/featurization_to_sample ${D}/${FNAME}.featurization $R $P $S ${D}/${FNAME}.sample || echo "featurization_to_sample failed for ${D} (probably because benchmarking failed)" + + rm ${D}/${FNAME}.featurization + rm ${D}/bench + rm ${D}/${FNAME}.stmt + + if [[ $USE_BENCHMARK_QUEUE == 1 ]]; then + mv "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-benchmarking-gpu_${GPU_INDEX}" "${BENCHMARK_QUEUE_DIR}/${BATCH}-${SAMPLE_ID}-completed" + fi +} + +NUM_BATCHES=${NUM_BATCHES:-1} +TOTAL_NUM_SAMPLES=$((NUM_BATCHES*BATCH_SIZE*${#GENERATOR_ARGS_SETS_ARRAY[@]})) + +echo "Num batches: ${NUM_BATCHES}" +echo "Total number of samples to be generated: ${TOTAL_NUM_SAMPLES}" + +if [[ ${RETRAIN_AFTER_EACH_BATCH} == 1 ]]; then + NUM_SAMPLES_PER_QUEUE=$((BATCH_SIZE*${#GENERATOR_ARGS_SETS_ARRAY[@]})) +else + NUM_SAMPLES_PER_QUEUE=$((NUM_BATCHES*BATCH_SIZE*${#GENERATOR_ARGS_SETS_ARRAY[@]})) +fi + +MAX_BENCHMARK_TIME=$((NUM_SAMPLES_PER_QUEUE*660)) + +echo "Number of samples per queue: ${NUM_SAMPLES_PER_QUEUE}" +echo "Max. benchmark time: ${MAX_BENCHMARK_TIME}" + +echo "Retrain after each batch: ${RETRAIN_AFTER_EACH_BATCH}" + +benchmark_loop() { + mkdir -p ${BENCHMARK_QUEUE_DIR} + + START_TIME="$SECONDS" + MAX_TIME=${MAX_BENCHMARK_TIME} + sleep 1 + + echo "Starting benchmark loop for samples in ${SAMPLES}/*" + echo "Max. 
benchmark loop time = ${MAX_TIME} seconds" + + local num_completed=0 + while [[ 1 ]]; do + unset waitlist + + for FILE in $(ls ${BENCHMARK_QUEUE_DIR}); do + if [[ $FILE == *"failed" ]]; then + # The sample failed to compile + num_completed=$((num_completed+1)) + rm "${BENCHMARK_QUEUE_DIR}/${FILE}" + continue + fi + + SAMPLE_ID=$(echo "${FILE}" | cut -d- -f 2) + BATCH=$(echo "${FILE}" | cut -d- -f 1) + SAMPLE_DIR="${SAMPLES}/${BATCH}/${SAMPLE_ID}" + + # We sometimes encounter spurious permission denied errors. Usually, + # retrying will resolve them so remove from this file the + # '-completed' tag and let it be benchmarked again + if [[ -f "${SAMPLE_DIR}/bench_err.txt" ]]; then + if grep -q "Permission denied" "${SAMPLE_DIR}/bench_err.txt"; then + FILE=${FILE%-completed} + fi + fi + + if [[ -f "${SAMPLE_DIR}/bench.txt" ]] && [[ $FILE == *"-completed" ]]; then + # Benchmarking has been completed + num_completed=$((num_completed+1)) + rm "${BENCHMARK_QUEUE_DIR}/${FILE}" + continue + fi + + if [[ $FILE == *"benchmarking"* ]]; then + # Sample is still benchmarking + continue + fi + + BATCH_ID=$(echo "${BATCH}" | cut -d_ -f 2) + EXTRA_ARGS_IDX=$(echo "${BATCH}" | cut -d_ -f 3) + DIR=${SAMPLES}/${BATCH} + + while [[ 1 ]]; do + if find_unused_gpu ${BENCHMARK_QUEUE_DIR} ${NUM_GPUS} gpu_id; then + S=$(printf "%04d%04d" $BATCH_ID $SAMPLE_ID) + FNAME=$(printf "%s_batch_%04d_sample_%04d" ${PIPELINE} $BATCH_ID $SAMPLE_ID) + # Mark this file with gpu_${gpu_id} so we know that GPU is + # occupied + mv "${BENCHMARK_QUEUE_DIR}/${FILE}" "${BENCHMARK_QUEUE_DIR}/${FILE}-benchmarking-gpu_${gpu_id}" + benchmark_sample "${DIR}/${SAMPLE_ID}" $S $BATCH $SAMPLE_ID $EXTRA_ARGS_IDX $FNAME $BATCH_ID $gpu_id & + waitlist+=("$!") + break + else + # All GPUs are in use + sleep 0.1 + fi + done + done + + if [[ num_completed -eq NUM_SAMPLES_PER_QUEUE ]]; then + wait "${waitlist[@]}" + echo "Benchmarking complete." 
+ break + fi + + ELAPSED_TIME=$(("SECONDS"-START_TIME)) + if [[ ELAPSED_TIME -ge MAX_TIME ]]; then + echo "Benchmark queue has been active for more than ${MAX_TIME} seconds. Exiting." + for pid in ${waitlist[@]}; do + kill $pid + done + break + fi + done + + TOTAL_BENCHMARK_TIME=$(("SECONDS"-START_TIME)) + echo "Benchmark time for batch: ${TOTAL_BENCHMARK_TIME}" + rm -rf ${BENCHMARK_QUEUE_DIR} +} + +MAX_AUTOSCHEDULE_JOBS=${LOCAL_CORES} + +BENCHMARK_QUEUE_ENABLED=0 + +if [[ $USE_BENCHMARK_QUEUE == 1 ]] && [[ $TRAIN_ONLY != 1 ]]; then + echo "Benchmark queue = ON" + # This includes 1 job for the benchmark loop + MAX_AUTOSCHEDULE_JOBS=$((LOCAL_CORES-NUM_GPUS)) + BENCHMARK_QUEUE_ENABLED=1 +else + echo "Benchmark queue = OFF" +fi + +echo "Max. autoschedule jobs = ${MAX_AUTOSCHEDULE_JOBS}" + +SECONDS=0 + +if [[ $TRAIN_ONLY != 1 ]]; then + if [[ $BENCHMARK_QUEUE_ENABLED == 1 && $RETRAIN_AFTER_EACH_BATCH == 0 ]]; then + echo "Starting benchmark queue" + benchmark_loop & + benchmark_loop_pid=("$!") + echo "Starting PID: ${benchmark_loop_pid}" + fi + + for ((BATCH_IDX=0;BATCH_IDX<${NUM_BATCHES};BATCH_IDX++)); do + if [[ $BENCHMARK_QUEUE_ENABLED == 1 && $RETRAIN_AFTER_EACH_BATCH == 1 ]]; then + echo "Starting benchmark queue" + benchmark_loop & + benchmark_loop_pid=("$!") + echo "Starting PID: ${benchmark_loop_pid}" + fi + + while [[ 1 ]]; do + BATCH_ID=$(od -vAn -N3 -tu4 < /dev/urandom | awk '{print $1}') + + if [ ! 
-d "${SAMPLES}/batch_${BATCH_ID}_0" ]; then + break + fi + done + + echo "Starting compiling of new batch with id: ${BATCH_ID}" + + for ((EXTRA_ARGS_IDX=0;EXTRA_ARGS_IDX<${#GENERATOR_ARGS_SETS_ARRAY[@]};EXTRA_ARGS_IDX++)); do + # Compile a batch of samples using the generator in parallel + BATCH=batch_${BATCH_ID}_${EXTRA_ARGS_IDX}_${RANDOMIZE_TILINGS}_${USE_FREEZE} + DIR=${SAMPLES}/${BATCH} + + # Copy the weights being used into the batch folder so that we can repro failures + mkdir -p ${DIR}/ + cp ${WEIGHTS} ${DIR}/used.weights + + EXTRA_GENERATOR_ARGS=${GENERATOR_ARGS_SETS_ARRAY[EXTRA_ARGS_IDX]/;/ } + + if [ ! -z "${EXTRA_GENERATOR_ARGS}" ]; then + echo "Adding extra generator args (${EXTRA_GENERATOR_ARGS}) for batch_${BATCH_ID}" + fi + + echo ${EXTRA_GENERATOR_ARGS} > ${DIR}/extra_generator_args.txt + + # Do parallel compilation in batches, so that machines with fewer than BATCH_SIZE cores + # don't get swamped and timeout unnecessarily + unset waitlist; + first=$(printf "%04d%04d" $BATCH_ID 0) + last=$(printf "%04d%04d" $BATCH_ID $(($BATCH_SIZE-1))) + echo Compiling ${BATCH_SIZE} samples from ${first} to ${last} + CUR_SECONDS="$SECONDS" + for ((SAMPLE_ID=0;SAMPLE_ID<${BATCH_SIZE};SAMPLE_ID++)); do + while [[ 1 ]]; do + RUNNING=$(jobs -r | wc -l) + if [[ RUNNING -ge MAX_AUTOSCHEDULE_JOBS ]]; then + sleep 1 + else + break + fi + done + + RANDOM_DROPOUT_SEED=$(printf "%04d%04d" $BATCH_ID $SAMPLE_ID) + FNAME=$(printf "%s_batch_%04d_sample_%04d" ${PIPELINE} $BATCH_ID $SAMPLE_ID) + make_featurization "${DIR}/${SAMPLE_ID}" $RANDOM_DROPOUT_SEED $FNAME "$EXTRA_GENERATOR_ARGS" $BATCH $SAMPLE_ID ${DIR}/used.weights & + waitlist+=("$!") + done + + # benchmark them serially using rungen + if [[ $USE_BENCHMARK_QUEUE == 0 && ${COMPILE_ONLY} == 0 ]]; then + wait "${waitlist[@]}" + COMPILE_TIME=$((SECONDS-CUR_SECONDS)) + echo "Compile time for batch: ${COMPILE_TIME}" + + CUR_SECONDS="$SECONDS" + for ((SAMPLE_ID=0;SAMPLE_ID<${BATCH_SIZE};SAMPLE_ID=SAMPLE_ID+NUM_GPUS)); do + for 
((INDEX=0;INDEXze-Pnq7PE^{o53@7FVS%})0&B9Z8StK0wo)D}L? zgw!A0GEFvw^YIR9c=Hb=<2LX#`{T@F z31hCUcBGriNV2I2BWi1@{;vvhjWR{u@j8mMEn&G!a-hiHOz8)|)6bl4}ZDd!$7(vv~<@hsKHd?N2IYvFoj7s}Q( z!>==ua@Qd;EQmpn=_ztFh-4AH)*ju>t*JxX+;}?n z=rbgvSCDahwAgdP9!hIJPK&!AA~g}qn_f=ZBS%x_pMOC+^Cq#B4%EMh#5A)cNQO3K zcdIIf-SS>Adp-xNtDnQPs)=W;-hu;JuaH({1+NLY6n-LQC&amfc14yJsV0ku$}P(g@4`n{Qes1=WBuR=k!S=V5_sw*{4P*Q2^ zP8dHNqEyd)$EpHf@y)MVQQZ0p+K~-tc&#Gki5wWH7eQ)XBwi`okH&wSNPllR*E6dp zi;+9nRNX*0e2pM!Y@+hPF+lX?noVk_rO>bY#q7SX;24{Rw4>$ZH+(E*q^u!_!jaI= z{Ec$W)Lil3Ze%Vfhlg-Cy*8`rBffSXKNdr}Xkb~|xh)2djWR^*n z_I`oX|F&43En)KyJ;v;eO>laCgH+pAl6vb!sx6sA!9AW+oX0)7c)Fe{df#DHzPsUk zG?mS6RMEoXNEqmvQsjY~EbZBJs`q|IzJJE>G{+&VY||60wBHVkBgx8)N0&%G;u1!@ zxk2WyCsXvYEbiX@9(k_%kE$Y%@Rn5*$bQy&8vTc`Ka^`}gCY()ejY@!_!wp7JfZ8} zHz}%VAHsJ%LN{U0)Ym2n_dE&GS<{&G?lwlgVKlJsJ{qLVrIvC3ky5jjwoiHtMbbX5 z8dneNjKL(5B(m`2Ib>XTMww^&2ES4Qk>lG2xpDxdtuRJ|<4R;svZHXVJg!&q8$2|o zfnB8hIiW-TIn$X>aVYVH-ba>_ttHcTRMQyv+ajl%U|*_8QlHm|R{%?z!^ zQ|GbS7=5&q+_v(PN)ojPhAbC1%D zzbPG+`)QNQSQxT{-1+YVe9qdduy0(5O--C+qC+;a(f_fgd_$Nfjf zr9#@fuLvgBW$^T?W)TJPbinHdNfnRTi1jiOS;c0Jj(?1_?p35Uy?`_hkBJ92A>nT) z?A@~viFbmT^w%YE)#}&q40y&w@>P>9s#Za=T1Q+r&=uw<4AJg!i^c`tp`5X^A#eW3 zBZj6>+AC+uwfanhYTn~$O&w1wn$LNpFKeHF0@aV+!*Y-gGbxxSJUNyv?wtp_TeGht z|4Gqj^5t4eZW^3FCB^zA5W0sk*+-6UytJddPz-c4LK_}a_5xonK3zanNm z`3DmjU!dZQ0xHneMd!GB63^17;$_xsaG!5T=uuA@qvv5^bs1Xk-Diy*4%F4B84Hfj zfW!l0ObaK;W<%E7NI28^LmSbuD@uz+W06?q$8Xj=L4N)=F8d?LrrfiQqP4zK48LA)FT?P(jV0H_A;jE2iALjLauy&^#N4{K^KFeCGt0ZM$e=cPt%g zdRg2frh?RwFU8t3Cz7L2I~ATej3B)auymSEGwOx=V>JuEJR_knI>j#ZxrDWABe8f! 
zC`qtV+;znZSUmm(m8Ui=J#LcfQy=bbG=U`fe~G&dQ=)NDAgS|yD#O=GQ2k9#>IG%2 zCG01eIdyPXt`F^5wXFKjI27D0fbJM0(tSOP9Is|8)f4}ufM*tTWx)q@vWmi=ANHd( zq+K!U+nNPjvqX_z-NZMcb#sz%i9`GKFm-XmaKFOsJ8AkW6> zG{~ch0yYIwPXBCl4*5X(eID@z5oe)sn#cBh4Z(=3);RF{Wf;b#aI^GQW;tjt%D?0z zd7BtD%{er3QY_ZLyh*0AY^u7wmRu){BvJJ)o5sGW*c{RbcdIQt|Na&f*qGA6i_cNA z=_E{QvZ$%Hio*X;3H4oprkz!IoqhqU=A4CG5zFL1P1%kY>11^{j57Omr%Z7cQVTu` z`s}Xw=oKG2@>Kz=hZU4#dWU3xt;rsr*oJ98T}c|@&cnZZQ`QYzXr)Xi$DA+}j9d<% z_UqIby@o92SwmAg0#ThlQB86=mF_;zO;tlL!g@3#C#pT z5#}|MtY-{siM_y&E;1#H^Hxk<@QI`wQ`qc>^KoFh1@x_#lW)Q!ZnC-+Zx$Xz?ybvc z;)8KCql|nDyFlVUUa6Y002ME%QKRz?Dh*hKz^XFTE_+Od&)nh3rjoy#1Q&d=;rw+3 z5;K}0oq9&Bt!RPst4_S++){WC>Q0i%=TJ1=XQQLaD6()7_nDGOiQ!IUm-mF(8C01AdAHw4+}lU4D}Svq3~j=|OP3znba0wXscI`a}QR5Z0z!0qK|^ zadOmT(%*KQwMU&L&Ei&a*m9H(1XiGA!gVxNRL~i^hQeX{QUB1E>U+%P+4J5a(Y%>S z2hCPatG&oYY$UVN&xJ`=GktSUMr)@4))9COYC%K!xfD|3m|>)UHi9WDFO$f73pYDg z0@>z7uKIk9#0QJ{==c^gw$CNw6Mo8qt>3V6OfhUExyXDGPX4XmpfzSWyvJKZ7G@we zbaCg-^@2V#JWYPnekX<6h(+JkBJC-*70x$0A&+`9>f_TtW zs>-k=_o!@IU$O|!V{P~YFFSjW zk33E+gXCue_YJrM$?f}Sx9mjHz9%Rr^)uvGKXdJnYLZ$VM}hr9n(?70G$U%^tkX#3 z?vJ6(k(Ald1lK1Yu^_n;eXg9Kps^X`-#7;P4$Uz5V=r|Li-6?vU8Q8vL@vh zFS&JL1Z!~gM4}^+G+0cI_asdF?N%!IW=pc&0o<#<8Hq;N#Fe-RYx&3jXq67 zr-o3{wY%_|V~5(b3Pc-xrHiL3VLg5$Rhi#}=diKVVxpj(Js-f@wU#Li9y8bWRVEnmQKy8i4{jyH%`4#b;R8GcE$$Tj4`}&q>fXyAEmMy}w*NBx z{XS8kO$QSFEhy6XC~_WLpl)xT!(-P0s*O5A*1L~V+OLP?+wB`SnRB1g_*I^EHHI6z zEn``-WXwMI6&^+LJm0vIL_wcm^wJjP59UBIGgPeg>pb-v7egZsC{ZY{0VaJRebr&o zXkS6W+^HygE@*4pMEY^C2(oMMxXL_-aQ^GX$kA#u~gr59|q#-SlnKQ;o%zO z#`U5ROikjNA(X%J2A3GDLhHO_y0=2;)8yq4&5`of@L^=%JOWNz6>#?aMbJ3>M+ z$HK8@|8qJrRk+XnmIzvMknpTyS<3}&sMM8Q`=W;Ow;acB_Rlcla|gvuIFEu(uV}~e zZFo^|2MQYzTVhlo)bcteUMPm+6En7VRsxF0o`(I_L|&xx9FcD;n1}Nj>eD$1-lZ#O zW$hYV*w+rpaXD$ttV6)P6+A8H2t!98l#&0CXiz1k{-{RPmxbiMcE7N{7SV6_LQpt- zGhS{jL&EKgWPJMqQ-5w$2I!6B_Dgd~<{C&+%V$iZbq2AU9Z)jijp|Kvxdo1FkbE3&DT9=YL|A7tXm9W04$5pLzXRboAO($ZXu1-?JKc9jM_( z!>cIFzXV-6T|$N1O3EmBPbG?EYVf=XN$pbc&+iNa_s5cv<3`9wO=CW^33f6-{v}nZ z8D6f8zWIVO9fY%B*d>-4ZUwWr4i!^d! 
zK_{^Mav0@X8Dp)^U083gW3{HGLY)wFK?0~-r+UaYm_S->M6UTK@Z!luq<>AP$S1bA zCo4pIvL!ip`HlN%r%~?mP?C?_OU6TTNs}AG^Zop}dUq+`RCX7$hY0=1t2q=P@gUFX zQLN1aOBP?o~`M~D;&{uUrecd>ctPqPOTOFCJ>k2Yo z>q2WOkehfpeW-PyTfa<%Q zp}kqR5ZPrfLG_Zxb-cy6Xh-cLR6KCxaKDX~eX-p0VG_4* zdqh3sYvH}c1DVggsp;5RQcd2^%(HJm)_bnFU44pBSP0SDFIf!^x1n@=g|@Fh=*YCU z&=;pF$NhPjT5sEu^MEj}IXja?{cf_Bz#m-Q|AnAsbdmEW7KP7`!~A(XqW8T;`=&~s z6dw&!ZzZ|i&*G7tS3%XOm2Bhu$^7kMu4(b&CYc@NmDGc3Bd(F$^*FO@SVenJ_(HL2 zH&b=oBKL0Vx!yEa6ik^zCYG0B(MDuADvBHbo~E=M4f1}u2-jZ*LGgDeE54GzYKsQY zS?96WH?M{joJJk`ntaKyLRXaKGNc`Uax9p@ycQ{{TW9Q!z(EIwgWPZ z9}7OwNuIZV5bR2_XwcGDa-LZWf49-l9{{RdrKF30rBcByb8_jGMD`>9q2Lv@=zY2s zimtPm;iFP``L%HM`7h$1KeEVYMK08@6S?N!trSo^n>waEg%Z~(H|i(~dYe+NyD$0m zDj+|T^CbITqO?0dl&aS5BggT6>{OQyXxBHA^%@s)?Dkgq>1`rO*BGOA$R;wdy+xWg z+n8IL8QDfXrH?fUP}J%0XmtpA{-;8t*;(xEa+lI$S3|LXE9(2rhwIc`xN`7eS6}Zmlus9XM(ylzEhLg>Z{Zp82nX;EWO;^$1`}V`r?Fnjag!8s@3AdPV zi;2Vy*@nyN$g*2^(vrlJ=KXutwlj((t`k^V+ZMKE&l%WN^`+!V`$*O>Sm@3AvS+4p zj9mKy+yCr9^mi-xZ#6(>L>{Sbu49&7PhfY>2>jzY*fIx1_rFDIuNm)>+1{uuXjCRCatIFIYgpSM&)Q(eb0ilUIzEgHo} zQ6zgM%QiM%%C0L)5vSikiBr$ATzh|v{;?f(swEI@IAqh7QVCgBiH(baQs{5)3FrTG zo5KfNA>N&W5QnEIF#iHo`e|k4LZGyZh=!%LW5yRllpic2QRuHMYu%5Sd?c0n^=hTJ zr}qmviwx$eGs)>lAneQIQJh-Fip8Vp#jZHC6y2fRqyP#&DuH@l1f{)ON#3P5S+`BE zpfb6`e4A%d%c;TSvT7%~g*72Dx;wRx8&A2#t5MN+0qJE{Q2jM?lAV@_jlNtWS%_1% z-V~-4naNsEmKBqLE91+NG!z6F!K?-^S2XI2)RqPAEoltXEtQO`rKJ6t;{4l+D59V;(_X+I9vv`5HHtYuA&w`h*d zH;8KgCbbr${X!*^1wD)>4+)6)V{hCy&2)hDV;8iMoaF9CN zw~~5I7;qRPEw~@|0P-466GjeM~sIhp{~qKERp@ zo@QSY<}`Rc6^q@VxY7cJ_ZarQp%u;_IYf@LS;^-*l3LH=twpa{&9+O>bdFSZpPG)% zg>tC6w!>_XAF~?ahiS=5?mo4Qte4LsjlzUk{r41UMi<%Dy%Dgq3!sS3=c!`LNHmVO zCi{UeFnO>g{Dkb)G}o1mmi0lzLU*ViZc}C&F66d>W>EMZg?V}~Nu8cx;NT4U`T8j} z>sBIpW(ZXn__2ik#bg^AjO`iE5PkayYO+kw60sc*78N5+J(lz{E0j_;l=}D)N;A{3 zso@h1-4>1=54%Ftu`IjdMKX^Z7{bzfJ|jhd5&=VwlhxiX*s{J9pQg8v=-YVZ2>W9Q zHQfg5)4tRYG7*`T{*=*N2m5YEDL=!L?0c$Ve!GYdx~mK6yh$um^M;(n&Xjr1h0L1k znKY_`dvtsw(Q0d2{%8{x=qpM8S|t1;gSpqFcNjYDsPNwW$LDRjisFA?qGen=>FzBh 
zgA1NC`d=^P)SRU?M(GGBe#`2IxT3)}2pM%kFTOpBoLBx$I{qQ#_Wd;$KfMXtF5x8B zzd^J(hukMSP(R^3iJ!CxW;>@svpbyJ-nv3mG?eYW^#z;kWTdyMjq*=FW!(%TFwJ^5 zX=mAzwBfC?;kj@w{!z&C$9`gVhKphM&Xy0lt|Iql6+7BlL&48`3w>25d{Tac_nA02 z-VyfY-CCyk{U|f}d==XIGPWk`CcuIReT=ShLof8nbjy>w~RjB_ebcv7TSH~D^C3P85!wuXxi~Rw5r2tTEHKW{62xd zJoFa3R~&;^prCacUXk4HDP5dcj;c{=Xyt@J>T!?#O!+3%;c1k%`WyRt@iY0hUZWm9LT)Z`XZEr@nB86nYx@thURMiR zJx_CmzYhYIDagU18lnZRD4D8*Zi_2ua;gHxUKY$Dc{9p`<*4g)jY8EiWd8I|xX(~C z`N5w`yOA#tnyn4Zq!vpJM2CT}Ls1zK!!>|z9+ zT`XL06It|jA%nKVd}c*54tTcEnyBj}lMGW1?f(rAm^Y-gWnwEK4;^M$hNga((QjlV zjc}fd8FM?xe8E)mj<2L?dfwE@RnP%L`?B@7ipb7F!-_k9WepP+K=p0CQW_Q@&d<8Y zbiWOzip@@_x;Kh4WvwJJ{X=a1VGQ#Ax&p_MR#2F|XVKU9BJ!S`p9-(Q#eW9l>&_a| zxqTH?9iJ$2=~apvv>uBMUy`h`UEEli%d0#L=tAUnhft$>*`%0lTS1CI50+k;h z41JyMq!)0BJjF`ZNnl|9$HUBa^n&mA(X?6G0QcU%NZmXDwGN>)(5i&Y_OwuE;dcy$ zHA`4Se+|RBx@t(Bn#gV9W_Sg4rA^81$Qu5HoKJ4$owJ*%;_XpxSF(}7E`3Yhmcn^DD_pEOdyeV*gi>&qN07T3GR;Ab+6qe) zK8z%lrwNzkZX@f+-$<)&5G~d{MnS(6)NT7>Bm{D5Uh*FK`r~-zLMJ8%{CT`qtj@MVLp(;IR68VqH4tR5k z)EjS$b9IBcG`ETVTM!59ux+S%^N4cNXQ1d;C`>mlB=e*MF5A40wvCH~(ZDB=O!-T! z>7qj}TGQbk+{m1c?W3imZb5&>8D{?CD|}ba#q#Plav2c-z1tzA`D-4le=1~GRTJTP zdj`4uwG+z+=TOO>K#XfHpx}*j;PK}JYHL16livt??f$ZCPhzihIk(a|LiRo6eA>2`SibN*jb3>l675Ub(sBQA zx4*w}yN)j;*Y3wHSEo?VKKZblQ%<#i_N1OB;fM@)D&$~&n8AU0G$!pA^k;Qt!9sma ze|(m5WiQEF_aAOo`VN6N-oo+oBT}(4@o$%YVA`u?baD3>9944e`PWvM4$t9}9S%a( zud6sSbv_Kw%%TX{J96;t1(kgWcNBeNc8vv;=zbQW9UfUT-0zUS?o=w4c+qXcU(jBB zg;^YTW2IwiU|gBNa`tb5Wc_>QFW-XX&sWKCXbAJn&SyEjd%C(jEsaA7W; zo_!C~J#RxMxkt7qh(2<(rPvvKvp~u|@Ed&O8xU@3VAD8D}3%p0!;lt#3NTXRX3u(N(krKY`yL0hDv?5>*L)xJW+3#{Jqaestg? 
zSV$U~_**M4>IR&<_7V^7JVj{&@*`L z9779sy&}7^IHq{{fw{5Of`_T2)YcwPs%J;U8Cj=!KEA$Xg&Tx;f3(2;=A8KkBK)oi2bhNL+#p5#R z8pLvoL7$W>yK{2=PmGc~F{H~*QS#(cN}Ew9uy5bQR=cwh6!efX2N{uOff@4~8cQN= zNtWSfW3Iz1Y46fBXjTk^&)*Md#J7{=Xm10{LlqRezX;azDp5?rb$h)s9QB$Z+u3yc_0mZIGWZd>yW4I zh{^VgkR_@n`R4E3@kKY}{!HN0=Em{7XBV+p9E#W#zo@Ev5cfHtCuEafS?v`O6_1LA z!Lr`;RDA{NcY~lZ{7RY)8L;zPObyncdTpUEU)@PuIy#i3wgD{GGl@?0D1#%{V&Pwo z;qv%=boIN7&VM%1g5@s}Z5u$gbc8bRoE7{=9h*aMvf&mzn-bDKqV{VZZ3`|!MgNy9 zvDXb$Ost`xA4i~iX2K-NixR2f?fZ>R zsDBFF(0fw%KBfG9w!RJ^b-^YSe9t7yu}cId;4`bN zh&mgc7fy5bx>XYsWdk2!g96bWVo=D)ZI*A z-sdy5 rK+n-V9C|z3Eo&(pMC#ad4Ah2XI>{uI%uX$cr|1p?W&O1sDR)YR7IRuxF z$+YIlK}2i~gz=MQti|LNuR1yd#ZR6=_F7HjI=vu+mo6yZmrq&^wR~1-1+uSY!|`kl zC9GdiFYabuI*o)vH-hJxJ&CUdX2SWPde3gzJUhX^`JLJYQ3V+CKz8t1#B4 zG;;xYIyO?&&`*NiQ6SMGjs1S01Aa@iC`s}RzPFNLw{8v^%a3tc&F91WSUem( zbxFT@Gqr#7po;j*i0|}+cUyfO{k8;Bc~4LzU}a#!%XSt64#e z;D?wO+Sna(M*OHVXq)w&a!rQPjsrT_l&K=aDV02VOcJU-d!WrYheoU3!r8)|GShcZ ze|ZPA?tP^CvniAqx{0P$9D-(H2KRByBah1&j=RbAcM31X1CpD|D^vRRnrBAnJW1%y}sqZEPvww^m9>sijqlU|5ZN zMve`>WPP=i^w!JCG2|SJHwi?c^>et+mGi_$XINER!rTa(4cOjn`L+E~gMn1#kUlW;)r zZal2Z$h%VxQw-Io_)E(K9lM?)9U9p7J>Q`CmCS=?h10;><|xSwh4}Iz$c}hs|BUuU z&x`j^upo?F$2&v&A32;h9p-LDUCHV1PpB_z#@8zyq&W12TlIZH=U;q7#a>$|It^i} z2hW)#a;^Bv<}A=30w-CSMC%{FLVC47>7IXvHM#v!>S#h*3kM_pc?Z;=nn-Wz0eVmz zh=7<0Tvl_`#^q5w6*p)xyOC;=)-{VAy*;?W+^5`A{e`zm+n9d8;Uvm@F-Z|DFr(B) zepi>1>--=dA-|7+4*16{p zn6ERXXx-lA?CDJn${{d(SOMRhM`Wm-C)DjPlM8G$(2pO@>VDTk#hYc^Y2OU0j*2GP zAI@YnG#HCtlu%*9PsBq;wp|Y((?>-<0tY!e(jJm!i^VGG3#C(jA5xwhj|&B*sMw#+ z7S~@V;}CDQcxD<@S~XOEW(KuReGQ5EQ|3}#M8>1O@Nw_Lsc@wq%|2L4jgDD}boqyS zlpA1ZQYq=*>&nxTe`hYmW|%NrfyLh|;N$p~22P(Y_$Ap)s&^O4-3KvdVK4%5nh@{fhvK$ncoUumv>~?G@cxto>K6%Dxr6ZVLRrJLgfv2T3@r|t)*u$TbzS673chODYr)(=YonLbj7EN&|d|F3ZLIyTR*f*|AvUvUg zCoCxbOd~E|rfUx#qqIJW;-!1Zdu=HdC-`xnOF57}5O{{M$zqGIoQcycQL?0+Ag`h$ z9)|^%|1$TSb4%&}DTRi*R||Q>dCcw$RR4Pb=I5>pT+=foUiYI`$tfQDSYV_>p0I#p z7g)ctMCy}sn?#Wdvx{e4g6pdHBy&*NH2NHa?!22&o!zJGzx54FDT&*$y%3GQ$coFp 
za{H1;klq~4G791$7az4p;LnRM91vs>W|79zH0SHw8aOqeU4N=ARN+r&ClAu$w%l_#rJ!%w>rx{wee)#f>pXx0`2B|E()e z{v(g| zUh#o~s;0m*u_tP?j+2G=8;WimNt&W(Eb7QWXkB+E(~GsFS6N5()Hvv$%W0*?1v0s{*lxjaQqTM-R?m2dB1h z@DS3j+gSXuSCn;pD%_T*qV(huPKqC-KO>0gohZS(&QB1u`d?@tu7j&z1C&lyC{=}V zAJ?-KoqLS(8pEM-`pi^eL&TQ~RXE!J9kNos3!YpD^55FwR$x7hjvf^_)MDPRhaZ%& zx45)?J+oeWf*OR}Pa61?i4x_^^O!N}_BGQ=7bA=veG2-W_mK6m$LxFcTX;V2#H1;S z)Hp!UrHjYn+Pv$i8e~Dm9miSgqb&A*uOHl&tfkz)N0Cgwdv@!w2i$P&26BBl6`fMv z!THf`-Yw+=rK-^zh6=xD;e}UVkjVG0G~YgA}Nd#sP0i1)WW%GIA|%t-Et{0X({DueWnOu)`6bvMYVG(No%2kubPXBnU*>kxN5ai{0!7tdLRH>o ze4JG!c-l*lVYG`32fSx0pSjATyXtv->tCcEDU51#s-r(^vQXi73Dc_lQTxV+?8m&p zpx;Vp<=rB92>DF?AsJH?-C@$9;gogf2DCP>qB#pWf{v%qh~7VF>pw!Cs>or+DVGIL za6T%#6p{P79A;jaMYY;kql8G(2*Ln|^pqsvmCR8!a3OKSL<(zJ|hwR8YyN z^=QaB1e^8-$oI4<6;Z!v&8f5GZ}W&MI}TFp-;c0-`x6qyG-p|}J>0gZoRa@INR~r| zY{7N4z!g@)F36W0zs0a=y-spZmmOSRzKu2BUkT~1(PTI~f|p;9qnwm{{L%gf<+Bfy zeq~=)H{p^nKN3&jVS+Ce+B~^!mB7f){6USgt!PSS0>nwCbl>DHc05U;<-a$Q=lgwJ z+b9#tjTO+jdWM`Gk8#5)EwtS0g!oxDG{br)JXI>Cljj+(3fB=gPKlmStA}Rz8D8~RLz*uGxX+ibWH@sJh2M>U#nK5ZIO;LRC7uEeET+`rgK$pK=N09r zkubAD@Wd_%%w!bFhtEaxG$GG-4rT_gY3`X7MzHrnT$yObE1y{vnfm3@$hAR~;uk09^NVmh*E(aDp?~-j~Fv8mI z!$o&LSzW$C?s*TH-R(lI|I&vnHclaN+9H~v_(f5r`>|Y}NcM@Q^tQMG79)e0=PD@E z^nlDeSyW;Bg)KjOkMdJq!>l}lSmcM6e2YRJRSniehM!7bA5K(%oI{zC>GZSt)|wV zTQRV&3uQQ3kW&DsmC4O$**S>hn%)SSX$t=yPsu7ZkOJ0k75w5nRGzORiP8_rlHHI^ z9mr%mwO|(_oI5*{DXZ-`wHqJd#pAcI*0#r7W;lQ~EH{VD3EqIHZ$13tDL@760qQlzLtW-W)=!?wp3V z)R&CI{}ou{^JH&xg6C~~PU6lP)O_$0W{k8)i>WuQF{s0eQ?XEP-i_%;p5Skt8*uz- z#PgME=JfkxE`OMYg#EKH%exg@XJ?Y}mLoJHWd=MiU0}t5hnR7Dk+@~ea^&sXjmXjM zO#O2cMP1FN1!ES$?%NWy`Te3155B`-LrEc<0q`|g zN{hYk(!i!9%J)3SJWDjZaPfD<{{0#1n^Tl=B^ODOki%_vexu53M`>k{JyfI4h#SV- zfz)h@()8cYv`Jv`@>kr#n&|tm5Ez4W=SC_&R!jy~0dN_*45|-bvLo$lNwlOlY{L+-wEVMJebA8AZqMOO6AOi$;4*JM^Aq*UW0}!~m-Kpl zEA$1=DM$Ad4S1gg&)IjFhj2a_q-0}a*hAEhnM!62HC!q9@)qIoRC(A2$?9Fgb-^sd ztth4JFwBFm!pce)K3l(&k!U_->xVLVw@!!~dX*;EN8sYGwFDa!yRkEL;d7}$Uq;G+9*((CHM*B_8`vx3KhRkW&R(R;a|a1_+KXXzpPF4 
z^$7PCWBtF4MpR~Ny1rue`gL6Sa+feKu#Hyj z35RF$OD6s89A9jHfK*Fd=;z#IYHGV7a2lS}K2#rhxxdKb_1x$J(%cjBWt~P zmdn=1CMC9vH#I|TOTF;`bK zD?=wSc$-|Hf}ZwpwqK9U;dkKGeHwXXKBmMkAGqVcYngFbB4$jnhw-_EEWlzd9oU~F zWNfebpqFpSDJ%?WZ92ky><|(!IgAncFDbn-7tdc@6MB9PGwf!GrsPU0{$9r%7dk*u zZclSc$^weCBaMW0~PjE`}jo zcNfJl%bou4I=+%rV|K9GwVx^Sba(7MvI&XD&yw->r}Uug7Bwvexz)!J%{~ch-PKtB zY!cFX9a9$W8vyU&_feP<1F->7@qoc>P~07|ia7;GHw)%n>P``}-4XHaAxX0UeYnNa+DOELO%}0ju;|Il9+N-Diyn*d;f&*)e^y{ADO9 zk41pNL%~_Kz%N*E8}kWr$8DOSxyKK)H1ewLHr~ zKx_l~*<^`(ZOkXBt*B|kKGJ^oh)p-XOyPEykmCC@;;j?dlaO|rG)W5UrGVqAGo(>V z5!#*3&>_w&>bT6vs~w8mSWWnmHx$-+0hS}~aCzNG@`z92sm~Gxr{7IEm0id#@u9|> z)#RPnM!Yv=#%rVf9silftYp7UZOW9=_G(R5C442>G z5#L@Q(REn57M?HtTV9*hxmuIYOm<`OdcX^TsCyl ztI&_F(lpCsnVi2uXu)whVpR#x>Jga^lL^*i0IsevlvFmC>;L+hB{dE}lD>}H zEjmwi-j*W2y9tv%c)?{)|Ar>f^F=CNFpvJjq~CUzTfIER4Bw;ryMR z!lM4fh;f57!}Ic_T9LO!Pk)I1LlWgK)8?m= znRu^535U5qMWaGpD{H-N3FWeFh>vu_7e9;r5?Q~o;!&9hxtWX<>ogjZEOJF|c|tSv z7IX4;wzPAA8&4QT7B~NZuKA6|<=hiY_Pa${rFS76E@ke4ZM0>!cuSWDp$yW(@tF7? zvq5;6YFT9b2DF#0;%WI+%xRLAysjh&Uvndx&37=5X?2>UpSB{rXd=}mJSN3OD@a>v z*lpcZBn>-8y1Fzd4W_W3xn-2(FbFFn2TkaS*|6R&=8$Q|l;l~=tY`J}M`hwaXxc(< zi3=!g;})a^2GHmXksm*`36ErzG|<0+qJ8(0YURgVdN4&3=|2KCwI|U3Dv{0psFS=V z3;*PKGHCc?DvdWmMWqa-P4C(06JsT+-cD)M7FO?V%m8I`72X z&lpnR%pkGPVm7LmG*ZbPp%u?^arDqGNBW=Vq3e1?{hJGr_|Zx7UTZ-blj9_R9K}?H zdS(%JpA3c#Ck^vL!SO)I&+XB~+pDNpFuA?aS>k_IgsI&NNIUO5d3FAdaHR_c&h1C_ zqBcZZi+kyVvfW<7qsxTJrt5h~$0m~V zP_Ccu&2oM-rb$P>CHb^IR{z0qp_`8WpZ_WJgj^odbsglCMma8EQ_UXiqMTzB8w5dv8;0Z4B&kUPCqQ2WI^!g*TgeG8cXsOQWK>D!_@^ ze;&$$mT;q@*1F%>CE+2l}u4S|=BGBc4Gyf-?X5>_cO`_+BYrYU*k^v9&H z_o8c=yJ+?54$&J}P|992l6-BWiMd)zHh(t3zx?kAS*XS7&m$lgOn>x*hZv|-q3PX8 zQ5SDwbDrS!Yn<57xFNpKw-+AWaa0`gC57#-p^%~1IB;EfYLkfTixx9U=!8W_q=j%l zR1N#!2F!`vhv3D6T|6Ur)3@H5&Y2I%bV~yiC1W*KgClrCf)co)L$JkM#QUv+w`&$Y zs(nf?KRAZ)s4i;E(}_B$Map0~s(>PPnmmOLF>D1*s>uU#cZ@mqx@>jBuQ^FVQzl7lIb)-y>WS-aDsWPz}4q~?H zi7SFnTG0-=-lZQH26TPXK=Z(L{ z11_C|afF)e#5`JW(gTkbXPLaHiCZcSp?et2i*jr5McQ-Hmq)YgZ~lOhMHzQkn~%n4 
z-{CaHb2E{bj?Os(=ed3q>2#K-jp$;5<99)3QiylLPp~^9W=apj;h8m5rhY&>6N}*; z_8TH@sYo@ihKDIykpJvHw1x@1q+~Ot#aSTG#6aj{Y6{=BAG_UK5&5?nX#4R#^@z;y z$G`Sa@#;E6k30>yTl(4ZgEWFvms<`KRBq{!I#Qn_MaUuW&Be z8jq!(VX*Mnjd!2E6Pd+%tSPV>dD;)iEaNK#i%gm{<1DF%@8ONH_u!=&Mf%zfS{>m^ z(iO*Hndg9TBNbe}GN8ZK?F~Z;$$b<9R7vZQA$!4O3_QMG_q9W zuj0NX=iCsonAt*dHpp#F$1=J9(8ZpSQ*h0{77}wm$IAV`@_Tki$@ZYgJf=i2)s73A zM5hw!8Y+YLl0I@NEQO?cu`DUundyU!aC+^p%%>@d>36Q?3sbAvziS7`_eKWX8#!ha zyk}V_`l0o2LI3p^Y)o`N3NGhU+_Y1)DEA$O_%xGCS079ja%vBI0^R=g%=*I;Z1_b( zilZ`qPuh*hhUbv(ka79A{oEyH7*cxV(1f(W%I=l$cP^p&qx(>_Pu7&GPqOv}71;gw zywDvNlW)mR_^in0`rdiWYzn8gS)EXb46W4Agh@{uB&shLnlr?V}eHCSIpVO@gRiy3^ z?DGLP&d>FcX73nOrq~G|(v9Xh*NVAj8ufHqkYr}FOuk0STe?!{ZkM<}GezcAn=06_ zr=qqRbBW1V%1_V7zP@$TWZpq)yG!(bSUQw_9^@tXXWL(FgqCs@_IbBSQd8onu+HK7 z@li~&a;9U(;#)K@&WzL*HRLwjA4ysE)U&^n+IQ6PNC#)`ocJHyZAcZlTOFJDR_HP! z7p@$Y$@E9|bNdx?a*|}wp4~S{zA;f|IcYo@UilsAVUF~PXE*Amyo5G)6WSYc*_^d< z$c+Wh+uzIQow!W?8o`#ORl#=0M&`6Yyrs_~5c&L{{CRi{tqG{7fIEL7a?V5E$iYQ&OC(8^(?eTyyvB6uTlNwb=;o6 zP1Ln+w)$Z1|22pI=>XZoFJWD>IVs#+5vU(x5`7vZ^+6)Lkk5)v*-%zhli+hZ5frft zfd)c4HWj`052s}VCP!hb{**obI+sdzeMUjsmeY&{2GC7wBR@?Jy}zhJnk%trx!9-U c+=Ggp?$CU`RNN_VYBm?#s7V8*Sa#5V0rH&3TL1t6 literal 0 HcmV?d00001 diff --git a/src/autoschedulers/anderson2021/check_weights.cpp b/src/autoschedulers/anderson2021/check_weights.cpp new file mode 100644 index 000000000000..9e8ea5ec9b2e --- /dev/null +++ b/src/autoschedulers/anderson2021/check_weights.cpp @@ -0,0 +1,59 @@ +#include +#include +#include +#include + +#include "CostModel.h" +#include "NetworkSize.h" + +using namespace Halide; + +std::string getenv_safe(const char *key) { + const char *value = getenv(key); + if (!value) value = ""; + return value; +} + +int check_weights(const std::string &filename, const std::vector &shape) { + Runtime::Buffer buf(shape); + + std::ifstream i(filename, std::ios_base::binary); + i.read((char *)(buf.data()), buf.size_in_bytes()); + i.close(); + + int num_nans = 0; + + buf.for_each_value([&filename, &num_nans](float 
&f) { + if (std::isnan(f)) { + std::cerr << "NaN found in weights: " << filename << "\n"; + ++num_nans; + } + }); + + return num_nans; +} + +int main(int argc, char **argv) { + using std::string; + + string weights_dir = getenv_safe("HL_WEIGHTS_DIR"); + if (weights_dir.empty()) { + std::cout << "No weights_dir specified. Exiting.\n"; + return 0; + } + + std::cout << "Checking weights...\n"; + + int num_nans = check_weights(weights_dir + "/head1_conv1_weight.data", {head1_channels, head1_w, head1_h}); + num_nans += check_weights(weights_dir + "/head1_conv1_bias.data", {head1_channels}); + + num_nans += check_weights(weights_dir + "/head2_conv1_weight.data", {head2_channels, head2_w}); + num_nans += check_weights(weights_dir + "/head2_conv1_bias.data", {head2_channels}); + + num_nans += check_weights(weights_dir + "/trunk_conv1_weight.data", {conv1_channels, head1_channels + head2_channels}); + num_nans += check_weights(weights_dir + "/trunk_conv1_bias.data", {conv1_channels}); + + std::cout << "Number of NaNs found: " << num_nans << "\n"; + + return 0; +} diff --git a/src/autoschedulers/anderson2021/cost_model_generator.cpp b/src/autoschedulers/anderson2021/cost_model_generator.cpp new file mode 100644 index 000000000000..c6608a412d0a --- /dev/null +++ b/src/autoschedulers/anderson2021/cost_model_generator.cpp @@ -0,0 +1,676 @@ +// This file defines our cost model as a Halide generator. It is +// templated such that it can be compiled in either forward or +// backwards mode, for inference or training respectively. + +#include "Halide.h" + +#include "NetworkSize.h" +#include "cost_model_schedule.h" + +using namespace Halide; +using Halide::Derivative; + +// A model weight is either just an input, or an input and an output +// (the updated weights and the ADAM state) depending on whether we're +// doing inference or training. 
+template +struct ModelWeight; + +template<> +struct ModelWeight : public GeneratorInput> { + ModelWeight(const std::string &name, int dim) + : GeneratorInput>(name, dim) { + } + void backprop(const Derivative &d, Expr learning_rate, Expr timestep) { + } + void set_shape(int s0 = 0, int s1 = 0, int s2 = 0) { + if (s0) dim(0).set_bounds(0, s0); + if (s1) dim(1).set_bounds(0, s1); + if (s2) dim(2).set_bounds(0, s2); + } +}; + +template<> +struct ModelWeight : public GeneratorInput> { + GeneratorOutput> grad; + + ModelWeight(const std::string &name, int dim) + : GeneratorInput>(name, dim), grad("updated_" + name, dim + 1) { + } + void backprop(const Derivative &d, Expr learning_rate, Expr timestep) { + std::vector args(dimensions() + 1); + for (auto &e : args) + e = Var(); + grad(args) = undef(); + + // We'll report back the new weights and the loss gradients, + // and update the ADAM state. Depending on the mode the caller + // is in, it may use the new weights, or it may just send the + // loss gradients up to an ADAM server. 
+ args.back() = 0; + FuncRef new_weight = grad(args); + args.back() = 1; + FuncRef smoothed_deriv = grad(args); + args.back() = 2; + FuncRef smoothed_second_moment = grad(args); + args.back() = 3; + FuncRef loss_gradient = grad(args); + + args.pop_back(); + Expr current_weight = (*this)(args); + + loss_gradient = d(*this)(args); + + // Update the first and second moment estimates + smoothed_deriv = 0.9f * smoothed_deriv + 0.1f * loss_gradient; + smoothed_second_moment = 0.999f * smoothed_second_moment + 0.001f * pow(loss_gradient, 2); + + // Correction to account for the fact that the smoothed_deriv + // and smoothed_second_moment start at zero when t == 0 + Expr smoothed_deriv_correction = 1 / (1 - pow(0.9f, timestep + 1)); + Expr smoothed_second_moment_correction = 1 / (1 - pow(0.999f, timestep + 1)); + + // Update the weights + Expr step = learning_rate * smoothed_deriv * smoothed_deriv_correction; + step /= sqrt(smoothed_second_moment * smoothed_second_moment_correction) + 1e-5f; + + new_weight = current_weight - step; + } + + void set_shape(int s0 = 0, int s1 = 0, int s2 = 0) { + if (s0) { + dim(0).set_bounds(0, s0); + dim(0).set_estimate(0, s0); + grad.dim(0).set_bounds(0, s0); + grad.dim(0).set_estimate(0, s0); + grad.bound(grad.args()[0], 0, s0); + grad.set_estimate(grad.args()[0], 0, s0); + } + if (s1) { + dim(1).set_bounds(0, s1); + dim(1).set_estimate(0, s1); + grad.dim(1).set_bounds(0, s1); + grad.dim(1).set_estimate(0, s1); + grad.bound(grad.args()[1], 0, s1); + grad.set_estimate(grad.args()[1], 0, s1); + } + if (s2) { + dim(2).set_bounds(0, s2); + dim(2).set_estimate(0, s2); + grad.dim(2).set_bounds(0, s2); + grad.dim(2).set_estimate(0, s2); + grad.bound(grad.args()[2], 0, s2); + grad.set_estimate(grad.args()[2], 0, s2); + } + grad.dim(dimensions()).set_bounds(0, 4); + grad.dim(dimensions()).set_estimate(0, 4); + } +}; + +template +class CostModel : public Generator> { +public: + template + using Input = GeneratorInput; + template + using Output = 
GeneratorOutput; + using Generator>::auto_schedule; + using Generator>::get_pipeline; + + // Number of pipeline stages + Input num_stages{"num_stages", 1}; + + // Batch size. Every item in the batch is a different schedule for + // the same algorithm. + Input batch_size{"batch_size", 1}; + + // Number of cores on the target machine. Used to reason about idle cores. + Input num_cores{"num_cores", 1}; + + Input batch_id{"batch_id", 0}; + + GeneratorParam enable_debug_output{"enable_debug_output", false}; + + // Algorithm-specific features + Input> pipeline_features{"pipeline_features", 3}; + + // Schedule-specific features + Input> schedule_features{"schedule_features", 3}; + + // Network weights. We use some template-fu so that they are + // inputs in inference mode, and inputs and outputs in training + // mode. + using Weight = ModelWeight; + Weight head1_filter{"head1_filter", 3}; + Weight head1_bias{"head1_bias", 1}; + Weight head2_filter{"head2_filter", 2}; + Weight head2_bias{"head2_bias", 1}; + Weight filter1{"filter1", 2}; + Weight bias1{"bias1", 1}; + + // Some extra inputs for training mode. + Input learning_rate{"learning_rate", 1.0f}; + Input timestep{"timestep", 0}; // Needed by ADAM + + // The index of the fastest schedule in the batch. Used as a + // reference point for computing relative throughput. + Input reference{"reference", 0}; + + // The true runtimes obtained by benchmarking. + Input> true_runtime{"true_runtime", 1}; + + // The predicted runtimes + Output> prediction_output{"prediction_output", 1}; + + // Predicted per stage run times + Output> cost_per_stage_output{"cost_per_stage_output", 2}; + + // The loss. L2 on relative throughput. 
+ Output> loss_output{"loss_output", 0}; + + // Zero pad along the last dimension of a Func + Func pad_stages(Func f, Expr stages) { + Halide::Region bounds(f.dimensions()); + bounds[1].min = 0; + bounds[1].extent = stages; + return BoundaryConditions::constant_exterior(f, cast(f.value().type(), 0), bounds); + } + + Expr activation(Expr e) { + // leaky relu + return max(e, 0) + min(e, 0) * 1e-10f; + } + + Expr sigmoid(Expr e) { + return 1 / (1 + exp(-e)); + } + + Expr print_wrap(Expr e, const std::string& out, const Var& n, const Var& w) { + if (training || !enable_debug_output) { + return e; + } + + return print(e, "<-", out + ".", "batch_id =", batch_id, "pipeline_id =", n, "stage_id =", w); + } + + void generate() { + Var c("c"), w("w"), n("n"), j("j"), s("s"); + + Func normalized_schedule_features("normalized_schedule_features"); + normalized_schedule_features(n, c, s) = fast_log(schedule_features(n, c, s) + 1); + + // Force the weights of the algorithm embedding layer to be positive and bounded. + Func squashed_head1_filter("squashed_head1_filter"); + squashed_head1_filter(c, s, n) = sigmoid(head1_filter(c, s, n)); + + // Explicitly broadcast the weights across the batch. This + // gives the autoscheduler some more options in the + // reverse-mode pipeline. + Func squashed_head1_filter_broadcast("squashed_head1_filter_broadcast"); + squashed_head1_filter_broadcast(c, w, s, n) = squashed_head1_filter(c, s, n); + + // The conv layer that embeds the algorithm-specific features. + Func head1_conv("head1_conv"); + RDom r_head1(0, head1_w, 0, head1_h); + head1_conv(c, w) = head1_bias(c); + head1_conv(c, w) += (squashed_head1_filter_broadcast(c, w, r_head1.x, r_head1.y) * + pipeline_features(r_head1.x, r_head1.y, w)); + + // No point in a relu - the inputs and weights are positive + + // The conv layer that embeds the schedule-specific features. 
+ Func head2_conv("head2_conv"); + RDom r_head2(0, head2_w); + head2_conv(c, w, n) = head2_bias(c); + head2_conv(c, w, n) += head2_filter(c, r_head2) * normalized_schedule_features(n, r_head2, w); + + Func head2_relu("head2_relu"); + head2_relu(c, w, n) = activation(head2_conv(c, w, n)); + + // The conv layer that computes coefficients, split into two + // stages. First we consume the algorithm embedding. + Func conv1_stage1("conv1_stage1"); + RDom r1_stage1(0, head1_channels); + conv1_stage1(c, w) = bias1(c); + conv1_stage1(c, w) += filter1(c, r1_stage1.x) * head1_conv(r1_stage1.x, w); + + // Then we consume the schedule embedding. + Func conv1_stage2("conv1_stage2"); + RDom r1_stage2(0, head2_channels); + conv1_stage2(c, w, n) = conv1_stage1(c, w); + conv1_stage2(c, w, n) += filter1(c, head1_filter.dim(0).extent() + r1_stage2.x) * head2_relu(r1_stage2.x, w, n); + + // The final set of predicted coefficients. + Func relu1("relu1"); + relu1(c, w, n) = activation(conv1_stage2(c, w, n)); + + // That's the end of the neural network. Now we will use these + // coefficients with a bunch of hand-designed terms. + + // Unpack all of the schedule features. 
We don't use all of + // them, but it's easier to avoid bugs if we just unpack them + // all in the same order as Featurization.h + int idx = 0; + Expr num_realizations = schedule_features(n, idx++, w); + Expr num_productions = schedule_features(n, idx++, w); + Expr points_computed_per_realization = schedule_features(n, idx++, w); + Expr points_computed_per_production = schedule_features(n, idx++, w); + Expr points_computed_per_thread = schedule_features(n, idx++, w); + Expr points_computed_total = schedule_features(n, idx++, w); + Expr points_computed_minimum = schedule_features(n, idx++, w); + Expr innermost_loop_extent = schedule_features(n, idx++, w); + Expr innermost_pure_loop_extent = schedule_features(n, idx++, w); + Expr unrolled_loop_extent = schedule_features(n, idx++, w); + Expr inner_parallelism = schedule_features(n, idx++, w); + Expr outer_parallelism = schedule_features(n, idx++, w); + Expr bytes_at_realization = schedule_features(n, idx++, w); + Expr bytes_at_production = schedule_features(n, idx++, w); + Expr bytes_at_root = schedule_features(n, idx++, w); + Expr innermost_bytes_at_realization = schedule_features(n, idx++, w); + Expr innermost_bytes_at_production = schedule_features(n, idx++, w); + Expr innermost_bytes_at_root = schedule_features(n, idx++, w); + Expr inlined_calls = schedule_features(n, idx++, w); + + Expr unique_global_bytes_read_per_realization = schedule_features(n, idx++, w); + Expr unique_shared_bytes_read_per_realization = schedule_features(n, idx++, w); + Expr unique_register_bytes_read_per_realization = schedule_features(n, idx++, w); + Expr unique_global_lines_read_per_realization = schedule_features(n, idx++, w); + Expr unique_shared_lines_read_per_realization = schedule_features(n, idx++, w); + Expr unique_register_lines_read_per_realization = schedule_features(n, idx++, w); + + Expr unique_global_bytes_read_per_thread = schedule_features(n, idx++, w); + Expr unique_shared_bytes_read_per_thread = schedule_features(n, 
idx++, w); + Expr unique_register_bytes_read_per_thread = schedule_features(n, idx++, w); + Expr unique_global_lines_read_per_thread = schedule_features(n, idx++, w); + Expr unique_shared_lines_read_per_thread = schedule_features(n, idx++, w); + Expr unique_register_lines_read_per_thread = schedule_features(n, idx++, w); + + Expr global_allocation_bytes_read_per_realization = schedule_features(n, idx++, w); + Expr shared_allocation_bytes_read_per_realization = schedule_features(n, idx++, w); + Expr register_allocation_bytes_read_per_realization = schedule_features(n, idx++, w); + Expr working_set = schedule_features(n, idx++, w); + Expr num_scalars = schedule_features(n, idx++, w); + Expr global_bytes_at_task = schedule_features(n, idx++, w); + Expr shared_bytes_at_task = schedule_features(n, idx++, w); + Expr register_bytes_at_task = schedule_features(n, idx++, w); + Expr global_innermost_bytes_at_task = schedule_features(n, idx++, w); + Expr shared_innermost_bytes_at_task = schedule_features(n, idx++, w); + Expr register_innermost_bytes_at_task = schedule_features(n, idx++, w); + Expr unique_bytes_read_per_point = schedule_features(n, idx++, w); + Expr unique_lines_read_per_point = schedule_features(n, idx++, w); + Expr unique_bytes_read_per_task = schedule_features(n, idx++, w); + Expr unique_lines_read_per_task = schedule_features(n, idx++, w); + Expr working_set_at_task = schedule_features(n, idx++, w); + Expr working_set_at_production = schedule_features(n, idx++, w); + Expr working_set_at_realization = schedule_features(n, idx++, w); + Expr working_set_at_root = schedule_features(n, idx++, w); + + Expr num_blocks = schedule_features(n, idx++, w); + Expr num_warps_per_block = schedule_features(n, idx++, w); + Expr block_occupancy = schedule_features(n, idx++, w); + + Expr warp_lane_utilization = schedule_features(n, idx++, w); + Expr num_active_warps_per_block = schedule_features(n, idx++, w); + Expr warp_lane_utilization_at_block_y = schedule_features(n, 
idx++, w); + Expr warp_lane_utilization_at_block_z = schedule_features(n, idx++, w); + Expr idle_lane_wastage = schedule_features(n, idx++, w); + + Expr num_shared_mem_loads_per_block = schedule_features(n, idx++, w); + Expr num_global_mem_loads_per_block = schedule_features(n, idx++, w); + Expr num_shared_mem_stores_per_block = schedule_features(n, idx++, w); + Expr num_global_mem_stores_per_block = schedule_features(n, idx++, w); + + Expr shared_mem_store_efficiency = schedule_features(n, idx++, w); + Expr shared_mem_load_efficiency = schedule_features(n, idx++, w); + + Expr global_mem_store_efficiency = schedule_features(n, idx++, w); + Expr global_mem_load_efficiency = schedule_features(n, idx++, w); + + Expr working_set_at_thread = schedule_features(n, idx++, w); + + Expr shared_mem_occupancy = schedule_features(n, idx++, w); + Expr shared_mem_block_limit_factor = schedule_features(n, idx++, w); + Expr max_warp_occupancy = schedule_features(n, idx++, w); + Expr max_block_occupancy = schedule_features(n, idx++, w); + + Expr num_threads_per_block = schedule_features(n, idx++, w); + Expr expr_branching = schedule_features(n, idx++, w); + + assert(idx == head2_w); + + num_blocks = max(1, num_blocks); + + // Count up the number of things computed, applying a + // different cost to vectors and scalars, and a different cost + // depending on whether we were inlined. 
+ Expr compute_cost = select(inlined_calls == 0, + num_scalars * relu1(1, w, n), + num_scalars * relu1(3, w, n)); + + compute_cost = print_wrap(compute_cost, "compute_cost_initial", n, w); + + compute_cost += select(inlined_calls == 0, + (num_blocks * num_threads_per_block * points_computed_per_thread * relu1(19, w, n)), + (num_blocks * num_threads_per_block * points_computed_per_thread * relu1(4, w, n))); + + compute_cost = print_wrap(compute_cost, "compute_cost_after_warps", n, w); + + Expr num_tasks = max(1, inner_parallelism * outer_parallelism); + Expr tasks_per_core = num_tasks / num_cores; + Expr idle_core_wastage = ceil(tasks_per_core) / max(1, tasks_per_core); + compute_cost *= idle_core_wastage; + + compute_cost = print_wrap(compute_cost, "compute_cost_after_idle_core_wastage", n, w); + + // Ignore for inlined stages + // Serial loops use a single thread + + compute_cost /= select(inlined_calls == 0, 1 - idle_lane_wastage, 1.f); + compute_cost = print_wrap(compute_cost, "compute_cost_after_idle_lane", n, w); + + expr_branching = max(1, relu1(23, w, n) * expr_branching); + expr_branching = print_wrap(expr_branching, "expr_branching", n, w); + + num_threads_per_block = print_wrap(num_threads_per_block, "num_threads_per_block", n, w); + + Expr num_registers_available_per_thread = min(64.f, 65536.f / num_threads_per_block); + Expr num_registers_per_block = num_threads_per_block * min(num_registers_available_per_thread, expr_branching); + Expr max_theoretical_active_blocks = max(1.f, floor(65536.f / num_registers_per_block)); + Expr max_active_blocks = min(max_theoretical_active_blocks, 32.f); + + Expr register_block_occupancy = print_wrap(select(inlined_calls == 0, max_active_blocks / 32.f, 1.f), "register_block_occupancy", n, w); + + //compute_cost *= select(inlined_calls == 0, 1.f / register_block_occupancy, 1.f); + compute_cost = print_wrap(compute_cost, "compute_cost_after_register_block_occupancy", n, w); + + // Next comes a long list of plausible terms 
to capture the cost of loads. + Expr load_cost = num_realizations * unique_global_lines_read_per_realization * relu1(5, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_realizations * unique_global_lines_read_per_realization", n, w); + + load_cost += num_realizations * unique_shared_lines_read_per_realization * relu1(16, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_realizations * unique_shared_lines_read_per_realization", n, w); + + load_cost += num_realizations * unique_register_lines_read_per_realization * relu1(8, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_realizations * unique_register_lines_read_per_realization", n, w); + + load_cost += num_realizations * unique_global_bytes_read_per_realization * relu1(6, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_realizations * unique_global_bytes_read_per_realization", n, w); + + load_cost += num_realizations * unique_shared_bytes_read_per_realization * relu1(20, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_realizations * unique_shared_bytes_read_per_realization", n, w); + + load_cost += num_realizations * unique_register_bytes_read_per_realization * relu1(7, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_realizations * unique_register_bytes_read_per_realization", n, w); + + load_cost += num_blocks * num_threads_per_block * unique_global_lines_read_per_thread * relu1(18, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_blocks * num_threads_per_block * unique_global_lines_read_per_thread", n, w); + + load_cost += num_blocks * num_threads_per_block * unique_shared_lines_read_per_thread * relu1(17, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_blocks * num_threads_per_block * unique_shared_lines_read_per_thread", n, w); + + load_cost += num_blocks * num_threads_per_block * unique_register_lines_read_per_thread * relu1(2, w, n); + load_cost = print_wrap(load_cost, "load_cost after 
num_blocks * num_threads_per_block * unique_register_lines_read_per_thread", n, w); + + load_cost += num_blocks * num_threads_per_block * unique_global_bytes_read_per_thread * relu1(13, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_blocks * num_threads_per_block * unique_global_bytes_read_per_thread", n, w); + + load_cost += num_blocks * num_threads_per_block * unique_shared_bytes_read_per_thread * relu1(11, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_blocks * num_threads_per_block * unique_shared_bytes_read_per_thread", n, w); + + load_cost += num_blocks * num_threads_per_block * unique_register_bytes_read_per_thread * relu1(0, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_blocks * num_threads_per_block * unique_register_bytes_read_per_thread", n, w); + + load_cost += num_scalars * unique_bytes_read_per_point * relu1(10, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_scalars * unique_bytes_read_per_point", n, w); + + load_cost += num_scalars * unique_lines_read_per_point * relu1(12, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_scalars * unique_lines_read_per_point", n, w); + + load_cost += num_tasks * unique_bytes_read_per_task * relu1(14, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_tasks * unique_bytes_read_per_task", n, w); + + load_cost += num_tasks * unique_lines_read_per_task * relu1(15, w, n); + load_cost = print_wrap(load_cost, "load_cost after num_tasks * unique_lines_read_per_task", n, w); + + Expr global_mem_load_cost = num_blocks * num_global_mem_loads_per_block * relu1(28, w, n); + + global_mem_load_cost = print_wrap(global_mem_load_cost, "global_mem_load_cost", n, w); + + global_mem_load_cost *= select(inlined_calls == 0, 1.f / global_mem_load_efficiency, 1); + global_mem_load_cost = print_wrap(global_mem_load_cost, "global_mem_load_cost_after_load_efficiency", n, w); + + Expr shared_mem_load_cost = num_blocks * 
num_shared_mem_loads_per_block * relu1(27, w, n); + + shared_mem_load_cost = print_wrap(shared_mem_load_cost, "shared_mem_load_cost_after_load_efficiency", n, w); + + load_cost += global_mem_load_cost + shared_mem_load_cost; + + // Store costs + Expr shared_mem_store_cost = num_blocks * num_shared_mem_stores_per_block * relu1(29, w, n); + + shared_mem_store_cost = print_wrap(shared_mem_store_cost, "shared_mem_store_cost_after_store_efficiency", n, w); + + Expr global_mem_store_cost = num_blocks * num_global_mem_stores_per_block * relu1(21, w, n); + global_mem_store_cost *= select(inlined_calls == 0, 1.f / global_mem_store_efficiency, 1); + + global_mem_store_cost = print_wrap(global_mem_store_cost, "global_mem_store_cost_after_store_efficiency", n, w); + + Expr store_cost = shared_mem_store_cost + global_mem_store_cost; + + // Now account for false sharing of cache lines. The + // probability of a store hitting a cache line also hit by + // another core is inversely proportional to + // innermost_bytes_at_task, and the cost is paid on every + // store. + Expr cost_of_false_sharing = + select(inner_parallelism > 1, + relu1(22, w, n) * (num_scalars) / max(1, global_innermost_bytes_at_task), + 0.0f); + + store_cost += cost_of_false_sharing; + store_cost = print_wrap(store_cost, "store_cost_after_false_sharing", n, w); + + // Malloc is not free, so add a cost per allocation. + Expr cost_of_malloc = relu1(24, w, n) * num_realizations; + + // A cost for launching a parallel task... + Expr cost_of_parallel_launches = num_productions * select(inner_parallelism > 1, relu1(25, w, n), 0.0f); + + // ... and an overhead per task. + Expr cost_of_parallel_tasks = num_productions * (inner_parallelism - 1) * relu1(26, w, n); + + Expr cost_of_parallelism = cost_of_parallel_tasks + cost_of_parallel_launches; + + // Make it easier for the model to penalize working sets that + // start to fall out of cache by giving it a term that gets + // multiplied by the working set. 
+ Expr cost_of_working_set = working_set * relu1(9, w, n); + + Expr cost = (print_wrap(compute_cost, "compute_cost_total", n, w) + + print_wrap(store_cost, "store_cost_total", n, w) + + print_wrap(load_cost, "load_cost_total", n, w) + + print_wrap(cost_of_malloc, "cost_of_malloc_total", n, w) + + print_wrap(cost_of_parallelism, "cost_of_parallelism_total", n, w) + + print_wrap(cost_of_working_set, "cost_of_working_set_total", n, w)); + + cost = print_wrap(cost, "cost_total", n, w); + + for (int i = 0; i < conv1_channels; i++) { + cost += 0.0f * relu1(i, w, n); + } + + Func runtime_per_stage; + // Change units so that network weights are in a human-readable range. + runtime_per_stage(n, w) = cost * 1e-9f; + cost_per_stage_output(n, w) = runtime_per_stage(n, w); + + // Sum across the stages. + Func prediction; + RDom r_reduce(0, num_stages); + prediction(n) += cost_per_stage_output(n, r_reduce); + + prediction_output(n) = prediction(n); + + Func err; + + if (!training) { + loss_output() = 0.0f; + } else { + + // The tail end of the reverse-mode pipeline + RDom r_batch(0, batch_size); + + // We believe the coefficients on all the various + // components of cost should be positive, even before the + // relu, and even before schedule-specific features are + // taken into account. The network shouldn't be telling us + // that things would be cheaper if we would do more + // mallocs, or compute more values, or launch more + // parallel tasks. So we add a regularization term. This + // helps dead relus get unstuck. + RDom r_conv1_output(0, conv1_channels, 0, num_stages); + Expr regularize = sum(-min(conv1_stage2(r_conv1_output.x, r_conv1_output.y, n), 0)); + + // Our loss will be L2 on relative throughput. + + // Get the reference runtime. 
+ Expr n2 = clamp(reference, 0, batch_size - 1); + Expr scale = 1.0f / true_runtime(n2); + + // Compute the relative true runtime and the relative predicted runtime + Expr p1 = prediction(n) * scale; + Expr r1 = true_runtime(n) * scale; + + // Invert them to get relative throughput, and compute L2 loss. + Expr delta = pow(1.0f / max(p1, 1e-10f) - 1.0f / r1, 2); + + // Add the regularization with a small weight. + err(n) = delta + 1e-5f * regularize; + + // Sum the errors over the batch. + Expr loss = sum(err(r_batch)); + + loss_output() = loss; + + // Compute derivatives of the loss, and backpropagate them + // to the model weights. + Derivative d_loss_d = propagate_adjoints(loss_output); + + Weight *weights[] = {&head1_filter, &head1_bias, + &head2_filter, &head2_bias, + &filter1, &bias1}; + + for (Weight *w : weights) { + w->backprop(d_loss_d, learning_rate, timestep); + } + } + + // All the model weight shapes are statically known, so we + // tell Halide their sizes to simplify the generated code. + head1_filter.set_shape(head1_channels, head1_w, head1_h); + head1_bias.set_shape(head1_channels); + head2_filter.set_shape(head2_channels, head2_w); + head2_bias.set_shape(head2_channels); + filter1.set_shape(conv1_channels, head1_channels + head2_channels); + bias1.set_shape(conv1_channels); + + // Estimates for autoscheduling this pipeline (using + // itself!). We do that offline and check in the generated + // schedule source, so that bugs in our autoscheduler don't + // cause build nightmares due to the circular dependency.
+ batch_id.set_estimate(0); + num_cores.set_estimate(80); + reference.set_estimate(0); + batch_size.set_estimate(80); + num_stages.set_estimate(13); + prediction_output.set_estimates({{0, 80}}); + cost_per_stage_output.set_estimates({{0, 80}, {0, 13}}); + learning_rate.set_estimate(0.001f); + timestep.set_estimate(37); + pipeline_features.set_estimates({{0, head1_w}, {0, head1_h}, {0, 13}}); + schedule_features.set_estimates({{0, 80}, {0, head2_w}, {0, 13}}); + true_runtime.set_estimates({{0, 80}}); + + // SCHEDULE + if (training && !auto_schedule) { + do_cost_model_schedule(get_pipeline()); + } else if (auto_schedule) { + // Do nothing. + } else { + // We just write down a good schedule for + // inference. Scheduling a couple of convs is easy. + Var no; + prediction_output.specialize(batch_size < 8).split(n, no, n, 1); + prediction_output.compute_root().split(n, no, n, 8).parallel(no); + prediction_output.bound(n, 0, batch_size); + + cost_per_stage_output.reorder(w, n); + cost_per_stage_output.specialize(batch_size < 8).split(n, no, n, 1); + cost_per_stage_output.compute_root().split(n, no, n, 8).parallel(no); + + // schedule for the forwards path + const int vec = 8; + + // A helper function for scheduling conv layers + auto schedule_conv = [&](Func conv, Func relu, RVar r_channels) { + Var ci("ci"), wi("wi"); + if (!training) { + relu + .compute_at(cost_per_stage_output, n) + .tile(c, w, ci, wi, vec, 4, TailStrategy::RoundUp) + .vectorize(ci); + conv.compute_at(relu, c); + } else { + // In training mode, we need the conv activations pre-relu too + conv.in() + .compute_root() + .tile(c, w, ci, wi, vec, 1, TailStrategy::RoundUp) + .vectorize(ci) + .unroll(wi) + .parallel(n, 8); + conv.compute_at(conv.in(), c); + relu + .compute_root() + .reorder_storage(c, w, n) + .reorder(c, w, n) + .vectorize(c, vec) + .parallel(n, 8); + } + conv + .vectorize(c) + .unroll(w) + .update() + .vectorize(c) + .unroll(w) + .reorder(c, w, r_channels); + }; + + // Pipeline features 
processing + conv1_stage1.compute_root().vectorize(c); + squashed_head1_filter.compute_root().vectorize(c); + + // Schedule features processing. The number of schedule + // features is not close to a multiple of 8, so vectorized + // across the batch. + if (!training) { + normalized_schedule_features + .compute_at(cost_per_stage_output, n) + .vectorize(n); + } else { + normalized_schedule_features + .compute_root() + .vectorize(n, 8); + } + + // conv+relu layers + schedule_conv(head2_conv, head2_relu, r_head2.x); + schedule_conv(conv1_stage2, relu1, r1_stage2.x); + } + } +}; + +using CostModelInference = CostModel; +using CostModelTraining = CostModel; + +HALIDE_REGISTER_GENERATOR(CostModelInference, cost_model); +HALIDE_REGISTER_GENERATOR(CostModelTraining, train_cost_model); diff --git a/src/autoschedulers/anderson2021/cost_model_schedule.h b/src/autoschedulers/anderson2021/cost_model_schedule.h new file mode 100644 index 000000000000..090cd672f465 --- /dev/null +++ b/src/autoschedulers/anderson2021/cost_model_schedule.h @@ -0,0 +1,620 @@ +#include "Halide.h" + +using namespace Halide; + +inline void do_cost_model_schedule(Halide::Pipeline pipeline) { + // Generated by autoscheduler, manually remove unrolls. + // Also manually replaced all RoundUp and ShiftInwards with GuardWithIf. 
+ //for (int i = 0; i < 58; i++) { + //pipeline.get_func(i).compute_root(); + //return; + //} + + using ::Halide::Func; + using ::Halide::MemoryType; + using ::Halide::RVar; + using ::Halide::TailStrategy; + using ::Halide::Var; + Func loss_output = pipeline.get_func(57); + Func sum_1 = pipeline.get_func(56); + Func f2 = pipeline.get_func(55); + Func sum = pipeline.get_func(54); + Func prediction_output = pipeline.get_func(53); + Func updated_bias1 = pipeline.get_func(52); + Func bias1_im_0_d_def__ = pipeline.get_func(51); + Func conv1_stage1_0_d_def___1 = pipeline.get_func(50); + Func updated_filter1 = pipeline.get_func(49); + Func filter1_im_0_d_def__ = pipeline.get_func(48); + Func updated_head2_bias = pipeline.get_func(47); + Func head2_bias_im_0_d_def__ = pipeline.get_func(46); + Func head2_conv_0_d_def___1 = pipeline.get_func(45); + Func updated_head2_filter = pipeline.get_func(44); + Func head2_filter_im_0_d_def__ = pipeline.get_func(43); + Func head2_conv_1_d_def__ = pipeline.get_func(42); + Func head2_relu_0_d_def__ = pipeline.get_func(41); + Func updated_head1_bias = pipeline.get_func(40); + Func head1_bias_im_0_d_def__ = pipeline.get_func(39); + Func head1_conv_0_d_def___1 = pipeline.get_func(38); + Func updated_head1_filter = pipeline.get_func(37); + Func head1_filter_im_0_d_def__ = pipeline.get_func(36); + Func squashed_head1_filter_0_d_def__ = pipeline.get_func(35); + Func squashed_head1_filter_broadcast_0_d_def__ = pipeline.get_func(34); + Func head1_conv_1_d_def__ = pipeline.get_func(33); + Func conv1_stage1_1_d_def__ = pipeline.get_func(32); + Func conv1_stage2_0_d_def___1 = pipeline.get_func(31); + Func conv1_stage2_1_d_def__ = pipeline.get_func(30); + Func sum_1_d_def__ = pipeline.get_func(29); + Func relu1_0_d_def__ = pipeline.get_func(28); + Func f0_0_d_def__ = pipeline.get_func(27); + Func cost_per_stage_output_0_d_def__ = pipeline.get_func(26); + Func f1_1_d_def__ = pipeline.get_func(25); + Func f2_0_d_def__ = pipeline.get_func(23); + Func 
sum_1_1_d_def__ = pipeline.get_func(22); + Func loss_output_0_d_def__ = pipeline.get_func(21); + Func adjoint = pipeline.get_func(20); + Func f1 = pipeline.get_func(19); + Func cost_per_stage_output = pipeline.get_func(18); + Func f0 = pipeline.get_func(17); + Func relu1 = pipeline.get_func(16); + Func conv1_stage2 = pipeline.get_func(15); + Func head2_relu = pipeline.get_func(14); + Func head2_conv = pipeline.get_func(13); + Func normalized_schedule_features = pipeline.get_func(12); + Func conv1_stage1 = pipeline.get_func(8); + Func head1_conv = pipeline.get_func(7); + Func squashed_head1_filter_broadcast = pipeline.get_func(6); + Func squashed_head1_filter = pipeline.get_func(5); + Var c(head2_conv_0_d_def___1.get_schedule().dims()[0].var); + Var ci("ci"); + Var n(sum.get_schedule().dims()[0].var); + Var ni("ni"); + Var nii("nii"); + Var niii("niii"); + Var r1316_z(filter1_im_0_d_def__.update(0).get_schedule().dims()[2].var); + Var r1512_y(filter1_im_0_d_def__.update(1).get_schedule().dims()[1].var); + Var s(squashed_head1_filter_0_d_def__.get_schedule().dims()[1].var); + Var si("si"); + Var v11(bias1_im_0_d_def__.get_schedule().dims()[0].var); + Var v11i("v11i"); + Var v12(filter1_im_0_d_def__.get_schedule().dims()[0].var); + Var v13(filter1_im_0_d_def__.get_schedule().dims()[1].var); + Var v13i("v13i"); + Var v14(head1_bias_im_0_d_def__.get_schedule().dims()[0].var); + Var v14i("v14i"); + Var v21(head2_bias_im_0_d_def__.get_schedule().dims()[0].var); + Var v21i("v21i"); + Var v22(head2_filter_im_0_d_def__.get_schedule().dims()[0].var); + Var v22i("v22i"); + Var v23(head2_filter_im_0_d_def__.get_schedule().dims()[1].var); + Var v298(updated_head1_filter.get_schedule().dims()[0].var); + Var v298i("v298i"); + Var v299(updated_head1_filter.get_schedule().dims()[1].var); + Var v299i("v299i"); + Var v300(updated_head1_filter.get_schedule().dims()[2].var); + Var v301(updated_head1_filter.get_schedule().dims()[3].var); + Var v301i("v301i"); + Var 
v302(updated_head1_bias.get_schedule().dims()[0].var); + Var v302i("v302i"); + Var v303(updated_head1_bias.get_schedule().dims()[1].var); + Var v304(updated_head2_filter.get_schedule().dims()[0].var); + Var v304i("v304i"); + Var v305(updated_head2_filter.get_schedule().dims()[1].var); + Var v306(updated_head2_filter.get_schedule().dims()[2].var); + Var v307(updated_head2_bias.get_schedule().dims()[0].var); + Var v307i("v307i"); + Var v308(updated_head2_bias.get_schedule().dims()[1].var); + Var v309(updated_filter1.get_schedule().dims()[0].var); + Var v309i("v309i"); + Var v310(updated_filter1.get_schedule().dims()[1].var); + Var v311(updated_filter1.get_schedule().dims()[2].var); + Var v312(updated_bias1.get_schedule().dims()[0].var); + Var v312i("v312i"); + Var v313(updated_bias1.get_schedule().dims()[1].var); + Var w(head2_conv_0_d_def___1.get_schedule().dims()[1].var); + Var wi("wi"); + RVar r1294_x(head2_relu_0_d_def__.update(0).get_schedule().dims()[0].var); + RVar r1316_x(filter1_im_0_d_def__.update(0).get_schedule().dims()[0].var); + RVar r1316_y(filter1_im_0_d_def__.update(0).get_schedule().dims()[1].var); + RVar r1336_x(conv1_stage1_1_d_def__.update(0).get_schedule().dims()[0].var); + RVar r1400_x(head2_filter_im_0_d_def__.update(0).get_schedule().dims()[0].var); + RVar r1400_y(head2_filter_im_0_d_def__.update(0).get_schedule().dims()[1].var); + RVar r1421_x(head2_bias_im_0_d_def__.update(0).get_schedule().dims()[0].var); + RVar r1421_y(head2_bias_im_0_d_def__.update(0).get_schedule().dims()[1].var); + RVar r1491_x(head1_conv_1_d_def__.update(0).get_schedule().dims()[0].var); + RVar r1512_x(filter1_im_0_d_def__.update(1).get_schedule().dims()[0].var); + RVar r1532_x(bias1_im_0_d_def__.update(0).get_schedule().dims()[0].var); + RVar r1594_x(head1_bias_im_0_d_def__.update(0).get_schedule().dims()[0].var); + RVar r1614_x(squashed_head1_filter_0_d_def__.update(0).get_schedule().dims()[0].var); + RVar r31_x(head1_conv.update(0).get_schedule().dims()[0].var); + 
RVar r31_y(head1_conv.update(0).get_schedule().dims()[1].var); + RVar r40_x(head2_conv.update(0).get_schedule().dims()[0].var); + RVar r54_x(conv1_stage1.update(0).get_schedule().dims()[0].var); + RVar r63_x(conv1_stage2.update(0).get_schedule().dims()[0].var); + RVar r81_x(f1.update(0).get_schedule().dims()[0].var); + RVar r89_x(sum_1.update(0).get_schedule().dims()[0].var); + RVar r94_x(sum.update(0).get_schedule().dims()[0].var); + RVar r94_y(sum.update(0).get_schedule().dims()[1].var); + loss_output + .compute_root(); + sum_1 + .compute_root(); + sum_1.update(0); + sum + .split(n, n, ni, 8, TailStrategy::GuardWithIf) + .vectorize(ni) + .compute_root() + .reorder(ni, n) + .serial(n); + sum.update(0) + .split(n, n, ni, 8, TailStrategy::GuardWithIf) + .vectorize(ni) + .reorder(ni, r94_x, r94_y, n) + .serial(n); + prediction_output + .split(n, n, ni, 8, TailStrategy::GuardWithIf) + .vectorize(ni) + .compute_root() + .reorder(ni, n) + .serial(n); + updated_bias1 + .split(v312, v312, v312i, 8, TailStrategy::GuardWithIf) + .vectorize(v312i) + .compute_root() + .reorder(v312i, v312, v313) + .fuse(v312, v313, v312) + .serial(v312); + updated_bias1.update(0) + .split(v312, v312, v312i, 8, TailStrategy::GuardWithIf) + .vectorize(v312i) + .reorder(v312i, v312) + .serial(v312); + updated_bias1.update(1) + .split(v312, v312, v312i, 8, TailStrategy::GuardWithIf) + .vectorize(v312i) + .reorder(v312i, v312) + .serial(v312); + updated_bias1.update(2) + .split(v312, v312, v312i, 8, TailStrategy::GuardWithIf) + .vectorize(v312i) + .reorder(v312i, v312) + .serial(v312); + updated_bias1.update(3) + .split(v312, v312, v312i, 8, TailStrategy::GuardWithIf) + .vectorize(v312i) + .reorder(v312i, v312) + .serial(v312); + bias1_im_0_d_def__ + .split(v11, v11, v11i, 8, TailStrategy::GuardWithIf) + .vectorize(v11i) + .compute_at(updated_bias1, v312) + .reorder(v11i, v11); + bias1_im_0_d_def__.update(0) + .split(v11, v11, v11i, 8, TailStrategy::GuardWithIf) + .vectorize(v11i) + .reorder(v11i, 
v11, r1532_x); + updated_filter1 + .split(v309, v309, v309i, 8, TailStrategy::GuardWithIf) + .vectorize(v309i) + .compute_root() + .reorder(v309i, v311, v309, v310) + .fuse(v309, v310, v309) + .serial(v309); + updated_filter1.update(0) + .split(v309, v309, v309i, 8, TailStrategy::GuardWithIf) + .vectorize(v309i) + .reorder(v309i, v309, v310) + .fuse(v309, v310, v309) + .serial(v309); + updated_filter1.update(1) + .split(v309, v309, v309i, 8, TailStrategy::GuardWithIf) + .vectorize(v309i) + .reorder(v309i, v309, v310) + .fuse(v309, v310, v309) + .serial(v309); + updated_filter1.update(2) + .split(v309, v309, v309i, 8, TailStrategy::GuardWithIf) + .vectorize(v309i) + .reorder(v309i, v309, v310) + .fuse(v309, v310, v309) + .serial(v309); + updated_filter1.update(3) + .split(v309, v309, v309i, 8, TailStrategy::GuardWithIf) + .vectorize(v309i) + .reorder(v309i, v309, v310) + .fuse(v309, v310, v309) + .serial(v309); + filter1_im_0_d_def__ + .split(v13, v13, v13i, 8, TailStrategy::GuardWithIf) + .vectorize(v13i) + .compute_root() + .reorder(v13i, v13, v12) + .fuse(v13, v12, v13) + .parallel(v13) + .reorder_storage(v13, v12); + filter1_im_0_d_def__.update(0) + .reorder(r1316_z, r1316_x, r1316_y, v12) + .vectorize(r1316_z, 8) + .unroll(r1316_z) + .parallel(v12); + filter1_im_0_d_def__.update(1) + .reorder(r1512_x, r1512_y, v12) + .vectorize(r1512_y) + .parallel(v12); + updated_head2_bias + .split(v307, v307, v307i, 8, TailStrategy::GuardWithIf) + .vectorize(v307i) + .compute_root() + .reorder(v307i, v307, v308) + .fuse(v307, v308, v307) + .serial(v307); + updated_head2_bias.update(0) + .split(v307, v307, v307i, 8, TailStrategy::GuardWithIf) + .vectorize(v307i) + .reorder(v307i, v307) + .serial(v307); + updated_head2_bias.update(1) + .split(v307, v307, v307i, 8, TailStrategy::GuardWithIf) + .vectorize(v307i) + .reorder(v307i, v307) + .serial(v307); + updated_head2_bias.update(2) + .split(v307, v307, v307i, 8, TailStrategy::GuardWithIf) + .vectorize(v307i) + .reorder(v307i, 
v307) + .serial(v307); + updated_head2_bias.update(3) + .split(v307, v307, v307i, 8, TailStrategy::GuardWithIf) + .vectorize(v307i) + .reorder(v307i, v307) + .serial(v307); + head2_bias_im_0_d_def__ + .split(v21, v21, v21i, 8, TailStrategy::GuardWithIf) + .vectorize(v21i) + .compute_at(updated_head2_bias, v307) + .reorder(v21i, v21); + head2_bias_im_0_d_def__.update(0) + .split(v21, v21, v21i, 8, TailStrategy::GuardWithIf) + .vectorize(v21i) + .reorder(v21i, v21, r1421_x, r1421_y); + head2_conv_0_d_def___1 + .store_in(MemoryType::Stack) + .split(c, c, ci, 8, TailStrategy::GuardWithIf) + .vectorize(ci) + .compute_at(head2_bias_im_0_d_def__, v21) + .reorder(ci, c, w, n); + updated_head2_filter + .split(v304, v304, v304i, 8, TailStrategy::GuardWithIf) + .vectorize(v304i) + .compute_root() + .reorder(v304i, v306, v304, v305) + .fuse(v304, v305, v304) + .parallel(v304); + updated_head2_filter.update(0) + .split(v304, v304, v304i, 8, TailStrategy::GuardWithIf) + .vectorize(v304i) + .reorder(v304i, v304, v305) + .fuse(v304, v305, v304) + .parallel(v304); + updated_head2_filter.update(1) + .split(v304, v304, v304i, 8, TailStrategy::GuardWithIf) + .vectorize(v304i) + .reorder(v304i, v304, v305) + .fuse(v304, v305, v304) + .serial(v304); + updated_head2_filter.update(2) + .split(v304, v304, v304i, 8, TailStrategy::GuardWithIf) + .vectorize(v304i) + .reorder(v304i, v304, v305) + .fuse(v304, v305, v304) + .serial(v304); + updated_head2_filter.update(3) + .split(v304, v304, v304i, 8, TailStrategy::GuardWithIf) + .vectorize(v304i) + .reorder(v304i, v304, v305) + .fuse(v304, v305, v304) + .parallel(v304); + head2_filter_im_0_d_def__ + .split(v22, v22, v22i, 8, TailStrategy::GuardWithIf) + .vectorize(v22i) + .compute_at(updated_head2_filter, v304) + .reorder(v22i, v22, v23); + head2_filter_im_0_d_def__.update(0) + .split(v22, v22, v22i, 8, TailStrategy::GuardWithIf) + .vectorize(v22i) + .reorder(v22i, v22, v23, r1400_x, r1400_y); + head2_conv_1_d_def__ + .split(w, w, wi, 2, 
TailStrategy::GuardWithIf) + .split(n, n, ni, 128, TailStrategy::GuardWithIf) + .split(c, c, ci, 8, TailStrategy::GuardWithIf) + .split(ni, ni, nii, 4, TailStrategy::GuardWithIf) + .unroll(wi) + .unroll(nii) + .vectorize(ci) + .compute_root() + .reorder(ci, wi, nii, c, ni, w, n) + .fuse(w, n, w) + .serial(w); + head2_relu_0_d_def__ + .store_in(MemoryType::Stack) + .split(c, c, ci, 8, TailStrategy::GuardWithIf) + .unroll(w) + .unroll(n) + .vectorize(ci) + .compute_at(head2_conv_1_d_def__, c) + .reorder(ci, c, w, n); + head2_relu_0_d_def__.update(0) + .split(c, c, ci, 8, TailStrategy::GuardWithIf) + .unroll(w) + .unroll(n) + .vectorize(ci) + .reorder(ci, c, w, n, r1294_x); + updated_head1_bias + .split(v302, v302, v302i, 8, TailStrategy::GuardWithIf) + .vectorize(v302i) + .compute_root() + .reorder(v302i, v302, v303) + .serial(v303); + updated_head1_bias.update(0) + .split(v302, v302, v302i, 8, TailStrategy::GuardWithIf) + .vectorize(v302i) + .reorder(v302i, v302); + updated_head1_bias.update(1) + .split(v302, v302, v302i, 8, TailStrategy::GuardWithIf) + .vectorize(v302i) + .reorder(v302i, v302); + updated_head1_bias.update(2) + .split(v302, v302, v302i, 8, TailStrategy::GuardWithIf) + .vectorize(v302i) + .reorder(v302i, v302); + updated_head1_bias.update(3) + .split(v302, v302, v302i, 8, TailStrategy::GuardWithIf) + .vectorize(v302i) + .reorder(v302i, v302); + head1_bias_im_0_d_def__ + .split(v14, v14, v14i, 8, TailStrategy::GuardWithIf) + .vectorize(v14i) + .compute_root() + .reorder(v14i, v14); + head1_bias_im_0_d_def__.update(0) + .split(v14, v14, v14i, 8, TailStrategy::GuardWithIf) + .vectorize(v14i) + .reorder(v14i, v14, r1594_x); + updated_head1_filter + .split(v299, v299, v299i, 5, TailStrategy::GuardWithIf) + .split(v301, v301, v301i, 2, TailStrategy::GuardWithIf) + .split(v298, v298, v298i, 8, TailStrategy::GuardWithIf) + .unroll(v299i) + .unroll(v301i) + .vectorize(v298i) + .compute_root() + .reorder(v298i, v298, v299i, v301i, v299, v300, v301) + 
.fuse(v300, v301, v300) + .fuse(v299, v300, v299) + .serial(v299); + updated_head1_filter.update(0) + .split(v299, v299, v299i, 5, TailStrategy::GuardWithIf) + .split(v298, v298, v298i, 8, TailStrategy::GuardWithIf) + .vectorize(v298i) + .reorder(v298i, v298, v299i, v299, v300) + .fuse(v299, v300, v299) + .serial(v299); + updated_head1_filter.update(1) + .split(v299, v299, v299i, 5, TailStrategy::GuardWithIf) + .split(v298, v298, v298i, 8, TailStrategy::GuardWithIf) + .vectorize(v298i) + .reorder(v298i, v298, v299i, v299, v300) + .fuse(v299, v300, v299) + .serial(v299); + updated_head1_filter.update(2) + .split(v299, v299, v299i, 5, TailStrategy::GuardWithIf) + .split(v298, v298, v298i, 8, TailStrategy::GuardWithIf) + .vectorize(v298i) + .reorder(v298i, v298, v299i, v299, v300) + .fuse(v299, v300, v299) + .serial(v299); + updated_head1_filter.update(3) + .split(v299, v299, v299i, 5, TailStrategy::GuardWithIf) + .split(v298, v298, v298i, 8, TailStrategy::GuardWithIf) + .vectorize(v298i) + .reorder(v298i, v298, v299i, v299, v300) + .fuse(v299, v300, v299) + .serial(v299); + squashed_head1_filter_0_d_def__ + .store_in(MemoryType::Stack) + .split(c, c, ci, 8, TailStrategy::GuardWithIf) + .vectorize(ci) + .compute_at(updated_head1_filter, v298) + .reorder(ci, c, s, n); + squashed_head1_filter_0_d_def__.update(0) + .split(c, c, ci, 8, TailStrategy::GuardWithIf) + .vectorize(ci) + .reorder(ci, c, s, n, r1614_x); + squashed_head1_filter_broadcast_0_d_def__ + .store_in(MemoryType::Stack) + .split(c, c, ci, 8, TailStrategy::GuardWithIf) + .vectorize(ci) + .compute_at(updated_head1_filter, v299i) + .store_at(updated_head1_filter, v299) + .reorder(ci, c, w, s, n); + head1_conv_1_d_def__ + .split(c, c, ci, 8, TailStrategy::GuardWithIf) + .vectorize(ci) + .compute_root() + .reorder(ci, c, w) + .serial(w); + head1_conv_1_d_def__.update(0) + .split(c, c, ci, 8, TailStrategy::GuardWithIf) + .vectorize(ci) + .reorder(ci, c, r1491_x, w) + .serial(w); + conv1_stage1_1_d_def__ + 
.split(w, w, wi, 8, TailStrategy::GuardWithIf) + .vectorize(wi) + .compute_root() + .reorder(wi, w, c) + .fuse(w, c, w) + .serial(w) + .reorder_storage(w, c); + conv1_stage1_1_d_def__.update(0) + .split(w, w, wi, 8, TailStrategy::GuardWithIf) + .vectorize(wi) + .reorder(wi, r1336_x, w, c) + .fuse(w, c, w) + .serial(w); + conv1_stage2_1_d_def__ + .split(c, c, ci, 14, TailStrategy::GuardWithIf) + .split(n, n, ni, 32, TailStrategy::GuardWithIf) + .split(ni, ni, nii, 4) + .split(nii, nii, niii, 2) + .split(w, w, wi, 8, TailStrategy::GuardWithIf) + .vectorize(wi) + .compute_root() + .reorder(wi, w, ci, niii, nii, ni, c, n) + .fuse(c, n, c) + .parallel(c) + .reorder_storage(w, c, n); + sum_1_d_def__ + .split(n, n, ni, 8, TailStrategy::GuardWithIf) + .unroll(n) + .vectorize(ni) + .compute_at(conv1_stage2_1_d_def__, c) + .reorder(ni, n); + + relu1_0_d_def__.in() + .store_in(MemoryType::Stack) + .split(c, c, ci, 8, TailStrategy::GuardWithIf) + .vectorize(ci) + .compute_at(conv1_stage2_1_d_def__, nii) + .reorder(ci, c, w, n); + relu1_0_d_def__.compute_at(relu1_0_d_def__.in(), w); + + cost_per_stage_output_0_d_def__ + .store_in(MemoryType::Stack) + .split(w, w, wi, 8, TailStrategy::GuardWithIf) + .vectorize(wi) + .compute_at(conv1_stage2_1_d_def__, nii) + .store_at(conv1_stage2_1_d_def__, ni) + .reorder(wi, w, n) + .reorder_storage(w, n); + f1_1_d_def__ + .split(n, n, ni, 8, TailStrategy::GuardWithIf) + .unroll(n) + .vectorize(ni) + .compute_at(conv1_stage2_1_d_def__, c) + .reorder(ni, n); + adjoint + .compute_root(); + f1 + .split(n, n, ni, 8, TailStrategy::GuardWithIf) + .vectorize(ni) + .compute_root() + .reorder(ni, n) + .serial(n); + f1.update(0) + .split(n, n, ni, 8, TailStrategy::GuardWithIf) + .vectorize(ni) + .reorder(ni, r81_x, n) + .serial(n); + cost_per_stage_output + .split(n, n, ni, 128, TailStrategy::GuardWithIf) + .split(w, w, wi, 2, TailStrategy::GuardWithIf) + .split(ni, ni, nii, 8, TailStrategy::GuardWithIf) + .vectorize(nii) + .compute_root() + 
.reorder(nii, ni, wi, n, w) + .fuse(n, w, n) + .serial(n); + conv1_stage2 + .split(n, n, ni, 512, TailStrategy::GuardWithIf) + .split(c, c, ci, 10, TailStrategy::GuardWithIf) + .split(w, w, wi, 2, TailStrategy::GuardWithIf) + .split(ni, ni, nii, 8, TailStrategy::GuardWithIf) + .vectorize(nii) + .compute_root() + .reorder(nii, ni, ci, wi, n, c, w) + .fuse(c, w, c) + .fuse(n, c, n) + .serial(n) + .reorder_storage(n, c, w); + conv1_stage2.update(0) + .split(n, n, ni, 512, TailStrategy::GuardWithIf) + .split(c, c, ci, 10, TailStrategy::GuardWithIf) + .split(w, w, wi, 2, TailStrategy::GuardWithIf) + .split(ni, ni, nii, 8, TailStrategy::GuardWithIf) + .vectorize(nii) + .reorder(nii, r63_x, ni, ci, wi, n, c, w) + .fuse(c, w, c) + .fuse(n, c, n) + .serial(n); + head2_relu + .split(c, c, ci, 3, TailStrategy::GuardWithIf) + .split(n, n, ni, 8, TailStrategy::GuardWithIf) + .vectorize(ni) + .compute_root() + .reorder(ni, n, ci, c, w) + .fuse(c, w, c) + .serial(c) + .reorder_storage(n, c, w); + head2_conv + .split(n, n, ni, 512, TailStrategy::GuardWithIf) + .split(c, c, ci, 6, TailStrategy::GuardWithIf) + .split(ni, ni, nii, 8, TailStrategy::GuardWithIf) + .vectorize(nii) + .compute_root() + .reorder(nii, ni, ci, n, c, w) + .fuse(c, w, c) + .fuse(n, c, n) + .serial(n) + .reorder_storage(n, c, w); + head2_conv.update(0) + .split(n, n, ni, 512, TailStrategy::GuardWithIf) + .split(c, c, ci, 6, TailStrategy::GuardWithIf) + .split(ni, ni, nii, 8, TailStrategy::GuardWithIf) + .vectorize(nii) + .reorder(nii, r40_x, ni, ci, n, c, w) + .fuse(c, w, c) + .fuse(n, c, n) + .parallel(n); + normalized_schedule_features + .split(c, c, ci, 11, TailStrategy::GuardWithIf) + .split(n, n, ni, 8, TailStrategy::GuardWithIf) + .vectorize(ni) + .compute_root() + .reorder(ni, n, ci, c, s) + .fuse(c, s, c) + .serial(c); + conv1_stage1 + .split(w, w, wi, 8, TailStrategy::GuardWithIf) + .vectorize(wi) + .compute_root() + .reorder(wi, w, c) + .fuse(w, c, w) + .serial(w) + .reorder_storage(w, c); + 
conv1_stage1.update(0) + .split(w, w, wi, 8, TailStrategy::GuardWithIf) + .vectorize(wi) + .reorder(wi, r54_x, w, c) + .fuse(w, c, w) + .serial(w); + head1_conv + .split(w, w, wi, 8, TailStrategy::GuardWithIf) + .vectorize(wi) + .compute_root() + .reorder(wi, w, c) + .fuse(w, c, w) + .serial(w) + .reorder_storage(w, c); + head1_conv.update(0) + .split(w, w, wi, 8, TailStrategy::GuardWithIf) + .vectorize(wi) + .reorder(wi, r31_x, r31_y, w, c) + .fuse(w, c, w) + .serial(w); + squashed_head1_filter + .split(s, s, si, 8, TailStrategy::GuardWithIf) + .vectorize(si) + .compute_at(head1_conv, w) + .reorder(si, s, c, n) + .reorder_storage(s, c, n); +} diff --git a/src/autoschedulers/anderson2021/demo_generator.cpp b/src/autoschedulers/anderson2021/demo_generator.cpp new file mode 100644 index 000000000000..8c31d68e5e1a --- /dev/null +++ b/src/autoschedulers/anderson2021/demo_generator.cpp @@ -0,0 +1,51 @@ +#include "Halide.h" + +namespace { + +using namespace Halide; + +class ConvRelu : public Halide::Generator { +public: + Input> input{"input", 4}; + Input> filter{"filter", 4}; + Input> bias{"bias", 1}; + Output> relu{"relu", 4}; + + void generate() { + const int N = 5, CI = 120, CO = 24, W = 100, H = 80; + + Var x("x"), y("y"), c("c"), n("n"); + + Func conv("conv"); + RDom r(0, CI, 0, 3, 0, 3); + conv(c, x, y, n) = bias(c); + conv(c, x, y, n) += filter(c, r.y, r.z, r.x) * input(r.x, x + r.y, y + r.z, n); + relu(c, x, y, n) = max(0, conv(c, x, y, n)); + + relu.bound(c, 0, CO) + .bound(x, 0, W) + .bound(y, 0, H) + .bound(n, 0, N); + + relu.dim(0).set_bounds(0, CO).set_stride(1); + relu.dim(1).set_bounds(0, W).set_stride(CO); + relu.dim(2).set_bounds(0, H).set_stride(CO * W); + relu.dim(3).set_bounds(0, N).set_stride(CO * H * W); + + input.dim(0).set_bounds(0, CI).set_stride(1); + input.dim(1).set_bounds(0, W + 2).set_stride(CI); + input.dim(2).set_bounds(0, H + 2).set_stride(CI * (W + 2)); + input.dim(3).set_bounds(0, N).set_stride(CI * (W + 2) * (H + 2)); + + 
filter.dim(0).set_bounds(0, CO).set_stride(1); + filter.dim(1).set_bounds(0, 3).set_stride(CO); + filter.dim(2).set_bounds(0, 3).set_stride(CO * 3); + filter.dim(3).set_bounds(0, CI).set_stride(CO * 3 * 3); + + bias.dim(0).set_bounds(0, CO).set_stride(1); + } +}; + +} // namespace + +HALIDE_REGISTER_GENERATOR(ConvRelu, demo) diff --git a/src/autoschedulers/anderson2021/featurization_to_sample.cpp b/src/autoschedulers/anderson2021/featurization_to_sample.cpp new file mode 100644 index 000000000000..fa94cb840cb9 --- /dev/null +++ b/src/autoschedulers/anderson2021/featurization_to_sample.cpp @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +// A sample is a featurization + a runtime + some ids, all together in one file. +// This utility concats the runtime and ids onto a featurization to produce a sample. +int main(int argc, char **argv) { + if (argc != 6) { + std::cout << "Usage: featurization_to_sample in.featurization runtime pipeline_id schedule_id out.sample\n"; + return -1; + } + + std::ifstream src(argv[1], std::ios::binary); + if (!src) { + std::cerr << "Unable to open input file: " << argv[1] << "\n"; + return -1; + } + + std::ofstream dst(argv[5], std::ios::binary); + if (!dst) { + std::cerr << "Unable to open output file: " << argv[5] << "\n"; + return -1; + } + + dst << src.rdbuf(); + + // Input runtime value is presumed to be in seconds, + // but sample file stores times in milliseconds. 
#!/bin/bash

# This script will generate a batch of data using the autoscheduler, retraining
# the cost model after each batch. It can be used for generating training data or
# for autotuning on an individual app.
# It is a wrapper around autotune_loop.sh, which handles compiling, benchmarking,
# and retraining the cost model. This file makes the process more user friendly
# by providing statistics, support for resuming previous batches, autotuning
# across multiple apps, etc.
#
# Arguments:
# max_iterations - the number of batches to generate. The cost model is
# retrained after each
# resume - resume using the previously generated samples (1) or start a new run?
# train_only - don't generate new data, just retrain the cost model with
# existing samples
# predict_only - don't generate new data, just predict the costs of the existing
# samples
# app - the individual application (in Halide/apps/) to generate data for. If
# not provided, it will generate data for all the apps in the list below

if [[ $# -ne 4 && $# -ne 5 ]]; then
    echo "Usage: $0 max_iterations resume train_only predict_only app"
    # A usage error is a failure; do not report success to the caller.
    exit 1
fi

set -e

MAX_ITERATIONS=${1}
RESUME=${2}
TRAIN_ONLY=${3}
PREDICT_ONLY=${4}
APP=${5}

if [[ $PREDICT_ONLY == 1 && $TRAIN_ONLY == 1 ]]; then
    echo "At most one of train_only and predict_only can be set to 1."
    exit 1
fi

if [[ $PREDICT_ONLY == 1 ]]; then
    echo "Predict only mode: ON"
fi

SCRIPTS_DIR="$(dirname $0)/scripts"
source ${SCRIPTS_DIR}/utils.sh

BEST_SCHEDULES_DIR=$(dirname $0)/best

find_halide HALIDE_ROOT

build_autoscheduler_tools ${HALIDE_ROOT}
get_absolute_autoscheduler_bin_dir ${HALIDE_ROOT} AUTOSCHED_BIN
get_autoscheduler_dir ${HALIDE_ROOT} AUTOSCHED_SRC

export CXX="ccache ${CXX}"

export HL_MACHINE_PARAMS=80,24000000,160

export HL_PERMIT_FAILED_UNROLL=1

export AUTOSCHED_BIN=${AUTOSCHED_BIN}
echo "AUTOSCHED_BIN set to ${AUTOSCHED_BIN}"
echo

# Only derive a target when the caller has not provided one.
if [ ! -v HL_TARGET ]; then
    get_host_target ${HALIDE_ROOT} HL_TARGET
    HL_TARGET=${HL_TARGET}-cuda-cuda_capability_70
fi

export HL_TARGET=${HL_TARGET}

echo "HL_TARGET set to ${HL_TARGET}"

# An incoming SAMPLES_DIR is used as the *name prefix* of the per-app samples
# directories; SAMPLES_DIR itself is reassigned for every app below.
DEFAULT_SAMPLES_DIR_NAME="${SAMPLES_DIR:-autotuned_samples}"

CURRENT_DATE_TIME="`date +%Y-%m-%d-%H-%M-%S`";

# SIGINT handler: stop the running children, record the best schedule found so
# far and print the summary before exiting.
function ctrl_c() {
    echo "Trap: CTRL+C received, exiting"
    # `set -e` is active, so the best-effort kills are guarded with `|| true`;
    # otherwise a "no process matched" status would abort this handler before
    # the best schedules are saved.
    pkill -P $$ || true

    for app in $APPS; do
        # pkill never matches itself, unlike `ps | grep | xargs kill`.
        pkill -f "${app}.generator" || true

        # NOTE(review): APP_DIR is whatever app the main loop was working on
        # when the signal arrived, not a per-iteration directory -- confirm
        # that this is intended.
        unset -v LATEST_SAMPLES_DIR
        for f in "$APP_DIR/${DEFAULT_SAMPLES_DIR_NAME}"*; do
            if [[ ! -d $f ]]; then
                continue
            fi

            if [[ -z ${LATEST_SAMPLES_DIR+x} || $f -nt $LATEST_SAMPLES_DIR ]]; then
                LATEST_SAMPLES_DIR=$f
            fi
        done

        # Reuse the latest samples dir only when one was actually found
        # (${VAR+x} expands to "x" when VAR is set, so the test must be -n).
        if [[ ${RESUME} -eq 1 && -n ${LATEST_SAMPLES_DIR+x} ]]; then
            SAMPLES_DIR=${LATEST_SAMPLES_DIR}
        else
            while [[ 1 ]]; do
                SAMPLES_DIR_NAME=${DEFAULT_SAMPLES_DIR_NAME}-${CURRENT_DATE_TIME}
                SAMPLES_DIR="${APP_DIR}/${SAMPLES_DIR_NAME}"

                if [[ ! -d ${SAMPLES_DIR} ]]; then
                    break
                fi

                sleep 1
                CURRENT_DATE_TIME="`date +%Y-%m-%d-%H-%M-%S`";
            done
        fi
        save_best_schedule_result ${BEST_SCHEDULES_DIR} ${SAMPLES_DIR}
    done

    print_best_schedule_times ${BEST_SCHEDULES_DIR}
    exit
}

trap ctrl_c INT

if [ -z "$APP" ]; then
    APPS="bgu bilateral_grid local_laplacian nl_means lens_blur camera_pipe stencil_chain harris hist max_filter unsharp interpolate conv_layer cuda_mat_mul iir_blur depthwise_separable_conv"
else
    APPS=${APP}
fi

NUM_APPS=0
for app in $APPS; do
    NUM_APPS=$((NUM_APPS + 1))
done

echo "Autotuning on $APPS for $MAX_ITERATIONS iteration(s)"

for app in $APPS; do
    # SECONDS is bash's built-in timer; resetting it times each app separately.
    SECONDS=0
    APP_DIR="${HALIDE_ROOT}/apps/${app}"

    # Find the most recently modified samples directory of this app, if any.
    unset -v LATEST_SAMPLES_DIR
    for f in "$APP_DIR/${DEFAULT_SAMPLES_DIR_NAME}"*; do
        if [[ ! -d $f ]]; then
            continue
        fi

        if [[ -z ${LATEST_SAMPLES_DIR+x} || $f -nt $LATEST_SAMPLES_DIR ]]; then
            LATEST_SAMPLES_DIR=$f
        fi
    done

    # Resume only when a previous run exists; otherwise (or when resume is
    # off) start a fresh, uniquely named, timestamped directory.
    if [[ ${RESUME} -eq 1 && -n ${LATEST_SAMPLES_DIR+x} ]]; then
        SAMPLES_DIR=${LATEST_SAMPLES_DIR}
        echo "Resuming from existing run: ${SAMPLES_DIR}"
    else
        while [[ 1 ]]; do
            SAMPLES_DIR_NAME=${DEFAULT_SAMPLES_DIR_NAME}-${CURRENT_DATE_TIME}
            SAMPLES_DIR="${APP_DIR}/${SAMPLES_DIR_NAME}"

            if [[ ! -d ${SAMPLES_DIR} ]]; then
                break
            fi

            sleep 1
            CURRENT_DATE_TIME="`date +%Y-%m-%d-%H-%M-%S`";
        done
        SAMPLES_DIR="${APP_DIR}/${SAMPLES_DIR_NAME}"
        echo "Starting new run in: ${SAMPLES_DIR}"
    fi

    OUTPUT_FILE="${SAMPLES_DIR}/autotune_out.txt"
    PREDICTIONS_FILE="${SAMPLES_DIR}/predictions"
    PREDICTIONS_WITH_FILENAMES_FILE="${SAMPLES_DIR}/predictions_with_filenames"
    BEST_TIMES_FILE="${SAMPLES_DIR}/best_times"

    mkdir -p ${SAMPLES_DIR}
    touch ${OUTPUT_FILE}

    # The cuda_mat_mul app builds a generator named mat_mul.
    if [[ ${app} = "cuda_mat_mul" ]]; then
        app="mat_mul"
    fi

    GENERATOR=bin/host/${app}.generator
    make -C ${APP_DIR} ${GENERATOR}

    if [[ $PREDICT_ONLY != 1 ]]; then
        NUM_BATCHES=${MAX_ITERATIONS} \
        TRAIN_ONLY=${TRAIN_ONLY} \
        SAMPLES_DIR=${SAMPLES_DIR} \
        HL_MACHINE_PARAMS=80,1,1 \
        HL_DEBUG_CODEGEN=0 \
        HL_SHARED_MEMORY_LIMIT=48 \
            bash ${AUTOSCHED_SRC}/autotune_loop.sh \
                ${APP_DIR}/${GENERATOR} \
                ${app} \
                ${HL_TARGET} \
                ${AUTOSCHED_SRC}/baseline.weights \
                ${AUTOSCHED_BIN} \
                ${TRAIN_ONLY} | tee -a ${OUTPUT_FILE}
    fi

    WEIGHTS_FILE="${SAMPLES_DIR}/updated.weights"
    predict_all ${HALIDE_ROOT} ${SAMPLES_DIR} ${WEIGHTS_FILE} ${PREDICTIONS_WITH_FILENAMES_FILE} 1 ${LIMIT:-0}
    # Drop the filename column, keeping "<second field>, <third field>".
    awk -F", " '{printf("%f, %f\n", $2, $3);}' ${PREDICTIONS_WITH_FILENAMES_FILE} > ${PREDICTIONS_FILE}

    echo "Computing average statistics..."
    bash ${SCRIPTS_DIR}/average_times.sh ${SAMPLES_DIR} >> ${OUTPUT_FILE}

    echo "Total autotune time (s): ${SECONDS}" >> ${OUTPUT_FILE}

    save_best_schedule_result ${BEST_SCHEDULES_DIR} ${SAMPLES_DIR}
done

print_best_schedule_times ${BEST_SCHEDULES_DIR}
+// Any extra arguments are assumed to be features that should be stripped from +// the target (as a convenience for use in Makefiles, where string manipulation +// can be painful). +int main(int argc, char **argv) { + Target t = get_host_target(); + for (int i = 1; i < argc; ++i) { + auto f = Target::feature_from_name(argv[i]); + if (f == Target::FeatureEnd) { + fprintf(stderr, "Unknown feature: %s\n", argv[i]); + exit(1); + } + t = t.without_feature(f); + } + printf("%s", t.to_string().c_str()); + return 0; +} diff --git a/src/autoschedulers/anderson2021/included_schedule_file.schedule.h b/src/autoschedulers/anderson2021/included_schedule_file.schedule.h new file mode 100644 index 000000000000..c84aab4cbc0c --- /dev/null +++ b/src/autoschedulers/anderson2021/included_schedule_file.schedule.h @@ -0,0 +1,60 @@ + +#ifndef included_schedule_file_SCHEDULE_H +#define included_schedule_file_SCHEDULE_H + +// MACHINE GENERATED -- DO NOT EDIT +// This schedule was automatically generated by apps/autoscheduler/AutoSchedule +// for target=x86-64-osx-avx-avx2-f16c-fma-sse41 +// with machine_params=16,16777216,40 + +#include "Halide.h" + +inline void apply_schedule_included_schedule_file( + ::Halide::Pipeline pipeline, + ::Halide::Target target) { + using ::Halide::Func; + using ::Halide::MemoryType; + using ::Halide::RVar; + using ::Halide::TailStrategy; + using ::Halide::Var; + + Func relu = pipeline.get_func(4); + Func conv = pipeline.get_func(3); + Var c(relu.get_schedule().dims()[0].var); + Var ci("ci"); + Var n(relu.get_schedule().dims()[3].var); + Var x(relu.get_schedule().dims()[1].var); + Var xi("xi"); + Var y(relu.get_schedule().dims()[2].var); + Var yi("yi"); + RVar r4_x(conv.update(0).get_schedule().dims()[0].var); + RVar r4_y(conv.update(0).get_schedule().dims()[1].var); + RVar r4_z(conv.update(0).get_schedule().dims()[2].var); + relu + .split(x, x, xi, 2, TailStrategy::ShiftInwards) + .split(c, c, ci, 8, TailStrategy::ShiftInwards) + .split(y, y, yi, 4, 
TailStrategy::ShiftInwards) + .unroll(xi) + .unroll(yi) + .vectorize(ci) + .compute_root() + .reorder(ci, xi, yi, c, y, x, n) + .fuse(x, n, x) + .parallel(x); + conv.update(0) + .split(c, c, ci, 8, TailStrategy::GuardWithIf) + .unroll(x) + .unroll(y) + .vectorize(ci) + .reorder(ci, c, x, y, n, r4_x, r4_y, r4_z); + conv + .store_in(MemoryType::Stack) + .split(c, c, ci, 8, TailStrategy::ShiftInwards) + .unroll(x) + .unroll(y) + .vectorize(ci) + .compute_at(relu, c) + .reorder(ci, c, x, y, n); +} + +#endif // included_schedule_file_SCHEDULE_H diff --git a/src/autoschedulers/anderson2021/included_schedule_file_generator.cpp b/src/autoschedulers/anderson2021/included_schedule_file_generator.cpp new file mode 100644 index 000000000000..1a5cb99a784f --- /dev/null +++ b/src/autoschedulers/anderson2021/included_schedule_file_generator.cpp @@ -0,0 +1,54 @@ +#include "Halide.h" + +#if defined(GENERATING_SCHEDULE) +// nothing +#else +#include "included_schedule_file.schedule.h" +#endif + +namespace { + +// Trivial Generator for testing (and demonstrating) use of .schedule.h +// files produced by the autoschedulers; this is very similar to +// demo_generator.cpp, but packaged separately to avoid confusion for +// newcomers. 
+struct IncludedScheduleFile : public Halide::Generator { + Input> input{"input", 4}; + Input> filter{"filter", 4}; + Input> bias{"bias", 1}; + Output> relu{"relu", 4}; + + void generate() { + const int N = 5, CI = 120, CO = 24, W = 100, H = 80; + + Var x("x"), y("y"), c("c"), n("n"); + + // Algorithm + Func conv("conv"); + RDom r(0, CI, 0, 3, 0, 3); + conv(c, x, y, n) = bias(c); + conv(c, x, y, n) += filter(c, r.y, r.z, r.x) * input(r.x, x + r.y, y + r.z, n); + relu(c, x, y, n) = max(0, conv(c, x, y, n)); + + // Estimates (for autoscheduler and/or RunGen) + input.set_estimates({{0, CI}, {0, W + 2}, {0, H + 2}, {0, N}}); + filter.set_estimates({{0, CO}, {0, 3}, {0, 3}, {0, CI}}); + bias.set_estimates({{0, CO}}); + relu.set_estimates({{0, CO}, {0, W}, {0, H}, {0, N}}); + + // Schedule + if (auto_schedule) { + // nothing + } else { +#if defined(GENERATING_SCHEDULE) + abort(); +#else + apply_schedule_included_schedule_file(get_pipeline(), get_target()); +#endif + } + } +}; + +} // namespace + +HALIDE_REGISTER_GENERATOR(IncludedScheduleFile, included_schedule_file) diff --git a/src/autoschedulers/anderson2021/retrain_cost_model.cpp b/src/autoschedulers/anderson2021/retrain_cost_model.cpp new file mode 100644 index 000000000000..7f40b318f961 --- /dev/null +++ b/src/autoschedulers/anderson2021/retrain_cost_model.cpp @@ -0,0 +1,728 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using Clock = std::chrono::high_resolution_clock; + +#include "cmdline.h" + +#include "DefaultCostModel.h" +#include "HalideBuffer.h" +#include "NetworkSize.h" + +namespace { + +using namespace Halide; + +using Halide::Runtime::Buffer; +using std::map; +using std::string; +using std::vector; + +struct Flags { + int epochs = 0; + std::vector rates = {0.0001f}; + string initial_weights_path; + string weights_out_path; + int num_cores = 32; + bool reset_weights = false; + bool randomize_weights = false; + string best_benchmark_path; 
+ string best_schedule_path; + string predictions_file; + bool verbose; + bool partition_schedules; + int limit; + + Flags(int argc, char **argv) { + cmdline::parser a; + + const char *kNoDesc = ""; + + constexpr bool kOptional = false; + a.add("epochs"); + a.add("rates"); + a.add("initial_weights", '\0', kNoDesc, kOptional, ""); + a.add("weights_out"); + a.add("reset_weights", '\0', kNoDesc, kOptional, false); + a.add("randomize_weights", '\0', kNoDesc, kOptional, false); + a.add("num_cores"); + a.add("best_benchmark"); + a.add("best_schedule"); + a.add("predictions_file"); + a.add("verbose"); + a.add("partition_schedules"); + a.add("limit"); + + a.parse_check(argc, argv); // exits if parsing fails + + epochs = a.get("epochs"); + rates = parse_floats(a.get("rates")); + initial_weights_path = a.get("initial_weights"); + weights_out_path = a.get("weights_out"); + reset_weights = a.exist("reset_weights") && a.get("reset_weights"); + randomize_weights = a.exist("randomize_weights") && a.get("randomize_weights"); + best_benchmark_path = a.get("best_benchmark"); + best_schedule_path = a.get("best_schedule"); + predictions_file = a.get("predictions_file"); + verbose = a.exist("verbose") && a.get("verbose"); + partition_schedules = a.exist("partition_schedules") && a.get("partition_schedules"); + limit = a.get("limit"); + + if (!reset_weights && epochs <= 0) { + std::cerr << "--epochs must be specified and > 0.\n"; + std::cerr << a.usage(); + exit(1); + } + if (!reset_weights && (!initial_weights_path.empty()) == randomize_weights) { + std::cerr << "You must specify exactly one of --initial_weights or --randomize_weights.\n"; + std::cerr << a.usage(); + exit(1); + } + if (weights_out_path.empty()) { + std::cerr << "--weights_out must be specified.\n"; + std::cerr << a.usage(); + exit(1); + } + if (!reset_weights && rates.empty()) { + std::cerr << "--rates cannot be empty.\n"; + std::cerr << a.usage(); + exit(1); + } + } + + std::vector parse_floats(const std::string &s) { 
+ const char *c = s.c_str(); + std::vector v; + while (isspace(*c)) + ++c; + while (*c) { + string f; + while (*c && !isspace(*c)) { + f += *c++; + } + v.push_back(std::atof(f.c_str())); + while (isspace(*c)) { + ++c; + } + } + return v; + } +}; + +constexpr int kModels = 1; + +struct Sample { + vector runtimes; // in msec + double prediction[kModels]; + string filename; + int32_t schedule_id; + uint64_t schedule_hash; + Buffer schedule_features; +}; + +struct PipelineData { + int32_t pipeline_id; + int32_t num_stages; + Buffer pipeline_features; + uint64_t pipeline_hash; +}; + +struct PipelineSample { + map schedules; + uint64_t fastest_schedule_hash; + float fastest_runtime{1e30f}; // in msec +}; + +uint64_t hash_floats(uint64_t h, const float *begin, const float *end) { + while (begin != end) { + uint32_t bits = *((const uint32_t *)begin); + // From boost + h ^= (bits + 0x9e3779b9 + (h << 6) + (h >> 2)); + begin++; + } + return h; +} + +bool ends_with(const string &str, const string &suffix) { + if (str.size() < suffix.size()) return false; + size_t off = str.size() - suffix.size(); + for (size_t i = 0; i < suffix.size(); i++) { + if (str[off + i] != suffix[i]) return false; + } + return true; +} + +string leaf(const string &path) { + size_t slash_pos = path.rfind('/'); +#ifdef _WIN32 + if (slash_pos == string::npos) { + // Windows is a thing + slash_pos = path.rfind('\\'); + } +#endif + if (slash_pos != string::npos) { + return path.substr(slash_pos + 1); + } else { + return path; + } +} + +// Load all the samples, reading filenames from stdin +size_t load_samples(map &training_set, map &validation_set, map &pipelines, const Flags &flags, bool predict_only) { + vector scratch(10 * 1024 * 1024); + + int best = -1; + float best_runtime = 1e20f; + string best_path; + + size_t num_read = 0, num_unique = 0; + auto start = Clock::now(); + std::cout << "Loading samples...\n"; + while (!std::cin.eof()) { + string s; + std::cin >> s; + if (s.empty()) { + std::cout << 
"Empty: " << s << "\n"; + continue; + } + if (!ends_with(s, ".sample")) { + std::cout << "Skipping file: " << s << "\n"; + continue; + } + std::ifstream file(s); + file.read((char *)(scratch.data()), scratch.size() * sizeof(float)); + const size_t floats_read = file.gcount() / sizeof(float); + const size_t num_features = floats_read - 3; + const size_t features_per_stage = head2_w + (head1_w + 1) * head1_h; + file.close(); + // Note we do not check file.fail(). The various failure cases + // are handled below by checking the number of floats read. We + // expect truncated files if the benchmarking or + // autoscheduling procedure crashes and want to filter them + // out with a warning. + + if (floats_read == scratch.size()) { + std::cout << "Too-large sample: " << s << " " << floats_read << "\n"; + continue; + } + if (num_features % features_per_stage != 0) { + if (flags.verbose) { + std::cout << "Truncated sample: " << s << " " << floats_read << "\n"; + } + continue; + } + const size_t num_stages = num_features / features_per_stage; + + const float runtime = scratch[num_features]; + if (runtime > 100000) { // Don't try to predict runtime over 100s + std::cout << "Implausible runtime in ms: " << runtime << "\n"; + continue; + } + // std::cout << "Runtime: " << runtime << "\n"; + + int pipeline_id = *((int32_t *)(&scratch[num_features + 1])); + const int schedule_id = *((int32_t *)(&scratch[num_features + 2])); + + if (runtime < best_runtime) { + best_runtime = runtime; + best = schedule_id; + best_path = s; + } + + PipelineData &p = pipelines[pipeline_id]; + + if (p.pipeline_features.data() == nullptr) { + p.pipeline_id = pipeline_id; + p.num_stages = (int)num_stages; + p.pipeline_features = Runtime::Buffer(head1_w, head1_h, num_stages); + for (size_t i = 0; i < num_stages; i++) { + for (int x = 0; x < head1_w; x++) { + for (int y = 0; y < head1_h; y++) { + float f = scratch[i * features_per_stage + (x + 1) * 7 + y + head2_w]; + if (f < 0 || std::isnan(f)) { + 
std::cout << "Negative or NaN pipeline feature: " << x << " " << y << " " << i << " " << f << "\n"; + } + p.pipeline_features(x, y, i) = f; + } + } + } + + p.pipeline_hash = hash_floats(0, p.pipeline_features.begin(), p.pipeline_features.end()); + } + + uint64_t schedule_hash = 0; + for (size_t i = 0; i < num_stages; i++) { + schedule_hash = + hash_floats(schedule_hash, + &scratch[i * features_per_stage], + &scratch[i * features_per_stage + head2_w]); + } + + uint64_t hash = flags.partition_schedules ? schedule_hash : p.pipeline_hash; + + // Whether or not a pipeline/schedule is part of the validation set + // can't be a call to rand. It must be a fixed property of a + // hash of some aspect of it. This way you don't accidentally + // do a training run where a validation set member was in the + // training set of a previous run. The id of the fastest + // schedule will do as a hash. + PipelineSample &ps = ((hash & 7) == 0) ? validation_set[pipeline_id] : training_set[pipeline_id]; + + auto it = ps.schedules.find(schedule_hash); + if (it != ps.schedules.end()) { + // Keep the smallest runtime at the front + float best = it->second.runtimes[0]; + if (runtime < best) { + it->second.runtimes.push_back(best); + it->second.runtimes[0] = runtime; + it->second.filename = s; + } else { + it->second.runtimes.push_back(runtime); + } + if (runtime < ps.fastest_runtime) { + ps.fastest_runtime = runtime; + ps.fastest_schedule_hash = schedule_hash; + } + } else { + Sample sample; + sample.schedule_hash = schedule_hash; + sample.filename = s; + sample.runtimes.push_back(runtime); + for (int i = 0; i < kModels; i++) { + sample.prediction[i] = 0.0; + } + sample.schedule_id = schedule_id; + sample.schedule_features = Buffer(head2_w, num_stages); + + bool ok = true; + for (size_t i = 0; i < num_stages; i++) { + for (int x = 0; x < head2_w; x++) { + float f = scratch[i * features_per_stage + x]; + if (f < 0 || f > 1e14 || std::isnan(f)) { + std::cout << "Negative or implausibly large 
schedule feature: " << i << " " << x << " " << f << "\n"; + // Something must have overflowed + ok = false; + } + sample.schedule_features(x, i) = f; + } + /* + if (sample.schedule_features(0, i) != sample.schedule_features(1, i)) { + std::cout << "Rejecting sliding window schedule for now\n"; + ok = false; + } + */ + } + if (ok) { + if (runtime < ps.fastest_runtime) { + ps.fastest_runtime = runtime; + ps.fastest_schedule_hash = schedule_hash; + } + ps.schedules.emplace(schedule_hash, std::move(sample)); + num_unique++; + } + } + num_read++; + + if (num_read % 10000 == 0) { + std::cout << "Samples loaded: " << num_read << " (" << num_unique << " unique)\n"; + } + } + + auto dur = Clock::now() - start; + auto ms = std::chrono::duration_cast(dur).count(); + auto avg = ms / (float)num_read; + std::cout << "Samples loaded: " << num_read << " (" << num_unique << " unique) in " << ms << "ms (avg. per sample = " << avg << " ms)\n"; + + // If the training set is empty, we are likely training on a single pipeline + if (training_set.empty()) { + training_set.swap(validation_set); + } + + // Check the noise level + for (const auto &pipe : training_set) { + double variance_sum = 0; + size_t count = 0; + // Compute the weighted average of variances across all samples + for (const auto &p : pipe.second.schedules) { + if (p.second.runtimes.empty()) { + std::cerr << "Empty runtimes for schedule: " << p.first << "\n"; + abort(); + } + if (flags.verbose) { + std::cout << "Unique sample: " << leaf(p.second.filename) << " : " << p.second.runtimes[0] << "\n"; + } + if (p.second.runtimes.size() > 1) { + // Compute variance from samples + double mean = 0; + for (float f : p.second.runtimes) { + mean += f; + } + mean /= p.second.runtimes.size(); + double variance = 0; + for (float f : p.second.runtimes) { + f -= mean; + variance += f * f; + } + variance_sum += variance; + count += p.second.runtimes.size() - 1; + } + } + if (count > 0) { + double stddev = std::sqrt(variance_sum / count); + 
std::cout << "Noise level: " << stddev << "\n"; + } + } + + std::cout << "Distinct pipelines: " << training_set.size() + validation_set.size() << "\n"; + + std::ostringstream o; + o << "Best runtime is " << best_runtime << " msec, from schedule id " << best << " in file " << best_path << "\n"; + std::cout << o.str(); + if (!predict_only && !flags.best_benchmark_path.empty()) { + std::ofstream f(flags.best_benchmark_path, std::ios_base::trunc); + f << o.str(); + f.close(); + assert(!f.fail()); + } + if (!predict_only && !flags.best_schedule_path.empty()) { + // best_path points to a .sample file; look for a .schedule.h file in the same dir + size_t dot = best_path.rfind('.'); + assert(dot != string::npos && best_path.substr(dot) == ".sample"); + string schedule_file = best_path.substr(0, dot) + ".schedule.h"; + std::ifstream src(schedule_file); + + if (!src.good()) { + std::cout << "Could not find " << schedule_file << ". Unable to save it as the best schedule. Continuing...\n"; + return num_read; + } + std::ofstream dst(flags.best_schedule_path); + dst << src.rdbuf(); + assert(!src.fail()); + assert(!dst.fail()); + } + + return num_read; +} + +void save_predictions(const map &samples, const string &filename) { + std::ostringstream out; + + for (const auto &p : samples) { + for (const auto &sched : p.second.schedules) { + out << sched.second.filename << ", " << sched.second.prediction[0] << ", " << sched.second.runtimes[0] << "\n"; + } + } + + std::ofstream file(filename, std::ios_base::trunc); + file << out.str(); + file.close(); + assert(!file.fail()); + + std::cout << "Predictions saved to: " << filename << "\n"; +} + +void print_statistics(const map &training_set, const map &validation_set) { + int64_t num_training_set_schedules = 0; + int64_t num_val_set_schedules = 0; + + for (const auto &ps : training_set) { + num_training_set_schedules += ps.second.schedules.size(); + } + + for (const auto &ps : validation_set) { + num_val_set_schedules += 
ps.second.schedules.size(); + } + + std::cout << "Training set: " + << training_set.size() + << " pipelines, " + << num_training_set_schedules + << " schedules. Validation set: " + << validation_set.size() + << " pipelines, " + << num_val_set_schedules + << " schedules.\n"; +} + +} // namespace + +int main(int argc, char **argv) { + Flags flags(argc, argv); + + // Iterate through the pipelines + vector> tpp; + Internal::Autoscheduler::Statistics stats; + for (int i = 0; i < kModels; i++) { + tpp.emplace_back(make_default_cost_model(stats, flags.initial_weights_path, flags.weights_out_path, flags.randomize_weights || flags.reset_weights)); + } + + if (flags.reset_weights) { + std::cout << "Saving new random weights...\n"; + for (int i = 0; i < kModels; i++) { + tpp[i]->save_weights(); + } + return 0; + } + + map samples; + map validation_set; + map pipelines; + bool predict_only = !flags.predictions_file.empty(); + size_t num_samples = load_samples(samples, validation_set, pipelines, flags, predict_only); + print_statistics(samples, validation_set); + + if (predict_only) { + std::cout << "Predicting only (no training)\n"; + flags.epochs = 1; + } + + std::cout.setf(std::ios::fixed, std::ios::floatfield); + std::cout.precision(4); + + auto seed = time(NULL); + std::mt19937 rng((uint32_t)seed); + + std::cout << "Iterating over " << samples.size() << " pipelines using seed = " << seed << "\n"; + + std::cout << "Constructing training batches\n"; + struct Batch { + int pipeline_id; + int first; + int batch_size; + }; + vector training_batches, validation_batches; + for (int train = 0; train < 2; train++) { + for (auto &p : train ? 
samples : validation_set) { + for (int first = 0; first < (int)p.second.schedules.size(); first += 64) { + Batch b; + b.pipeline_id = p.first; + b.first = first; + int end = std::min((int)p.second.schedules.size(), first + 64); + b.batch_size = end - first; + if (b.batch_size > 8) { + if (train) { + training_batches.push_back(b); + } else { + validation_batches.push_back(b); + } + } + } + } + } + std::cout << training_batches.size() << " " << validation_batches.size() << " batches constructed\n"; + + std::chrono::time_point start = Clock::now(); + for (float learning_rate : flags.rates) { + float loss_sum[kModels] = {0}, loss_sum_counter[kModels] = {0}; + float correct_ordering_rate_sum[kModels] = {0}; + float correct_ordering_rate_count[kModels] = {0}; + float v_correct_ordering_rate_sum[kModels] = {0}; + float v_correct_ordering_rate_count[kModels] = {0}; + + for (int e = 0; e < flags.epochs; e++) { + std::chrono::time_point epoch_start = Clock::now(); + int counter = 0; + + float worst_miss = 0; + uint64_t worst_miss_pipeline_id = 0; + uint64_t worst_miss_schedule_id = 0; + + struct Inversion { + int pipeline_id; + string f1, f2; + float p1, p2; + float r1, r2; + float badness = 0; + } worst_inversion; + +#if defined(_OPENMP) +#pragma omp parallel for +#endif + for (int model = 0; model < kModels; model++) { + loss_sum[model] = 0; + loss_sum_counter[model] = 0; + correct_ordering_rate_sum[model] = 0; + correct_ordering_rate_count[model] = 0; + v_correct_ordering_rate_sum[model] = 0; + v_correct_ordering_rate_count[model] = 0; + + std::shuffle(training_batches.begin(), training_batches.end(), rng); + + for (int train = 0; train < 2; train++) { + auto &tp = tpp[model]; + + for (auto &p : train ? training_batches : validation_batches) { + tp->reset(); + const auto &pipeline = pipelines[p.pipeline_id]; + auto &sample = train ? 
samples[p.pipeline_id] : validation_set[p.pipeline_id]; + tp->set_pipeline_features(pipeline.pipeline_features, flags.num_cores); + + int fastest_idx = 0; + Halide::Runtime::Buffer runtimes(p.batch_size); + + auto it = sample.schedules.begin(); + std::advance(it, p.first); + std::vector> cost_per_stage; + cost_per_stage.resize(p.batch_size); + for (int j = 0; j < p.batch_size; j++) { + auto &sched = it->second; + Halide::Runtime::Buffer buf; + tp->enqueue(pipeline.num_stages, &buf, &sched.prediction[model], &cost_per_stage[j]); + runtimes(j) = sched.runtimes[0]; + if (runtimes(j) < runtimes(fastest_idx)) { + fastest_idx = j; + } + buf.copy_from(sched.schedule_features); + it++; + } + + float loss = 0.0f; + if (train && !predict_only) { + loss = tp->backprop(runtimes, learning_rate); + assert(!std::isnan(loss)); + loss_sum[model] += loss; + loss_sum_counter[model]++; + + auto it = sample.schedules.begin(); + std::advance(it, p.first); + for (int j = 0; j < p.batch_size; j++) { + auto &sched = it->second; + float m = sched.runtimes[0] / (sched.prediction[model] + 1e-10f); + if (m > worst_miss) { + worst_miss = m; + worst_miss_pipeline_id = p.pipeline_id; + worst_miss_schedule_id = it->first; + } + it++; + } + } else { + tp->evaluate_costs(); + } + + if (true) { + int good = 0, bad = 0; + for (auto &sched : sample.schedules) { + auto &ref = sample.schedules[sample.fastest_schedule_hash]; + if (sched.second.prediction[model] == 0) continue; + assert(sched.second.runtimes[0] >= ref.runtimes[0]); + float runtime_ratio = sched.second.runtimes[0] / ref.runtimes[0]; + if (runtime_ratio <= 1.3f) continue; // Within 30% of the runtime of the best + if (sched.second.prediction[model] >= ref.prediction[model]) { + good++; + } else { + if (train) { + float badness = (sched.second.runtimes[0] - ref.runtimes[0]) * (ref.prediction[model] - sched.second.prediction[model]); + badness /= (ref.runtimes[0] * ref.runtimes[0]); + if (badness > worst_inversion.badness) { + 
worst_inversion.pipeline_id = p.pipeline_id; + worst_inversion.badness = badness; + worst_inversion.r1 = ref.runtimes[0]; + worst_inversion.r2 = sched.second.runtimes[0]; + worst_inversion.p1 = ref.prediction[model]; + worst_inversion.p2 = sched.second.prediction[model]; + worst_inversion.f1 = ref.filename; + worst_inversion.f2 = sched.second.filename; + } + } + bad++; + } + } + if (train) { + correct_ordering_rate_sum[model] += good; + correct_ordering_rate_count[model] += good + bad; + } else { + v_correct_ordering_rate_sum[model] += good; + v_correct_ordering_rate_count[model] += good + bad; + } + } + } + } + + counter++; + } + + std::cout << "Loss: "; + for (int model = 0; model < kModels; model++) { + std::cout << loss_sum[model] / loss_sum_counter[model] << " "; + } + if (kModels > 1) std::cout << "\n"; + std::cout << " Rate: "; + int best_model = 0; + float best_rate = 0; + for (int model = 0; model < kModels; model++) { + float rate = correct_ordering_rate_sum[model] / correct_ordering_rate_count[model]; + if (correct_ordering_rate_count[model] == 0) { + std::cout << "? "; + } else { + std::cout << rate << " "; + } + + rate = v_correct_ordering_rate_sum[model] / v_correct_ordering_rate_count[model]; + if (rate < best_rate) { + best_model = model; + best_rate = rate; + } + if (v_correct_ordering_rate_count[model] == 0) { + std::cout << "? "; + } else { + std::cout << rate << " "; + } + } + + if (kModels > 1) std::cout << "\n"; + if (!predict_only && samples.count(worst_miss_pipeline_id)) { + std::cout << " Worst: " << worst_miss << " " << leaf(samples[worst_miss_pipeline_id].schedules[worst_miss_schedule_id].filename) << " "; + } + + auto epoch_duration = Clock::now() - epoch_start; + auto total_duration = Clock::now() - start; + + auto epoch_ms = std::chrono::duration_cast(epoch_duration).count(); + auto total_ms = std::chrono::duration_cast(total_duration).count(); + std::cout << "(Epoch " << e + 1 << " "; + std::cout << "took " << epoch_ms << " ms. 
"; + std::cout << "Total time: " << total_ms << " ms. "; + std::cout << "Avg. time per epoch: " << total_ms / (float)(e + 1) << " ms. "; + std::cout << "Avg. time per epoch, per sample: " << total_ms / (float)((e + 1) * num_samples) << " ms)\n"; + + if (worst_inversion.badness > 0) { + std::cout << "Worst inversion:\n" + << leaf(worst_inversion.f1) << " predicted: " << worst_inversion.p1 << " actual: " << worst_inversion.r1 << "\n" + << leaf(worst_inversion.f2) << " predicted: " << worst_inversion.p2 << " actual: " << worst_inversion.r2 << "\n"; + if (samples.size() > 50000) { + // For robustness during training on large numbers + // of random pipelines, we discard poorly + // performing samples from the training set + // only. Some of them are weird degenerate + // pipelines. + samples.erase(worst_inversion.pipeline_id); + } + } + + if (!predict_only) { + tpp[best_model]->save_weights(); + } + + if (!predict_only && loss_sum[best_model] < 1e-5f) { + std::cout << "Zero loss, returning early\n"; + return 0; + } + } + } + + if (predict_only) { + save_predictions(samples, flags.predictions_file); + save_predictions(validation_set, flags.predictions_file + "_validation_set"); + } + + return 0; +} diff --git a/src/autoschedulers/anderson2021/scripts/average_times.sh b/src/autoschedulers/anderson2021/scripts/average_times.sh new file mode 100644 index 000000000000..bee5a0888452 --- /dev/null +++ b/src/autoschedulers/anderson2021/scripts/average_times.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +if [ $# -ne 1 ]; then + echo "Usage: $0 samples_dir" + exit +fi + +SAMPLES_DIR=${1} + +if [ ! 
-d ${SAMPLES_DIR} ]; then + echo "Samples directory not found: ${SAMPLES_DIR}" + exit +fi + +echo "Samples directory: ${SAMPLES_DIR}" + +GREEDY_SAMPLES=$(find ${SAMPLES_DIR} -regextype sed -regex ".*/.*/[1-9][0-9]*/compile_err.txt") + +echo "$GREEDY_SAMPLES" | xargs grep "Number of states added:" | awk -F" " '{sum += $5}; END{printf("Average number of states added (greedy): %d\n", sum / NR);}' + +grep "Number of states added:" ${SAMPLES_DIR}/*/0/compile_err.txt | awk -F" " '{sum += $5}; END{printf("Average number of states added (beam search): %d\n", sum / NR);}' + +echo "$GREEDY_SAMPLES" | xargs grep "Number of featurizations computed:" | awk -F" " '{sum += $5}; END{printf("Average number of featurizations computed (greedy): %d\n", sum / NR);}' + +grep "Number of featurizations computed:" ${SAMPLES_DIR}/*/0/compile_err.txt | awk -F" " '{sum += $5}; END{printf("Average number of featurizations computed (beam search): %d\n", sum / NR);}' + +echo "$GREEDY_SAMPLES" | xargs grep "Number of schedules evaluated by cost model" | awk -F" " '{sum += $8}; END{printf("Average number of schedules evaluated by cost model (greedy): %d\n", sum / NR);}' + +grep "Number of schedules evaluated by cost model" ${SAMPLES_DIR}/*/0/compile_err.txt | awk -F" " '{sum += $8}; END{printf("Average number of schedules evaluated by cost model (beam search): %d\n", sum / NR);}' + +echo "$GREEDY_SAMPLES" | xargs grep "Compile time" | awk '{sum += $4}; END{printf("Average greedy compile time: %f s\n", sum / NR);}' + +grep "Compile time" ${SAMPLES_DIR}/*/0/compile_err.txt | awk -F" " '{sum += $4}; END{printf("Average beam search compile time: %f s\n", sum / NR);}' + +echo "$GREEDY_SAMPLES" | xargs grep "Time taken for autoscheduler" | awk '{sum += $6}; END{printf("Average greedy autoschedule time: %f s\n", sum / NR);}' + +grep "Time taken for autoscheduler" ${SAMPLES_DIR}/*/0/compile_err.txt | awk -F" " '{sum += $6}; END{printf("Average beam search autoschedule time: %f s\n", sum / NR);}' + +# 
Average featurization time +echo "$GREEDY_SAMPLES" | xargs grep "Average featurization time" | awk -F" " '{sum += $NF}; END{printf("Average featurization time (greedy): %f\n", sum / NR);}' + +grep "Average featurization time" ${SAMPLES_DIR}/*/0/compile_err.txt | awk -F" " '{sum += $NF}; END{printf("Average featurization time (beam search): %f\n", sum / NR);}' + +# Average cost model evaluation time +echo "$GREEDY_SAMPLES" | xargs grep "Average cost model evaluation time" | awk -F" " '{sum += $NF}; END{printf("Average cost model evaluation time (greedy): %f\n", sum / NR);}' + +grep "Average cost model evaluation time" ${SAMPLES_DIR}/*/0/compile_err.txt | awk -F" " '{sum += $NF}; END{printf("Average cost model evaluation time (beam search): %f\n", sum / NR);}' + +# Average number of memoization hits +echo "$GREEDY_SAMPLES" | xargs grep "Number of memoization hits:" | awk -F" " '{sum += $NF}; END{printf("Average number of memoization hits (greedy): %d\n", sum / NR);}' + +grep "Number of memoization hits:" ${SAMPLES_DIR}/*/0/compile_err.txt | awk -F" " '{sum += $NF}; END{printf("Average number of memoization hits (beam search): %d\n", sum / NR);}' + +# Average number of memoization misses +echo "$GREEDY_SAMPLES" | xargs grep "Number of memoization misses:" | awk -F" " '{sum += $NF}; END{printf("Average number of memoization misses (greedy): %d\n", sum / NR);}' + +grep "Number of memoization misses:" ${SAMPLES_DIR}/*/0/compile_err.txt | awk -F" " '{sum += $NF}; END{printf("Average number of memoization misses (beam search): %d\n", sum / NR);}' + +echo "$GREEDY_SAMPLES" | xargs grep "Number of tilings generated:" | awk -F" " '{sum += $NF}; END{printf("Average number of tilings generated (greedy): %d\n", sum / NR);}' + +grep "Number of tilings generated:" ${SAMPLES_DIR}/*/0/compile_err.txt | awk -F" " '{sum += $NF}; END{printf("Average number of tilings generated (beam search): %d\n", sum / NR);}' + +echo "$GREEDY_SAMPLES" | xargs grep "Number of tilings accepted:" | awk 
#!/bin/bash
#
# predict_all.sh - run retrain_cost_model in predict-only mode over a samples
# directory and store its predictions.
#
# Usage: predict_all.sh samples_dir weights_file predictions_file include_filenames limit
#
# Writes <predictions_file> and <predictions_file>_validation_set. When
# include_filenames is 1 both files are left exactly as retrain_cost_model
# wrote them; otherwise every row is reduced to its 2nd and 3rd
# ", "-separated fields.

if [ $# -ne 5 ]; then
    echo "Usage: $0 samples_dir weights_file predictions_file include_filenames limit"
    # A usage error must not look like success to the caller.
    exit 1
fi

source "$(dirname "$0")/utils.sh"

find_halide HALIDE_ROOT

make_dir_path_absolute "${1}" SAMPLES_DIR
make_file_path_absolute "${2}" WEIGHTS_FILE
make_file_path_absolute "${3}" PREDICTIONS_FILE
INCLUDE_FILENAMES=${4}
LIMIT=${5}

echo
echo "Samples directory: ${SAMPLES_DIR}"
echo "Weights file: ${WEIGHTS_FILE}"
echo "Saving predictions to: ${PREDICTIONS_FILE}"

build_retrain_cost_model "${HALIDE_ROOT}"

NUM_CORES=80
NUM_EPOCHS=1

# The three literals and the two zeros are, in order: pipeline_id=0,
# learning_rate=0.001, then (after the predictions file) verbose=0 and
# partition_schedules=0.
retrain_cost_model "${HALIDE_ROOT}" "${SAMPLES_DIR}" "${WEIGHTS_FILE}" "${NUM_CORES}" "${NUM_EPOCHS}" 0 0.001 "${PREDICTIONS_FILE}" 0 0 "${LIMIT}"

if [[ $INCLUDE_FILENAMES == 1 ]]; then
    exit
fi

# Rewrite a predictions file in place, keeping only fields 2 and 3 of every
# row. The file is read completely into a variable before it is reopened for
# writing. The original additionally attached "> file" to the assignment
# itself, which truncates the file at a point whose order relative to the
# command substitution is unspecified by POSIX.
strip_filenames() {
    local -r file=$1
    local result
    result=$(awk -F", " '{printf("%f, %f\n", $2, $3);}' "${file}") || return
    echo "${result}" > "${file}"
}

strip_filenames "${PREDICTIONS_FILE}"
strip_filenames "${PREDICTIONS_FILE}_validation_set"
# Resolve a directory path to an absolute path.
#   $1 - directory path (relative or absolute)
#   $2 - name of the variable that receives the absolute path
# Returns non-zero and stores an empty string when the directory cannot be
# entered. The original ran `cd ...; pwd`, which silently returned the
# *current* directory in that case.
function make_dir_path_absolute() {
    local -r path=$1
    local -n absolute_path_ref=$2
    absolute_path_ref=$(cd "${path}" && pwd)
}

# Resolve a file path to an absolute path. The file itself need not exist,
# but its parent directory must.
#   $1 - file path (relative or absolute)
#   $2 - name of the variable that receives the absolute path
# Returns non-zero, leaving the target variable untouched, when the parent
# directory cannot be entered.
function make_file_path_absolute() {
    local -r path=$1
    local -n converted_path_ref=$2
    local _abs_parent
    _abs_parent=$(cd "$(dirname "${path}")" && pwd) || return 1
    converted_path_ref=${_abs_parent}/$(basename "${path}")
}

# Store the anderson2021 autoscheduler source directory for a Halide checkout.
#   $1 - Halide root directory
#   $2 - name of the variable that receives the directory
function get_autoscheduler_dir() {
    local -r halide_root=$1
    local -n autoscheduler_dir_ref=$2
    autoscheduler_dir_ref=${halide_root}/src/autoschedulers/anderson2021
}

# Store <autoscheduler dir>/bin, prefixed with the Halide root passed in.
#   $1 - Halide root directory
#   $2 - name of the variable that receives the directory
# Side effect (unchanged from the original): also sets the non-local
# variable `autoscheduler_dir`.
function get_absolute_autoscheduler_bin_dir() {
    local -r halide_root=$1
    local -n autoscheduler_bin_dir_ref=$2
    get_autoscheduler_dir "${halide_root}" autoscheduler_dir
    autoscheduler_bin_dir_ref=${autoscheduler_dir}/bin
}

# Store the bin directory relative to the autoscheduler directory ("bin").
#   $1 - name of the variable that receives the directory
function get_autoscheduler_bin_dir() {
    local -n autoscheduler_bin_dir_ref=$1
    autoscheduler_bin_dir_ref=bin
}

# Store the relative path "../autoscheduler/bin".
#   $1 - name of the variable that receives the directory
# NOTE(review): presumably the bin directory as seen from a sibling build
# directory -- confirm that "../autoscheduler" is still the right name.
function get_autoscheduler_make_bin_dir() {
    local -n autoscheduler_bin_dir_ref=$1
    autoscheduler_bin_dir_ref=../autoscheduler/bin
}

# Store <autoscheduler dir>/scripts for a Halide checkout.
#   $1 - Halide root directory
#   $2 - name of the variable that receives the directory
# Side effect (unchanged from the original): also sets the non-local
# variable `autoscheduler_dir`.
function get_autoscheduler_scripts_dir() {
    local -r halide_root=$1
    local -n autoscheduler_scripts_dir_ref=$2
    get_autoscheduler_dir "${halide_root}" autoscheduler_dir
    autoscheduler_scripts_dir_ref=${autoscheduler_dir}/scripts
}
# Private helper shared by the build_* wrappers below: announce and build one
# make target that lives under the autoscheduler's bin directory.
#   $1 - Halide root directory
#   $2 - label printed in the "Building <label>..." message
#   $3 - make target, relative to the autoscheduler bin directory
# Like the original wrappers, it sets the non-local variables
# `autoscheduler_dir` and `autoscheduler_bin_dir`.
function _build_autoscheduler_target() {
    local -r halide_root=$1
    local -r label=$2
    local -r target=$3
    get_autoscheduler_dir "${halide_root}" autoscheduler_dir
    get_autoscheduler_bin_dir autoscheduler_bin_dir

    echo
    echo "Building ${label}..."
    make -C "${autoscheduler_dir}" "${autoscheduler_bin_dir}/${target}"
    echo
}

# Build the featurization_to_sample tool.  $1 - Halide root directory.
function build_featurization_to_sample() {
    _build_autoscheduler_target "$1" "featurization_to_sample" "featurization_to_sample"
}

# Build the autoscheduler plugin library.  $1 - Halide root directory.
function build_libauto_schedule() {
    _build_autoscheduler_target "$1" "libauto_schedule" "libautoschedule_anderson2021.so"
}

# Build the retrain_cost_model tool.  $1 - Halide root directory.
function build_retrain_cost_model() {
    _build_autoscheduler_target "$1" "retrain_cost_model" "retrain_cost_model"
}

# Build the get_host_target tool.  $1 - Halide root directory.
function build_get_host_target() {
    _build_autoscheduler_target "$1" "get_host_target" "get_host_target"
}

# Run the get_host_target binary and store what it prints.
#   $1 - Halide root directory
#   $2 - name of the variable that receives the host target string
# The binary is taken from ${AUTOSCHED_BIN} when the caller has set it (the
# only location the original looked at) and otherwise from the bin directory
# derived from the Halide root. The original computed that directory but
# never used it, so the call failed whenever AUTOSCHED_BIN was unset.
function get_host_target() {
    local -r halide_root=$1
    local -n host_target_ref=$2

    get_absolute_autoscheduler_bin_dir "${halide_root}" autoscheduler_bin_dir

    echo "Calling get_host_target()..."
    host_target_ref=$("${AUTOSCHED_BIN:-${autoscheduler_bin_dir}}/get_host_target")
    echo "host_target = ${host_target_ref}"
    echo
}
# Feed every *.sample file below a samples directory to the
# retrain_cost_model binary (file names are piped to its stdin).
#   $1  - Halide root directory
#   $2  - samples directory, searched recursively for "*.sample"
#   $3  - weights file, used both as initial weights and as output
#   $4  - value for --num_cores
#   $5  - value for --epochs
#   $6  - pipeline id, used in the best.<id>.benchmark.txt / .schedule.h names
#   $7  - learning rate (default 0.001)
#   $8  - predictions file (default empty)
#   $9  - verbose flag (default 0)
#   $10 - partition_schedules flag (default 0)
#   $11 - limit (default 0)
# All path arguments are quoted so that directories containing whitespace or
# glob characters are passed through intact.
function retrain_cost_model() {
    local -r halide_root=$1
    local -r samples_dir=$2
    local -r weights=$3
    local -r num_cores=$4
    local -r num_epochs=$5
    local -r pipeline_id=$6
    local -r learning_rate=${7-0.001}
    local -r predictions_file=${8-""}
    local -r verbose=${9-0}
    local -r partition_schedules=${10-0}
    local -r limit=${11-0}

    get_absolute_autoscheduler_bin_dir "${halide_root}" autosched_bin

    echo "Using learning rate: ${learning_rate}"

    find "${samples_dir}" -name "*.sample" | \
        HL_NUM_THREADS=8 "${autosched_bin}/retrain_cost_model" \
            --epochs="${num_epochs}" \
            --rates="${learning_rate}" \
            --num_cores="${num_cores}" \
            --initial_weights="${weights}" \
            --weights_out="${weights}" \
            --best_benchmark="${samples_dir}/best.${pipeline_id}.benchmark.txt" \
            --best_schedule="${samples_dir}/best.${pipeline_id}.schedule.h" \
            --predictions_file="${predictions_file}" \
            --verbose="${verbose}" \
            --partition_schedules="${partition_schedules}" \
            --limit="${limit}"
}
# Run scripts/predict_all.sh of the autoscheduler with the given arguments.
#   $1 - Halide root directory
#   $2 - samples directory
#   $3 - weights file (the parameter keeps its original name, weights_dir)
#   $4 - predictions file
#   $5 - include_filenames flag
#   $6 - limit
function predict_all() {
    local -r halide_root=$1
    local -r samples_dir=$2
    local -r weights_dir=$3
    local -r predictions_file=$4
    local -r include_filenames=$5
    local -r limit=$6

    get_autoscheduler_scripts_dir "${halide_root}" scripts_dir
    bash "${scripts_dir}/predict_all.sh" "${samples_dir}" "${weights_dir}" "${predictions_file}" "${include_filenames}" "${limit}"
}

# Print the mean "Compile time" over <samples_dir>/*/0/compile_err.txt.
#   $1 - samples directory
# Prints nothing when no line matches (the original divided by zero).
function average_compile_time_beam_search() {
    local -r samples_dir=$1

    grep "Compile time" "${samples_dir}"/*/0/compile_err.txt | awk -F" " '{sum += $4}; END{if (NR > 0) printf("Average beam search compile time: %fs\n", sum / NR)}'
}

# Print the mean "Compile time" over every <samples_dir>/*/*/compile_err.txt
# except the .../0/compile_err.txt files.
#   $1 - samples directory
# The original added only the non-"/0/" lines to the sum but divided by the
# number of ALL lines, so the reported mean was too small. Sum and count are
# now taken over the same lines. -H forces the file-name prefix that the
# filter relies on even when the glob matches a single file.
# Prints nothing when no line qualifies.
function average_compile_time_greedy() {
    local -r samples_dir=$1

    grep -H "Compile time" "${samples_dir}"/*/*/compile_err.txt | awk -F" " '$1 !~ /\/0\/compile_err.txt:Compile$/ {sum += $4; count += 1}; END{if (count > 0) printf("Average greedy compile time: %fs\n", sum / count)}'
}

# Overwrite a weights file with freshly randomized weights by running
# retrain_cost_model with --randomize_weights=1 --reset_weights=1.
#   $1 - Halide root directory
#   $2 - weights file (read as initial weights and written as output)
function reset_weights() {
    local -r halide_root=$1
    local -r weights=$2

    get_absolute_autoscheduler_bin_dir "${halide_root}" autosched_bin

    "${autosched_bin}/retrain_cost_model" \
        --initial_weights="${weights}" \
        --weights_out="${weights}" \
        --randomize_weights=1 \
        --reset_weights=1 \
        --epochs=1 \
        --rates="0.0001" \
        --num_cores=1 \
        --best_benchmark="" \
        --best_schedule="" \
        --predictions_file="" \
        --partition_schedules=0 \
        --verbose=0
}
# Locate the best sample recorded in <samples_dir>/autotune_out.txt and
# extract its details into the samples directory.
#   $1 - samples directory
# The sample file is the 12th space-separated field of the last line that
# contains "Best runtime"; the directory holding that file is handed to
# extract_sample_details together with the samples directory.
# Returns 1 without calling extract_sample_details when no such line exists.
# The original went on with an empty directory, which shifted the samples
# directory into the first argument of extract_sample_details.
function extract_best_sample_details() {
    local -r samples_dir=$1

    local sample_file
    sample_file=$(grep "Best runtime" "${samples_dir}/autotune_out.txt" | tail -n 1 | cut -d" " -f 12)

    if [[ -z "${sample_file}" ]]; then
        echo "No \"Best runtime\" entry found in ${samples_dir}/autotune_out.txt" >&2
        return 1
    fi

    local best_dir
    best_dir=$(dirname "${sample_file}")

    extract_sample_details "${best_dir}" "${samples_dir}"
}
# Print the recorded best run time of every known app.
#   $1 - results directory containing one <app>.txt file per app
# For each app, the 5th space-separated field of the last line of
# <dir>/<app>.txt is printed as "<app>: <time> ms"; apps without a file are
# reported as "<app> not found.".
# The loop and scratch variables are declared local so that a variable named
# `app` or `file` in the caller is no longer overwritten.
function print_best_schedule_times() {
    local -r dir=$1

    local -r apps="bgu bilateral_grid local_laplacian nl_means lens_blur camera_pipe stencil_chain harris hist max_filter unsharp interpolate conv_layer cuda_mat_mul iir_blur depthwise_separable_conv"
    local app file current_best_run_time

    # ${apps} is intentionally unquoted: it is split into one word per app.
    for app in ${apps}; do
        file="${dir}/${app}.txt"
        if [ ! -f "${file}" ]; then
            echo "$app not found."
            continue
        fi

        current_best_run_time=$(tail -n 1 "${file}" | cut -d" " -f 5)
        echo "$app: $current_best_run_time ms"
    done
}

# Store the number of CPU cores of the local machine.
#   $1 - name of the variable that receives the count
# Uses `sysctl -n hw.ncpu` on Darwin and `nproc` everywhere else.
function get_num_local_cores() {
    local -n num_local_cores_ref=$1

    if [ "$(uname -s)" = "Darwin" ]; then
        num_local_cores_ref=$(sysctl -n hw.ncpu)
    else
        num_local_cores_ref=$(nproc)
    fi
}
y, k) = downy(2 * x - 1, y, k) + downy(2 * x, y, k) + downy(2 * x + 1, y, k) + downy(2 * x + 2, y, k); + downx.set_estimate(x, 1, 1022).set_estimate(y, 1, 1022).set_estimate(k, 0, 256); + + Pipeline(downx).auto_schedule(target, params); + } + + // A Func with multiple stages, some of which include additional loops + if (1) { + Buffer a(1024, 1024); + Func f("multiple_stages"), g("g"), h("h"); + Var x, y; + h(x, y) = pow(x, y); + f(x, y) = a(x, y) * 2; + f(x, y) += 17; + RDom r(0, 10); + f(x, y) += r * h(x, y); + f(x, y) *= 2; + f(0, y) = 23.0f; + g(x, y) = f(x - 1, y - 1) + f(x + 1, y + 1); + + g.set_estimate(x, 1, 1022).set_estimate(y, 1, 1022); + + Pipeline(g).auto_schedule(target, params); + } + + if (1) { + // A scan with pointwise stages before and after + Buffer a(1024, 1024); + Func before[5]; + Func after[5]; + Func s("scan"); + Var x, y; + before[0](x, y) = x + y; + for (int i = 1; i < 5; i++) { + before[i](x, y) = before[i - 1](x, y) + 1; + } + RDom r(1, 1023); + s(x, y) = before[4](x, y); + s(r, y) += s(r - 1, y); + after[0](x, y) = s(y, x) + s(y, x + 100); + for (int i = 1; i < 5; i++) { + after[i](x, y) = after[i - 1](x, y) + 1; + } + + after[4].set_estimate(x, 0, 1024).set_estimate(y, 0, 1024); + + Pipeline(after[4]).auto_schedule(target, params); + } + + + if (1) { + Buffer im_a(1024, 1024, "a"), im_b(1024, 1024, "b"); + im_a.fill(0.0f); + im_b.fill(0.0f); + + Func c("c"), a("a"), b("b"); + Var i, j; + a(j, i) = im_a(j, i); // TODO: Add wrappers to the search space + b(j, i) = im_b(j, i); + RDom k(0, 1024); + c(j, i) += a(k, i) * b(j, k); + Func out("out"); + out(j, i) = c(j, i); + + out.set_estimate(j, 0, 1024).set_estimate(i, 0, 1024); + + Pipeline(out).auto_schedule(target, params); + } + + if (1) { + // A scan in x followed by a downsample in y, with pointwise stuff in between + const int N = 3; + Buffer a(1024, 1024); + Func p1[N], p2[N], p3[N]; + Func s("scan"); + Var x, y; + p1[0](x, y) = x + y; + for (int i = 1; i < N; i++) { + p1[i](x, y) = 
p1[i - 1](x, y) + 1; + } + RDom r(1, 1023); + s(x, y) = p1[N - 1](x, y); + s(r, y) += s(r - 1, y); + p2[0](x, y) = s(x, y); + for (int i = 1; i < N; i++) { + p2[i](x, y) = p2[i - 1](x, y) + 1; + } + Func down("downsample"); + down(x, y) = p2[N - 1](x, 2 * y); + p3[0](x, y) = down(x, y); + for (int i = 1; i < N; i++) { + p3[i](x, y) = p3[i - 1](x, y) + 1; + } + + p3[N - 1].set_estimate(x, 0, 1024).set_estimate(y, 0, 1024); + + Pipeline(p3[N - 1]).auto_schedule(target, params); + } + + if (1) { + // A gather that only uses a small portion of a potentially + // large LUT. The number of points computed should be less + // than points computed minimum, and the LUT should be + // inlined, even if it's really expensive. + Func lut("lut"); + Var x; + lut(x) = (x + 1) * (x + 2) * (x + 3) * (x + 4) * (x + 5) * (x + 6); + + Func idx("idx"); + idx(x) = x * (10000 - x); + + Func out("out"); + out(x) = lut(clamp(idx(x), 0, 100000)); + + out.set_estimate(x, 0, 10); + + Pipeline(out).auto_schedule(target, params); + } + + if (1) { + // A pipeline where the vectorized dimension should alternate index + Func f("f"), g("g"), h("h"); + f(x, y) = x * y; + + RDom r(-50, 100, -50, 100); + g(x, y) += f(y + r.y, x + r.x); + + h(x, y) += g(y + r.y, x + r.y); + + h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); + + Pipeline(h).auto_schedule(target, params); + } + + + if (1) { + // A no-win scenario in which a Func is going to be read from + // lots of times using a vector gather no matter how it is + // scheduled. 
+ Func in("in"), a("a"), b("b"); + + in(x, y) = sqrt(sqrt(sqrt(sqrt(x * y)))); + + RDom r(-50, 100, -50, 100); + a(x, y) += in(x + r.x, y + r.y); + b(x, y) += in(y + r.y, x + r.x); + + a.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); + b.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); + + Pipeline({a, b}).auto_schedule(target, params); + } + + if (1) { + // Boring memcpy + ImageParam im(Float(32), 2); + Func f("f"), g("g"); + f(x, y) = im(x, y); + g(x, y) = f(x, y); + + g.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); + Pipeline(g).auto_schedule(target, params); + } + + if (1) { + // A load from a tiny input image + ImageParam im(Float(32), 2); + Func f("f"); + f(x, y) = im(x, y) * 7; + + f.set_estimate(x, 0, 3).set_estimate(y, 0, 5); + Pipeline(f).auto_schedule(target, params); + } + + if (1) { + // Lots of dimensions + ImageParam im(Float(32), 7); + Func f("f"); + Var z, w, t, u, v; + f(x, y, z, w, t, u, v) = im(x, y, z, w, t, u, v) * 7; + + f.set_estimate(x, 0, 8) + .set_estimate(y, 0, 9) + .set_estimate(z, 0, 10) + .set_estimate(w, 0, 5) + .set_estimate(t, 0, 3) + .set_estimate(u, 0, 2) + .set_estimate(v, 0, 6); + Pipeline(f).auto_schedule(target, params); + } + + if (1) { + // Long transpose chain. + ImageParam im(Float(32), 2); + Func f("f"), g("g"), h("h"); + + f(x, y) = im(clamp(y * x, 0, 999), x); + g(x, y) = f(clamp(y * x, 0, 999), x); + h(x, y) = g(clamp(y * x, 0, 999), x); + + // Force everything to be compute root by accessing them in two separate outputs + Func out1("out1"), out2("out2"); + out1(x, y) = f(x, y) + g(x, y) + h(x, y); + out2(x, y) = f(x, y) + g(x, y) + h(x, y); + + out1.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); + out2.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); + Pipeline({out1, out2}).auto_schedule(target, params); + } + + if (1) { + ImageParam im(Float(32), 2); + // An inlinable Func used at the start and at the end of a long stencil chain. 
+ const int N = 8; + Func f[N]; + f[0] = Func("inline_me"); + f[0](x, y) = im(x, y); // inline me! + for (int i = 1; i < N; i++) { + Expr e = 0; + for (int dy = -1; dy <= 1; dy++) { + for (int dx = -1; dx <= 1; dx++) { + e += f[i - 1](x + dx, y + dy); + } + } + f[i](x, y) = e; + } + + Func g("output"); + // Access it in a way that makes it insane not to inline. + g(x, y) = f[N - 1](x, y) + f[0](clamp(cast(sin(x) * 10000), 0, 100000), clamp(cast(sin(x * y) * 10000), 0, 100000)); + g.set_estimate(x, 0, 2048).set_estimate(y, 0, 2048); + + Pipeline(g).auto_schedule(target, params); + } + + if (1) { + // Vectorizing a pure var in an update using RoundUp + + Func f("f"), g("g"); + + f(x, y) = x + y; + RDom r(0, 10); + f(x, y) += f(x, y) * r; + + g(x, y) = f(x, y); + + g.set_estimate(x, 0, 10).set_estimate(y, 0, 2048); + Pipeline(g).auto_schedule(target, params); + } + + if (1) { + ImageParam im(Float(32), 2); + + // A convolution pyramid + Func up[8], down[8]; + int sz = 2048; + Func prev("input"); + prev(x, y) = im(x, y); + + const int N = 4; + + for (int i = 0; i < N; i++) { + up[i] = Func("up" + std::to_string(i)); + down[i] = Func("down" + std::to_string(i)); + down[i](x, y) = prev(2 * x - 10, 2 * y - 10) + prev(2 * x + 10, 2 * y + 10); + prev = BoundaryConditions::repeat_edge(down[i], {{0, sz}, {0, sz}}); + // prev = down[i]; + sz /= 2; + } + + for (int i = N - 1; i >= 0; i--) { + up[i](x, y) = prev(x / 2 + 10, y / 2 + 10) + prev(x / 2 - 10, y / 2 - 10) + down[i](x, y); + prev = up[i]; + } + + Func out; + out(x, y) = up[0](x, y); + + out.set_estimate(x, 0, 2048).set_estimate(y, 0, 2048); + Pipeline(out).auto_schedule(target, params); + } + + if (1) { + ImageParam im(Float(32), 2); + + Func f("f"); + f(x, y) = im(x, y); + + Func scan("scan"); + scan(x, y) = f(x, y); + RDom r(1, 1999); + scan(x, r) += scan(x, r - 1); + scan(x, 1999 - r) += scan(x, 2000 - r); + Func casted("casted"); + casted(x, y) = scan(x, y); + + casted.set_estimate(x, 0, 2000).set_estimate(y, 0, 
2000); + Pipeline(casted).auto_schedule(target, params); + } + + if (1) { + ImageParam im(Int(32), 2); + + Func f("f"), hist("hist"), output("output"); + Var i("i"); + f(x, y) = clamp(im(x, y), 0, 255); + RDom r(0, 2000, 0, 2000); + hist(i) = cast(0); + hist(f(r.x, r.y)) += cast(1); + output(i) = hist(i); + + f.set_estimate(x, 0, 2000).set_estimate(y, 0, 2000); + output.set_estimate(i, 0, 256); + Pipeline(output).auto_schedule(target, params); + } + + return 0; +} diff --git a/src/autoschedulers/anderson2021/test/bounds.cpp b/src/autoschedulers/anderson2021/test/bounds.cpp new file mode 100644 index 000000000000..d9bc5c599dc2 --- /dev/null +++ b/src/autoschedulers/anderson2021/test/bounds.cpp @@ -0,0 +1,319 @@ +#include "test.h" +#include "Tiling.h" +#include "LoopNest.h" + +using namespace Halide; +using namespace Halide::Internal; +using namespace Halide::Internal::Autoscheduler; + +void test_bounds() { + MachineParams params(80, 16000000, 40); + Target target("host-cuda"); + + Var x("x"), y("y"); + { + Func f("f"), g("g"), h("h"); + f(x) = (x) * (x); + g(x) = f(x - 1) + f(x) + f(x + 1); + h(x) = g(x); + + h.set_estimate(x, 0, 1024); + + std::vector outputs; + outputs.push_back(h.function()); + FunctionDAG dag(outputs, params, target); + + const FunctionDAG::Node* node_h = &dag.nodes[0]; + const FunctionDAG::Node* node_g = &dag.nodes[1]; + const FunctionDAG::Node* node_f = &dag.nodes[2]; + + EXPECT_EQ(node_h->func.name(), std::string("h")); + EXPECT_EQ(node_f->func.name(), std::string("f")); + EXPECT_EQ(node_g->func.name(), std::string("g")); + + std::unique_ptr root = std::make_unique(); + + // Compute h at root + root->compute_here(node_h, true, 0, false, target); + + // Tile h + std::vector tiling; + tiling.push_back(1); + // Serial loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + tiling.back() = 32; + // Thread loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, 
tiling, root.get(), target, true, false); + + const auto& thread = root->children[0]->children[0]; + const auto& thread_bounds_g = thread->get_bounds(node_g); + const auto& thread_bounds_f = thread->get_bounds(node_f); + + EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 1); + + EXPECT_EQ(thread_bounds_f->region_required(0).extent(), 3); + } + + { + Func f("f2"), g("g2"), h("h2"), out("out"); + g(x) = x; + f(x) = g(2 * x); + h(x) = g(x); + out(x) = h(x) + f(x); + + out.set_estimate(x, 0, 1024); + + std::vector outputs; + outputs.push_back(out.function()); + FunctionDAG dag(outputs, params, target); + + const FunctionDAG::Node* node_out = &dag.nodes[0]; + const FunctionDAG::Node* node_f = &dag.nodes[2]; + const FunctionDAG::Node* node_g = &dag.nodes[3]; + + std::unique_ptr root = std::make_unique(); + + // Compute out at root + root->compute_here(node_out, true, 0, false, target); + + // Tile out + std::vector tiling; + tiling.push_back(2); + // Serial loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + tiling.back() = 32; + // Thread loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + + const auto& thread = root->children[0]->children[0]; + const auto& thread_bounds_g = thread->get_bounds(node_g); + const auto& thread_bounds_f = thread->get_bounds(node_f); + + EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 515); + + EXPECT_EQ(thread_bounds_f->region_required(0).extent(), 2); + } + + // This is a sequence of tests for edge cases of region_required. + // region_required is defined as the region of a producer required to + // satisfy ALL of its consumers (not a single consumer). This can lead to + // surprising results if used unknowingly e.g. to compute the number of + // bytes required of a producer to satisfy a single consumer.
+ { + Func f("f"), g("g"), h("h"), out("out"); + g(x) = x; + h(x) = g(x - 1) + g(x) + g(x + 1); + out(x) = h(x); + + out.set_estimate(x, 0, 1024); + + std::vector outputs; + outputs.push_back(out.function()); + FunctionDAG dag(outputs, params, target); + + const FunctionDAG::Node* node_out = &dag.nodes[0]; + const FunctionDAG::Node* node_h = &dag.nodes[1]; + const FunctionDAG::Node* node_g = &dag.nodes[2]; + + EXPECT_EQ(node_out->func.name(), out.name()); + EXPECT_EQ(node_h->func.name(), h.name()); + EXPECT_EQ(node_g->func.name(), g.name()); + + std::unique_ptr root = std::make_unique(); + + // Compute out at root + root->compute_here(node_out, true, 0, false, target); + + // Tile out + std::vector tiling; + tiling.push_back(1); + // Serial loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + tiling.back() = 32; + // Thread loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + + std::unique_ptr root_copy{new LoopNest}; + root_copy->copy_from(*root); + + const auto& thread = root->children[0]->children[0]; + const auto& thread_bounds_g = thread->get_bounds(node_g); + const auto& thread_bounds_h = thread->get_bounds(node_h); + + EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 3); + + EXPECT_EQ(thread_bounds_h->region_required(0).extent(), 1); + + // If 'h' is inlined, the region_required should not change + root_copy->inline_func(node_h); + { + const auto& thread = root_copy->children[0]->children[0]; + const auto& thread_bounds_g = thread->get_bounds(node_g); + const auto& thread_bounds_h = thread->get_bounds(node_h); + + EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 3); + + EXPECT_EQ(thread_bounds_h->region_required(0).extent(), 1); + } + } + + { + Func f("f"), g("g"), out("out"); + g(x) = x; + f(x) = g(x - 100) + g(x + 100); // 201 points of g required for each point of f + out(x) = f(x) + g(x); // 1 point of g 
required for each point of out + + out.set_estimate(x, 0, 1024); + + std::vector outputs; + outputs.push_back(out.function()); + FunctionDAG dag(outputs, params, target); + + const FunctionDAG::Node* node_out = &dag.nodes[0]; + const FunctionDAG::Node* node_f = &dag.nodes[1]; + const FunctionDAG::Node* node_g = &dag.nodes[2]; + + EXPECT_EQ(node_out->func.name(), out.name()); + EXPECT_EQ(node_g->func.name(), g.name()); + EXPECT_EQ(node_f->func.name(), f.name()); + + std::unique_ptr root = std::make_unique(); + + // Compute out at root + root->compute_here(node_out, true, 0, false, target); + + // Tile out + std::vector tiling; + tiling.push_back(1); + // Serial loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + tiling.back() = 32; + // Thread loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + + std::unique_ptr root_copy{new LoopNest}; + root_copy->copy_from(*root); + + const auto& thread = root->children[0]->children[0]; + const auto& thread_bounds_g = thread->get_bounds(node_g); + const auto& thread_bounds_f = thread->get_bounds(node_f); + const auto& thread_bounds_out = thread->get_bounds(node_out); + + EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 201); + EXPECT_EQ(thread_bounds_g->loops(0, 0).extent(), 201); + + EXPECT_EQ(thread_bounds_out->loops(0, 0).extent(), 1); + + EXPECT_EQ(thread_bounds_f->region_required(0).extent(), 1); + + + vector out_g_edge_chain; + for (const auto *e : node_g->outgoing_edges) { + if (e->consumer != thread->stage) { + continue; + } + + out_g_edge_chain.push_back(e); + } + + EXPECT_EQ((int)out_g_edge_chain.size(), 1); + + vector out_f_g_edge_chain; + for (const auto *e : node_f->outgoing_edges) { + if (e->consumer != thread->stage) { + continue; + } + + out_f_g_edge_chain.push_back(e); + } + + out_f_g_edge_chain.push_back(node_f->stages[0].incoming_edges.front()); + 
EXPECT_EQ((int)out_f_g_edge_chain.size(), 2); + + const auto& thread_bounds_g_edge = thread->get_bounds_along_edge_chain(node_g, out_g_edge_chain); + + // This should only account for the edge from 'g' -> 'out' (and ignore the + // edge from 'g' -> 'f') + EXPECT_EQ(thread_bounds_g_edge->region_required(0).extent(), 1); + + const auto& thread_bounds_f_g_edge = thread->get_bounds_along_edge_chain(node_g, out_f_g_edge_chain); + + EXPECT_EQ(thread_bounds_f_g_edge->region_required(0).extent(), 201); + + // If 'f' is inlined, the region_required should still produce valid results + root_copy->inline_func(node_f); + { + const auto& thread = root_copy->children[0]->children[0]; + const auto& thread_bounds_g = thread->get_bounds(node_g); + + EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 201); + + const auto& thread_bounds_g_edge = thread->get_bounds_along_edge_chain(node_g, out_g_edge_chain); + + EXPECT_EQ(thread_bounds_g_edge->region_required(0).extent(), 1); + + const auto& thread_bounds_f_g_edge = thread->get_bounds_along_edge_chain(node_g, out_f_g_edge_chain); + + EXPECT_EQ(thread_bounds_f_g_edge->region_required(0).extent(), 201); + } + } + + { + Func f("f"), g("g"), out("out"); + g(x) = x; + f(x) = g(x); // 1 point of g required for each point of f + out(x) = f(x) + g(x); // 1 point of g required for each point of out + + out.set_estimate(x, 0, 1024); + + std::vector outputs; + outputs.push_back(out.function()); + FunctionDAG dag(outputs, params, target); + + const FunctionDAG::Node* node_out = &dag.nodes[0]; + const FunctionDAG::Node* node_f = &dag.nodes[1]; + const FunctionDAG::Node* node_g = &dag.nodes[2]; + + EXPECT_EQ(node_out->func.name(), out.name()); + EXPECT_EQ(node_g->func.name(), g.name()); + EXPECT_EQ(node_f->func.name(), f.name()); + + std::unique_ptr root = std::make_unique(); + + // Compute out at root + root->compute_here(node_out, true, 0, false, target); + + // Tile out + std::vector tiling; + tiling.push_back(1); + // Serial loop + 
root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + tiling.back() = 32; + // Thread loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + + std::unique_ptr root_copy{new LoopNest}; + root_copy->copy_from(*root); + + const auto& thread = root->children[0]->children[0]; + const auto& thread_bounds_g = thread->get_bounds(node_g); + const auto& thread_bounds_f = thread->get_bounds(node_f); + + EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 1); + EXPECT_EQ(thread_bounds_f->region_required(0).extent(), 1); + + root_copy->inline_func(node_f); + { + const auto& thread = root_copy->children[0]->children[0]; + const auto& thread_bounds_g = thread->get_bounds(node_g); + + EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 1); + } + } +} + +int main(int argc, char **argv) { + test_bounds(); + printf("All tests passed.\n"); + return 0; +} diff --git a/src/autoschedulers/anderson2021/test/parser.cpp b/src/autoschedulers/anderson2021/test/parser.cpp new file mode 100644 index 000000000000..53d635c65ae2 --- /dev/null +++ b/src/autoschedulers/anderson2021/test/parser.cpp @@ -0,0 +1,20 @@ +#include "test.h" +#include "LoopNestParser.h" + +using namespace Halide; +using namespace Halide::Internal; +using namespace Halide::Internal::Autoscheduler; + +void test_parser() { + { + std::unique_ptr complete = LoopNestParser::from_file("test/test_complete_loop_nest"); + std::unique_ptr partial = LoopNestParser::from_file("test/test_partial_loop_nest"); + EXPECT(complete->contains_sub_loop_nest(*partial)); + } +} + +int main(int argc, char **argv) { + test_parser(); + printf("All tests passed.\n"); + return 0; +} diff --git a/src/autoschedulers/anderson2021/test/state.cpp b/src/autoschedulers/anderson2021/test/state.cpp new file mode 100644 index 000000000000..ac43ced6974a --- /dev/null +++ b/src/autoschedulers/anderson2021/test/state.cpp @@ -0,0 +1,51 @@ 
+#include "test.h" +#include "LoopNest.h" +#include "State.h" + +using namespace Halide; +using namespace Halide::Internal; +using namespace Halide::Internal::Autoscheduler; + +void test_state() { + MachineParams params(80, 16000000, 40); + Target target("host-cuda"); + + // Test update_always_consider_inline_options + Var x("x"), y("y"); + { + Func f("f"), g("g"), h("h"); + f(x) = x * x; + g(x) = f(x); + h(x) = g(x); + + h.set_estimate(x, 0, 1024); + + std::vector outputs; + outputs.push_back(h.function()); + FunctionDAG dag(outputs, params, target); + + const FunctionDAG::Node* node_h = &dag.nodes[0]; + const FunctionDAG::Node* node_g = &dag.nodes[1]; + const FunctionDAG::Node* node_f = &dag.nodes[2]; + + EXPECT_EQ(node_h->func.name(), std::string("h")); + EXPECT_EQ(node_f->func.name(), std::string("f")); + EXPECT_EQ(node_g->func.name(), std::string("g")); + + std::unique_ptr root = std::make_unique(); + + // Compute h at root + root->compute_here(node_h, true, 0, false, target); + + std::unique_ptr state = std::make_unique(); + state->root = root.release(); + state->update_always_consider_inline_options(node_g); + EXPECT(state->should_always_consider_inline(node_g)); + } +} + +int main(int argc, char **argv) { + test_state(); + printf("All tests passed.\n"); + return 0; +} diff --git a/src/autoschedulers/anderson2021/test/storage_strides.cpp b/src/autoschedulers/anderson2021/test/storage_strides.cpp new file mode 100644 index 000000000000..6b43914f1fd0 --- /dev/null +++ b/src/autoschedulers/anderson2021/test/storage_strides.cpp @@ -0,0 +1,462 @@ +#include "test.h" +#include "LoopNest.h" + +using namespace Halide; +using namespace Halide::Internal; +using namespace Halide::Internal::Autoscheduler; + +void test_bounds() { + MachineParams params(80, 16000000, 40); + Target target("host-cuda"); + bool verbose = false; + int bytes_per_point = 4; + + Var x("x"), y("y"); + { + Func f("f"), g("g"), h("h"); + f(x) = (x) * (x); + g(x) = f(x - 1) + f(x) + f(x + 1); + h(x) 
= g(x); + + h.set_estimate(x, 0, 1024); + + std::vector outputs; + outputs.push_back(h.function()); + FunctionDAG dag(outputs, params, target); + + const FunctionDAG::Node* node_h = &dag.nodes[0]; + const FunctionDAG::Node* node_g = &dag.nodes[1]; + const FunctionDAG::Node* node_f = &dag.nodes[2]; + + EXPECT_EQ(node_h->func.name(), std::string("h")); + EXPECT_EQ(node_f->func.name(), std::string("f")); + EXPECT_EQ(node_g->func.name(), std::string("g")); + + std::unique_ptr root = std::make_unique(); + + // Compute h at root + root->compute_here(node_h, true, 0, false, target); + + // Tile h + std::vector tiling; + tiling.push_back(1); + // Serial loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + tiling.back() = 32; + // Thread loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + + + const auto& thread = root->children[0]->children[0]; + const auto& thread_bounds_g = thread->get_bounds(node_g); + const auto& thread_bounds_f = thread->get_bounds(node_f); + + EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 1); + EXPECT_EQ(thread_bounds_g->region_required(1).extent(), 1); + EXPECT_EQ(thread_bounds_g->region_required(2).extent(), 1); + + EXPECT_EQ(thread_bounds_f->region_required(0).extent(), 3); + EXPECT_EQ(thread_bounds_f->region_required(1).extent(), 3); + EXPECT_EQ(thread_bounds_f->region_required(2).extent(), 3); + } + + // This is a sequence of tests for edge cases of region_required. + // region_required is defined as the region of a producer required to + // satisfy ALL of its consumers (not a single consumer). This can lead to + // surprising results if used unknowingly e.g. to compute the number of + // bytes required of a producer to satisfy a single consumer. 
+ { + Func f("f"), g("g"), h("h"), out("out"); + g(x) = x; + h(x) = g(x - 1) + g(x) + g(x + 1); + out(x) = h(x); + + out.set_estimate(x, 0, 1024); + + std::vector outputs; + outputs.push_back(out.function()); + FunctionDAG dag(outputs, params, target); + + const FunctionDAG::Node* node_out = &dag.nodes[0]; + const FunctionDAG::Node* node_h = &dag.nodes[1]; + const FunctionDAG::Node* node_g = &dag.nodes[2]; + + EXPECT_EQ(node_out->func.name(), out.name()); + EXPECT_EQ(node_h->func.name(), h.name()); + EXPECT_EQ(node_g->func.name(), g.name()); + + std::unique_ptr root = std::make_unique(); + + // Compute out at root + root->compute_here(node_out, true, 0, false, target); + + // Tile out + std::vector tiling; + tiling.push_back(1); + // Serial loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + tiling.back() = 32; + // Thread loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + + std::unique_ptr root_copy{new LoopNest}; + root_copy->copy_from(*root); + + const auto& thread = root->children[0]->children[0]; + const auto& thread_bounds_g = thread->get_bounds(node_g); + const auto& thread_bounds_h = thread->get_bounds(node_h); + + EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 3); + + EXPECT_EQ(thread_bounds_h->region_required(0).extent(), 1); + + // If 'h' is inlined, the region_required should not change + root_copy->inline_func(node_h); + { + const auto& thread = root_copy->children[0]->children[0]; + const auto& thread_bounds_g = thread->get_bounds(node_g); + const auto& thread_bounds_h = thread->get_bounds(node_h); + + EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 3); + + EXPECT_EQ(thread_bounds_h->region_required(0).extent(), 1); + } + } + + // Whole number stride + { + Func f("f"), g("g"), out("out"); + f(x) = x; + out(x) = f(x); + + out.set_estimate(x, 0, 1024); + + std::vector outputs; + 
outputs.push_back(out.function()); + FunctionDAG dag(outputs, params, target); + + const FunctionDAG::Node* node_out = &dag.nodes[0]; + const FunctionDAG::Node* node_f = &dag.nodes[1]; + + EXPECT_EQ(node_out->func.name(), out.name()); + EXPECT_EQ(node_f->func.name(), f.name()); + + std::unique_ptr root = std::make_unique(); + + // Compute out at root + root->compute_here(node_out, true, 0, false, target); + + // Tile out + std::vector tiling; + tiling.push_back(1); + // Serial loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + tiling.back() = 32; + // Thread loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + + std::unique_ptr root_copy{new LoopNest}; + root_copy->copy_from(*root); + + const auto& root_bounds_f = root->get_bounds(node_f); + + EXPECT_EQ(root_bounds_f->region_required(0).extent(), 1024); + EXPECT_EQ(1, (int)node_f->outgoing_edges.size()); + EXPECT_EQ(1, (int)node_f->outgoing_edges.front()->load_jacobians.size()); + + ThreadInfo thread_info{0, {32}, node_out->stages[0].loop, {32}}; + const auto& jac = node_f->outgoing_edges.front()->load_jacobians.front(); + + const auto& thread = root->children[0]->children[0]; + Strides strides = thread->compute_strides(jac, 0, node_f, root_bounds_f, thread_info, verbose); + + GlobalAccessAccumulator accumulator{bytes_per_point, 1, strides, verbose}; + thread_info.for_each_thread_id_in_first_warp(accumulator); + + GlobalMemInfo mem_info; + int num_requests = 1; + accumulator.add_access_info( + num_requests, + mem_info, + false + ); + + EXPECT_EQ(4, mem_info.num_transactions()); + } + + // Fractional stride + { + Func f("f"), g("g"), out("out"); + f(x) = x; + out(x) = f(x / 2); + + out.set_estimate(x, 0, 1024); + + std::vector outputs; + outputs.push_back(out.function()); + FunctionDAG dag(outputs, params, target); + + const FunctionDAG::Node* node_out = &dag.nodes[0]; + const 
FunctionDAG::Node* node_f = &dag.nodes[1]; + + EXPECT_EQ(node_out->func.name(), out.name()); + EXPECT_EQ(node_f->func.name(), f.name()); + + std::unique_ptr root = std::make_unique(); + + // Compute out at root + root->compute_here(node_out, true, 0, false, target); + + // Tile out + std::vector tiling; + tiling.push_back(1); + // Serial loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + tiling.back() = 32; + // Thread loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + + std::unique_ptr root_copy{new LoopNest}; + root_copy->copy_from(*root); + + const auto& root_bounds_f = root->get_bounds(node_f); + + EXPECT_EQ(root_bounds_f->region_required(0).extent(), 512); + EXPECT_EQ(1, (int)node_f->outgoing_edges.size()); + EXPECT_EQ(1, (int)node_f->outgoing_edges.front()->load_jacobians.size()); + + ThreadInfo thread_info{0, {32}, node_out->stages[0].loop, {32}}; + const auto& jac = node_f->outgoing_edges.front()->load_jacobians.front(); + + const auto& thread = root->children[0]->children[0]; + Strides strides = thread->compute_strides(jac, 0, node_f, root_bounds_f, thread_info, verbose); + + GlobalAccessAccumulator accumulator{bytes_per_point, 1, strides, verbose}; + thread_info.for_each_thread_id_in_first_warp(accumulator); + + GlobalMemInfo mem_info; + int num_requests = 1; + accumulator.add_access_info( + num_requests, + mem_info, + false + ); + + EXPECT_EQ(2, mem_info.num_transactions()); + } + + // Fractional stride with multiple dimensions + { + Func f("f"), g("g"), out("out"); + f(x, y) = x + y; + out(x, y) = f(x, y / 2); + + out.set_estimate(x, 0, 1024); + out.set_estimate(y, 0, 1024); + + std::vector outputs; + outputs.push_back(out.function()); + FunctionDAG dag(outputs, params, target); + + const FunctionDAG::Node* node_out = &dag.nodes[0]; + const FunctionDAG::Node* node_f = &dag.nodes[1]; + + EXPECT_EQ(node_out->func.name(), 
out.name()); + EXPECT_EQ(node_f->func.name(), f.name()); + + std::unique_ptr root = std::make_unique(); + + // Compute out at root + root->compute_here(node_out, true, 0, false, target); + + // Tile out + std::vector tiling; + tiling.push_back(1); + tiling.push_back(1); + // Serial loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + tiling.clear(); + tiling.push_back(1); + tiling.push_back(32); + // Thread loop + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + + std::unique_ptr root_copy{new LoopNest}; + root_copy->copy_from(*root); + + const auto& root_bounds_f = root->get_bounds(node_f); + + EXPECT_EQ(root_bounds_f->region_required(0).extent(), 1024); + EXPECT_EQ(root_bounds_f->region_required(1).extent(), 512); + + EXPECT_EQ(1, (int)node_f->outgoing_edges.size()); + EXPECT_EQ(1, (int)node_f->outgoing_edges.front()->load_jacobians.size()); + + ThreadInfo thread_info{1, {1, 32}, node_out->stages[0].loop, {1, 32}}; + const auto& jac = node_f->outgoing_edges.front()->load_jacobians.front(); + + const auto& thread = root->children[0]->children[0]; + Strides strides = thread->compute_strides(jac, 0, node_f, root_bounds_f, thread_info, verbose); + strides.dump(true); + + GlobalAccessAccumulator accumulator{bytes_per_point, 1, strides, verbose}; + thread_info.for_each_thread_id_in_first_warp(accumulator); + + GlobalMemInfo mem_info; + int num_requests = 1; + accumulator.add_access_info( + num_requests, + mem_info, + false + ); + + EXPECT_EQ(16, mem_info.num_transactions()); + } + + // Fused stage without thread dimension + { + Func f("f"), g("g"), out("out"); + g(y) = y; + f(y) = g(y); + out(x, y) = f(y); + + out.set_estimate(x, 0, 1024); + out.set_estimate(y, 0, 1024); + + std::vector outputs; + outputs.push_back(out.function()); + FunctionDAG dag(outputs, params, target); + + const FunctionDAG::Node* node_out = &dag.nodes[0]; + const 
FunctionDAG::Node* node_f = &dag.nodes[1]; + const FunctionDAG::Node* node_g = &dag.nodes[2]; + + EXPECT_EQ(node_out->func.name(), out.name()); + EXPECT_EQ(node_f->func.name(), f.name()); + EXPECT_EQ(node_g->func.name(), g.name()); + + std::unique_ptr root = std::make_unique(); + + // Compute out at root + root->compute_here(node_out, true, 0, false, target); + + // Tile out + std::vector tiling; + tiling.push_back(1); + tiling.push_back(1); + // Serial loop + auto thread_loop = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + std::unique_ptr thread_loop_copy{new LoopNest}; + thread_loop_copy->copy_from(*thread_loop); + thread_loop_copy->compute_here(node_f, true, 0, false, target); + tiling.clear(); + tiling.push_back(32); + tiling.push_back(1); + // Thread loop + root->children[0] = thread_loop_copy.release(); + root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + + std::unique_ptr root_copy{new LoopNest}; + root_copy->copy_from(*root); + + const auto& root_bounds_f = root->get_bounds(node_f); + + EXPECT_EQ(root_bounds_f->region_required(0).extent(), 1024); + + EXPECT_EQ(1, (int)node_g->outgoing_edges.size()); + EXPECT_EQ(1, (int)node_g->outgoing_edges.front()->load_jacobians.size()); + + ThreadInfo thread_info{1, {32, 1}, node_out->stages[0].loop, {32, 1}}; + const auto& jac = node_g->outgoing_edges.front()->load_jacobians.front(); + + const auto& thread = root->children[0]->children[0]; + const auto& thread_bounds_g = thread->get_bounds(node_g); + Strides strides = thread->compute_strides(jac, 0, node_g, thread_bounds_g, thread_info, verbose); + + GlobalAccessAccumulator accumulator{bytes_per_point, 1, strides, verbose}; + thread_info.for_each_thread_id_in_first_warp(accumulator); + + GlobalMemInfo mem_info; + int num_requests = 1; + accumulator.add_access_info( + num_requests, + mem_info, + false + ); + + EXPECT_EQ(4, mem_info.num_transactions()); + } + + // 
Whole number stride with multiple dimensions + { + std::vector storage_strides; + storage_strides.push_back(1); + storage_strides.push_back(64); + + Strides strides{storage_strides}; + + strides.add_valid({1, 0}); + + EXPECT_EQ(strides.offset(0, 0), 0); + EXPECT_EQ(strides.offset(0, 1), 1); + } + + // Fractional stride with multiple dimensions + { + std::vector storage_strides; + storage_strides.push_back(1); + storage_strides.push_back(64); + + Strides strides{storage_strides}; + + strides.add_valid({0, 0.5}); + + EXPECT_EQ(strides.offset(0, 0), 0); + EXPECT_EQ(strides.offset(0, 1), 0); + EXPECT_EQ(strides.offset(0, 2), 64); + EXPECT_EQ(strides.offset(0, 3), 64); + } + + // More complex fractional stride with multiple dimensions + { + std::vector storage_strides; + storage_strides.push_back(1); + storage_strides.push_back(321); + storage_strides.push_back(61953); + + Strides strides{storage_strides}; + + strides.add_valid({0, 0.5, 0}); + strides.add_valid({4, 0, 0}); + strides.add_valid({0, 0, 2}); + + auto x0 = strides.offset(0, 0); + auto x1 = strides.offset(0, 1); + auto x2 = strides.offset(0, 2); + auto x3 = strides.offset(0, 3); + EXPECT_EQ(x0, 0); + EXPECT_EQ(x1, 0); + EXPECT_EQ(x2, 321); + EXPECT_EQ(x3, 321); + + auto y0 = strides.offset(1, 0); + auto y1 = strides.offset(1, 1); + auto y2 = strides.offset(1, 2); + auto y3 = strides.offset(1, 3); + EXPECT_EQ(y0, 0); + EXPECT_EQ(y1, 4); + EXPECT_EQ(y2, 8); + EXPECT_EQ(y3, 12); + + EXPECT_EQ(x0 + y0, 0); + EXPECT_EQ(x1 + y0, 0); + EXPECT_EQ(x0 + y1, 4); + EXPECT_EQ(x1 + y1, 4); + } +} + +int main(int argc, char **argv) { + test_bounds(); + printf("All tests passed.\n"); + return 0; +} diff --git a/src/autoschedulers/anderson2021/test/test.h b/src/autoschedulers/anderson2021/test/test.h new file mode 100644 index 000000000000..d28796029866 --- /dev/null +++ b/src/autoschedulers/anderson2021/test/test.h @@ -0,0 +1,33 @@ +#ifndef TEST_H +#define TEST_H + +#include "Halide.h" + +namespace Halide { +namespace 
Internal { +namespace Autoscheduler { + +#define user_assert(c) _halide_internal_assertion(c, Halide::Internal::ErrorReport::User) +#define EXPECT_EQ(expected, actual) expect_eq(__LINE__, expected, actual) +#define EXPECT(expected) expect(__LINE__, expected) + +template <typename A, typename B> // fails with the line number and both values when expected != actual +void expect_eq(int line, const A& expected, const B& actual) { + user_assert(expected == actual) + << "Assert failed on line " << line << "." + << "\nExpected value = " << expected + << "\nActual value = " << actual; +} + +template <typename A> // fails with the line number when the value is not truthy +void expect(int line, const A& expected) { + user_assert(expected) + << "Assert failed on line " << line << "." + << "\nExpected value to be true\n"; +} + +} // namespace Autoscheduler +} // namespace Internal +} // namespace Halide + +#endif // TEST_H diff --git a/src/autoschedulers/anderson2021/test/thread_info.cpp b/src/autoschedulers/anderson2021/test/thread_info.cpp new file mode 100644 index 000000000000..ef4dc91cabd6 --- /dev/null +++ b/src/autoschedulers/anderson2021/test/thread_info.cpp @@ -0,0 +1,108 @@ +#include "test.h" +#include "ThreadInfo.h" +#include "LoopNest.h" + +using namespace Halide; +using namespace Halide::Internal; +using namespace Halide::Internal::Autoscheduler; + +void test_thread_info() { + MachineParams params(80, 16000000, 40); + Target target("host-cuda"); + + Var x("x"), y("y"); + { + int vectorized_loop_index = 0; + std::vector size; + std::vector loop; + std::vector loop_extents; + std::vector max_thread_counts; + + loop.push_back({}); + loop.push_back({}); + + // 16x8 + size.push_back(16); + size.push_back(8); + + loop_extents.push_back(16); + loop_extents.push_back(8); + + // 16x8 + max_thread_counts.push_back(16); + max_thread_counts.push_back(8); + + { + ThreadInfo info{vectorized_loop_index, size, loop, loop_extents, max_thread_counts}; + + EXPECT_EQ(128, info.num_threads); + EXPECT_EQ(1.0, info.max_theoretical_warp_lane_utilization); + EXPECT_EQ(1.0, info.warp_lane_utilization()); + EXPECT_EQ(1.0,
info.warp_lane_utilization_at_block_x()); + EXPECT_EQ(1.0, info.warp_lane_utilization_at_block_y()); + } + + // Smaller stage: test that its max_theoretical_warp_lane_utilization is + // penalized because its 'size' is smaller than its loop_extents, + // indicating that it has been split: it could achieve better utilization if it had not been split + size.clear(); + size.push_back(8); + size.push_back(8); + + { + ThreadInfo info{vectorized_loop_index, size, loop, loop_extents, max_thread_counts}; + EXPECT_EQ(64, info.num_threads); + EXPECT_EQ(0.5, info.max_theoretical_warp_lane_utilization); + EXPECT_EQ(0.5, info.warp_lane_utilization()); + EXPECT_EQ(0.5, info.warp_lane_utilization_at_block_x()); + EXPECT_EQ(1.0, info.warp_lane_utilization_at_block_y()); + } + + // Smaller stage: test that its max_theoretical_warp_lane_utilization is not + // penalized because its loop is smaller than the max thread loop and + // cannot possibly achieve better utilization + loop_extents.clear(); + loop_extents.push_back(8); + loop_extents.push_back(8); + + { + ThreadInfo info{vectorized_loop_index, size, loop, loop_extents, max_thread_counts}; + EXPECT_EQ(64, info.num_threads); + EXPECT_EQ(1.0, info.max_theoretical_warp_lane_utilization); + EXPECT_EQ(0.5, info.warp_lane_utilization()); + EXPECT_EQ(0.5, info.warp_lane_utilization_at_block_x()); + EXPECT_EQ(1.0, info.warp_lane_utilization_at_block_y()); + } + + size.clear(); + size.push_back(11); + size.push_back(11); + size.push_back(2); + loop_extents.clear(); + loop_extents.push_back(11); + loop_extents.push_back(11); + loop_extents.push_back(2); + max_thread_counts.clear(); + max_thread_counts.push_back(16); + max_thread_counts.push_back(16); + max_thread_counts.push_back(2); + loop.push_back({}); + + { + ThreadInfo info{vectorized_loop_index, size, loop, loop_extents, max_thread_counts}; + EXPECT_EQ(242, info.num_threads); + EXPECT_EQ(1.0, info.max_theoretical_warp_lane_utilization); + EXPECT_EQ(0.47265625 /* == 0.6875 * 0.6875 (the x and y utilizations asserted below), exact in binary; a rounded literal such as 0.472656 cannot satisfy operator== */,
info.warp_lane_utilization()); + EXPECT_EQ(0.6875, info.warp_lane_utilization_at_block_x()); + EXPECT_EQ(0.6875, info.warp_lane_utilization_at_block_y()); + EXPECT_EQ(1, info.warp_lane_utilization_at_block_z()); + } + } +} + + +int main(int argc, char **argv) { + test_thread_info(); + printf("All tests passed.\n"); + return 0; +} diff --git a/src/autoschedulers/anderson2021/test/tiling.cpp b/src/autoschedulers/anderson2021/test/tiling.cpp new file mode 100644 index 000000000000..93d664c9f2cf --- /dev/null +++ b/src/autoschedulers/anderson2021/test/tiling.cpp @@ -0,0 +1,192 @@ +#include + +#include "test.h" +#include "Tiling.h" + +using namespace Halide; +using namespace Halide::Internal; +using namespace Halide::Internal::Autoscheduler; + +using tilings_t = vector>; + +std::string to_string(const tilings_t& tilings) { + std::ostringstream s; + s << "[\n"; + bool first_tiling = true; + for (const auto& t : tilings) { + if (!first_tiling) { + s << ",\n"; + } + s << " ["; + bool first = true; + for (const auto& x : t) { + if (!first) { + s << ", "; + } + s << x; + first = false; + } + s << "]"; + first_tiling = false; + } + s << "\n]"; + + return s.str(); +} + +template <> +void Halide::Internal::Autoscheduler::expect_eq(int line, const tilings_t& expected, const tilings_t& actual) { + expect_eq(line, to_string(expected), to_string(actual)); +} + +void test_serial_tilings() { + { + // Don't split small, odd extents + vector s; + s.push_back(3); + + vector> expected; + expected.push_back({3}); + + vector> actual = generate_serial_tilings(s, 0, 0, 0, {}, false, true); + + EXPECT_EQ(expected, actual); + + s.back() = 5; + expected.back().back() = 5; + actual = generate_serial_tilings(s, 0, 0, 0, {}, false, true); + EXPECT_EQ(expected, actual); + + s.back() = 7; + expected.back().back() = 7; + actual = generate_serial_tilings(s, 0, 0, 0, {}, false, true); + EXPECT_EQ(expected, actual); + + // If 'allow_inner_ones' is false, don't split + actual = generate_serial_tilings(s, 
0, 0, 0, {}, false, false); + expected.clear(); + EXPECT_EQ(expected, actual); + } + + { + vector s; + s.push_back(8); + + vector> expected; + expected.push_back({8}); + expected.push_back({4}); + expected.push_back({2}); + + vector> actual = generate_serial_tilings(s, 0, 0, 0, {}, false, true); + + EXPECT_EQ(expected, actual); + } + + { + vector s; + s.push_back(8); + + vector> expected; + // If 'filter_small_outer_extents' is true, don't split small extents + vector> actual = generate_serial_tilings(s, 0, 0, 0, {}, true, true); + + EXPECT_EQ(expected, actual); + } + + { + vector s; + s.push_back(8); + + vector> expected; + expected.push_back({8}); + expected.push_back({4}); + expected.push_back({2}); + + // If 'filter_small_outer_extents' is true but we're not considering the + // vectorized_loop_index, do split + vector> actual = generate_serial_tilings(s, 0, 0, 1, {}, true, true); + + EXPECT_EQ(expected, actual); + } + + // Test that generate_gpu_tilings does not exit when it encounters a tiling + // option with too many threads + { + vector> stage_sizes; + stage_sizes.push_back({16, 16, 32}); + + vector> pure_dims; + pure_dims.push_back({0, 1, 2}); + + vector max_s; + max_s.push_back(16); + max_s.push_back(16); + max_s.push_back(2); + + vector vectorized_indices; + vectorized_indices.push_back(0); + + bool serial_inner = true; + + vector> expected; + expected.push_back({16, 1, 2}); + expected.push_back({16, 1, 4}); + expected.push_back({16, 1, 8}); + expected.push_back({16, 1, 16}); + expected.push_back({16, 2, 2}); + expected.push_back({16, 2, 4}); + expected.push_back({16, 4, 2}); + expected.push_back({16, 4, 4}); + expected.push_back({16, 8, 2}); + expected.push_back({16, 8, 4}); + expected.push_back({16, 16, 2}); + expected.push_back({16, 16, 4}); + + auto actual = generate_gpu_tilings(stage_sizes, pure_dims, max_s, (int)(stage_sizes[0].size() - 1), vectorized_indices, serial_inner, false); + + EXPECT_EQ(expected, actual); + } + + { + vector> stage_sizes; 
+ stage_sizes.push_back({128}); + + vector> pure_dims; + pure_dims.push_back({0}); + + vector max_s; + max_s.push_back(1); + + vector vectorized_indices; + vectorized_indices.push_back(0); + + bool serial_inner = false; + + vector> expected; + expected.push_back({16}); + expected.push_back({32}); + expected.push_back({64}); + + auto actual = generate_gpu_tilings(stage_sizes, pure_dims, max_s, (int)(stage_sizes[0].size() - 1), vectorized_indices, serial_inner, false); + + EXPECT_EQ(expected, actual); + + expected.clear(); + expected.push_back({1}); + expected.push_back({2}); + expected.push_back({4}); + expected.push_back({8}); + expected.push_back({16}); + expected.push_back({32}); + expected.push_back({64}); + actual = generate_gpu_tilings(stage_sizes, pure_dims, max_s, (int)(stage_sizes[0].size() - 1), vectorized_indices, serial_inner, true); + + EXPECT_EQ(expected, actual); + } +} + +int main(int argc, char **argv) { + test_serial_tilings(); + printf("All tests passed.\n"); + return 0; +} diff --git a/src/autoschedulers/anderson2021/test_function_dag.cpp b/src/autoschedulers/anderson2021/test_function_dag.cpp new file mode 100644 index 000000000000..933c7f9d5027 --- /dev/null +++ b/src/autoschedulers/anderson2021/test_function_dag.cpp @@ -0,0 +1,174 @@ +#include "FunctionDAG.h" +#include "Halide.h" +#include + +using namespace Halide; + +extern "C" int mul_by_two( + halide_buffer_t *input, + halide_buffer_t *output) { + if (input->is_bounds_query()) { + // Bounds query: infer the input dimensions from the output dimensions. In + // this example, the dimensions are exactly the same + for (int i = 0; i < 2; ++i) { + input->dim[i] = output->dim[i]; + } + return 0; + } + + // Actual computation: return 2 times x as an example. The first dimension is + // the innermost, so iterate over it last to avoid inefficient memory access + // patterns. 
+ for (int j = 0; j < input->dim[1].extent; ++j) { + for (int i = 0; i < input->dim[0].extent; ++i) { + float *out = (float *)output->host + i * output->dim[0].stride + + j * output->dim[1].stride; + float *in = (float *)input->host + i * input->dim[0].stride + + j * input->dim[1].stride; + (*out) = 2 * (*in); + } + } + return 0; +} + +void test_coeff_wise(const MachineParams ¶ms, const Target &target) { + Var x("x"), y("y"); + + std::ostringstream with_extern; + { + Func f("f"), g("g"), h("h"); + f(x, y) = (x + y) * (x + y); + + Halide::ExternFuncArgument arg = f; + std::vector vars = {x, y}; + Halide::Type input_type = Halide::Float(32); + g.define_extern( + "mul_by_two", + {arg}, + input_type, + vars, + Halide::NameMangling::C); + g.function().extern_definition_proxy_expr() = f(x, y) * 2.0f; + + h(x, y) = g(x, y) * 2 + 1; + + h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); + std::vector v; + v.push_back(h.function()); + Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + + d.dump(with_extern); + } + + std::ostringstream without_extern; + { + Func f("f"), g("g"), h("h"); + f(x, y) = (x + y) * (x + y); + g(x, y) = f(x, y) * 2.0f; + h(x, y) = g(x, y) * 2 + 1; + + h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); + std::vector v; + v.push_back(h.function()); + Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + + d.dump(without_extern); + } + + // Disabled for now: there is still work to do to populate the jacobian + //assert(with_extern.str() == without_extern.str()); +} + +extern "C" int matmul( + halide_buffer_t *input1, + halide_buffer_t *input2, + halide_buffer_t *output) { + if (input1->is_bounds_query() || input2->is_bounds_query()) { + // Bounds query: infer the input dimensions from the output dimensions. + // We leave the k dimension alone since we can't infer it from the output dimensions. 
+ input1->dim[0].min = output->dim[0].min; + input1->dim[0].extent = output->dim[0].extent; + input2->dim[1].min = output->dim[1].min; + input2->dim[1].extent = output->dim[1].extent; + return 0; + } + + // Actual computation: return input1 * input2. + const int max_i = output->dim[0].min + output->dim[0].extent; + const int max_j = output->dim[1].min + output->dim[1].extent; + for (int i = output->dim[0].min; i < max_i; ++i) { + for (int j = output->dim[1].min; j < max_j; ++j) { + int pos[2] = {i, j}; + float *out = (float *)output->address_of(pos); + *out = 0.0f; + for (int k = 0; k < input1->dim[1].extent; ++k) { + int pos1[2] = {i, k}; + float *in1 = (float *)input1->address_of(pos1); + int pos2[2] = {k, j}; + float *in2 = (float *)input2->address_of(pos2); + (*out) += (*in1) * (*in2); + } + } + } + return 0; +} + +void test_matmul(const MachineParams ¶ms, const Target &target) { + Var x("x"), y("y"), k("k"); + RDom r(0, 200); + Halide::Buffer input1(200, 200); + Halide::Buffer input2(200, 200); + + std::ostringstream with_extern; + { + Func mm("mm"), h("h"); + + Halide::ExternFuncArgument arg1 = input1; + Halide::ExternFuncArgument arg2 = input2; + std::vector vars = {x, y}; + Halide::Type input_type = Halide::Float(32); + mm.define_extern( + "matmul", + {arg1, arg2}, + {input_type, input_type}, + vars, + Halide::NameMangling::C); + mm.function().extern_definition_proxy_expr() = Halide::sum(input1(x, r) * input2(r, y)); + + h(x, y) = mm(x, y); + + h.set_estimate(x, 0, 200).set_estimate(y, 0, 200); + std::vector v; + v.push_back(h.function()); + Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + + d.dump(with_extern); + } + std::ostringstream without_extern; + { + Func mm("mm"), h("h"); + mm(x, y) = Halide::sum(input1(x, r) * input2(r, y)); + h(x, y) = mm(x, y); + + h.set_estimate(x, 0, 200).set_estimate(y, 0, 200); + std::vector v; + v.push_back(h.function()); + Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + + 
d.dump(without_extern); + } + + std::cout << "with_extern:\n " << with_extern.str() + << "\n\nwithout_extern:\n " << without_extern.str() << "\n"; +} + +int main(int argc, char **argv) { + // Use a fixed target for the analysis to get consistent results from this test. + MachineParams params(32, 16000000, 40); + Target target("x86-64-linux-sse41-avx-avx2"); + + test_coeff_wise(params, target); + test_matmul(params, target); + + return 0; +} diff --git a/src/autoschedulers/anderson2021/test_perfect_hash_map.cpp b/src/autoschedulers/anderson2021/test_perfect_hash_map.cpp new file mode 100644 index 000000000000..9a21c6a96e58 --- /dev/null +++ b/src/autoschedulers/anderson2021/test_perfect_hash_map.cpp @@ -0,0 +1,71 @@ +#include + +#include +#include +#include +#include + +#include "PerfectHashMap.h" + +using std::map; +using std::vector; + +struct Key { + int id, max_id; + Key(int i, int m) + : id(i), max_id(m) { + } +}; + +int main(int argc, char **argv) { + std::mt19937 rng(0); + int seed = argc > 1 ? atoi(argv[1]) : time(nullptr); + rng.seed(seed); + printf("seed: %d\n", seed); + + PerfectHashMap h; + std::map ref; + + std::vector keys; + const int N = 100; + + for (int i = 0; i < N; i++) { + keys.emplace_back(i, N); + } + std::shuffle(keys.begin(), keys.end(), rng); + + for (int i = 0; i < 10000; i++) { + // Insert. Possibly a duplicate of an existing item. 
+ int next = rng() % N; + h.insert(&keys[next], next); + ref.insert({&keys[next], next}); + + // Check the map and hash map contain the same stuff in the same order + if (h.size() != ref.size()) { + fprintf(stderr, "Size mismatch: %d vs %d\n", (int)h.size(), (int)ref.size()); + return -1; + } + // Use iterators to convert PerfectHashMap to map and compare to reference map + decltype(ref) h_map; + for (auto it = h.begin(); it != h.end(); it++) { + h_map.insert({it.key(), it.value()}); + } + + auto it = h_map.begin(); + auto ref_it = ref.begin(); + while (it != h_map.end()) { + if (it->first != ref_it->first) { + fprintf(stderr, "Key mismatch: %p vs %p\n", (const void *)it->first, (const void *)ref_it->first); + return -1; + } + if (it->second != ref_it->second) { + fprintf(stderr, "Value mismatch: %d vs %d\n", it->second, ref_it->second); + return -1; + } + it++; + ref_it++; + } + } + printf("Perfect hash map test passed\n"); + return 0; +} diff --git a/src/autoschedulers/anderson2021/weights/README.md b/src/autoschedulers/anderson2021/weights/README.md new file mode 100644 index 000000000000..29d70da9b8a4 --- /dev/null +++ b/src/autoschedulers/anderson2021/weights/README.md @@ -0,0 +1,6 @@ +The weights in this directory were trained in a hold-one-out fashion. Each +app was trained on a set of random pipelines and all the other apps, but not +itself. For example, bilateral_grid.weights was trained on the random pipelines +and all apps except bilateral grid. + +These weights were trained on a V100 and may not perform the same on other GPUs. 
diff --git a/src/autoschedulers/anderson2021/weights/bgu.weights b/src/autoschedulers/anderson2021/weights/bgu.weights new file mode 100644 index 0000000000000000000000000000000000000000..f023210ad7b302a699597896489556de370841c7 GIT binary patch literal 20400 zcmXV%c~nl{_s5%ONmL>YiZUl5^_+bk4J0B7A#=!-BqTnhXi_Q7Nh-~QGL`DN`#e&K zlq3nM5TYWLB;wb1t=}Jeoqz6K>)yNW+55fU`*r5-KJFqeA|mp?kox~sW@I%O3F(nI zxpl43`K%G&w@O|Zv?Q7R`goOYoH1Eg9(sj_PjcgzU+{)r|63rkuZL_?e85|ANCY~X z&p|#s!w&6IBKNhHoj;Zh7iWaQjD=%_4?Ki$HqIY1&q(mKFUC^K{1bTTc?3Et?ST5O zi>OprKtvR_^4b(qcw2VW!<^<#^xl|>O#bHss@|)`F3lW;CK+aQ#la=)i7BCmr$V?g z1xbwMY+=m0GW0%M3o_g~j*g-RMK}5D94kvdr5g&V>px*>d<2sjjN*IDTO<6q(Hj<< zF2O|xe4*K3A`J=G;af$!@gHwZqMDB>r+Ik>fcM4HK|^0wd;K1hdcBnT++E78`7smt z%M5V+u3&QFv^_g1?aV}5FS4N82Q>5MThiCnfU~yNGv~Ys{Eg$Kge$%G;pj69_`Wim zgyXN+@B<~OFnYBHogzOE+fVOf!N((6V-B!gg@>u~+dJUn+kt82$MO5$yV#K42v&9S z?02>}opCsnjOiVV{mL_F``UaQGxrx$D0Qord}S}$6u#}=IsESvuVHTd1-g%yis*05 z8unUI(P-4FKKQJ0$Khye0s1KeKuxc z!Qm6|;Bq%gwM}PA2S0GrE&N&Q#}w>}P~>0TcONQsqv)SE`&nbV0+TGd$`TJ;raUVmp^qq0O|eN7ExW_7Ug4Xw3a_53he1 z!~-+iIm;&jXkzdO`2Sw;Zd#tf*f%*KKW;p&<_GdFB)U>fXGhkon8k`e$$rcAo3V+r|@!LdJp^;7r^Zh+pxY^KzD&}0E>vuL&vmbJdrz=hunayDHE=^&v z%Wo42=Mo0G(sX9Q7ElUXLHNROPR?;IrWoJD=70U1YT$Zwa}6gqSNtPc&6mOK?5vza&%zYFY55INeB|SdCif15@cJSmI3nbZ+(^c8Q&h0T z&KH~|p74s@*I}#3x$)kMXt!&IY{fh;g zED)Xz{s2$=7Q*`VTc~V<0XuwZ6r5E!&pbVHI1uB(q7GFS9T`GO{}ohLEco1=`~_a|@@pb9&PV)JPB-w6Og%uMqV4J@BHy-JJ7|MS6?zS4>yshdo#iF8A%BeR&~nYsn|$#~(&rqakc+v4^`i0-#;q7^D{L zLGz(GZ17elU#4au{quY$KcJLB=l=I>`ofj;h|eMBu67!CBoW**U%mxiL0VEVlvYCGQG^nM39lYSmo+kuy+qyg0&tEU8$fHsG`Q1vwgt*as zsp=Y*X#RmtR1oER>108XZ3T)v@FOx)nowk0ps5LL<`NykVdc+F==iA{zSyqCruHXX z)AWPT(DD}3Q}W=I;ym2fn~#2$8E{0q8WR?uLxc4-#QIM<%+@P|wig`lolPlPD<8*9 zY6#Kaq9H725DXiAP$B*{XPJ8eZx1$s_JIa6e)WHtYZ?PhQtzQa{||Pp_{@bR*Fo$V zU6`=`CzPq)!qDR%A-3-#c|3<8)Gk6Ntza~@FG7j@R<3{ZA6OB89vrk4!QJ;PC}u~a zlS(5f{W#B+CEVrrjg4amHY5xErxfzvZ8GIcO|cNJIWd~;iIo>NS^rSh<^C z-@BMe%ZT%}owWGgo?&3Vc`I7mWuoEd6sDy*i}BC(bDAH!vHpV&RsPxs&Q|eYxd=JC 
zl~-`LQw(gG9}nYLE{M!6G&{sSB&KAP&_bn<`svHk(k3n8lYO$n&B@7BYM_^DD~qz} zYt9MR70B^dWLKbe9HpaOeb_ct34X>xeg4kXKVWBS6gS7N23Nj5&t6ponazwljGoF2 zub8S+5#0l3Q&~F()zyK>w`06!`IQi>vX;wSexGDM{R#4c>BKH_maw<-I?W$%LIZa4 zg@JLu*%UoLdV7v6|H+p@bd5Y9l)S9Wx3wwco%IANXI{+;`!xADeKz0fb^%!tX00ryGKxCjEcKi_RdWSA1!>Tm zvH^SMtRpwx=VQ-;3`{wrMW$^M7cP4Bn|jE(QyaTQ!U-S5g)eRbeg8|2&n%>wQLrEL zc;E#3;kppw1z6#-9xEoV!RrEJntAIrmuMaa)~%_;&R{0IcO!>xl6XxuriZ)Q&%wg7DYUY+^}HC08%H1z>3}9 zKn&Z_%J4HDPVr!)+lEM-s1K?Yzrx(c2k4jZhG_bnL-n>8q1NB&{Aq^b!p&Qz@p;L4 zd}+H?!hy5fh0#vtC^}!A#H@6phsK4`Lw|Ono$XCDS89d)?M=jIwlvj|j$%!H7ud~| z-6ieO_4uo17F_!M0T;e+hLCGLWboWD4Jy0F;!TT$Q(kYOCGW=agY5QGhnwU1JNMYI zgI!AeOHMb0CkBgXU-L_PWqy=!-on?k(Ycsk|KbQ+Tl9{7J$6Z`cU+&}!-cZ2en;Wu zZM~rMrj&2E(27P(xrnoOUW3VNkq4KI|t(P@VejxET=)X_7+Q#Bg;oys{w{n1c2 z$AfEhIFBxNn;<~vA!?UNLi<1qI!)Y-v-EGH$d|N|0l8}e70X=Jlq$~9qq#Q^Gi_b0S zR%Ztyk>;kC9eUtS)`Dt*60z36kcSu~Aqs(Feh3>{d?Jtg*ha60u(Os2X!irCw=3QTwN zB3f z;Lq}DFxYYfPwm=+yH}RL!WSG42CM+fig-GD%Uq&6A(dU-^Nzh8A26bwe*eI4pV2(3Ok@!KTDvu=}6f|J#n|9n`_6y5DeIy60EonDT$h}zflBAqc zV{H!p@O60zq(6^<%mFnJ%M)T|Ljo7p?asB`@x`NW#n^QIEih>xgh1CQ2&p*4iVpo? 
z6&kbX&r7`mVVeX~>I!H2^eRek(V?BmPoP(x!=cuH5Hx!t&N7-sGhe^u-nmX>T3yok zHP#U%wET(U;_*;=`ZkEa&%xjUA*X$O1-B$;3YxyV4M7PB;I#HA6iDTgWnvt@cUS^< ze%sMYZr&JEx|F$!|7N|P45(V}BW|0wCKHeUhNYJl!m@i;(8xFd$BaFMcN%9B%fM3X z(=LMxN8C_hl|9?jvl_xJ*3v#_??wL-cZ00V4RG=u=0bCxj(lT%&^~`%(43B_ak&+T zi~Uf+Adge&9m48m0T|x4kbZu83)QwhW-m{^VMmwC(TRh*;Jwy&4D-0ezKnSZ&c-DW zDUyNj6F;HRE)Uwd?6;Zc5oxq`{R@1(KfEwEfk0+K6&l^jMRQ>s99aGjbbqg7o3~4G z?$VMhI82iUJaQk|-{YxZ+q()KG{y6pv=vwkE87D&;pk|Lz#;(tPY&vas{cb<;7oi8+24PgV4 zdd%Oyi_XdMyYV$=s^8J|^{Q4UHQz zB`{w}P_aOD>08RLan-grXr3w@qc zfG{h{1J{OOtW!_ax^Z6#$_tk~z)_kY_vC3Ezagbd*8Y6Tw zIl>o{5XV!A-MH_{C4N`YWnt^kjM0mdnA5d-_^>~Ph<~@hGQDxkJ0g*~ z1S|y0%#EP;b~~#!C}IBD$kRuA_tKejZqZ^@ePL|AFKXrh`fhH)Dw|@WU^5Lv z9j=4j-w}=8e-zw0rnA+fF3@=0XgaaCf()$|rH-S^n1w?VRs;;e{c>qW2_6D!a~bNbU^trx`%I0*7WZt2NI=(J6C$M z4K)|3c-%R5ODl;9>3uj_)(RWFl(-JMnteTgkY3Ckf;-pL!R`E3_I-`4u*dEop0W?8 z|1F6a=^GXK!xayixq}dtcIV@c!5A=#_D4g{ousZ{9=zODg6Y|>;GUQ=@@yAV)2=&o z&H0sVPUtlF;`o#;RqMiKabfgP#1&4X>MZlmNheM6TJ^i>&ryJrIw2Oc6K0dgQGWEkGQT7Rl*f*Elo;DFZ zO44vaTL;R7R6<+M3Fz8x1*f7E(JM9`EN3S`lbJCpWz~`IDcQL1xhqPTabS{c#EIBc znBByYp3(FcxXGM@OhT$>B;5yq_G}!qKRzB26 z5&s9Jshg+3_@Alh9Cr$qBt}3>;V`^=WP@M!7DLzEaUinuwb{0Z72ut@kSHs}fwyRy znZk@{Ze_<=Fl||Y`=Y|pJ@GesORAE`B^&VTky`+pd!T)KI*7=G3-)bFffc>skfBqH zJ<`?CbF3Z*_(d>tsXPdEq`7gw2`GsG*_ieb3r>F_1+Cdcq$z^*_g%$puF?W$qbLZo zT8v@?pJ3kX9yD_NgO*eJ@qAVr_KcbgGN29}Jvwk_Run2&#iP@{(eU+B2q-$_gV1|2 zXDvMluXK{IZp6Sz8Z{q`r?hZ?CoIRdtv$rWV;VY(-Qkuj+Kby7cVN@%T+(dBfzlOC z;9a|km6~tRzWfn*Pg>07o*8+kyVD_;D?pQ#Eufuy6vK)kN4 zX7@v8MHO*eBL>>Smt0sv7W7vdBmcNDFLI&;4uu!v>G|E@U6)+??!i-Vh|a*amJl!w zPQ=~=(QwYDAMV}n!R(Nen6@k2Jkd<~GKB zhK*aYU?vU6pjX+bS9=0Fz2D%dh8Gyr_7lAq9p_E342B`g5UA5bxJGL*?A%}SsQet7 zjCil(^@mAar3&osR7JyYae{Nt?jc_0Ff%U}q)p2(#_Un>MF++%=_D>I zE@0~1#nAk689bA^kH-hDpxqZO;vFRi-t2E_#F&3Ls9Zrnbq~XQ9SN4T#rTk6sng=oWhm z^!`0V>-)<%gEvK-=eN^fT#snEB9sdVJ`NL1vtZ+ZUT|K%|zz8ADVv+L#gd+F}~#kB!yid>08`UBzX!~^~Mf17v9H! 
z9rK`V)q1d1xXR=ns|(d*8X)B401;X}!u^~wD@yuIQ$5a8r5Gj9_A`VeiAJ7_=3DTw zSchU0-hqha9PVyd0DN!oXAfjzSx{m+?mg5BPpXhUN>pYw&A~8xqCKs@p~ycge9L=$ zN>W%dYLxFAjD`u@3QZxRbvYQZ^a0-i;CH_EnK)l+EI3 z%3A7GbY6HNQ^-2!^Z75Okm{eybzc{-Icbq}yP7^m&rTuSy&ur&F^YW{FE8A)I#jqc_l{6` z$soq%kK=a)dh<7wEajW#Z|7&JE~HJel0sqrF}5#Bf_9(&!G3(1PDA%EhK3hw1lzBi zr_uE#Y}bq$I`yfzaOhT;@cO)yI7?Snc+$#`-+b~URE7GJ1wJ29Xu*TK$)UJoc>>xi z*`Rl7RB5xP89Xj8!b+4X6?>ve^{z2Z?)n>TBAseJFz5&NrAv{`U zg4xZ_Ft%Vm2t@ybp3$aIHIkPECnaD-&|;{p3~@t(z(YjSYXf z)9SZz$hjFbSM|e3u{7v;aEa4?d{;2R0U@&&(+%&o@G5nyFR0C?Lanr&-|qUO}PUH&3`<=+OwI+>z_=wYRR3Wq zEq)Ut6bbC*9f*Gpb}xOn{yGc1J5sX(w7pS=nWOMj9e1f{GCM!zKN@=25)9UsLbs_m z4z4@P-b_wG^BJ!RReQ-25;Qrn49YEZc7hic?{L%h$j-{*to5mSI=!ouUzQSVx zWd<}#Rf5L0xI^#0*wXK9;mrJJGLssbMc>yxWNqz)Yzt7O@@{1?ZA}L*Xga_P@O}Wc z(?nQ$?MxipqeGuhQ$hRqa`t3LCABkogG>|gjISby95LkVRFlx@T0dvK_8%Es&4@_b z=SAt3ufS(>0md{qsH?L$IT;46nsI* zJ74Ns)P}az)!6o5C{I&!2y#EZhb=3Apxd0M#Oa|BK6*XF{egqn^Y#l+^;ArHNVsz8 zrReP(B)F@y2!`q(!(D$++|qFnJ6c4cWa>jmnQ{Z(xz?kDS{AmEM&cbf3&cbp zfX1@W+Ataxd_RG~4c2%-a>Os=-7&43`H$2)JO!Dbvfz3%91C_%A+@Tr zFgv~S%JbUw+P?V zkr(5eg9Wc{5+{dz#HIBho%9$3J{{tW^L}!H?s@oC_Z3D?Jq+I0K6B#_j^t3WeDv;j z;Mv_{z;b&G#7@cs zmvMnyo629Fb8jsqrMBS2fJ_{7@e#(Sy#b!cOYY|BNVvY~6v~XtCqpMk^xKUtXi!xH zW2gNXyDEoxtUd?3m4oqsjxzQaX2D?n5{T_|fl!S$NCHQYzIFqb=tY8Q{}MbcrHy}h z+u)R&3H(DR7 zBU3FSar)Pfa9i7#6O4Fne{?Kyfol|IPA=ncLkVna^CDGC^||uG+t^@aj-JUSB)RY& z3c9C4`3nR)2ch6ge-uO}K1N|pD5&Pf;!w^)ys7Vi{l8vcCqK&l88|FE)8{BnK zX$%|jQH(@J`t&1-DDr9&uPG!D{0=ljPaHyxqBjaV<-qMt1ebI_4kV6*f%e78#3pSV z+7%{2iRe`{Y?Xo4V(-y0J{J^z%>sjl8gM&!L*Oxa3i9)%c^;=8alFcW*e2}d>WAdf zdDl}=`qPPuX(152sR@k7nnTOr1K>ptV_PmC(+~TB$ISvx>h(hqNtz(g7_fm3y#dbU z&n9p`cnd}R)Oe0hGe}j(5Q>=mEE$-PgoX}33G)0J?ABejxaF)<6#1Mop?H=5MZkjJ5k~d zoey&`GwdL@dQ=!r8#sqzBiD={iDe_PW9J5}(C@|g<|Aa6V*r*RHy@}n`1nW2nx3Xj33t#$bC=`!>?W(3z_9^&Np!CdotO?b6G10G!L z0Kt=og0WAd$R)W^^vh9u*eF*9l8rNA`MwCSwK&~ zXQ1_226rc>%#*~Z5LrpN1~wp8inFN)wnrN2LEvh`09}+tUhao zliEah_vcUG3RlE~{SAtixlduyn8R2;Sp_%z&_U~+oj7i*5~fL{pnYvBxcG#?L@PHg 
zG~E|e>$R}3A&O{;6q)^eBY|lhp9OP$^#xn^=fRsH3(l7PqV2{`2xH@TIJySrKTv`vHuZiAY{r`Q)FO)I7oRy%A?HmvT!#s3!5CJQ-QUU{2)Ia!%@uWI3fw~CP{ac43R zcojDtlB1((I=-9m4aOzLlHfIea7y7#kX~AhL{)~0L`;F*j+^jIVkdT4+{4D0LOgkY z5TB_&C6P;W1jl?k$Rb^zQfZf4@Samdzj>p1UfIsLBz-n#rF;<$t&Vc!_)QsWMPgosR`jL@1ky_PjNa z)AW*?H>nKI%L>WQ&;PKn+Xvilzrd*u^5`Jiij|_#F#YX2crw=#pZ)cQzwaYBjaVfb zeo}(!IN!kNpJR~UG>1Gq^bCbMN${e7D^@gU!L`Ox(BJ_zvHzpLR3%QO1?-tkjZ-uISrXN5V*Dn_H-rSTVp=nDSFL4d~p-6 zPHy0MSC7K?KNUC}a*MQ0^AS*q1kk>t2`7~!8Cd&rH%+f&Ls1O2d!OSy`Z5E2R*U1R z$vfc{xkIeJn=_ehW>oY`3UpbkvtPG6VSMpCnC-0wQGLaN-`1rlp|l-C%u9Ioy{E9; z$RGb|{DM>&RW^O=M2e&9@x81Ltez~v1`T@f+}LMWyZkUIdFugJms>4Qpruw?s^}Oh3&!3H4nMxTh-AqHWBR(YQd{R{_yvVA6GqD zhzpx9qiv}X@ZCIc#okAtoskZA0{-K^^cb)SlQUt8K?JrhegQj|<>5Zp1?<^w0h~B{ zpXV=f1olh#p}NF7yzT!ORBk8WoBf(}(xXP=+0`i^fr)U?H-glQbO|nuwSfxDAq;l9 z1sk+P(N*)IfWEIMx0kEKr?~%M!|v~Rdccs=ahk+dr(J?>z6QLE+DnEP@;QmPB678> zoEY5S0Z~pJyg4yuoVS$WNxe!jte#10JBQRYw z%q@CZhHbl@L2Y(2JbM2Df@gQ5z(54`f1X6YBfXrl+8#)b(nW1e1$650f${e|@wCP^ zXyp>IqtzbU-?Tum@8uESX+7??evRVRhoS409xU9c1-IfYvBupWIwlU`K~;vY>!Uff zJuk`F31w*VXc93!J_#lspKF>>Z~!cIC8)ZV3R`jTEO4GapfvCZwKjj{&fF=3DH;C6 zXi@-|@^uFH{Ny8&J&T8boZsPUhqr?GtI8v~qnUjCmP+nC5MtB#iQK32Ct#n3790y+ zioQ)Xkm1!ueE8>JDE$Lx>MDi$UUTq8Zv{+C7!T|8?%|{~DYo`;Hky2&jyZ25;7IQk z5O4Slr|d`J#F2mMOaZ~)9|$T-m64cb&7_yVpV)+~#-}MSQ93S)jJ3_h;+YF@x!oFQ zQWJ+L-6dFLE5_n@dvLaFB8-j60L$hGm@zUJQ>swGJ7c@aor|~e<~(m++NpLZa=u4a zIm(0j(huWN#aw{SmiCT07bZ_v7{*rJEvDg6X<`sc| z<1TK^C}oUM)g^Xm1a9{~NB=-|jM%yctcz>0cI!9ZhT6{%IQ9=$a3YyRNk7Ne z7E3_WW)zcoHkmT3#Y1)r+8E<#a*6dLRxvOoeybT5-mSnb24(j+q)au{35ZN#NDufn6cQOYs_+ z{V4)_rSyPC?#CmaeX;lLBUrj77UNGnfbV%Vq}EN6&65cR<*XdujdQ!XGozMs8?M=+ z?v+tA#UqxB_Si~Zl?X6>RNTlMEd!44=!R66F;K6&i_CxJ4ga|f;QE<21hKs%9YS3K z)M$y(oQYP%YUu;ao^+cBzoqDlqi0EVdLHV2b->$i>(J%HNd3Gm1$vtn!v||=^l2Rp znhyeqWy@sd@KPRPPA{PIq<_NEbqSCbwGsr{+1wpNZ*a8R#i>1h1P$vo*t6N95Pq!` zeeUZ(+DkwD`&5Za+V6$g*<;xhsYLu)I*G~VSm8P6f81B!402IVksNrk8`gA`qFYf1 zDRIBfr3xQok3}L#7&Y_OZHN*SUq3BC_Vj=Ue0Q(<~iqHN2+H+Z?Wfa8fO!rY&UL?An! 
zH5%K}qZ((Kj@@N=EAEO;M(=TO#QPX46NrP-d7z&9AKnO@DsZ0BReHKTou{zb7Ub1Z z@P(rUtNCL{KcX(qc@l>z@4~@r=_QPOpGmC34AErC62_uLsaH;c;I8rjwk8Y7t*FP) z`g(vAHJ<<(k1GKAdT=#)9Gd^0jG<@l3p6eT5wDI%NPQZ~UHKMV?R*}lJq*Ppqcrlp zn9_x5#>74894`2N0h4#@!M1&XN!_XiJZt)hOR4DU{1e14g5RE`y-hFGj@qXMQJZ@|DYJxdG$BWU0}>&-#&&L zdv3w8F=rU(Rs$lkVX!~?2>#uXiwXT4?0YvrZ| z>A*cLjyV|wvxDjukX+meqt=z8!|7(6`}-;Cs}y3-;m?q}uNAj%{)X;X&f|oOv#5x| z3T)iofUDnL1xHJu4JL`a7etx9zpF&Xsct~sY=00_j)3x#E(}{6j$Zz2$-A$iqP8*5!XiULbp#3aNcK2_Ur2c zwEL_`SC7gE$)`KOj?CXJ_+Bfj#?gdLn=`DkLl&;%3aPbJQY#&GK?AKrS`!)xCu zBYXWENS;^-nTAd<=6&yo9-57wg|ndb=tpuSE|beXDTQ0==3~*sWIUa94z?IL!_f3a z(4nD$s$H&_FTVmGe4hgUzMDbnon>e+p%$W~hPbn1ev=^9ihY9syBu%fvoX$s<6B?x z%I1pjeocRfDiU$zbLud8XVQxHE$4~LrVR8yeE}ph^=H14k%c`ib?+FB)U4TV%&VxbXFH9Ew2~&c_>4#5OL6n?^O&^*u z+q#R}U}(WzKJ=Tc8}kUhrRu`mJb9A3bO^srkb`I4Co#y=8LP|6xKLeBT(z4rPo5YJ zZrDqR_Bf^=u^eaDJOYg}d7k#TkNEd}Cg*+*!T-87R%IW=eC6A?VwF7%u5siYm;6i; z0?rY`Y$b4+E=sfZ`Qk3gWt{lQCc)blJDl3UXD&y|!J;A<4X5{zKgkv(sw4vLdtX6M zP8a^{&V=i6*WvN?bQB*h=XO0(#dXVvv0^|T`uAH>`40xzzOZ}5C($5Y$sb|s@nM1-y zWXr6`%fq9D3T7wZkH;WZGi9*TNrNLn=P)4p5C#Wn(h8}mYZ z-oB?uKkawKnn5SV8l~aXeQtzjAH|Z5cEL7x8;l>RT{F~@&}+?jmKxK`*?eHQJXMtm zJ^es`G!wk%ji;lk_dwtAOHlY_3r7En!ZpC=E+3MKGioHl2e!pLX+(+pNiu>ayI>-(1oeM39hvYD5$ z<~_nEl#S{6F^d!mSp4BIF!fG4jM+0hp^p!eq+H#0^RMNhp1 z`BDjb>+lVtvL*`0J6^`i6HW2;vv%Hz0}kZrqcvpd^us1o)Ao>G(-!cC*JyHv?vC94 z!e!707trvjB@7*$Nv4|50H>XV*ss_IGm~$?vCqY*baWA>tUC_hH|>IpHZf4Ma5Uz< zxof(yekz0Klmp|3>ZN%x5)YWTiISeCHE*E2%Q?q;Ry&+ef|F zI+QL7BzK24k*cdYcx1i|eI;oOB*YJIl&jILb0hK4I#DjS(jRl|#b{N_c{sVs3U*8x z%aVRJfr5J+EQu)Q2Iu|VPh#XSG)t1NbPxBOzjYMx7gSIwzF{}G7 zEPHhkvJ9Vr%!exC`_}+f)D}{9gu|EP-*N`QR@iuFB9`z`Dc^(LwIdVtH)5Z;SdOsu zH@6_VhI(vNWe*9Nr^`*DO_=p)HuL;(7Mt!}2GKRovB}C8rez+b2NVOrdxX>8x_>_W z+mVb_-WIrDuNrDMd2wa*J2rp$1Xga{F!)!4t68m#`Ja>UMk^n(!|l<%DFh#uszr6u1#kr zL_v$J3Y6uKWmR>raDMnb+!4J5H#$=En6wujrG5wJp;&wn_7^Kg&%&d!TKFvS0vFy= zfv3jH(%0X=aXl8@+_1=0TDoch^R)T|-maAe(dvJZ7)L2~CY8vT7>(Ce}wXEXwes^F{b)w+ohE8S(X>UqwqBU6i;X 
z&b(y5!@&2gJcZ?FQSyX7zA_P^Pqcb5EzK2YoG%va$qeE~9ovCZ9}YmPwiUkj=_23B zVo;A*f^oyY$yDpl+>t9EIH{*8xSrnxS!MprF9Q&tRCD)#MPaB-0rVRVaMX7dS(%_RO2Y_xZuLA z6sZx!&2@(Or}HrV$stZt8dtboTA; zL^@Zq7)nMM_hUO%aP9dj_IbGy?%LXkW3&fwz@(h*@<^bTOM}>Si*ULskHH8Y%6`3m zN-vI_3H${zoph}NqZn$HG4mp zr@e*0i#MU?SUBL~L2|=8I?eqzr~I8UW7SR29J&!>L~lcKw>ryE>%+14-V+J4X~HvRtC@lOBr1Fe z#9@LPt*E-dEqwC|r>Lh={o}7dHPix5oan+3sYh6-k$^S6k02vI8h%fE!zt!1gg%}9 zc=Mne`o{en>3>C-&Kv~@S~LySTg~Z$&1d1qr``0-CIx1Hpaybe`q5yE95-WUIQ=%} zJV{nnWg0K$v0$Do%?cS|1ULD?_$MwzdT@wxQYsPiyJl=Cl#s zp&X_ejHP>XmqX9WQcy1g?xNfnni3T+aE$-Ob0V;LPA(O*1*Cri{}^>rk37f7)6_Qt3-_6YRpE*)XVo6x>cp2nOo zq8IAcvg_L{=wTmKkdgMrkLG;hGwLO`rsyGgyw8Em?f)TIYnlTIVdLnip>7Nv>8XzN zkAvYklQF3?2v;xb=Jz``=9Nf3r66XZMI(-3S9fmDxdoHw4koTv?(O`_ZdO zUo;D_smL1V)lI=aD?XyJ$2K(C@D7Im9s#>OVVvmQr_kS$#r;jXhiP5{II|<0yS+yT zD|uDqw#$6#)}cWEH`CMp|7dT%1=V>%yqi`CmKyi*#nEcm-!lYn*6;}TrGopp{1Cp; z80K~6MuD`V2ptxgNart;gw2i_khNBaeHNRKHPe;o_K;J!xatr#Jl=#m65gRusQ`*? zuffH8%EbGV1I=mO#`KnDVdB^XOr7xw>LzwW*NXz~U{DOGKhFRigF4MQ_W_Zmmw00* zw7_3KKW^pyY{-w8g#%voJhfG2Anvl1tUG-Mt}ar*-o{K~wpyI*kNd>k--pCs3h;53 zH2v#!0WX#JVYAdi^blbd;3i8bL^V-%ktUTyw&&dE zB_yKH)<%@Jp#{^fNV0^=7G;SLC8Q*Y_uP-7Wm>F5B_v8kGi9j^mG7ND;{49Julu@A z!uodZh00ERzr7s-E^h_Xi>BBR_Z#}@)Z%EZJnrUiFo`ZJxLenR2fv+$Fs~Xs_$~}K zW*uOyve_{B)jXzlvW4@THJw|kqX`=pd1AD01%7PFLN~Ev(D<G{g};=Vp#HgInWIGdEiq90)xwq_AiQES7ErgHqI z71Q`%Jp0HLeR1AYJb~$09m3B(HVS}{1C>qB2r@89Ki8;8U>xbeAE#GXa>XaYh) z!d`@kQPzK;3~GUPQ~#yJIYlg(!MRyg5viR zsQt1(iJSI)vj1`vgnOUC-^IUTmiTw#uyY<)?ynEzQx06gY|>Do&etjK=Z)=}+5Pj^ zS*YPMP%f?(G{3$<8fN-{#v4Q4qy{m6%{Z~<@*yh4kw`T466dr;R1S$JL&}r+1#V&@ zDlyJJ742pxB;Qj_mq z`P#fU@i3Oi8PJJ~-QgUY0gXQ8U=w$P{CfE%_UB2{B5EKwV`f4Bj$H&RnB=WQ9BxZY_n|i#a&*HwVhj14O1wpL)K`Wz01Lv=5$u;1W6RR!69)0r|!9 zWlk2R>&c>Ynm;|`-vyouCEz!b0s3AcT$g7q?49xkX2!cv6g9`_rNI8yp}n^2lCggW(Ffk=yy;O1 z%yuI17XKBUa+0B5t4*lrQqCeD6yQarr2?ap*;QX7mDrEr3;4rEg3erj5N3VIgELCz z;PTG;wCgq`lj`4TM{EbJ*z9kdc;IlLcD38ZM(R+#OJ_%duACRA`hhWtcIYCH^ zG`aSt9L*AX<0Z`$66vMFbO$C;or`^#xfs{qO2d;jh9ZvUgeZCk2?lrZVtu3FIQ$-+ 
zRqQ}nN`^o6hrA%uAqnSxInH+UBtyz!1B>9cUt#Npd|bSH6YR0F!@l2gF*2S!QR{Z3NgJhC?1$$n&#HD`H^5g%nlUc!CVqZY zg8P%+Lha`7=-YV^1`DDgT(A#RHPoRqHxX>JS|NIB1>_tG#yg&=;J&61s;+*=^?~W& z7C8pW>XKlb@)56nF@cj7chK(DN4Wg+I_U2!!G)FXc(m*-76hmXf9k!2fJqUeuErj4 zxE9)qR?_;8ICd>m7bKN5@czVLVz}=Tylu-Qhl;9UsPA7$LOuS=#WIpvb^*k|8Ebzo z#af%2WLDHmdPRJUq<9(7BBLBz z^0q@OnT@3lXtcHpm%f+d?&t(zi@PLNFRX?wL3Yr4r~u}xsljryX}CxG7Mu9$6n@k}8bBnOCK4 z0mCt;wmWjb}OO=e5aZlo{P zGC_a)U3Oq|G^+fPfr8R#&~JLm&KNk8m6q$MmPG*rc7_w1<4@XtoJaD|3aoRqs^aZb zP<4VM760)D_GfH{ZC97koMpz?v2EPqLB|4k^0EpXjNf8FZxy`n=5eQ@75QW9HSo2W z4i|2yqU^Cu=&?Lb-JR+LzCM1aa>X1fJg-9R!w|F&d@xR+&HdobmcOb8TpQ=?hO*}c#RvbbXtMu8mL zs&xc{QYFw&?gtzHR*C*!j$@y7IAjK1CFZkop`LXU-Ag|$l*`VdS^SuY*Ttb6q@l-h zWqNh&9jDDXL5RU#Qn+v(>r&96v}-e)0WH*rHl5d;%Jp|iTuvUb^uhh!4qDh8EO?3$bk3nHRux(W zIqV25k4!|(2}+!kg&Zg-`!)y*c=6{s6ja^$N#N z%%r0ligbc&G8=2Rql#;J!N$qnc%BON~#D4^J%&C!QBWAX^ zlIUQAtr86GxXpCRG%0>CtBO1K1QJp@AZE9&z@YFbl!YEk1(b& znI~$)P_1POH3tZH<76TP$t2^#r9%)UQ$}u>O&0X{3`75FfN869(EWvX^(GxHsIQWQ zxwXTP`e`m!@4hT@n&qii{vJ4$7Dv5Cy5Ni30p9j;CCCWtiI(~yE-0rRZccjxrX7#b zq$IgIV@m-aTGE7d=Atn7Tnav__hJ7rhyPfB-I&Nd=~U!5Uc5rOohMSh^)zPBj>V8Z zD|kD{hPOWSfQ8Gyr6J{qasHApcCYp>#@?!j?^_DNqDT=WeSgC6(kZa(e|h9x&3W)h v`5TQM?Wz>UoPuK}e}YGqbD?IZjjwY%8QkkJ21{QsYlf~S_CBm~8d zy`($1d)Q)aZ&Z9aiF6#EOdCIT;8@Zy(W#3g53YK{kIO|Mba;SqkgevJ9~FY04nN3) zChSqJWN`KoD#5;IgKm_GNllJEsV*$KIShuhVq;4vx;KNA@#*$uxaRGdylceJ$Zw1 zkgR8ioa+&jU*VbU>+q+<9Zck*9OoP2R72;obBX@g#}D{qk^;Sg}G+WztXmW75wSzZ;(Q- z`*g)71FqzcCTjBgHvRf~E%_#rPQ#=F=u6`}WY5p>5X*3)xMdm{i|M3m zgkoT|&G2ut{aU1lc#3aoAB9pejBKMP|~g+OAGfmalQ(_hF=N`xKI9` zf?9`coZ2^)zrHe`dwJtUcFKjTbh(=uU)m#ssGOWbw#BM*=e(Do>h+G~w%%_%+-*L2%^5B~jFq-oj>)=J*zs?WRk*SPkGX|0;h~Xqf#g);sH8?T zBO|GmsW82FSeH3eD$gz3eU0QW6Y1@-O4{-zlWW+z21_qg(TkCr`M++?B8p$-2u6u< zKkTq$pSd`Zg%w^DszRxa&nzZEv6JRRMv(^v!dUC$gnCuSfq!QW!#l}mPZr!ldG+a7 zVd)D?*59HH*~b|TaXBLKJc5oVm@_kibD==9m0X#;i~0huLzwCATEtF1s_VekX+Y5odv}iZn@Uf_&Q|EXcCv)V)@qVtWeEugR3?bZo;^lS?S+{0Do*cj1rh z68PJ79eda;i0b--byNS5u?9kyjd*juaTakkvpO**NrLq4LTbEWh`0NY5>3c{1}9ce 
zCbDZCsf}tgxNn!mq~i?`#H=Er)jQze{jKnRV+lHR$!fKl>kUdwMn#|?EpC=ad=g4K6{^u|6?40*>VqYB@I5>&xuREE) za-AeMbb2K{H~BeH&Xwk#QOSf-i-#yw?axT6x1f;26@5LhXA`W#VAHTI+I;GVUJGk% zdH;-US$G6qbiKjb$+^%by8;jY$U}dlbU3N}1aGeOL#-W88I!TwuvG0HbiZObZ_O&v zWa=r*AleZ9EgC|zM?kyT2W4W5S)-h*SUl1K%C0XM#VzMBNB=ssh`)n;%`xmV|ICKo zYJivv^I_(WVYsJIgdwLsLd?%=Ox-dDp?(e8%?m>Pt))02?=?GUKL+Npm%vI{7EbtH z1lg=;w42@xa=$LI_bS)Zxvuv79!%x)y9T-X!as@6!WJ5~eAoIc)8&o0p7pIXOoKV%!xFZb`$SU-PKqIQ@TRL!GS_HSuT>}wLOWJ4z< zG^0eMEM|whai^>krwdlygZz*7_~?i`RF6edogqbfs-T2zc9=jznYS?Y>rZf69}7ln zkhRsTI7g$|1-f^`Uc>gSkyxhhf+C zVb*u07(aUP4DNRM1GMO;A*nMzaF%gl7*F z-IXT7Rh_KRd{+Yv`W^!t8bIiq2d87=CWx7C&1P)8&tx#^hOPj-RKCAMMaExfSRg1G4X1Noq7 z7X_~_me9UCL-f39B{?^t9V^EslHzIC(6sj&*Em;|G|twbs_i{+k!9(GSQfMISCM#+ zV|c#o99QT;7|k}Fg4JK8>0ar7=vj6FQw~bvijBKShUN|U76(L9Rt22IU$S0L|2&L$X!g30St#!_oBv~m&CRu!Y!fp6fs zhR^may#_CW3UE$;2wl^bNSvoWBG!clRBh>G{_)ICqWvg?@g{Dd_brNXFFl6&qgs%v z8Vu&n-#`T4CwI*UF4Wm7wL@bE%9RWpIz_4_qz^z$m_eADFo*^tlmoa{{R zn)ITO(lzb{s~U8?H;sQ~)?ez=c?C!8#j pR7M}oQUq9NVqdpspfPJx%cfhyXw+e zN}Hpp{$5`w=&Wa8+XmKfj}*ogxWUuS5sdGlT8z7p1=CJw@Qe5tvBq~Qamw$b(QP7J zVT)e`_MNBpyPEjw2bxLoGJSrG<{LVa@{9cER>FPZw1=xv9LO(}59Q+#d9KrmbiQA( zCcKrXC2O)8xzBdXb0gHR;nIC6p!%f&&SYi5tEv5Hx0eqdt8y@P@?!8*h=xJC2duW{ zWN29C&Nf?J!h@E!5TNoHmG6nc`=Ktho8^p4G>cKFH?3kwx(Uqh`a`PhIFy;Iqv!Nn z5YW)bxJ^KiJkkPI0SnMe%>f#ujG$(t6??>X1pB*Qq1>l({9MAvinG^X^no|jgbRa> z-+Wg3b1o*uC_(*gE~;5~u=3TPAm(=pv`Oc{)IE-P=A!{V%6JF^)_kTfwF#14Z^OFJ zU3jd)5`-cR_1Atj;NS#JWw;rL80zB>;u*9ct`03jXUT~k5!g3vv{8%w|NonywiY2A*$qZd@8!KN4Xp) zUw(_+R_^kjy+o}M__Y}taC%-F-Jfnl<}ahvXk;c|GT4VMcsxS(3RqxPF)~lWlC`8+`gwn!^>~e z`jK4nV)X{jg>y^j5$B!szr8XzGGCW)zE2?1Dyqz@U8poEA;vH zzaZ2gM!ri-CY><`CURH&61Vb*F5gho3<|PB`N9kPNOR#6%*|9F z&MF<$$a6kFQWjXuvF zfmX`~$jSKyX+7<@T`>{8zi>ctACIxP{0R;CCHNp+8(Ma^;DqQjSlr%(eH(4j#3>zK zh<}D3#^X>c9S*f!?HIHB8r!2E3+89|&|&r+#~vUy=vFZK#V;74r;jRLAPW_Li*UAY zEA;uxqTR)2=wAATBd^j9EwA4*`cud8{n5uz*w_L!9@nA9%N({^tVaihFHko_3G6IB zz(pkv|F!FE?%}daTn~q7{5_E+y!$={d{qetKHr;iGqmMUFCBsB7>4x|3(!nx1!~O6 
z1C!uKOxcbPjM0Gw#6RW^u8h0I4EIaoz5RSpn8V?>9jhfyYj#mftt(`%?*@EScbd2t zb<*G$akS^5FLQiT3%zFegp>wHkoBVy+;s`{RN>rd`XuWqRy*cHcky9*f3OgDI|cEV z-E$&NlKo^;pAk0kQn^;?u6zrSBZtXFxV>o_X_b&7(dQCz+if|f{5%EKtO2^|{$eV9 zBp*t1;OK+@L3hq;8j@d5vewl=Hmd_q4fb#=BPQ@$D~fQg=Qv*O^8nLRlSs&a3+dvy zDX`|wFLwR8eA?`9OV4Z_gA#{p4(cfuLC zSuq_lVxm~R08Q3zQzZ0EIwiorRFp2R!?xZI8Z+of0)tH|=TV`>!J)TlSD?Z3M=i%Ow>!1^T8MQt4F%9`Epml!*-p*8Y7F}Yx7BYp#*-L&rDmSZ~s#Gp#{5&s{ z9m9>Z`_5cGSzU^A&6kl@;XqJ9ya=9GhFQ*R~$?btTaz zc!bey4x^R(QwXbDY2b6(9X9>m$-O&Im|v(rhqf=)rW(v2OkFVv`rNhx_t^|ypXeN@ zDGFhaW!gc)UIcx?*{r{*h%v8|F*5upD4#80nqMzO$;TDoJTQpcRr(=zRW_W7a>FdF z1o_Tjws@5iaK7(>i#bEcQ))qH-9xzJP9oOW&qUKlpU}%t5>9NGhmXhlQ2C82v&*s& zJgrkfPVW;4W#zL$ONQB=4dSd?5C?Q({sCO{fu-kJR1W)ykvk&TpqCktQQE=qPu*l{ zR_kC=UOmW6Nrv8`<*25#7;fJGj@l1Pz^N+$l*@l{K33mH$>-xtg48n*+S*Z3ykHi( zSES*p?jDp3egxgwr=ibrJ$OaQ;+dG+V6^lmv>523TxJ9FJvj>xzjVV11}x~^(qVbsbi)gb6C$9iWE|esnqlvua_Czw4?_Fe4ICan1n-R1%v6~h;4PeH zAhRf%-PCgt^t)E!;ixb?k?;q-#T1yjikl}AB%Q}C&Sn9V34)S1HQK^ zYa%fMZ7NCF;F8EonzRyh)w|fgGdE)Qo&n~d`vPO<-4?{~majI4z3JqR2cpqGiLMl!i zSULz94{I2kZ6ct|Z)HPoX2RejUF4q9ZB#k4&UDEUtbI}ZL~4Xv8t?~fR8%8(R%e-Cya^k-ULO=Wjo z{|vi!XTo9z-^Wu!@o3pQkMWL@25Il1CPa@CP6&8a6 z#O8*<_V+wAYWfNM&Bu5`x%Q0YOMf){8j9kM))?FM0g^(mGPif1K%raeY)z*n*q7YL zfW0fAd-D$X=dqJ^X135lb0BuFGC4_$!cqDAEIK`6KKJeP@k+Vta-i(54M`K4IS1#y z0iSi-QDo*@5HecEmW2xXi%;W#AvXw@N4Zxj0`Akd9O;|Is0+QBnVA+*KI_x!9|aUUTGMiN|&h3TZ@>{+M@xy-Ec`H1{=94NaLf_pdK#I15> z=$#r>+2Lsbbq`AMk#aWjmWx5z&n57+Jp^LE{D*w*5zf~w2Dr*93&gK=f=vt`YEAVp ztD^~H@>c>+_#6yO)`uEFU*ese0Oo;f;nAZIFjS$?b?+cH+`qv#xXxp@Yw%(A&cEz= zrD7a)>VUbM2jQbg8Vpp2v&wa4yqS8vAhTTw>Qvuih+7hJ`_q{-6MLb>is0LoKSA0p z3p)m8gR<5#?2+nYM6`e7gJ*W&Zk`4{D--dj%`9rNumVR-=D@aqXE4*So3zyD;NQD# zI3d#)7aI5CSAlOeZ5aqEdqn%?C3UmPg-Sgj${7kJn0``pgdfK z^_BHUEwPRCtDG|HsI3iyX-BBkHh-G3>lZtEGzo0Xdm!R(FMG61hWmG=4*zWRMpJC^wy$ojF8RBh%<++fvL;Z^n(QBFU9cNu**z z4;8xdgX0?e5-eMN*ujQ%SSG-%0A+8Kq=v}%YG6IKsn8vjlS%pUuh6Wwf*8$T%R3{O zzjX8GGV&`X@H8$KQ(STp^X$(<`GRz|^zsID2v27&q`!d77#S*^dk;*zv}o$`40`5f 
zAWX_Nfb2IR#BU&m_%^Pk(fxraz4HewUY7~Vd$Ks`A0J>PeSt0VIuIh@9l^&9DR0_$ z&I9==q;>aFQn`Kx7%3g6`rd~0^z5IIs+&fG*Db;u{~XE1HaU!^2^+!3IS^v^_(Lqs#)AfvQSZqfmfz&T zQ*?Ozz~@}5 z^eyd1iziR8`&}Hx7?<-3{M7SDe zo34a#<{{Wzs%KJPOomn8Ph-#v6Lb|5=w+O{`VEW!G0&|UL2_6M+zP`mf1f&2udoEO zVh7>;SO?fFea!|n?T0R*L^wzPVR!i`)7IY$c7c`1`KkpbMUs$VWdK`^1Mz`#4oWtU zGP324$n-o#kA}~1X9*9SZ2m$_LqEK-c#18@r(o^G0aX1p2tOtNAaBoI(Ctq{b&u~D z;GqFlBg&ApC6U?vq8%)idU$1~E3w7sDKliZ414`6u_iVf?ws(2-m?#pA78|9^=&xU zeX}vYt&p*^%0t}n93+zJFyPZsRyTK;y>cQKzs_&NNR8v*o${GgJR<0!B6;XNXvMLt zGR5wqKhU?%2m4-^!NKlYMpwBTyT^|~mRY($hy2L&WnV#Kk_h>8y}-!vI>gM$g@f`} z*zW0nIZi+7At|*BX9Z;7lxwvZo7M>&p;or=d?ehl^+HMcJZAK)fWH;=!3%{Mplj!X zF`Kg)_bqVp;+0M<-H$)F%=_R(p_y3ttNfF_oaOd@Y{;mkkF`cQf59#yL*3fW0FufuRB& zMMp?5r=Ltfp|&}kmf$Pk@7e(aHxQo6dL#dfG#u-UV3Y3OfC(okL;OF-+zQ#f{{fak8Nj@j=NVa%XyRs4*VbN^U;auw7(JL z#=f9zS}+9Jwt((5L+Bc*22SKScIR;M_Hlo3FU)7f+aH5a(oEj$Av5Sv8)6TR*}{n< zMJVJygJaW}&eZgbqLAKj#n8+o)V3OCkmHYh#rKts&-3xHz&{Ntx51chQMRFY9T-RI zqw&sQxVNbaqIGXVUTY^NTX#dDt0CUbZUtdM2IhZxm^mB7sm$;vaFx)Z{4O6*6^X<% zJw|9cYc`(u3PlmYXA}ixDkx3wi@1F<;*r;7%uL^RjJ&)k{kf*!pvg`Xp8tLZGP67? zuWyb+|6FAd>la5^qp5g$s1d|h^rF&wYn+iD3#B52$ybdx2=OjprajGs_EooFsfsB& zn(u_K$s$C~Iv#u&g-3rI~E|=7lhzp$zGT(B84m0 zjDyy-ANX;?G{UO`oE&b>tex3`W?T1z_CX`~njuBMZ@7t_(-+}>vCD8~T_Pr({>&^j z72|Y8?S&MJHE0c=VEhx8y+17;%~q{OE&nh0daR6%5J|?nfi@tbas-NkrSQ(FCCF2p)<+IhfoY# z{Rz^R3}8&4Bsni5O;A~=%$B|iu>rQ&ud8b9SJB-@ItyxkNr}3h{f&K@tTMj?b=ez z3q0yuIeFG(c+kq{{P!vsOw^m%5OE>6bgU3pAFE=cPF=&@UQc+dFYLgaTO$~FW)&v= z`^02E5^$EfI>vllsq)?0F^CYiGnia=6JOmefL~)t`0kr2r{L#HY*`k{WSD5+)ib9+ zhPi~z7t8VE%V5TwnL@X1xB*>rTF@tL5sHexMjBoUul$=J;LRr7SYZJlUu)uNSs5}K zK7|O;eBAQ(Hd~uF4O(B-;r^Rhkg@&_ws?Po43j{3?q&~JQsOw28OeKhAQl4BYLR<6 z0zDLTap&@0&

iLC5n%-(ki61M;4w3bw(!^J^C^{=y# z``S~)@3Xn;_={+?!GO~Ul_0kVh9T&DC`2{B2C<+hGl_)b7*mvladLm44lm%^W$~!L zZFyn7vF0B5XD%e; zW&;k$Jb(e;NBE6j0tPmtL8vLiDg7cKF9u|NS|=79?xRfLyKsO_sNVM(#n3Eb+tf`e7(Gbiw@H z*(wy%^x%Qv5M=S%VU{66q+yjuRIstFw9 zGO(llEEtC;;^6K$2wFM>cPa)j`|MFnbB{*Zf0MDz^A~67_!RyQ#)C(utmyOZ*x?yW zTAq#P)?fGt>)o=zfS$*&)@;;iI1D}hFL6xsGYsn(Mt{>oJQb^O_-S<(8g&q^(Rz#w z`b%mngV02zy-u1Y;vV;vVO!4xH2QQ=5cK#C;$;pq^Ws6q>@Hq#{)(160${*t6z9FD z3*)VOh^PG-Or2u^ZO@m$W9bS!^z8~dbWbJzv7^AB{Vk7H_=n%e-y@)0h5VFnkh>`d zbpESG|3FvXy`Cp1wrCL)1uVwAS2cJzlI?P_0u9E=_g9 zBlqIaJN_o<{CkYH6-zk%mnEFvr{iG!2+_(uf(r>h1S)1(u-+vTZseXojWe6TZTA5b z1nvd{m;jgZ&coVw0c;e6T2e5ZrYnUAzM|BzxDO5au%&POxW)~&)+d-rh@=j_5WMtW?ONdq2D znnK~I9JqH#;=;C-!ZY91geGkVX~oO=LZiTq^v8uYR9K9N1se)I#4n8~*Q`;zA3*S6Cp6tWA)C z`xnPBnM)HvQ^!mgeXyR1TO%BNt4G3e^#($%24R6fWYy?W+;E*)9QB z$uC9!)Y$SiKXa(7EW!I4IVhMT1-HMTU~EOu9jC?PRFadV>*O$;j`^AV)@bLyIHDH}k*1 zD6ed68&Crc{n^+xvX@90{lLnHuHa*z20z;DF}JrH_lWq?w~3+Ps(lqodQ<4mC?k$;##xXm=%|(vt1UqtJ@7D8U4#WyCUGbs2 zh3A;zpn>pe!a}}AT?9MR(S#$_wy~wV(wWcY@%$rKY#?o(3H9EtCXAaUPZy5m(bo@d zF~u#GU~)c|YMfSKOMSPp<^dDt+~P(v+q1#hU=R-eo6lyvtYmHJ_wc^%e&};PNPU*^ zsl0}iP%LzixBJo)aCjcT4K*&p+oGNoqT!FS%o2si8o4n$wb_k><-pf6fR{@a(9R7$ z1rTnf9MqMn{7_FDNLnjq}FqBePK9i-5?k&$^s4dY+goZC6=>hY*CsE5hC7k;Ik#O zsnWsD*8cO;pKD!{8!!YhW1jQGy)mvqVwb(rRryv&=(k7JG-a^zv+sN-N z`s{7`b*#B(&Y0_U_J$`-#yxwBd%7pmL3bOve#jNY4j6F`6Oz&O+7M^E<{$aKk`b|v zkEZEXtq`!G0DWf|v73jcGf%aA2=cuS8uOF6MTh;FMNt(Nm!#qm!Ai#ak;4tIe@>1L zf8|a|c@mw)P=8 z&0@UBmpeJ1z#p8)%v2~ocozz+egn@Xo|};L4el!_qk-TLdajZOIrAjU>~O<%slC`X z`8%$Ec?;F567Xa_L-`{~q-)++vQaLW4E-+W>P~$E;jSv4^QTL&oL&NtYnHIqs}+uy zl%ipU5R8J4k>0Mcu(+fLi)@zx|7a|)V_`pUNYV}hwuQkZ_h7iha?sO!9GX0E=Y)@a z1(SB&gXH(iV7@{QYTWqEle=>fW_aJi{TjKjD>5Ea#)x5ze?It+nZz?vnn$eV0zt?- zRUTNfbScvGhQgxt=z;I{lLde3emt~EmFJn|TKg?`6@*WExTr($vq;VNZp z(BC6Wa9h(9em<&$+o#RAs5#~6*0l$_+Qp$vuLe?d3*n8|BXpdYg&pK6@eiE^65>^$ zwq!m;k$d2Ls)3|7kAnrD4`X<1DiIW{n2_$Ro!lkR2WgUN_ET!4_T8Fku#E zUmAkrzuUms@D&&Scq_DvCBc8}A9j@fB&~hj;2KtryncPKy(tTsj^?n+It(kf<)ZA< 
zpJZ(5CL~?;=-c=a@@EOa!}%}7H}=7^W%byyPXTL351`i9A^0Nu3kB}Cz_>3Bb$vf$ zh_4d80P+1MRij@6fPAb)=#bRWHk!Yelk-^`hJ zAutCET5k|n$LokTk3c554nscd<&5)&xzPQ2*stA+F?##J|Jp}x(jF0pN?b?(AxECW zoh8_D^B43k3c%i1x52ZcmKbYvV87n&zb1Y_4- z7{4Ni_^b?qZR5jnx8`^pD$auMkE|iS#}gvdIv^RGLFQT^TI|CFG8ie+ zQRa$?=JW%JDAuaZYdISV!ModF;37i(Sbr4ui~{eM(Oh!HMUXrY2^#0L$kH?=bSO@S zGVvrddL;`hCElX*rCgBvF$?sY>%n_Zp}@40g_GWuH$5uAo!Kujywn-vJ4SGgMT@{X z#tg03orSy0|L0>R!1d=ZF=b5$+}Lf2={e6qT$F+NUmhlGbQkK`B*DRGd6wjr&2`%* zVuP80tMpOhTvZZKLUhd}Q7sXbYI*@u)i+qNVJB?aa+7nnkztJj3Ctd8PO7KYL*#p1#@x)V5EYA@7PTIk)S#{jG!W3w% zNy16D+(_SvON1+2&a=O$OFZpDx!AV{LE7sWF-kfQw|vfWmh;NM<%2e7vT7bLu=yF> z;LS#*H+{Ughs{v)SsyH31;gapBz*KG5mJrAxGhGvVRq~~_$K|DYqNgItJyn-N(=`h zSvZ<1_zdArvB!}1rW3&{fj4JfG`LT*GB2+%Q>+ow7QB zO-UmmwWJ;*#`+O)DBu;dAvo;!)qIkuS9uy`!eyfuc=MAVukguLjC}uC(9<;zm7{{- zz^O2-Us`}C6MM;oZb^36_cF|pZ^17HHYhv!73yqAhOO0UaJ9!8KW*{_SA9Nq_p4Fa z>anyxFc&xIDq_}WGYF_3!Gd*Dpuaf}uP%B4pN;l{!HSC@o~TN;Iook!YpvnpXv7AG zBiN>FjGG5W(dO9CC^k-)P0S^{n#w+b!t6-WB=W1uwd!C_#4qmqhCrN_(?B-y9%J)c z73>oTQ0l=<+~0kR8=*6isuq2P;%k?&@X2Tvq+Np({l_!serfi)HIQefcm@?6N8sa6 zqiKc3KZq(dV3I@M;ElL6of7p0o6Xa~d^v%LW1~TKRwBH<)j+f-4P%_uZ2Zvuhx_#{ z3}=gQ!Cq%7Gks_PBXXX>)IG|yWZNiQuRnm1yjn8PY9CqmNQw44e&VDB;`H~fPPDO- zq0)va_$G5S+?}e+42@5~*K_wU@l+Tw+8To)t-CN!odbhwVd$q&OC+>w;me;y5}?_L zJCk&oNyQ0rWVa8#HNOZZsp+tN!YpXssl=A%ej_nAhq=X%Yr!DY1dA6?)@3)3?v;%~ z&FBSW@+V35rPCP0wn)%e{Vd!cF^QJPrVvl#0vxU215QcXiS9Hv!Bn0zJYVq>KT3Uv zSP;jDqr(J)A}nj}G=ztT<>6>~KAw;mfjf3R#W4jDB;~3m1RRVfIpNZjy(}S}W+TYY zFCyPvv5vg4P~=vmR&f@CCWIR)4kKoHbHA&TiOJ&#)X$p@{Y!*U<*dYdUakj8r)JJy zx)XmX$HT%ONiepg7=o(G(CK9WPCgutN@;TFK6g8KnQuk;o-JIbNLT-p?}b*?A+XXh z74sV-MbD%cywopXWuFuNBwz8e-9B*rv6$?5Zv*KXM(7t|3lFC?qn(`)*WRCwBTub_ zGXc*8Va^XO3kdYf>=pG-Ig@zCu%i2DvAEaOYUl#V?73+qs-r43d)%fNq&Lb$2> z zI`m&jME?jGoSWLl_3SmkoYf`}ve%bbNL=AM^2Re(myHu&n&IJ>(V!3*#l2GRg_s*| zf=f9)`1?P3Y*D_2N3*)fgp!@4iW1b$o=c`AA0`>=K6Ce)CGgT8V8O8f=EkbS2o^|0kFhlO%aVZ?NC`ZTKBY+?UD642;{3!|Qpkl6E3w00S;6fWj0 z$H>7n-+qCodOl0enuJ{Ttn?3Y 
z@2Ws2Jr#U%M5G1k-y@9!3-SG+KQ=yACZW^AVWU+ZiQK!1+zB1Se0wT51z`yJteTAG z&qhQ4t5#zBC}s>P<<>03{CM~X_^3ONVf=!4Si;q`gUb)zfTIXr>}|B=*%YC}%oGCDop4!d>L+5DxOFk@*H$Klt{^uhQl>d)g5ScGX8*)a|?qqm3kOF%1^2t_n0o*cOgpKL%pr~1$d+a2} zwtk!s4>nhjnjP}&=9TFXe`g(@Zs^62Z!Tk9S}WY`sO9#2{s#+}o3pbulj!XedTffr zWtez%HQh4zEe5t8$HgzyFx#OUM!!{K7d<90x5yB-yVo631d`xA*^j(>%(yyH&R71D z8+V|qs63)vL2yXzIzeR_>vZ!c-+p%x-Yp+g_LE>Yzx;xlb&};*+FG#U zmMqj<_6N7P1K9Ss7-NPw_z-xRU7aKW4U0~|de1CelDV0^mCb@R#gVYoApml1N3$P? zk7CKWA0U2r9LJ2=E;e$Dl_vbin%S_<){jsIZI`%kJsf7<; zCc)E3wF1?wgZMsDi%vTgPuvGCFyqe)VR_wsc<7aZW*c=mTYM?lr)dd7tio}lB-mi^ zLvk_iG7R^=!@098*r#Kq5IEs9o?DZS(;fuFdAG;h%FDr|`C>6xT5cexMsEVM7gAVv zP?Z90fWF+RTu{doZuBQh6uW4P&tkr!V!J=?yFG@91RA(0KM`vjlrZas0WPr{i`F*} zz}Wo?=<+a#>q}fq_s?xacmG;!J$xNJPVHkGSN$PqSOy<@)={aEGnws$nRGY&#RYeg zI9pQ7wpn=Ksm(29$)*^5csraN)?LZXky(Y87jtH2hL$3ab2|%6D}<7z3-OK0GLY(7 z03&l=la5heaAK+tS{j{zd72y%ENC^4(;6$_O}>pEj!(zmrHQc0E*4BoPC|ykRotE# zgi?*mu}#ATKbU0^0pA+~Z`6aadp6GfU;>5Ofw5t!5QgSTAm8o2$5g`t0@G-c0u{AI#;`#mOzGGRfO zLr21^&*t>5zZtf-^nyiDFgndX1$l!1aDn(RjJ9^>b{GwzcS0>WaX%f$yzAk%Ctk!E zS)$BSsuu)YdjmVSpW{4jeqvTZE2p}?9$hw$1aY1vFM5R)RxI9xzh-L#8#e*vKKP(Q z@MfNQLoMV7RD!p1CtB?y?E5%1Je()W>lHWRu0WB#PA-t4B{sPC`%#jeKZbg#l)x*$ zbtI&K!?S0{b8qjj!mRjAJXB>tE^L?u=gK39;mtvmKQH2^AFh*I{-V5KMkhCRdnVL` zB*0jK2|9~1LVFGK@XF<1c>mZhPH`roRmqFtzfl8l+GH_~em{$f*eFjd?kBhRbOJoR z)<{lj97OA19>idi1KF5thYxsVpqrzKW+j(#lwuD~XzIZ4ZA~OT;T6}v-5>lsB5@=8 z1XH?O;QO2?T-^tBoaRNW6-y;wf0e-Y@tV}QqyW5)3z@|E6Zq=hGIn>nI=9B@Bh1!m zfXvwjm@Pa9IjU)*?D}60GxnmDOFf+0y@Z8UF2J-2uQ2jaIz)E3;IV)OnD}N3tTHTx zWq&oHht6ZgwL`EyHU`Vil?eP61u)O_K+a{`68N*XkH>11X{nSs82mAzclS8st-rxo z-Yf$%;04T+lV(Z5Cqa@$;MT%$ID5~62`)`wht@9Oex=>S*>b<}%ZD>?=)oumdnZNR z_cfEqC8y!ciG8^4pgev$T8I5+Pp|^MbCpU}g8B`zXrld#9GsPjRrd<;RL8UOomL8y~dnv##mNl|A1SWeE}Lam0-s%H9GrMCo%3CO;oi@;F{@UJk$19O z{GS9EvEmhxTIGpL4IaZT+d}aAPe2Y#swVp}ALn0-A^UO?MY^IQ_qJprj=yl36Vxuo z4_n0OIg<BocU4u|&~6X_d<^yB#XG~%uyYYxx&W)$Lp+L!4&=_Zzq}apM%??NN&IQX*_78Ky4gPfqT9#i@Oqm 
ziPqE5V%%@C<7PdnHaSbiD;&V+ad$zxUY%N8D})`f(EuTXob}ZuApS%Pcehx#qnWe=UGRu?F-`WDLrN z2SK0lI;_?%!x@+Q$|venu%1KtV zL3mCFZh4iR@YZ|$bvuZ+c+FFi`oxs$+MtJ}`M05A{iLfPuv`C{-1JCKRyw8 z?R&Y8(<*S~crUPSeu80Q4V=TLv0$-nkWA=5gdvJI;gHTsII(>S8Z4LQa(s4jg^o0;Vf@!mhVZ;i{ShdbgK=UbHGU z&U_ENT`JtByif@LavZW3XK{YVPD}w(7L^j=&bU_(6~p)J8&PG+^s-8 zCy*h(p!krCuj|6f_-{=sLm6J16nCc{Ws= zZiQC;Vblj}bUo&e$t?xsn0El^?MJ-#`H#7uof^%vNZ{DL7Bs6_fd($u@sYuM(v!Od zW*yDM35#W+LqDAJI#$YUu1SK=;;!(lq=}fer9pnhE&OTu22A%465-QATqY;W@}E1x z$uM&avkrwN4+Ug+qb`~lZ^O4)$C!%^H!3#V$*j3@8$u!R^aZ>@w&4`+jm zLmw6t#ge@rBdJY#6{j~q1)3cWq~W#(8(;Z>TjG=tDb^WK;Q15kwIAc)ycJ~i#SkDr zcC!(qbeT)HHhRRZpte4{n7z;&+!pQ<%=}BCPv$uFPL5z#Mn#bMTOQ!%>oM%JQ~(~Z zSBCM*15)1=v$m?4ct2a4g!&v+oC znz*BS=b@|WE!aPXEmW|S2y0;LvN)@1MV=GRY zD#1owu*Q~0mjo;0(n;&AEEseb;M|HC7~hbFWhcQ8> z9%HU0;o9Z}xLo=HHa(cchB}|($@DN#E!4o|#e^~^7ygV}dcrAMWz6sTOg@Vn!#|dF zibd8wpncEY(qa2JYW!Cpx-N>&>K~`7J&m z&rL;fnZ023(G|+1ICy2c1x`MlCd#KCA+Ak{5MgS{4rr}m;U5&~$IU6=c5NJJZoCP4 zVLQp@aVNM;{DJzVa%5Dz68oAl1QD|%V8J_i`um?Y`^|mCW8=iw+0Fql_Cjvj#*2tiCv2ZYuRZ<-rteRJW=5YN_r!Vh?uY`PIC)`H zrVcx3_z-Y@5K-G`53iU4UCF&Ii2#OXxw+4D-X1WyudMRnfrR&;iiW!dq%@g!C}FM z(s5uKH;o+q5X1RSorw33JjX}tqG0lu{bkzMqmZn#fyQHspb?(|ddtUwis<~)VWU&9 zG-EAuTUv=9*#*>k-3aDQs|1#z8ce9ENaw1Jqls5Gv%=biq_wv9R>7t8J2+GP7Fo0E1XXd5X7f2&{IReXC!4N? 
zm@lJnr@Iusx^o4j63ywv{KZ&b(8cv0zJYSTev)4K=Ww#70hcR9l7F6EaL8Z`J(znQ zWZvu2#=#LZGd3RF=P9w*8n?iyD1)~2%%P_j)zT?x`{<^Ob&RZ2q(3$!;*(W{m~uaq zmQB3Fp4iFbDEn^I)%b?dty`J5O*UN`8^J6pJ7{2fDeSu(%qovoQ0r$~*$3}T^oTHL zcXq}D&svjBmb+ZK?oK4D&b@+fbi%Re>wld5f7&$Lv4U!?m`d07$wRk=3rcM@C;XeY z;m(^yLO&}Pd{!@ntVMBfgKS|2=SAM>!VWzBgoh6WE8zycLC);z0aaf`dgq2FZG7{N zNiJ%~TfQ@}VCM+XtukYd-BrlmzQ8Ye9CaRP2fDIyoW|Hnl6kNLjt9QLx-%l|>XQ{p zn|(lkD|1?A8$h+^W^j^PEuwsBGpGa&vyqnVm^C_+s4QIr8=W>%snN^efa*;&T(y{5 ze+q^rlkK6s_XZkB*J5wdbqouvg(D(QaNn*-uEORfNu7|1f>D)Z&%+={G@Hx52FjxP z%fkZWqHm&1cO2c=(JjJjkw>7IE2s|ofv=)(VS-yAwR!#@x%}b__i<_zbKdnA?t~ko z!p21?QSV7M7J1;jNITY2bY3uiZ#12(gET*OCp@H?-7$F3xC)1GPz)+E0mHE(x=lgTvNCgP{b8+~we zq8oZo{8)B+R5O`>E|olcqt7BTvrxIjklr6|Ch;wjY{`%b6^;HhI@5tE-x>u=e)L0G zffQV9%ZK2x&$<6hwt|qxeDzA@vTX;YQg z;pHz1;<*2La{u$%Y%6}j{=-U4@J5>1sxE_AUMm@tRALX(0>M2)hP`k}HBTu!Pxjk) zqMT^IYn10;Yzx=4o?%;7)-9aoJfz%wiO@0EEQR_r%n1$_=&v7NsM^IbGX59Q$#Lr$$#iVyf z(WWemi#hoWY%eL&iz{n*b8YtG`<+igTOu2xMuww|_(BZyDn!X}U3}@A%sE&ZiZVNA zQOiCIW~sO1v%s+b>)=e>souIcE@Vg{6j!N)R7kg|=8~ zDimE6k)e_XA(foHeuvvAl&F+4bP+c~Q8ana`~C_0dG@pR+Mn+LiF!dSWpWxkxzYf& z9d!_#tqxnP9ie6QKlEE}7ko1-!3-Nmw*JN$R`^DSO4UTd;<1@HlJo$Rb;K;>_YxH5 zHF7Lk=QWyW(1H zeTX@bO@2Hr5Iuao1kBEbFx!i7;cvSQFn-S!{3?F{#tH|aD>@Z(v*Jmt z@dP9aXW4`OsaTw-OZ}G|0Pnemc81(WNlb!#aLLvzb~}5JWilh^|LP zV_xbxn$j)di*m)pTbu(X)}wjvG5I)l`3(#ne-hFYB)tsDZ!E?wDS~f5lJMr{FSlbN za$x4_tq|9$0`_wPNLSzrh+1D%cJp8<7(ZNqZFjnm`;rKgU_kWuzHrPCNd8nq6X|R% zEsXs%8>?)qFv8^#Nh}(TE4+rO?UrOX)pnNblf0wC4@dFSo$=_cnTy&wH${z^XK=k# zA~~3@PICN^CZCcB99jX`mXl6WPs>va`{~?gB|{prP!&?<9f1Hj1-9(Y6lhnLXrcT* z)~Cu~QvWneh>k(WvKrCX5eLvPt7booBcP$rggy?`=gJk|bAJ~n&_3-9L7H0@Rd8}d z$!Y+xq!=vvC-RR!6_5<4)sR&*lU}tI^DCE6=k4-O(2VjDI%8}+cP6_R%1yJOFJlq& z_?Sw1PDkUDtIsj6DGW-kg~C7NRsdt1uAzs0TuVekX6;g=u?f!=Sa9NWe{`EXiWZ+6CRgl-=%V%swERUs+7`&L z_RKrn!>g^}dclB(_c()oTo!398^ckRGIAz*fL`iIfX&;9MM*_9R5bOl@M~HyG8Br> z3JW1wRi6w_4ua7q6TxQRUE(>r0>21@>CE|m)0wV&sn6FUawEi^Gg;5$yvP#P@zO|C zsStq^Z>lhJD-Tv`nJMwJYeHDsIwl!7LcsiW#eR)d<5 
zr8k?l_{Kv)TrL>YE0c^6V-RlG3l1WEBAB}rF!T z_t0IF+M#i4DV%6O56do_JF!SL8C@UcA4?_z}BH(yExYPEQ2sx39&k;>IQmmz_0 z9t-Ad!0&R}@FcXF-8C!4*M{@4Joy`(nspK%PoIjbWJb{`bx}CxY!`ff+Q{rWbm2x< zA06M;$gR@x;jhaXkm100`~iudoqn)^`#tM%b>dz+F#kHV8dc(Pl1UYp^wC9cUAWTt zbgnJuBsYEaR_f5X4U8mSnOfWp`hM!vY0nRgx@gNwJI&%;B^*HbSw!bH%888b7fI&* zE9mg|fdz+~!8Z0ct}HwnpDv$6S8}I$t8hj7Fl9C;<97kJ*S!}lo+S_B+GU*F!U-Vd za9xrMI|BK`YPf6TJ=EKN0S>y)p~-`J+^b1m)H7QbtPR7cVbBl>>D|E<@awoQio@)U z|9=*{D)k4e@7=^h{4&yqbb~3o8hi(*a z=T1q-Vq}{s(G(VxrJhZL58Y>x+I{jK)34N z!>&)d)Y!ce6!NsNdvz!k{3lJ|$LEpfT~~0VZWXlAI-DQM;V$DEv}s9$e3f84RzpeK zXgQ440PGQaP#R>PW1d?B)GWJ}0hT-{NK$w|?zYmSWz|arm*WxA!8;O8)zXm*^5P+7x3(;tQ4ok4_!Z-ew zIQqC1c#M1n>t!R%&Y&3EY}8=hnZslky+Ggld$SdNoj{jmm7kj zld9Bw{Z*!JqsW~yci>*;5Uzn<#g z|3Wmk5@_6Zhv>g}3qQXKaYxP`YV4uUMWi%v;R6X|iDc#soEoN)YVlx)o)Ejc9#@xG z;I^W0%v0J9egO*H2Pr2W5~uUee2ehY)8F|`PN`f=>3*`jSkmn=P(m)veS_7)f0=8k zD!8alHBRXK*4$ z12V5Iq)R*f%Up*GL?fk=^Umr3C7t_NI&K|Q9rneTlmMtGHHGs-7uaS~J$&{$8UC0a zPq_<>)6XeJ!|Cp@rn8qNUOA2_`>P=6-cnk6az1SMb&Ia~d>@~fInt4)1S{9Ol^CiR z!z~-tPv5q*p~}hzT5I|jO&nfBtv2rkDQ*uc-CoO8&U;B)I0NnvIR?QZDeCP0k=y)w zoJiToi!Y#u@b)?#F6$pkg^Ll1MPU^tYJ9=v5m8_TwXFB40lewAAkLQQsJb%*%@z+ammMwW zq&W|KrA>&p#RrxvvyU|hWa-Wtdze1(99Je1`cc=F+h8T6f-Rp#I~{M3?)r@o6}lU9 zZ<)j4gEcJti-dnPw!`-ifzax!a@n(ck!*r`J5x5EPRm1mpqV9MjmJ@_z4wYtxns^vxOtXo zw$;HMd5U+#vmp8RF?0`KC(tWA0}F(SpfbA&YRab3X?5G-qC=yo!F)D<$$0|rB@@j> z*8e416y=VdFHn-HG8FvVVgbPw6X~&4b9n!yqaf_jec1HJW(?k1h3?8n*#W^Pcw41H zXRa*9sTb9_S^vD@>M!}SRykxjkG>H5_w}S5I?CD{GQd=JJk^mt&o+Nc61JDK4K4Ll0htVDFupDQ7zWH0uZS@CC%jiahI8^{dt1X7}M zglp3Az&n+z$eNG;!f9&&*|L@1MtTwy6{MXTc8YVcP`l)52&>^#S(%RSI(% zV}sI%c0vBbEnp>;McM^}uxjCLva!kx}%WpjMFRmhs{ zzF-pT1LZz1IQBA#1-yI2EZdjkKvXx3;gT1h7|P;ugj#gpiUpu6m4ShVfAB+^h(E3J zX|n=B{QEbmw{)N|0oow+Zt@}VxRb^Qponu;W*Oqn`n9|JmiF6xCoU;))P uP*=H{%+xy$W#`JFU(x|X)t%w(ohMw1!!cr(KUpBuMO>jI$-O-2gZ~0=om!Xx literal 0 HcmV?d00001 diff --git a/src/autoschedulers/anderson2021/weights/depthwise_separable_conv.weights 
b/src/autoschedulers/anderson2021/weights/depthwise_separable_conv.weights new file mode 100644 index 0000000000000000000000000000000000000000..8a7e84ff79683ef6ab2904abccec1d88223864a5 GIT binary patch literal 20400 zcmXV%c{Env_s3BoAqkNZLMlb3#B=v~3`s~Tl}b_}&2uR}WgZF{3keBHgh+*X8I?-xQgvdnX z1C7#GQEj_qLq=xz&u zJZQph<#OhC)D5cH_z0Lg^T?0>e)?qbV*0u+1UJm-CGY1&5{tYu82vH^9i?_bWA7!D zs?TQx#CLEy#nU*>ZjE4bw~&|>fNLzGOn>w`P${$hL_9y3u1#@abiBjahvHK( zo?QboHa$ci-)A7o{$ivA2AI~TIe4O`6)K-Df#Ts}+{8WvrPKy68_b0?WfPRoSA&7n zLPp3YA5zUN*x9fA!0PCAMp*nlgkUpcHLsWXOOBw7T?H#YWf`Qus0GLRcDCmT3*6%g zjEm?qcG&$HV%lpwyU84Xh}^;SzDH=xy@~m{w?T4SKD_XbMd7eE6jA8Hur3X-=j{ak z`&N#pq5!a(53uxLBZjA};|QGgqg#z7h-1kaM%z%6KA5qYS>0n#Jjb*6@1k~r79Iy@ zGeiCk%cn#&Qh@95JDKa3H-(!z-wl@v1^}*G57MIn)bdp&-T!eCk+TtF7b}*-{nz_g z6So8iNZLb73xcT3B@cQ<`7JThYGe+sio-3H5;Uc35jQ5Mi2rryDmPwi7uTs$lV8xe zg6pas$M-#^PTaIsGv?ac=mmp3`a4pAp29OkdwC0V?kvQ2^dEj%XaWDdcmi9}FVNK= zGGL0zcgSp-3)^PolS>*xq$cwNRb6aPiyS=2Tp3aRX8WJ8K&p_|_pGFMKc2)g;~lte zCnER9Cr;(9YM44Uicggffqg|F?f5cVI}1o0P_9XFgcS4QyJ-p7VW0 z&sg1qjL(+{P0S*j2A_e7mlgp%Gk(h%KeA+r1x?LF#^;rxN(EFB*UFXD#b1*a4!pujpHS{)OHJyw zEru>QHI*-_X~g}Hm8Ne_SCOCDt@QCCYkFjwG~LyjLci-*5tAc!P_GhzIX@rZosXk< zK&y+jcpidAn&ZIz_nLFp;ylK`y$#l?e=zggTI#MGN*p)-#9fn=!2SIw6n!rus$W); z%%d0S;I8ZRWY;C`@#TDcI37+L3RL(JW4c_Sp=1c0{gX&+_NR@jLdc@!F;uT5gWCH2 zfy|9OdNg%8(aLuQsYqLf%a3Bk9G795{sV0LH^j0ah zsYgsG(a&t8C$>~m_l+jR_)-A{m3UHFAc&9sT+!&kQQ+TM!SIgr+2aM*QAS-B%k2H( zxmpNym=r*R)2_j-jHl?ZWhX2@H3qH1(@3{_7D1Cl8vD_Twq)()9w-r{TY~E7l0kjG z=p($fNL|OXZ4c@}$vG?eO*_niX#FY@yIA!SwC{Ud5xsg*w>4oppxsQ&kawDrZgGa@B=&~tBm-akHUj8V~*B!&=_VrLw5CVIrHNnCw zO7!>4NEqY?amx>#B_BSY;C_miX04ot==9Nk2%Dmb?NS~jV!k|%&yFQ_16P)zWN{OYD?h;`(*V@mT*p|AXTnmohtT<&<-E5pM=R;m zm_>Bp^0&(nd3zLeUiqPTVi9YRa}kS1n?d=&3r2q3f0$!@1)7CFK>mVp?6v*EMqaOn z`1A8&_T~|IC|8IPr$0gbk4wx`Z3f}l3UrzmhQ{mf$7Seg2BjF0qE z;#GQ$HXgr14!c@#Pi2L}KQDW3nes4(*dC)ro+`AlA%!$t)1zF+i}cL9Y%+7ymu`;L zr@zhqBc^Z^ZmgWnzgEoBd6v_-=_~JXjx9JrnKoDQR!)^(P`XLYMosvpvMx6pyKCs~bOoFZ5nCmi-^yM= 
z!8vXE`F8?-+5H4JOy=R2`!`77q|5L{#f82M)bT2l=!ZP*$1&sdFmf z)6+PJaMh)}vL1TisSLL)Sq^e8xbVHlRq$xh2>JL&og9|a;-8!5E=*_`uZZ_#XMwtyi4-ipF6>t@P$$Hma z15aKC45n?t0qspp!G}B?FuaLTpNrW`zlHea(mLFxA}?Zg*o&W0N9f0)JfeTygn-sn zyydo&er$4sD>=Rp*%69@Qu<8gD+#a=m`Ab-->}JMkzm!4&e&^eK?@f#!>|ag_k07- z6@1pEynqNUwSc25S5i*;9pW!Ni@ViiH?8Szq7Qzl5}m3n#)ljNqi=D{iTg(|e@qk7 zRl~v7{Tm2j7mBfa@mk*&x;ri$7R*1-X|;ccvyB&_&d)q>`gH-#zfR`g7}X@fStIPi zFca>jmQ3yqn*e^fk|N)nr%KGSZo?uOK9DK)#Mf~jd?}1#3ci@&-%q9ReTD=O)}OBX z98Rw{Oo2f^LDtE~7rv(W;JmGtWcR&7&Zv1C2}n>!O~t$XBd((4O?(mKMtdRkxgl3e zC;_+LR^!fezf0-b55!fil1xu}!8e>TjoNt?aC>{rXtZ$xePk-gKmJgeYu-If2TRiU z5*xn0*bjd$ZVyljxvezM` zzJWP38A0@5GdP4SL~k`0sGn*9H8u|H!EK}1*Zvx%K9}N`Vm_96U4pSnAE=2I1jm5+ ztk{=aOpRB7XPI17+t|j+RDXu}UpJstEC-}_?8LL5Ot32J5e#hPGrj3ekai#w%)hkb zk$QU&h&3}l&bvVcl|J)VZU4X(d>u#XzQ5ovyPC?kzVL*IZztSAoe1C_wP9YIQbP+t z7btG#ftp1Q3Utn8D^)k)T^)w{m+qqF=X;IguT3L{OLTC?Zb@pRe1vpAQb6m>soY*8 z9lrkq!p$?7#O0*1RGZv`+L}Iku-=dS`l?HdZ0hKy3~8FWy_V?k*KieI`{GUeKq$(; z3_>e;bkV&z{Jgv6@bcG8$h$O^Ojo^5bl&yCBK(J+6_(P~rfXXN0A8yC~PODEQdqav#nAwh%Ag_#&nb%b4sWoTWg$I*VKMt?9BsJcrQ zI~{_d-zFR~U&cVz@GKC@1BsQ||7~6TzAG2=$#yH!Lkef$AMBRrBdn}&FuddU% zZ)C~JN@u9!T*bi|d+8*xIQEXbF>$z83^60JSl6(CWZZVgx}h?j@ah~e2rFe|^A)LN zk_64r7)8aXT#s9LwB)_iG!hq z`Xr~m2S#>GqO)2usY3P);@2_@wv4plYF}q;OKF6iIlgGmyM$|;?aZ514c% z3l)90LTvpX*6_nce77)-X{Z;cI(LImGR2BG`Tt=fZa3g)$O2Ffyu)kDM4WTA1OJo; zp}1x)oBVnJ$LzE)YK|hQIdvb}muAx~D?@3->QA8U{2esjUdF~FhiRqSU+9Zg1o!@Y zbhTT7#gCMT*D)#dJT8J(hyDUrZJZN%gvS##tU=wv95mx6zyX_oaBstIdhEz-cH=oA zI+4hdpKBv9aQrj6R5;KBik;w>vX`z1v&3%gA@~+}n(`m=aedea=KZs2T;s6~blZt& z`ovWVl|QSau*L-_V{_1ln}ElwyFfnY;B2m(Y$=TnbHntLlZjiXIejE_8}%2B z@FI<;WBrN`nBD%A)7ThJ)rz-qd^WBmr#sd7yOyMpHEMrR-gk@`J|2U~ZFfk2*8n8C zDf4^Y*J4WkH@fR`6S0rJgr`qkBrkJ2iN~NKnW?J67d;$|w!P&vup@?iw#;JF&D@Ff zNC))Z_hoOm)#ISrJIrxD3FmkNASaWKn}>R_=H?;RP)#2%F7d%LykF?|yb3g%S3yqB zPsr$ggPY`2(B~@$n!VBRqFlhM) z9*af8;wP3J5LWIJO8_*Zw&;!eAw zMXEU$(3HJH3U}Y;e;3suM5>3VNBZFs>**91M<6?L6kbOL($2LGI3ndsY@fShz%2nH 
zQ*f8w)S1n9O>6{l`xy9dZzXBlC(nI(bu&Lc&mWa;1Nv`o#~SNWM%;QKMmXF7`@a(! zedr{JDP+;GYhEOA?=q+yGbam=C6HZ;L3DG6H@pnBqwV#Qbk~g8^p5{*su1uG#-;12 zVeNfr)UYR3>9=_6QVU?JWfgUwp~IKbHzZ3oD-)|4Byle-xEx_uerNJoNGg+stoS(A zC}aWaWETtF5~nB5zjPEUdWuIA(`nOCBkV1k&d#5=f&}m9kP(&V)c;O8?&g@`jOYt= z_}+x?_3sK|i46H@b)H@i;Ny|p1!TV159Z+!5uz3RnjSs5oFD!s2}IStfw)&M(T~{1 zwF+IsH*?^FlzSfT8odI#mxEEqb2n3;uLmt|Wtf@W3OC%WXwa}K=@)H=wI8S827O18 zs24-C&+mldHzr8){<16PJjCjBc{*8+g+syhG&3m}jybx)`LzRB^dXSc)HKnj9EQLC z^$lniSxlOmqKS5sHn&mDg&#_Pq0d?g9`~jVI}~gWJ72Y6&$ee^tS^d=;iF9Fh%W86 zv?Fo`d$4HFMiOHgPBe79saLcgZ{Yhms=@rmbUg{^J+vOU&u8&^C(VVL!U*=rEhk9c zgF}RmUO}tt|uhfg#+a(g%r#x8ZEuAyE%ppdzl4U_uCl$=_ICRxh|&WJp=I> zY0xvg4Am60AgSUz>O3k2*Y*%lF8#^*R9%6hjenTrsn0=ReOp=4LPb1LmVt(y-6$Gf z1)aCgK=00#;2kH4XX7)$VrddIo9Ls|t$OBrS~l)`c?c()uwZmumld#nWO5fLaz^7q zI3gMV+jb1%(v9EI(eyOvE>402dKGN!A~R@R{}+e8{)TxUo}y1=Bzr(O176I|!iQHg zz$0%h=D(7HevE*~;2uU)vHR3S$ zK4`6)27DC}R^~SYQUbti&G>}*=X#j@j%-GtIffbfk%BIVM0l>caS*xE6orOAgWl2s z)O8$33-uukyw!;V5~?5y3eY{E0{520q4>%~blNu^`lG`^(jgD{KB}yh$SAa`q+jL#EBHGY53X6F6a9B!r$-Re{92mbdgTjjb-o2!BnmII zW~17(Gw{{tElRw2jTbsc&}YSI&TNY?7_$h6dNqU_qz)qk{xXj%1JG!~dzD}ChpDfU z1^2IVsPip>7tr(o@hXd1x$z)k{1C4={zS9AelW0Ej6GuBjq#RW84ufwn7+&u+FDja zlW+x|9=?Y5J@Xi!I5F^{f6HTL{KHY{M-0eTBR6dra(3PZwf}0+$KQ$b=<7=qFgJ%H zzm=HVQHy71O5(fz8yHgfmWi%Cf$v?5z~bu?obzc8B+lE6XCGb0Bk_fx_OA)8Dps?a zZ|}37-_C)4Bcg?E1RD}|8fF{cg00)K;BL-&RKBGJhW)~ z0sOT`cmlcG8PS(PXx1Nz!aFx&V*5u(jl9TYIv+!U>*{RHTYK1ET!A6G^q_O?X4u(M zK|TD2>6aU#u#>Mz7AlD#?~4u%U(`ytiED|}6)8{-(t*^;uQ(n`@4(M|6AI0K4+0k2 zY)NDYyb9h#%Whww`O&NBy5>x{)a6DVODu))$5%mS;XG34^PKcASWc=gh496TYFF9FI5yp6wEihjdgm@?o^c=^kGZ%oF%xgiwdK3y zsqp=xe-oW#DgM}dcj73OPfOglaCII1NtB8cf9UNzu4SJPSNioV{;Rl^lvz8AZnG8U zuliw6DxyzPSm8z+PREl)@BcD8H!K6iJt5Gp{15H=bLiellPLe+Gyc#wEq;LXRig7; zg6p2C!1Zj1;m$a78qQn3qb9O-w6aH$i{V-Hy^|Ou`A5=CGV7Vv?md`ne~^YMy&$8a zi}*L1m-DZVekDI8k!{vI&u#Paf|`h6#?bE*^36F=ay#8)0@^6UOHo08j8g z7?^GhH52<1&ucQ+UND8Kst7Psq0s)&1M4di*!lzW*i9OIn6u?CdrqMU$6VV$Y3&ev 
z63T#q>S$K^X$fz(Q4ff3Qh=wb?=j*~Dsua7GH0jtK(hnE_Xa;e>`*qg4a@;$O>OL+ z+RF&({KCrTPH@6D1Cli?=<$V_q*!o-x@?(CKTb0tPdFdJK8Z^uu1q15Z~5aQ%O32X zs9Vj{=3!M#2>941o4B;cb2#5(&_&vo?%%nH8fb??{?sX?&)bDP?QMXW($QG|P>D`A z=wUV_&WD~Am*}gTakNZxK2zEp3CV|~NaS}fbYBt9O`D&_Z+gF-xNsi z|2Pu$?s zr4;x_=1`8+9$IU-nOJX3ppA19&`$0%bbeWpY0(OP+w*bK z)de(rP7zj~+r{)9e#Q2yk1_*yoT&RL1!|-m&3K1!=pWs1T(8>-@9L+s8zKssrJGdg zB55&JcFQ(;W?Kg}xwD%4?1>t;e6WHZ?ApgQIxWbrR)5Wj?(~DC3%1Z(I^knWiQ(g! zuJ|nLDJu82L9ZvD^)cGa)U%cr#cCJ z>nKW{O<}tAeljlNLCny2Ir}v58}Rp6a~!`Vf*nZ&*BfTA=};@2yI+b+EBK%ja)2M|s^#XW*s1Y4z-NH`h72^}S1cU^uVa{p;h-MyvW8gC; z{l#=J{C)<*URdFQDHDDf=bmxB)<34vp#elkroy4SQJBA5op~m=1hW%|;M{l{I4vNdYl2mwC}cU9znNLp~p7^>atq&||sSKfe`YHI9PMjW4YH!HGRoC=Y#x960t5R%2)3Z|F7m z!`_Y(@aTNZ=qq<(=bs~xZGCgXhy29!-VQ}ek^=ck-e9rw3dGON1rM1}wo~>m$91Cq zo0{H^iXmAzH!gRIw2JtLF7gOTB^l@@sK5+6IRA?jth7n5yh}pZHU~z zgSjIdkA0@0SUDmEp1*D|bXXS!(q#=&|P{+f?eQ0&6p3$&~#YO#}ph(%D z<#mn1xQYcD9*V;()rTx@DFc_zvrNsZ1#D$;5x&qhL(l7F%=O|2$m`R9%GU_?4t!qE zP#nZ2KSh3B1jyyY`4EEPSK{29A6G{%S`dyzuVw(rjzN^`@?ai1?(L`5saMhQFH|+^7P}$ zD9}2W(;OZOK?mAkAOWFH(g*op#o)->7&f&c0VW@h1m#PrjCF<#+83unnP3X)bcn(_ zp%3Vom;>U!mVoAqIyiE$fOkSw9l3cToD<%USx!|RxbO$r#<6MW>ec{K<6luSBOJoE zHG}?4GiV>J22SiB?9Aa}=FuQHaW|h8ep3qqsk3==hOMDnZJ6~K-v-AH7NS7VERJKt zO{S)M3WJ~3li2zqbFhpxM$0YX@X+o7 zT-Hy5yq33^wy_iL9x%hq+btkC!NB~FhnXZXje2`lfUD&!>gg%Sx}6WhNn2;4yTc?)@4G$jJ#j!z|jMnq9{HVbF>Hlmb@ zGDgEur zD9rI@46_d~|Im_Exqk~EGB&AN4xbn|8rcGRu0#m-HgsR=h3tIbJ6~e7`SD90CWB+ z-sLfsnlE(3Ga?-rCmavr^XlPL?i!TtiU+ruU2N|wP2y=cmBc9J<2J)p zxY?$eaqfAA)5AVN)5!<8A#*u%HlY|q#y-HS7eW|5_?Ee+p#h@5i*Z4o9Txr6!=S+H za53gP8qD~H9n=4C>Rt)Koul1suk2soe3?$fja|_uP!#1%4uHn46cjXhZz5uw3)vp4 znet4r3H^E#mL3U($92Xq?RN`y#}ZUO`T+~aB&ooW0L}yTS=hUN68cw8BL#cLn4v@^ z8uoTF`Tn(*F^hhQ@d0HJ=$is_-|fK6(|N41?P0jvJ&JP{Kg8$y#<-jt zp&6&AkZ3mtI{a}E*zOXxMsomX~ZNdNg5ra~aGbW21I1S$pdG^P1v=ayXpnbl*sKkGZZ(GVKN=86B2SoqRD4$h^WsOJ$1KP1DzRxbm^d$lph{2ko9@s~*$3TER@ zD6moCkD2YolDIhL6TUeV%JO91v2XW_z{gQ}=zQ9WY~gp->aiuvaxjPKAN^6>C=Fhh 
zhcN|-!R*oFxA0-;I6G;h-ixe8Cg{tS1pV73s17sk_yno?>fi1c?8%yb+BRC00XOg zp*pz&uS`FK5mA%rhO6gbV@)3#uT%k0*u|cm+Qz&sje+O)a&VvQI_C4+V3@Vgl-cF? zhTX}T&shG8XYq#sr0dDj;II?ymCx(oYB$V zkHRN{Ad)!RMR?!|a9&3Ty&u5$_@^d&iP36Ha z%VFkibstWZj70lW)zBPu52dfLIM=EaOB%v4w4e|S&dkR_jkU~P?|HaReJ)fz3}MQy z*Rlz(r-5mnGigpZ3rCG^!&?}Kqo3DPyOK+AkmHCo@8iI1b1t^E%Q3~9x8bxyA@KN% z3MZj@4ataD1&-5As4?d?o>3c{V4RjB&!q~j>Xw1TN=fF{f>8KxVhm=RUxfadHB3-W zA=Y{az@!%!(5Ph^N~)SOKg2uH>YXCp6|oYITRnt(7P;VF_?n~fF&z?nH{*aEup(hR^S_irXGTdh$QkS$7c|IOK?19^7DFhx$X=&n>X^Vi1^^iP3>GbBXJj z8Pqc3H`1O7oqxKXA*Sji;ZPXfezFdQF79Xl#I3-R4cyC@E?g{+MjPgaue`ogL&yyb1ImrNj47rz*ygpW4NgCbW+0k*odfi%^K$NpV)7tR~~hDo{N zRPOf>p7qUxnVl2-=lPEa$C9wCki~Mn*XWS{0Nz-h!}W~@bVf@Relpe}iPE;nEDyq( z_7p&2HIl#m5wA0IG11)1V+PaS<9FlZAon2%f|RDv1JYT%Y1}{Mt|x`4_wqrU<50zx zIA3JOJB~tukt{apPN$g)Ijrn|KiT`=Do}7kDtpzri3RV;w0}ZloZQVa$|PbUS5m>v z5h@VkaFTIwe2o3$tMK0h@0k3zhgYCh#oCx`hZL`+H2hr`vz@<(d1g~c9gfdtMHB7l z?T!eZRHqplT)N6LzT6APbnEerD-T-F6!Z|wI~o)svN~B+6EIWhC8!xK?&P`Adji4oekP6Y#FQK->~QULw3%M z{m|jP9IjW^G0XY~affz1rdl0_^mma=ulG83uy;1<9S*^TPwwL>g?y-yp96MlG~i&d zI4&NYifUQ&**}-JgItI$=5V5sFL?(3;2qX@@f&9M2p4N=C7892g{XONLa&9ja+Ysh z3)YdT7~X3HE+-oph9scaa#7$-^qu+D82+rAu>`68h=kEMnxCc**olT%T9NE zI!%PSBy5A8uT#lzOgz@S5&>_$v*@862y0Y$MjC=6;2XA>tn8Tst|op!&H*m&{mLc? 
z*n)IT1Do>J9Xfp6p}NByBUX)}w+g|rru&T58C71>o+*LYOU;&ug+L-stMg{|AF?QWajg}y-?_L4K<=l80kG( zAS2Pt9NMr43o>&--sTGKdLlqgwuEz}dkx5Roowdf1r0Lt))4<3&?E0GbD8_%@fdV4 z7UIE|aW;60Zyo?7KKaJzHq_zy*}`D@c@I0Q{Bzy#`b;8jYa2S{Xs`8 z^8SFcUy6V~uL&yR@^JEs%`iv4h+Q@iYtphtn69+giP5j3C(e#Rv=wQ_7oI^-@oJc- zd_b3MKI{lRwtH5TYrn&L-$!`QSMpHcwJMD;^~I8*aVC0)D5mJNGBR3Oc;$OMdr50K ztG3t|f-XH}>i3J1V~5Z2gksd_uhHVD6)~JG>bphzj>V>EJH4NRK0m>R#u)?GU&i#`EftfmJA7F{m zrw!QI*AGLrfG}28-3BF-d8}b?6C__!r(O4!)A6l$sG?LLE3jFPULT)I>?-P@Syq#N z)YoEnrj)X}Qpn4>aSMalK<2ygEV?54BeNzb3C(PG!pEJ$V03vh${lS0`R8fy{dEB& zxFHU=KAMi~X(90THUz%=RpySIpz$faAd_{>GBQhSfKh!U$o^12$ByT(h8~^EXjW?h z(z*|LmwXMOS~s3K_hgLuc6kh{Zi!QYGaE5eK@X}wnXv1Qt$;H(9y2}mcUU~U7**%W zQPB%maj(`QIFxe}xLqaC(zzWq75cCwZ5f?$?=v_|ev01Lqp;Q~m@IGHi)}j`^@rDVzUsmZ!bJmMf~HxG^3Xs}^ScvvF4S_H+1fZ2`OdR0n&~T!fzfa|^!y zcLA(&L+~Fj4E!HOVvtH1Uaj>3HT4?&di)9ecdY?mFKL2dn+>=nl+RZBR^VZE4SHGm z9iDYiAcl(C011g8mb4o@<0k6Vi{G-5BhSFj)f$Wr2I7U+`Jhs(!|W(@V{TN8a*Y1X zLgUkmAh@CpBAmCd-qU=_l&73!tw+y7Xr?o-cKrz^_~QvicG3mb&eWUIv)ynzz6si7 zM<7p88qV8n!K`0>AhN&@=LD?8yio?NO4TsO%bI;)r^iH})q^u%^yA1%VTMX8a1u2xsgC zxs-0U@M;T8cdBO%>Yu@e2~X_7YCHDYvI?*{Dv0u1!r<3S4G`EO#qs^K4rNR_a7NN= z&=jzNX$Ai5gA-mbCU^i}+rGy2iE1FUEROU2R1Q9Ml_SeOiO}$D9sJ@Z4Wp5=Fzvt~ z`}Lp;ylHA@3i5Jr|HPVz@%qHxQ}*X+Nku`^&#TCOn+yT(Y5ZeN~TcdGw;2Hu<}_BaO!@t9tv`7 zWPdq(ed>BRfA%9P^c_L3L@#K5J&Ec*eTF@nW3cnN1_td2ganaFlJ;;0iJmi?2LDo{ z8rCe-YgBXY^+(|O_BdwlwY8+KwwjGi5{AFePjgQ8B zHfl#Sv&SF`_L!Gp;>P><(cA@S7?*zQQRCgWo<#a~yu}++YN0;J8oi=)Xh_6SG@RhI z`%n1O{waPFJpTt)BacImeZ2|mx5uNHK>}^wx)0B7eG3XjZ(vKSDQu`Wr;33Ksr7>m zq*!wa**g3Y6nT4TM%hxDcEt|9Sv@^SnzgtvWQ55yYclYnFFHZ?;O zZFmnmbiYE2T0EW_3j$5K7hpVdE_*&DA0PIlVv^BHrgSh28~$Cul6?|nyOuvIA?Jh^ zGk0M{8KT$6JGgmO6HkvdU<75Am?E3WoCW3&F{fr3g9hqs&&wIG?&MssnyLdYzl6iw zNssVeeJh%+X~5FUmMHPDl%v|j%& znRRkKmo?qW5Bn^}zr5f8gxeMn^M^UKJ7F0_^$?6H>RM;1Wm2OjW)7Cpc2O9k(4Q!QrGC?1njW(0h{*se9)F3yfxgsC*vp^t_v> zB=VU}Kk<-P{6qjhER4fw??dp=Nrv#6YMEnEndrJL8n;>JL*ngr_Okx6iT^kk_~mzT zh2jSG)cLQRe#H(};_lZ{r9X(NbH1@NO}J3lB7*ZD@?o_@Fn%dd0Y+4iUQhEyBd2>f 
znl8p#j;67^%*nXv@?2(DM-y*cWfne}-jAJIqM`X+0w!3z$Ll5%Fy_66*>~R@ydw1B zc7{8ilA6#ImAW*0l{4-Z1Q;E6g9_aV9^jfe_O_3KY_<>y_;s9ao3ak~y`4f1ZEpjS zf?ixxvH=EN6rt)(4xW~&W)~kV0x8L(IJ%}24p3vJY414mSVx@Rv~GbE{S0vQy8`>R zyk-tQytzyQyyGU$k~rMf6oMlQyJ45L2IYhd!jpv+ zXey{io-3S$_`FVLKxaQZ%D)a$4)f`a?kGIM1i`$apBNsZOc(yTMjBmG=#9!6>{p7z zYyY`Hx@QYds$7w%s1?Bx)dJ@Eo+i}D?`D}SwfmQyNF1WHyxgnPPRI`@LUAu zL)x8}AT3f)du^mpX-JdGOxy)xSRQqI7f*(+yrLhi?~*8qX(YOF6HSF4*tI;EgsnM5&2k6Wmgp>#gHIMQm-h3@wZ6EK-nmn;$8;Os zky;1b!V=IC7-ZMnJCDOccCc^9d|sroBTZvvV5W{JX#Qo{?^4<9vsHbZy}F)^@a;@A zoqmZtKdVLccK1`MmzL~_lmcFi^fB7>a0u@U8!XjOhAObe1b-x)h|y zxcDsk=E)FivD5+9MvIW07C&*?hjeJ|nc(xjrLsd+RDfUKI1;Xpt<&|1#xoD*ioXE8Zut*SPjGsr(YHuBy+}`9mZAOQfn>edW)hR;N8PH< z(yfOq*zi6Rvd$zMmkw^k+JnXPl4m>(Qd|v@!o@V+D+n*IPNoixRgj!k3BHoE$+;XG zkT&*Ux-Qw`J~u9jYEp%5vQpH^OrKb_E~H2Ps1ljB048GIVz!zm!@a-P4n9j=;BFh4eTF#v6m($kmn@%C=F^Lv7cySG6xH~W#%MZeFX-~j8^U2_Z_H{o%fev}VjkW4%ZZ(&l zzNJcL>ZZVox?=oaP40iSwtwnY6RgY&Oj;pMow@|UBYKdL7LcKT4$lIm<74Rh*`Lk) zI?VZC`w2C_pGVOx@8Qk_iyB^dj8z)(h9_F;c&qR*8~m;kq;Bj(*A^BFo>agw`5xG4 z{2KmO(^LNcxG+_a7%eJ8n=?&N+DY;8h9>a*`X1aGPH=`x(s_nz7jcrc7b|yWf&)*M zCI{BcChJF*!Sd)^P}2MpZQuN1T}}v-;<7-r+Is?9XGP%x@it7Wzdxb>(qLc52%9SO zmds1lpvUAU*yQ2MC^uD-` zA-Hr28}o+2&VDI4KSKx3${4dv+^@Vt#wTEJYXD4cO$P7v*Pyr4n$eRJBT_IALl0bp z&Vb_>bq#n~rzU*;N0f-1RVE9LtYn=%XMx4-LnhZp-0<)7aM1R@!lrNSggHO;*qA%H z@avi(ZuecnJ2(y*-nSQv&iCm)Iw~bvL`2yqkuBmaiFl*zkxJR(ea^$QO=)B+iU=9YP>Pa9`n~f% z+@Et_=eoYv(f4r0>UCxq9Q6=y=Ii36t{3de?OVX&Q9Q;>1;B|<6*!i62IP4K!hF3# zNS3}Y%Fl(+xN9z~bWj4DS3hXvRT1m&^c*Lh5{28X)7f9#rRfAe46Kaq;Q7>tXx-o@ z3@o^YpZF>4bfa`$u$(o!^J5DB@GT$@WGh5G@@Unc3$n3)$Q;i8VhH_1?PS^A>jd@R z5i5mb*wVNGE=_S{UtcY6<4s#Acgp6ibnF6|nip_>^-H=%q6TG;j{+BZ9=smbu-%c5 z!QS`@I2eZD;JU-CMt=~7%O~?@TfZVj!lgKQs|f>LY;mw%omHI|frj4#AkdTpqsmk) z4D%u0!L6*4?s0b8raTN+{FypNr@*28dr&#Sh9teq!B(pY+-xTD#f=1b);5`X&d6d{ znDoK>IWep+iGlh-4WhMeF5SCHz+U7#u+9OiiR%yxuOCL?znW@ToKT8_i0^2wt;$Ko zSwq9znVg5xTwK8qL&^MVc8@|LIXWnVQ3rj<_d$DDnrhCwX?GPvJP#5Hqx7oW4qNaK 
zt`Z)&Vvb7$_h?Z>AQXJxMv55$NQ6z`p^I0svZff{tHpuCvHzIjPT|b+_u-AkQA~G| zL-X+)K=P9y_qSMxHmD_%%@**CMw9*1i(q5OCe&#g#4lgm!8N;)SH3C)KTOu)JX=G` zEjiCFi8jDDcRDcTc@sG>q>Nb#a!lPcN&48$5JooLzykSVY|$QNyC*a8ic}06EWtq7 zf}cpJK#2%#Y?$piXYg6W6uZwnRoInih3-Mm=&1uO)L(HY&;8|Pn*U6n)fDM6HOp(H z)@?mxNQ}YxJ@;9Q+I!g8AA+@qu8H{HnRE*q2O73ztXt-6=AdgMv+U)1I{a6qaNV^~ znmDi;|FYQ&*+1@rXMZpk_)ZUkTkIgLY&|2aILobZ5Os<4CaCm^d5p{vE2d?S23^&$ z5K0?lShYI|L_t9YkH^YzFFe$^hy+D)O!J*^#?o5w-l@;2ufL2&y<;dpFv?C=#n<3g(Rd2pD892IVnlZuxIDcvPqXrnVn>7uP?cmX(K~?$1kjRX7)N?Ur+aA<_8c z#UyLnxsyowjd5l%`@oSY!t~F>5F%A7tXsTXe6JBBa) zEFjh=62bx=z$eE9l+kgAIk_hpxmG1wyJ zDPn|sNNmOf@Kb*T7em+bEd2K205d`^jTzE9r33Wq>dlP$t&Q+EU-Ca`LFYu4Vh1mW zHUC`=<_)#sQI|mW&TB8C19|wR>_AoHWj}b_dJAr=nSVh%8lFz?hT)GdSeZqlF7?W3w8=u8J62l9X8n}H z6qFc>`cb|(P16Olx_3d3{8CnvFTovYazZbWKIzJC5%p)&QMb3Ei7Vnv(Ir)L zm7%4S|0N!e>u%s8k0df~aay##bsRNfN~zTHbl}b6;a!0ThR1KBR;$)yz`HcC4r{~W z2^&bwI|8L|`+-sX09n={T;dGM5z z`ygX49~;k9KzwK%_DyD?vWGX6IM1T`#&ejY_7WI7dx^A$Zls#uACcem#+jc?QyD#% zLaf%5Anlc@Br3@fCbF*JoLWuu^)4C2QqqI%(lNmk6JoS&XA$|V+5<~sZj+sMOX&Ej zZR`p(#Dcb3GR^TD?$@NO-_Cn5?)`*4q^itFi*{xEMt@q997Lt&r!oKjD|bJdJ`bmUe}qeP0B8i@z&e;q#gJ9ddzxb{NxhO}VIB-o)xK#c%G2UvHHO)i z!7Opj;CZaB;Ax%VhsWCn!b<N`J}5%nWh%tHnR~ z<)BEH{^3BUABclLFcDK0FM{HE7Z|IY??gIO7X2ERbGrhR@bcJvCPdx?>z}EkuWKsT z>-JW-sa%qWI)T`1FNxL5s^MUu4IKBo4%>Bpfsn&W`1j0mcB-Qj^6UDDIzIx`B{Oik z!T@_Xq!8mzH6kha0!#)UEkDcR`?LQL{h|)g5D$ZW9<6vm$_veonBdpoEH-}Q0Ja}9 zL&bBV%*FjHxLf^TZFKJ9yLF22?XC?3Er_RsEBv6;T_J<8HXufZvEQ z2@fXKT|SRUd}2DZtMX`h$!FfQ{zCRGxYB)Z%)qF=8U}P$^7gw^9Nz4Uw}K3~+AkCy zFIWz(O2gpV@fCKYSyjKkxR*27wi#t*6&N>agsSbL_YpY23l#OwGhX}*xh}sFskw77 zC`6OqJt>W5yTah0K${8%<(U}0+xX|!4Ax`qpG12uUwFDz9mOLB?EVi5RQI3F__J>_ zPf~Lm;Kxj;)P6v2xwn9af-TlvJ4VINn=vPN_nC9snwO zCXr&oT@I4mPd`bsd%7h#8>!o%TsQ;QhGqi!up6gND$?NTPl&8(3Y}=mW-Ll}l214D zG3rncTG~aya6}w?HKC!gPJ-Gi8pkT=s)xS}C;@65u+$VbD&BR8K(96JsXYRf(!!tf%%Iz*-2PN?m;F z>6{GJ7n0HQupychsWN-Q&FBK_r%d3xL3&;w2e*CWn7Z|%+eJ;j7+ow$7Os${zRcyy 
zn1I7@Q^lGrVPBJv!}I7`v72Z~J26i$k})tf;1n6p2;ZoN(U>8JhRi3hXzBBlp09wzftwvfr)ftVNS-;l-1%#?>FD z6wafqgA81<94E(~RYA9f1XZgJ0TQkYr)86|t0MM=M0PQ3kaOaE2fs=G^9?#X~39EhuGYJQ;wKQnP8z_tTiO(SV!{cu3x7dow#z z`S@bIhnT8`;&4<2;0|e8{hnj*SlUA=#u2lwH?X_Mh0%W#1&Jq{%&z7IV~dENNuT;a z&wbzvz1QiXWXD=C{L55mw||7>2_vY%h7xo<_dA5tmcWr@9##x}!rRx%c$+uKQ1`Cc zG}uFobOx$2ypIjc!ZYdExo$4*UtR}~m&(Kb-TG|g!5@&c$e%NPv4aHP5RgD_84kx8 z)0DVwu>9sjt;3tCprM3aoWX}gyof%TcETtt9(#6K76eWip_V$!x<>P>G_K6Rr}m3c zKemB1mw&@FP0>B5GjeOrHtNwP6>so=W5;N^SCM`bfslT>j&9ltG*nuPAkv*2 zT;6yc1>1Ikljl898Ze;K3zyPxCm*!RH3Z4MZ!xd2g}J9x1keAyik4T~ps+ld*LI){ zkA%JAgdf$ZYX2=LDoHgV44uAqhzcd7jdfq*6&zBuP?9lGJ=| zBS{fPk|Y^PDwQNjlHd7W*YA&W%|BhInsd(m@Or<_WT!pO-9;kN|Bjyj|EVr~8VSem z!a+-8pe$}=J50MUMeiJilO4QkyTQ75U)hYku)O!Wm_}o`?_DG=xc@B5QXmt>N@ZwuxO_(i%qp7Wo*iz#>g zbHv)JF@w^dXbl@g?X4MDVCIi+Ij3pPz+<#$>vMW~Slf-CZ&HMC6PG>v#icvs=~Rh4w*YaR^?`vx65M|c@_quN>n zD2ydIF!u{ZK6=k4kFFxk&QK^G{m1OLOhxs~{^Z=&1JxNT@jg6|jHgV1Jh+9@Jc~%P zYYas3J0&3>pYZsv%gEL}jg<{eg|cE0$(*iocaILr&G({x4_`rdVk|Y;o*VQhG@F3pr7zH_44{xTA6c3?n0e`ONqpHV>~?()^Nb(7 zXj2Xq&nO_-?>%9zK9||gs^(ge?~$Bd3hkyul37iFv3oIVw!45b%`~bw@&t+OHK#=y zBr<%)wGwx*j_kK6{Z&s&eoBnl9Kn$7;dp|2;2_rMVkL@-Xc0jq9{k^*6$UGcH~QpO@rk!quxw+{X1wkx>BF2 zBy?Ulg22B=BkI2hbnTBI`3h^KecQu(KK%%aR-XKX=!MyES@IS$ zzonu1Y9Oq4yhce&SyzAg4<~TOi zK9c^WP6>fOCECFvPf3>G7a|qwoqKa1JaFm=E@cSQQET< z`I#LhWE$6pVwc{NySK)W-pg1Xv>}K>)Sgq-A2DQd?;Ne^`az2%Ui3&ajD{H%z^XNm zmG%2TX+a}UZR(4b4Sq14!lC;*lskMF&c;6)favoBAbwPe>Z3|tlRuNJcU(jKiCrj2 zTZqS_Pm<-*II5E5Qd#PK689;h*cD^tWsW0|=N!nj-gI$)&8PHe!xP&2_5m$@bAr_S z-{Z!+gR$KZ|jHg9>}cpqA?SxV!sULxP$ zJmG#LgU;^10MkQ9nb)UAG`Fjvo32E`e=(vA*+7Du$j6S5jt`t>D5s4-Rrz{ybA?nyrx0LAIU@a3wxGZ zOtyXzP(FBq_8%T_s5mVNxt_Uk^#j!_Y0(0>;-^bdg5lSzoBNhF;Yz=A4=(SzBZ z=y{+KMH_A4wjvhoLy45WU1*=BfKf^nTRH3$9q(36((-4J_Vp)SxA}<9`V*DG@hI3= zNX~ulGTXDRlsD)JFFhtl+qlcf*wlzwLsCd*+@G*r-h!*TXQ`m^C<5c+DE!qq8ZVtn zwOI-lKkFkT_l9A?j^QNEkHEiLFG#xa19W{_;4K$ws?Q}!?Uc1pEa@xJSY$;dZ>(Ur zI{`ND9+GkCM{e$F31{6CaJc(W9$WNWqN9EsT|I;vB{i3Oow$poZF#iq%6ro9oeU%S 
zLcU!hqw0_!_|?%$eKZu1?*5k@7-~VH)HgXR_otCarzfp;3#4xMg5mJ(1#HxG$^YJI zI&sU7I{%7-tht3LzWR}CV+qJ!Uy%w!=9X zASt4J-;)Th%RtH6zsdGyFYbLN1(7k`Xoecf- z81i57h?)A&!*u5lwDOJ*GW0xP^*)Yk9yW&Cfb%T5t_{-Z4_U^jEX01$f~cu4Q&K;) zZ7)K`wN-Eny25IXT_FF3^Uyjc2d0nZtgC7u`RFfT5USboh$<*Bih?w%?1t7GzQDJMwm`;ruu5{YQ}c{9^P1zdL7 zf-Ts%lm>1XOUY@sA?vtj79D3sd3wJi_PFrOIW@6iyJ9GOco}40y5x~Fmr!!;6)JjQ zg6N!1uI6?EZrOAA*~i)x`8^eCzFWxH@+S08-6!`U!*SoZnVLVoBAJn=*%{Mvns(|5 zr47+WXT(a_cIH7h=^1aE5P^xu_LA1E$Lvb)0BWPH)Y3VFlK)MDbI+CJ7!%A(D{GLp zDq0@yt^;eY?U35fhiKL-6ls)%Idx4`_VGH2P7i=?>T5Re*&P@xJxEi| zsvtkV5FT+O$awY{KCIUaB0VoM=zS1RrhlUPvqvefOBcOfC`mn4MI9^6$!@6!Y5ohy zw>u)l@BkY-G#B;{SCYxJ1+Y5S$z^p(5@V;Q@V}Ty>vsM}n(mdb+E~qG!)D19(=YK+ z9^uq^XB(ES8cfQQeR!nfC~BW|j48U$lgLgq(A51eDLNnko%=k{U=&9s*H_Z?>pvi} zdX3mU>ae&~1Fhk$tYFg-+T9XChPBTSxYCx~ZO-EThiY;iOgLZlj+&otV&&3C>|L5h zrbZ#$>cbeUDL73IgNGyJ;Vot*{t0Dscf`tEA=y2ciZ*FOw)`WL{!`6#(k-!jR3_PO z38JQBFK}SQ5IA)_CTT(&eT#feRh3y3{GpSQ7QDfU>tQ6eUPY?9E!6avC51chg!tN8 zWDSmn`r1weO^%~F$6~5pBwXvDAE>p|!hv<|T)a7h8Ate#-t5oxmtQtH7l`h3TA&F-DQmQbz zL+e+HU%MMAZ3jXc$+A6d!Lg)ymHnQD*2{wI`ApV%jY=*XyNegjV z?=~j8WhohS<^k3AddhR0vuHugMOryHfMjib*@}j54XC)?1n1C~h}<&< zYEJ3oTQ`fEz7}BSxH`z+eMNduDH&xy<~mVdARBTh=Rklf7VEVMGrk72QYSLCnz>LJ z*fRBhqM1RXH-9m}ht%S`kTl>mq@Ii)>k6dWmU3o%&x;#ge+$b!0kB)MoHnoWqriSE zq3kvu_O~2Sp)w`Kp1bn&US66yDUx?ir|O@cxOL|#xzB7yWLRIC z8MdCvwvU4D2Op-bibHdP5fts37*_n0)*abO{v*|?Pq#WU=hI+$)e8?Crjb!jGPG>V zNi}AgJg&)yk{29=cjZN7Y5&eu!8=*=d2LjU>Prq6|3&NX=cwZ1boAL7O=9hOWS*Q$ zXBSt)b@+Ka@MuBH(=13tYe_dJl9|i;Go#`|bla$ehWvO%wF}BwYyNjiKhi1mqVJOQ z!jEKRoX6bm`LYMB6;*C=m>4IAm*0BIvtEe`ui50TEr(_A2S|%q!4BoGCeyrVI4JAn zB5_f+_jfm8o>hlOP&`CuUQ6Vw6kOM5I108TvOv7#c}dyFOY4?bhJ`2?a>$t{#y zb`P3H%@o`2F1NDN;7S7t^A8IkmBUP?Hs@ceXp6!3GZ!dZWZNnFpO zx*;DyJ)+^B_KY&#RPo~A2Dr?5OhKRDlcKjV*Si>w+Gjq{51&Xs^KQ}E9h@3Q=0n%O z2W4+nkX>5FqH7GP!9v{DUA55qFGUY2CVXnqnOegTuY)!KM>XYQCeLsrRU3XDvi~DzI$M zM|9LEFtYg}S*UF!+b|vIDlMQlI}K^lbS@s>jVt4aQSCp=p-7kI48cs2&mK!tTn1 zaXo(_*1>|+{_R8N3rr9((UUUo`%}<>o3L;>P11!?O#H@$W!~yRR>w==;oHTJ5BWq6 
zJ;Y2Z>q|Nlj#7o*W{N&of!vrLT^?RXb?qde84M`mNTw;3r0;PHVO{Oms5qT?vO%4`nh(#Xi$IB@2 zLJN7!SKyK%CoRi9yuKoer0(}vx1CqX|9L4>T$?G8?%vJTy|y6JyZw=hEhPJ%2Y{~s;2_K(R1O3MJi~O#xcF1izLc`B>BQGB;@K*v9Qy4@90Zf zg`CCqm@d~%)57-TP`W%KjM9GH6MolTdHM7AaG4MZ=fAC>5avsnuDY0+GJwgFBKgsz>2TXLAliQOz%3k=6bQ@C1Vf`kiF!UhPYI7K<6PnM>W*KjjsIyYe zr#F0t*03I|z;Yztd_0?!|JXx4K^xl!{UEF2>rA(5J74xQlBRhaA+5Y>-n_mOig#xv z@`b+0YOCc*`kQE#y0E`&j%3y^Gsq*am`5!;gY*4ssdX3eiXA%iKs|)gTAq_bmxLb~ z|C0_pdrqpL`EZSyK~K`XNjlt{ye?cp7OmkSqz(65E3qkAPU9N~lOp0NOSt}&Rtxu4 z>;@$Tj?RPqg5hXA<_GVy!>PflOyDLLI37MH@A2QcW^*+}%`Q2SmJgWewVi0Xm4dFQ z(};a-Lx?ftS*PKLn{kO;g&6t{&t7;-FS}_w@&9Y9Qz&mq6B=IDm-5^o?^=D z>nIriiZ5~Uq4eo@&^GrhEqm1l?^|_TYpIAk)%2n$Z;}nWhW@Rjen&XRVRI~497N%bez5rmP#;@QLAr6+RGmy_UtiLOaWz!s-9plsNAf3T zsg%|`3U1MlL;(!lgqO!;dd71(95_F>m4 zwxAfwA1267{YuIsYUFEl4<57n@V2mQ^w}|i4BqG=uX+~}H(W-pnHt&jsm0>+$6)kN z0drea#H6kV5O}zntIM8~dYB__)&Hb~hSTU-{{?L+_o;Q-aVSC}aG>XH=&9dFQdb8~ zUOY@Tipd5J?Nz>oL{JW#1CLsdN(UVwyN(hY=?#0BU4!G;> zC6C%jO5CkrYpH-LdpDxnU=L~tm;>w90z}oe(&Et(_}#sk4CSw>=WC%3Y>9!W?C}im zGh%3EjAyF%yJ)K{m)foe)8eg32$}bZW&Cc0aML?H>UJ^HFrOvZOvw zgq_jZn2H-Cu&#F!#+Kbe@*oH52)Pd1nO{h9=oltt$$4_#^^)t1E+>ihPU?`~B>(&frqIlXc%qWD zI+|ImK`WbLeG29y_sESjUcmUv6XrT_7WqG0jB)+}G(GAZb^JSv+zg7DczOaQTUelZ z)*8y}(54+je^SY)tq}jVpN$QS#D+TsB+l)ED6505ZoNna;}fvmqK;(oN=e(=uV}pZ zhALVY(#p&WSoExmV*6Dxy|gsAclU!zrw~~BOH4n+@gIa^9eBU2KXz`Lz=p9Ui z)0{TC+89M9Q?jY%@qL_K^#LO5M6>GPpHO*TVOFO{(5!a?ZxC4HaQ*XCA|8s$p~7=) zq6KyR&n*61cPMloll(ysQiXXkJKZ`cy0}FB+hN|UV~B8HZOZK_?4^4XF-P+j;`A>; z6#HmK?ae~&s8|l?nP;eM(m~v*c_%QL7m{j2VGjJ~8W(vCFzfk`J+*JB)no!#ldpxmaF6y>qL<~!k9GlAT?8l+^t(V`G!Z4OWb!v z_RnUzR~ykI-~=`ARnev1AtbHqBDMKN)Fx5+7&ME!KSU!}*jt1~OcY=2JC_?(G zJamQ&dri<6%#(;uFZXM8JHsHKM+HlUNdtw>B-LUhoUL&u79Te*>m3?S#p= zQON2Ois@r+!?Irl?C&R&^X6P8+k6|wK5;NUe~H)Z$%aP%FKF2)gHF#zq-_|BjKDBH zVoNhQyqYJ^`SO{}V4f%aCbc4I5CnUlU*rx>@qI8!67s4&g>1!Xnxie3Q1kXHOeYrdHEF` z$jzXFgDEI*IK$-8N;rrjcrWQ4+Sk5~hDF{%@sCuBH(pD{YjfeeBZS!~pK{~z%&r3idirw3wQye8EbOJ>_tO;Q~#MAJlayn7Qx@&@R>|I9_A2WAyF 
zoGJZa6G~hzAv?4Ii{I75c&{mDOm3m80k&Wh4oTBSp2YJ%ia|e-$0ddJKnP0Q z8!_~c=hQAb2zQ%@WPX3Fq}z%Fn4h|ns zUug25LcUhAQWAcA3CyR>g1GDz#k$nuR^B5DKktn8yd2UUz8wuii^yoVA7%a70M-2j z=H6{MIcyA(N9H?o-Jjhl%X$rIm0uyNJ7#>pkR6NDkJHxGugQO14po$fkkiRa6qt4j zvTjTQjX+cneF9@6eUv6-pu$zis1hvT>e+;j*DWwK{z0<4b92gK%b41+1CZ?xBlBK9 zl4as+B)MIP+*dD9G%K9if62&b<0CeH-WFj_&W8O!Gb9%_B9?!Vh}1MB(P4LaT;$Y~ z@5n!IE3CuxNyGjV$e;?MdDD5dC>@IOACR6-WPgP7At8qXEZb(uuU<1;*6D}=t-NYl%fQh|Mhu%~#M zxz1fcnNOl24eGT~0d^W>WgmZ)j8R zL)yKBIm9j!(LcXJb=ZM7zn#xY>T)Qq`T$9~USQ9_G%_zfn6f*gPpg6pQ-0s&h`%~}9rfCfJ zWSytS~;HgAbtmVJg&PesEEsbf&|s+dMlWf%N{0 zq}o5an9gV=GKT6?%{y)C@@*z#lN^@nmr1ql*?e=oCu}BuqTK$BRP{6EMrR!0zwjL! zknc;OHaZlUdX1IO|4t&;tZdPP0j%RzJq7B_MN@?Xg=(tMEx?Xydw+pr#A80kO-f$X z2g%&?6my$e2l47B+;{5>VVCU&zg=l0UD65Djio4CF$cd14$S|2n4<3Hv_O7>+PBHk z(22_pMDLcuTfX!?@c#a?>Ulx67ezx|m!H{>80-3rSbJ zoHzTNU>RHMxS8?+Szl|U{Id`6AmAxl^7jt84`$u;yR{JjR@g6cgP zs|Qe7Xcag8G@rT#x1#f+4RlTvz~3>2nWsHx4v`K_-1kPng{F2kV} zT_`wqjmH{qlP5ri#7)=9#<&>j)eTI1{{|NqhoN|+FKJD)=RtYBNO2-pBAW3{^7za# zsMd5tqNWQp#_goU17)!2Bka(6b|mk09IfheN$b&dDtZ@!vV0vbLOO|_JDOG4PDJ_q zWu!DEs>uFK31OcoblWc&B;Du9DdXT~Wy~8YrXlV8R2DF!0;M&_g~;A4 z3yEVBx#)}&Gwn)ZiX%&5BrrjTAT@Fw^?^bLT)=qm(+Job0j($-SasGgi!Z*Y58g_f zbNr}V-5E@^R?*nK{prxMV5&-pfvsOVcXrK&@>dV^(!56r-7g___+(V73;eK?QRV0B za9NbZbxz#pH9gPav4$Mm#dQ=vD2J;l7m&!K8x!C4m1r6kLHBGc)3oq`TeVQj_bg=P zVs8pb2w{oKyh*e=&}^MwCbZ9m3zyRC%1BypB}}YTLtN_Z*YBAO{rdo zspLWcSIt}tsk$>;IX(zw(@U6CRWE-$T}H;cCHUI&8)fbgQRgooZvIoB9dHif8EUpL z*nI)^BM#xf)$1_N+$VQ&yTfDpmQwSaGbpZ%KQws+yKM*1kbB`hAKnAspdz*#rCqorSByR)Ie%YHnp%Tk_+8-9gA2LjmoJwsd%@gt zZ*Uud326#>RYq_#ZT3yU*lWLN;`*WVXJ;$fZx@r^&VwWy(<5j6^Se;awWQR30WdG; z_%$eoq!(S;lwYx=nij)UC*rusSt<`%?7<=WU+fob zIi(bWE+zO%S~UQ@$U*soHYRU&0nd%Llk)xI8l<2g^1&=<^SZ|pa*X~VVty+ zeJ%e&#ZIjVx%HE0y|_WUF5IFLVFoU#xXUN57)CB-f}(c)DpBbJi1rU-^D-Y&+>6uX zTlN}<4BsMTy9e1!T1cz&=TrPbVOD&m0b98eE|qyS@OQyybo@f{=w|r(pT{7@OL8}_ zN4BLmX@8EOkmC}F#;=`O@wW&%f7zn>{w0BV=*Vjm~vm8dl ztzc9%n~Xiz!)~bH4$PQG&f+?bNvBA1MTgAu&mwT(OGG>jrs{Q>Jn}{!ui7N!rcbso 
z@%z*08~BBc@Aqb%v%=AIqng%vAEH4|zLH@}C#up4s8l_eBK4afUDlgL(i{pocLvpQ zS4mY{$h?{gAzHB*CVd1J-S`h>PLCs7Ustk7Nk_+05hWX&;=j~fO6XWk&&QWhz?7b( zyZsG!%kyOB8x5J{%U6^QUc)_-TbW_uQz)i*N?Ze4Dd<`hmHeGgZG%eDQTYolnVuw@ z^UBQXKqW8u)J2*<-AT1!B5G#4lm6OskZp>Qe7Ysf^HXcMsdX(7 zX+>Xwdi_x{`|UP~Yz8oghagn$0dL%dGF{GZ$$ zy%qEDsHc!EG#^IDl?oD7!=?uvAH_C%LDbhj%eHSNvRY*#NY$g(4& z4FdNO_)D{28qaK!qvD%_$~r$`@t`6E2A-shE?vrW*hhVy--38}fLvT-Ef4wU38cRR z8=aC!ug4ac2UWdr`$+p$kAeY8azApMXPr7vi7UNPDak!3S)ymDYHt?<;1!#CLh4-`+ z@}3fic6I>mk_zU3B#5~gY{t6ZZ76Ne8t7cV| z(@FB)4!TSy^ zyl(&(*_O>{KK@OvwZvO)WFvSndmi!7X|d?GQ}C6KwsF<+NphVTrx31Y2IZ(oSdFhI zy+=9R-%7|_U&Yg4%VvRDEh1Zo5bkCC2J#^){$j`vQf-G^u~(V1E}3ToXHlyko&Lj<|>{4$b4ivw`gC6*iV`WIQ1L(eme%!{h#0*^)FX#4wNhs zRZ{e=J!HD@IG<%K)K_z%hSm>|uiGgTu8R^wMkFBOoDP|qcai(nE0DeK#cap8BWt#l zJDlvt{5}QJq>itY<61)IrgNe68^F@u9$*paXQ8=O@ZKKA)4(ZtSQZ^d)uKQ=nVmv$ zdomzCuFs0B1;(}YiUb3X3Qk{7Zr|@M4Ev`;EvgJXrDZ`~l%*}9Uy1K%ieCx2C*FaX&R0s9_XMibe{!wpdt`jbl&y*KCDEo9sOmT2 zz=jw6^5sCds%(_PaO9_U(t) ziHB5wDTUmNl-xpB2iXHWVQaU8I(92bu_}(0Evbeg;sR;*`$z^)Vxe4ENP4Ug%7;F3 zgZgoh9#ZhfPEiOn>5ld>ub|og2#FS5ptpafk-e&o2dwyo2O2*~_rfu@Td7T|k+tZ& zEEf{MZWz8dpz;Sy7kGqrzt2gm{UDFjr-%y!YQZg)b-#XrJVmxiv#jWW**F?|30Iy|9zt_+jcmtQDa`I38Zo28zP(iX!WYw zbXQ<@TC4i8*t#}J?Smm?B=8Z}gY!vqb_J6CQz+nO1*vX-qw1E`=v*)4bwPilsJGxO zSe!t3(4RPD`4(yU`CNg%toGvpH0<9}9%J?$Qbb5P zs7k+vw00ijCJhQ+q4Nb{z3a(X_K7+E8BFa%he0-Vx!nEATRQYA3GpKrK*7dgal1O@ z&H97_p{8n{oCTdv?y%6mLXxj47?up8(E9@4zdx8oblQ{Sx92dmJV<#)|G{{Kz$k2d z;a2C4z~EEJebIo1&7M4$TJT@aDRR5|RW7^yFelod$YEwX96xK4%>9PhYV9`k8R`q; zk2m2g4d+4W!r%7p8rQQcM(Rl+R; zXMG{oJk3mcPNLl5)5uyP@bsoco_^{TOvau>l{^(u7m6`su#o*<4QJ}dqo{VqMU2-u z&c_RGkipz6DEgHkNp_why)&rU#+W6XPK7u=SJIjt3e00Nl^?7J$avD(KS^ud4d`}pN*Urq{@f9cRyn+9+Bt0YEP{Dnu*7wvA7U>VKzEm% zLK}RsW5#7P`0OT&;f=8EIEQGxO@bRZmGtily|6q5x-$sI-;S{4{ZA?D_A54|R|;tc zIbyL%D-C_qNUGx_<&N@CBs!O$Gi>8sWHo+(u_T9?F3DjI-@_ysqc4;Gt036s)i9?~ zOGqx%0i%kE@Nlkzhu3w|+LSGSb?_ufcO7DuuNqJr`x~ORUIKaw{k&1wt>?dIX=QpW zHeK)|=sz~}(-qSA%^#h8Un8#QKPtN3jV2E7Pj($4q(F$owAU`)l2Jq@9~01ivxtRz 
zaMaX#!CG~ci%#CkRux^8xNZ%H!b`VmIf`;L>zvfh*!A~@Wo!)eEZ4pgr%M%L&RTyIMbZ(W>( zJE?->@G40zJDq3tY*0Rl9?oLJR<@wx>?^LR^5$+|XAAQ?XNulC_+|MoGA zxF7$-)mtMWlMIs7{4Jrx!e7kdg&Dfm--LL5Ff;v>KnpT&!oO03-~1?ee5y2NGSnUu z&UcYG_BQPB5Ss&U!gQ7}mv6J9di&F45+yh{SA)6p{Tj%h?SSzO6X=?~rSMtnp?GD< z^+qf~T9+vTy*w$t?=4)lIzbuEpxCJ@iAUWEm?swJ^psXota^%M$E%H$+nGqaPxhs> z;nA#g^&QZJab&`x_-a)xRbM?o&6dY_*4#UoV_8h@``ZXlFHrqnUvlWbhxGnyglxSr zDvAfAYDF88sW;WHtR(}d5Exv)PCe_7z(IUll6GSz3mD)9-I1et$?6K4?pg}x^~d2- z=zz!-`?0_J2gxm`QBYwB%I$UvS!I8!&OV0Z0|F0G7L%KUA=HnbU}{MY=tLY8=MLle z8&8UE=g|L9g0z7@G0-I)>i6eTj}uoZrTbgz_T@4ypLd1wuP+eX@u|dxoZNE9MT8iL zcy{gyh$c^vH;*giGaZi6c&ocmPyU65p8MdTn}r24Pa(y-f^`4e45Ng3Wb(ieE{2cz z>g^3^t$GdJaG_qQuaS!y?c}<(+sS><7fSCHODmmfsnqB?q2mJlj-^xbsS+X|F{$qj zBvC&%=F<6&RK1r2!4J_;caqHiFddV_G5BRLSzfni-g=NAekLL+d1@w>Gxq_#zviB^rK z>QAqr8oi9!*o07MUmqG%*hRIKjcjPwSIV8>kBSL#kUlDx7X+0s&5gUzp2At&_TJ=H z_LBUS|G<6BZsgDVPL3n|G5uvK1+F&%3?PLY{tKg!-gn z$CRU#J2RFXcKS9ah5DS>2`X;~!9B)iQE-zKaQMZ^*o$~&(-)qD@#iwR_n=gG z+07+YM>)z~*--4N0?EV|*<`lDIs(3=^+M)5Cq) zr0_I_y4y?AYWYjZjvtb(e?5Zq<0zy24Qc$mLk`bg!g=0Nrb=B4|By%2shdm^N~K8O zS6H_4EppGFgKFY!?lvWlO?&W#+|2f3v*UT(+;pCr7hUGE)`Ps`#U#p`Q%;Kb&2qPQ z_v98)w}d>c9_CNdxZA9D7W!8w#t4}~zl zdW`LqkSI94|E56IHI`Wyyr7I>&ZN_Dju*RdWJ%rO@xm3InVYcD;w*{s?&Ylb7)nO2 zR^(O@NQ=jVtZv?=yxy}BKBWnIf#+c(6`qr6v(b5ICb?yDI6Mf4=-VTC^V?6%YjUCR zKIvaGREht{D|QTrjeRmL&htd*w+Ha36D;Sa6G?7z14*x1Nv(&H@(+xm48ip=R|&4h zwVPzRt)6Q>DTb@b8FCukgtaxz6lDDa8fPw|q2Fe~M-Rc+l^4g@U#48Gc@7NQ<0wM=7v$#K5Ia$h*4yr+Gk67>?uVjlkPKEcPcg5{0wW!f#7$mI zBP+YFBwA8umRPILb5Eopth|NnPo-hq_bi;6(@cj1mvMiz4^_MML(85(7_5sH-Vrl^ zheZ^eGfq zR&IKPU>e)VuH7gh@wn5_hLFj)|I6DvN}*Y22idt;N#puza(kM@w1mBM?5iy#K9s-~ zPgp@!OP@nAr;6+CkE4c2Ax5lUMS0!Uld8~3?(i&F-mT{;sM0K$jWkz)EVnV|p1yFV6MUwyh4yLsQW+?GGB_5{lOYi%I(9E!AWT zoLIk*m-NXb{{`LQGGiRVPadX*CD}-{9|uv&7T69cf$ac&*e+d$o($b9NSs#a^ddO#l*ZY#c_8%ykp^Yg$I??{_ z0u${?k!$&lfK&EkxF30kq(N6n&t)(oKHP=wRd;^inHwGW%?+MW*M;w811ni<3dy41 z>3^Br|FSl>FJX{>ZKjT8=P-HJ2U0#fEjMb^faA%PFjjvdSDfoD$-UqsxIND#d3Eo3 
zH~kXov-mOW4-FJJT{%n1KTl#hC3hQpgH@={LHQB|G!EaVa{ZgIUGjtfFVhqM|DnFA zjT!q`FlF*Nl>2(a)O;KF`}5(RcoZYvMN-_enZkV>!L)WHVCCvym>w=-J##{^ z*{BNsvJChRz5*+)v&=2@5|90=WE+>2(y4omWMc7-JNM8;yn7HS)5}@SGHY@h^jSV2 zsEtbentAT=k!0;{3VlP6-ltW<%(V_xuKOtC*?-7YD}zt33#5J88=E#3k=OiA-19t1 z&d&nK^`9oVReeLq^=nM2=+3n=ud|X>Q_1?(UX%*8OH|ZfKHzyAnP$JDhGiYdJp2ij ziM1pSc_$xV^qD%({Xq%ah0NW4IBZMp(6U4DVDEIN=801gtCk?oDh%Yt;f1XD+h8&r z`Ugc+b-;GU7;=1a7_y67*|XCxsNp~fxkct9aMy9z4cSE9^opSM=q2-*e2u;s=97`; z3t?ACX3eJx$l=gq=pP?Oc0K}Uv|CQ)x?xB@E9^nX`|++{p)fh$9c~_%SysXi_E(N4 z9opXvJJEY|YHQL*oe+{ad_rr11!NxYCBqKLuyXNDRG#sHbZ#6AHx}Mw*1RaOIbi@R zb7Mp$^rAj$ouoEI49%r=(6HT484p9T`oAy|YsDbRW@F8g>e4w$e zSJB>d34z(e==efU67_P)>3xYE0WOzL8Tx z)5J7sd|Q$vW*bRqzgL?nnkLC%$azl=vnNT@ge0VrkZdN}(TP$~NRn#ybJui6Dw)Zl zuvJquO@%0_e9!m&1@Bt#de*w1=e~Z|@4^7hOIu9dE1U%sDj|JQ&q`|(I1~95f|NTOIDz;iv(9}XyDC=k!&$KT;9Ke&*C?qd+nm46eD+$u}viz4gA5J4*d(S%ra<=OC`~;Gthp#giH@FA=$Mw zNW4Pn{iP>l7Lkv>-i0Jr&SOn>PnqE}DpGlQh;B@IR7H&UDYuIzCZe;Dcd&FL)W&`dnDjA;v@Y_q5wnR3 za>Sf}bin>U@6l$nAeixPN^-+4I#QO7ld;H~@fYoVa-XbYBd9p{ z5E)E7K#^bZeq&HHdA^;6`ul$H+x0E0_IG9WroU2UZw6_-4d$CTsZIKczi5vZK{I&$Q&5vMN>k3nviAm1yry6DX1uvj%_$dM$&M2 ztp0^mldh4@19x~FL$X+T|{EYeFD17|P_0eu8(( z1XddHh^+N~hU`ywX-;<>i*;|NHuG}wpY%J@hid-EXw!mrJFLLzhC^_y^)cR9r9 zv)zb-`3t25SLR@cx~)~menzKaN;$Zmi&(A&@E)vGZoAJz5~r{mDEf( z8kXtWBy-gjjAt`wvfiR_{Za0~2quYRirR7$&nKd?G?^~gzkp`*C)qrIqxKwleVe|qMnPn3rC+ISI@%~=)lwx?uu}9N`ud>5F|J_ zt9uUhpl|XQ@IB1kC9Bs!n==@ew~NuFHihG>;pDb{DHMy}vq$rNAa-9wsjY7i>eG%e zyUT*k3hvL@G=>VQ2AGy80HujMSB*C(BZFf4UR_0s{bN~sxfZEBUNOHphghdywK~Tt zjpcMdWL?Fn)MVX&VpR!yb2$ZNk6+VerHr!Y1k;#&?j2cG2J6KJ%q_H!`we=Lb$=t$ z+lEl&a|H@KdPtS;3DM21YT314@F(qqS$u&|!QZrj_G;lqdkys^DUd!YhE1Fmh?>X- z7@s0CxjPM=-nn?z$^YY```iaJN~-JnR0xyXVy4j9lkrUoFDQ#?_=h$b-BmXR&pA zeuHC+5nJF9Pp;1YlJCxb${BK>xqN;Kp#^n#Hufbvrj4O+$6oYl{|foDV8MUdeHwA@ znjky8Qb=fJYR`QySon>HD7V@Hzl>l(=Hn{_{Jfm9N}k|CbQtFg-Xq|H7dC8*fyuK) z&`I4*UA&84^ZnN}^HL=O#{Glr2~`+!`Y5T>wUKql0~TQ-I1t zd@mApPC8;k19MRoAoEBz8E$z5QB%n8#t!@q_;VubN_vRY;||apl}IsJ!#T$kg^ANI 
z<4n#CG^duJQ`Jh(mYb2=Q*TD1F%)pD3?@d8(L5xGVxqnxL$_(heUR2Z6U4ET0-)1ndB?(#+c4)5Jv|w?T_uS40R?0+i*6_ zjNdcb>QE6k3aU5j5V@3RGpn-%``dnym^7(10c(XViyFz^X(%eheiSg9v#12Mnd=nN zN*jv2jvZv}>j&SNClNuVbfU0_Dz2n3y~b~;DY%ik3|B)U@evf3)1dqDcM5-33D*Ih z#r5$nxmjkgFif^6|?ryUwUft`RxU{EWb<%dZPvTyFgf7D1y&IL-JEMBkRUd zaz0~8;uL*SSnQ|BlJQX4FoClcLSq|eh+_-Mby=<;x?3n!>PUquzcf-ONTKha3TGQ7 zwB}BQ#}1w;1r8#&B|Bi2$6XWoYD%5$4C#&Y2w2RWH@9yx$GLwoiy0Tuy!151eZH2K zUyUd4e?Mo^11c)E*@~cQLjEpwPdo2s3)i_X z;22wDoCM{EUrBbwUYan{1ET)mytZ2pXn*M?^i-~g*eO@4iI10Fs7fN)#zHoJQX&~X zIft&#v>{q`3o_|qc6V(f>FxWCsAC#Q>T=N&{v$hD7=_l|rX*TA-gfJ;wdChgDzt@` zu{w`#>J42&e(swf|LiBVIPxkfe_74!qw|?!JDAar72NO9X}fvHPteV@5oF~zNqgQA zc)v?wX5VjP*|r_9_#u-!eXb+oZ6!4o&qVQFSH6n~XPVgg@Re|n*c%Pg{M;kd&mCZL zk3hA{v=VYpz7J`=k_Aj|f}-=F@TB25If*;a*0B+C>j&!kI!ol-IfaIAG~}{rGVdjZ z(3ffrh5zJ$uEcmM9khlNhaOF}fT((kO@O}wF|kvq wKzpv>cfLdrFVZHf`6p<**FV&jJ`$Qt4O{1>3z3tLRCj;8a7%KSdi105U%tJn$^ZZW literal 0 HcmV?d00001 diff --git a/src/autoschedulers/anderson2021/weights/harris.weights b/src/autoschedulers/anderson2021/weights/harris.weights new file mode 100644 index 0000000000000000000000000000000000000000..e6b7e4eb5ecf0e4a87e5b918dfbe26370d248c5c GIT binary patch literal 20400 zcmXV%c{Env_s5Ztp-@DnkW^Bkl6dYuPo<(%DxnflsZT|N=24~)k`O9n$~=d7&OT2i zQz}VFNGc%;X{7r3uJ!w4ue0vD_pWu%y6f!W{n}gK`NZB)A|fLHGl~CSrAF3SBPKaw zD~F5O>V_jwEM?AoS2e<}{U2DY%4y8FG?)K#)dNy`*&PNh<$=h)K5oa@a-PLO5$J68 zf-HD}o$4jr1j{J)N9zmswRjwByMBk=oP3RKwDaM_Y|DxKz174t>lB8(yoR>&PEgx( z1?6k9IT2Y0UYl$j&wh6;WFCztPu)D3pP4E-UEe`9ROhj!Zw82cr8_(DXA#po8%Q6> zj>ZVO78I->p!>zAAVq(1@*;g)Q*}BXt!RSM>bX$xy#TH01DG279F`4aK%BY>s$^?H zUu+&Hwk8{5mz&Z_^&Vh$_%GS5ZP`$FHWVcSno&~qJqEnj21|h> z2-}-@Cnt*lopm4AIhVkk%0RZds|>~Nyd-lfdqL~g6*lwRXSU%vV~5;~gt6z2k;&B# znDu3ySLkm$-djmh@Bfb#IJ?2V~8Q@OPEIb2Y5F1G2H(Z&itol zl6bMDXy!|ZrCAV3y5#{p>smI=h_a(sGDu0yJf`Hhj|?Ba#degIu-p&Fu-MoEH#j2l zzjp9SQ_5k?@1Iz$?h2NXvGnMsZE!ZXm;E^Kh>Ul1;*;Ssuz}MgKWs(V7?Qx=g{u&W zIjVeL>wBc7s|Y#ACh${PFC3piAT&#z#U>%=e%_O7uJ?yA{4e<2WdJ?4reOBrQ&4`R 
z7e5LjapR+G+TYcL8&x7uBBu$A4o_qGSz=_TFLG=BLfE%W6|B?eI)7{TR>oUi#57)+ zu^snh`GeueJr>(aHfmpH`*d8H&!__Y`erBF4QGkn??}Ol$BHKD-|l1Pho5*r_dPXz z=8HzNhJpXDftPD~0V7({pqf9O`PI#4`ES)pr=B^>-0>fcGN>iz#GVlTh85&?b|ssU zrz%{Xc#A*ru_C+qDVL>qhB57PlKdH~PLrw6`bn?12sg$lg)}ARv4~5L*h9td=&mFx zT&(2oA{k*Wa%%zGo@3We3jh`nS#~;TVE_-CAmE$ zH`ANTX!a(3Co*ZVlN&i8XlKuO=NMW22=lp6=5go?>7#qleY0=L_{1-0Yf;GMsC~kd z&U%op90uwWw$qtsQ<>ARw|M916jFP<15f&n;|zyfiD~*QvglC~aewAT%>5LFvBNt_ z_2t`Ox4Z;3Bxb_H9a`LUH9yw*)r;yW6VM4P1&{jA;NSHQc;U;yZu}!!^UM;;TYOmg zkswlVYZGJr-%#fKT;gUvo_W6B#GF2P5rgJUm|*6OQqDuzEwL2`(hA^j>vim;sSwsW zgoV4u@(thYVPBIc;(sqWe(c>3y-J4&y~i;xHwvf75F zM{FZG4)d6^pBMYE(~&GVzlBX5EMgXOB4Nt97ue`~0oF)4kmh3>S&-~rvVvbp|J59W z-REbM(;6vIxb_i>l>2Z}Ga6B3ho7+#Y^TvyS7F_cZD{+k7rNJO#>V&0Xrqn`)V03F zq__-d8oL1Z4P>E@=^b!We~dSmdf_aaC!E=E63o+j0BsGF*S@+0%@j{yGSP$ZFX0fJ z_7n8#Jy13(pPHs$#{8d+pnjl^Q`vA1(~Yk~qr^MNo;i#?7F{&>b`3;am=2R{e!v6e zJPbO~0TExXaMklUgr`f;Zdw2uZ!ARdtT%LU`!HBUd4rYuSUBQ&5yqy5qn%Pc$ba*u zEuSj6b8+!3t2A7Ad`=_ZzRa2^&KWOM-LK0mcj) zE$~lmMyu{5>J~W}>Nk#J-Cl7xby_)@!Ec0$wd+9twKdr`{}KzAd`P^jg4lIC0r?y` zlke%}iYijKiFacln_ld}ymlO92{*SBlX>nexo8h-75R^h-y*@}4*Ib~o00v?gmA(F z%JL7)6H0h5Hu~V4B(%c0Az^vl8YLUFq8_IJzAc?pQ*mYrZCZ z{@KKG>s8ohzYXOF{OA_JLdI|BVb!V?SmJ5S+U;YOD1QD5yH-Sj=@O)t>u#a5-F2{E z6a^DlI*8~Om>kO(#}xKV66W-$kh}LLlY|yANFVgaG3FV>;dnUtwEr%OmdgkiXN&Nc zdkEP5e-pU7qs2+L(<}1+K{M$YtwvheD0V?=8Cxec57kXHp=9_1H@Y?(md8Y+$n*my z+U!00*VKT>m*c!yaBX4XsZ#v^Hv2N;uVP&NE+_i1U6P+y(@pxLzQbqy z!al=0cr-+Vbr&lL&qw!=lDA&u(wjh>cW@d`E_EOgafM{eCKc8@$(QXC^59XYHw>{G zxLI>8Gr z4q|_a(y}-b`|=r!ca;(4>1A6A7TKfu0=gXA=92tHQnW zW69B%#==2%PU!dPE15QNC;xR=IPtvop3Rq;CCm>w#cw*4ApCbxTKFt+70Ar-;Op+# z#J@J<3eMXZ51OB9;B;y#G${6>os$rb8>VBzI9)iY91eqarBrX`IH;L_l-65$<6g^c z;Hy!I>JLW4`|qu2H`y8I&df)V?!@Bn(l5Z`t`8)P{R2f7Gw`I+ZSbvm&bf*sNVznE zmG5jktF;4a#+X9I8Y}9u?I-rOHlX~+d)QSV#Nsno;CHDzRD_6vt=Dua-Ial{5vuSs ziH};Fo9Tq|j}Y-I9-5@nLD9hxPj{GLS@I+3+braI5?(;ufh1Vo)ryB|EI}l6nen$& zJE6&W8=)}bIe(d%AIk_)U~Zz-!qAr0{Gph4WI!(n_=neUb;q^PRCEUvGzvh=G#y3S 
z)M%-uHGXspVc!!XSwe~wiOTN*wX#3-k9RFvCTNgVbBk~pQRjE7O%K?m`>$CTjWkBfVIc1{*UOdgXtn+=Uu!v$_PI*91b+ z%WIJQeIkfu2r;?tCJpXALfh_o;w_b06h9fxb=hn4w)V~<+uD0@r-B|kzGwl-GCRY& zdO)1b$v()|b*;uD$F+#3(_XM~h^849OW>rF7@N_#f$X36j`Ml47uzKZQ6RG)H`v-? z`u*$l?juPu>RlQJd>2yn6Bcx3+6**q&jEKQt)qIsWvW3F_XNg61T|DL3BWpL;$iJ1c{J zU0;s8^e5+8#hKk~w;`g9 z8;5nhIneRY5lxS6qmM`GMn$I&xGDXzaHCN)(=RLFXRYQ56ITpjb!n* zn)n2s=_WvGDUvcek-SJBBm7WhE%b?C?Cek`toFNw$0jF}{LoA`p~{ynsIJB1|HWZq z?l~50sz$yDmNM0E9r+^EFzl z#Gv~p9;oaTaBDAp#F0pjrFZn8(Y_JI!xKUG^$YA-vklF5-GMrZE*Mz(2dbn)psMvX zM%Z7WoyJjMaast?tNU@d6tQMuF_)cR$B8^CE3QKd#eef~isvin@fnMD7we&I-Y4D! zjn~ll<~?Vu_y^w~tc2X>BRSde>ripl0yeH)fjg8xLG?scuv_~9UXGh9EO-`7G%ia) z`>p`CvPG4B^Vbl*u2vLEe-Gd%>&c_h9Rz{nReUOwgR4arptf2Tm<5(`MK&KeLl?yo zkNy%iH~JFwDEW`@C%6z#bvn_f%x9Z2+pFayKciv>Ytq-Z)_^HI++Jj6B`NZBA zOcJ&vM$kV~hw<*Z0dh6Yi=588%_6cqacUZ%=k`{tSbdL^T|FCvtTMs!??{e5cnn4> zzGNrj^~nWyj_))v6q_1a$kCqh!j7y7MEbiK6AyI2Iblg~J~szLjE(V8>~H)zT9?Ur z9mSjbmXS@d+X=e9W7BMug~vPAQmy0z1R}rOvjBSf%B$g;Q?TUW-(B$ zI-SPcsiu>7iXhRE$0X-JMk{S+k~QQ__8%K$ISNySi@!`^|E{b=pK%iWvUA_bvZ0G? 
znUxUaowLyC=XEd$zl3@xcXBn^3*go6VoXYHg0mf0*=sCO@~z3 zWD7ao`zLwCOE!cQROB*0_6yzD%L({**3iLAmf%?b3cI&G1>=QMXdC#Gvs-kD9j}li z2HDztU+y5f>eLeRFa;s_MM0nE38u{rVZs7A=yBZ${AUvdJ)_j1A}@#@O0k1zCj{e& zOniD%1tZ?XVQ9!fP(PEy)xXg}smfw-?i)mFjb4Z{OoP*5u9%7?Frg)o<{PR4uipVK zrhi9)Y9l%?{10t1W3XcTBs4Gkh-a5c!I4$duyVKu)!%AzTP<_pu-?8$hvI1vroI#DXH4BFC8 zL674KI2$$=Pe&wy>AaiJXtEIHQ);;WxK!Nt(iO!`C>Y%~pdzaundIU~&1jqlhorn< zo5KLk+uV<~OHY8ooSSf9!9yCVvkaOx{>8yhLon@KHM$1}(*qKTP&X+VAKXZUy;gJ@RUSR6sD7d-jFV!@!K+DJhLB&uYq`%z@Qe#tTMMNa(ZP%sheqvlv z-w*J*>IWe;Z$ZQ>#8~X+c?`{u!Eo6>P>t8odTj*G+W7_6RnA0_OXVdA+h>5vj|AKm zc@|biUxU_yKhR#a8oU3y2R-@|KxF4@lO2_hz&&{}rzjfvq{;Jr=~h)7)(?AsOx76Vt|j>c2$lY9()$7}IBzYug+ z$$(Hpl1>=nKwbp6t%)6&eZHH^ev`_HG+yHdzsBGWS4qJxgD?nQu@uF=e}o0|`q033 z7)@skqIXIg_Q`326sSUHp9b8W8-}tgqR?*NIQSeA2xG0XKCktvZ*b&v9t=@ z)t1uq3nP0f(t&iEjYjKQK|TE#1{a=0#aaF+GI-tOePB6?Xzb#_yg^8QRKeM95(9PN zD;j(=1qRC&BLBofUg%_T{C)Kvo?p}p?lred+RLAVRrnojYYl{j0ns>cARN3_55oP2 zeV7_}1`{2wq0~QBY~DLWms!=r-+>6&l`J*#z5}-H_2C*D6zSIMU9iGyr~^211P%LOgkb!Crs4s!}gB8qr==X8z%7%9OzQ zlQQaki4=IfxQ}>)Vsb_VNE$!D>$cx;*?%6;XCqAyE$_sLm7lo17MC$WeuR2{WKS+bWDM$X*PKJQ8*ro$OEl^FVO6vIi1x~ zNKbw_4-0D%O)Y|`Z@>wdWSjz9w@}Tac2eIPyyuDN3g2!@e6r0ozBBt|c zQLry~j}B!j$1k$JA9Gl8t0P_8l0i-uK7uW&qRb^Wk4$}3&M);2Bdbdrh5xduiO%)8 zLT#u5|IX{|O7=P-?i|fZCvRn$pFJV$sT9uLro*i2KjW^CZ=ut09MT_igmQ7`*aST< ztbD2h<8)pzmEO;!ebFMeI$;q%jqW8Mhvy2v_r4@1na!mC&rde(t}ioLAHyEV*Rt-5 zPSnck9v**~$C?zw$Si|M__n2y>P!$Nn+8nK`A#Y1w_o8cN|z8ytGEc|@As3>ev7%- zmb4M=E{L!Cwi|WY#|R7F&S&dBM3RMD_X%@OuV-pw3z>6&A$gS*$3h%m;>&tZHeFR# zIR5QwdU>uSJo`C^>|H&d-+bl_R0Lh(3_UtfxSR(?w}a4W%}v}WzZ%^W!b+M?nm~1F zA(pA9p+J8$6n&iwpI-+-)Td)8j$Aj zED`$3L#TRnkzkThH^^G6LbYZ)2D!!}zxNJzTBaKstq8U+`U=vnso2~%1=MHF$IdZ5 zoS5D(EPZANM=cUz{xU1*I;Ttc>Tzt8Wep}&je&_a9%Smh8fN^ul8UEzqRz^0{5-;~ z#?KdE*)?Bqw~RK~(Hg`$ipVJcC|n@N^_I81Aw58%6M7f9nrV&JSJ>h37TmZ9)$4b3o7X6>(KM>x?3n071|++S^E^T#B!n@vG5u*?Zu zveTG71dN<#fDeamaX-p$(NhsMfD_MRbeE@qxG@WmQe~S@u zXYl#7iEPgL9+c+wal5ZgAzMI=iMZ)eOXXO!iyx$BoBwe?H!x15t!qh=X%l#C&qm$T 
z>g?!9WWJ=YM+S@(Xr}RD7PRg$>v|ZCs~%Yia@F>;swWp=@iz(PWvl^_V*fzTB?IgG zQ|Ym;aMVyO$3u_GNuK*${S!$wrIZ^tXVXdm}owyfjvg--S<; zt5LnD8G23%sk@O4mwiZ@-SgKbZVKzjw@WH?4GZLUg_Sd%>M+3)rTq}xvX371{6%-o zNq~Z54*EK*2A z_1s~e?EOg4Idm6~sHemJ;0TP96TwjTOmLS|;pt6Sz^#<^1R>A6#Ivvs*FJuXZRdh` zQ>Xrh^p1C6zwR3zn*W@$s}w@V=@+=)?4KfDqEM_|32`%Wpxw0=ttO^m8&}V{`^^P0(Q=q#z6e6NM_}vyluM`^ z2ZsHpFrdy14~!ns%XoKc<#x&Ap1*J7;iDmFHq=)F7N6ZU)nz?GtP~tbkQAeW>|u5WY$cp}^rTEbL9h8OQt4_qaA#{Zxn84KbX3 z-D|K^?GzN5FTzICC){_t`Pl7Kf)!C|ka@%ty3ag9;jKK5Z*0rE?wN+!O}U(%RTko^ zT9AybM&FMI>B5X3)bB_JexBZhq1uPRJ-&;oxQzHvu`F~SwBlLbH^;WTA?R7|fjw`E zU~gL$w@|$e+x{Ga)YW%JbjS{_C(RF6k{HOIdKOF_uS0}d2JD^SN86PC@^%e8h1i5v zoa~#73RkKyDzOE4BClxf`B2E*b{3^3WO2XGjO1@QJy54S5f<9*$B6Z5+|dnQ;H(&c z2Q(CMupk9~)~ZepvcL9!~y>=Y~b^VCTK>kTfv^?1K&Q_tzPCu%H*sj@NM7rlF|wxdZamJ*nXR zPZ-uPMMKvxOxAoraZ54mXgkeSteQzn3-YnfU>TmgUCiAsxQ~KfZ76L(u(T2ix(CA` zG`bpvPl7->JpzBHEymoLRya5m0d|%ndEcG^v|SekjXx()`A=)9*{h6^?@q*kJ5^YF zK^u!w@8Fw+KWG=S6rv+aKv{n&p8uBydr!4-Z43VJcCj3qDJqG;Ho{Sw z)Od}7e&BPU8Tuj-o{V)z;U{T0)N+l+K8yr$w_s4eqRFjJoPd@Eu}~};gL-eIV1w8@ zw2ev!*bmRl^_y3NigO6YUtGZPWKLPgCj0^DB?4bXZ!pP zSJC+!MT~wFf1ebKdR9L;QRnZrKtjB=)PcaCKZbAtt2ujg~u#A@NxnbBOzw=k4@ zs1CK3zSy=!5xf0gLf4Cz=&>^n+*Q zJtQ314QuRWN#vHrF!OU5n21_%5nKo5{T#_D9KTVuiw|LaN)SfONrcZ6$Ka0LB1A^7 z6>;TfI5=d3nkqW%nd>VQ^xuMlvuberK@qN~55WYf(J(ZyAM~RW&|U8t_uzdfZHRDy zTi1H1R&5L1QIo{+=kxH@2wRvD^a&q~&Y>w{aTs=P6ljdv3lOJ5SIC5cxUB_hM@iGE z)mOM5=G&-}oEn#umPR*Jro+n{pLxbt{o(JrSh^}P14=a4(fUi?C4!~BuvcpdiaF|2 zgQzkv9h?Qf7DR#9;}|s6bAq*j(sa+Q=TLuUKKv7N5=?N(p@Lo|5`T!|fnG84*X%Fm zN`C_f zu>k!07t(98oxpb*PYdTfLx{?Rq4mjlIYW(h1ts8;R{uy4Aq_aD zq0ICbJci0ZAw0%#xbi8pT;}}^%aM?u4lI)dY(5o=KVTyO=mjG)oa72dkIsh zRV0hj(z&RneemD$BX~u447N6(M2&$Wys|Zds)WU(`I}bGKA{&R_UeG;qCz~!(_q`J zwsQ|oA4SpqyWv1$5{7h7?lxUrHJb3c?%~BGQQ}wGV$$My73;go1f~k7xPa@%^uoo9@L~OL3{`&&&zFee z&gR9OV7mtDH#)+8Ul*ut05-SAmrC3Gz;QB(uyS+&Ox|A1;~AC1U8gc`zm2u=Xa9q+ zaf^_1^Epkk()tA}^>1=g&0}Gcd>W03^x@K82jS~ItKgHF5SBfVWC8wLA#-0f?Noh^ zw?5@TtVue|m#TujPLFYIs6Xag-$td_R+A5Iu5jXp2garRrTs?};6L!flF&3bzB3fh 
zb^GBF_mPSr;})h5+TfsF7v_<}U@s`=wbNZz#g|Qu!n7aulXn=)oZi z2_`x9w?Ono5DbG4hPDMm&eIgwcdj3gi6p~qqYd!=L<|NhZp0%kEs&W0jW=thFLgRn zK9P%1b1PRC6CTs zn}{1`i{jdf=_A^VE}T_IE^L__H%975$?AZ$n0a6&*SOCdACB;g6*VE?Y#a{4JP*_< zdV%QOgnKoAL1Jqn{BF7iZmCsN_PG+?iIn1^?{s3GUo+=FV+>h8Z^S1bp8?O#jwbEj zwxeOrck19$i|tGGsrB#$tW{&)LqhO&&cK$5@$gS43Tl>2fIZ<;P`^!+ zJL~tAd(b2Tmr^B}?Zdg~cuyL1{so}S(|k@cE*ai@yePPIEeVWAzr&iU2A*WW9h~#P zl1;3$L_6a?&izgyw{TA^yw?Kqv&@cL`^1`CnbAUb1snshe>c&g$dE06eh(^U-9)k& zdC?o=@a?MaR5R%bH~ZN`ESD;QMGiBF$LI)da&tF#smc|z*(dJWze{k&d_BeT+tj)A zANTw2d)~dKE}SqnmJ1B7L)&{};fby|O3F$zr{eKkRA3~AD@s7yhGL%NZ5i}^=MOsC z6S0DwMrx1*c?JIXT7EU0QY!$d4>xhIMkpkDOT)Oh23qssEj=DEn~I*@0{d*fadMrH zc^>cnNzbD~|_Tcfqqg&G_QQCb)jnSRime1tODW$n1pWV3AaV z)2te3-#a-XKH{_P*Kfn-?V>DD`y8k8covABm=D=&^r3jOC%)Qg0wDC|u2nt3X$>+E zo!0}m1S4@$D+_*e3t()~YVcp;3TGaS!*9*fB&FYxms=SReATgd?cY`K^by4eO@>f0 z$BnzXX95fNzK*BNudu39g*g34E57I>+~~z)Ih%j|Tu)aC?kJLjah0QJ$xsDG^zH$x z2c{q^Z_HT@*9*Q?7_<6+!#Mt-H~ zhu8dO-h1cICI?5)goV>A;OUM)G+e|W|iXLrB-gu?uAlsF^CO4Wtj6Q_P~DE-czx&vT3;TUKs z9tE?>^Kp5XI2=rq!jtAey-ccoF1)Tgs0bxVy9%su)|`$Q2SS$ z%x+7@JQ-t_UX~4y4Nqd&-xc&w%UGIa_kymM5jFW1SpfA@K7iyl8&!6lpYj4O(N(iHh-lPIbm@Fxo1|?qAu6y<@Jh z(I*4&(Ha4E1zX{Y%IWaEN|t_<9m)L+uW|QMMl`fN%Z<7cNt`L>owfzyc<%i>bNDxg5YCJEvJ2L3ub&- zf<;~lv@pmF4oJ^MC9g_YuuGAZopypSQ%~fV9pN;#=HYI+Q1so|#A}iB;tr-tv&W}; zIFpGo@Oqv#p8C&-`YwHpoo=Pzou|!NT4cfL!YsJuLrB}+tEhI;UJ&tU1D4kBKrfjh zUeLH}f;(>JXtZ@UcS!R*b(-%D#ac&ETl5Ef%~6Ar-al9?=>X1;#Mol4hc=8_3x|5_ z=#cv*FuOg0Jg@q|g|GUJFML|L-~Q1Uu5}LkWxwD;e?A`b3&v3j#cik;{N83`UP=6e}f4UFyl(ezG=msuX-Hdmix1s08Y}lAG6|?HM!CgT-uDjX* zfp51$eaHnAS92jvE#1^GGKm|%@*apT_yr%H1;YJVgk=sG;`-isEcam(XGW*s&I84; zTjv@YY`6}6YKfrwTMgwB52EuiW8zh{ml^v`rPx`>W-TB=SILSc-=Q~N^BbPxC=qBTA=PdB~B?%opPay9^ zEw6iz1N{1Yk+W$xWUEphz>0N+@I-79u3u)$<;shX{KG3ywD$^|SF#a{PTs<|n)9LV z-7&!!+XgQF)D>KncN=2Nb71npFxsJ3!C9zBfL6sK`f+S71pZT@=W;LLTvL0xBG?6O zmixnQOJ69f3*`>&oJP;eK85Se8}Z}At2F%##kqA-B=c4|oh0T?WVC;AwM#_V7}*2x zDfS+%nHGWR@hRwf#1q>rby@Phw=hxf9rby&8e~0+;QOdTlv;WNZW&D`(r>F!WK$TP 
ziSfdMQwOP_B%IrtOW~<#H1({p;4Z2zASBWSx_iq_p5Kq*Dcv)Nx@rA_vR9^X&O{%@ zW!;Tu9!=-JU!CdJOww=@?e z&b#4?(x*`3CQfes90gO;#o60qqHImUM;LQG5--GF#echwlbr#X=olRWnooSW@XNDV zZF@V6+w2YJR%o)7VV>~sVG#By`EzQWw{Y9jNu=HVI9Ms9b6WM{OmcY)O!#UG^G9;) z=YRK@7?S~-G_x2R9@)b-jSB3DRUsLU-e~sUad__;OrwooQk$|ayy>UOd@>%PTB!#) zIKhy+IG6;^gPXubWeAL;onX#2Wk)S*&70Y?%C0z=J^tWy$`psyA;Y;$Exu8i$0cr48X|QzX4SzIdcx#|5O9J7Si5mPyyzbBGrT zc{v*;$-3KL;lXqZ*sXYkH(ntC3r%*ysgWLDFjv5vhXC%$oQ2XOZ=o^L8?BuJf!jP9 zv$Gz9QRQ9gdeRq;5L3Ztvp~APE&?tmb_$%=gu?7it0BnRhyJ6pP`jZSM;VKeoW_lS z6`oW-Ndfjq9l&zEJWOg;fF7|@Uas?HoJU^{;>7i&i_fLs>%6zhDfiB0&8gbk_E$rNug@s?XP>(B? zP_sD#dEYGH^QI&;3o$|ArXaAtIT}9wOys&)FtqJfW21ijg`yP)1%pQ`u=(^v7VWJH zzbdApM8gd*XszbT_FScYT6=MipCV^|W*7eW+{opeT!5wP1E8yAkPb+MP_Z>hyz@SB zAiLro?scT7nQH)xekw4FqlUQhX$Ryu*MLr%<}*Y(x11&{_khF@Q7&%FYF3t-%02(R1oVl9OFJRS~8RU(s68U$0Dcc_3h8?dK5rcygiuV)eYn`y^1 z709hvLM*0V!~=hA@WPt8BxubBCO8;`{`1Sh>x3@!sJ_6*4zr=F`7DNhy^NPu+=L|g zN>J;2#{HUc6>d4i!f!)0YHRFEFLlS_xQADeJoZ41uCwUSszZJ{T0@)p6tt-b<4!e; zu?yvccxuOHxSiPwlYGWQs)7-?IE;L=*9JH>wuUNC-T=Pkp1=j@gT$9};5PB*NHbg$ z9$6WVA>k`g(YTzh)w=-|3XSj~FA|MhRVnZ-E305c>_6F6Be@w3V=Si6N*Ycw*~@twip%9msnB5|4%BSZAa@%uBB&)9-Z{7ssC?Cz1{kZs%#%x%?y-9vLr8 z-9CdKgO==!k2d-BIDszKo=&QQWJva^1L&h42gk}%xtuMZsfg786bLp@V}C>9n?J;@ zxue4s)Q|8L>4Wfbq%Y&71cf(I;#6EWm1;bF#Em);0{bd&;9R>Itmd*4Bz_73xvK!- zdb?q3|1=i8td(bG)sDxFXYr=Y7H6mKD3D!79-8qRZHBJ&N70u7~KY=@7p&09;=-;aBBR%(HA2 zSk203DVMgfX}V8Q%ufb%J|f2xzd<+Gp2cZ7;!I_AHXJu5*u796zd0_Y5w#TrL_Db6&-wENI<`pK;_Oqy(EscOx*wcD{QG6taDy()S~0@ZpI1Orei7=~jc~PRbIAKD zRS19CN;miD;9%T08rl91J;vVVj+x7`n+N>`)>*E`F;3IC#@U^4bwHA>_ZMhO1svr~d}9`>7mlFPlRi73=`EpoDfW-G*CUEg}58 zBA7U?3N@Qb(QMRHC|Fwsk>%rH+e~G)^W_|jy|;vo&I=~FAsLVqlK}km07(3vOl~h) zg@6C535mobEPn1lU!0Oeqit_7&U*%#GE1D;j#&fU9=51Hwv>C~Siy{rE0K~DLnxKk zioJ)&5yK5%!NcV*2CKxfH3!XwtyUbl_IsYto>O7#!a9Iorw3J?SBY8tB&txihtX%ARq#gPHN1Tg%1wAR1->Z+VM(4CTWW9@wihjDUL!Mf zZV`+gStv#7bTi12rg3EItxfpxKW*mgY{Le|J790rNn-DC9g<#8$Gj9iyV36r2Ll(P z@Ix%rJlV_DY+lXoE_Pv!Z}#%myZMsD9z{~oqJc-zgamqPK##OBH5DJlayE_;I=(xA 
z$LAHHO~WwmP_JbsE-LJiQM@3^JesL^ToFHeMA-(1X!$|Me= zcQ2zEkjh;e=gs|eozL1O80`^GC0DE-!x^bvxcihVS$1M8`F>~?JKMVn*ry_x^2v#A zzMc)X<6hAztM$ky2P>{|$QPb{j)k2Y<`N%g3FfLZl__i&A#OdjXp>NgC7n~q@Xv)X z^Vn?sACvnZYulw11&$3TF@D=4)IWU<4y@Y8otZ1n(#zc;z33NiiDmSY?OSfxq61&N z8-vrgw1d;zI}qP_+hqRqA}B9Qr=nl9nFou5J>~E5m8L9WK?#?fGyu_;&XWHzJ@x;$ zdPxMgn0(Bu$bb{0tMQ(!}s|lU~2`=9$keW zM)b}`(MK@zLMLeT3g}GTaLD|Y3Nd$X3G!8Q@nESpdQ55O;w^52%R~iWnmma>|lv} zdk}_f@ga38c4SA+X2q_em2H}ztmz2C`pGPS}(u8O()fJTn zLo!4{DO0#nO2%kV${W>t_9K}xG)W{>O34`M-nzo~et*Q?`<%7bZ=LN(gyh%3aJcgh zt{V_wz|kUX3!ROJRd2Alj;5e-@+KB|hk?0WHJZ|6;1;XE4oPK!w`L1CxSga~4>aM+ z0RlH<%4qQqjvD*dd2daVln|RV#-3IE%n11fdZMEZLc z3&~vwnql(P>OdPLOnn4y8ds=x^f#vcyb1oOI0Y|6waj^A5ybF7{f8-^PcwqNcA0Y}IUo6imdo9j-+3qZEFQ9D_CL_t7pw znaV5;0jW$=)Dh>FEH6I6A5*j;$X<>b{J4OB_U)!~-W`H4o3CJNcviG%a~62?zZSH| zt5TJ1pICFjW41zh5(Jdq1U0q~cW$U>t)Bx?{z3!3FqzGBUb!&ZNSZ%fD1jLy1mEgv zqN9}>RMZ4v06#=NXbh4Iz3LC!6QWQ)t)7IeEEjbaI#M>tK{P_QW1e~pUDB>c6Q@{! z-t2HVX55N)LsxOi*g|~GE5hAlcc8Dkl||;|gHp^6-1KuIMunt;lj#OfnzbLw?IQ}b zl?=gt{%yf}*L>L0!(sIsNq)oh72-^C84G)M4Xb9q!q@uRbbebsbA?qXxilQW^9$kh zOqgiKd-`j@ifXkb5~FKZP;KuR{`)UktWC_pdyiEiT}=gzEP436!i4Zr52&ei5ql#& z$gFJRNt>}91elD_OZu76=&~6;6qvHKJA^#XwuEcVhU~0-DjZxgk#5ENcxSkW-gQ=%GcZ(8=<5zlEkL(`fb&=qv?*>~SV zxa~$z@=Bm)OQ-OwvL^6v6=FEulP@_Ful3xEz7UXqqDHr;1d3-&K(c;$p_9yHe0jr# zKJpqzx5tWm04AzJbC(fH7N(%>d3io+c@w*x>ByZMDFK~=+uWs|vD8xYZ{qNgVL?s> zHYR>2brwn(ANqmH&0Pb!K?!*8nIAX`XY-1&(!A`C-Dq}W8LQoD4uduo%+oa$b&lji z>%2<#__g>BwawycclKePT^Ccx?nA+ZpEz8&ia-7;o!e3u2)CQc!LHKuVaMkEux8{P z8#0W<&WDTPZTKnLtbGIvLPt@Jnk|efI&kt?8qEG^Kfd4ID2minHunhJLp%5C(TbKi zyytRjI^NHp_8d20KLf02Q9~7I9{R#oWgMYqi{6WEnMZ6eY#bc)EF}?xeJrKBm=-^d zg(ST)qVXXduRJ%UUaD%GM@1T4)5n8XWe4|cV=0>I)v_nLX~bG79%Z{ySbTjNi(HwB zI-5M<`H$()qdE!XN`^>4?MvzrIYh><5x_B-G{Jf%!S=W3lirsbXm+$a{M8=^MLXw` zuNmt>zwsPcZ`vlP+qj7aG)BN%%TTsD`4-(_`h^p%QzQ%Y9MDT9kIvHi020x6z}|EW z6h7)P!*AW>h*~2Q9LHeHe@eH_9#QuyGQtGSszt6+Q*d*L{mO0kAkri9dERa@3p)Q zBlbQhFT6xrEz8+P>G5!m{}%<-KJ6>4ThC6x#N#@;SLpIfJ6q!s*1CPuU}%m 
zWWZF^7)T8O=v$chWr<+Ms7g3H7;P2n&p-VnkZ1$nvBw z7@oN*c&h#h2w6dG@@CVuGB25>r7fIw48-+AIqdoA3>y7Ahx1gfA#=5SA)JdN!~Z40 z&5jIg^BBT|!V)4`oQHPFsUVyY$$HX9pz-1joK+MGIWgn;*{Z9+|N9*6BeW8;>e<72 zpzd`G^&dFk@qDp<(>X)L-Zg^TxKJ@yV*@x)0;D(_mQqVlbg5uFTR7b4{F4QrTt)Mx zDX7}$(TJ$kbbq%`(Q(NH8mOpB{i`Y6dEq;`l#~n;a(%eN;gqBlo6{QoBuF0U!?p{P zXy=3^NO__T(JCL=l}djM{yhm>hJT<+U3AhH996)6x4eVwr(ej_m{7=?m4ag?iaR9SucOs&9`<)j&Y>4b{^{2I`c*81lt$tu{GXVXk9J^g=0LyJlz$q zXpa`W$~^keceyvZwuj=DX*|@9zRm8VGIMbcVD}2-=#$%O@R>6Lrwdj%68#P0`D3&s zTb9@SY9ZzdS97X<2Dte|KeNzGH&dP;XQr90EckW!smQ!Pjt%+Fq&CI6_*e5)YTy&j z@p;4aVc#CK-Rz9%Hg40piH9e+ME6QpHof!*9|@Nv!EB~B9~ z=v4D%=oGh}yQI}h9sFCkC%an&W^en6H`dXqN6vuOLJzofB?dp52(iq;5D(~kL+2%q zlMS82bT(`7snbjGt1u0hqcZ37b0;>DP(1T+GAdr`#>he;Bg5y>CaeRO*6HBwwR-%z z(r1td{uuQ6hiHgTV1=2J=~t7-Oj>0#9VjU0EF>=z?|K`&zcrcn69&QWv=p}LMUN=l zyBD6lk;N9bR}ZF$cetwZ7@W4)7=#y_Nb;^lqK{Xm)9H3y@TBJt$h^CZ=AL`OoSs*S z%I6D+&dOSl{F;d2J{h3>$PZU$&V;kSHi6+nQ>wa58jB}p!?$O7v@NWOd-D4vtFN*K zw==lf1O-d)uC8aQnq3h^f9uT!L~?Hq#N+br&&T%lC9zx)Pk%bVsxL`PiN@0Vo#$bD|1ponsuX z`*$>qL@dUT(KGOrRScWyCrMw$cu<9sAiO@0g9Y=tiC>2wSI||+hWEdNeeW9aqVO8X z9+##E8_tuc&}taks{#X`-od7rzo_;6e6~YQiU|{B__TEkc^T;=T-8T=bTrgMnG>_1 zq)VNg_OS=K#R}B9z@8uN^hL1ycMN1JePZDg40+!Z87waM20idhk(w=$<|qD{MipWc zxp8s9D17jdy+89Os;hUCscJ7pU2*e3;a@(psLR; z$h@5WaH84(R3#jOl@e#@;akSnQa6t7Ui*VJ$9K~9L;XD28_Z?B@P_yo-DcaBn_=wJ5*D2Q42{QC^Eyci)ceF7 zdS1uMeB+MuxNZC%_J7Uc|5||M#p&P}tHLi;UkZnsN5ExL2i{yJ!o%VldV1{wzDwOl z?CH1Ac%vv>ZFZBTkCouZq?dzI^<}6VUc$1Zd*RMIA*`68fnz#g$^681t{x^^r=ze literal 0 HcmV?d00001 diff --git a/src/autoschedulers/anderson2021/weights/hist.weights b/src/autoschedulers/anderson2021/weights/hist.weights new file mode 100644 index 0000000000000000000000000000000000000000..6bfd0ff1b614406314ac7f5a706d02d1bb537b4b GIT binary patch literal 20400 zcmXV%c{mmC`^S%6l4zq{h14g7v~lKsPLegE6iSPdv`b2n5+#HvdqlEj-?E;$pJPc# zQb{UFrILiuCh2#+*Y*43zUH5qxn{1J=U!g#_oL1XBNwBU6c>Ah~on-UgFYOZ+0;N?Q#92{VF%iZU0IL3kQ_KT1U zPqACMf=TnOrGuxr#D#kXJ|34Am|s6a=9=+{(YU{a$X+J~xo7a&v+HOpzXzW5UPbxl 
zJcc8?i`yxi#ND~?31|#mBUu`+slo?6Qf{+=I$WHAiR+|za?ADTx4-77c{Z4>l9j+{ z)*KYJR-yNWCXi--GxD4P=4E3Ry4Agennn#M{!xru*(#Wm_!#sDvmr@Y50&y%U?8!O z5jD$$#B~Pjj8-2oK7Nallq~^&Y-5b)_A+C{1*ce4vq}MA=L-}k&8?B@U9C6iCMM}dpGYoNWVV@`znfQ_VaAo6z~^jl$SwlUN(3cOlP-; zGsN)pT6#5DO0dqRklu{`LF7_>$z~Hru5-O4y>em#Z`%EbRmL9d4bX%D&y(_bv5NrSQiyA2lfP~s&rnXKr~i`w#u4#bGt%;RzjH&&mYsw9Fy~!+vh*^2-nx(86Xf!wTk`0a z3DJ}{x=FBpmKyKcvz6rT!iU5+t(6X?#nMrwMbzT&Z7N<~NB%5whLh(d*sSlBnDgNm z9$wPL8a(pH)#{_b`}dq%U~nFzUuVKJFBkH7WISt+n@Hoft5kRTLAo<(6t=NjAjf7q zdAST}(eV!Yb<$nlu8tt8ka>;Hxp+_z_Ewn}C)-Gp2Kos5<2t=yaUMo{F3~*Ub6hx~ zO$P5?qgBp}$&$RCARoGc;R(W68Cxw((y7Guf5Ysw02_2U8pfz+_d~g(C-@wA!tlml zqGzWS&@;sWaCnw0|M$>Sa_DIro$KXHW6W}R%SUcdxw<-fH}sewNxqxVr}xO^f^^cD zsX=|;UxXpKdvw*$3fj5o7|CkCOz%5~k(zuFZ18c!)s@FVkh6y2pAfJo@^9f3)u~u+ z=?hYi`9MEj#%81YTz!*^^w@;0GDaAGI|cc~=~E+>iM&OFXgu#-VmvkTenMwT zcnMmnC-OM_WyI6Zi}<}wq7FO`TBzQoXL)K=tk0B)CWK&2+f6dS?nmz}{uPr``q6en z36nqj3!Zk;gsf>1pgd&<>rrq7svmPeW4|(4;>%Dc)Ae*C=LoyaeJRa3mQMU%RFf!& zE*e(0f_ylCn&jvBlKBEJT3wz`w7o|W#}b3|F9xBU0p%6+c#sf@gkz#IQJ|$TxLQ$6^7u}sVO|43w3M=WD-;PT&lHh5#zo*4(k(MG8aO#Aa>?d z>L)wESS%U`2T>gSWm_Tiz7056Z-OqfV%*)4%P1)w#|7FW*w(QLO7s1p=EPcTckY0? zK@X~Cb(8lg7RdRx-9)BtJzMhc7IofH1! 
zJUv=2{v7`N_Xv0jnsnywFw&u#P0u#|#j=zNuF2nX^nvFFlD4goxTzJ=Gh=1sAYDTD zJ+>nXZSllo-W@10zlWUrevGtg8*+9B=&ptx>`m)1u=u$hZ9jd59`h~O*7b;OTYLms zI^JSxQZ~GlTaE_@bJ5Qr4NfROz__&+QQhVtV?3G)OI50%^Eu0XXIg>AliV?#XhLLv zB!p)E0?k$*l#MN74YICa(XTd8KHS16S$ScWZWOdhzK6Vpqu9IQGaGuV8KTcGfEhMF zp=w$ohPZ!(=%K4j<1z-JX${)X4MN>bB`BWzh8^B93L9cCfwi(6xcXiIxr|7(pV|uY z-!HM36he7o^-2P@mqCI}o|(LS?fJZw6MnN(^==4?v?~N{xdKM#Mit6*adFC22@q8K zahtVA_<_KCWDhGtT5;(Dtol@u>vj3m zg!JZW@>U(bf}81VUP$i`^i!Np^Ir&ScGV-WpZ4J)zGB-h3Bi}qWcol(ft)eP!X~vKGO2F}9M{K!!5U;OE#lG1 zJ_>fOh=nOM3piTEdQtu9nAT?@_-bRuiySp07o}fQ%?+37)A_~Z{A@9z9eAD}m2h29 zb8{_kv6UU&IkcYUR$m~M9pOBqv`3IReHZan^Q7ZNT*H$h)an4<~OurA~SA*w!x~lZqD+Wgm6H1lJhSbykj^{Bw<-4$9z_Ptc-U48nPB zYcEhmJPwaDHd2n+Oxii@LFWXD(H7|_Vm!&3-?OiRNXyLwN6BaG*@y9Pnx6uLNp?7} zY%7!hJ{Jd8reV?Vhv?2V6x_Bv$eZR7PXbm+3i8tTQS~LR#B&QF`x27y`1cF6;r3*B zb*2Kumi@-#dyHY{j2yH!nL^SFU$Hm!L&5k>GGnQ}1YYnEQ&twC>HdB=y+*(qe_cv^ zZMiVbOiU2u{G9mTFz3zJ9mbtZ96cd>gJ{;KGv4GVtnQCs+)7+9Z$urE=Lf?Er+yH{ zE|mAbfPD3xbn*pl$aMU}>{L*ojV*di>C`=pPHYz%q^tAnzi#Bk?|g>0L->e(*}R}N z*90!B=L*!A>1g6U54Wu^$DLk2q-UWkDgE08D*_Z~{!DA~{bV~kHnf4tU5TbYYU-K) zIxjN^y{4m_x-r9F{GL%RxT3esX#_u)dy|)Q5(Q!Rg30BCExhSHGX-_;l6fWL6?oRC z*YZC9w@5H}?H1&$8)J2Kn*~3XuON~)>UbJ6ACM1eA872)f3)Yg4>`~lMa5I21KItpk?-9VMm06fPaS5EB774@lHLT{siO{^v zjcv8QgwB@R!GB&oDpyHB*N+agpXr1e3yYA`lT!Xe<|%Br>j%klf1z}PDxRKt3;dfO zGe^Y{q>r?Lwf`bKtFjxK#~DDKnKgT4`!D?3@f_tpmEq@N0hW7Sg^?O>sJkWtwig$$ zGM}?CF?u>QrSec^OFKK|{wIk3eH&iNWWl6e4(Rbw4{Ou!!N3*))0_Mhk`AZBy3ZZx z(rgKwaDClEm3YD_69_(eN~|^N{!PYzejvEl+$FdmZOqn9ok#{XLx6YOjA=Qgf(9bH zp}36?Dh64|>732h%-@O-u$Ve>PSNjL-ptgya%9!+dzj!ig+{+yLi+EPqic}_FXWG= z;AzKHo)NE%c<}jjiF+Jr_*zJ{tDeHx<}5TA`bvktS5c`$QoQZ}8{Vx~AyB)}gq2wo zNA9xA1#=Wz=rX5PuvapnS7v`hw+JUP{!RtqNv|MBckHCWQr2|o1yj&pyN(pjo=D9u z24SJA9sA{1J%5&*EcCj~K*y{R*8B8NJ?ZESw4C3`ce%=7+p+a%D|-}rO#wE&Xn^vW z@i1z(2!3_sc$-bCg^HO|O>nUYK4(r%CgvedT|GJ^P|bqEZ~r_%MSW z+f7l9*M+WEma~&@Or!#RHR>ccp+a=F3+{a|i}GbZV2$QPDmtkc9(}SRs+(`Hw`%z`2H*>y<&p$MYc0!(j-V?)2XvYmH*v2f`G^oY9+HlrM#pm>D8?&fuT 
zbVNrm#o3%p;cli5wf(qp!BM)){t#{?n~7mgHR!a6AnQ~K*A&~(TQ7^fxc4nP;kyP^ z$1Wx%I}jeWWilq|`Y@K54r1>!G3bYYRd(ON8fL1Z?z+N%BiKZSPO3_C>Vd+SniNdF|+aZu$gX6-x)=lBCKZAvoQ4EJJP+EI3;iVj}U@WwOz-{|wG z7S!8}AS>%Tq;$W+tx5^#{e=rkd-;s{^0kt#*S&w zBKa8x4gW%e%r$7}c!kkBud>~`v9Q5I0PUvVaI^-oS*M)ID{5gl4{OU?kcINGLY(FM z0($-A(EdUzbT0kEoigtgw7uzKbSM4Au4DC3@VE`?PDMf8*$uGCd_C@-_5~Uhr-Qxu z2jKTA3V!%)=lu)!=B*XKA=tEEll*SX6uhu1p@OI4ymU=@T%Cr%cL>9#@%d=VS&nM6 zbHO;cmMOLQz-;l1fzsYNg3YbvB>cPs`MGa5e9Lkd;4DgVRxYRNR+kwE)h*=9hmFj5 zhbW5YPS6*#r35Pz%?0gxTXWBI%D> zGko^>PeslHE3)By3BBL-i2lfZh%=+RFyFa=B)$$N(b4y5bgnPX$prM>(SdcQWsI!p zA`G$40n0IAk3M`7T77s_C$E;&YcA&*&By`8^)_Tk+E8HWyO+-Qf5E=)Q3tK#5p+=d z4gTAjO}=lAVRhClr`9)$Fk{XN@ICz+N~U*Er;T!gr@N{^M9>V0{ndo;K9e`g(m>F8 z(*xqlr$Tyk1iRXQA!~0D4&4*nh4n8PWr`YcE8$X0yEA0VPF3C%qd17jX@Cb>&H{&p zWwf?(6yjn^(PLc{oxqA=V^ay~Opqh$=_S;wK~y-CCBeKEv&nxy`>6QnRKdc-1AGO( z7i2#jAj-3Jcw^U=3iPc7An%lmdwxZMcI0K$JiV7`&RY&I_LXC5#!L7regqR-Vn~F# zKF>WymVVI|AyduG1bH3%sp`-_n6r60G5&O%+MRfX`)C{qOiEnk;{^61&VX7(en|BsK)%kGmgP=6!WzT%3_03|Gt}h3tfnnS_?<>Tv%mj~! 
zqnLpeFy(bHTeNaIaKG(>3t2yqKfMi|bPk|RP6F2Ln1Lp>pYW`{G`Jeg#rn}+RDL_3 z*=AV)r?(`7{OV7@$;e}aG=8$(Mv|;b5Erzg{{dX^fu-kIR1W)y;WpRVpl9ijUeeA8 z+~b(KRoa-C+XS-0Z`Jcd3st5sfw<~#sClm#96S6$x$HalpzhqDoK=n!M>&bH@@^IY5*t9ry_s!vf%1^IPCtyrwG}cMijg5-?KsFEnBlZZ(fa^}YRI zQNIv5m+x03?@$G$pULPLa~2G5UWbn2zwoZX6nhSoL9f;n;Ou>+x4Zrxc&D#oCdtNt zw@8Yf?0=E0MfU~J?O2HiBf`-2<{$Kyn8q}g+hN~{LVz6u(6u-fIMQMKgWHo}!(bSs z&1=E|sRuA{>IwefmB13C@gSHd#ZLLdfIJ77Z7Cly?_3X)_a=kkv|VS0hZ1o2Q7OKo zb_9g3UyGtYKEd*(1E_5~iUz8~c!HnyquSa&4&{Bi4#_Uj%o)xHp2`%cMUMkZi~=S^e$_-<^b+)-GgmbSxmb& z3-a-EfP1?DYv;ViO*IYRJ$o&ibzb;RrHdhp&BN6e9iW_b5<^Q)<0SP!HWUykJA)?!a)Z4)WY}xZyL!aU`q^&#m|h-p#iv-rau;){$x0 z*%1snK{s*ma3owb9fr#40n7;Y#FSmvQTpF>A)oof>RY$M*kCj`rb`R2--VsdeoWi* zN$j?$&#-Oh9auubFz{stsx+N}FW#?lLd$au?EHz|Yuvdr41!?9AQ+le5N?x)7>$PVFFJjtI_>OJX-e5WxOL~z?+U$Tvzyqzb4&dVCsG3CH;UbhfGlM zszYyId+xn2&ycfj9TfSj$Lu%t=rLIi-}c=`|H9YIwR$&v=U4;=Uo>#m$5{|N*9JZA zMWRb|A*lR&ipJF@torK`_H_R_(0PJrupxx?4|0bYx_4mP_H-!7I*-a%9AM|+6Ug^H z3`<}d#Ab)V)-FC8JRJhTMi)LOdj})^%n$YZLQ&FT3&wVQfW*)%OzKWot0*J zj^b+c-?JP#H`>6)Z#U?7`*$?zUC=-WY!{r1f2DbF6RyS@^Jx|G=N_K1#RfRn>!{$_xGGXSLxEauk`NpyyhFvO z)#0BHE~K|nlb#9Y@+1rb1pk!R3H-lBP>G3dp^2p3DzoSB40}aN{2L{PdLBDemRK(-Vtt}l^(9N&H%})ufaB2 z01YOqF{Aw{M(3>nzK9nLOw@%sVP4{Uio=G$wNP6d0{ZhPbW}NGb9D^ce0VOqRZRf1 z?8ex0(~EG#u^r}Y9EOjgDKK#V8mrt`%Ac{i2V}QShsOEuFyv?=^1h}q9^-qU&6?o5 z6+<9%Gy~fQW`VN$GVC7L%ZO_J#+pa=;I<(JtZr4X1?$VnL9GRXr~5Y1KXDu~r)&Y) zHO-ek532<6JHEKsum}5uy4B=me5}3h58jqH^>%kebGiN3v26Pwc8QBnv3c3Bxqm!a zF=C7v$(mT%8-t997Ok=3VW0JH=54AzRSr8uH_9e*uk2d}vl|nk)afF7M9YtNPw-rD zY0Dmxd(jLN&8`R}Gs1XxWrHwg36E-}nqxxzIKj9)2i_ftSXQ(-f^PgaLlDr_0B@JY zlPvR{)aAVbwclGs{HDp$c3mk!#kWA3JV}Y1JoXr}|A>&uB1ZJFmpWa3=pepJpF&$# zZz7S!s-(=qTfhkzt7cP4!W5wwc{t(}s!xZe)A#tFj=k zW=3WB3qG+Zr18ZzuB*S9M3ZYWDU4oH&m&#| zwsf()s9=-!PpCPUnth)osyE3?GA8W9JwqV;7Z3q$e z9n%5^uunG?X77?@L|vMhnje|Ca<(hId{c(5Eget%Qi5o}-ei1o$DZ6w>x85w+H^nf zA}hAblUbc`mG-zyCh<=r=>i88!7Q9fX0%P9mH{bbURfl0cG-n;PH3{0(-P7C_AqO_ z4JHfSnFIswu6YfVJD)l5oP-&jZn{w*}uV%Y+%H2BdGteumgZ36^-Pf@$V$))?)v19 
z=VXED+A=(=JdT9EKFGTH{$?HjONQc;Rgh;e3f%3{?6ia*P^&N%m+=3h^Co$a)l0zi z&YftN+>7ngf8n;*cX8JJIK1$XqP#}})4lvVvs>1W86K@*8!z>P;Lv@pZGS9SkXUfM ztq)s|zJzlnWw^9j0GfVhnci+WSYPr5i;T^I=NZB6T-C=N7B>SQr$C6^;o&1+oul3h0>62tp}S7 z1F^;_3#D5}7`ZYBWV#>Xspij+qrnG9+cAi4{tC~{A7a~a1#B2Ufb+i(!;thJ$Z_?B9?yFy zh%aP#y0+XX-%QMVS-{v^=OP+C0jb1B^#62>)ye+J2DoNp--4GIu67)}Z+~W$jtFz8 zXfApWTXQWdO|Y}@5A?3{!QMBe;N023=qPt$=U*4dFijJ3$d62KW&j$J1jw6n77QGs zAbNH-I8O;+JEx9u9S55rF}VY0`lqA9)dq}Bc@1373%1}~IOJ?Ui_%kanGsK6|CZki zEz=Z1N656IH)b+!Ru{o(QV<@VHwlM}@4&Aoh7kS58A4`tLL%6L)a`sUR0#*&VM9D8 zsf?rC-Eh`L8iQUggU~~}m>kJy{JJ&(YktbZ>EE}RQIRz4F8cwgirKI;bR~`qsp7HX zuV{R#nNc$c$Hjdgp-9=6<#+vp(Rl{A@@NF6&#z+9t{irEdN6fH3)!0DB5cvt$J4jU znOnt`$p5MaHO~<&tp)s^;Rp!7*@%LNAuuf~8b>l$VZlOc9R3px_LjoFZ~q|LTEs%z zuPLnj7jxG5MYiy{Q!ps40iT>#!_tg2e3Sea?XRtco6!|8O=~Tl`=q2N zPe#V0!P7uFjQGoLZnU{cJJFUokM#h5ZAd@N%J=1)Qo*9Iov`Wc!W+>A2gw?T$>JwC}CVL3&!VfnXI zc$GAPEZ}sqC9je}B~FA`?YIP}+Y6OrUQn`E1tjm@VwT3NIZU z1$%x|$rDKbcn-I%7J`%h|mu8F%UCNK{Y9sdwx=caDC4s3O*e2-LnAQXc;7nU0}NQOlFtL zr8A$eBw$IJIN7V4ic4oOKlj(My~AL#aEHNAIQ%0vcWLr zxpczEQy3{f50sr>!GPy(;JLK34ziy?^6*K_6ZsXSN;V>7-=BqP%Z`vaNW2^c;>_a zp4(`G7ORhfW}6l~EfuFTmR^U3_LsOdVKzz5UeBhf4JE$Ni{Mh@Kvpizjp3N=gqaVH!gkI#jK6f0y<;a$Ya4waqxJ@4_x1+&I$A)| zEpH|?tC`u%afje#6JY1~WHu!!lpR{Lihp^h0k`qe1;(c-1lI+cz|aN(lvMu1 z&&LnL-cAl)Kl>Q3E^q?#329)qcR*MxOEAlE655%Zz~s$4aV*jd&K1r^3EI!E#Eq|Pu-?2IGdCZH!J}fZ&&v*8wHe~|TgRbx{cl{g zsth{|>{zQVTezh20O>g%Guz*RY2F@*-ZdAX(^;IX$`z#&M?Hj`fya10X7R$)wL`#Z9MIUuZ`tHOHl;rxZn@Un>*Quv0+J{8C{KJp4r4R2y!iCI9{ z=VX{{DZKZ7J}egYXUa-*@&4j7cpx^8y_L5eNkcxYsg{5*)`=Kw)`e~lGH~lRWvbeF z9h9_$dPB@7Ncw%7^=Z$?h$>!tp?StG8B$qX4m;#+C_dzne4A)I| zhk!Gl;Gk5@{{0rlj&4}M1Xb<9q8}Vc2<|`|SjpWanZaDXR)vGLBixenJ+QVU3bnPf znQyjAbcUidYu)e4+UEPy%D0zLQC$`e9u|dc4Wa)p@&K9M(8SI1mH}fIDVlLG8{U75 z!_%|g;<5*2kW%Hh68&c@W>}zO7BY`(K4vFE5(KZZDpUi~tU*cks ztFR<4j(f=NBI+Gm2&v^2I8d#R+;<2+Pu<2JA>I&R*9O}8V`!~j0|}F(na#WHvGKbS z>su1R%I)z1MJAYKuCIi^%odz46N9{>YG~a16;8elf%=d;O?6g@uNV@_UO04u`Rq5851I4SQJ`?WL~$6wC^zXmznXHg0(ea2D7 
zxsS=M^=AwfPocq!@i5oXfjC7?W?G)b!=Ai(pca(PwtIWAZg*etn;8XSB`SsQp5I~A z^Ze`Rmj}Ay;-QMZ&YK0uLL8l6&dMfbm ze0k=>sh6yB-V4mz7XWE<{&5Zec%zHd=;ZG>0W_Wy;0(H4+0@vJb@coZkgZuBvFtuSKoe+|V=AP?GwZM(-)ANQ~t9Fp9 z=9`6n2N`(Nca$!$ydh+_S8$j1d;D#lg}3yjY2-9yIMuGmek1&8SEr!rT7A})Q^wa* zl*ez}`?xyZk5<=BLygiS%yOGRP+A>@t+5+nT3v9qhQm3p8VU z26!ksVT{9DCgkiH$QxFLoslPy4VPxahgry;X^!$6)bMh0Hh;~*%kaS~78eH|q`jJ} zS^uo%WUrhaUY^;5QfZ=8`mhZ7RV<5h2Zr!S(j9bFZ^6ADjW9mJgjUPUq1>K5xa;{+ zGE;5??$e3Fwl@hN79&IY7lvcW{OM$z<2`2Em@N|3D7c##2&Ka7$4gA0+f@CS`7W~b z+CUrHPaMG16))Mro^7;_!8rGCP~QfqL|v0!{D}}6t)^> zgYK^mR#GThy4tORzHe)w@y#G|uV%5&3irZ0T`BlCS&E%DO$4G44yZl7 z2~(e}g`f%z*uCsAb_$Z&rgv+ApC`uHc$%ZYsux!Ls=<3dE-^1>ZbRAdEasVYA+8N~ zXA6$L=Vl3_miCvu`9qc1hy7x&F1rLJHg{Nek)M#eU_4p>yNX?w zehRj(9#3P9d)ZOrHs<>DkK9|EZ$h`*Tv*bV%vu={+%ayU3M5)3w< zLQee{>)9+#(<*1dSup`d7K?+vMGp4;JOJkxq`=JLURE{i82(*9o`_VRpnD@0V*iSp zD7w}dddC#`DqHuklV(@IAH}n9Tf7*sIGP=stUzWZC{nfIlQ>7ohwjRoOb>sNBpv5? zxPI0&l4seZ7k+98<{h@ho&6G2&?*HtM;6lcKhcQ3mnU%|xVS}q7+*}X}rIyOTyS|)G&Q!vP-X)}OD422Um>{XGa4s#A zpHTk-NPY`9^TThAZT=P(C5HbvkiJ0qOiAIxOe1RUReCFf& z1x##)5=CPHY&%g4hQgkz>R|x%`)DIitsFa34}sBtb@(K0AJeQlf;KyE!tOV(S^G&P z{MWM*WJv}f(bb9_b=dz-ND~Y6>M+je!QpN#41+~v8H`<;JTmCN3r!H z&faA;%_SUBI|E00~lN8;`fVV(~X5l7I>YAWP~?|~nk z@8I>23$FDpgjTm%_{s1ye09IWnB+x(;r6*qK^5g6H@W~G{_k0lllS4@gYB?Ad>+#w zyx-)eR;+yS3G@P%!p;_7HvWhz9RGI+pKgDPEk2JKncp7e-oFBH&yI_5f9_*!s<%bQ zA}tsrcOc7C=)p@HVP^if9?qOU2>dY?Sq zXxt8W&qgz=`#E^KbX zyFK~1JNXg**KbF2R3|f+Q#4tqYr`%@D@ym>XU#^t!E}cO-LdLDlPV)g+J(Ghm7)aq zxL!PCROOBG#bKy1oCdq^3w6qYJRCN=!e}n}0HgA=S!?q*ta48xKB=-`$`~DVG7ZE7 zLa&FnkqKPQ)WaL1*Vv)POPDvj4&~c5n9z4V{5zZ1v45{-vud1vd@@+YEWJ~xx2=9I z+j~a>rreW-=fmL;#&T#*^Gm!D+kw%MhWJ6S4VyV2m?2E0p}QoLj8?bKm;D(O35L^e7Jzl)W;pgTqj;uY?a?2?bV76bo? zo_s55wF;tfvByc4&r1CF$%i-ClY`Z0OCIici2>{jvf!*TjTk8W-l^vs{bW4`FN38v#b!7awagw zlb28lMxDGUzY6U0T4t`%QIOSGPes?yW0l9g;<<-2!AHIo2mT5@QZu#TOT0hMZz{v? 
zEACiWE9fcC@V%KglaDG`;uX&x)X>5d%?5VU!`rZO{Ux0BPyluv1-QdCLa#si zAPwx@fFm2{Vf$SkY;^M$?jxyZr$+fRW%ZMpEejO!jH5QaVs{PHpSYIKcg)0iM_UNW z+RrE$#=@-f5hh2v0WIfl1x@Ehoa3_yKOMS;1qKT6{?KCdvCQLUews z#Do0tFYtWnR#2+^3AI7em{P#u7F9jMpD$eT%9~u)-Y*G1c@8oHF(<5134us8D;!$X z4Xtvk=vx0l_;}V2hZp6;)OZc%;Y}0xz{K)vb`dg1g`rmzJJCBQ0sg-FhxhM1V}?d9 zl1=ZtY3ZEb*yg*OTYB*v#Ct{I2O-aYV37b@T1?=9hb7Ig3f~ z!oSXY$LRG(0ary6q8Dx@;{unEIZv{HyF%zQ2~wlo`eOJ>#FZE|^bwaueA1Oq2uV0g zZR!e{#ov(Cc?IxuZztfZ5A>^v;9+Pn_@J%@`MlhI&0~9 z$**9rOlCjbkfE0Eo7m+GGnv^@5iqCMo8i1vMbFwe{5vW`;#{@J zA@S;pp0(Ri!DTDq6_!Hs=OEP1)1yzluY<<#e*Pg`gObnA(E~{F`56mTB$vp1Pb=nA zRuJ8BaSGU2UnP6;9-+exJ8JibA#1i-(YgEop#6#gnz*?i>K-N&+i_;FPwN0xmaAdX zPjGTYFDlIERCf@0yH{Y7kyqq%-H(ycr z$QvFBu37<|?H`aMW=~pluE6T)vO>-BiBJ=3z;>a(sLZzkp1#o{12@fBr~NWa`yWrN zy>$%d+G~>U4%&1{sxi{MJ50Nu7XE`hmn+zvokXwyK25}2)rrE5E7%!j&E4=Wmf?gS zVqecnV`qF=MXQgQf{&;gEpl`Lwavd2%tcc9t61?c0oICqLqlB$wE2 zFu|YU3#p;TbC{A}501Lh_&-hVe_ESC#2aQ`uL$kAKA-Zwtb|yi4%Wx@X@bQ+hK!e_ zI+h#RG{qveeZxnLG7iRZcJJWHv=sg-_x@CPE1GiWmtQ$i68592Al;~Aw4^k>4^FVr$0w? 
z=PbSoSDembmP#1^Wa${I7coE{l`zCM*RQy3+k0kKf;(4fk^@?Jg~R#8iCooXkKxZA z8SGcS1q+M`PB1g%{%d#wFMeAyD(+igipn!)>h>0RY9hlwqNJ>sx-Fm4YxT!7f&h5* zmXAM$`&9B>r{FmCMzr-PXX{Tg+@&`ozPG@otVFr9qW zZ@@*q$B|PW#XTu@0LNX4Xa3%K0bc|YNZY#!4DZx({M1zp--3i**bU2()DJSfr}*eF ziOc?91!v-qb=C!NBA%p4#!x9+Q_*51Q9SqDpEXJn4Mmc)M;d97lr>bOY=f~!h?JqE z=blF*S&~SkWlEMzmKMoGywCd|-21urbI(2Je7`4IUmhFI{tP_BDt2jq1Z0W4wBX*u zfVOv8>e*r#s^|lj@(fJwD1gcdLpT?w#FeRDg}WNPm>E;cU+me)_SNy|l^x+At(}WM z@3@QWdSy^wvjlr{1DLM58Q;(IFe{uBfnG)zNO8a!@byZ7!NxQ^=5_&coh?BrHJG?8 zFCjEg6^g>V@KK~OjQflehjXv*)ITxFH()5HerS43UAojY3~H%Tl56$)O28Qivb#{yc5M0Ih@~+%J-Ovz>b+S=@$P_ z5Z1o}qL0f)SXjt$ESUWq*A3N?ozcrh>(=6mw0X4r!#%RGHXrTWzG7F}N48Kb8BNM1 zvFwdH>x$b@-A`R_{XekjA`rb$H*p0a{aIsHI{CIFx1cr@05Pt>=C*l09o6 z@}jly_R9vcI8lYVocRT>hh4(p+Fz46^}`pXEUXd}2+B7Y%Uvr6^DN}i-2MU{tt{a48ZF_R$dmr{fVD3B$JG)O+=wz(Jjgq1t^@<#*KD-zdHqRhsN`?64SuIxREEV+wbjYabGc2E@ zFTDEh5+sHALmfVZp!;7~=!5+*aUg`VHlB@dof3g=@#N%BTcfvND&0QV&2Bzd5hgjM zz(B`Wq$1W*=E?+TpDuECv);nPkR`O{yEHXgkPF@2i^)}QDOzwZ1)6u-p|<1L|78a^eq9CU>Pudcthl^oPZn;Kdw1`Iz4hq3)**= zL-dptEap})DcW<0F2i2%+m#H%QIX`}`e2w!c+^SdJi-4y-?i2Zhh&@kWcu>w)G}4t z_~#ch;P{T`?8U_bdMfN4M6D@<(Ws}OH*}Fj=>7>mu0MzO<)y-^E;-7Ve*|Wmz5-8D zh5q$ZET^Gf4=vOG!i}*Xi0(lH_%^ke)#i?&&ORl+)S{0hqO1s0?nPtPVmq4InuRH5 zPob=_jpYZ#(v7-2PV40)cX5T6P=D+OSsIos{M0o9yTvxanOQ-kZI3=TB~FaKU0R2I zrxWNg&VsXie~X%a8KV&r;Z)YtkN#P{58%FrF#BjVE8O7(<|d^$T((b`p^-xsecW)R zQX{>5R{vNWGJ6|Rew`BJ;#f_WfI)5M;>S@1gOQoFy0Kb#_kwnH6UK+#zhn`U6vH>Q(80k|Ef{sN6hGD# z;jtJ|?LluG-8xUef@=|AX?_%%R8`@1)@86rdj|T$w;|KlA4gIX;n6BlAEj^{eGVr> ze$W{7I7@+QWk1dtFo2OY_t5g6e#o7;3Ga(x(4Y^u=Jrf|fr9o{mT+uMzM_i$?3Q8Tk9b4`lG@YsjArgiz0Ryv#d{hWib0 zw|^FkTHl8+y$tb3lrcncfE<)bnVM-wWcg zbeS+?$4a=c;3GTy#G6*6`;!AQ(!kU7#+8n0z*BFcPELV*d;b=!9R3T!r_3c=^2gbw z!(rI!D9veJlLH4k9grMU5c*9F!R*#~hNj)i=!f&GP|9TyRZ%j;qMS)qI%x&1W+%vr zaa$PlkzxDZT){gZcT?pmifmAo3pH`!YComX3|CjI`tg~SceH_>+EH@Iu9H1o+`#W~ zk)y`Fe#}opO2nD+;l|G8PE8agx|KtqGx$uc_sF%GVs=3%#pZGPQN$(U9o z4!ZB9h1G|k_O~Ed9x6jaZrxxj#B2BtvtAO-=t@W$sKN5m^N=qYCE{x&VPDS|l5nqp 
z@YZXfc=dYVw~KR*6E8XD`hoqea}XRxlA$Ec4L82}2iV=ML{0HK*zNL$z?C(4#XAIV zuD*>Uk!=DU<#dwb7{-1r*nqBmHpF92H@lke!2%4^pjx~aR>z)ZUB^QSXYGz5tr9GM zr9ZTl$>3O#o%_y&DvM7)_P-F&T*U(377#EfRshTj4q%|H^jm^Ja06_ugn^HcV7 zi2{UdUchqshw-?p6)WzOC21^ySnIpvkcdG)lwksyR1?hvDYRJ5pHthhNp$ZjS&7sw z*qwMp)I;(>wU8y?tJA^`#F~Imx(n(LhQYQ^zOb!m11o$|h$m|Uv2mK1u;y`yXof*P z*7zu)WrzluqDE{h5Ay~riAB+CXiKf2o3fbM7zR&2Y@f^=0R#B@5X)RPW}^ur^k41S?Qjj$qaC43*3A}&X+i+J}T zEIL-lo_nXzm4W`SdW9@al=p=*TSWY{T_)u{C?efz)41EcZ&-tqm~o<$BmLEK2|wz> zE9f$K#p<6lAqy#_+ZWhy*P@3oNf^Pgf&>&z0b&2IIsCsCko))z+1FqyoG25`RG1X8 zwHrV?Nw;%*k9-wF&_N*U4?3SilU#?~~$n)o5pR9}7eNh5rFI C@ONGS literal 0 HcmV?d00001 diff --git a/src/autoschedulers/anderson2021/weights/iir_blur.weights b/src/autoschedulers/anderson2021/weights/iir_blur.weights new file mode 100644 index 0000000000000000000000000000000000000000..3f806a2441b7a73f10863934e6f819c518ef7c35 GIT binary patch literal 20400 zcmXV%c~s2*_s83XN=TAY5@ktBlA8B@4TUUe5&95WLI@$*Nt>k7szqqmM$xW$-`BLL zBvdMuHY7{2aY-Zhm|=Kfxixppr(m-2@WR?el5$u>H0t&3dADIg~47ci(U7;R+tLru?Z zlOgrw{^?NYIvJqK$*T%nV!YI{$6q*Qq^wh>s6bc-e%3UMb~Iz~@|60_{GA6q0P zj^V5sOx<3DuGe3KB>S6@73yc2D$~%pya`GwwIF9G2e-3DFekbi)(kv?SXCob$WVv= z=uAdrQwBsEo3M)YZeV&UhLMoU1#fI%OjUZAapHt>TZ&l)aUF=SeE~LAt!%dw3*0ji z%wEaY?9idth_UbR;&x;FIpHzJ_dZ8M?nBJbO$C`<8BlvU1SNc%al)()^zB#x7QB7H z|JcMiKU)Y`%|g6wtdCyd4_RvZkj~Z4hT4hY@ZXO>^gOFd&K%Svv%gvMjpP@Rk>X+u zKUqraKD85#u+QXc)(vjL0dKmtV-YSDZ|6v?@q~ozNVItxjh-UxO>Oi@#Gx+bTCv#`1xp4byRB-1}Ut;dv&eYzC zr!nV5iCXwD`Ecty>23^RPJYu1cc!`C2PSgAWA*U?blaJX z8K*8l+1*~W7~P5;d!?ywqYrgY_y~v1&vNY|74g&36j=Q08l4#CPPbQFq-k9d+$=vy z`u#vHwa#9~_qota_HXHA=I%--^QKhNm48>z!N(tY1!3FiP}gH3Uiu5-)ECgSUxoPi z^9VXF>0nLXc%y;l7;yi+<2*IFhT$!#pml2^d6E~2&z3$XBSoP!sAmh^J(dT@3{Jp` zyJ@5%6;-_O6@A z_sfEg>k%};c(;%?y_x$WF4+!C1XQZ2mXM%HQ09PA3AH6VAt(V}=q&MeEIp z<@?KG)) z!;mjGk})RvC*Xcp>S#ZS#=Vm#<#GNrYOw<;c_NGzZuV$UcnbKBS24UZeD=(f7?hho z1D{#A!|r_x*}tnrsJ)R9xg&Lp4y&)EW_xR3W{Mh>`ZY-Q9eF_q#nb5UKTqx_`4U|4 zPl9ijy^FuaE}hIj@`x!fe+wH!9qD4{0%p~bz4S*;1e`he87eH3ap9Beq@O*EuG_o| 
zrYC$yo6Wh*6XkDs{?IZ=lMe+|x!vp>u0HBZ{$LFvQ;4Y8COUif0@^NXgO71BU77HW z3=Taedhew89vMn-Wy1qf)bfkuukfRbc246PhyMp%KRwuG@&p$7m4I7)ANYLz1e`l- zz;^m`w(5-q^garu-8T-Cr~cc~_d+B(cTeW#s@$Qc4EMp+HIYQOc`L@7dZOf^KiDm? z69-arV7xU9yVw*6ZT*8k_FN)Ubf!_EQcak>dpfs!>O+R-CdvIgC4^ovbHyu8`D|QN zEj#IlFl=ySaoGGL+vWKPjQEkP+^+v1plk;?6k9>ZrX1Ycn$9RFoWgm!qu9`D1$j@r z!R>q)Z`s{k=JMkU^l|1>Zk4(@dGbk#t065!%C)N1-w=Msj`w3hljVXaKv}`__T5Wq23beCdU5 zvu)VW@rG?!bR241KVV|)BWRLYjz7zPazpCChH40|?z zWdmZWApF`qP~0&LMe>>Gf36e4f8J&)wHbuhtI$@(7Y(g)Q7pZU9o#(znk+yyp)T$Nre{#8bXI(xpN4e5MF?`{!J z>foT<3~}HWdvU6CM|nOe&1l`7$o?mJ6Hh-{O0^wZ;E?Vy^3zZjTfCmbr?fXXUD$-^ z{Hle9?H-hF%_K^xy4)uBPIj}c6nD!fS1f#?M}O}2q$O=?)By>?{7voXkBd(nS8uU;QU?; zwEa>6&OSk*d5%U4Xb1XKRe{iV7f$o!Ef79q8=JJLm`SQ0hLJn^c>cK-pW~>+-Ciq2 zI<2~>4a}hL4VRK*@28QIhWqHiQBD43n;>#-y(sksPa<!%mM!E>K=n>6AcqgSul=FP?OqwOV z?i&X?jnd$gTs%ImdCrlP4T2rouOTV(J$rvm0GPJLGZvalpplE1pr3^s4}XXAtN3h( z)p}xO_J{Ylet`ZykWK9VOy@=oUBLJ0N9n`9S+J}$iE$+-z~Fl*(_le2 zK$W<W{@&4i>w>d+rNVEfR;0p;ok=eF(KQvQVfy;n~onx3D?e3*u$|Lf+>2cz#9< zcvn?3C&UmWk2ip|_d>j^z89({nn3v`YxelA5$tV!hq7Pt@oNqrpIy2Qqb06T9wZDl z9`o2qUmsy~_$+vx$VK&S&8%G67YP3y2ThaGV4B@Nyx3`krAg1Be;c3aiGK^Rj)`FW zwH2MJEI=q^jiHp47hRg4$e*!sg!@GD0ts$O;dfU(;k%59ahdNqWMG*;a8GSwYF*UP zM0hXcH1I&(Bn^eymDv)t?YQKg6m2oJqS`HM7zxe+sM^UN$4s4je-bhJC3VNIs(LQaIbv-|9BpIH97HaK0r`rrRoLq}GQYWBiBY;(7 z1w5O555_hvgpt-Kc=_OAJhbH*B%eM<1A64>vTjN4#98Z*b2yo%*bLF&z2BKP>(WTs zyn1|?7KIvL4xr!OU#zK|I$iWqkwyys%i8s8F@0qctFmz;m5?!@ODB8)Lv=1W{oEYh ze(HyvOI&EX^$qCTp1zl2ob&V9#?~x-hD-K?7?VJf2p70tM|4qVpk1p^l9Aek6twN3cKbbWb z4?<&wGL;dY0XmoEXx>RLift1}!e;I<~1iG zD&1|vzxiG$rTK`RSnWgo=iHz%h4Juwbk^)XI>6OSjUUyiz&X}E?T0gjvgfz~p0 zsu||TiVO_l_HEi^>n4iRR^7(Er!45It_)CDF`%iHS!}DU9w{^Kfb@)N>RK(v9}Mm! 
z*S`fIS>Q;ca?>#Uo+F+O&ji^E|FCDaIeJ}PN}k4v;3pLwerTQ}s)z0(4?e2#Ke+JV zc!vuL*S{xIRJ+J*CYWz4d7mhD#*!cNFG6X`PHxNBm3+PmmwT%-m7e^ajHAa-!Gv8h zxKTTVeyH(c-yGQpSM?IG&q4$9Yb@!d=28^?+Y7(*RrnK!b;(hWVb<};J<_A2KvESF z$;<8Ka7(3}=00@fM->CzaWa8yy}6S}o85!VG%0TI>cN5=Dvk7tfja+=g*+8G9!z4U zj>07EZ8&YX4SI5~vvCKja6tV7rtLWkS9$#)FPJks279pl;R#k>T@P<*x#9)hZ*+T8 z3Yrb;Aua6}By_#U?FtXj^&1Bi4)7SW8(+|fpMxb2mqEjx1{AxK088G##hy*O(A54R z)JlAXfpveOVp0%Pw7$piJ-68|!${bCkq^xqf8baNVwK)ACL^ns5qeentQJ{#HlB$} z?v2pnC4;ut>!E$=H;&xg_t4PR!5B{aiybFlz|-mmD0c~i^2?jS%4{v}mH!5nGiQOV z*=OkbsKQs=bbzbgb%@(7AIje@eH3kiLik!s7E*)v;@qTVvS{!SfwwOZUr&C58-*!M^uE*>{Cax=h@gV!jekUG_rZ75I;QK9p|J1y6iX z?e{N2)Uz3o6duYNcx$k>TSA~q`kY|?#p9%`N{sjOqcOX3nYD{T$@Ue7@V)g6*)*O< zZS40^vV)e1?fBO+btl>7AZv zps{aAv(gf3RC|LYzPLbl2kj0m#^`Nq#$U$X?5NrGVjWu^19`5>zieVd~o!q2!BaaHxL}x6kc`Nc~i} z7s9c@SP!ax zP-AvlJcaYy;z8En3kaoTu)bQuY}a}TR^68ay5au-uDij~t1PMpc4EkmVAi)T36gS~ z8UDE_rhKI?MyJ09sj0EhJ*0!`vz9;nzc(m>WiW#wB z5TnZqZG3L@6a|{mFcX|4Jz$sJ050A318r8H1Kq_@;JCb)4Oz4Xnykig@Y^3y`BaIn z0RgO|L;}<*CSlRt1aL?<$BcSe=tF-9xY5l>O1{OEQ;`sLc$`(+P>vQ61HAG-evtOT z0VHKo*z)iQT()}&t9ny}$?G2mkHDJ{RP_ObJc0~GqOM{{)&smF^%p8J47ZzwqvnC{ zu;qmY3f(9xh~GUQ6o%u`KH@U0yB`d#Ie+0}#YXHtk`FyPav*fzz0uwm&%rfmB{NMb z0$ha?jHDLdVYhT$2g6ovZ6|sAh%0ebUW*>S~9*lyR)I5;mvdy#B*-KrzFw|+l1Sf(+} zx-7`vn**GOzTUtRi?JNf5p2umLJ}5L8X4K(V zhC*}gIk0pPlAf0{Hd{qNmEXt)L?y#usUC9A>2X44i{WTsK3-kX3$9f$1s}_*!TQca zY;W}gJ>UB{;CKf-HV#5zaX+T`UBU#rV3hnf3!5GOuxqUAVSFGQ?2{w~@3+Ht2QQ}K z-86P**jL!OCmEKIK=f%!LG{-c;G1g;O4q(apY~yNU3HG5XyOZ_CVo(*ju1y)VSvXt zQ&Hl91_JL@LE|q|RXPI>eUry!-y?V)ZwnFcvY7NJ93~hRVVKP?Tyw+?`gcrXos7FM zeBC$3Ve>7F*I5nCjT_*tL@}Nlx`!6sDvWFBBygqU1;JDQ;mEY-49qA)ZtM`G?Mnsq zE9L0wZp(T8tqz5ZjUmfzEk0^{A@C$*@Izl5dS|vUK`)%~qkR^beA7auP9=y`*?||I z-$5tA-lP8SEt(c@U^QEE+4JA8f?f@x$!34n+xHwO8YaWeT}kjX?Ha1y+6Q|a&mhm; z5te{FL_P|H?HxQcdHWOi=1x4JN4puxIxk$)7l0D`wqa!JXNV5C#U$=IjY2W=+42?( z*qu|1-ustByZH_%zmrL~FA(Ry_;DDw zJ;!0r2XHgqjv|U5LC8d#%?t2`+=L@^(T}w>@~F(gGw>-lL*Cv$7h7vb%;>$G9t5l(z#e}SaADLr!d 
zJ=T2GBO})rvUX*k;E#zCUuXUvex8pex37F9-)?;q>$3U~@5Y8OGFQ!u{N5qQH_~6t z6)}25F58{pU%oBGx6(_X3eDFkGkO-v-e01mz>b=Y1rareRwmfA7lf1R@$%vCO!Au7 zq<^zJJcujgOSp*f)3TqV&zcA_TYHc^oiLZX=H7BL7(Ib+w)7@uy_(G(I2@e~cZL2COfs8sui{wJc5+xw8YIJp}d ztO#jpRem{ncbaj3`ClKolx03+> zPh>?!4VoTV4PwdexM*EB_6fMvbZs7%27806#eJi_t>GNb_h5|Xs8g$;A$;I?48GT- zLa(PB@>8ABoi`unj>ypD2w!N*xDIE-&FET_NGg58fSr`w#U^Zh0Rh_*@$zmD0jqL=My}knd#TB^|%iiLk#$BRa;6uf~sggMw zDeQFpP}&ipO{MogCC7cfW1Z$~er5U=3N4pNpl%`T+i;XLI9{V0W?R#~2Wx50-TgH6 zUKr_~_W?ex5aSEo9N;)c)`3N%8#`EKjCq2Y<*n+9l5`F7FITaNSMn#a;@Yrz0>Q1ll~ikCC161YL~)!*ja)Fv=$rWbirS+ zcP#(mMGr2JhW#Hn@a4!ns0t5ehq?(K@{^|9f_9O%u?#AjDnws=*g`~8KSN=I4K2%* zf$;PiIJwN7j$Mj{foOkfy-125DjWl>&k2|!{($5KJt5u?y{XWdWvqpKG}^`uvZmYq zF(a0g5o-UsD$%3~+;(T+Je5n-?`uAKZ&!hi4NK8oHyS^?wc{@FYdBRghxy{>P5IHb z;4e4E)O4C)Nc`MM^x_xFKAiU+7};cR}xs!DEYIm^x01^)cOY;JRF41bxWIA8Jh zT!`}73_bY*A6s@3R!q0Y*GZMA+S3d@=lQIw!476*krQ>8@q$P;oF^GmC8>{C0EzN{ zNAnj;(3mnS63}v#b$0*F+AofWoU=ucVKN4sUE!?!gCQuLIs=#R{-T4GEJztWz@+v) z_+NYvHqRQtoh{j@R2GHTUs063_<-qJ{)^cw<;4t+6|j|_-+_OujAQdX61I>?u#Z~< z+fOvX)!cksTFi%KUYD7kE*V&x`wg>9&47C;l+(Vlk25H?3EU3(K%|`)MAB4rFq(n} zFYQ?VTNj?fvF8x|*$h@pO-0qGV;rf%2w3Elji*)9;8;L7#!3rei0fl;l~&*^lUvTL zlX3?>$Fsmaw;j!1zQpz`{+u~;Mj@^96YSaY3!Sv98QT|p=)CwAkKG)>{tw+iX2oOl z3x+M3upV9QeRz3uSHWmaCFI>OWP@}Ha8K89>}nN;XA53H?EEM2@k9+;&rHU4rk-)V zsRbg!WuUZS1q3nA!N&776JI+8^nYAH-&#|26c_kqoNU9YCI6Tj>uQi3o(LzN24cp6 z`OIs1Eli0VgsWrCV6(K1^?iE~T7@3K75Wd`^GBJc-fpnlM0GeTBzbJg~PJhw!RicxUzs8%|Bdipl+`_G=J+O8!BfT{h_T zCg6ORAL#9}0IWw;A=>f*v#0hwSj_6;g$W(ZI+8w$tJxBh% zOonS{!wGXw#f+w>jIDJ#;`$nx5M7DhUrw@mkA~TsrypV8ye14;a0*=GzOo9(1wB+G z9bE^lITnQ*us!n+^ccHgPg@>1v{x{Cs_oeR*9lTKJ{0(nolH;aOb0e&tPx+MW%ec23wMog|)hC@O;cO zCMKs4dA$pubM3IQyW28kb(JM9%e_*AiiDk9DGVA z(xJk5mZO!1AtRz#VfdW2QVK*7!D|!*wLwsmZz>2f9q+I}T^hcqbg~=FSI~}=HB4Uu z7q3*G2mZA>4lC@3ym3|Nz5kZoymA`GPPYYxGu7xZZ~`}lxf%3Os3J96s?4dzF=Ov^xej2Hb_40*=%C z&6#z2dmGwjJi?hXvzZvrc+R3*pEy1GZtR>8Pd33d1}+CFFe>xo;o-J*7_>eTbaf9f z3%Bj%G?dmtZLl8(4y|Gm&GSG+ZXx{KrC`p_ z^9+C(-v=0^BT6U1}dR+gaAMc=VuTNP`0 
z_b%R%7N+wx#yGjRXR!RVTLrz3OL+2zeegISk~Ka4l@nBT0=u$U_V2vg7@AkfE;+pe zk1dm-ilK9{YPvGa^Lqos^8XnIXDw&jBKm+AwU)DHM>49|Wk5niCicA3WS$vltqgVBf1T`Z-OQxzEajctnU9Y5kAeI@U=^x1u})fB(RITWJiJy8llJPMl$$Bs zt!Rc@En^HZeu@UZiuCgy9*2L*8IOnvrl=Bb9ZY zwV19Nlp*KB?y+9S=RmFqMd$x^;-bO7;CeC)b>hrOys;@`w&4ceJd$lFmwtznTcE^f zbzFuI#-liXasp{vD2(1`!`UtSWI%MLKGaP+1O3NZ(0NFoVW&@}F}`6CHZBYqA~%`d z_@$tdA^~RyE-?#(rwa7P0?>(=#;aMEYV|f!5woqtMN`hgT2JqLZ#@{cz*#pPCP(@u9eTKc^z07(H+`Jow z;>w^;Vk=tOvB0p|So{7oPOdG3z};C$-|j%61{vBY;0t_*Zsv)JHi+Cc!H=aE;LC0S z*V4#CAEVdor*w4)(T~Tp>sg$bKneKs=pgDG@qpZl-!MCT0LKgu!CTIE=6mfE%sVs% z7i_G7v|Vjl4w!}oRTUV1*VxE5emZWh6!fBnot!%_3((TsjXf9R z0Nbqgg87*}xZk)O`mOSztKOA`@?9u*ZW5L(7sryl51`yD8?e+Xi*XCjz*RGXnK~bH-o-T?JpR$2tXP#QHYQ(Xl27EZ zvjYC%)yK7ryz)3>+cFzD5i=+?bw$%fU0}628OmCML2EIxyuumyPT&b9$R%U_?t7f` z)vXY}g$KLsC~ul~Bm5Rlz~fJ>(IQ)%E_n8Sm1uGhb}Tl8kNcW<3d~;6&y0e$mUfKp zw1h8hA)v~HG3&R-bMh?aF)NSlW)pu2!KEXGC{tp_oPN~H6t%v@O^Ke&?7i3F(#@^7 z{No~~@#asQQNw2!*#u&wo-O@u;R_QOWwQRm6nGOF4Q^6?B>c)o=BZXU^Kg)lOJ+<) ztNtj=E$l?AihOWreueK2Izom`ELP;bVq-<`gI48idgWIo)ZG3B!AA}-pWjVpYy`cr zP<%b7(j*AB@U208SEs-)GJr9$L+p-Ck@TmA82pm`h0~5{fqRcA&MEnWpKeWOip?bG zjQ3Wcibt8#PwjArfTfyf@v(8YA$GQX~jo)lZghs)=T)LIu*5Qr8!w|9-{BE3%uY@Ndnz?96Ywn zL}jmOoQEaeOv$1&RBf0H>)t95B>|UDJ-!t@bR9vba3VcqER3RCbJ<9?1AdFS;J^Vh zPzc!v_q%z}Ddk#_w?KtN1-^y_0tWKkAQXd-n$Yl?64VfvpiNdQVag&I=3U-1=J=Na zoS&n@*n~}FNB!O*e|bE-G}Ytf@!fEbNH#O2wGziaO4H`&E5WE(h;jc=1}CSz;9TGY zfsR5Q`}*?&S~$%MwK`TX#%fn_n)P4yZSiFCEBqgu@Hn3_`)PyYo6j@-e?4LSxi#QD z?=eifx{&6+6oc~PQF!T<;CHkhVm2;H<|)0(16sHqWEbh6pQHqPRa}GksLW>8X=H*< zqZuCKen6Rz?oelEK<8TAh1$4FV6XcL6kxAwL=XPymB7T7wQ%nugNHW>a*B>|?8o^$Ugytum~BhIc*AoXSgr#W zPP%BWDU5F&zQM5cM||P(99Q-g73_#V0+VMr!k6V*R6V;A#KRAPU2_K0^CTCVyjD?W z`hTFiu!0vTGMnyll%!eiz3c~h25+7d<*hfEKmwxI;k8wkOqgaT>#BAeBR(<^T_MlDNm04cbwu4)i4=%k;vC8=!>9KbpuwRyd~2QxgIPhGB28^N zORSosbyJ^6%WlSKTUVUX{s7i)SVz2c-Pj5;g~YUNWHKc0(rppDnX-kJplI8Q16NL= z$ccJpTlh5g+CUTfd8yLdY57b+XA9h2w37X+oQ}V?M#E7-Z{aL7#$#I=*!{cTvd>y` 
zusR|KCVb+8`+rWTobG_LA9q8I(pSd3CwGV~668M)DA1|%bl4kX{#J!(R4U#NxQ zvg!HITf7+Trs#n}Rshz9vY6U(mnCB!K>KdOQhi%gtNR1UxZ%B#d60KE6;IhKGd0^X zv44>Xdv$9QXSvii=CgPt?pQYsf=A2*I_wEP`s~AA7l^H{c~M~CsKP`#3iemmCg!;3 z05mxSqVc0I?7DD#=goMl@Ebc4WY7FJLlIQ3#zX6~OORtioK-^alb?(Zd%z34n6T`zNwzA zk(Hz`8U>jjF?TFHt%upqByr8_3Ron$id>!1iHppZ(tjV{K+oq2_U7_-l-AgbqT^B! zCzQqRx}DA3-+TnLq#uLzQwx0FaUXrMDCg2u3k-2ez&jW2@NU=Cv(s|@nZ%j0Orwt% ztT34j*{=7nQ(J~eA5MZ_%JSGs``BB0&y1X&tD>dXZT=6lZeEPZQWDvw3#v=x(BQfa zzJ1XL`hTjK=l|Wq?h6GtaP1ef>ivCeSx^9r2YT3R`6^hs^Agj2T^sp%chT|nVvMS4 z$C!~D%w4WM&e?Dg8nitzS&;o3ew7Ut)p~fUMwi&vZ359d!OW+<+hOQ)5aG<&EIZNzBunL?V=R=*|l&IAtIY zKU-Tt)Qf0#CT~01|8{~k-B)1hx_Iyz4`LbT%O9XhNGODFd;+U=J~HtJ<;>gD{?PKq z1g0H$22PG~9D4GSz)KHc2UZ~a@{t3~+93fe?_0vz@j0x+^r@J@tH-yQ|IwRaAJE`~ z7QCCWoAT`>VQilkqit_UZ%ZU{+VAJHJy(1&JT4hus+SeiW*&St6+hF8oS8hOM1 zJ^YS!%iaJkmx6HNTN7xv*=gkRT?$qk?qjb0cnb23#h|R@fv>HBhoB-{4FCp~nBqFjR1x4)CXsg9k+MWFcjyShqZEPTN z&CZgpid?)nNeaAw9cEu&dWD&7MNr+R4!fn=@xpGwjg7Pij;U&ZbmJj(j;Mn-C-$-> zZvQw*J&UQw%e7##MV?%|6^bPiVr0g)S?DwG8TNG!K&whQZlRgX=GdpWV`C(A+`P|e zyg!9CGV8!8-_5A-=Tm4xmXdid-I?mMlS#(2LtyRM47;B!!R~u=iCgd$cr0M3HLuml z(3Qz}=)({kTYU;#gcZYO3*&I5u`Rps-cz`CSQsW)|HbF$ zqM%QFGboq7K`paHrf#b_#%r$!E!SfBp*)iq&6a^ewZA-r(OW3~vxrID^#q)@M&Pd} zijbjx9vmMC{JTq{Y;t4>SQHr|Irzb+tbutD zx)77Po9~(-kLr^s2uX_Y~EfBAV zijp3!Rpbg$puUq=(_n=GkUBPj3@*t+!=X^-oTM|+uPbBqixXMT*HbxRr+?tlGdf^( zs~8@nxe^6Fg@&OJ98t_;To#qWpIN17yL%YRyv5+7Vk}-T?Pe7#x3HHI{qT=&8F5=b z5zlAsrfsA3sDD79gKQ4aK5;knoaD~@zUhJIM4p098H?S~GIXiQ9FTXoLN2Vo0*3E} zNlmDr*X>cFZEb5w%WPE=E@TV^MIZ4$>0ZqKkj{pLRN+?H^>pb`UAimc1aGwC63|_C znA!aV=Z*T{!>NY!L)IjY-GM`l$cGj@C-H!uI+cKbCX~SVOdGl=`X$Hf^A1w|w-N5N z*YkY&v)SbjBv5i}BK>l393K1~fv+yTcq?}wToz>IU&<-75(hrA{@u}dV#*UJ@tw(v zGj{kXZ#wa|=r8zwUIXYvQc4r;#ZVF?!MOc78>jqzxD{w+X1xn{G z!0^yItlt-hihquBaQjYpb@mk_ta6_5unB_v6PUXAUA*SdJgoe@ zm^A%#!(SayD5?JxM=W)5uDcJ&ERY}}LB9BX?<4%(R)t~8r|``8nRNO;5$e4|nk0J6 zLY{mcGzF!TN9i;96$8zns{9u?mukrRT1$TZ0UPen5*{$k*@F8k7upq}18cU45r>>k 
z!_CEZq*KV5s3_~vWwE04(}OZ*W2_rj`}HYG#LttG6_SJ*eM2lg>Y0|GqPYBCDmyNB zk#$@Y!|433h8jVv<%Q)ITs_|%d^C+x_D<0{H{OBp0~e; z-^TC3(HUNhoN^j`e4q^DL7MoV1&24_w2Db0o3OF{5I&nMPKK(bVU6f|w(Fe+xh&8R z;_I)_?(S`PYJ4iGR8)stsn2*IZ!0tl)wAXA)6nIW8j6Rlf{jYa_*XUw?2 zn3Dy67k0tC)DblC9)O`aVQj#60b}HhaK0Laf}Q3;#vPx*e9=MF8UMl=6tEeyr8{Z= zi3E7_?iSgneUq*l>cz!V#ORO06`%}Hg-X|lrOH%Ju!BRYCt zN&MGKbf$GVglrhb`hmxgchC%)I)Bm6n`DueEunI%L-=fe0=4rK<905np=}m#h-&Lh z)Q@B7-24n;DS*;aGg?`_wNX^#Lm)CtJXO1Nox!=!>CLnU7;++xIdR(^YV84+%=!f` zTUS9?#t-K6>Ro8kZb-P9d9Z(?4H=Yl#f;=sIIA6nH|mV&{wdFKyRjrqdbXJM@)FVY z#&0%Sz>tenV$r2|BFPcmMjtNBqekcRnbZ^moN-%-EHLS&Ka*|fja|jWX5uC|yo05x zG9^sX*>)&3sw3tDi5T`_A|4ZDVk@eUUa@s2CxgUMc4a2jzIhMg>US{TzbVoEmO;QY z&4-pB&Scu-Q}C=ojJif|gB2gFL1OYPw#Fq3T_h^-?~GVMR;U7u4^@HAmQYq?bRI8O z`vcNNL+t)(TBKif4OQ=U!(ihY7|#?U5w-P3RrTSH zysxmh&XOd|m`K-+|7B10{4i2maFdN4jsz;z$g0-dg1bkond5ydyRrBO^I@t2S+ijV z9bGTD*V|GHBU6X*!1@VPw3Q(#6K=qA*K7DcCig$qmQ?NoQ!0h&{!e0*-dBW?{R51v zkroZw@51zLn@MY^FIH~4&Ai&&iH5Byf*i<4c$1R}<3|fwujsWfcjZc){K*eDq)_lp z&JgJ3YK$`b2TuffM~E~b|6_Wp|6if2MxotkSz{U}TyRwAL1Yv9b0X!!5kC_XaxX1hB?$V)jtye)qO6IMOM+%G)r z^QaLYBebV>fvZ5<2 zJFCY`+BOQ)YP8sAV}krep+D*Qv4gkl%xpaUw3FmtksvR2E@poo6@_=x=Wq^xJcT6A zU!XCvctTMv@aUf!TXE_EIJzj~@G5!E7TO5qoV84x?mD=qa}&~!v@tEhmyFV9{$!8p zXET;Fys=VTk~qyt$2UTa0^X;EkNryUZT5FIyu6cF=2J{M3UUG39>78M7P4&r7kqF# z2s5%Tu-0>?6Sdw7oW1WVYNh;Q9b{e6BwZ6lOX^{-gCcpZU&jl&y&6~8K8MPCuJGuH z2M)Pu;FR;PF#6qWv}84KSbhn(W{3V?1!w+_<@SYfk|ZTf5~7F(b0y*3Yd;#KRFsNJ z=csg4hp0}%QOT4_Xpjt<7}b(q3HD~*!=a5kizSa$3xz@7gxUTfxI$c(9GxCVy8d`>YP~QU zHGi1EV&^r`yX*xCHCzS%(O2Y_hbQiyW(ze&-^l1^v-?)LY7o36lQy0F07WB@VVm(F z-DW?LSDeufyKN((;Qeg2rLzpglq(>oD+EuyU&R(2s%Nh!SPK*Xn+BC{W~0S}28=RN zM@`E|f{b+`m}hhj-t{#?vSAq3w4Ns&i8<`Z{V>6pQ7(E|Poc62QNZF&F;cUV_(esc z`T<8O<1S4VTQ*|I-eQ{hu!<#Wj-x%DyV+L9kjWayd&Yfa^Vk{ut>HR*IwPHoHCdzbKYe81XE#`r z`WMOjBM_M;SL}AYOL+BQIlY`wC3GS)aOwU!s&99fj41|_+T_bHDAR|%@`>nuF$c#Q ztsu=N0&4dNfCEr2wwWn?vxSZ)g@U8O1t9OZ0 zf?Lb=VN00=3;8_|f1G}cliQZi!?v5)tlXVguj)Z34ZI}lH6<`sT#eIwGl4c&^pkb# 
z($Vi{4TjYJV&a$M(fj@$W|WnKAMY;%pO_h>GQNkt74cMk$9!3*)f1sc#2$EWZOT0{ z`xiV;dXk(>R~A?!4s*PnprLE2VCqCQ2s)%rgXbCG=9XOCHS;&D^S=o3-&3fHjp(K> zZxJMiui^%6>baTzMl>mXJ(F9QL@N@U(Nn=3Y~#YB!zj&G*1#F)1z zy@96HF_eV5a4BVl^yG`Tz(k((;%-0o?$B?L7;B8nEG7A6x8#H+o9tkAbtIFnC=vt} zD)66nO5oHCQ!HC9iAk4J1un%`Xz8bVZqLg(bm&exSt=922Yc3_g<>-~D!URR=jw46 zS7Zt=E(}Ne`9S6$QszgUWceLwqWkaH6Hv>Z#HF5XBepx^(Jx%Y*KhfU_+7W6VN&h* zQp6x7I!f^Nl?qH$2_mdhdVt$iEvaS3S!Pf;MPaovy zxh43offV&>t7coaN^tjyGM%N{k1^7sH}t64ri5@HMJbrQRhiee5259HU#Qcfo8*Vv z6>6rcNGnA?K;ycrtm6+QY~K+|;(eouZRcW0Npfa3e^}Gx1H0&Zc_HaM7y?G$Gw7WZ z6V5ovj_P#20U8s}X$=)%Yu;IQ;;;?)O4s7k(%0-{=_z)0s0bY=`UpeR>>#?MQiOf` zz>n4t8a}8C%`Gb-qVk;ZM6D)i5E{bk>UNNRBSuZOID=4cmcVw`3TQn&n|As4l5GQ} z)bh(k5I+;hj+s89(wm#Og31$UQL2o!)dRFTx&vMwz6U+-GXW>A#xq;BiG1)I(F`W@ zw=Q+VP)|EdZnOjqv)y#bmtZa;N0Dql{Wos==8hKYcfzn!3@cyWj^968vS7>2%y{gCbA05gn5 zw?g?gx>)xWeAKGKIpHfPuQx{hp1q?t54Ta(q?5Erbrp6CkAT7_S>E`U2^q^>PCZB7 zq5Z?{Z)^-r|5i)T2JZ>qWOBXslA%vPU_ zgt7E%U@&V0GFrC5Ktli*=HScSoz%*-?eBt4YAkzQ5yKLC&Z4W-AQ$A>NZ#b@Fhz%{ zXgW5IuKu}#?#xbrZ@V?YCgcJgt$ru0pDv4q`%B2`@&2e-@sw#rM1h`qoScGY2Wf-^APJ`g@~8Q_$- zfW}5%Lc@l;bdt6^E_!bwH*kaR6n^E|K{C7M zGxpAnhtfS4>4~}zFfI5dMock<@26GxU2Oxr!RrSYCFa5S{QG1~V=`OdWk|9rd~u)o z0nn&S#7VkNxG4J$MrZAT?#-WZo?a=HixkIoxnlf`;9&ah%m~-}d>Y)IXN^Bzm60 zl~Iwf-!vU&We=m5TQrCT4uQA(1Q^NgLcO0xa7j>#79YCc_Gk{MpU6VpqV0I*pH}>O zT1BX@)Bt}-1&F+w4S;=RFn^dQ74`Mth1Xa>qZxh`{qYLLDqyTw4`hyX!Md6dShP{8 zRN|66YE3u;C52YJ6d#1*(d{IS>w&hA7NJk-QK~UJ0E7nW{MBY3*qfU}ZNCn)sX4MZ zdEQo(C^f?RjQ{wUnWy1doFq2X6!51fTCncwGT{RaFRa@>0ZVjCz|qSBUjCg1>GM>; zOJxSyXce&0_aET)({`d5761uqaX5b3XXfgZj)G$^(C1D+*u;gQ@!%9Z<jb8hQ9>?R1gna*k^sRN#cm z$LNOc4p^w4M0YqJAtR#R4#hv0Vq9A_SsI`~OX5$FTGyALee)>V98V&Z`wZaxla;i! 
zaxE7518`J~dJ33Ja2u>YZu9?+75bXX20Uf;;__U|zEm@B&RuAUCP31vPSlgvK7 zG{xb?Vp!^wSWGtd2)4wzz*3Z8F<%35(FhOTq85U@C>vN4*$+X@Kk4JQ)mZrOD=Qkm z1g@lgX4mRR*}}wh(px!$erb}#cy6QMM7;sDjb4Ca$15h*u4B|jluhYqOs7Ui-8iD~ zgd4ORg1MUnP<6n7)1{pnOFFwnbTk z>SwjF%9(+hi|3`cMS3OSI_i z#Obuprkzz@4xo3$v$zliWuo#b1n*3aKzl%r=C!|}N@u50pHFWgBJ+c27BYkj=4WF5#(jAHaS83MilVlXUEB+o8x-AR*+_#S z{r%H%|$A86mrrTw=0B=v?NYcIP4n#pN6KTv|y z$4m0OY5`c4n_;1+m#KZvXmMSF3x4-7CoAu1!e5p9S%k|;v~a#dM=b5ZKthcCc)x@8 zcmZb^YD)z#uCU*CJ%T_RSGauR5=MmV0JD$#;O_(dqE40;z*=Vr)Sm}8ZPHAmHGFWu z{5D&b8%_O7J!$xfVAO8C2`}o-!Q2z-bkT-VokaU*@#%Z8(f=rfwlu;) zi$JR5pGPJRrLl!0sc7sc!Pj#C(KoWAEWT_T=4tJKBc`%MZ~Z*bvUmxHg5F_fP#8&c zdjuu22k~Zo9qx9EC937)x!DV*(YB0JjN7(}v$lFC>c@}}uGo;n4%*#-AxSal)Sbhg z6lp+(TrVwengaV{eFe@U?pL;d0^Lol;Q28_I%kpsy|=7{PgwOE-P;&|L%M48ih3kT zPTd7fI3`qDpaKr#hS^^mdf4Io@xs_93SCc=(R?5e+#kLYK6X%}Ykx&UOOZ4;XRbJF z&X&Uuy(*YpKap-&mmyf|;s<%@W}IEXR}!ePnmiup0PDj#_@Y0R^@@6KOk{;vA(ljM zbt=$P1&isisRBA+uo9g#BbnipQ|w562Go6(r!$kP$b9u?2%U_iI>DEXdUVhae+J`Q zj~A>@atf#ZQp!v|a035T;Xd>0t6}RCzQgo=Hjt&?z#1a|q+bIYp-;r)G|$;WY=5?J zPsa{Y{pP>Hq09qQ`&_uitMUaAj&-oxUY!Q6nMJq$eif$eoljTZ*~2wy1XGV(D_ZyD z2X^Qnz0|?+%2`i{U3L@f8z0E3UR7bngPq*X5jU=E=o6AP0bIKARaDrQApC#k@c%4e zPS}0nYEeIpd_Vw5tr!DuA0xh0#t{daN?@CD)B*nkwRO(3 literal 0 HcmV?d00001 diff --git a/src/autoschedulers/anderson2021/weights/interpolate.weights b/src/autoschedulers/anderson2021/weights/interpolate.weights new file mode 100644 index 0000000000000000000000000000000000000000..41987c11a34223a8e6619e18fec49b37fad17254 GIT binary patch literal 20400 zcmXV1c~p(x*Kg2(N-7CS5<-z=sP41RrBF#ih{_l;N11&|g+%k5Qc_8?(xC3y=QdCh zijXD|Ns}Zg!h3&fy?^ZWtaaDDcb#XgbI$%ud;Xr2&XQtcV*e}Y|9{Ggo;sqE5!II5 zRE}R{OK(be!*AymY{*?!`0{WqsT@C#-!$nHlwEU&zAHH(w!fF#u29aiIwA(`jb4xm zFR@*-h%218f*okff-v1Ye0DT|gSef!8vPwN34<8}8$l^KKdCeNZ{Vwt)d>vs`MalAP+av63~dNfn|N^5U**1YFXOQ8<)dL ztjdBoQ!_g8wFj6Vy~Rn(<%2iYapqIHxWB{&Rjf;?n$&zqsC^7})y=fSg#zC-lG`r( zj1KI1h8X__&ulcs?=l&f(ESjN`FAkOAQcqsv!M1|2ufdXM41U6@%qOZU?bQ8!VmSl 
zQJ+1j+*`04?q{>T_mj9|jbyFABTISpgf+c&VX5Ck*q>h$g|~_xL-FyusC;L< zP#I!K`3Y5io82Y;g0nNZ4Xz9Et<76#D+|HQeY@D1DV1zN%3<<+R|k#VHJ;e5tU$#T zk6^@|40iPJX0}ttf-N+CO%$u%adL}4aIdBmlXD+V!nn=?A)8;oZ+mCMUzZ)omh8$Q z+p$>aE@*+vQ>@@ga5jsN4rPM)SuFPSB~tl15{j2y=Uk-@fwe^jjv2R&JCL1@%b70s z!DTK{7!cx*`;9PJ!-zO#N3(>AE3C0Tk`!7zXOjMIWT9du%Sb_%{pmOs8gIk3I}rKb z+jwPn%VFfNL9Eg|3^tK*^lr5UxtFRSd>pfcALppZ7tCBo63xCsyTT{5{#C;CZk-}G z$CeXb*(APur9J8$ZKNFy3xo!L4v+s49Xy-V@Huve1>U1iZSEUARg5H zNX?#m;}V@=;QxEW%Qm}!5lyL}diy=od=9}$-LuMgMleZyvNGyi+b8NB{fyuQo z>{R4*qMNl96a%a{zA%uEvYU_bhQ-+Uub--3-GnZO1G(iT3PST>2i86JJ5jRDXU4Ak zacM~#j7yT^$IVsaH=TOOj^?|uj8_5tbx|!S-#lIT=zy6}<-}QXtH^fA6)@rZA8z8xGNR-y!EXt^M%Z0B z;XV%?e%_l9@=3Lf9BZE@d~cS;)Rvya_U~R)UzNaY-!kxc-2*;fT7eg~3>=gn((30n zAmO;51wWref?Cr^)anU%=dB9ag*I&T_V+;c$`FIb^_XCO31#>E#t!K%*q53If1AUx zou)u&^KXo)?;@)gt1>dUnY|X5=0D`Of&WGsl5eq(wS3&mX%@uNyC(0czq}KtA5n;X z@5|`VW(JQ-n&{^E)eum=37kvU!N*m3xV<@(Q&T&N(+qxLUGq99xaSRr~38(F$bOsKCb&5x!j ztfI7pWEj3DXQtkT{M8Rpto$-3JG~CYwqG?~0*>^iZ6H|xv`4$o-O#am1J-?fPU~hL zg4*Wym=vE5^$H7ce_tkEHoF6^nolr#sTb;Oddis(C&66p5@>ltc^@o`&|K*xCKG)K z`x*uTse_>Z+5_dH?o+e0Yj}UK4m1zea%yY;!!+Y?sFQAmteL~uW%Y#y+^U9%3)5iY zrk_xvnuC5P+aTh*KUX!6LwII{4pXk9@w$8*k@=4HI}U?Y)FrUhRDffi7eOH<3?0V3 z2E`wjs7-<~Kdm7|=whnJbZ)-lHwvquQ%*p>xTF%#&_rQfrjRqdQG%mB@=#@*6bMT% z^QsMg34Br-(Y7Or@=9iqQF|}3vUlPnVqqY$o-~2zb*OS*UM_{9D*}8G)k$2O7qMEk zOwhCu;}5)=MP@uu;p^{FW4T3#*w>Y+B)Y$k`h}Bo!o>)JGmU$Z0WGrP-jBz8in+ zJji{1^_At!1QJwTM_34Dk00G*w+}2qEu%E%u!hI(yBrq|#Y*y*)SP7+7QSRrkRBOQ zvf#USMUaN?n#9BAA7))_;2nwl!Z=+ocGxQv?hYDr8EX#{rJnELv^)yTjF8$`$KW1^ zaM-#q3RGAch|SM4F)cS>rmm(!>4L5NE8l!UCfS<}@4d?ApF2*H?l_RqhGX!-f1bh^ z!FYbbt)Zkw{NcR+ym5&|~V@`vK zO*?2ZgMCDw>S_@CdV<$D+8QFpZJ^1kO1b2kpYYGagvqz=7dBL_;g9l`B(aAn!vzMc z&dG@wd7L1JyoTuL&iz8em?`}7v16H~k_1_NHk?y1+0GwRQb5j6n*cLc#4`W!3)uN{ zUeF-6nRj9DXCiA-ObYD0x&7AnV3Bzo92)%{`(G!3T6{2QYK(@vLqYH2Oh>45tqvhVOaLPzX-P5wk z^l4k*bX5Ucvo43^zE9=)vspTgPux|7tdaEtKC@>0M6~7i7ZViXMryXHa=5D6v=gA(Q zp9C2ts{~%|6X8~oA*#r0l7BA_gT}WBtU+r(#_3%VYVDg0`yS}>B_-~&*0D4Amz*s~ 
znqNE@t$R$krtK*)66&)#_uYkKf^CVGm;?Vy$V-xXezs8i7YO~&Hj{15(QMSq8m2US zh|FH<$9FLBBf-=Caqh0$pw(FoXHrt&jZ!x{>=eR@MQNBYMi)-0hC#nW8P%UT2CC<| z(bu+@(AmZwyr(`!%@Qg2IM9p^llGwA%=;+Tkytn|>Lpm^UWNpPKTu#b9Z!wB1>V&) z+~E-jvWM!x)_WG7)7}o%Bh8>EK6?iepN9Q-x=fI)qh{Cj2D}I!T|tAMlT^;%ZN5qnY@2 z$g2~8wpkjAwM?dES{t#HzkVvxcf^{uS_V#Mj>27CDi))G4TG0RW z9#Wjf3a5P<}JhyuSQ#$uQVbZ!l$6<9nJ=mmJ3%^%k$5Oji)m5Ga3Ky zKBU5H==}30is|R0O^70^`k4;oLo>JOY#^w2?`JLQd)cC@iR4x2QS!vS5G>NvNt4Sp z)?#}FdRF;D(yL%d9vBZ2=|W7djiv$J$7oBgCys5hzy`NAc*SKF1-_g~Nbp9sUD8?@ z_MwyPcs+*QP#MA2YFe-_8}?Ckr7^_2U>y1KY7UJuTtYnN#?h%g*5twkRkk$AgY~Hu zLD2q6xL7SkX78OzGMDS1WNRv39}rT_lU8&^>U1>za36f4qrqXrambQRoH1E$Fb^F(|4_fw8XWYV37VHO1dU0E>Nnou zp9hywPA8o%HZjEPf>~$~;tW;Yro5my#{NuPCM=1mC#n81s4qU6sXOdv%Zq=*Srp^K zBe$d81A0|aMeF>*)xTux;f&} zjh$FOu!}gqE5Qlnt5CPWgH8Ez3*L?W%Zi`Mv(gvS$epe;*x@o0iq8x1=7&q%<%k{- zUmD6fQ^nBPhY6&kFHon(OStlPGERt8BICEj)8olG!m9V0Y+G13dwz1WuzAY|5>5WV ztL@kM)1=n$r#tNw)~f^%>*W&sSzorY(Z4##WTznEJ56o=jgd|yLPwREZCX#fA{e{e zagW6peB|zpp2ukEIvj2_CJ-bhtUc$*($9!EjoCWkyFLkixAz`mpm>S=-T8soPo?z! zfII7dB`!Sc_=xqK%w+s+AtWzom>fLxo?Iw1Cw;SSvr%P#n7pwo8`FD}otolGVyQU) zZL=Ry`uPsJ@-NccyQ{HJ`#q*@Jr3suy`ZX+fSdZeu;R{Px=7m)uj#qtX~7VBJbwf_ zbt@q)?FS^bH{eFKSak2?f!ZzsxBALwG!f=u*&ThT+ggVs!V*EZ;U#vhvPW~LJ5VeA z1^QO}fyz-qP}$so5nKIfyKxj)oe@H#x09zbwE^nhedLUl{@}+Wk0HCJ4k}KBL&Z5OShspPZddJus__%RVf82I z%+p~qGa`X(vfw|SEiZKMbr9x;*RgdwmI#e<89!NH5trOS5bOxVXQS_-rPu%*Ja)vhc?? 
zME++iS(`n9WRy=K6}$HeH`G3bQF@ozn=TPCUvk5fp+(y{NX_3T>Gx;d2;WOU<=;OC@_H$|vpwhW(M z^uoK7%GmVMk%G5x9N67iW67Rp8hloM9&@H%gD_=3pfCKX*SQ|n_n)P3rrIG$n~3n1 zN%OD$FynhZkrytr6@ucPOx!sb4hCUYQ2*2}t~zT0yxm=hNh$TP-9CpYoxeoZC4M5Y z{>iLpPZO#45*Mca&4I%<7xC#+B_@4pKhzb4u-|`jVP0f6d8zjp_Pm*fH=IR#pKAs0 zhaF5x(MC8l?kqe~R3;jlxuol^3P1Lxw(u$&LiaWD0{)#TJ#6HQE-uZ{o6X>UW5NP?=sLU(_|L}+x+Et< zMUEeJx$6KocOn=^WZ<)CHH>%{k0C)vK=bT9?)AIbDEqh&_Vo7S#;M&9wI~(NgdWBe zECQ7#UwVJh1mJz!1{c!?P%xnm_ZaTOO&PIR;W!a39(~4h%Vgo$$|?AGxC=GkYjIm_ zvfF^d5W5WZ5XmCm|lOC49WS8obY5c zSFzXt<1(Lt+}L>N7?_XR6LcZE^c(6w%mb%pZ_s@3gV$DGin1^Ma5qOj2eEaHh4*Jo zLbt+1T-4Hzvc8X?CG|9P?N|=yLKW~#L=u?IjfOf=-%z|;&3%hc!Tqle;|LQ9OKurZ zG0TT0*(ljn1P`f%=sZ)mslBpA$zhJy=AX~^tlP`~ak_ILhccPUk_s&F&*xaI&j_QJ>6NgyU0DA;cw4_1AFaA)c> z?3H-}y(eDa06!meSB?hZR2i!Bn*&8L;I<^TVb=K$F6&(iCsr5C^?#4W?T2LqP6nY6 zuzV>>419(Kb9>RiZWzs`_v59zE!Zos1+p*!+Iy!$u3jk0EssKn{bQgf$QKlBGePLC zMa^Xfp?+!{Rv(BJ#K|uN!|BcR@5EKuvaOeMcAJGx61jAR(LUV%dMDPcP2(C3C@98g z0Pl7-KGJByb!C;{K6xokyC8a}g4vKpvv7%ZGiatA$AJ7(sHEeAV*TMJAAQSFY^oCv z=JrGK!wSxBy##0q-_n5SyU_p05cww!c_EWV;IF_3B0kd%?$x)7K9tvhZP*=bY4!!f z>o>9QU>JB=_Cs-LFQ)jO#l&sFDEn^$Hah>N%WPl6-@XWNN|qJ(73<_p!@2)D^o4DkBPRhD_- z5|Q?*HuDcx{b(HQ=~PAiuaN?;m&J%TC?=;zfQ)eohTHwXW&1p!che~9V%m-oD>^x6 zt817rerFe2625mZ~aPFa_z@7aq3Lg6p2bCUjFs>Z=@dJ>yBNep&t3Y>8 z2j0WZS14v`3imyhWBR+tct%+P-}l@`@0=zs=&>7qaJmm>oqDL=rVde4HsP6vVdxT( z1KR&yqIszW)oIG7r@o#C!xxBVR({m``bn5*d>6LZCqs7H1=PH@1GXM?MSP{Aa1N z(ZIGq_$BQWzjxm}p^>x!(HSKv)PP0od4d(2_TMSy2bNsyidz`nrNFem-^JNRzNDVi z(rMqvG08KVgnAJNg<@GsWby?en=rG0A9Z>hv1rkSxns_eRT)2-Lhc#9)vQaTJ#v;X zZPh+p|FfwV-_pZSVXSAe8_uj8pMS5)6nZCg@Sr(rl8YHe6BKK z%TR`Jo@O=4kfi);uf6!K&S&@+PPjw;IAh`c9ldOp z?g^nqji*eYsw^KLX{MrJz7!OE*Mpu0KZxo)jzaz+Ue8(+Tx6R9(*8|g7a@d7izS%S z_!1+s7J@+hKjBh#NpN-qSoti4N00nq*;EG2CC*q~8cC}UPN5rT2tj@GUwVGR zef;Ir2pVhpp-mzYddq{TW>tY;;*t)K+c*KLv_7EU;W*@X-{H=T?tnU5f*%%shf#-9 zu(4MiGRU(wv*T7e|THL%|Qg}9OIdK@dkj?#elT;VV2`zRH z;)uJRID170_K18d<#_^p6zmP|HaAVSH%IVzUxP8*oX>8A{J^i<3t?fiGKn{m#vXrv 
z)L%D=xuz+w6G^6Yi%JMA?Tlj5V~?=35IZ(tb99$y#0t7J1ayeONwC*CHn&yOTa8g_%Z*$m-E89woJo)7N-3fZ`! zgE(S!0^tX1u$tNshNIRp#efjPZ6Ai~TeW~^^%ci%Hf0B_=X3A-zjDEK3yEE52PnJE z6^dQ$;~k881vYOzXn(aS7KnP5x28MFvSlbdS51H0_u~QgGN_sE$c@~rM-oU3Oiv_C z%}a(;xaCjNmRGXq`Zg@l9OTwK{Z`ojLYFw- z;QJ+$4P76}lJ;!kCV7*Eb^G9`&3&M2dKWaerSR^wm0=O9#X1!O@Dt&V`Z;p!!}!0n zwxfyL`AL)9*w6;}QJNXwTg`O(KEMpo`#-cT!Vv4Fr0mWN*wQeMZERZ%VUaV)YP){? z;I{_KPF!V|@2(aOKNtoz*+6n~-Af`px|~d@$Yx@$`qV}>4jpdyQ}Ye~xWToI6KnZm zlw?*99*$XfsHzX|kJe=8>f$-++6Xu?aw9C9x03nX`iFA*cQB;Dij{s=M(?jzpxsUz zy_7F7Q?b>}LXzD)pU-Vw#l!GDsT?Hrk_KMMXZ}BAl<-b#e-FtL}7E>pXXdf%+ zdLYuV6-QyEvJ*Z_u0qYOM(8>v(&mC7f5Otc90C~qtAj@nRc=i!gHFf|VjU9)&fIG_Ij z=avdV|MEGmt6c$>=Xc_L^VPsV8_H{0+{5c1u?jr)_(0UQ%Mis<(b;4SE_t$z3SXWO zs2z9+ai3Ph!m+8SnLW&tD~^QOF1dJ2GYt*|L}0wU7>2lKfV;dJPhVvLw?fVnguF{d zp7|}f`pFY)`OlB1q45jS+FD_&^$&EJSHn3x7DC&Zmw4dnAojlR05Tx~;~sOgOlBpz zJNXC-rW(Po7gbPj#h3=oFT$^oc|AH{o+As%wkEJ{g%6hPNkiG! zzc_^lJCJLCiYKbSK!%hkqaVJ@{zk#JTrliT#OWu# zq4$XyU^}P@acg6_t+fqcGof8jV6hPE%${-s4)d_%auHTUr9#FrPv|)N5QQ;09N*ZE z7w(ygS@qeRgKZ|_$`>FLSB2i6k5I$(pY-anbnKZ{k0CRTg8S_+RPB)H4wcA6_kLTR zO|b>Gq@Snj7n?*p4eNOeLe&-?9ZXBN+$Q~tO&o|>w;R<@nGn1 z03+6Mallb|7>6967S;L;RK<4Pl8glhn3`fG8Pi7IH5 z7YBvnv8ew}7S>9%qFq!P$PMX%PVG~0Ido6prZpY;nKC@LbCs0$C=<2|`{;{bqtR)1 z4JZzGqC%oCT(_?SL*-@AJXj9AkU!Xx#>b?im%%MNi%K^<2C=w_0`&n)XxAQ~&cpU_ z>`)GhT^`S~tGUBfwEse}B|i%XCdQ$@?N1JQmrVKijEZNm5sEs%Y1878H^1#wXZ=6`vZQ;VvRJDd%Yar(?X?*N2` z`(Tg5J$m}Q86LWR7bQgBQ4^Jgs9bX`Fh}D(%6pxK&A~?8I1g>MvS|*=>uBS6rF92~<(Jd2Ty-OEl0O9AK94a$C5#SutcF?bSHSf1NaCuV zL5njGCJ!i*0be&bc1Mv+%khU8hxKqR^C|7CD8TXWC*s%n;uwGF35>t#hYQ!gfY@PK z-1jFN_Ioenj(#XzGZ4&R?&fvqQ6ingz~1%Cfx@%!$X zoXUFFLc_;iFk-STX0!=l+o2q~=65hk%{D=A$4>l|AcjU#@fhlo0#+lep*_2R)=!Us z(NU8zGQNy^;l*+5o2@yUS^EVW=DeZ9ZTG-uMh(|H*#pW}6w{IYjgVJ=#Y9dropQ?_ z!4D&MJaynGZ?uLGG*`|7JXM_N~lJkt_+RLCti{&~TPu3S@Ya&}TI_v>mb z%G8Q*6-mNX(sPKL_6fQ_R*JRl3xaE#DB2C^kU3-JLFoIEo^ij8N4OANVN{AU@eN8> zui!prCG*0z_Hucj;soshd2ma;mbO!WPQ^DG6MqF!---S>)GDNQmo}k%!w{-Q>fw*O 
z((vj*HJmlt2`hs&d7rF%V4U*`&JJ#)L`)D!jf=%)zu$8@@`V_*dmZ;_Tq`XfaEFbf zCGpc@k#E4{0rp8U?M*IVAX>*U>v32t^A1l-r-E4edl+qDiMu-Y!g7H>Ei=_2O>W{u z+WQU)TIXZ=#(&(>s;Ah!>N}j-A%?5XmcyU?La?ulgpaQ$q2{BbsACuphMgmr=Eoa2 z>eB)=9&CoA!QaqP@&R`2dCyC9ljfAF4XC2XJJ0nPLqdKJ(uFgk>8Zw;qUcO1_I!&Z zHtp}`BISgzU}uW4PjNo<=`4rvMXzAMPMx{0^5P6{KStX94No8T;&soIhP~?~QK~;4 zE86?p=m#-t&*h z|8^C%4?RL9y-(;r6ag83XF#08TGV+JgB9mWaLS?4uqrMAPgp6#9G5qs9w!cu*Jfbl zbTxKyoeg*#bVKRF#iTkR3AS7R<{ti5WnUybKp1r#)1&Lbe?uak7ffY^Z72?*yUTk{)3s&&rP@-))(RnqCYuCu)nH*5!_B6fW#tIa;GtMIJw)_`1Jsd^+ zr-|bq{!3oK`%bRcNc8{Y0*tWxhMKBkcy*@=yEx2;drO13wo)e!3@^fz2O}X!O@;ef zx*f)Heq4U=5}N%rg_<1tTy!)!pGzo`66CBe;7UsTa85-8+_VtFKOapdH(?wsc9TNt zU5GWkd%$sD6!iS7hf+wvA*UkR>316cHYj2Dc^~*7xdT_JE8xuTaCj?DJShKr?z7IYI zN1C(n=E0eqbDbna9{NC|?gfM2s<*Ins|>q490d{u(ZC-iz$a1D@Jhfp`2I!6Wl!*? z28UXB?0h9RtiK(DJ_s>iGO8%*q#;U7d(RzH6Ni5t?f7Oz5siGf5%M$qV8Iqgl(4=7 zA1lYg+4=UEY8r(Xp3Fmwp}n|6FB|@C>Exbmw#LUrV$7=|1^?S_%uY22i_QrF`Py!{ z_B?}5sjb9;xh9zS_b*L3Dh?URW2ntiKlEH3NsHb|;h@UcBBx6?an(%y1;nfBW`c0 z1$3PI0pxTRp5N4gAKv$4yHo=&Mqv^D+5MGg{w$xne++q_{4cLYA(LJ+s?Vg~0?eHh6|3r;iF0qW;g zaO$!@XvG219Xm6GI*r@L+&Bd|IWZBb8D+i=E8#7A?U;Q9WWa6Z)#erLtv$_-<1y~O%*d^j#c zD+zh0^1yej7#C_-gtou)xo7KSn3jq*D~UP=o2*{b9~Tse%Yq|x*ZO0ioKuCl+9TKw zH<1^x^$0pCMWNBsZrXP~3pNM+qnV}#VDa4!k1h2P_<6@dlBj>3=ok&x#jUV;^C@VO ziovfoa%|U?%RE2rw_KTRo#5B9|9GcK1FEae!Zo2`VDTfFhKXCC(NQ&$w_+o!eJcl5 zokmR4<_4Ir3#WSwS}{6xF>4Bzhm9r;kP`HU*1g>&Xtw0D_)`UVd~7Z6S`T5N`;+N@ zyWdnoVkAulK(%r|C=4v8UN&vq=&xt6ancgue+$G(TCfWD&%Ga)#0k0P0TmLIKEUli zvP^`TbK%oHWzgE@h92s3@JrW74Br+`aaJMb985OpY8T+gefN0ER^sf!#_3=`Q<1pK zb#q1QGqG^Qa?V6boUGh6$f@2N&%DCQpg3|5#MIoTMfdXI{l!OI=3YOr?s&reRFq+2 zAFMF-uoo0b%Hu7GA?~@663Gfn6b%0uL*y=hrsY3>U`t6U44-+0TYb&hsmXeXZ{ARk zP=t9?{y>R^DRBvNMVnhf7Tr%lLVCP#hwEu2N>VO`N zDeOU}J<7@VLjjH?mSM>_cc&UNl--G3coa^35CiXHW(s#$j%NEyTR^woh{VWP;hy9N zDBlF%Q3OG2x2;lOW%_k7MY9 z0ebR|F_dIXg~%B*F=u6pARsXuw#|HkVO#fd;&=V=u>DKA+}aG^M&`lJL{l784B{EY z9LM$Yg>Wb^l1|s1hUZd+`1x!uXZPqcEG;;RJ`LG;YDpv(SNl`x8A@1x^)QP%JDQzv 
zZ-ZcsbO;gcLCeLycyZ)4a?GF_m-K97^`|r8%q=1e^;t<*Uk{=u$MV25qL^wP?}51& z<=MYae6mbDiuozJqObZkT%4Yct`9qTZzU(w#v*+iRR<^scz*Xg-1%CKumnd5%lZO*Dl%e#h6nt?Spgn`H z;jDcxZ2x-!`g#L!*~+P~$7T~=l@VvP0Y23BPyjqcKQ6O=JWu`W9{4>a21?yyaOs`v zIJhSn;`HAN-fFF5vc4JI;JFcaSN$)lRoKJwPaN4v-MPMhNF^42rBN+CaIbNKpeT3) zK6!Q#W*-CCpDgkg_wD0?wjH1iqi^B_*LzSAF$Kf&Zs4i2xv2WG1?$gDhP>)CxKD(6 zQUU}xwp{~Kw_Sl13Tx5DeL1oGs74i3IYyKLND^+tTgk6O@dWVV@9`b|sqvhP&0a^BI zO9ak#vco}-7Mhl)jN?DAN4@JSAj9$=tUmgPb6ixxNlC1QtJ-{6aQ-vQe=GreKV85V z&mxh3$Pf3K8iW23ZP=SIg1d1f0)st#V2>wZuCXt0&3X?;wfng9|0Gz^_b<>?K%r`K z73RBF@Xp0OpaIK@F!lCbtp0fw&W+S$$-R!CQ;~@~-dI9f)>P`QtAU#Zb4Z+(F^QRz zio$iF82)7kzOlPlB)9Mq3?6;Jk%)0v;ILy9X&jINDCG#M?>iAWQbmpgUr=p$`u*hUpFD1kCa5S!a)qm{|vXqRM^YgfoQRnPr7(N@u1rhSP*j3 zSmohGTyzS5N-43s8E zmVTpFPA8!}>j&L3u@D1Fw}6yh1!{~EBi+B6ar=NHn7tO#a7!P-Ns)Hze_WrnwNkuN ztVm*Ktc7?HE|@Zu3STvPU|!r*?!Mh9qOm8C8{fVimz@|*w2fK?eph*z^ezkz4vfc= z1Z&{4Mo?au!FBJLjUP{m&Qn$vIK3iN+vp}Q-r?jg)p zU%?%@*$BVxd|T+y6R7uP1rq4w=xfL-ZITvE;=>N-@%MR!cW6`#srMV%iF+jpwS zY~D+*X-_aW>uL`^42?_92V9(4`*~Y$w zB>lk@xS&0TJdRw=@;kLyIvv1Dqj+lk!<6myoXliAt?8GKX8|-t*s|ju#wOpyW!ZDt zw)zN@U`Kb(Qlbu5>WpE{6;rY8vNcFu-^y

)`(FyFgyWF9fm1aMW$Ih3Tbp>C7ql zxVBP;Omfu)GqoMK6plcTq)1;GF`2&Wx{VPXBAqi?A6%lOvHPYw>I^EA;#>>3YidHb z-#m)(7GE&k=@FV3W^)QM)4_dYIVY7DOYh7)jV9&_&^kv8151|DBzbw5V-N{ao1?&T zS2Lzm*Q3$fdq~4p;LdG(sPP3$(3&f12Se#7@9`HmzFC2K`&VG^)gh{Q{}lr76L?O! z$f?_ixM5&0^ti~girsY(cQPEVW*(v|s}6JToTYJ}gflVA&BCFN(RiV74USxAfMIFD z@IG<`*=;!kC3%6^U?#x=V!d&iWl35=zFwaq!RgN{~ zYq(hQFWsCgbkJJM=Wi{6^-lFr@H~&TN6v@Us=1^-x6OFFq&?ZPAb<>Lm80v5F=X|i z75*Hvg`Xztz$$86h~Kpf*sy0e*>(Rq3r@d;?bFwB6Nd%pH)aD@ar_eWgsubAi()u+ zohg(o&IGBK`>|N*2o#v!10`t+v!eYmjoX3W^>=a?MBaBpwE{PI`4#qB+Y_%TX)t#> zfvZVYu;9NEPHWZQA~Gz_#%sK!QrB|O0AE6g+Eh{2;0W_v*n}%1Wth|%F{UB&4FXOL zaYxo(L~>D9u%T9*(3^j8t$Z;RDMoxP0>)oxfl{exVns*Xw6^ zAXOTpmKJeIsEvWY+c@)6dLphLjJ{u>F2TlYUXhbG77($pqNNPPPS(_8o$-l0Y;z2CsQPZwCi{89L-R+qd~`T%Q> zO0&%c>tX4lbNIL+4IB0qQH3+^AaQOs{yjtC%=K%y^1~09@SLHF{&TqbBZ^*lF^L!) zsDUY)&le77OXHgJvuUHwNp5f`gTl5*Fco2zMRGGR{EsOn%pFUIX0=0W)+%P|-3Qe! z=3Mcuci`{r&i!7!8mgbi3XbkPM683pV4;5(UY-&K+lK>i*vAv9CaximR$gLPX5B;I z2S<2~fAnak*HO&zO~uC7T@ZP47o^EeV_!4dss8sG>hk*+_oV3)4A~!`i}{}gg(|hCl35kfUw4C!f6KMnCrI?NUQOJux(d~SVmwu>CQ%E^GIXR2QsemFgHue1Zpz_IR6vvNJUWcsm(lR7vz zUqh7g9DxH*ow;j=C$pJze&XwWk9l1#{^Uq;AXb1mdwcIFSs1*8NUI-a$EQBTW_x3{ zqh*-4bFLOPlwadqQyAO*G?-Ng+GuL+RA7c9n1oS|Xm(SIB*vG)h#1lSNm~WSvc<`W zQcuu(XiWyfEwE}!H_aRJ<6f*k!VG8q#2C@%8%7wz@L4nR)+L2bG6!I=}*%Wkb8D4`slmU`8Z8*@a+ORwm+D=q<~CFBN*_-qQSYzF_QZ%dKk4fq}i@F!+5j`8#0DeAc#MtHWuC{=~!j++XI00W+acYluc~wL{4PZ_c_$g9K=pkpFpl zn*U!f=lzC}sp)ulI1>K!RUpr+2y}e&z;nJgZ;r}SI=Wai`|RwBt2c*2Uj9$`UZh5z z$?b;DvP{Uc{*0}&lBq_xB9S_C2&Dq}c+Y+>ewH3UE1$(=Mn@w2_%RcYCJ&Gzt5Gb8 zSAcPyckueN9&q&Qf+H{EX`)mTY;(I0*L;?9M_y;3%BP#?(3Qd^zDNi6hVR^y1$(en z_9315WfXkt{K?JfG@vz#5}fCABRHTC04(7I3^cjJeSKYSvyBwlY5$yNCS}5v`wCFE z^i7f2h$Xnn_#KgK=aXB1WKpj{7U<$c-l*3{@K(iDP~{?NuJa2pa(l~@osa;TuIK1$ z?=xJO=P*nfp#%rxcS8U2D->j&az75tF!`g^Lp|rWap7(IvBvTt%$arvH%d34@^W3Y zEN;MqHUFsclRDAR<3v(@O_U4XT`J0*z9d%Sq39-k3T7!MXh|7{1)OdUe9lQx% ztGJAoBbHD->n5C;wN$nfT?L z{ZU+v(jr;fP|A`pXi~I2_jA9NLZlK34JA=zXm3(cX;a#xjiP;}nWW{ppKm2aB}R&( 
zqC_LIWgF!8{Qiggy7!)Q-si}mwfZ)gy*(Od&I$&JkWwsN90HN26qxDc3Xs?6fb$D1 zY4bb{SiN)_oY0)kX}K6u`}Q_GkzvdXa(bCej|8or69^p}+b~$R9!t*LU>ZHZutAVNHwg38KGOAvk}xiL1SW{?uMG!Eg`4&@fX?X~;2N?Q zkLK~re@+$KXpFRXeJeRyHTrGl+F^0*Attd%kX+yJ}#N+T^lp`D;7%hAdMf#=(s+Z)h@%w z`bgneb2u*ft(%>GzW}C*JQq1ZCYi1>LRtdnlOMhn*bskGSaPU6tY+%JXnU`!6hC8U?*wrGTYFY}J=a*etY09}Q((^&|T)6Tq7+x!QZ*D&XUutvKoU1ir>C3j!zk8VCLEg5~piu)22{ZCr7Y z?LV^z9~p-Vi`Ng3!ZI(cIz5LAyQ)IR;TN#pT!QD#%JEu|3~n3CL(g;S>=1WXuwSl@ zq&FFnj+_Nt#E%4gc=r~QUu*|UT?lS4en6X#Dbqa*`p7ENHDI;Cn3dK61lky)k@_MK zQyzm5=Sg_-WGikk@W&%dQlQrE4Lr<^fa_^Icx35wznZ`2!cm8=YJV*ZyFQDWdGE#p zyFB6G#RD|r*h)Ud(Ny$*-v+Kx9`MC5pZ_j>9E}?yIlbU~Zfmv{_aI#er=`Roxab^10}gBgQ97OhE_JCeoZrQQ)KqNA}9%yMZ+rBAJQp&&2S>&w8-`s6lU@y2E*! z8`6sGUYIhT!B4@PxNKh!8IDb2o}Ct)mrW|GEIW(pN+#sos2;CY@(N|e*I-bOEn8z( z$=$Yeg)4qJ_=n_lsvUb6Z?0SkISOZZ(NqOCI5^?tBd=@SJ@T=uvQm^?13e_lwx?vZ z!-TGMmi750SY^m@`%O=RR&D^Kol~W8CR(_(ONtK2bwO^i9ruB65~Qh)U}J_H>!CZi zfckH+Gt7eOYRFPC%>jNkMpLm(f0N*@v-C>gW3u%3VPW227EO|P2?-a|;kNV)JS^b} zLF#t!>s(`cw}Z^;(TFXWGf^75LsDp> z_$bIb*N9kVKQOw$vlX*T*p?%`Fk}4?>yNXdkB6mc^q-etXV)X)#|M`}e0j)okIdvo8^o-BG-4_h;y}-UAm#ZedEsJsPhXPM-eSf`#8RiNVr|T&#r+_hv|( zo4Uk}CfXEnKc}{XD)$YSE!+q1oey#KYZI|0D~xZvAjPRRtMXFyLT<%6R~U+q;P$MF z0-u`0kmdXwz78#?c2490}??O=0RABZamhdFIR+M}B2@PFx zQG@0~sn;r$BD2_96GfPRa2EZoS%K3N@v{!I+F(a#9As4v5wl8v`lDzKSCBOsA6fXq zrkn4`+vOV(UCi*<9c50kDUe;XH(-m#7?Lsxx-GmDtQL7f;j+7A{X%=FwJi}&y*Ul5 z^0vT8wh})h{~s2%Ckx2UY9^^E$rWBNfFS*g;PotohKK%3OWX+U+|~?9|sq&6tJ8XhrE3%+BG|qCzpQ5_i9-%tMw9J(k)KQz9ixFd4Cd{J~dwT!6feP zhkASvAY`uftDx%rpg_!9g}e`jb0!))Ew94+tr57;Xf_?H8Do0E z|6z<#td+*5nf7Cdega=mU4x( zMp*}+mo39BnU^7_eJ02;bvj-=MZ}&zh9H+vk~UBSIv)bDD5-@ty|ZO*g^Jwm#ML4X zsuiTSFN4t3k?Ie`7ShoHEkCz?^ ze<<9jMNL1vzg-gRme;|4zYQW^wGb}I%z{oKGr#+e@1l*5NQg&IEg$ zH&=t&q?WPRs^_G2V;4tAEGO zt_%4U&lf>~tSbntC)1S@VZ6GV3?GD9==1X*JkeIhogYcT0EsZPdZ3E~#yKdLHXFAm z?*gTXJo-#d6Wo2$j|B%AG@Q%8iOZt1hn$+uqFCe*q&2jG__|Qu1(N1CQ46lAL|j<0am zI1&uhE$QORk&ITxZKcsCy?wcZ7&I>PPUgD~hP;eS;Yj+#yJ}8P`ku;Jm*sqv^S7q1|`^ 
zk;n;Uzm%Bcor9L-wXTrqPcdP#gV}IDXb9%Unlp=lP_nsuFFHA@3lFIT!1cgSZ1}7a z$}XwJ5w&1Eb3a7Xb1EgW!`UMKVSpH&mlr5koWu2>{=@v5ILu8lL-~JnXzE5WxbS8Z zM4t#CDm#i{dT|Q1JhB5DbCe;^PLdP<*(u@&TG$7}seF5KEi`@iKz~;z${b?RYE0yZ z8Bc^0_k756cS%r}3uAwt-b#z3_ksReZTxCzNBe8`bE-RasnQi`VKG+$&AYwf9tlI0 zb6H>=C;^`F58y=qOAs1LgNcE=;9mDAyB`q4#x~WkRNG|!pZ-+JA76uWb=0Vp)L#qi9F;BM#b20IE2k!Rf zDnV_7Xpbzl0(>8}ljoJg#Q4HS>M!ku3d2R@@r_70T%wQW`|qRq6-yH7A4MJb$<)2P z5^HV>IG2cjIlY-`n7!i@n_*%NuQR_ABQqnGWN!kE5lW)Xkw#;g_iqL3!nfnFh!e^^ z{vT6zjHMDT!PGqWCoL6?Lv2_thJ`0*&_^o3q-LCfaD^j$tW_e+D<5TNJCs<=pfzL5 zH6i#z5B}#^0XLUB2xlwxL0)Aj=+@sSPpgW_cg5c@rE?18>WaLl6Y+w~UytJQw=+2X zHZdZ9+KeoK`v3!_m}Iq!C`MMHQJ@Sy>0HV=6pzxW?H@r>W`LG%wPAIJJuEy;9`$}` z66J9%@K-D+T2H@%grOR#O*12Jha6~ybTr1VwS*&{;+%Kp53cEPhR{%Xl)bRyuuyd7 z9d~Soz+F;&fNUhr_AbG(Q?J0!uu521d4{VsvZnWXT)1JIY^vMqL8}jIVs&~F8%mG> z@0~n(-S>%TOB|${o9|MeycKlUqrEI_$wfY{=n6M*SDP;ImlEVm>V;&VJeE-3iso{a zblKt>eo=O-s0=Q@I@Rwi4rpv<|MwjJ?*%;RG-Mt}=JVQ?g-lQ|fez*tqlZ}&Mm@d= zcgBMGl#DYdmd|lJP4YzA{wRJ7mg4P_rRa98Y6y{B1C!T$hBlo)U|iFJ7+jDBmluUt t-K|Y@(+42Nss&D1iSa+SdcgjWKJH71F?jA?C)~8h6r*e!aK-ac_#ZMPfiVC8 literal 0 HcmV?d00001 diff --git a/src/autoschedulers/anderson2021/weights/lens_blur.weights b/src/autoschedulers/anderson2021/weights/lens_blur.weights new file mode 100644 index 0000000000000000000000000000000000000000..74d7bcbefc7f13baf4c42b88935dd3fcd2b2711e GIT binary patch literal 20400 zcmXV1c{Eku+n0n8DugDKQVB^i-1F?4)K`X*REp-3N~MuznPr}bOqnv3v5d*Ro(WI z8dRB+^g97`PV8Is$eB)kT;_n}^be#Xq@Pp;xWkZdF$f(VV0KH_a4e4tK~I|(c+X_|nOX2RwUEIP}!km;QFd4c9sVc^(STG+3Qi>U2 zivmblZOSUY^Z>IHHyBZ=Qt-nT#%yjM^N+Zo-1-Ns;-qDe{`?WxHFmJQE-Y|Q#WB0Z z8`u%Y2E^1?cz(-j{4RDI)B7Ky5%(q*=;ng7eE~eb7>S~RZ78Pn0RulQ0BhbJ;J<6- zxXly-R^vXZP&d}FLYsBozKyOSFG~eG; zGxH-U51hmO>>13xVvLw1tj1C19)K{%IXDcVG$Xi;${jpU(*7;Qw?Pv~xN9i$A;1?# zg^Hd_58ttZTjaTo zzolU}>2ONs*soye_^}k~oZ(GXbR5NiBXh+p*a4Bj)^g&X%Ea=&+T zsjB0^h~C1KNu1?5BNNCnCwpXVN)`Zalh=zw7t9rleyop>F5x8Y|qAm6X&4jdOzOZ zuo%y1HlY8LvozuOJa}>c0(l|!2rJ+HNBktRshImV+LiZ>&e6@`rs!WpCMKLFX~xi% 
zoAb#MwKTXgIh4$}UP`-EBdB~Szv8yyDr#wao%pPE1)fzkoA={B-u^g>&YB-s)2DuD zpz#N|+3A<&Os&^mT#)nx2|E=h9EQ|@647`X-Q)u=X9nzjDN2M>-)4er| ziE7IhjI!=0Ik%Ap{u`hbx3aho60+I!g%9Yvs%La;r5xE+7(t4Uxs&*BQ|KS5Nce2E zmX5FYrB0_-li8sssBMxbsVba+bsqcC;Qk5V-!^1;r}*rt!W$^3E|2%DJt1$|dscm> zA9X!6i#$=APIb7YmKyPMcB1%DkwjZ1`xD3tCFvOyTS= z=;o*cc?waWBDaIh^SguwDw=dy_d3}o5sLk@&(rm*Zo{_iyXoh~`(#G67ma_N!|r|Z zj2I=}qr2k2la9;V_`ECEsNI+vx97VTtD`_*aZnX_y!;B+KEDS}vB35sXLsR+bDudse4IeHZ4;)Od84@FZ|oJ_jzhV3;a^7#_OLk+ z)$tqG+K!MzbQwuAoj~tC6XQA_(1q+4BkbpS(%d6CSo7^TURk2T9F_Nj$e+TrX{|3) z?Y05rOt_4V+Fb~(*$Pe%Y~X{%UEJM~&nPOMzjf zjQiY`HP^|J_T%L1>>nW3Ajj9*tWBo7o}zX?v&e2Oc{&<@5)41clck0o%!7!zL{TOZ z`4{GK5;nTQzvC&$9T`dc)F z=8l5SOAnNaFJVpd0_2ve`kAT$`F!`whLvMEK8{7!aw#~-ka_Xb-PX*lV56{K^b(Lw$t$o}wV zD@&Hp`zmj#h3yl*<&h3<-*6?@+P#Y37+uA!_!-M@$>%eA*DF!t0|({gCjtL~52sOg zjCU=k4Q+cfS)OM$$&(DG&%#p3&)v;r$7^Xi{)gho);I7=TLwjUoh6(qH=b?!dGczS z3HQDIcUXN!jcjZbrXuQ>sGoHya((7=L~eMpp&x}%dS()c{9XzC54xOZ-|nGn@B$jV zE|s42E9O7WDdRd-CA~=nw-t zSHy!H%>$ujca8aHgURP8A^rhhO_C(}0v=xKq$hWZ@NZ9X;`W|=0d7`$)cC~>e#gFc zvh@2@oW4qm1Xd$)wn!96MxEn6?^W z_*-Lxsw$Sm=eiX!ofZX$*B64mSqdDL{Eovf(?Kyc0#xQqg_K!U&|Mz|Uq)U+qsbcq zrqL$dQd=00KhNp&(iD2w-i7$}*)(0~2Jz*yWO&h9x@e&U?cX?o=u~Gh?&KI4e2ZdEm%3oVm;i_J*h1%wn9x<*L&=vXYq@C+m$)xC z%;oP|UrUo4I@mMQ)9_Q;G?-i!O}-4?1IeXoAh*Jw-Mr>D7}VW_9}9e`viM~xujmb} zucxq*HZoASLz#X5{xvjp>Ol3^Vd#nKWQ>=E@O3r5;As8{A~s)&PA-w>o|PLX&mEPy z`lm$syfBuG_nacLuch)|U0K1Hx0!Qyo|h)qry0`2o^z@9mh0TXW_vz&z7^kY#WL8l zeKYs+_G*Yw4?(ScX`uF{5zgo2z^iHf=&+X$XZ7H9KhnbGJbYTe!^=8}~k!S|XJ3f#u{TIqC)zMA<2KY5LF~>v@ z#E-Ust=}TNIDa=ZPBw*F3tRT6{V4W#yh7Pe<@ot7AMafVfw3xgs12V0c3$&XiO;t% zB~}Rqr9rDi9!%S{2hVpKV|CU;7}(5b`qG;r)j1PZf9^n+Mr#m? 
zG%-^8bq=@K9^w~Ut>dmbug=xcT*DV{i{al{cAuymUjjoq0^D-Kf_Z*+KAKM04R>33 zFyAx}g}P?5Rcc%C)2IUNsozAU`dm3~vu2Um3$oEPuNc2RlObb6?kLwdn|ti`1^z<= z4);`hHgVUM<2MD5kv$LO`3^HgxtBVh^F*dw)7|CUFe0Xg*aS*){}yE8t-h@!H)aWY z%R+=79biNEdc1*&`;$3?@^(b-cQVvy2JjSp8IJRTb#xL|aLT)F{hP|1%aS zH(#2u-89IVB`pPgrv2?Cz0A1j-j6~cBBn|nh!S(UA)*GEH|tpE5FH-eRFrx*N#o} zN24UIP1-^vjyyxY$!U7>PC1iophc=G?}KE^1~ww052kpR;PDeVM8VCTnsDN&X!%Dt zYCDApG3Ml7wK*{r&cKQ9b1`s)&#JguvTJkI(dbq`z+mDxN98fc@jl@(2s|qm3;6 zN_|P$8v$lBy+e=uH6zOhBgvd7L$XDDCqdUNbnyJk2In^6sNX_R@xIM#%S4=Yy%YbI z`=FG@E!H&j6-A~3b=G^3;|EgM*?te`)U)OEX_^+D_dI}1itoc|8@*_@;Uwbv%$-qM zB8i*at=QIgN@O-i1>H`Gq1my2z@7hx6Y9d_iR;&*ZgC!(@Z-SQVjNPs=hNN(Gubyu zljxYmS~8)v2K8RV<4N-^beDDs$aju2AwHMkY|I++(=-D~{$$!F<44`bMnSIU7ZwI1 zmFG=B7u5vDV^kNPy&q#cr2{d=x(c!+dO1gW4)Tjne!@862+Eg};$J%vOfp-xk}$I* za?$$@nX#dnT5RF5$~*gk+ZoEZU+9J4d8lFSr@Jy-}Elc>GGt8 z?s;^c@fW%+=_65=m`AQBN^+k)oW$?l;7NBi36l?3Z3y>_AFMxjjoR-mq8Hpk$j^d9 zT$w+=&{Ec&E-o_TC!8+<7h@kXT=a&wT)zO$aTLh@tL^w)Js$2jisQT&-ek#Z8C+WA zPGm%S=*QnP`RPW=w6d-ea&m`B&mJjm;3qCWHs2HH##l&dZxGv=3hEWl+@#YW@YqZeLhiT^vS~#Lwu8+Lu_3-C|~;}Cc1U9bEfX6 z^Pg9P^0yx_J1LNr`ui4~Ln~3ua~|37(;E6rc9Ag^F~0YjP5glq!#Hv~jEJQvaXk|4 zxt7i=`6jk}kaf(*y`wRp8|{laZu^+Vg5~h~z&*^&X@%vJUFiDr^~CBy6cM`YMgRJh z5&5W8x@6=pqgC{b{h}1aJT+1w+4%-Ao)&^rn*PH#D}7-2*d{Po{1A7T6_K`8llUvE z?D%=R`thbp4wVP9+5^YS(J+Bblao;cbCjWRuH z+vyd#cSMFvBKMm^+1AY0MC?is{bktA>Zc#33z*-SzI+Pw9kT)M(;2+JiL;@$IGA?qZGWLWk_CvgWE}V}#hB;UPa_vEE ziM|qW26w^Lyb4X98Ebba$N}Bhae%8Hpmmu=m9TD%+#10KzQ}^C(l&^IoriZ}e}^Bal>gv#*E~S+XMdTb$xlJZrtMzIqM3O5UIyxS z^`LlAHFV{kgT6g$;9`_Co{!B0Q>_GOG1f!b>_%oVH3tvBIEEs|EEwF-WrfTi8W*8} zGa40xi?|oq?;1j_&4XyS$`y2%B!Khs2W;eG6KJ*hhr?ff!`%1v=pGu%I*Vq&bLA|o zyq*D0`5Um{r7V2KU$o9a?e8GSd+Q{?_c?5BY#i$B z&}3Eog_*K}pWqed58;h(LC7oINI2m#MwTRFwA5dy#~9pV6^k1CzQOuO3sK0orXqcZ zIw<~3$Ng~^VQo?bblm+5@9NC4_fR?XEt3PGeQ%6+KY9r6Su2@oQgPruA;VZ|Ni@5@ z=PDR==;Pt2Fg%&`8{H=NM#a0DCa{gZiCrB*pcj~gL(b9Q zWj+k|9}Hkl&;`uc6@lX8O4#P~n>De03IB#-VSkpm;Qn3M<>bS(yqd;tkNFJScV>eo 
z3BzlxIXJ)J9DH$a$0^TW;kB-x=x*rBQ8o>PG1DMuoR5%3o?xigKc=qA3k?L?tK!1H zOk=e?IDSz;oo{hGujc!R*ICTE6$@fUl^A3915FNjz`#}s)@5}M#;*OsI9Uc@`m$Bf z_Ie#Oi#|Ztkwmoaoy)jKNq{^3R}nFF97m@;WI(y_9wPb{X`ZA(~nSvwnfDplp;4+wHTUDDMiY1nhyG&Zm&)=?t2n0P(lNV9N&{ znl^t2{stGG(5)Se_zNF2`5KC%dp2Wy$45vB4PY{NoTGShHSD7i>88>c1fp9-l!rMix|jbRb3KCC6#bTku%D1%;L0 zfsm;-TNdgEvB_REIoOFtpVp)+KAJM=oma_|Bt3Sy{Yfmg5+}9pk4U4pIN51Emmj+7 z8+)<-37+t+WbTN1)0UqtxU{qdOJ%ZY!25^HRPh~ zYkTplz_*&N&BN*lKXA8BGTz-0%i(;Bz);RKdh3fO%}$+2QeS7op_@s_^xwpgW<;Sc z|FG1%fIXkEfa9l?Mw6Wrsl|hB6*I=xktVr`gtz`Jd!p8tOU@Sa4f+Hiv-vnp2%gVR z2>0Z^-LV;d9~bD-k0$c{>Iq%{B$=qIl%lJ-J#p*1LygjsG5z6HlJYi=7KtlR6)#^B z<<@~g55wpQ0p`9}dlxne_~O!uONesyb{ZIPoqk_pLJzO$#Vh%tq$YX=k+q-97xEwC zILE&L>(?IaaN}w$6U;0>6?YV;Cdj|o$hJu?L|Iu+FbMWS_NyM5IB`0?5Il=Y*DgV; zS%Zv|Z#4~J@8a`aYDS0G$-*SZNc3vnz$yGX7pAJHQkzpQBsp;nJ@|VQUn(e?=-aR6 zJ|Fl^s(6)T@y7xh-G2=wwhe*i>TFQimBYE&U4<3&Ikw2@La+dLXiIux;fjmsl$MH( z2Uid~+hchC`AuBgc$_{yHj&&jE~V*u0hk%6M#Od`z(Ut}Z1D4k-bb5YlhGWiy;vHy z*#*(juV?t%qB4nB%S!G}`(C2+>Lrm)dq{;&>9E!cDd>^x$9A8ygoj+;Np1*ZKhnsNC68g#oI<~9? 
z)=l4!4O#W5($@xkZhY3=U@P;WS(c|4ZbW`9R3Z(EkC}k(m?EWR`aQ93l6qx=2j(sevkURp_Q{_>U_ZOXPWI@U}8MC@};LQ^D4Ie*^xJ8jjt!cvw&3VSkzlY&q5nmrKi0>j58h zd@eG5J<_nI^b3}lSpoM#6sK$DSI)4A1$a1KgZN!O5KnW_$yf#r9`9oL&1ZRvM;=1T zM=Mw{H5XNi{&1x3$H8KkJ9tth4~~S!V(JtjjC8*Z?o$*wI&#aIwNjqI=Xh6mmUf}l z_2qhe~nkB zY6xS`>Y>cnhz(y>fjfJSVo%2exVPXDq^cLfyJOGLc1AXKF)tZ+|D_;2p$2BHTLIzB zL$LF1VA7wIAqxxgU#A&Sml_9;xES->GD0u z^gO|{ji2H6QXcHL`v;kYI zRe_X^$;{5@Z@^lqhgY_41-6(zVMZLZvDc>pYvXg__DN6Zz3>qEiNy@p$c_`^nTrLj zMT~=OKH`7RKrE#m{XQLM^=|!S{ZHP)uk%_la={63Py5U&9u@RZ;e2!-w&hsgUx!`A zzoBon2ljQAfm2rk^U4LC5$NZ*1hwNtha{X~FNrr+s7r}H-48+d91x|AQY?u5$ z&i?t3C%o(>}XQ)1oeOJfhcl*)oY$LP4G!hqo z?S>K+PnP##6#l51qW-Zc%u=glaoauE-F2R+{cj;#b+-hc>zbh3jeE?EyZ4dTzW}OU zAz0h;dA-9?5Sdhu{3pSnkQa+%xht_~p)C&ojs*v60q#3Egm&xWp=DH#mHlGHn!Uay zcS3y#21t=_Ag_p;3!RcHV)3y9BXFn}uZ%+`zP=Su3 zDpbvCCZ$RB*1+h8CL;fb_6^1n!cOM3*H@*oaGPKAO>h#F&_A&1s?Q{di& zWYp;thmFGT(Jnp@q<$?0jpt9mU`MEA7Y=4N#gzkO(6T{ z3rc4ML7;sL=uJ0)j?o(6ME=FDJT7LQ@PX4s1+3_sMnTKAQ96Fj6jWAvvhH1~bk7tQZ2Ml%nb6h@N_L(+ z^IlJUA*u?tHIuMGuf8)4TqlZ07ha7-mYJcqEVHsiaNHSaC{JZ~HnQt+Wcx*QI z&OX7^U$o;~@T{vC+c%%bD0{MpX5EKBTQ4vVquii6QwrZ#*22=PT+W+C_t?I?F6cj> ziw9q-fo2L1MlBbC;>3f%tx3k6g-x)6XTdJGR1duz127ER#2AbW883AE1Go11VCnqd zV4u>BM~+Gf@bFy$W|lO{jr)uTLv6`I)!(4Ku!#M=y~ij?d61>Q)-Y?P{9tbRC*!W4 zFCa-j6Te1klS2oy*zRk+*k;@bv1NB~@k9|4eoKde8QPe7zZ~^;Nk`!+qNXKf%TcNaqSg0V}CjG!9EU6^eNjV6M~XnB6x7;R@{78fd)>Pg_ne6 zVQEA+EO6Wl#e;-ZByAx5jejpyI1|4^42;W13ifC#NBw#eu6q1{iTlat$ga@=wHw{6 zN6U5WP?3VT9V4jHaU9N|?TILdq{V zVq1u2K2o^H@(%OKo#LX-F^I@dMCG(5rl&C#EG}OHgUU)?{ks!r^LIL<#U|nG0C!yY zX*p^-egK8wK^ze1m|vV1Gg4=Yn7Jntnc$c%5E3qgKc8zb#zh{XEC#ibity`=uk0p% zE=X|)!K?I8`JSNz91W{CP;pF*Y|59QE4*7+jX(P6-d2xK9lbc6PrcY@8z*9eb1L}o zEx{~ThaPty0Mq|c;iu3KR^x~~G4d8BzSb-|W7k&Z)K6(@^odJzU?B|>z5x<93Yo&* zyO6X}gATgPh0R%pD799A*K;;vl*RyZ<8{E;`~>sLc7S)u+#gFt^ofCABD$-7MBTh~ z^jzyMmgBY%Kb@Ch{$;#lf3d%ebGpwXjSIlbT3XCD{dm;%PJ@#M+hI6&fN=|WgtuQ+ zAa74FbHV8})@#TUnVXrge(5<7Q_`SWuKDnNXAHVJzl8Z+L##>MLOOY1B`Ha|%AkTY 
zRNgH^gU^51%%zWEdpRFFyJnIImlkFT&!2ZO%Ly9it!G+Kon!uZhk*0yF+>pwGBAAt z#`J{qDn;v<)nc+xJiZlf4}HV6djH)k^_@k#=gono^0KgHgE&n7XU>_qC7(IrH3h7m z#zVtxWUg!~<$dvwWm%&tCZbdhh93v81y1`IS{aNj2Zx!j>-mtOCh&RBaba-NB(&=7 z!$f8uY^}ZlBA>>A^DGxlyzSUJLmxD2IEy;Nd*OlPAlo2)fGw3uhi%onQ8_3QtoB?( zqcI;`vf&e4|CEYp`CF0yM~n)Yoq(8%5+>ejDSWiHz+t`-?9Ly+m*TTZr)bHS>Lk55uN4;j%JccKb4S=rP;{W=W|i{`xb7 zTh}w5wgMkadlSZtox@mV6`USl07qWkV$4^@|F$Gc$_E5YTbhP zJ_GDgy?`Hm0Y7Vbqu%`pW=~KsJDReRsbEBLB8g$YJ`skKb|tvcXd-oZ|B*?obmLrm zsE>nVQdIG5BpUpzg0;_!;bvC^{9RLv_xbWD1#)1~K7gBZ+IYME&LaDYsbCLWVC3(- z29NobsLGMRUoXY!dnT8Wnsg11ali8>+h_CCOwD0%pB^0j5Q?J?uJHI959WL|zTeu<+qToHlX`V=AUIVX+qx-g9BWwg_hXUOlwS zoe%q_&PV;Z;`lM>Dj4TSqx{TrX!x!LN0h2?Wy@_r)@LPz?>>yXcdlSX&5cm>Of9JB z1o7lUoZ$B0M+}sR!}poN@Nr*1Y@Ou;HHA9N-njqZ@sj~m(aho5Tux`TB4Xg6%sa3< z_MIoOuN4!o<}m%K21c=QolMG(cD%Fd1e$w@(WiS#pseW<*yo)DABEpICvYos^MM}| z8B>hHEs{{?wSmZA?SjOV{ctKW2$E-Jvnq9#Ao_{Lw}-l6CjTV9|M`fyYjKx#e0U#r z#agio98`%$?^3kUo=02yqcEW?0JC=8K>w$a*f^*Kf3`_8`~TU1*vAU=a8rP=$TVZM zqcx~qnhDOHJsjuNLO3*!g%+7I4EHyp=OpHU_S2Dys8z=RoHJRGxxt`Ul7UTDKQT^o zC;pk@%8t74L9NM3IIyh=yiFn7R{ZjaXyt?JF?j-F|$r;Ra*kC`V!fTCq{km*mnH z(D~NaLD?&cb<};2C$Bm{Udl<#sz?CObw_Ygq8g9)%^2&AuHwU!Z<%{9FM&_86>ci% zXBQvb%c>vGX60qKz`NQ8_U*zA%;PITs4*Bs3I8q}NL6N=_KMSh5BOTM!-@V=R*uv9L!A6MR20mXE^A3#usk+*$;0*N)(dyN5u`#T6$B<>RsC-LQ1d z5A=`J#Tl31Vyc}$Hxgk`(zg;#)FzTMagOlOq=fUWuoPw~WMb+eW0-3Cj-7IEIlU}q ziGvy@Fi-M2gj|@63p=(GvjuUSX>v*x&jaN}s($w-gcTu{{W{NIXH|^a>b=-_N!Q^n@vuGVIe79WqbE67T5; z;Lmr-pnGBl*&Hd!dQVdzhxD2ljUziye()3IjCaAFb(5*YjfGgzyO4=IJ<4p3?ZUxU z7S-lOG3}3)@WS1ftjkev&Tl%0?5=OXRZW)Qv96J|b7*Em-)Q64xVzvreKGM&ea*-? 
zi{pacM0o!(hAltij5p-p;nu7TOvaiNW}Ay65`+EJBv=9FU6lpBR;P;dI$LPVq#F9^ zVgjDJmJZj3LxC*~fgV>Ma#FVurPbEa)^quwc&?Ee7Ru7xO|M~NfDCMQDQ9QQSWImG z>xO4>i(ux+EjnKMhN&~RLp|?TsK-|Ee6P-7jkG**c#0LfLZg8(-Z2R`!yA~_@56?B zZvqeBl`w0qJ0t#P2&YWY#lI3a;E6^xqy2Lw`|{N!{42H&f6q?DQ#O}j$tc3fxXbYK z?@jj3ZBt}^4YMEI!@-ZA#Cg&<9iL8m4w~xzn0DtI&ta<@ws?9mTAPb;w}S;1M7MGt zKPqSZ8q4roOd%6gE(PTeI`F$wFedXo(0tixNEBBDw;QT3lZcV8>e z9A4V8l5XbwV8&<7C);#CvyRbvY>8SXx@mjE-hr#AH{mDrjj50)FOwiz#tM#zx1ig) zKKAv2KWt;?b+qelg0*smP_n&%75{GxKL(1hwVg8b<~?s*b$mMBA34U1sq<0pV-oz! zm_y#QE7Qx%{$ff)9o*gU4y)w5(ecA1sF)TGyh>LbnxKWNUln88fB)diTd#@+yYD=Y z04YcxiUhB-`QT$4gkc;z80){u9!$$+$47s$c8>{rqBNKhju9vIBg4=cc^}X{5t}C8 zhKUEW;LPEDP&vPl2HlW@7rUn71%ZFYhK0b~>W`o)WdLVphBLF9#ISRT9=Sim7$==l zK=Ji;aMr7hS=;4@0m?~GW5eTgqT6ItIRdh0%>&*gZBCc4eC3EyWZF zD;vQFb5&qi;Bnau`tfdvsFUHjQTSRX9Nw%?U>kCu!SmI|@cG*)yQ?(@D-B-a15sV} z^=srP1oOefv=+VW?BSbQ2H1X;W;(-;BR#1=x_%J)_^k>q$6?&G$_&Obrs1C9TUeDG z0dTraJ6}PTd7$#!)*Q zp1cjOz7T^H{~u8MMVfXOWJAXcE%ww`K4+W93R-sjD3v_F9-PMBvQw`7hxX@uAffmI zTi*H)?pDtO$@w>V)vC_8RoM*PG9;*c1`mU$nqal>1WvuZIOa|4W~4Q?*)5jWrB$=!9#eJ3GG_QgQ{qe8fR zHiexkvmQxuIjdfvj5pJ=(doMts)_u@(Y{2eSC~$yL5+Mf`)S~d=NpNiAFy74*_|Jd$uwPCs<$YQehZS!f=;9xh zxS~{=ga|apGY8Y)kE3A5y7a&<&xfqSrYtsb`~*b27Q%ro4`I1VBwPqQ!m7^Hht(r9 zaP-FlXsZ->Dluhm1iFU`({tP`0N>HYH9?kS3hM>xU1kV+b4{r_i}KVt_<0-Ifz~}ajsG@ zXsw*hHqK1rD8+n0{|9yuyt)(*Kapa~-oIpiTij;aN359_*B`^=-&v6Rq8XicWpg}d z1qgbWAtTdP3a2OZ!xY2kOn6<3@xOin2NYnWEP?J55|WR3%U8pHKU|^mO9QygK8`gq zqTn)L8JPxwHoFLz$!;DHl&}fh&7|2Sa*x6E@;=m8O@QXZ%ix{0HtVU8i-+Cb^w-RdgoRj{fiQEEDrR->V=J60 znZ|5)2z&4mXKIFE-`#wu8vTKWLv!(O<|ZO^ZXE1?1TwZ4f=S4j4slderw2^?aqrY; zkR2GuMD?u4#Waf9Ga&GwVry6xOOEl&=E>;DQ=+qTDw&M9wID7vk957g2Wx;st;*qt#c6Z==#D4JjQ+mI*gon> zzfSVN13f~xuS|oAM=~h#XtcT;w~PtlwL=S`Z^1i!y4e|~C34A`iK_55>?%%AH!|kD7QBJq7gFHCt$ zK|?1E%iDG^U6YG|KVdd&w$Kqdy(!rDd?z&eYU7tDf-IxxY!bbEEliX0L_f!Iw&t@C zouF2T6$X?wo0S6tg1&grN2EgAA`pY)?I5aN11BFJsPO5N1fT84z+cT7QrlZFr?C}{ z1Uac&(Q8rOXA86D*lO_C`U__+m*d1)e=#}C9YMbn)1PMwIG8d$@O>E0XWO%~bUJps 
z-h*gMdAcR|FW4@Ogm)tmJXMFoOzcT>!OY!EE^6oEuMY`$aIPNy+Ng`4%msQ#i!dn@ zp2}L>Prz?s6R6^Y3{=2bw7OJ;E?4qpso*@`GCvzAKS+pA-r9=2^IS2G(dUYuHs=cE zQ~JHb0j^KkMAd{Rk@sz$v?p#EI2AXN;nHrS+4G%Abo>Oe(cv=f&2vIsHxq2Q@{@d4 zwW5tep(JXX78U*FN(`iDP^&-p(d~B~<7wcCSC{-`HZ3X#lNT%4Y%OK_ATkylHyFT* zOl5o|dmMUndC)1!LTa55{grAX(4>7BryM(i zJ7=eI-k6@pq;dicPQ#!S6~eqSzl}eF3bD_;0Ok~lvmD)JxP3zq%J)9Q*N2y3r=ck3 z;(V;|kiwn1g3Lz?L1WJa5THfaN0xHXa*;*1b`DtVo5ai8eG&b1vQc{7AsAUT6%Huq zGHP>v^FD151>Wo#_(#}`PT3v~7q9PU)-*Tb-Q-xg4Nt=5FJeFOj4iY)+n>Rq(w?@FzHxl$r*(S*3 z>2jLhtlytgO)$uX*!f|kkEM;t6 ze`0y$cKm(Y2@Ln`LdSk@_J%xND=b`yBdx#k6(uoCxQO|UFbf})?0%o3@cd~ zS#R7?=2x-SyBS_~|Kcq>p+TMUdsty@4x0QKW1f%v0>ul*IGI03cq)r!NJY9K*gyUN zEBPBp;44ATpLl~87%oq*`>!V3>w<{k7CElgz$C6@$_JvL+D|8oNs@ToyJ)jej~M1n zqI#N@w97&kmqzcR@+u=(pYKi!ibc5rXWr7b4IQLo@B!3c7W^Y0qsfkI2dVgvgShN> z2+h%64>N@>($%KX?8F5*OyDL%)UB_h3K2H&!gLSy>hc@8w%gfR5U zRutEDfU?IEk>_@X=-LbcFHeYezFG-Q_6zx?U6xcR;UPRv+zesYt;yk$eonxi8Z3%^ z%Pa_>5cX**%$7~V^P+Y5q)iR)c-F!4SaI^IznncDPvCS`7FKv2#W#oYK;L*J&0nw| z7aY8Z_f9M(w_;xc*Be1^mKPlH)MVN%uG4Pb3!_66x- z@}WMatig$Sn4!o{4xCN3a*nZOmgU6oTpTui|BQ~+-sJ53F!EMM0^0tprsdx)=`s-+ zT69QzzjUY$uRveJVi>BAB{OCQ^8EH6#~mt$?5)^lOqP#g zI8cg>MyFtt&H&DuCWDoVsWAEcRCL|*oRv?V#}j#y#2eh7#Xjs>PCT1pu$c(c3x!Ki z<#!fmR_9~pNNECR`#(u~f381Ur?dl(wtIo!gcjuaoq~YB9AcF2ODBn}M6rumY|8g8 zo=!>;oRxgd?&M^E!f{*XD`$>?^X-T2d)JWZHoDYm{YNa>`v+PB$8d?(Q|yvZXGJXw zLD+*s|L4j5&ui1m_GRXudyPtG-U8*-0u>cn>VQ zG2TTTg}1$f*2}{%SpSACGHD0#QF&aZ$;VkI!=c3d9%9OKRLTYD@*VH zJBX)kQ*e6NYdCH66Q-`uWGOEW0_uyQzd{*mw#T8|TrZpvlEcXSsDkY$cfi%Dr&zzg zYY@IpfTn4)z!2B7o%~b|cfkhmP&)!H)EmYM&cHd$9FtfRjFYnV_i2VuGs>Q9}IvRt0?x&PeGQ?23W~Cr68v8n_WEp zis6rIub_+gF_Sx`U}0-M>s6k?#8tlFNX-_;+n=kMfh2Rx^^+iWkBae^=4T9Hm!rhc zBV18g#kTg`9U zFs!_?2tjfXezZ;|;j0^Y=Zp_w^oBziqx^9;ac)SO%m_-U&er_LP1*a5s)_4617#7;@r6w@amofbVbDz$9LDs$21PI zJ@#OJ*E-l~P(&sYjxe(%RXF9B$Bw#nz>II5&@$r@ZQj?1b=|x^=}CqrQptXNPzN$r z4`9G$F8bF$VT#;;tX8#2FzHJPnLa}S&D`2Cbiq_q$bU^FjNLI!`yhOY8UVB6R1DNl zA+Kjo$K!2LB<|BCY=5pq>m!5^Ti}YOt)YT|gSmK&Ns3qNDp1R(p4faPgIe_2GuQ85 
zLF?kM$iq+whRb}3L5~JanpDryFC?)oZN|`}a~CR=1JP@r4PND02bYKasP;%rtgpQk zRDP5bKjYM~kw&8IDS2#uG7WYgi^de&@4Q#|E69#=WZ=wsw3?zI8u!m)&C%QV|8!0C z{#ynv?-g*?a8s#6TAD^D=5=4wbgvjg42M9kYb zALhmg;X~sdn6bE*424^PDQF5rM@k^BbOnkUKB1Y>Kk(4YkhqO_W1PZE3@b3Ac4}hr#@Aal!d-@?Dh&~d?5<7?9|xCdmgZp z?`L^`k;UJ46k*Y#=dA913B-OgV`uuKAXv&6CL`65T<5==5nWG8w* z(1_c$^yW%;uzda>c#iS6+35_dd;cr3uoA&N-y~SSsF=+gtp;uXSUmR{&^>DrT|V^X56k#h$d7s&SV5{4UhXrzy#|(Y2Rn=FdK4!7PT(;b->o z=(E()H;c4%o=3w=>tLHh9u$ObhhfVaIQy$PmtX0R4?@G3szwfe-I-0Ie+wWr!!10+ zF+?^Ul7?$b0tJ=UtHHxXjV51H!l^Ag@hN{t91d-QjgqJ7R|OS2jZ9A_KOBU|=RK#n z-`~Q)?oz0HFCrJOzb4+wZLIZBI}9vYg1M1;)JHUxZhjC4By)s$cZ6|~c}fDS);Kf{ zT89nxn;>bTj1@PZz@4}JF{rQ%B+LiUJ(mAA8*@4u?}a%s(zMIaJJD+=AGDv+ z4Iy6|A^6H<@!FRu_@lM9IIpQneBsqHDiP2~E0weO-l_)+i-=+FeDBeS-)AsKB?4>b z~q~1T+~^|rFHg$|JEDeI`j&D-d9Mxg_}9ui=LcgQxe{O=?|yV zRBka1f z9qbc!k_9&}qq9i`WH)GvHd$PvQUjC0NxK6Qa#N_bl>+xB`!E!2z6-GtSxm`?f6|Kw z1tQOQ$QdkyryqVsVTT_45lv8AYB=U!1{y%3K{jKbSKb*@DE5*K5oEYOIe zczl~LHIKLql1DD#iI5c3ZL)xbcmq`V`x3|obYN7s6#evYF&;UZ3N}F+VpFlN_=@fo zw2Iw^vt^D$bwdxk^4ktUUiM~eamfIO>%Mp+R|WOtZehU6NXY7XfWb>F(f-cz$vZ4DhbAvr?aaPA0X+rw? zv^*9sZ6lBTX<9eVW_QH1!gs2ezPk*C?bnHcggrX9DT(Fq}kh14b_^Pi9f&Qu(pni*uzHWo7b#IAgLL?}vXQIrsUPgG9 ziG@DF=s!mwI5PpCj45HG_rGLq=4*KT?HCLPwqTMRKL`3*qfST;i*xG5SG#O*AkGfL zR*!+@vhQq#;a}LbR0WpryGl)nQ0(x>23Yy=9n5>1M0Y6-muH3>iS5&`!S;$oh*-#L z$nY@oqA~{P)V;V@IUC;#-QbO^Dw(EghhH^SDSv{t1M!ofIBOTv4|)PQzbV6=K?{)e znkNqV;Uj!8pJZzoIEoshZ`t2tid^p|0dATufv4<4nDNuoRBiVxIGnFWgkk3R!qteT zgsI_e4<)$zQjhYwkZZG>M)nkl1Qqa@0u&;!K<&C}<N{vj94o{gw{iact$A`<%VN8e-9Nb{GkB(vK^E1I*IF@F0kwEfh=~{ zZ1LYMQ*q{S55`ptgQh_Y22vlAr?p#j%~OS%yRL_%k_0-IdX=-wE@n?&m0;!qHF4oq zo_XxeVlH{_NW$0vc3%VAPq<$4R;GKk~>2>9%CQY`B+lug1>^-{q_61mJx)(B? zji^?UK5LVgBYWm1!UmT~^!InQD5IYWhW! 
z>p7p}LNceCK%hY{yoi{A>HaP7tA#&F5{<*lEkSV1vI>RTVOVI-H_ft}MHfSJ$vmt5 zOeTL7Hs4=KR-b#$95VC{{9<8N{|Dt;hkjdrYzfOC8?+;{}qpYjl(HP;b^&aJ)JVN6oWH@A!U0a zk!+fYfAtyCH`xPNwRSa13jd%Xt(M~H>3xFe%&56gG(yO^4sBjw&w_55%zdsiaTwonaurcS4vY=7qM}k4ej0)35{FB(5I(> zgne8^$7|bRiuE_B{9Os;mTv?fDH{;al*GO~6r53-&TY51pc%FC*cH(a2fc>KX|szE zT5*6YJ=n`GCDy?5U)=ELC}}KsDvRdP;dBi!aIzAjTXg~?q_&VWOB>EV`2{v8>%#7% z>L|KV53v*S^t)^w`kMte*ePIkTG1r&%`7};okyg;-2&Ug z#`KxfA{wgf%@te6(9Fb0Z2Rs?Qx)>bk5mzBd>-PBOA4Tt?tu0>gG}1l9QXd^0vYM+ zF~RgZyx+Y+bbh}aZA$2YnA%b5y;z2{Hu?xA@$bTg{2A2eM5Ca}Q%lresl$y**x^|C z1CqL>6N2R^K7QmV5;$%^U+c}xFj|{)-~5qw>Pk`jd2guZNSV!qu?jAb&LQcGPV;-! z7Q^S+%_Q)g2F+6sg-=iH*{rk5^uOU#sNr;_{B7|YYPUtgZuJU3)@iJdzw>O08h_TU z!F+CJEdj&(y}*&zy+aWnz*O`9c1fM*{4&nNX0ezq$7s?QwDy;Ch7%~s^xJkz1w`WWf(ehJ}~S7~xV4L&Z5s)!Ek z6E_ldGUNGFJEMS9?A^uh9r=GBprnEK3r|cFZ@69x-;AW_U*%*yp)gSlh`kc0dXJ1N_ z5YnU)qBJ3?M7;O6*89h^&RS>PyVf~p?frb7&#<-jopBWv5)%4fiU0pqQgBrhl!T!6 z$ZErqEnD&Qx~FWNp9<`OA|ozS1OeE@m?`x!f5}l)*jUESoS+5C_0`}|*Ut7FWr2Gl znsJtV&JOQ;j+oSl=j@E}heQ@8_dh{>?mf)WP6yfTIZ%Hg9L0lLQ9`j3gE|+16>k^t zKeTY%XA1$V_6Vzefc{sV1&`&T=w!)vaP;#LG}F+dDaFkYIpYQI^P%6=+@SzO3-{yI zjcR=U`aV+ib~0CfrVrQH*pvK(X*`Yd0AmmDVcmKcIxn$}T2A&N*Q@6;RV(aam(>Ln zDX@m&$KiBxu|56Ja)Z(X??~0yOz1!Oj*ZmN;^|Gn`j%+GFDS}9o5RAVIp+%OdGfc+MN0{|#6y4N2 zS(6t5sHgS^xc?eCc_x=Jsx2L6l_BSu$Q5cVR6_C|+ED*76EyG(CZlm-B=OH$@=2%Sp(`W1UTdN`C>r}q)ta6Q%iZcLb^hEzRO zo$B;EklOnHNciCzQkgqUrzV%u85$|vp0Y~RiCs-e@CLr!^AYlLx+Uq!Zy<)QD`*CM z=crYe(pB=esd>>!5@V)LpZO+}@?2r8@p3`EN5_DlwT9uH;Ik)k6H#H|G%T_51|@N6 zDy^PE>)x**o+*TG?i*yU8QdTpDdAwVGL;;9(nB}5bW^3(H@QpqJV#c;g36n1=R4n? 
z!QG$Y%a+W`B1dxXQ1PD+u<&I%6`T4T{e!H?+^RliX=g6^#_mTC+kn!kcfX>8Wf7A* zw-?>_X+p;Io1m<)gFXLeIbG`LO2-D0Nyxk}ICg3)+h{w1^q%#Cz3<})f36|^cSDi( zlr7~FT0!p~*5K|xv6(Mp?}$6Pj*;#kKCI?+0!xF-!K+~a0>69&PJ|ISPJP1Gy|98) zL$>tLgD{x9hb`S=ECl==A>{N?2l`599^CC81MSu=m~7^YlKXyRkN7qmOfQ7L?UC5c zroqkj-+0PzoEY4&^Dg3XRI`DS6aNVRe}P0&%|UTkflM%CIRT>3Gqdq;x~UN}wm zd{m%cf}ZeB<#sSt5e>LYI+dKZ_M@e&IDMp=iHfF`>>0fQgT6(TpkUaDmnK~zpT1w_ zg-%-uDNjGbIhFfRwBZQ~Rr)cK3!6~L`HH?C>|o<;LSf^`c68|Khn@|#*wp!gZCZK= z>f7IAN>V1Y$m-z1!EE$1xd$hdpJJ?`531QeW6b`ffQIU0=xAg)AIwY9Y|0r-C7KZN zH3CA?M?tf}3uR&oSd)yaSTNcI%5L?{%*_`uLq8Il#6LpL;y>7D`GpNhtb?e_3t*Q0 z2t1yikJrw8hNvIcnc5W$!t*ugs1$_y)D|+_$1xUDTLAM4qliQJI^vT|5 zM0>Q1Txh%seaD5lKc^bf%R2k$Nu&9+NPZI=qHcf*o4l}EH-+rbUqh#+cCzb6T(FMi zqL`T-y*XKk3jLQ$4)%$%l^SyVnQ3=mcjy>)9Xm`l54iCczh1=k_=_|;>p0QH&7vGR2BcE6DkFJ&F-~BGZrTayQcW47UAIK(ZooJ_h#h9Pi&CcQ7fweGzPU9^)$uiaOO-ql;gTT}>y z7PuKKqMaC6R|i60PjgzOH$v1jTQ=3Aj7fbt0ZvaNnCVHUm&Stor*}bkwWRkL|ytDYUb^yw`UYU z%RmH--HXA0Y&g3nKo6x1Wk_oNTQ<%p1kBzgGgfNq(9A`=yQ%=q_kRWVHGI}#ffg|{ zX#`=-E7V^!jyR{x<2rm*rNn<4U35K{XjY^$9^^3SeZ9$?EINugV``8*KNu|ceFYKh zM9yhGiXPCUWnbd~X9Tg|wWrYqHWJv#F=uLvUSh-fX86x-(_F^%*CF>a8npJy>lVGI(r(oolHmJG#TzT z6?1;yxn|BQDe^?nBQ9m=hD)kaY{cr?&ibkRT-E(SsmP`N5GI{IjgyNGSsa&$u`*d zqN~+*2vDg;<;N4CbGRKHXYa%1iwjVw=WfZc)N8PO;0MXF<4|n55Z$LGLO|V1=CBxo z#4m+>EIEH2#>zdQ>V_~l_$*+hzGPxT zlp;J&;i9T-E2~i11yR55K#NocOxd{$&wVz)iqt3Y&6dyfCBKFww-hk`(vC;#tUxH- zNZ(BPFn`~#7`{e(CwD~$pM3Nz;fLPY%O6x#=N6l2lR?dEz&&Qc)Sp&G6Jcj4Y~q2c zNd^ja%w@~x+hNW}XS$-tot`W@1tV_}B(_eaSCz%6g~L&@AbcV%pWH^IeN_2#T06;P z#ZN?8tCouHl*5SMlzvanB?$w%B?m7i(hq&2GG?j4{^3eZ)XADh;jXckqs0XRztWT6B;(41MMR*3C6gGW$0Cu~-75 z?YVei?|$63u>_*`-e-B@l`!_<0ZBht3o0-~cXJBqBB@w1TKSP@S!P7txhHW~cmf{E zUxZ2GQuNmrP5Qmt8y4B`MPKVteD5bm#|tOZXUc0K%G8F)u2{|pIa~mz<*R6ijXw-n z1VhT3Fi0Jq0V0`vOs$V)L;8=i9S^+msk002DVj*Ur)ZO9&leFx%__Rfv5LyPvn6%w zW$1&aO7yRn1=aaf$-b-6C31PsAs&`u!rTRff9n-??OVhoU3J5c?iw_|#)eJry9SM> z8{y>JFz{*K3!)#>F=&|2Dxa}rP16^m{)YkxjEx0H+f$Gup256#PGM^!$^Z<#2`+DE 
z^r3)`{CGx3vNn_DQuAoVuN8FpR#PnKdkSZJ4Y0WEHoh&|$-8XV#~2r=(Tjil@ZgmP zIG@=|7hMR2&QTXKeWNrWmcY|5PT=T0&R$D@iK79FLD@Hp*P4Pj=hizMf9QuYYMJcu zijDY#AIm;Ga)_*3E5=!gpJ0xg2xVzZPV6|a*eja>|Ai)G@k{8K2F zwVg}=-8u6pS&Eg*z?bM{pd!MtBH4~}R3=UED z@x^bUzG!yg6ulym22mF#5HV3pNVt_kuCM#ahTd6ElL8;p^6Sndq@{yassvM+uLEq{ z=9P@K6hV%J9nQRW1NZ1mBA==iaQgcn)b8SSIxv4Rqj58x<^-9u@)0kfqvj#mzwk0C zX>sT8wy!63hd;ubt}E;;kCoiT7Y_1`6GFHzh1L0i0UPk$E)`<1GnB4)YDyOyF2Wg) z*HW79it_D3(|so!Y2=w8!P+&VDQc?RVP}^3aZSjSk$2En zT(Y}%ln01FDgK-X&q!_{DiyR zZ_#dMJbLtUVCEhkv%$X$4fusveoqscoSIN9;x4GaeT{t<+tJMB9@L9}fkD%8sFAt> zHSKRP%IP}WtseuH=lIZS{vH36Bi8AbFgXSFjL@@+l6qvJ>iR$Ic_wkoU3)s!m5j)dfqi-mn&(r}skb3`KC<@Ck~!!_>NDCD;D@ zT4F7%$?rK|2-SQiesjD7lh9N^QZ?mJ?;ZkgS13N0&P8(}9b7aw8_a?$m}2`+jO~gr zTot#JfAe%bhzFMu)hn6sxzdrJt&>lF@193_WlvxRZzYVr$%2luTsm7zlJ4IYLG{Ok z_)@#NNJ^b2Zmju9OnwZ}Kdu`5iPA2hVV+OO#`E;ANjPym(M5llG{Lys9`YmL5q!Fm zMvi){;BPCbB2^;=ICu3Fl7BCrPJ5cmiXK`;*sglG-5AX_XT4<-?Ud=Kvez(AD3R8t zzoDHv^N>@wns#X*xw@T@r+;75sBCYXmk#K?qaCZvA2KrLOYoXa7Fhii%;-a>Ai24W zu4!3I@-GjQo(EO9JFcBn7v|A7jy5FazzTY1>VEjVDvs9mw_{@eMY6LhlpUGy9$cRv zU~}dU;j;azcv+s(4`GV@tqF^X_xXJ0e&tiL`j-$_Jy4O~5qAz^OQu0;)J;||U@_~s zF&w%loe|()GD;QH;cfNu@O2qIIz&iJvR@_Lu__xeMvMFK)Cdb0RU0uBCS_EavwPd4pJ` z7$mMvAn(Orlc*#G{uTNQJvL3^aqn5ML;hB0^$LIUhv#Qer&ok`Gl*c!MmX7Kt%=R#HfHTLLzM~K^vpdXcm z&tqp|)Vm}Mzi|YV&*w4?@0OxubqVbIHiUL6{SdP%9nRf6jA>X33T?q`!74@IeBTM4 z8N^#j;~X~>OXy`{C+;O%_dM<9qDJG zZ`WG5a8nk~MWuj=Ml3WL=%U>HI_7&)8XkOe7{v@&&`Z>2h0LEAgU5zNgvq0 za}YIbzoUcU8PHx93vN1PZ1_?mXtDl_L%qL2>0>Q=goLne;&-8bRw_Qebr)Q-H(^eL z91P$!2=VV>Bqd*C+OZgj-T#-JzrG5sq6c|Zzk?y;y(>t{rmIO57m2-u;RW#+Au|rOf^}=RAqv&@e1(nGi&4nGvNUGFmNLnWNos6?=hb>lNf~-l?1H&FP@h$ zX(i|`Y-j(@vcQg=-x$}EOVCB+0c*PE06I79#-_~~Osh5va<}IJ=T070%xgpI@*41% zYsh9?7Q9pOQpjL)P;X;9C}*6)kRo@Sq85lkLy-oZ!Idba;=%!qAxM2v#W-vc0cCzO z8xngThAMQCdq$TNK3fdOLLcI#mHpsRmst9t@+H_r+{2FcV9*VU!$G$Q@G&2PM`hnI zE%-d%-5G|G{}i#+^*3u|(*S=5qrfFqQt38n8CT1zn5<<8tZo0 zy#%_i5KSzvu>nD6V3z)U*tR_t@-i-?^3`47AX!808@HZXh 
z31#kJB;WX<(Le}_@3O_1_D_%ya+OJOI*vk#3)!kRE7(z3h5@^EpktFgNZ*N}6K^-u zk(^s#6CPvGlD7fc^z3Qj<|$m$=}xR%q#P*wX+nZn1IKmVd+;*0Ly=h@K*(eTTO1Mq zgQ2N3{G2~cKQ2Y*E{G}l{`M8I%F=}OS#v@EYY}vFqzpw*zJ+Rqi+uaESER;FksoM# zn5rf0=3ly6Ll3SMrTeTdQ?5o7d)wI_gwCtb!k1l`y+DDSI@iIb>`^AH!!26<^#@&* z7Y|}fuJO;*Hp2(S9{MuXkGp_!M{LxE^OO@KbxhvADXoI zIjUOxNdF-E<Rr#6&2UONQFZCV^^#zi)QQGlvuk(e3-NKFMCO`0LNTfVcw=8_$+c4 zzE$2}m1~Q6v-EmE#!eAx=YPO!hZB(7e~&pQ-2+WF1V60&0aAz4u=U#IxoKgav?yRh)rydlZNBH4M+Yo6u9jotV&;4ac_@L#=!n%h{NT{%Z@^v4hHVX6JkK zY@R|UNGDNUsW5un+X4peO~j7tbGTb|8~45*ZA>j za&ku~?JAic;m$d1o5u-sf^GrS;gPpbggqbBFb58R~v*Ha)jD9mdZn5}(0y zux#x)Q1r^hWja}`c1$MSoRZ6~UAB@$di%q{N#gXuQ3Gb-gm!pn;!Mvjj9|Led}#i- z8vjMq8_+fJhjVN;naxilQCZii&+}3IKEhRJon$0v)q@jTPf-la?g72;m_*wp|j0TlGs2?=zA#8vE`(&W~vK5 zPpw7ezE!`{dUe6(sqzN>HUj! zS(XfiryfI&$sgcsk7B3C4?~6gG*st}qpP(X$QZ<9YKIeUP42^1#ZlbW_5kNp#-isl zigM@TnQon*jI)d%GxVpFt@Zs1{QoLB4qsznBZ&c*J4Rr4xCJg1Jw%N%K4|(~VEVdc zVQo<_7MN`S?)jUXj@1L4Au$W^+7}2hJN+PrrlYIDWYl}QljXlY&71k(6G-^90anVV zqjKIKj?ANISbFpU9#_tQ|3acLX_63zdt`yfq?sH|1s%pz#vAw?-%{_Q4&3ncDRx}E z#+f&73^G1{1gDKZ@#u<|jAJz)KA(Gy|6LiyZ|{46C?;b`b0#(nHy8W@i&%RxlA66UO52{)K0;Nbh5 zNv@v^tG=Jbpn5ZOn;_83I1lve)c-NBY+i!o$V51t7m7K17BbJLFUPc)A-MFX6&y6) zu|co*Lc35rT%`Z7)bk_}^w?89BibU(w> zbzdNBIS*VM{z6n;KQwN5hE2!hu|}Zr&i^?CKO}!6Z|4Kh?Z1l)Pk+aN(~H1nR2dRB z$1_g#Z^25jn^(MkB{rEnV}>19V2@uZR>h=4)^Ts>IsXLtxAPgUz5^%HI~{Xc@)$>( zY{Yf1Kq8?Q1Gu%HFQ7aaqSJ73tDhXg%TBpW@3Y&cer)?-KhZ|F1j z!oGLK;M!5c=qh($$M{i5GruR$AwM&H=~vK{#6!-!3t+M<5~Aj2f~&$6wqx2~j?3V4 zNJwtS*#W62f4v4{?zRC(sF}^X6b@P2FQBACHZyi!Fu&#YLH+a@pzHV_Mr}%GPHy&r zeN%$aO=SuW72b!@SEdlv>k8NAbU*?)fW)0#G*t};{UKAlB(98qIL>h4s3ZoptbmaJ zb~0JwQP^*I1s~-a=jZ^v*t{}bKk|F zdo}p#@**rwyNBm(vt{1^nDv;afDqGg%Mh_e#OhwlFrKEE>d4gn;t(`HcBp1+*$mfD++&)O;ri zn?*jNLrez9{8|ob_0Qnwpw*kbN~KjV^m%g4~~8l)W1a zLED=^cd8Mzk5&RFd>lJ6xR`Rx4^HOgu;Op4K`3DsZ_cnebgK@tu79?}@k9A2vFzBHJMs$4LGYd5 zFyB4}((VpIK*K#a(NeQf9Vy&VfWR{I%zN1;OLKaOW#6bWQoDqyJKuZ*DsL%8H|^@E;8P~ zpFp_Y5-3*#sA!BwQTsR0SsBSb8h!{Gr!?TXodCP^ofzwh6UgF&Cy@RegLS&!F(qpf 
zsZe`?Tl=M0x05LtTxm$2c>D$9qPeWlwI+DsHNe{%GXV{KbC|&^@pv;j7tR=@;Djzi z6944}>o4F;dA}*x*B4^=IzF73e4vk%I;oI_7{CWEWM|hdA&tdm?*rV{lN%Z-dotV`!8N%`!;oXrv z;Bnw0TefEm4m+nY%O{2728S@Pe;o;n{_jkn{gsp%lne-EB*pO2f`f}mtwFgqZvNb+yXkif&qxYu$O zF0~PY$Bk8(WzhmY>M7W|D_MXY464K5k9vI!0VDy?GRtEWMQ_HIDhuY(E?nql6J z?|4Swd5Ecuu+|ljqSaM{2X}q3*UpFYsNgJ4+boLj4m1K}lx0GY@8gVqOj^ zZ4W9rL4&Y;i9E&{A4czWK6u=94n4~)#;1X+A@r9wyiJH0%M3{Fr&S=5o5@-ZF>O%bDwHn_$=EUM9rm z2XF2EJM5ibYP@F0g`l^73RA@(gC|(CaxW;l zgn)QP0|ZDEV1{lrW212#uix`x!xYYf`rrcmSEz)FO4;z<*cG2Ngu;>%KDJq16zF&I zq(AUJ{&-}Mf&3r%d-XAJi^^g|#FfD;=pf3<`$EQpUwEM71^Se}W;aRh1h0ci=>Elo zsn}@?wRZl<+6BQ5iw~fZEKa%nAb4=N^LXP?`Lb6 zfnF{ACp(CzulaC($$e(~Gujx`Kgi5GU5_`<2Cypzecbd*{1ayHz1S;Br6^9OM~U#u0&WsW-gg2EpM_KxfvG?87v zv^iR&EnLc1xp@llx1nkX=@%WiACZy{V%!9@Bf*q!rQ>^2&jWuocJhhW^!V*GF? z=fROAJlkc$p1NJjxXu%!7u5t9y5lfh@;Sj9mXn69&km#YCj-W4#2FTS^M;HC>oLpJ z1ja3^_5B~b$FCZ1IU6pwV5WmToM{%tV&CUD@?a`semfu6e>wz3`}ML1a{m5x?4l#jg@zY2_iXeC3LL)?)1SCr5E5U*NZXS;LNK zwK8v#-=qG-Xl$GO3VL?;L+ANa5NeSC&#?0l`k@LZXxztS_APw_wD z5JU;);?rFeEKauk5v;KXkW(AO*t%-U^~{_c}9Ra2;xM0?@~A0$o@75NAfE zpvB4<+;=e$iGCGG-uTB}!()QHtOM@;_Ym#~H1gbEb&RshQuszJa6oA`xvS-Y3;N4( zd1eUbq2V_s?Br~?W1tK^{Hgf9a2Gn|Zi7}%C|u$Xz}>lrm_r9dXac(k_KVD*Dpt3k zdeafutNjs+$A7bO8M~6FWhYY4cs|IC$l~CS`80l_B`z3i zVE-)l!fT5?(B4ju*lwSPu9Iy*W6l7+v*=|1ez4(fKNUs?{W?&nK^+p6ji`)K6SJo| z93roW!|sR^`19u}8a~9rr)+hej_q7#%c3B~wn_#(Cer;XI4$IJ&$9c@^G}Zt{nED(KPrBUbQ&EG5$?S>l(8A$asd zJZ!Lh4O=hQvaTXSaPaL%X5~UBe6=7GLOpN5$!*HiZg3MueAqAWuGGP!TMmUSgYo$1 z9%fT&3dcf36(=RF$JJ|>5UU&4@YN~-Up4Y%*Kd2x{yk>MO2!nz$KRgNqm|1HRNTeC z21~%NJCg137RHS34qW*>h3EM7JSdNZ;GFwL^itL@ob_g#pg$~w=|ip5EZ`aJSUkoU zM{dCD_r-|+Iz>D^WK520*CFvYr7dT(A&)&r>#VjhcykVsvn+wa@rSHc&mZ`fb&;Jj z+Y`Hd=F#f+rR^)fzU+gm3^EyW$ z?$l~<`{lu8ei_8WW!4zDbR#%QNWh{xB{p0^498U>aJl?L9Fe{Py;A-ldi^ZKp0vZg zd!x}LZ!R7j3Wej&t63MGEl!S}NF;wpVzBKO#(wTy)Y^ZKsoau}`z)7nJdd_;lyav6 z91X|WVnUErYXZxjw4>#W7tBHJJj_qN3b$LQaN=AHA!4inuA~Z)Cf88Rm@fmlZ}0K$ zHO{0DJtyIrpbjX0lgQhRKJe!I8eaGcUs(D2IJyXr2>9YxC|$h8fV0RC-lPs;iCG`} 
z^^Op#uE~egXGNf+6Tq1LSHMYKH--oHDZ{gE)6l4>6Q(N6!O8C;u<_IkTvtcoKCgjn*#zKC`;0%1yRt6YsSqW^!+ptmczQS=_b9!G#-s~ybe|AjN|gZl zlp~;6-2rbjpCU17<;;~+qvN?N@%|(s7|;05OfI^{+;KMpUmX>Y9P?uyc-+QkWg;+X ze=1mnu0m^dQR?xcm*eiG0J%1Q@kgLFygoUH4BfrVbkAyM>+5HsTqlCF?N|0v%Pj2L z77n+%_P`mZa-7lL&dff)1dpxRfLfi2V7q6G-CleP7vGM?pRYPl#AX{u>?{75FFKaTbmp9|R zTwKQpPx67aX3^Mw)C5Ks+=a?XaVQgY1i$SRB@brXQJGt-(NZ-7R~VXsX{@V2|MO($ z-nWDc`-H$W?G7x}h+z}dDu63In@Vg-MW-_!^u?FAs2+a-JjNn0PI3eJlkbL_XG1{v zkOOvKm_dCSB;cH)B}jR8V!^*0#(Cfsdvbu!X^KunzuB31I#wBd*U!Q&k0a2}-4Et< zd}Ruo24M1JC)gmR0p42fxV1MGGuH|-!k79{T);UDE-heFoF(x*XBD}wo{FW18}N0^ zRKea4hbf6)IG=JuKw?rfzLc0r7vlnac48NpJnCh;XWfDxx3hTUr6(vRM>6g*1*q~= z1;@5cW}PDEF@|q%uYGoBO+)7}rj0W^A=>)M|Z#bc!F5~!B z9{%enV{_G_nUv!`@aaPYnEETT>pul@6ta}4(^`F+YI=jNk@AJ(lQ&S4^h#J|8Y$Q- zg}8m58d$_XWalo-K#M!~@y*|f1`ai`__0oz46^&ELZdm%@yZ4RID-4<&m_$=49J!l zR>*9*iK3!Gcqm90-?5^g=C=p-+8U$N6BCZy_e*3(cs28UZW4Q-tB5_3zaIAJpXL4K zn}GlDK#9g^FbK)i@cM4L!NIOf@Mww05V>@=VZ9p!{NzzADHI11ElFs@9QvdUC zMGBH;$5^2YYam3hKN=U5;Cvk7{a#ju>&Lx0?=vK+dZsTN)tryN<{I+89K4S`rH z_1=bSeU78;%&~dQYs0n4?1JNx`jwt?`#%N#3fGb4@VTI64 zoOYO@evew=$b?_GQ6d)CT-L;0mS-ScZ4Ud?u8Wc2EP)vYZ`jL%9M=}>VVH*@uw?xM zxUQJX#H?A$M!Luo=jHx5w0e~FZF9jjJTI2BVmi~aGnmox)q$nXyWzQP8s1G*#p`*K zh-6qKD;c_(Iabt($@NR%1w0e*k2ZENtC4v$vKNfPXMjoL6DZ8S0SPv@al^%aCg1xg zzK{-K6sEidgKuKwlwj6->AQ+E;n!7kvG4{RlU#83w8bjtSIk@CW6-+(3Ct8djK^LJ zw5uTtG&UKAOE*Qq&N>`Ib;X&_yE$xARRQcD>|-^%YvJQUd-#_U!b=zw@S3O!lxlbk z6>~O$3g4S`{H+Tjif&kA^#(otjx%58*mGj8-^28FUx=)&$9z=*56!j!y-Yq#(m#wB z6$cod_}9#8k#~Z5XOyL@rsA!^AZBgmYnVOt8Ct8z!f4M#66B}MxcD`pDenSa)y~E{ z$_b3a=2?&+$o#Ia(}c*@NWAP>!K-GjL+5>8kk*xlR-z3vhU6(baT1x+bCoG4{pjVO zNNvi4;nR`(yq58Bxb0rgOpUTgiL5D%#Hv*G@4qG{e&I^=OOJtjZ=SMZ>Xftp zLlraQTN870{xAF_Hw}R!=vU8$Y1X9*o|Ww+Vs*R^^!h?Ep-YvVUZzBzy;w#Mn2FI9 z*9O4YNsAm7__IB+HyH^RfAY(tlBGS4JX>`>dwkaj>Ku<{V!}*e-s%x}+5HNZ?mC2x zR_~b80XISH*%sU!lZF9Hm(gC|EL?cen9h16!`eOy#U(l}B*;n;7V_c*=UL*JRePSI zrSWgyW1n}PhxLhSnADStkqa6zYT_aq z&y`?iEm34=sy5*0Av?P4);~_nhGzJFVkuo$6jeHXn-+1~oC^Qc%CYRCEZ#AC%W;{R 
zh+~!gaOCRE%CYRX02!f7w}Qpr_$%-02SSX8l_C`#nxv`?|PDA5lKHfy6+ z!DLctDntSlGuS)E7g1%SFg-2ku{WO&;`zF7m^`|WG5;bmx2$J9>d^YWJeu(OMwtaDe0%e%9Zf`jfc%*bt!}3H~*e zT&6ghgCD$ha7AY4(Wt~Ya(hcP6)kXW3-xy>|~RK9z{0dkZ0HyE+tw0a+5t!E3!=`ww0&PoM#wkA= zyANGJv0n%9>)%kcSIfunC%(+I_qOD}#iB?nxCZ$)XOIZaJzlaohwrqta5g6o|5(kY z%Re2 z8umlq&~#KWA7|69KVbG2FJY{dXW*+FI`qKS8=$e}4*R#U6fM>jGb+=TGD$5rV6l5S zTM!qAYo@G)`7y58vuzy)+^lD9+o$nz-iqVa_URz;!kM^r$int`TRJEH4QuzvjP&?- z!Vw8^s#&}RTGMZ{#cwl^&FW*Fqh36o!OSMc+(Qkd~ckZpRq6HZ9(N8{#R z_|NJlNO208>29|L9=aPgy*v!d){SHK(aWI81`;{P)tuGm&MC^f zIf3%dgyX)@vdA#3-xz|310VhmJTkdX>J8cHZ<#ZA4yT?)U@J+fZFp4PX z%|?ZZGsuC+1*GP37X57@hyHC(QAT+f6BiaxHSMXy!ncx|{;DH-hsxoWPYvak#*+{; zD_ZCl26DODl;?x`-|Mc@-iB7*ilfW;TNM209dl3M_BP>vEvGQ0MGhCmH-f&v4<0bw zj*;rhgqvRs52`2AkbTyO0XfXmq*4rhv>LrE2HC=<`|Pm6e0K3Gf0CunV>8RIqti=O zI;;OBF?!%Z^K&QizdP%&Mt?rwgJ?bCyKXYoIj;eC&Lk6ui56fW;X{>W%bCF0#^~caTdDb#JXrF?5gY=yk+XY_ zbM6bWTnE?g1GnW(5E(R`Q4dbT2=N+xEX820cMZHt)+W+Z*Q24+17={W5iULLhNjY5 zWXX+*v{s4BR!5w{H+*f9)?-fo`!bK{Eb{=fHh+8`J&fyQHqrQ+?WD#|35&QtaNl%4 zT67n|BUl%IeVxXpVfwjzRy&zurSb$4q_-cJ3-eIR--fqbcp0%c zF`N9)(^LNcTD~O~Mob@KY{@NX`tJqm)w7^f@C~X~IWY?VU1SG8d*a&oIc!a6I4ETc zk%aLX#LikBl44)L+MiSDZ=czMyo(s&C!D~Zj(xc0b~vVsze1IsWZ1nw2|mx_;OIp$ z?$aVQ8cvhYR{t*6dS!y2(kF1QyTWeM@`NuRUc!$0I5y6s5dRyxi-QXumbPj>7UYl? 
z!rs;LsGA#w?4u+AL;avpDINp1{jn zb_y~d#bfk{FOLWrVf(pn#BJqV5*3`lzVVcTZrcT%&%>9o^Y&FJR{6_}4SxaE=tG<} zcpKd6U$QnvZBEG_LLU$QE3!^7KnGfo(jE0u{sNHb1& z=7G0P<+6ztR|Fn#J!huW8~D{IN1mBVvWqe|;EK}6U>D*EgEmq4&~Q0E8Ysjm|0!X> z@Jse{oECIkn1dD1{;z^F|HpFc;&5arR7hngnuyd%c}nr@wQq#xRB{lnW|SyRN@PsR z+>s%K6e=W5c=mcookl675}8sNDiI~cd!N5x|FqX<4cGPM*;|b(SnlHsu6tD3&_5|4 z`TiYvdDp<}c@yDIwi@XF(cqp-OVf{2J5byq6sCIRvy3PS8X6M-Q(P`#T-*(w0bhV; z>~5nVNQL>C#WQCgEvDX@g?G;An4I%ZfULei<1rB>m~mtwNF;O-+w4UoB*q`K-QSXe z#vN$DJ35l`735jKC!wvsDcHAOVzyfwLH>0m^h!!|lDkECL2m@zlobz5bb#4DEQX@) ze*xp8usQez8(4Xeeaj9PRa{LZ*AHewimTW+7KgHO&qq7FOukkKx3 z^sM|cc5YE1Oqbg(tS?okr*oz7gj_gV@6t$&7UjUf)gie5T95GRmtxeeti$zx4dFM- z2QXopjCg)eH`6?I0>dl^m|C(1JbE9En37GVRRJh@*pje=Iq27>&7R)OVJ{!}Kw{Q2 zQAcMerpR_u?OZG9QoKga$e)Fr;ZBT6%0o2D$Ml$E(A1O)F6k9aa6cdNFD^$#6-)Fw zc?}lkhQet1J#h1TpD4DDgMi~|B%*FO?1%mTdVP=~fJ1XP83 zv7-{((X+A=_c&G%?T@ic_J6wEv5-2j(;6nlNj#HUSBZjLbu?Ud0bl7g5x1B7@bj2h zD9ImBa_wcQ%!I=zadI+dzurhZq#r^G-AqkVo55ggBk>GL63S1%#V#E+h2BRNXH+>_$|<79nVaAVm)Xaip7>}h7lH#~i=2m`Bh=!QkpxT)IOT=bz*fJjp?Gp3_zHq2Dn?5`3={QBT+?IpEw)Gm`I6FI!R!AlonbUgwxK=#hmEs zT-vu?4Rn?JFiGwaZgQ66=kkTP*LW2t=4)JFQ5^2vQ%xE!DvQIFHsSL!bsUba68%$J zN0X*TlddJTsFidduA2h37wJJz@NH)NY#;tibj7M^JP_eO6HlIK##xglQ>(vyVR_I4 zm|rNso)UM6VBy@y4~e2TuoM!#7IK1YYs_5#1J1uKq$ll{ij)5|(Z-H`{M@?{cSdJ& zpL|EtjqVBH|92XU`Qs`s6sMD%&m*AX%M%!VWq|D3C&jE>h4k0-D70Pnj5Hl{V?ocO zVEM~D;#Tz)91Yuvb7vkYk6w$*l=drDm*5kZR%j4b_IUMy91Z`kseCXC-?34fz4qx%r;!YZZ9?_858;- z*7z4x`Uiu@qH>aQ$&2@1WazG|lW}>~JRCVX6eNTx!n{chv_El_sijshd=b2553SOv zS9?1wEG~g_b(xU&X(Z%C#KSH5I{0-+108N#QW>WoU>G+5BbKDW^_TYCX0!dw+dzlS z9lU~qd{tOqbA!ziUcrVwO|j3%e0W~_l6O(^QLR~-hMhi#9qK7iuSvan1j-lj+{-^J7PXYgT2<}eNBNB{aJrm$R|s{K4pDq*{w-et((p=3?*?( zraBySR~LH}h7)Jc1bDI8ndzhx&TR5-nCR0A$y;Vq|1WBs_scMl?|BdRZN{>epj_d2 zzp0|DDvy|`ub!MLaD#ePV@Qh{g4uVU!Yel|+!nbC^^b;N#)3}KfQu0~$K(p<@z9$I zj7}22y~^}}Er09pJcl}OT5+cV2NPSY(da=M9Pcl~@CrRzsKwCQLj*py?{TbY1CHx9 z$NT-7xK|+Zy}m%R6erX&3A z!}1_ZoT>%yF2sW6`C53_`0|`yA{4B;0-XiFaTL!*#0LyQxU~$F-0i}b?~Nc$ 
zScc0QyC8e00OStl<6p%a@%W84bk5ckwfEP-cG(c#t62f~{T>|WGeyERr$O9$b?_N5 z$0PTUbbW6Gi&^<>{3f6y4diI{#Z9GnxRmc7_rvQVXZrNoHVtDOQ&A32#Jw5E=Yk zFTAgP6l*rhVCi3_u>I&VsPoE!iMpC_ICMOIkiN|%8yg|}*+-%t9m4lY=TS7(h5KpQlB7{-MunBdOe!|dz5Mxn=b0q7~u6}n4kFmUz4Zx#2z zTz4*hP0gjL_7ge0xPk7Jd4fkD-eE`1PK9?$Ei78l$$riYg`YqB>9j*zc#mlk(Qzn% ztLWbe0+t`8)tQ#O!8000n^c# zcf}v}ntkHyj~DZfvH|g&OpIH(MKs=_np|j9#-am-sGHR|MchQ6c`2tP?fg%qIWF zc=NaH)%fb(3St%5z&^^aW0a*qt$#B(B^_pt$3n;hwgrD%_KJF!`@@r2{p|M&8Qiv| zn14G6g=>SMSp6z-Uvz=@z*>l1S+t4Ljd)B}9z@roNStXQDZc+ihuWG-!6f;`Q0_UB zG}i{hpXFO=F~7^}A6)kw;YFu<}lqD4e|Jz1kjg^W?!1*=!{>k*kqcE?*nql z8-E}cPaKW47aL;Y>QIsuVF_Okw6nbbxlstR<>u$=(Rnkcpp<$LK6cv$ciu$f<8pau zNDc(u8Q*R+nqR|VBnU$c<5p&`0<#{tnF3oa7R;J(yh zZrv^+rkp$vhfbfs2gSSD1GmMP@9984EiHx$lFQxI5k z8Ef8g%hS49!}~w1FX{*BZu>wq0-uRW#`%(kZZcfzb8}4p?kv3cAOr<7XQBPbn^0!@ z1nTZ4g7bWR?(K#Idad&q3i~cn^=sB5r_m{_hR-#+z1<1S5>DY>a7V6uGIW^gftAWw zGVzQ7Rdd%UX*F*JeP=(~USq{^z6V&wWN$|96tNx2^3=04jCidG7wQn;5>*YLctAjo zy4QfYkpu32{8reKQi8pOXKCmm1A@@B+UFcfnAqC9E2$%xy7Nf#%F>Cd*y&@Xa+QcAGGUUV9WE&P}$aE_Y^# zcC`qhLwHN{`Ctk)U7t;R0!Lu8Q>vP!BReA6Ukb8vNtp0(HZ* zRDbm=j9PV<+jcAtyM;FFzn#N>y8z8Q`=QWLN$gqF3vWga!(FTMXdfPhs!LMA%4VB5 z|6?M%sV~imWKUz&ii@Z?RY5#1_zhfF(uI9GnS2g|CViCT3!gujGL?$S;3l1omV>j5 q55+~nV)J`EFYJf&MhC-&JJpy5IX_!{#|0zIMkVjhU??{2Zz4GuPJ^b1cs^5X(Fc{VVq%`p%)N|&~TG*V}asnjg{6kcfxLp%9h(C{S$ z)9#(M$kDE#66Ui z#3*hxC~SO)o?f*e&HWY1@WWsr7pGNs3{$IsbdaU3wDC= zZ41v$MGQF22k1B@oM(Dkh>iCYQNb(*ZiTdYhSrv8NGkxIg>HH{!9-HEG< zrwTJjJ3Z(zia$E%B0o``Pj?Q>;z4aM$jTLi@dLM*O@1ExS+RCzch#9h~$si_pRk^gxzUV882~L=S_C!{c$WZ+>YyZ zBJzK9@=8+6Vf4@-)~Fr?>uYgbozXekbWKrsCTSXfuJ&yHvGgJm_463b|7E~V+t;%G zknbQ_(N4$cl+qFEMeO0#2&$NAE==3wO$(NJK-_I4o*w5(>$3nD&Hs+|2YS(aODg6Z zJ_Y61zhc5S7c!l#XMPp_EL(CS%}>2gwVb<1rAr1(OGGyQnmbdu$yn)=L_YrA!eT9Z z(ZpL>m|Q2xuk4A2xJPC5k5&i!xH^;dN7&$<$jz*^`Y~-~6X^w|H7w)j1I&6qh%O5{ zII|~yxKwi(`2U{s?wFm!s8{Kt`y(;b=meY5dWX*4Yr|TmiNVtsp)hv3EA0slqj7He zENZ)w@V9*$U+{4}E6#1jy+wM$_x@sh>+-Ag&ZHiyA9;;^HF*ptwU)7Y=Jssyzqzzx 
ztqW__K4P>WXB)_eS`ohRGACoFi}3~zu=QUbH_?9+x*ohtTq;f35xcqUb@Wo2c$3do z?JUIBAILK+)TOsHhiKr^Cv0uEkUg|a;E&hQWyia=Fn<3Mp~uVH^!vrjv~PYo{U))D zrFt|#npX@PwJU^W+;F1t4jWm#^iL?s6~`)XM_l^gFbK1j5y4R*cQp4FDyl1Ck@a~P zxo|7H_&0*p1gp{0%9QQtUj^y%31F1&MVv-9)4B8!dp_qI+qm=!|L8dbmc9ERdq22b zm>ktcW$#}C6MHY}emaGv>$mVGI9y_iCXu+#)r8h%S#gKTm1sA&7dyjENDTs<@ToD|oUcgDY8%aV(7*(}f4!ePOowb)53+9WNrkmiBZyQRjK7 ztmN5OD*b#e^*0f{(+hQl7uuh(v9s)<>&FF7XCj4pK_%e*>>C7pdTQFL%9C8Xe z@Z>BDSQg-pwrnza@?o5A%>P8f?@h>fW` z=7p_i5l!;6RH(%rT|S!~;`ve{R?QyP_^>J7W$?D_4SPHK3B7iAEWfsKtZ*-p;hVHQ zU>A#<>F)o;`R+4Qp>TC6ik15kY4v6l+u?7x6t;3VZ7;)`0eiIj@D)C--hj;=Pq^lJ z2cWU-4c?B=gcdn{+~1pxzGlgARJ9soj4z<(rpLs5_%>*(J%sk>9Ph0~F`AD*i78YE zBELsMX!;=NJo84`=zPvBBM|ckn?cp3kxX897BdX5LbKF6$k7WcF*WZ|g8vMf-rQsvI0S?*(#ck?5fG z4CH_Ma0&Ze_)n#82&)|ngi9v(@TFloKV!o%ejP5M#&I`=&Dla?aQz|5bnsA7NfLyQ ze0gKt!yAGik&{>$a>6WsgcGVj6U0gih({f%RiBw3!Tq&OC;#( zs%fw}f39$hp`Ebdt1UmTJDSs&AgZySrqILgnvA~K#WI83gl=30ogj?IA&+j<%hM!W ztT+FBQ?_7{(j>ZAza5wEbVNsNWicB9*dA#u7`sNBT^s31$A9|)j?1ILY#DOaYi^*E z!&TU}C>j)528ihv7!_{14CjI;3LT7)T1+2+%iH$DrhsqEW&KL3pl(F_GrwTkaZh2| znICkudJoDa8M4(`yYS>%Z~k)mNU&EXblb*ocJs3j8+fkCnmx5~W5;$ny}lRb1*QU0H&n_2>$H4)&ORDmC3Mha*6 zF}mERg+vslz=n;U>{$H)`ge;hJ=Z*1m|xTgKaO;>KboOjzs*?wC<$5NNY`}!la?Hi zPkTwmt~kpsEnUQpYkkM850UszJAqE{ae?6php4pNOmLKH;?6w20d9gM=#Afu-3vF8 z+;`d7y*L?1PuvEdmW&t5Z0M(UXAe`ysvfSgx|)fxVmK+WpC+kKXZMD8veLTSp#D1( ztaTpY5_x~35p0QbD^}B#yqDZflTa{!ok*-T7r+ZXV$$M#wAlL{+?ENsZ+fOQCF~L8 z2rjcZ%Kmh&yCVOA6k*oW#MqBUd8$*ELOkg~So%GJ94mCioFPp}oD~FCPTxTSJMh5X zGNkWTu$vD9pxyQdZ&#catDh|=IC$L-&OHmoSK%Xs1K}Pt`qMBw+$PI^?i;}mc-YR? 
z`o^#gE3dHi(=$ny!6<6}Hij0|gcETYPtG-DKFFBuB1*^B(O>*PHY>@G*?c<$JlEHP z#Hb;zRVA0(c`OUMcFct1)|Y9?FcP!-MZ&2MmNVC26?$>X8Tz&J0)Ga503*7(>E@UB znby7p8a2(0`RgkPU9(Ta+kP4TgsMki7uCyLcgqO9I_^-1$!%==GD&PVtfIQB!}u5d z7Gjus2x{+6fLWjGz&$Muo{#^E4!eYKVsQp0j#~h36Cgc9)3;gQp$-$8b(g&Ks z)^9GJQQHA^qs^eg(v~}5KZsx3o}>JS`}nayh()JEV5r0sDz1ow-G$kl%*RZOi<$zp zxA~~Hp_NlC{{T_H5}-vU1IBOPiSC_7Se8->-5Z4DOX5?AcexFwAKTEi&KktRO$^Um zvSzoMs)R4|KJXtu_)VSTa)rH0-oi0G0d%`fJMGm82L547(s)7*&BS*=L9+nV%ra1{ zeHvFXYa@;s5`3!wT==x?>r92WwoK>O#17HW z*Al{{_E0L`)6XtT-=`0G(VYL@y$nNtW0TGus(oz^f7bV(=sxrl-uK7C=hYwCQu};% zbNw^;DpX{-*Th-x(Mxo`syOpp8ba+xoMx|$joFj^=@5S1h-TeYW=aR-@N&c)Fnuu* zrpn2}mt)H4m@&k8x(ygfN2Q_ltagEG2*Kt<%h68uAbha^uz67hMJhL7*m5omw&miP zJ$unaA>PahZMY=i~zRib#m>6)_a2#@^GRUs}k9gsbJ3PE2#oxDa z1hYSR7rQLwg>Qz+c^USKEasIyvk}Zkx7!V{eijesOv**0$X6sy^%+Td*NR5D6&N_E z#w*<+#;y$<6wC~M4zE{VL{)AyFHCJLbvXZr3r?@cK|c*p^~n;n-bS2y{Wbo%?~AgU zncUJ(6IkVEdki}w#{Vugjs1Nc!E`Q(3q7wr|^`|f4 z>#<&#Tw_F4THkVRM@OOg!N0&)8|H<&3Ix)ND@64%15Je2z{T<(1lg#vA(I=t5r@XI zx%<_q|GgMI`oS5qK5k(=%>uBT;Dt-4`rywOSE*w0TDPBDsQ zfnG5<(M5rq6*`lIOl(Ibx0!g|T!r$wl6kQo)(KM$o^kV|=Mq{iCrr=y00DaYVBQ4* zop?N!I&Jh7o;h?M9leL?WZOQnK1ZK_SRg4h*U;lvX%Dg|MgClo(D|M@sV^?**W;KU72p3 zY9{=1A(lOPIGednp3QGPVnM|pPJ&yNH>e3O7*-ow3zcR?;{mZYJVg}wE}HsuVoV2) zA9xL43ca|5J$2Zt_69Sy9S09VH%wGa#7%u)up;>&w^+>p1GPNyl;9V7KPdyv=9Q3< z@e`7|UgE~dvFQ1k2a|UT$m)w9&`4N-CCNI_ysa5WMkc|6mrwDFr9GNECPSmtN9bMg z2dZSQKvmmIjM^5$bs0v3mAepHEqZXc1hLMbh~(rq60yf+MUBWo(ce6rdj17`@s&dd zuV>J%{h6mY^CdLD?jVNa|6s?VO1M+s3>7D?Ld6*?uvxtvcTD^YHIt@*!|L~-;VvdT z-oeqoeJa#o?_6QU`Y`wy)z2n&PholNB~8(h$EC>#f}NMKc1$i>i0R{;Y1v>NR7UP? 
zdQWuk8?c^7l*z;_#zX&p!HXr4)ONd$(81;&J*=3>^zNU8+#(XYLFGFYAF~LbVS9_qbEHsr;T~@4#}<@4HLiH#fkZZEyxX$#s&B|e zi-oMrvK2;%?q%KDvUhFBt+i5M3`@I13nk1;&_e(Ut_8*1} zgwPyi8KLs>3~pno4TLMd0?UsVS=jvnBZoWjf_(-$EMY_jm>z6^N>R^a7wSuE);?u2 zFN}pYSvl}b!-=Y&h^48u1Jrif6rn%+g`R823i!#ET;D}&*!k=QezLCxLj!5F3mPQy zqCUoTkv3J{{sEq7e}f9$TpDKlj|GX&-`K+{Y!3O2iTY#V%Rw99Kba)>GGZE3+7lL6_7S_g0#;Div7=GmtsGiOx&tA_%>B=H->h8mhGrvOg;&gD2IEZOj42rLU zxctRafY-AfyfXSxFr^ut4EEustXQnrs*J13KHwP>X*jZS238J#LDe_2$QJ86;I<(V zxyTk*%sDA)n-k|Nlq3Y#M4Br&SU1~~!l)_dbJQ+hOmgr$z zb}h&%#KWh4T~wR00Ae2XpiXH4IJWtL>iwU*&hken-SCIp9Q_2uY+8%*=c?ebq9k10 z-i6XZWze2}3cl=I4re0d&^_ulm}$pAvylPHr`D04_%z(#bPz`xaj^839w%l|YIFxh zJ)>bBxJq9D`|Z7`y`cx~j8B5z{1|Z2f5e5)Gl3SHzu5QrH_UifgPx(GoQqTvG%Bay z!|O@loV^xvp2@>E42IB)pNO>dQ%pM?4KaKFaSd5hY12uRRH?EFC&E4N&O{E5kT`Vt7 z+^PP0~4a$}U} z{{Z^hqTbVP7|qoC&?mJWyT{G~X_x|C-812?Rs_l}k4A_6-J1s)Tn;W>(*~*;$1$|f4aaK+pjh8kqmH0*6r1VD1MNOY zDXk!O>m@){_<{?KNrk>L1LU7H;DxJ<#G%Xg(PPn9@T|L4{I*Q%^;o;GT_Pn7KbVP!@=>T1eD5=AKwodJJUh!Yz2Crci@$N zZbC6rQ^@yTj+w73(S3p(zWJ7betECRmC9rI)-fNE(V3TF zV}}6Ep8f#gT33Ns=2jxz0dLcd zD53lo#LO0Q_d@+(-ifPBMb44c2Nq#LL_F-z*iBD*ZYiF9Km)QBCF$VxD*oeqd&*lq zMd;$7g7GuOh2M9iVgIv{!W7219bXnQSzZ*I@?4dtX?%t25Ugk8cU;2L8AGt~`DKvL zKhL^;gYeqsS1em&9N)3+D?892=@$UQQSU_FMd*AxMiRY zk9bWM%Kq2JRMY3uXUF5%=7Gsf*mIH2)7(qc-+98KNpm3YZ3r$)d5KrsXOJ6tD}){v zdBQzC2YA~RL!hvh&u?^e=7*MsQFr6}3~%ixLH?5bc{`HX+)Q!d)}ViEOO6FK=TG5I zEDEOZrx7dFq=XmbO3^l57k{f+u-5++XzS_IP!W8QEcWh1p(zjU-3rECmN94}Z-JhP z5yh=;Mo?2yh-IqjDA1LJdq1?`+sj~x{(KyT`~$pi>x^)*Z5l|0yb|@=La17`6w_Lt zVpPr|5Qv|J?s0}tA<9dF(<8wuz!=KPg27}agSLmxSoi1}SLZT=+c-xEQ#bz=v9^31 za%_d^Yx|&6A_=<7uW+h0_XNsIKY{GVDNr-(Ee0QqL;lxf;y&gRG}}^qyXXhV98AO3 z?x~=vxe&WXe<2b&zp&(q101tTg1U->+`EMpu;uq@?u+(fye_XzhnpUdH?s%%yM@Ny@uk&< zwn8!gUY<*I6Ij3S=KAVP@t&w=`KfxMG&4crnL5sNUnB1OZ!YD%>cZaeGdMScxHna#bhz%&(9x2N%vJ4>*bHDa@(9t4YYhv1_adn61e z>IPMq_-G^zsL0{n9Ptjrzv{A#O^P)1LJIruz#F`3wwCU>#zeI_0P*ljC{h-{3&A=j zzvCDr$1_&@%t(04`X&|Y5~pu}zojEuztSR&_e|`l4re_v4jmHuIP(qv$ly9g#M(bD 
zyKUA2-dl6fYh^2b-1rh>2b-~WxiQ!s+99G|x@_O-3cQ$ag2x(8vlsI6ATSl9#>4ga z=+yv3eA>azhD@NJUK+Fd5y?V@137$+Tjl(@uE9c?#!&vsHZP&nu~z0+JB}X{UBBHy=!p&4JLnOKUiN0hGS=j- zMMGch=Z>BK#W~JTgo5J_A;)YOc=l1;#Mpi)Q&7SMf1RVk z?*2|BU61iZ-ABmM5`d%KUx=#v3eQ(R#^%EcST&{_XZ`GhAJV^3u>CF=d`&|26Fum6 zVh-32szThlShB70C0I}C65Lz02%F6wlYWPV_{p~zE27gO>&SWdbh;FUH}VMI(2jTY zd^+Z|+#wFO*@!C}U{qWU`h7UW8DtJ{{zo$L+w2w$pK};I6Fzd24~TN8L^gW%+48I( ztitxZ-|)rM8^6512hQzP#6Yzj+yA&innkjRLw1rc>HfHa#zM~YGhnv!Dnw1o1ZPEm zu3hOb&#|`_;u70X#V-XFLaH!2=@szAUT}9j!XeB43`#3zlcCch{g(R$8YfNy1Bd@G zYHd0>w(bHrjlYC0Gsk0JK`IP3tbnM`&JaAc9pb|22zQmQ!2+W}`> zrSVeBLJ0kDJIRuY!mq~uSTZ0FZod-9uy`_d-S3CnlQLmj=wclDp^k?NzM}bwIx@#B z9Or%OgnZTWoSWhHzn>8qW91IgPqHrjE3Et7L#lGKB;9xD%eS3S+ZcQ{a4=Qr!NKGjaz_5bi3SuU3)J_l<=#f7&a&bl(o6+DCA8`KGWU z+z?l64uXek9zdi)3}nA}h4CBO;f{+5-cEl3;vx^s|NJnEDxc%%{C`l@tjaET`3X|4 zhu|*XE9etH3(cHG98~m~$)Z>#3ax=nxHU5ycN)onx@#Y=!FvW9TB?kzYZXcO-DIem zzLk4oXol9KXTphaF`TDA9{n=rfa32`G(KdB+cdAjl0{YMRk?$+ZGKHQJ@bd-6_RxH z0}t+cr5V)j8A}HW(m{~f1*0o3g3-pk@Oobw*RHRLUn7=bz1?~=au|TSQNcKE(N|d7 zBaIOY&%)A8wOH(?MAPb*i?nzJzE2&(NRKl>PmF{P@l;GvTZpF76wkSzhlyQYc&>4w zV0w5NxBG8ARJ`1SOO{kY@!7{*SCj&__b$PNO`mbGyB11G#p7waGPpKPAG#uNz9;=l?kJVf-p6dt`p2XtIMbh+cPARwddkHMpo(a7D>q)<)f#8GDZ%B#u zLkk#!%8%`+o*_f)elKDPKcu)pnJ(NDIFc6o&ZN&r#_%ev=aJV{uZz!0N`l*>AB6hF zqIZJ^y#lwf-E$MQ4jzRbVx!ppvjw0REWop$r_;riE8%xZ7#^1kzOz@HUTJf@9Z> zz_$kjV8V`{B!_3nR=Uej-DkJpKfO5AiylR#-RG0bH4TEaQG3W#dljaq@*G#r4Z%m| zdEhUchw5$H4gYIj&ip_7LyX-RrsAhV54`Q=;x)Um^SKKASb3Nni(bOT_us}F|IMc9 zIV0JJ`uCWocAfj;c8jZW5i#QvK^V0=nzvEU6mH$lL9MAr@!E(9?C{ShY}zRUxr%k* zu&k!bz?{U}P#-;$8#D4JCXKxebZQMgc=`gi{dL2_WDB-b z#3d_!{wb;sT>v2-H*l@{WAsrMr;+nQFyr74m@d7Q=L=0fmFlVA(- zbU|!*fb)7$g_~UW!ei~TP_p42OfM606{)A(kIGfJ?57THDpiJ?oBFuLWGgNSECkC# zf#^Eu2`~2Fz^emkc-N^E48m?=Ug9Q9Rh`UYR~!a;&u*f}O$Vu;#@N{50kPKxact%r z^ijCQc|Y_6tBL|VOY=ykdo{VB^qy00PK2P#ui=43FM2-Q4aoO}QPx;Kwz8TFo`PRgr+VGI-gf0@vCYa;^*7I3amU6r-fT*z+zL*fnxWb{pWmkt;l0 zaSqQMR-i>&;~}$a9w*;F4X+1HMfLN3_{1^|)B;Cv4j065$)8`STrMJ(lS47-br={* 
zdVuw>Lb%{9gY#oK{PD~X7srR7)|_xM^L7APS8<=qt+&Cy*>>31dJHDcmcjFRedwgE z398*M$nxeeoH9e6{oPWBN+-tA2Oo>z+?z0vS+9we)FOQHu@qFTFfE7CnDhF=QHpqz7TKCF$S;2 zPSAUF1{9j4aPnj}Ih(Mai|-SIJ3Xbit?ve>xOW_A_6MLd89};5xp!-(0d~!v4u3W% zF=9NO_*iCQu*yQFcR}>Xw~2g^pRBgtu8Z% zNW%ySik9GvPI`e>u@BdEWIen8^fh_(AYy0KnS}i|DE_^i zH?Cs`La`hYGIW z)(JPO1G#r=TS&F`Ypj;OhW_{Rp~UekBvuqdSi&LF6|?|U`(yFF-(7)qX%MOEJP6a~ zbV29JY?8X`74q_xi%0nGgtbGTQR9~*nkq-*D&~W=L+43i+16s0ZL?v;m{4xZj+dmm z#E-nQx`HOnW8kI7M>wdk9vlmLA=uv+CMe}_J||nbHyLxdz(EcU{s`r+NdyuDb;{`; z@P!usZv5jePY>=DfbR!MwsoZ#tD7T@t&6XrU702m%PAw{^ySErx@7X|TQMr_t0s!_ z%0}{=%du~F6iA6!@GG+pf!n9|r1VxX=1eHT+Fxg(;;jj}61oo>A1Hv>Z9eQ>Z46y| zN3aCHBbcRP%N?oTiN@zeezd)x;ad1VZf)FOlGZEIOBMcLaoiJ%-^UZHC`WiMWrgd~ zyrDd31Qt1u#rs!Ps3G^oXw#Zz-h_85WR3Dqq`F>sJZ3y~ueHWkCJQmC_b!Zn#-YiU zL~=aOfz7kN4x9IXAWnMk@Qf-SKmHv;3HxA3dj6F3EU*)FRnKM$o)1Jbzj#6v$)7_B_^UaAc}#o(ovLNTNK2Y+N)eKMN#p36 z98umrsS0kjZ=u+0J|u3VxOJ{B#;(4K?xvKR?HY`qRdldrs0q(j{0~*l$DhmUk?fjG zUCaHjet9G2D0+~=v>JFcpaHJ8N3rPBd2sHU6Qn&aUGq<5zKCPO=Gh>%xMZT zw(DRw_cT$0O6Prte3*#u%>JTU`+ameT903XqQU&B7!@Dx%GlA4EEckQ?#`=^{Ne>pZ?D}d6DP|o?0AC7Mb z!v9nf;QEbg+?J2c*d)cnF$z7L_hNqtbl@=FG?I}Cg{b{ClbbLkL)WXn0xSCgoDel1 zvVE&a!eu8iEK>kJ;Unk~y*dOjDQr2F4Mkfegk_l{@mGQzP0LKj;aNhiuh#?8UmKdz{R`Js5MRi9ES}l-pk}j$-SM;l$YyxNEB)G#+<=iQ68N)-RjU zlHNoedsXa8X(BeKor;&8Q$UOLigYh;H8wnv0pr<~C~ZAVcCODu-~JKkzPW`Lc*+2r zqtcP5)J;qyb)d`bH8xIqK!Oe0amNZZ$Sb6rj=~zS2!8|NkG{c1Emt<)Ns0%|kyMH1 zRQFrrjCMWUO79QMz*U((u-4i{r1^b-eVc|R;sc#Yf>xoY*X4eY<9mM%i4hefL!mYL9^{ zHxIBaEWXMyqV0Ao$;SZUBg3A+dEL)9@J)ZK~IVWa+zGMv;gkB z*$BDo-rB9wIP<`I__-hj178$j-#Q_Iw_#v2 zY7v}P34uPf*<5YvR`xf>8n0{DfUWyXe6dRp##PS4&DjFp*Nb&zbb|n=&73Yu56Xi3 zV;({Jzh7K_suXirHXnQ~bD?njb>6WR732x;Iq95hPg1kQY0-=Ta^?Lwyqk2EYvWJk ze)}|$r=E`FziuzsAg{`O+V2Yu%T&0iF^y;$pwE7*pMqnRRanrTi>CumLfRBn9FuV# z*4@0%&3yxyo{);)zWl^TJDc#QoI2eyVKO`Q+yX|8P7z!Y<*M@NE3k6TL)bqj5~r+> zK-b?#xxTYf0-^3aI3p*3z2kbgM#(5Jxf?>2PQK<$uY{sSyc@ACT?Lt&Bysc|GZ4c> zaT91 zan=Zdxyx8;UXVop)Ty9F{%j@~p3hJ|g8T4SAL-)aJl#F5kh09Hh>8XNLqGT6hm6!xDJm{?piQ^-ze?u7?n-B2H~X 
z7?g&6B=ni$3I(hc znBr&DiQC3)ohXadVtKG`k2M$dAfL21%m$ec%4FfJK&JM-5t`Fqg3YfhKs+Cjb4N~+ zy?V;@)P^*S`Y{g+4WHwt&9AxmRT~5i(_P4fKz;Zrn!D7_LeKnH#CUG_-sU_l`mEY*aYBb10%!gJ6Mp9Im@3sCm&7UHyNFQzw2Ks~<{ zmPe1JGmc7wkD48CR97Ij+1tXQ6}d3$mj)W1n@+-3?t_jwZJ=}Z5SDKZfK@wnjH}SeikWzbOX21yBHt&7HkaH!KSM7 zoW}zR7_H!fzQKrn?`{;IQYtjMawixqReiwNWC%wTox}nEKV*YCfTZSH^!f3>ym1{$ zJUTJ5{1k^veDHIAl;B8q0x17&#ewA`py9C`weKkrlsY%#?JJv*GtS1#!?NV~Zat{T zNy9QzJ@_0Mg{@m%x%6KpPA$HBfda~OF@j=m|o4^8VXVO{2Q7$e~cR`+VK zQeqw(C*l|ymZ@l&I0iM_8bG086eGpC^gx(CPhYAVe+(lo-d&BqhoaawNpCFDl0?%P zS~$-v3cR+tbEm4m@c zJmw^8hxv>8$m}bRQAdFfb|ds?f>tZ&BERI8Rxc<2PR(VG4>$NUaUhUP0gWcZ9Yncc%h?W3@>_c-6UAOp#&b;#ju{p_mI4N4aW7E^TX3uvTo^5 zG&k}Yu3@>_q<{H?iDbgbP#mQ6kzf4 z*Q8-U%$~ zROmjdT&&LX!QS651s^<0xh;WwbURQFHoHQ3)kjwl!`cjTuTz|5sd*D!rKeoBtpeNc zpM&QcuR?$FQu;EnkmK3z$Fd;NJVdnjxc$Hz60%Si6-|Q3txGa&iGd`(EY2ZWsD^L* zBd{(h7RwWMp!8*gY?VsR5``esx&?n!XAz$5Y&K%5G-@0P#|)ErGOv3*RA42^ELuu} zEE-6*>LlDjXsCLHy^1l+Ty!g0?qP_XIbiKSWL@_G4iZuF2~j_FY} zYdMC4o|{1Hs~0+DKPA6&7t*Om%As3)3!9=8j*V&y>D8rANP4k6v%j|IE^OP|feFE;G$2s|77mMgSgX4rrG1rbRF2_3T1DZh zf*O=RcwTT?W|Jsf zO%u$1`UY<7W;kMb1~11q58k+{v4NB8AWD8S_1?RmY4z)}8FH^M@N)%u|1W}=c$8z| z-_NkvAq6ex&SOWiyEs>cn;7zYi2R+s6*M?s{O_?UIb1i2n(SN*8N-THD}4h^J^BEi zcdGM9(Kx2F-Ix|f?thJ(Uvga_a0Q`N=VA_cKCNTfsU~C0Vzp;rmg1yM#`P+&z#Hf zU&1Ys*IJVKbnDSDr4d5E=!xtA!Q$xXFffrhMqAUzg7*1~_@uBLR_Rs3jYn#tv;7_2 zrK<48w^+joa1$bJ7qS0ztXK^HJdL*sfHLz5P~&@&Jy_We?m-5qRt20ytuuM9Z~|#d zHyXO_EKbi4p>i}F#UE?3$-n&Q@&{6MUg&dV5`_8bjb+c?P6qFIj?-w@;|rIku?c!* z>~XIVrb>@z!FrFOWUm97eOpE2;!7av=oH+SVF_u?58>0dBiwS)-baCyISvJhX7)$U zLZQP2woP&m$UFZ7`&v^#330yI;aS||!JUGDf;i4esLs|O(u29`N73tv6NJJPHn`TC z)86d`{ihUAew@gAkFy0&SFGVKY#DdWH0PKB*E&!yz$<%$AVTzYiQan1w+Z> znQHBP{ClnnKI9p|nCLj}RLXq1u1tk)`D4N^_9()aL!!N%`gwR}%|5Un^Mb3genoEW zwIvRd?75%K#n8KF4lQ0jhD|a4fFDQaK*IB-IHW$6N%H2?`Fq>QU8xKBznR?sW^K0n zFGAhQ1x)w)02)@RK*p+@B+O2ntyKTWGm)>w4LUYhxvY+RVbzI;M7!nPc5mUP$U|^A zaS|%gAXxq4r|A4W#EJ9%!1hm*@N%st>V+ABKuMfF%A8OCH`7!7|40jNf)~kmaMBkDz#3 
zH*C@=;37+c;DCiUkf$wVjbb5QeC>(%_a+jx@wLEl!Qj&A%hO4Hz|GS6MLdtnf@H=X zZqoZuUh}pCFr502v`kKdb(zANclHo|kn;&d9hDqqN(vE?6kxL6h*RBXWSqCNk+6Yk;xwKrHSuZwRxo}!0i9XE9- zz-Zjz^YEeiK3FFac-JyaZIa7z$HvQOu3F6%oSjPbV{7oCj6dp`D&ST1GbrOY1I3Rt z!sOD?RA2LJ@wlb?@R@W4bcA1rc2h%~cg6tsh&@AvMYEBfI?7cy7{IOWT3p%`1Q7=R zSHYS1)ttR?Tt!7nLeXY4YD$qbin`}H-$;ps3S}o!DrLwBm77Z`?I}soB5Bd2Rrj9f z8?wwOAzB!vkVGOyl<~X2zu~;j=UG0V_fw19vx6Y77{Y!QCxF|Bhmat80M-%O04?(* zue|lSf`SnU@A-(!GHNA0L9bcB^s%)6=N}+6t;LFSjp*1=#uPtQ;sT)?OZ0goNj2Wi zJUr9TxwMGvKVc0nCL(dXk2DLCGz-*HLQZ^vObv3J@B;qEy)OtyB ze?S>%lOJ1K-vQz6JWQN7f&1!<7|{I>nERXsm7DvSPwH)$(s>>DLIv!46v{kLR!au^ zk|bf?a!_5^%Z_%wM8hhIS2St_<>9fYp?4g#?K&Y!GY)5kWt08BdU)~E7^}(TJO+=| zrZsvou%Vd4ijtjV;^xyhQU5#UeHcgM#|<+5uok*wsD=f4bb!pAFA^`&d1zUrP9&T< z%}RO7a-LhV6>HYQw$?l_R*c3iW}fWigmV0?U<5C`(&97Ead661j~*;|EPLlJO5rU+C6yV1Sz7FK$X@FcBN(RrQX`IKyVdxlRCvYwQOa@XzNU_{XAoWE8t(ERHDO!Ve{uqt8;2 z$_b0K(&3O`vP5a$2{0L_1V(utoax92=O3g=-Ix#aaf_#q59^`Wbpx#5V+QB8#qd^V z^T6<#3q&uYRByutR8|Z`*WoPssBtTod1DRdv3eHlitvFmuhMXhmJT>1Y2YkpX^!9V zENM;?ftke|OflOc$naF;k9$iQHQmL0?e1OYn-n_KI_xoHDBi-Y08MRyt` z&V!}_CD?deo=eKtz)*(^!YGG%Fc=hHicu!Bsg}a{Ddk^4+;MRBmHFHQTm< zZV!{=g$nT?`|DY1v__rp7bKzi5eqJ5_CA(-XAD3LR^>d84Y*t1=(qD;P%>6 zU^YX_z)t|*U2&nto>=@N;>`+s1Z*GxXfasf2`yov)3IpE(YW+QX; z@kG@_qN1%xgD0nOCPzec(dFylrfxuu{tkuyL_JRXuq{{I-p97CSPIV0a>UkNo9&d5 zavMpZ+^Sk_XezQ{8Kcgkvrx+6d^kzk?v8@>Rq5p1)Z28vi2-}qYK{tjSrd_(BLpVz z$Br4v;Fs$q)$(w`{x9Q5#`jwI_EClITriWnSS*0e@2VhIv4M$d7C@wqD)W;1CMp>z zD9V@iBIU^>L0kD2`4h0Dq?_D!m1g~qZiID@OgV>>mpSgpa!j4lC|9Zeg=Hcn9W}vvzYFNS{RVy#hD*7k3#en|OMF7UG0CCB7<2eERjQkA z^HwlOZlxHbaaTLMaEY*vuwRUQM;b9Ewj4kHQ;Y`^Uqk=y5nS;6Ajn0WgiVVBp=SDY zXv%T?BUYAM*N_8}H4Rkjju8fA1&}k@C&-q+^N5RskQRua0zD*; zyXIVm6<|bf^^bu+r~k)x&|}teIy$H-EUkIh%9u&ZCCNT-KYVrjnRu4fTS>kCA$+_> z9&6^*z>Y&pVbVV7u4ASyOx-vMqjf9Ua9ca%-G4(g5Rdrf zr1jh%==-KfVs+ypbeHj<^J_FTMCmhei6Q2?D)56ESICG(03A%73hAjP;OH`**O>Sb zPS5YG&OKm5C%zqFrh9wDk2aX0je!hWwx^QKPR|AMihCt9o@%gTpWn0e(TY@`cfhgk z4G?zAg+^Xh=gPQJ?;-1+nZnZd!Sqf+qSR~N0FfD;SlRmxbRC75r{oKM 
z1MleLnCm!I6o>~m>tKbs3|@CR$<4BglX9b}m_FYOWfu41)+IfxvUHW%Rg8Q)l2N-X5fy2G9n z%$n7?}YImBp^=3l+qE%41We)B8ZiTml z%dDRVPJs>4F(3@RiUD_Wq}pGDf3%uOvR!mwdvv?i!3nCEn137=rKxkSjxV@~v1{3m z3<_IP?t%fVL;cVeh@0ZT^JN6A*sgg?BAvc5Qg7%8 z)p8k&c~{!7Ps5UbnS7R?>)Ff#JFoHDwdtT;egaRWh>6a_YxpJq6#h9~mPE;$z_9Ta z!Of)SEa>+};Cr_~zFsQ!tXod!jiKCV&v{fl^E~s|b`9$CPlEZ`DD-}NmCf?%0?U_+ zpxO2XF!4Mv4@VL&GY2Pm3Q?ld&CE<|`Hyw+bcv-CcKq@KENY}ZXGkZMJH~RO4c_4m z`*Uzi#uV!c%0cg(D?VN?q^TPf>GB#OR{1*<UkWCr~zDGwE|}e%VA6I z7`ivBo^>>SgPz9mVAb&)0!!4n`zl_bS-hUe1t{?%UW?yoWXZL<>=%^n)WD|}70?$t z#Hvg7TIWulOZUi5<87rGlJQmdAi*P(G^-3@wX70;7&C?%t13}h4}09E{5w~qEoWoj zD`B-7rOd9<8FndrA#N#IEa6B!RACHt?+a$IcA-H1cnL;1KLMZCznP1xA=R7G1KIta zHL2}q=(7^uuh1YI>P+M*JLnH)j)d^s*bpk~vrQh|RapJ5$e4W{r53X?Xo>tNk>NIh6Da5a-1yuP_A*R@8<9>?-wsMaOJvLhgnsa8* z+tnJ}J(+1NyFUzuK6UXiv3t3@Su^>ECc3QA{u%i1b)v$~C)nvfx@OZrJwGjfH1DVv z%(=V7qtSvb?EgE5|L+1ElrOTdNE^Q6A5Um|{vUiw_=fAq@7SoTNNa`{^L^qh7P)&O zcX?LJA(xF$7=jA|NZ~qES5?QFK31vZsi$VWN5e8j7%*J04!EY{i k;pYb%yfaODmr1@)dOu_nOc0MJa*p@d?n#J+PhP|S03jW3(!Q&Rgphqpk}V-i3GvMPP9-6v zs3f71Bw0#G)bIJ8^ZVm{=AW5!=FXk_zOU`-xtw$zDtCaAnEi6f44IMj0 z@1r=?icBQm)TOZG-cx$dc{6dWpU$^B@c>G%c*2j%_dsM{FXJTplxKBN1Uj00AP4HO zL#>1veO5zDs$|HD%Qs1nNf*^7iS)kKCc0>IBPki1OiXi5W7w;3bdcK#FT1XyTx~8R zBIC$wl}X}l-~AG@RcwhSjiu|S*AVu^3$*Slz&o2;p>K~f)&4vPx*kF7BN=gwVb_AZ z{Uh}BssSnXHzOy~%QRMH;j!{YD6Lum1p@_W&pv_~i7&w9#~nygGe(u%+0dJKj~Qc` z3yCXD*(t9tg2myRjD$=f_+bNMF|&*LOAeu;^_%<0MtX z4!G1HCcVKk_ABwHWHzRJe}YE*TbQe#39?&qq240`B?6mJQnd{O+qA$&umiY{jl5G+ zMS#_Qh-Xt;k(=`c>w}*$ysJPoOYP~j%V+5;6-t5~meUUte7Ss6K2$$BjQ$#vIi;ak z!dzD5|2c7uFL&CXeEi_WzR`?9A4?TbiQZ2=Jde=}*FS>wk5Y*54u*O0Uf5e44OWj& z)8ybgv|PlWD!EDXkL3MFL!8k!dt6U1xdAanPhJH7k)I2Ic@;<_D({GaW- z(%VmA-0%=qskwtqY$ChOEdo5jfO8%Ek5BH6kbw7tC^lQ3zt6e|B|1WAP>d5f(&R@x zofnW37yq(W4HqFaUxAxC%8-@+Cep+-WIWG%Gfl4pU>v^(UmW;>7dPL=+{35g>9z0Z zIf+MOTKws|Bun~z*$h%L`hl!nbrl-#=Mdeo96hi+nBKaWKyAYx^ZC0hXw9tyTztY> zZiB26KP_hg^fhOZh$T;HVt6<;csC87XD_GoX5S;BK2bn^e88+<4>9}m5W3B8V@;p? 
zp`rE&@c+Hx-8H>{G4C@$*U|_SBr@s0eY)i6nF2I+n9d!Qri2Ibq#`nbbi)(c(j&$l zjCsmuZAECxq5a&BCTFhV`9ywqiZ8J_5<%*QyRpQBLSFQFsx{3KKWo(Rq}RWq*OJze z`MKLcF2suAbD^xXgB~UsJjAAd{j9S8Mm*#m$}~sCVX53H8ocp5ulz|g-acu>#hs2K zGF*E43P*3jItG!%J$rIw|>Pg15OeN13&EZKFmO{gO zq*bSfFlekVtT|dqyN{%j(mYYDyy%RE4-W&Ey_^vo<=CTnH&Ibj35#vKVef%zTxrkJ zFIApwIvas8TPAZJO9jO7T_$>d{Yor;*U{K%)zm-iKA-nRoh~_gkvl7Yl2cqaN`CR> zfmbeEG^R>aJksKUU41r{OuY-{Uk8{gE3D`}v3sPK-GiQXekGGqd(gqEkjb0=4Ntk~ zLY8tQs3~q^1zF>$VB-~PBr8pv0+tzHF_^^V*lr@vWDxhnwGv0wIeg87%emWD64c3X zJpXswWWMuc7vSO+a0U}^(~h4$tgbQvouE><__`YczJ3B;lnHE|^n|T_ZUahzF7)_q zJ7V86#4FT0fIB+o@|i$4I{a)h^N;K!`b`@!#o`i5x%|OSiOu*UvjF}!M`H(@0g=sr z@VQJ0@qaB%{g(H`GpAOv;-wF~4_Qs#`K3@}O_ueqs;5>7#l%Uro;dIRP4`!Cq4&EY zXlP>zX-@w{LY{5}*T?Ij&9VTUnsXQxmBTnoe;6B@*F#aBAFR0kfCN+r;A+W*#NnbQ zUu)x4vMJM#pB(0m?|g4TVjn_(%AX}04Us$+ub@&ZegZRMM{B=_6CK+$ zaz_0&6s~=OB2Rr8Da{5Haq>4Zgl%ksZ75g|Zb64H-=TA@9X7N*XB%`5Kz;KEOiQ{0 zjk1ez-;W&hHN6E#)t+Je3Ln(oSj||Bq`|`3kD&Dp%lo*d1T7R!Vmi@-;%a_5X-m(^D` zvK?Z6US+BlF$guwaqG-LG+JMXVmU2r|F#jZin|21YO-*|+Y4kfqHwFyYmobOiCrtb znV+qCnY;V$J-6`5Z~l{rZ2r%}ZlcyC$4$`A<{EN1#^Bl`ly2jpqLMgpk9~Qy`on^N zj3%`0Ok-Ofijz~jdMLGhMY2>fh*E?k+}&YM-p$g*^9N7j*uBw&E163rBz}>Nbz}Iq zqlciOU!7m_WD|{>A4T&6!C0(9I+mzBcWJ)QOeU z^lBhCMP@C3Pl_VHNhF)@AM*#~&W)!htgEQo59E5jgp-NaFG5#hF?je!lIfYb{D)u6 zG1vSy7);uOIeS(RG3hI51s~tc~?`blDmW+n2}Sh*97tx+6)swfrM#uxmm`V{3LzXYOrUdqmYXPS%gV?{fk7=qLf}V}@nXuP~ zh^m+p;Z?j~_F3n_Qj0`5F#ad@zfJ*_q;OE1ArFbuN};_f5}I0mGybdexJ~0H@>L^# zL4mzLcVL}56(2fIVw!Tu!b{^h`_-=87*lz2I@g=7P~Ax%oo{FUM&0JGYSiWD&pOUt z_pqZYeq2IRyExc$*ctj;&Jnrx$6#0=08+B*;4JZq^{BoMrv$0+BWV-%F0yCxKILHV z(pxyb%80DxfO9p~;V(5$Ckf-DxXnxA>A%ZW5Pz$f+WmNbX3Qii-VrKDc^L)ku9xE2S{u&dPd1a`JCR7*Sny>(rt)nKZg59esB*i~ zBB`bJARATS4ng)oM9C}xg56@kY1%tX{VfAZ2YKLb89_aL+$r8Eqd&l|GQ_^5gX$^m}*hZYNJDYTgZse}{yYT&8 zUNg52BKM0PrZpLf{7Dltp)V|!){pvdSH;ip+a?uIlhk|MyxC&JCNYDr)b*UOQ!zBY%~>hf`SGC%r?zJk=E&KsfIlo zyKJC=+Fo?SwXsZh|4h^}68CK9If`aZ;B<`t524>GZ5;RU+q zaw;3pu!k)FdKh;21qiHG%MgzX30S@$4z)pnSg$+*sb=={)zyi_Q&pRKjmXmF@m^%? 
zr%VhS;8?YjR;*d3CK`Rrhk*Ea*lKqiawW2u({7PyF!ZwE^4_O6WS#U&SAM28ghm9WW>O)4gL_{`vx}%$8pYRHCZjcg*|mt z5-r^S0)O@hFXWIwAhom{_3ve&2^R})mj7T}$qK5~VZ>U0?#0cK$|THO3ax6gaQ4s! z`u$Bl%!m#}sd1C&yww@R;c+lNn|zYSw&+o<6*1(~`EAr-TmT(uZ^h>c8fX-J5yGwa z;_R^v?B&@AdVM=^pll~6$Q9+%NQ|%#ZKtyB@=$(e8i~^@CyE~x_=}g?b9GZju*xTk zNG&%Z>AH%1<%x^9n=iNX7foBl#hM18>2FE0%<4ig<7GMn-}Be6K?Os+baP-Yo8FXcOHYl}AmaK9Nlo zD#Sb^UC8!t5`Q06A|au}U3j{inx!10eFb@B)lUO5=l(jPFxUcJgI?@>Yx;1c6tUKzn90qrXGE&YitCYu;=lKBn)h4i@|DG{Uaz5b;WwV5`a5W7X=98O zMzQT+1>Akn0Ocp5q1?j?)~{WKPRie)YO*S9UHch+&ss?NFHe!$Gt2lV=YF6LB$8^! z25|)i>YQ1ZB|lwP4h?T12zG>G&GnG z#YFqOn-Kpj=Ju~0%a@6s%mvkr<=cL4Mz3~1B4O)E&plf}`{T60qhFZ|S~r~xY}!py z=Ew1mr*GrLU#N2`Wo5*9$P$xl6QJ({LuYR%+~ERI*7MqLl5jD8Ztyq8sO$ns@i~{YhXr_HvC5TG(Lfl_ajWs^A zsH|oa&2319SsQ(E+=lZ+c2JbR?e%Ngns5f@f^lkblUiDx#*(D}Jj+y?0mJd#x&h;=VN|gxTP>puU1=5up&a&g|ZWDcnqs*8| zGdQi80_dp!CvXYdiIRKFs7_!PZFSFQpPy~O44FGbId~fX--QTjV#|S?OAhWFiU$3t z%cy&57gL+N7~bwK#?f1z<_x%g zcnkeFT!hn2gGs3KBzma4lm4>yW;@enP$$P|SXc+N{lag``NgoQ3wPk{Obd8-3<@bWwL$|j=!cGLEm~5R{}#O5zm1zw*ccx4TsK(zr~4EEy2 z>>F6VZ3?a~`+^=OQgFn4CRU7eq1p!xX0y#*IAxata)w_(BqNs%TrkLXm`kv;19_ky z^AEu5A}l=5qFQJ>Mr;gc17D>>dSMg8os4J7m+50-P7TP&Cqd_c9?n*s5Al!tQ1?jz zI5+!&+Jj%b_NR|g>g6btFzz{stZypLpF0(g6{q6T)((^kDudR{)6lhJ6?jC-;+dE< zFkKiA4aNp2ce|G9OUl4~uiR10m<7X|`mD&BC&qVC$Qh09!67Lh*y8vD7uxlq!-|uj zKQA8K7C&YqbWEUe{a@_=_6KHuszT3@5Y|m173!y?E)&tBE!lAif6h2n2 z!Op!8pi55?M0UM1cB*&+p6SaN1({gz6iqdjnHR-ccX)wO^HSUw8Hz^|{-CG0GE-H& z3A>Np1K8FJZ8~WnA{8pwwa>8vbH(Ed1|m0l0(p_o`|*kZwL}6ECB<}X7=wCOKf%Q zWn7QVMdva1S+nJP(dqS0Y*?4YH0iS-cYOx%lJ8>KjQ6;{v=Th0uVAw-2**^U16gb? z8d^7lTGnw4DLjP=+5sriA8p(g^b|$Zoq4dZAJU(cGY%WZfExFf4T--E{bdHoKWV^= zm@0ZByWmoAPk!ZL9GxoB$H|k@%hVCminNfPyP-Nvw$iKJ>@3d6lnMtzvp*tD< z?!9NiDvset=X@~zwg9KKPlLFb8}ZDOC_EH%4`%jJ7>*#X<#j-tTZ4d#O~#N7!6`!)fZ*8K$Tze56%JKGqkSH5V{ z9fA@&>@cqRGbDywVbZoAL6Mu9Z25Z|*jDfu{dO*f*8eub^zb?wtRuq>ho5AZt866J zZ+6mH-S?=!O`8ATtF=TfS`O5FbsZCM? 
z>eF_dbS;1$>K%`ROI4WIO*81|PQ;@-T*+)Z-<3y?a;UlE&To$7q@QMHYz)eWp zr^v-GoyN%@+{5=ci(`&0ySV8ISAb)}UufWJ%L0W2h=E#4@!^6zGXV(a#0Y z{Vo{dz8y!7e}LD$&KQ^4W`M-i_uvr2LFH;g%xJ2^nA{~G5IqOI6OEu;*p~!n#lR|H z1(cNqgNZtY=0~nr`#6@Zb(_iBYjH4blTcey&BtNqCYbSGKeUfYh2E!OtXfr(V2WWU z$k?kwmBvR5c27k9_gl=F@tx3MOYq~8pCIj?fla;BKuvoQc8u#{#_0aW(&t;@m{lsM zt~`pvic`t#vQ)Z1?JH_2O(TDoyTP9W`gGa{30Ca3H|m&mVz`z0u-JjmoZ|yOlltb~2GAB|gt)UpXvfh%J%k{(2>FGS3+;IAUselbb^RTJv9u0PSk3)-kSf>{c z!TdlV9h>+H`_u9n`uxP_4DkaO~Z1 zxb2fd&EMu>vhW>`k4h7Zz98DOK@%^fhcmNszhla*hoEP&gjQ!MaTjh*CAZ}z* z2Xc#**qPRWbRZMiACm_#YO^eTSpEf%?p#dH*JZKoKXdTIp%rwO>Jrk^aG!n{z06hK zzR15OX~e&_HkKX|dC%9}+D|JJmFNkJg)~0E3c4N$b!<6lteoVGHR)BT*3|@Er#RNr za3k|%z6-tE?Ma6B%p-M@<50}z7MVV6kW%r5xWjk`FXa6`_L%o?)_GnE6dZp9xuzq) z+Y-Yn-xz>0c_o}L7)96hav)=T1Jhf#!MqtWh9Rr{6KuEs zg@+csV769p(0-;4_xlfF?}ttxswtRQ!LX&0=IH4hAShB_4#O|2py;v@8>Ux++dB?m zN3$ptYgIs!W*&TWe~Gq}Z(}R-n(_2s0Aoa-!nD;(AdGne4wq_}l=_LVwC^+q)?1*P zxKJ$z>K(lI6u+^4hvh@z`EVgEOG0&sTt3o*6d z;mz7=Y&a~BmE(I+<5xfYl=_1L$NOOLJry-i^r7DgEwCL@gT!?=nCbr?L8zp^R^ggtaj4tn<6@@yWi#@2g(pljtt>}n|j*VanLK&=&9 zM-M^9np;90vYqM5^hYyt19E40fa#8Ch?#x|TowJ{nAnXY9+>{ zz6YMjTlVhx2*}>zfl`V&%0_Z9wFIw%LH zekU^{qPMW)!2qO9z60Asmg4YFO*~le9W74OGFqk)sMFmJ`D)&*plt|7)J<`zdnBf7 zJYsQEF*voJVam}Uw6UDaULq$;9KK_V23-ay?JDOfk=6FS6Yfz0m(pj}@L zhYsWkj%jEjKSz>x%%hU!mF0jF_k(>oJRY5QzW}+BZz!7@1c6%`guapqG!H!mUc@N2 zX7MrYurD0Do6AbPs{oP2DS~MOYoKHH0P8xk1&$oJha$d{c@8gbG36b@C}KERJTN5@ zb!`V3r>JhFZWQ3mmlBlJ6x>{{Wj zn+c|6z6DWX2IhZxn7&1lbokJB7;T(Qk8jmwIz49M;Y4>V6dIdxo@qEnc#n#(RtxLT zDZs5k-DQOmxHlf~5`{?QjU0u)f{qz_!T z%R#*g2Awk8;F$6dZgc)An7eU3?Cj11w|g0I!cGeh=r}VUODBi19WZ)#~n+>7}Ilkf^C{nuw}k79$Nhh=C}Ev=n;1u zs#t`VwtYjrfU#(*l!QBl*lhf3Ptb*>?5oMXP}d=WvI@72e>_%ZqCVUg^f-Y7tx9v26i%?t3#t{30~<-8Z`{|b_Se1N#kb=V?#j=3?m48~Z`1@G;?VA*^F zecimEk=6(_l4>EsL<5dDcrjY@&DpiHM2XJz2t0dDnk>8Di2vTplg8R{bV&g6EI)k4 z{3}-E#>+A|aWoQtzHMTT=iU^YTqVS;|K>o%r5o5$%Og8Y)9_kXH#m8Qu+a|E^rBuT z_}+esaY_?O#EJy)@Yu~3`b*Hzo1R#|@jA9yy}{`0zG5T88iAYU8^(6>Ey2+15zx0f 
z$M%gQ#@KWdkMy5q*UMc+hrLOxY05_YraG6tw^G8s=&A5Jy$V8qoq`ep$5#Is1r@Yp zPM*lYN9*H2O%Q>GTTU}mqu=0@{}P!8O3wt^-c#Xl*#sW4=!F23zn(E z+|hJ#{Gir?m!q z$H#}^%hy}r!QfXg2+D+m|9V+4f0V03V)F-V|1L!&o%``WaSMXoJ`dajN%tDG}S~C)A9Z zaP!qth&dSr3#LDV!q5BI<6iRg;1?O1m6CvprFW2aSHte!B=}Zdj57X-II1Jbb0lf- zb$b$~YxiQ(3wwIk>@b7s4iLWQI!@PM$oE_g{I=AL{hJqtJ#$mo!0Z65@koI^SN_A@ zw`Y^=pfb$f%g4zehdFDTQFXpMl<8Z*+U=FhjZGD(y7nphZsbGV{$m(%^bPz=PJ^eH(#8Dqb6&!|VmUgUsWE(qJybB6Uhf!k1YtWo0igKD>%x}dqW=Q)#c)Z&UT>2-{ z#nlF^U%uci!xQQ`?Mo)O0e91?xele+?sc_JctC;7hhOZ38oz zw*(5liC}oyNk;x@Fsc*>gZD}f1}=2s@32kKCmqfzDU@L7!FU*yvH@L93g4Eb2 zbPq$r*9AbYn?a(-MEq)e6|TK~j`LOwp>>xErX-($YLNt#nPki?4g19uXc*!K^D7{} z?k5zP)Z(TOj_@__4D<_q1w+TX_#)1Yt=zr>hQA+#PUC&}W!D{WtNO^akJ^Ers20wIDpt%ankRNG z2C5#bz>SqLSovr=jjG5%HKqz*HiyHD6<#2v_!s8hS%BFqEAUsQ(9;@D#+6B;AX~bP zoi|>M*#mP>zCf5Ww4K3Q{U95iid->MLk-Gz&w>oALnXV1RA|?@J^1-+8SKnI0%Kdk zL3dpUdOlOa`t)N=?th8+Ecc=y-m)9^NP&>ArW%cp{{`EbV%#cRn+cQUsnR?veBfw{ zE8gX>+OESTybN#nwju(=k7Q#)su`^B`wHhLjfENWF0iIMXE7Fi8f^bZ5peuZhK_ab zW-}*7GRM8X;N0K=W_Nb|Dac==H#W62Rz-*ld}>5WI_(_tv`>;Z^J@=&I22F#wGgsEkftisnSX2OxT zn9@-VZCi|}?5D%b;K@+NzSbS5d5H>^r%i@{Pyku|8yM_n4IfjJm?xfv*c!PAz6=WI zY_ST2-ye(L6@TLr=Sj>htIa4f=NTjZ_7>Bku>%+V)q<~jV{!8S2&_DL9b(r7;^N0K zFgS3TS=*G1#SW>E-*T7Pecli2Y__o1PaMSH1J}Vfc08l9+6h4P9Zrym#hPpp;=AKJ z?E8@pST!B~Z8U?85_#Bor=3ygH33IK6$CH$0l$z6=1ss2%nq5s9@ol+^3nq(QI-w( z&&rCX{Q88?f*Ki<3*Xp}d!Iqr&q%Cw2IgX^AG7APG~RkM9=12?khESONbSE5MIML2 zKGT98jNJq-g=*|x(LiVw_LbdH`E2>32Us4x9Rx$$v333|BE3ry_co8RtCdvPjZT^P$s+=qO4VU^wGuNpzmfUS8;)0F zEU1=k6f9pko_RAg3mT*);GEQ6I5m9+K9?Iy{q_E^vzC3vacPIyg5Alip=c$TCjMi4 zrbk2HPY1M$y$@gQFGHwE1JmtM4rba}Si|X%0qquy8I_`!8>>OyVLVv*2f{CSja$Y{ z#mVUgIH~%#(6=6k8#-g)`u;wso?DDg8ew?g{Xwc`a}BDb0e0(uLaDa5?4RgG7}azF z{M9GW%V*|L1?<8!MIBIUor3P6>zK!nC(wg;^5K2IqCoXd82T;QzOuAO+DKZ zyd5QL?=qAfMf2)B=+8cYCX0N~{h=aR{;U|a8s_1POB2wl$`jVB+Omt=#puLaPvKIu z3v904hBbMGuamFw<-BEYKd-Q{&=M620UiQkh>cg#+el-;_G`^`8N(ULY~6G zOF^)Hix%yR|I0Q~o#N2g7saEWNATdTR-7)`&Kfv)3+CNl%`2K 
zQ~jUQ*ZKW88oU9DZ8cfr!nd?``6q$)feQ3k#6ezO9L*QI%g&tT1xNI2vHAQ=+ITt# zo{ore$1N=ACUsr<;?@l|XF8AC@3167bCB^`*UOGE8^^s~s7!eS-k`<$ph@;0!NRVW z%=3k|c*WV7_ulXtV?3dpQJ?$)+Cr5X#Wqiv8ng_&X3m4I-#>87LTRk&_Xqt&rOcj? zpX}p%V^C#j2#$00Ly4X{FyZVj(45Q%pC5o$?ql#xz8Ve&8$em~LpJHH1Lof{B^dY{k}rEg@%*LZNS($}k@_iA@ZyNzK#(|TS$voI8aWfxO7CLC zqDye*ueb3=(Lwm3_6zD4NP*I)KzQ)$F5WipV#mA7fRA?qv+$QQ+-Y;`7DsU4#=?_h!M z7Z}*p#QNM$!84k-(Yy64vn?_fR?RJ7KY4UAPhahUWyV77N5ccPWmFi!7F`_fyw9V) z%P_Udl#y7?g4g?UP}Fx7YMtg#=y@HtsawjXMEq@ZN7Fn7|rl8V*j z@G5VuCV$trK%GkG!8-vpX8Lg`b0>OY!*UtS}?!J&XO7 zkPK|`aVGQp19D)<|B2o^^j z#2N3FlH_qS>4*In*i!i45jAN@`C&9VBHW=9O z4z=75(x>NAQQkltT6dbzRl9QG%k}Lv!s|CYb92Czt=*{oN)mD&&nC}~t-_Uq!f)7D z&9)A5yt@6e^s1Z=-SAePvg3{;9_bbQJ>?6_CQ73??*UX@KLN5$&1~^=L2BGB| z`|hJ8jl1WIQ*?BhkgZafr+OX47o=lN^JBJFd~r#O%yS4n`kWoBG8K%jv!Ka322|>E5Ja}v(~Ch~Yzlt}Rwmk^M*cgjxc-2>V^oiql5gV&4+d>o z6S2qfDjZ9lf{)&;5qbs_pdd;gRS%wqe!mJRdzcSZ>mzag-X7Lird?pB5Y4!VR5F8} z$cEL7GUeXS;N5yDcU95NPrUH+6?5%|5wG&^0@$_OjE$Oj+4x_iA`||x4##&GVC24Aa5kb{IR9w^QGPBq z4UMCHi%sa;qA&1hQy6+Bg`)E&9yvJmG!{#GGB*1pkZ$>ftwq-ve~GoAmz;!Gu9}lS zZZT+h-VB*&fT7~5TlpM&BMKQ+hNY~@9c#s#dvty8F;gx z9Nk*d;IHB!n!6@~_{bc5gW2pWW8waIgrO%C|KWThNkkV1f-Im`BWsNy5yXOd+efCkoE43t@KcRb(97-eQ#Tedg=a>*(G( zksVJo;iI-PqjA;%p6WM1t9BbWetC~JhT=@RUOdKqFJY9PP9OzWYi~ABcn(Hmt{d*U`x?*w zlZ68{haXf%`5=uw4=a)fP}=$&?%TE=dW%1?Q?94sN4x*Pdfp+p{yh(;Op9XeN)HJX zQo9ANYKky(e-u8`o(1O7<4KFcTfypm4MH7$Es8qiU`Se+AbZD3kYD~3Bi3qw-R=b3 zt<_zk?EVnCic(?Y<5dv)@+fGQ4x*EGK7`!7&*(oa#@c=Vu;*nM%mZPMIGO;_s~!sW zrIoPLEM>8{K!IUD&qx2R24*)YU`0&sz{Asj1$QMJ*>6#wn1P2^m`bJ-ul`QQi)m4&rdLDh;~Hq3^Mq|$b(0*=b0OpP#pxmGGKgcZfu5ooSvd739{w81w7tE{(XR8FT& z;8?T`7bP^}*>*kJvSvT_x1EJrHF;XnnTO^t^l(Y@Ec$xQFD6UZ6eb>jf^(yyFmtVd zovZ1~9Jmll`?D@F+GnPc|7?yy?C!&ulxW1`8Q)}0R1)ySw{Kv*NRs$=rC@Zf4>ZjD z0QzFnS<4r~>{n+Zs+=xlp35ZTZ_O3x&?rqtoov8yS|fH8U&7Bv-r$kv@u<4!32*9* z-N3HW$3EF$pr;?;wQHMUh9J>sbkP*t^_7D=--H>7#oJ)ol2N=<+lT?ms`N>e8A^&I z^ZM_HfM@{=0i`0e!(W-|g_&Xgj4*8fkcj7fj^Q>E1H<*>@cNRyxKF4(^|#HVnZbYI 
z<)W`}Uq=)7Gcgcw*b8@StS3pn&vB$J9xv#1p!|A$l*}lGzniC!x+Sx)y|NAUgH`F6 z3w~I(s|zOxlUQ36g!%I#UsiwFKj!VB4ji|A2f3UtN0;sFB-x`g___R}-160u{ByEC zH1csiv6<6KJ7V>q=UWAdEowKidUlx<9=cDYW5#p5>Q$J&Etfu>V8h>jPe8p-O7p*8 zDk9Qnh5+4C$mV-2J9An+@7hl@_MKoEXq%5_2;*kU0Zr^$p#s0`qG7$Wc!5Yls+)_8@zlYTj=5{1A z`YG3T}tvFqaiyS_3*?pvA4E#;WhDMw8kl~6RMhk0qX4Mle5 zLA=L88dnj(u0JUwXqy_2v1@YS!e@WMlf`<%%++cpWJw~9vyEo8)Za2uI2+5QI@yO_ z?Iazl!((cKm%s4N&64;80k9^U53d8KGFAFcaNR!ak9FbXm+JdaXrptHg))IlpA~d|O5ijLU(G^XJg*MH_%qIe;(TWnqX$ z0FyLHI1j1`I22w3mD7r`s-%Juxmt@=CZn)wK#{k9qc`X@ZU+-(p})2NyWqUkHPn*3 z0?F?`>KDso4CaxTh19Ll|%zh>{_WJ?v&J^R8^~xkzKcB5(zk!|6 zGqy&@7Gyu(LA_-Ma3@h9GB@n0Q3S`H+OeM;{xp>enk?vMiv^fi9RzQurl7n`1zfRv z08{rZgRmFB@k>qz`|iOJbp12TOd9$P-{*g3zvxY3{@7(ez#ci={p}n)W7d$ROQ(^J z>Ny_57rn>%sq7Et~MPm(-KoDO&S($-rCi0jl;Y_e}+C2uBDd+q5|^J5BC zZT5#9&y=`#?S9n2K%a8r&am2MCaV!6psvg2k-G(#n9dbjP(5cH;omEQdyy+RuU}$x zUZ&iPm}IfU4qSJNzD2^ z6A0DwfuR0Y*w=K8v6!$6K3G4d632PCHY}Em?bpD%5A&eS^!9v!A@k8ee}>Yj1*|4$Px5^7ClM>Tl@bR))E@ z>%r!-3;p@boi6xggJTOvsA^j#Gv2uvxmDwNX-TDUZz)o{EKBgRTSfYRNs|R1u94X_ zMcDMl5TqvTC2PCRQ7cUYdcfp5vEDd~;D7<%nmcagn1bb>?3fn!~kQ^NF#D8zZsG`0un&x>k z?P?WFv96E6*+q@Mtk=V|lGH^!23)_-g8#-i(pO(u*mrplMT139dN7G?v=a8P##S70 z_y`goKEduY=LJ(Lt^<__$5pp_@RcA3B3w4(?3@}5*mV~An|oo4yA=7Kr>FM+Rhd-- zhQ&8f-6$J2+m+&yS@$9BdL5Xau@=mA3}?bW>0vr3v91--@M%z-)UQw>+RP*fynP3T z#ouC(`xG3SCr%DAr|<&*G^U;mz|%sMZCfr%Uic=%gArql#15kEsY53QreX2@WE7kK z1FpsO!0PH0_Q2Weus)cXpAo?j-vW4`+QEGJJQfH4=wjFBiNMR3 z1gFb}tkd`i9y9k6_$~DT^Q|1T@_ixffgdyBt0bx3^Np3XJPjW%O+y9QeS(P4Q`l!( zL%fu&$(j~G0BLh*$U64 z$^}z2B|sq_Svz9^)0P}BP}CfQnqgBQdgpnZR96U@H$3ri<4YVly9Dpe6>_rqSJ^~^ zKEdV1Z{Y0nLTFf*4u#hq6H~WfB;m)g+8~IHc2gj!%PLXwnkPOp%$dID__09cVZ_4?K6=LAkf5N&X>4;_1Fy za4$R_m%pheD&Jz^$0}X1WkM7N#~8r%8+s7<)obm%=9J9j-wS=FHD#7x$<Hkz`ogjG?J)W^7B4uM6aN+^{26b;V)RR~ zSXG%;%7lR_&s?bI+#nNQWut0G40gmRQ<1+d78NDawMPco$JTC0u;0iWU--hf6irh4 z-&AVtY>Y5LldUM$0r@xC(C!h8T1!8(fu1DXrE?h{rWavA{wh#&Rg_pMe_?mIVEitf z$u1pvN7j4GqSE*gvPa=04EpUC#9{>M*BXdZtu;k@fwkZmm?z#bLBbrKSJJv$v9PPA 
zlW5$^ghX9=iBowJD!ON*Z<`l<17sK*Rf|J2Y)R0rn8zdMFr<)5{)<6G^R09 zWOn%i>)g2#gzeUXRn4(fevKU$cgqCSWG8b%b#o%z;|L|ZXVoP01blb%Ib5V3W>-qE zX1x_|G#dsJRY!0O4h7T7F>t0ubP^yJ=I=3R1o>D%59>{u~CmDU?bxSb_g zv1TgqYj%VXY@ii=_c$%}B;boL?u1(;w|wRvE=tClRw!n`sqApDDaZj$Jy$$5f3hU2 zMkwj)zX+Dy%TRbz3;r7yf=P4huxgawn^^vhwX8lS@F>|ulio?V?srbK?N=dq9@dbI z=li=8KjcBTNELhyct<@mgB+1u!ptMCL|)ojBGRtJR9Rpd3EMzFc{^u&Wlb2%bN#+B`U#2jm}Fv*+=ynCQOu`)0he=9LYK6kf$}CD zC{)U!`)xT4E}$mlGC@taHzeKiG_pY^Ohx&(%5{l%QkA^chyfsPN~mFubM$tjS*0DE`ync zzr>YZx$v{@j==vX!+!jThOMu`@0tcqNZm?XrxEkp*MChzBA@)VmC~26*b&M?H`<9DTv$hYJqCU5ven_0U)CKNF4Zza%(sXr` z3hIBK!(r_Re#jgY-yW5=jCQyMimHS8f$!bGStSEYBK_EGg(NK49}nL;yab;GtI*Ad z!)y9pFs_tvSAwfy&6epfW&8j%diE2Q9l%-tZOlnd@jcXc4K`iiE>4;wCPk)MXel>L z?~QXN%kq3!gXuhsHk77r69w+A!NYKhwyw837*VBV{-0~~w|^W%gxO?5sLSbijhV~qK~Qw~IKIt?C`m1MQO zlh~?xBWORJExEaI0#_Dt3-%?q&@(&J(0|=FLCWRFbiQ!_ypuDcKOF?%6?ldedw7rv zZ83Mh@ta_t(g6Fm&KvKJmqELIk64`YM-r}CFSwksSRzWb604z5T%mg$8=Mrdz@Pwj z9k+q96KUWwK?`;)SHm8;2TbPqb4aazMKmLV;rvJ<%1wO3TAtj-3DHl`^PevuNQywo z;5h8@eM?jtI^c`cX&CWn!APY;Xui)36HX?xaQip-?4UWm>1+EE>RdIYNWb3YVc}|DR|e_fZ_P}kYx8O{Hy(k zWPH*Fv^lbstG=O!D}sNr=Q4-E>x_`haaanbJ9Pv%ryWC$p?0YKTN|yMQ+dbBn2!B( z0%sEW4{m*2!K{-KL4J~uh0U47b0sQR^{6&tj>~mrmzRU^kUv>clP_*r+CD= z7s@?kun5F(tUN_vePJ>U|NR+T_c579Ksr}5y_IeAjz+snF8C+!=6E~~XL388$n;7x zYSXIA8Xl?%wmq~38Tn3}qtGG9mQEmHTJs^ho8#6VI8CnwIgr%#SLmBG4JMhBWl>S` z0W@YOL(y}@FjEA}$d^2Sltz~AQ=;4DcS3B?6b!abhliqjl;k|(9`ujFN5#4z>y-vh z{GGjWO*Ob&zd#?W51RGU*U-eV9wS zU0=}a5*vDNGy$ap{>DKvj0+D&phd@0jJT~yXN@_>l8rY&UT770x_g-X#b*HxO0u}) zWgjm0wBUwo9?({^hfH%@3iWLM1Y3lg@Im-4^T@NG#CMF!(S4&U^Uc#>4Z}L9_`4ar z@^_$5r`v+C`qb5TC5EXD4j_A;&^CanrTMsJ*$HsJA>IWr2lYIbVsY zr?;`<_#&u!F(1sk+Ce3CA!pWO0ozR$iqHP&#`>xO{Gg!E`5e(B-*uhY@Rnw{GH*ON z`ojpAa375wEJPhfKz~Xeg!P}mxRhWC=I6TkRg(oiWNpG03#P?jU=s4 z+PO5p{4aQXZ!9%Tn*nQA>}QXkoyIfXs?@wLUXU|oHe{|y=Wc4>fRlU95u4Q?VP>E^ zcHO=s{*~Vb>hfiRaV0U-)v=h~v7SghRB~y;yAja}^DuUL!vz*D8wDQCJAw8;A&Wn0 z(HAP`$^I#pV7ul#{Q{A^yQXE)UtB^D^z?B>(}FPQz-^`|?IKA(`w3E(ECudPIIj8q 
z80_v?0)v0{fsVsWa%ov8w@aW-Z(a@H5*(_S@3jW@Z0c7sHgY>U7?;Adjebz78%iou z%D_ZFfgAGp0r7Rq!FyYs?{g~-xX5uj|Y8V z7qx-Y^C7r|=MfLLR^pQnQ)n;09XB~o3l8?>z$hF8QfUOfq7zV;aTMDWoJcNzQ*1g= j2Iu8RnPKHtsHkY+dONH!WJj@JYt>vV%Y20SZhi1SKNps9 literal 0 HcmV?d00001 diff --git a/src/autoschedulers/anderson2021/weights/stencil_chain.weights b/src/autoschedulers/anderson2021/weights/stencil_chain.weights new file mode 100644 index 0000000000000000000000000000000000000000..a8451c036af74774aac365a9a05b254483189ee2 GIT binary patch literal 20400 zcmXV1c{Ek;*GGkrp%6u>kor=lBJSDGH6$So6h)1x%?H3>(tkXaT>wIAky|-#VIVvnmH$2%6x@RKU2U3$T zfn5c18y=vq-%}80e=)K`159gu9v-b}g{pcTxHobSH?R+2R?2fQAG!spYNn`Es15@u zMU04TA*5JXvNM|f!TRtGMpWuPgklS0J-d(jOFU8DzLHg%qzCCuwcy;)&h~n;!1YRG z+{B-2Rw(NY#n_?{lZjur7@WvRL)?`6?)n*bqm2uA8lgf0N z`#flC;L<6iLOhgv|sNb+V};KS@V;~zm_++PNx=jY@bdo<80{3y~FIZ|Js?z zrcl!3v4GCl>IVxCEMkiG@#xOmp|qwwnpis%I#jAhY(;}<`Gr_|=hHDPGueu3w;^)B zcX6t2R>S16QLI-x2x}5k*v6&vsl%MfhR+WSq5r0vSa|p( zRA2duKWC?NQYw;g_Vq{9tin+@^exck6 zAL)vz1N2^JVyEE~Q*nq=vpOdP}D`$?}&@`bQpW{~`95#>qQX9U7aXLKt2Z>!ZyHgWk zHuHljPkBJJ3%7u5j2*+}UtlGi^)S`20^9x#vx;F(=y~u0^EapmzkN!^V}&-j?CnYX z%R5gy);@$PMxSJuXOQH0A^w50cj+$GBJKsnS9H(e$^4c}3jBjF(#gypKghm6CFK4~ zTeM#>n^?RO<{Qo(q!J#J$+VDOnsRrXRNWTFI)8UGt~d<*{ACQ!i_d!9zJc-@3Rt!# z0N(v4O}qGOX~nrnw147-`}R(zLOP4coGU?$k6Zv5H26Uum3*Mv%G0I0;3!R7s#P_cYFBW6E{3Wq(JN3q{I?@oIW?y$65W zFJm{G1M%&@QQs|(tc;7G4u@WYRI41f;hhWX0x{&Sb29zQtN^bqPIR~9Trzuu3sK(~ zP5*M|llFg4Fv?VeoKHDQVyc~BcjY?hw7rLJ?FEdI(qWvZKZY&s>!9RzD9q(?$RaCW zhJV0>8V>t%J+)(q-jm(j{Rem9=doV;_R(b`_xuc9Jub}sEc6){Z#%^-oEb}YMLotf z-pW|?Ae`Qi-c9B%{sZSCj?jX0iR93O0ebR~BXtuNBD)f->F;25Vv-a`WDh0LI^9g7 ztv7{M$S|bZ>J&LO=O)}=^$>-sgBfv+78G&|GcktE>@~*=VE@wvoj-qt-c{?drSlov zvhV;jwZF&A)LYOhZGd}+3NY9*3%t}G<5jaj)N*>lSpUg{Md}Zr;}y&KU{j9PQ;%ad z(S=LjEI+03W8U^vY=9F^3?(*Y~n2ceZMhjTxZ{xQV@EOA^4^X0$gYpWK zfL|HRY0w|zh3B-PV{azAG2Du&{4b04e<&fN$C}Aa9ZkrvmLyWk>}Y-Hcg9vUg)EcV zN}p}ZAv2zdaZz?6XwT8)PO3dc<(e1Lku4YLPLrKPz0MZc_m%3v{y8kOzr;;?#q+Fav8E~ 
z?32;M^)hT(k|gMhJP^{mXWI0>nQ6E>i$B^?K{786!nHtWYH{-u3QNV1iz~gzrB^QW zQ@k5rMDic0S#pq;j$X&beWN@Z-4k4iB|C{s#R~G_^9AZNdWVi)wW8YB(ormXJE2nV zK&BuNgy!ux)uf#m-p~L--@G|(lJ=0Gu%6Aftz@#F|AgCSISkLpoKNBliE)G;+3$ao z4zHYrIy-#GPOXb%u89*>o@vSdbwrG7fRpGBlT%#ul;RUYMlZHB){Wub2=!c#)8@`IY?2dg0A{_s68M@ zUvHD(_YFNF{nk0m`PUr;_Ix~GtjL*b+)vZJ-hZd_lWvX^1;2e{w%8R~Y2b^k?{vuv!qQ8ZB*j%|*;uT8uWkzQKuQe0D#Z z$vm^)NGInK*0AOP8MK&$ext83btk35I){kvqin{P90cQU@yyZto>(}h1?h7m!Or6w zh+rqS9Ze>!%_3xx<1T7skVYC(udt1sQ~9f0JMipGMUp=IBKKHxD&1QXif@v8agj?f zzkl6kve$e8|4sJ{(2<`;KKHF8-yQ78<)i+rMBHmUnw$WuCJ(ZEZB0qb&Mw%z#)gjO zEoHYa6Xjk`j$=wvB#D_$0`i{)=}-R|z9 z!&`4KC5NZ+cO0F{z2`hc1C+h^Qd>m1UM|AiAr~*M^yF!Lv+O_gnVdhD|NJKZOoIyF z;`U3THfq3)Kb6ak)riJLJJMk8mj*bMlLN1&enr>qeDGeHhv_ofa6<7C47*mby7OhA zVewJ6*)a%ruW^CUIkl+vU=nnWw4>`x57e1oj6%H`Wg`-eV0Skd(xu0t#7+ZGDBOV1 zhUd(|i3s8cTEH=M0iIEJgNDhLP-E-J9&j1OukEi;_H!xr+~Z@}>1Y_M@`ajg(HB`RdP*@GA{kCGueE)@^vI%M>4FKZJqxe5NnG5mNVOf<;d| zdN!;9p*VAs^`4vf&9#(oPVRF<%T2f!@;H3s4-@#Y@vFJx79u#L8wK3MwoH?^I$8?5 z!Mzq9s9WZtP=_j8HFpEPe!8AYXiD-&ugG(aKlPF2yOQa~kxjUZ|BZ;he~2S8#iZ=; zYkEAXp7h#^aMe`A>7nE+l(C7T>-6P`(U${sV&@e8XZ2Qwo8&~!9x-7r*y)ll7E*9r zUC@g!xP0z=D=Ky9A~DmQ$#>S%q9693B>yE$;7-ivk&3DFs9WFw@5lBXpvk*PlK))> zZ{H-8TUCT%Bcen_S_=A(&OrCPG1m9QPgC)P99%QEgXbB|V9TN9=qz;*`fLE!y{?0@ znaS|Sb^(mG-^MdLccF)U8N6C~o))XlBGXGJaRnU(c78&98#0K^we`f~v{L!+@C>v! 
zeh-p;{ix^p1Qga@2lrY7YFibJKL3S5-{fc}I}+;0GRHcA(L> zOW5HU0{ym;kohtevPY(a$Spo*H(h08z8+ya?gpU6!ek1W)?{|(9TL?um9&VzqAr)` z)6dT?6S47sc-WMS5z9ZZoH1Wq6n%rNZM#mqq)kw_u9o@flF9H+%pjqM_rgm`u)VGc ze~2gIcw`!S7Z_pcd<`<;V=hLF@L9FvcC1yd2AX^*hVZLb!FByHC=|_Obj8lo;1V@* zB(;q^6&GeAXIIiUmHPDMy?C-IPmb1mZUK{^PSOBYtj3mjhE=I1=Tx{fTIV@veI;1> zBmvA`y<&oHKEZ->H`oBPJn&F@h?X-N*|~e>5!ZlmHY)cyj)u+$wV-@nTPC8)m3KH^ z8jMm}x7f$Wljv?O9bo=zAS#^^?6zYHe52Lhu)!;sjOLs{*OfwassV@Uu6qLx|K74~ ziT0?RFp6iKDGYC4z?|?BL+gWofvf(96XVI_i7%}|{h~ZH=O@B`+kc?6Z3Qj9c!}em zdJm_FP9rA=ejxNmBUk$fovHW&yw!5>QJF67iWwo%2?@9@(v+6%s-`wU_dwl3oKC(L zMor3H(DB-IU{9&z`r)hiH0Cu*$UOqb#GANldltV(S)c!Nzae+!@j3h*GGAc5dk~r8 znoCMt!@12$F8mFlUzi|n7*SosC(C`RNz8`{z!r z&$+=WrPgV=Gl|8nj|e)i$#DJxqE{*cjw`0a zjZaJX3oqTF+UF(d_SmJ|ikpkb!>|WLM&E{ru<|CU;s*RHshg<&&AB+$bvk#mo(0)_ zK8#HL`40N-`>|;|8*oVdJ?3pW20pw2P?S$cr{O-V$vVg`RX4=*I=*<4_Y3`>Jp!$k z6_A(r12Veb;0C4Z==+5ON;`PWs*ulU%D;zIS-Q}&r3EKm$^h**jo4@Fg4XU?&?MRe zLssKZCvg$#+TUQpmT0!yBnj+J@uAIT5dTymHW-#Mg~d&b(33}HO~^vo-y&2Acny8Q z(&*~f3>}NUaOCH_ftGijjLFn->^xKpcb>OEjrV1!Ib#RwRxL+2#V=4lT^U?geS)p+ zSLn=&W^$RVB`}##)N+x~z_rI3A%-%Irwd#_MSNasu5jmclZE zGp2o8F!lfXneM8rCWUB6j%deo`D=Ibr$$WR>wi2%dTnBHC?%ca{W=RJ6>a&;nhy}2 zXhW`hSrCa_YRWC08^G5uy9Y1E2FbHX4LU_)ADyhy$(&m=op?(6k<+{u*5_X*wv$la z=}0bV@ouA=cS!MHB#2`#m4T5fgG8z3BZ)tEi6#^T;H+H0fX(e#V^hjV*(|^)$9!1x zSAfxnkAc(5I_kPhifr6{m$+P*Oca)OlWw>F_*e5!!qr%J1idqKg`OKNnG=k|U7Jbf zdJ(cPtppys%%fRdOQ2D|8XlbNr3dcl@>T!$-*~u;99lX=%(N7_pGXGnxONJzmMK7X zLOg36I-hm5kAv z)fF_Wc?UhuH)2fQ93fi67sz@YK0bMGOM~1}LE^6*nZD;YHIXysZyPM)ecUWZqP%6e zpYJc`x~*GF%^mq5>rsH)M=yi^r4ZCTv4d$SG=SGT%P=#i6~5mIp%c$J!2-!9Vy9Dr z7ao_Bd|!S3#-2^!Hrxb8Gp0~|wfQvH-55s2 zSkIfTaBVw+NkTq8y{d!>?@}@D;vrBweVb{1w-Ci^%fMq`7&pxM3Q0?I;Z*!V%)xSy ze;dgbFI5K4;8yU<8$q6O3wjvtL8tudShINsu6*(Gy`Rf$T5|_Z ztWO77{A9aVh_dPt9MDhr2jJ%~z}hUTUFgC%r&u=PWj18rZ)5n!uQD}8 z`j}Gi6r|)*p?5?N)s?m3YULp6KD-C+?V+Gn`h(L|U5Vl^#+hrApMlW2wzA>{Gx2Cy z1}^RBM)Al;(2;u*`nD~HGx5@RDj^dr7hQ!GQ$v)!*}x2@=HT9!2XUe)3&uC}Ss|N; zrgu=VGny2Er+6T^Y#qWy>j%-<>^SKEcNO*8&)Nt){bvrUppU# 
zLaNKtH*0{>&vbN8JOftOVxj%sIDDwH!QMTk(5EL4LOb4=y45}e-)tjhs#GHQ3TK!~ z{dbAA@AdH*^8A<~+p# zvBxmr{Q^h0_d$DwB=F~mvGTtekQD-EQ$`mS`t&k|?{XNSmRM%^`*n0XD8_TwkB6A$ zW+*c984MN;puY1TwA2{Jpqm{yFl8=?gEDjv%z?W)@hG)C30?QfK>x)^kajEpzVBSt zT5J?r=cHi6zU#b{DNDdmqn-Ub!xlTX4luiqEU%R$R^JM zPTC!OH0v#{tEvNERWmm4oZy*C7D66dh{pEqpq6(GWA2~8saoMEG;7k9O$IAV~(Wws^P^d<3>Imd*%LVnbHRv1Q z%6a(ZB??(sK(YUFy!EaYPfe4?_x))YTJ)B=SbG#dxEF)v7adgTQh}t|PI&6!CG<=v z0`-55XkEFI)p~oMJ@L&43|}Bx+C{OU5yxSM$xYbgk_~tA&Y{}*ZLnp(7xDu3gElBa z(ya@yp_7M}jo*R4+LI@AYcnJMG8oPKV^DP4dQ58n1Sv7+nanLmQ0RsRTl01eY`#~C zq1z3hW3>~Q9)CeU?O4bU8s7q6XTF6MYro;r?`!DqjfUKBsz-?IWm!-Q)`gUb&79q{ z-h;oz1{9g`0fa0Uvn4U1@bqsl$rjSXAK+^=7_ z_ukwjd2ibJedb!+7giqpahZ5p@lBEMv`C8myr~Uu&55LKE%hLIduQ1x!;|!1h9vFR zZ-tDxso4EYnf5FZ<6oGxgIeCbLw0-h&^v3>N$W2a{)a(1?v2M+NWa`N{`GU;iCH#F z7R9gSAKP;t#;l*x>e6CrFiD8W^QEb#ZW2xW6GpDZ#IZk0cY>*B82iIk0wNxMAaDMa zkf-KV{P!kbsn&*S;?0>zHw~WTo{Q`y4=Y_-?gcyZ_h!Bb>hZf;(~0b_dEB#)_Hf%ypN5*K5N4@=7xFDQP;w&* zx7%LDb+R_-n;u`@cES|utM20?wOr)sO@fl|I?(?n3X;AYLq7Kar+=*}E_KWS(df6} zoWO^=mByIU)`$s(OMoYQ76xQYphj>m@lHn76g_AANE zhcaaEhb^?*ZZ1_`VoE?qh>q-YqM9{RDDOrVc_G;Q&N&zn7sF7huOLDrER|TT+Yxlx z2N$AWlmgbly;LY{h_gTGC9HYv&ki?OV2NO6g{t|YI5kHB8NwcWagtV8_lLXlx6&M?bbuY9Se>?@`fR}Snd+sNV2AVzsi9LTtWl)+3|ml#B$Y7D|4^^kmlQZC6U&127DvWX?(+( z4EJid9rTq7cx+h-tefVJPqXV$t*;IGPViY@V<%>L(;lj|){q2>w8FBS1l|lZ7PGt zkGHb?MsJ?dzK4+VX%#Gy%SE+2e>hSViLlV~E*??KgMBdxm^wuW<9zeMcZw26SKffJ zk_rGmC#XE&eh034{1`jVMsa4%8iTy9kFdr52YN1k&bZd{q3cv5?h6~mf%m;Yl+!V# zmSL;JR-mtYIIm>RG8lVN4<#Wc>_xqD+|qpjyW54KOtThJHEzR)gD=o=`c3R$ni=0P z9S{+&29=dd;3D%7oP(Y+=}j`QbnqlbG+E>RNdmr%bJwIn`ycbd@i~b9oD2u=T)@H| z8q8Bg9n47@2A@A|;JoM^8_~EE+J&yeS^5t`Z?`(qfsi~mO6*1KT% zH3K!g2Qk!J6C6j?AZ6`!W=qo>Sfkv{D_OY&TP&Y2Bd&|FH@F;Yl5!#cNC5Pneu(_! 
zB8F??%()zpi-oOs7+1#v#1$_TzH)Y=u6e zYWRoa24_6QF`{)b#O&M3XH8>3Ka7aa4gpd@630^;ok}1x<-Keb`7DkeG;^c z%CoXxRVT^!B(H9cz>0Z}Rs8yBI5*Y@;``ch35#fonFY>=gfalv-Hl;EVCVItyTJ&7T zCPN zF(!%bJD-E>pD!q#5eX44Enqm!9NI^#ffF~59eG^LJRA&1?-a74Z)!m(Wd=`W#0I+6 zN7&teT;RxoA`}Xq&T)R8#ng0s+N;emYxTr#{01+U*?>iQ12v)>#ub60MFgKw64XjTja#_l`I(Qg!Dl60-W1Df7%S0M_8Kp# zdI}DS4GDPnAl~dP@mT*#4IWn5<0hDjQ#Xr3WWznorEzGZITMl&*5Wbxk!>xX2wUEV z!O}DL;mmmt{Cy`F0@g{9G^-fMSTK=1be<3Ax&E-A%#t~F`4&o5Ph(R{v~lvxD%h71 zg`%?_!R|lexZ$lU+}5TT-}@W*C-d+luNu!$N%}V}6pSuTgnrp^Tp*N-B4b}!r5VS; zepv<%9uQ~A%gt=xQ3u$Qf1eezs)88x)fl_)JgOTl#)Bum;N`+%cIxz0todjUi776? ziNDPr-hL09Y*p~mN?#`GkT=uvJGR`m(uldgww&F2BONTl13CGhkAwENjRI`T!sB;m z*$t0Ratcm8gRuH^9FA<}ggmZ=6ABC9$%!OjI<8~unk+~cwPSY)SHN|84Emy-m_Czw z&Nx?$@U1T5A1Pr{zI7D$UY?;FX6)H3m=?VQ3^eZZwr&0h78)9+;TH?=tJ7r&=qW(e)wvwr)DI|g zbOoCnvJNXG#6f3c7#@PfbF2+_hEE{H4Fx#TaR$sAVxPNc%95+^UhQU?cEt^3Aw=h+tMsQbd0D$bxOhpf?8Z342YvuU5sHTcBg$H$NohnOtBQFX44v&Rp-K?$!-&o|D^({S_Al7b1m|$4l`DF|G}-I zXSiVgMl$R)gUa@OLQ%tw^rf~rE9I?<`$R@KPHr8nxPJ$%KbC^w@*&vG{ljZtc?}2r zPebn3PPlhJmXUn-1$z%D(sbi;=F^d**udn1R7NKBsMkQoy#>^;XAhh`HxC`JyoRLA z2>j4yMAHmsl3tEG?xR~6sk|~QycCOZnIh!2;}cvFDM!Anh%M{A^B-P_Jjl!p3FEB4 zx}S0Pb%q$5F`V(U7A*CIv5anHUGiQsDGU7}`_y_^!x_Xa@!MFLNO`&;*9x49jiKWo zhjBVI6|P-8!t@OJF)QOdVTDRR^VzwF=c>YCCyBn}I7wZE?AVKJ)Yc$o_KPT7U40Se znDOB^9YNJAOTb9Fg`HTADD~U}rtJ=dKbzlz+}%hFefXVCQS!%cD>ozUS^;ZhmScay zA2zJ30IrDc!V#S`2=lpxeRIZH&nut6ek>VRwm9K;yE1$+{V-fRr~vab=7Qb09!gv| z4|~3h;-R)T==-gN?P-pOjzPo^CBHbQ5_dA!Y}ernxBwy=S)i~@pt%d~qwP*Av%=;k zUj8r@C4;&l`|59w;*3ys;$XnO+O@0!*kZfu>Vs$l)Z_C;NS#y(^CbU*eC%uclM*M zl>!v+8HfKyDIhFRJ)Ac=?WjCklqR23Mu(U&5X;bjh`uyPvD#1M2WK%spSLryXBdt_ zpeT3=v?xiR=Sb^Hp}YDUN{C+L)$C2h8P(6(+UNDmv4M2-6gWXc*4faEsu9jFuMtL} z;UFH`ag>>Tf(HX48#NDk`{j={DBgGalmq z#lvFH=`=J{4!&1}GK`xH^?fA?Q@ekHQ+^_}r)GhV!!0;koX4&>9gg1|PJu5YL1tz7 z;z#og_Tj8M>?B)#xSw0jbes=>zlF~kkHvEYTJ{x8taRaQcsdzv^N-?Q%|FZuEor8H zeF{&wuN=?mU&hK`A2@p+HsZg#IZzNI&15MYKg zge+57Pki>9&_(rTxXffRRaYdN+qBX0trl)wxd?Njlh{e~sxdJ952HFI1{XD4 
zfH}2!xM9E^J2hT|c;_X&-x`OGs@Jf}UYNXN>lyL8EGi%GhhtYfpgUvgipP&u-#LjF?3^` z%0%$@vLJU9>~KTCT25B`b?~qL1x<^M@vp})j8q%*bX1)1P*5<0dmjXO`|0#wP!#5# z{mi)EF=skOf8z5wp}e>0i9Ch(g1t5EFzaxAzUl4?Ez%`rK)L02S+Qy}NONsuT~5@o zdz0>grIjD7vz5Tz)^Az0&U6UxNMP4@K1c6_YD|rc$6d|2xc+M^Djj-CHD=AFVnH9l zaLHO+ad6S8(&NRYtjA71w_r>B5xo||k0oP7$Wm-=bz>3>{Xy?(nX!7_v z8^3i4IcCUXIg2hqmFhoq`lyFuiX${xz)Ncj^la5_X?S77Eyg~10#E&ZA}adTl zxd;uIFr2cql{0Ih9^^ZS(aZ1W!3zul|KIJ{CeO0-C+^4F8v|ie&_P&_7oha`AdGFD zM&9kdfRA(C8Ha}DsF@{6_dRh$wRHk->4M2@z{}G_SoaFMcX1FTsy3j3@K5j#t%mP^ zR>PYGtKeHv0=~5FV}Ep~v0f_v(ET_C*bS$^tcpRK6=S#{x{G>Y4bPoEhs$U&UJgiO zwDKd$ce1Zo({tLe$xj`)v6V2^rH^vk#aU@12Mj%(fg7mbgz^G)9}MiGlK@p3scVZ|PHe5C_U5ZVs`ljmaX{VPZhXkgBZ z4lLJDV%zy9=%aoNzQ_tu<+HP4?wqp_{-J}HQE5l?*6O30Z5AZVU&$0IzlP7#cEiD2 zpLr8S&9SCUkv7lnW{OT%nf8Bu0$oy<@&3FJ@R*e$;4;s%qDhBPb5}Ixh~y zM3+IhjV9T4#tjcD48Xnv&Ui>ql8U|*gF}}dz#&t>Z-Xktbta$fAL4Ts$v(%;kB3MVfz9E6%*C!7Zup5(i%i!%OGj{)0 zajbE02V?DQ+BSoNzWE$+44J1-;kkFj0n{z4sZm{uQU6 zZd|}xxJZ1eUkkK+75E)E1HR7F!Dx>fSgzH~*qfE%MftyAQ&^3)3xn8w!3OxnUy?}A z%VYmLdWrovI*u#j8i17&AzKFmk;ICE{o+>EcupC5eYIkn|EocTZKdpfc#Ic3BjN6c zg=l5-h?j6C4TGX*(J5Bb>G2LFf@e=bMw zLcpor%=&&+cC*7e82dXN-}(%)=LEZAn2Hj86PCs%_XOhZ<|n}OvSuH~U4`e%gCRb7 zE~Ec%J@sLS!TV$*tgp=irQjY$Mp`yr}2s9~{%7>=F~WNz;_ zfIZF@;3}VkoxA2TMKKAmHo=CKx}?u~_B2uML3($&vridWK6@1(eGy$qjbFtEM>}IipE^%7%Apl5sAQ0Sw|rXZiCjoURHD8 zG+@67aOuf0xTNEW>+5PEMO+s}pxe+@A9+5q$q?QJzx{6>Z;BJFOees>xa$I!!1bgnEDtIb)W4XNo)5BLM z?G==Vms?P5C>-ax72uUAxlI4xP%zkW4b=}#fYNUXctHFbqj=O8WJgYc?uiGC<;n|i zdI1lA71lv&donZG_C0Pj(4reYu7rO@;jH59WT;%SM}W`kc~hr&VOPdfX5S<`%z4_v z1pc0i*BmaxvEy;#~zqla% zA@g;o9Zc6TBc=KcaNBbbC-{q(c4=IOGuH-jpU(wmiq8gmLUbZM7Pg31McC0z*`LtM z>jp8~NXWQZ3e{#bV9Ach`0L{xwr`R@My+wjS7sw9EfxXmzT7CgC(s_op6Zf03p)iE zAP|I&rFrQgsdV!J6?#+dKk_$Plq^113&+k(q0zTp%EjaJL%T%_31N;&V7+7pU@1~A0CdX$H z{mn_>w>AJbai(C~#6?&WB}{X>1+!#t9_)M+h0=-cFu}P36z8e34&f4L*qef_NedXy zUANe*zXx%Rv@W^)(-s^TgfUsG6YzGIB%PsDi{^cLEZHOte^)6oY8R$q=H*DdDagF7 zb=b?nPL}=iZk>sC^L9ovBMLG-USd*1D@NWO!W);Y&`xd*e5yVR_t;TbJ}AgDdHlt7 
zTXV1=@iiVyvBTEt36yQR54L?$%!+5>yc+i%uwh7nj)_YU+WrmBylG@}Cf;CmW%yXE zvz?fiW?@g~Ra{w?jb8<`^#@mq)Eto`v#uGkKMdP&qLBy2cMXbd<8j0^ zgUcP(UxL23?L?up7|Q-TkD}Zza`i(HcdqeV{%z?l8gE%Zr~5462A#h^yA$;w%ubQm zly;eH{5+Mr`qElrm$-vET=4ruhJ zgOMx`gBOj`jIUojCRLV!)@~z6T#x`Z1zs@a=2Y@}-V|6M{u(Dm3QSv%`ceO96J+tv zLhkChvBnP=h<+=$LgMnoXVv?597`ufze?H)Z1UH+i^vK{RWhaRnwv$>YWwSxmd< zd2C(W3D4X`sZ-oN*rH=@8g3>+dKW(u@OVSaZ|4a#l%0z|7dtS#C6R2Xo*tY~m4$_M zU$J?@bZpwI4k8P?AuMJUH3=O8`zQDCM#=#5t>tOCa7GU#FT2KbHn1Wls=x43Z9coK zgi8kQ&8EBeePzTyhL9ssS#dAlKt)zd;SPRkEMl>@vZ5k zAvl~oTjM}NgGb4c(PS#;XG_?u52$C8&!}CGhii?wwAWTvASvlld9@K-A$p76>Bu0D z4nLrJx~It06{Up{6Ue^2Xu9B=1wB1u7mc45#S>}^q2rButbxM- zB-ACc6Lc)d_(^3bbXiRXQviC+TyRGuu-u{&==Wa9Rvyu$mHpYAj`|>W#C#c^U)PG! z6~*|x<3Fm`5COBcUx)i{=OCb7h|XUfMmk@s(*DLp{P)}rc-tsN4>7xG$2SeC>%EOl zGPp>bC)+~a@=~6LbQPl@$VxguD*0jN$KGuUrPc1Pa5`9<*<-m0SLIHkzYZLuw`vUG z3t7Mys}rNLDPPM)MNY!XkPx!`%O<$`@Etzj6~km2Xw+)*A^T} zT~2+xM5)=L2ZU#*%AAbb#a^3mfo`#z0OpZ~=zQWHXg%G{@HgH>4+%Zmf7A@;?7vNR z3|++d;Xs@z$mq<^S0hQq0T6Ubo&M=QPfxc7;nj5j@mk1Dtpbr|NyatL7Bsf=VEeKd-e2orVzeX+9p!(qi>0;j*m66%x`v?{ zRz^g+>NRX?i6%iok}#6A3f~8)VBT6e`tfTIlXgau7Vj_wr?1oL>v%yw&q;ta!E&f~ z)rs}Oc|2M*8B;6yyxq&y3Ew#p-%Syr>xSD==Hq1Ek|Sk|*NsO^<%t>8rN;q(559tD zHYs3mJrpfNoynQaw&Z-O6ScjvinFA@1S$;Qvy0w5fmPD4*`-CF!Oq>0DSyvG7dIaa z53C|RD~zb$nJ#?XZNW@Y{eedtg=jac!b@B?%Cwyh!~bb=|I^yW4XmJewj5nv^%1jU zZox7;Jy>VBhHl-HS$!fo4b;M9#DKa4|^NY8|Se$(38Ib>JdTRe)$6gh~tC9)~F6*%a5S^Z5hZP_zzmd{$qU~im-oI9|u4A zZ_Em{B-k_i35bbBFjqEDBd&IG99w=Moa`_K&*bp(11frWcU== z=fqs^*@P!c;^6m7HIVXbheLS}n8|0;;K)o(d}bEF#CzAm{eLq+Tg4MzScbs&wU3#h z%0ga1@gV!l;ve(j#SK*4`UmQIucDh^uI26TWq&Obre0NMm>qtI*>l&Js5bE+PDbFB zEqX?VZeK^M`T6MSSI0h((;*%L?R%T%X%sq?%|3eNgIC&xu}HEV9vKSW(eGT|&m$WJ zndx}&IV=em&n?FXF&vz>@hKYO0<7t3X2-4nuYxmw#Byt+FqN5d%q4OVB~hsq@4okg zQYu2FlcbU2bP7c!Nv3F!h%zUkl2ReVyYB}jl2jC_G)qc@DJ1p1-~Vv`u=lmrwVYtD z_E}Uciv$aW&zPp00S(R8Eb&M-7%go8-zax_+fomD=O{xKk>Ogr;^-GCao(<5l1-j=r0 zq_lLHs3As|rKf=PlOk3vcOTT09)Z(T9{W zPJ;xmJ%_uF`M?8_f8bJc2FLo!5ZA*hxbsT9V9wl|_{UY5Mo#pCnzgfVAR&dEX}yjs 
zXCw;;1@g2m(H`%2UZ#<~dtprL2Rvh4;*P%~*W`O&IPRTBkRB4{YLg14kN{|dy~ zhoiA<1G7+UMCsfjJhr|Cj~lf>gM>7{&F?WAsENcweeYRVj3IE-MSNWNiTI6Q3EMuq zvb;UH7#`gx*wB$CNd03*1A0~q^RKPOl;V17R678>#iv1P(;1lMEW=CcCW*4kJk*=& z3u!;EfPTet;f)P9!Tnt*uC#o}bTo1R9!Y>NxkY}|tilX86F54{s@#4-0Vt>}Mv1^L z_qKASu+Fd{}RCA3aC4l7=TvN|yeYeVIk*ojeGr}Zo< z>+c3#@ma9Y-A0(>?f`2ah7+kh&mg3;7d|IzA(Os`)B0>g-RT&(_;xBCy*ma}12ULX z<}^J`eZ2%-6FH{;638zE-fLr8t^%y;WNBS9@I!2ZGv&b{ge|IFrZ zUPaBqYI=dJ6?zZRDLHq+wfs+(=4j&^&n?#FWaS-FGyS*DN~zc0|^ zGRANycoS>Q_zt<|4p`FJ`So_9HaAIkeU=8~Ma7JmPM9+wt#a#I+^R=x0N0gWW$pHRepBU6(VJuLekPD`OTBX zp}BS?m;GCYzhZQXI-aWmrzwgsGIto$7kR>(O+8R4+B9*R5`mWrH}L(%N&K2oks{Yk zoSU7K1*yrVtT;{pIinYVh5JF+>70!BMeJ}(eG%$DOvk9JJ0RiOFz7KiZdqyqy%sng z-t0-F2|vBbjpr9Qx$P00;?P?rNZAbE`n8y~TR7^aM`F*O7VZ+8M*=rEvK9KrurgfK zqigCXr)|c-n<;XjKDwO7Ow44SViFi`>q|iZ)n>6an&7m{-UukmGM#AGuJ9*l>+Cui}X4Uf;m(&3{=HCk7R z?m_hnI(5c?eNGN`RH(wNJayjB?HHdhr~#)&Eb!JGdPKVrY@yJqu;XD&!f@6#h1Q2qGV-YH$q}I*rACRgB67&GfcSQJW%mmO&-a3HWHc4<>Kt6KO^cm)F2l)xP8NLgu%RKH29WW5F?sYc7nIl7(Z{mxc+Kf9 z8%&$WTR6UC&XH?4yKj*o^YIQl@V6y@CRmfXYS{?xOb*1St0b|)qyn}CE`~>e7r|h> zHWbAwqxtV*CjRObTxjSZ6JjGE*dPPP{3Yr(?##o!utxO1^aHqzSUlRVfD#8k6PMVR z;2;?aaXX%3oU{*Gc$uSaP!@|>+KDfATA*)&6%u8 zw-iRvr{bN^sP}@px;KzEPjkM+_B*&N-HEf*2VujEGNNr}im!&2qr0{eGm0w(b@3I{ z(?R5zb|mBFgIOeY(M))`#+PjKKS&2T9k}Ce0rzTD_~(P)KvkL(#I4p94ys9^Qj{Jy zF4zPOx_D;$Q3>qp6oq{TIWT|I0ijC2CR*H4phdBA_(n$LM$9v$>(4)+t*IB#VPyig z&CUe1fD~b@%aDjkT_?G_oT);SxX7;@MQZnj!RrU-f%u&!8T^1U+4CT5;c+^d#DGn>8@zp;gm#h1WX@h`7^QyzT9u->6`Fseo~d3n8i>RXW7w?e05BiHcS#5|aI$wMTLP=lr9wRH* zJ-Lg_%df=5WOW$rnFcAa0Mf>+qNi%jschX6umeAOJW!UoDt#_@^vEw=6h8w0*%)JA zVhT7usKa+3&EQFtKKSlqFz79dOPf33u2C&59W3OYIA`LY`O4t;Dh3*)qcHep1FTmj zq(E&1wv_FLLuNJDDiw)7TS`zSPLB;`WROO!lWgo|dwg_z3E86A%Vs+5VTI?CA!gEh zxE*$cZGRj=+LN~7?IuH(lez;2pU9$AuOgOs@%Yo|Fq+GS!h#=X$@1bXDEEIyN=n5o zR0>aHOzkfb8jQv08ye`o{v^Ji9wKnPHy^^5KO}bdG`RiY)ikS2f%y#|!dgjB=uWDE ze?K=0{0q(kXLbhk=AFQTxo1(o%L+qxUB=9nnq-g2!5nA|At@ieb>S4vJDtI0d3Q=)yU{J~fbUQuZS+yEQ 
z9^Q!t@k_{f^%SZcA4B!$rQnWI6)L}JB7FTf9}u+&fQ)yn=B~_-c3&lR)-Ib zXQuCxEbhslE(rk@jUD^@+eXACn z7N!tKsRp9|Mh>lVdPtXtBGFQ;!dowba3nxpusOGoBm`(BngUWvN?RD*1IdPH>M2?Okn(mhbtdcq#5ej-xaI1*i=Le0+hkqL6YAXe)H zHSUW-rE`mz?y(+PeES3Up)D1SLR(qb@XyE^JO*ZbpC<)_%yb-jEc2EjAAAUe0v8sw_L2~tS{jw%4zf2tK_XV6enOa-Oc{b z9RAM&va(N;!k{7CE^+|hoEM{?2E_QU1Jih=nd;>C@%{XdJq#m4f72`5b5V=G%T$cy zd3&ARP%SM+%j#E?wr&X;p5za2L_N7NO9#mGXi=9(;VN6btP5sLs0YKdX1LJa23{!F Za!K-gNu#NwWzgJOmi4C=D@88t{{T!WVDJC{ literal 0 HcmV?d00001 diff --git a/src/autoschedulers/anderson2021/weights/unsharp.weights b/src/autoschedulers/anderson2021/weights/unsharp.weights new file mode 100644 index 0000000000000000000000000000000000000000..f35dc3f33cf9e4883aea7e46c0e8e9314be80ed2 GIT binary patch literal 20400 zcmXV%c~njB_s7#DRGL*nR3sru8t&Q8m5hZDMIrPt3x&)YP$uwHhO_C|_2r)Vt)%mPm?WIJ*I4 zx0j%sdo4(?zZn^!0j9Mk6HiyQLRrm1DEL)?+u0JB8Cwr#gIN%-Zi>penlKQ1pAojm zh1iwmtm-QdSbHjt5tl3kZ){=K&gy6Wl9Q;gv6NL7TLOtqPrud5d1+Px?2(r8`kp?#6ue^@}KOfur1 zohJudVgpENO)I&l^pKRBXrk-O+u$p2i&32}wCEGk`?`C`htFkLYq^4GJE!3EOCoT2 z;ZvGYxPUUQb7@lQJ0hgJ39DmTnWW>-A+u@@SGe1aUww5ax7l$$S8vUE{-=43M9VsZ zKYDpKS&(Lk6ZNa3M#d(p*p_xw+ILcsIfDYjgm=()Ro!#L=y`7!8 zZ#HUkf5hCSIt>@d=T~9;4-dBzwb?gmY%(%#S3H@vSGQmi_Z!w9A4HFxX_$NJJXGHO zf>WP=f%`tKG|%i8of?)!UUJ5uKWY*gJ|GG>Cc-al(xn!0&#BAzcy92We0*P+j;Vh> zQ=-ZzFaHE^zAx&>j-c;werOYE3`(Sv#=g@hs?+HHnYr}C;-e(<=M&8SFoMT)-?Qc~ zywON|47mTAIr-*SF}ytk-YL|xVWG7+Pt}dMmtUqcpKhe5?Vgj^-A9Q3)+n;? z_v~S!o4Xri{4E(SKY*QVy9DD6o?zR*Ay(<84W4ugV9q(pQuvw4?$0ZLydWofLNl3$ zM<$byFLTMIMWWoPYf|Z{AGNgYKpgjb8iTuSmQxX5IlfiqGBSJpPNJu+iVu`Olh&bG z+<-w1J||y|pTGSXRat1rzuc_MEz6sL)gF#$^yC!qvyB;^3!in#i$eu1MJ%%Ngz8!y zdj3CUs^EPGnx`J8(kp>(TUSFI;xjnTT0Lab<|sN(V~pl1`g2$Hu1A;N_4G-cI={a@ zkgS@g!5jH7gnn+(WbB#*cm4%6e#^3EdSpsFwG)!yds@xn4zLH&ZJT$oe9||xwJc=v zX7u5?1A36D6awlByV%a^4b*)0W}NT3m+1G;raP`Kr1@Fr37R_bgc{n&yJwHslF9ir zLwPngHSQ^mtCl0imbSclqvfQrYZ83=agEheBB0}21|F}z!mZEUzzH=2d--y7g!nZ$nBdzDE&vx@pTQ`};>is-j(#l*GOQR2WK>=oaMgBbi!4t

J$Y2iMFi!y&^=BBh;u>n(JZIL9CBq`k66k7XIq%jN<61daOeJ~{ z`YjavGe$t~l?O^jK48r=Z{vfJ7EnLd#3*mRjF~23&?4Rqx%0-b-|{o-A6EzAS2aM@ zW*AD8?&BTT9ti*8$J8ul5NeIlewGiKY$-(1oKALV*BDqvUI%M+X>j&*2kG=sv{!ru zGC!}g4}I^#obL?(W34RzWTPK9e5MKi=!?mGEB8g*|1KYamK;7~aJK{}zvrNWq8RW? zZ*c1LM|ro>+t9iAAR$WbeGhDe}4n%X??h`BT!#tMwgN zCF@3FwyJZPa50=3mqH>FBk8Q9GQ5%}NmE1aLdcG2R<*JbXV$p@yWfOb$}gerci%A& z4(OBbrz+{Mr$YRhs!CkQa^RnEeM*mcAEJjx&k?z^J7`w06<-YAk@C;J7#>=P$D(eM z`HVa_ZBite`gkU~_qsryRw5j%bHI0(H^Q^K+H~984fKUZJ<5Ik0gkI8!Q2>GtBv>Y zfPEP3UKR-oG!ulD6qqhwnt&b+*>qRdHS*nZ02EW?A%`f^^mt9KcTFr)_P7~49Cq>N z?wZQ2?BJtUvI)K0y@_l&CC{BMfSMGIi zcL~SEs|``MXAYw3ERuTv4I6Fd4{JLU87pmFc+EvjGJJsR4}ODl#(Y-cb{2f)N)nR? znef-pn#6ZWb2Tdo&Gay(H#bfpdKIaR8*u`oZz0U-!jqUgstt*Ae8KX-HxS16sJvu5 z>XLQzz>P=pgStPVE8@xXGlaN?fu9%es|6I{$ zYO^_n{!>w7#gA8jwX+y$*)#$#)~7;5<{rU|^AQTwpMk5yDw?$YHVt3+n)&?5nn{_d zg|1Vs;_F8-pmx8I)A!w;){A)I`Aaoe|16R>H}nzFeJsn@Tw6oFI9hX8^t~cinJfG_ zLoPR^%7ni+J&Hd;_As|;#|*BvMlko1f(F0XIE}6!4dJF4pWvU>=%WXm$1uHSJJ;#( zR&J1%A1>OT0CW24;6i#jG|PQK`+a;kYnX|NQ+45-QYZ}Bm$7>Drb6A~)9fqj>v+h@ z4!mbSMfDOfc>k*d?WZ5Wh4UVuP;XMvugQ&I`S1oLN{_=MOD#O77zf^U^^B7!g2eF_ zu=bvhmoy!qZjw1vSy;2j?MCoRM>EQNe2kw9_*it&4@S$}pek?z*k03MCx6bu*l-o7 zP3EHJwl-Fw@*{-*PJq_QnIN}kFJ9;|#fsE&7}&;V`V$)={#Y`s{M>;j>#RU1*vzDH z=oTLpUhuKElKW@#b+SD`l}6MaC?R-vt=6ZEeK*z&p>il*O#F^l;y(vgRk z{yT^VHWoorRS8b}BMNhH3puzV9R9o#;IW!#O_)Q+i#E2 zJu+0&(SpWI(gmq;XH<^y=4qcSMd!BxxR*2H4c|maY;%DY={v?v(*g;Eg)?((@$#C)!W&8D% z>Dn|x`(p@dXv`zVvQJ>+vfKD%i6mj1USsikQSz;79lJ@b9G4|%qUKU5EIsDJjtwO- zZH|vYq&owBe(_m#S4(zvh8CKgXKtgF$D|f(EbFBZif! 
z&Ov?fbr^%d7ysC6$?A~(@gdlIjT$$-9;jc>=Cvgws@?6x@y9n%Qag*afASZj zl2b5QDi2O99#6us3ia=2q8UE|j#>Nz!%=M-G9wf(&Md>5hn&b2jX}J1 zC61k5VL?BKYy^?vxilmz65opNW9z5NF&DmC@KiTOqmYd*sr*rnh6>U2O=AO3x2yDfJi==1hXWS@8c%uppO3?l9V4-jmo4HR4u&0>{l3QQ;3e_}yZuw5HG< z6J~7ZhMp7U?nt;mRs~Ndo?CWM@sTb3*DI{)POA>nu~!_fYM%&q)(sgVH{1#Rh3;&^ ze|0#h`4%&GpMfj90Z>v%M4O?0tV(fW4K)q$_Chy2&-;xYFDgL0WesFz{)D7YZ*aSE z47&AkKzTop+2HjNP5A{_mZAqOyIW8+GzoOyG-AJn9j{~gE55bF$znA`_VjL`Fnq9$aa=Af)haeRN`Ddg9;K-JkWsJdhcTQ;mh2cJQ5r{v*!p`|!?Mh>j?tzaJ6d|*m`r(#In z7%k!HAYZx@%u5E@&YvvRsVN{M`{(0`!!lwX6oMfhJT}tx0O7uCq$ZE8nbQ2DRH*ba z@jtzlEHGHbJ-MNWs)_ICued0KrrU1N*9PuPtz#+h%-8cRz9*1CjZNgFl>|5Y%^+F$ zHh@1(Gz$VoJ(yF%A0XuK9y%$q6P!-2Ad99KfzIS|dgAFMDx74*(TR9~N;eKueXTnB zx%m#MW}`W`)F|PZ*^`N$Av8S46K7@sdhQZv)b)=U$@TN`j&(Lz{S|2Rp)*j}evw|h zl17A0+euk(HzQKMfJ{8Ojn7`@LmHa1_q&9lF4B%Vwxlzfd#{sL@8vl5^d@ymHl{|I zI*>lNmin)_OHB_<=cn8E5Tk`= zXPbiiB_%TJpB10;=m&G`%_*GA{K3SfQqb?T1-LJ!@%lw(K-K*_?8!8Hh~9@_5}u8< zQOX$J8IQq%CqVsT9`mYG2PK{s!GVDx+&=pYL>gwmg%Br9$6`=u_hla#ssQKv9&pe6 zg*=rOJYaAbZL(vqYL_amtN4hQ%p|~h%`AL6){p9M=P*01^5NXJM36E12tw((tk1$> z_R|`1R?~+A`r-co+&y5?6&BS4dN9}~i1m4y3aN!{4Bs`1samd&u{pIMDH{*Hzm}k; ziY`Qzen-9X0&wi`2KC23IX#u7DA6#^L{EAFLR;F39?YMPr;C!%ua9AkZ8p8Q;`sL@Gm=OT@_kI4Dzb}_(JB}Lm(lY&Q^s-px!QBR{f?h^Jrig zt_9qLz`C~}bS=M6x%fT&mIU^?-0d5nLOeGhDvKv3SgGq-W9tjCR z=jcD^CZ@#H6z#yTF82X;4ZwSyWDt@F;2p7x2g|_#NSR%W0~4RYz}W`;#VrKgHB*2; zdm^jwhXENOV0I?;VD6P(Cbu)45o!ryhJM7LgVRKwqkahZuUdh^zdpj!MFXgBJBH?3 zLwG%{3kRg;fCQ+(r-9k+(DK32?Z$1P>m;5K6g zn|W36OpkORlg&k=jUAw#c?SIp&!L?5Efg9GGkxz{i9)j-Ik0F5Qp>9t+pWT&&VSAN zN2S3~g#mJ14LHHmMR7FXFEZ5ir-b%gLP;Mc6IoIflo9J9t(wQ>xbY;=>Vqt zUc{t5K`8N01=|k&Va=>x!Qa7fa7>jDe18vi9lF7^G|RC&!#=~#-D#jp0`OLAI%?LQ zhd#G_L`Wdd(f`_Crn>AJ8VS5t<$2>wU4uh&7NcqzL6 zx`$T1vlzFK$>2u+76-}x!x6c11{5oi8~+P3_hx|RrsQqSvGi~JQPoc-v`ZqjkvaS9jo15$e#Ol z1q>Px%`NY+-af9NYLW&!?NT8>^D3&}-V3{rxggK;80dl$L}mrR_V+wAZ~OuLO(%Im zS-TjCmp9Ptt3QhG-G-4JA0XEMHj}*D8HM7s*s69b*i}%9-usq9*CreIVlKwtD=y97 z9DSJH{Om(yW1q7kvtEE{lc0}~4*{7l8Bo8W2eG2BIEQAw1&@{6QCRgI2$?TtANhNO 
z->xZi!(3Uqx#K*~^YbxCR38AjqkE~}nVY;T7sp{~vjR!aG{nHMz5JQaM?ia$7**4u zuyNKTl-FNJha_L(HQjCW){F~`O2%TwQ%!@WdGz6x)z9H*SsZUkqyj&z{0H^($)Tch zF~nc%B!89O5F0SmPoMwr&WZZUw z-WoS<)T=I}~=^*ac4z}TZs9tA;>1~Y|p1TZq6E4HRR1>HY z>`T0hqF{Mz1yoer0khc@I!X>62-IN-EU7DuV@e>zS=xVyKgUmHphTK+nyu;Rv-Z zDK;osL8B$_P^p+a_Dte0I59ktT-a)hHligYQCXeuoheI>ztQKks?(a0OsW&Vl>3a$0+SRsObH#LdL93mvBUq7Pm7<^Jl*S9(O636zm%g-qxRBm z->hgrW(0{0D~GuX7rD-gt#p4PrKi<>X<_^kx>G}nJ3nRzakG0ug>DXVjzzu%tJfav zP~A#=BsjCY)!k5nnj!yE9qS(0jS(Szu&X2npWj(a)(6Ys#<)~!X6y;08BT0=+%vlG z<_`S+WeqW(PMLO1Ntz$4LFM!-nQfX9)b)u!`H*NxH(mHZU41nf55p($?_Vv+`lv@d zUroo*FSl^=jzQ2}nFi{6(m5$TWmrs`uth;1?g(_p@jr4@$1RwB_j3nI%bJrJ4cC}C zZ@g&cZaHfEUn{(_;n7{EgwasyA#t2?4L&ZMN+k`;@Q(X5_`-2nz>*C89c1&2I23ZxW37~8(LP~_UAyfcGqRa7LS3JYlg(SfW0&A8F%Y6R z8|CQE%dTieI^n&80b2h`p($ZR?{D<7Oz?I3&Y zB&ZS&@h^(lagSLom!J@YzB&qItfKnPy%9p4)B7<%A~MQu&*G8!3?%LwZfIc$GE7J4|+E)G5w#UVO3!t zK3KZ}xEDh>UCY06hD0sE-GBI5TIC zLS|1l?B4hjPcE)!?4RK_F8Ow%2Q!#HOiW z`29TkG_A#BVgkR6^U$PD_aD<>T@Mn&lfWrI0CV?iF||qyF+Fk!u8g&T?V?WBr}01N z5Q>4z^dELT9%Wj;^n(4ZV&r_)hPC%4Al2FwwyeH|Wd|}*;?*c4{dg}jpPu8{y3dfk zkOz*oe<8f?3p8(djxDETv3kk?&iOe6KP3JjZ_h(8_>zQLXTPKO*|}gnq7Jc}W0>7d zZ@^0B6YtTwW!Pf=ocU$H7<+FNV^w4ZWIKC8@5OTD-@DInO>8+~o*9_in$Or<=OC_W zfQhj+=>72oYmha}-gM5wuNtiwJogm1C46R;j|=ut;T&`uvgTMlS%+Qs|3Lpr5A5%J z1c$n+83Xk$>>57_>FZMjK4cHmpK%jclNiXIc?rz-hC%p@EI6calkHOc%W)j6h1kRn zobH{9vVPSVnbZy(q1SBwm0-xWyMz)7In3xq!J5nKhbEqy+l3E(ZUjdzftT zaQw33CYBA$z`5TE%-Dn!{Pg%2Bu~qN-TsC+`a=s(6nw$8XX}``=E11*wFe%kd$PRu zBQQ4G91Wd9Fm+A|i#v+Iq3Z%uwPqe$R`39u^v&>GToDsj@C12Z=0aIBf|WI&*E#x}v&Nx6;b3ni(0vC7(RO1bw2UaQGJPA^wXd@T_nm}3Db?6; zbuKbptNKKUip^+ht78~T}#I~jx>+Wo-h&p1wM+tkf2X@5%^K9 zGdL~2H{r&yHW-LNcrNXR{62w4*dD~jmPUZ6i$AFQ&0*FjDWFwBEEG+MLA_21*eu+Q zwvm}2`FkN~H$8`w$Mbln=V&1}XCmkHrD~Q_kpm9=LAGIZ3OfE*4>DtYD4paBK6Wi& zAa4d8BbC4j9>=asE+(J40jKkGS@AbdK`2(0r}k?-eA4{I9vZU)=i~QL=*BdTZG8$; z^=TA^jE0MTsm7w7^)Q2+8^~9FUtHgii$~Ji(dV%(hIfgubq`j;>R=OGy~7tuHa>w+ zgDA*(-H!3wx*-3U8761Eh6w@&=6^iQrtKR*`gtnp 
zeo>3klD(MZm5F)VLcnt$16FO*sFq70CiO1IhvYIeHAbV7{Wadsw@-_{XbAk_#cNRG z`6C$4T*HeQkHeN_D)`#H4}(5R;@>}UIOg6DX&YVOqI(*<`9&i58SlZ@XJI0{k2A2rKm-q-S;3fkPlCGyNH$X5!iMJ9slzm~vynIFk2_*z1`h z@F?^Q(_Iq=gEpVp**YSyd|5h9lPX}f5BlMn%FFB&KN~?tQ-d#}%(43U6gbjV3JLM* zASIg0dpp_BkoapBZXv|FkH$F*bNyU|8eYuyVgCyC+Ri3%jOZvyGF%*V_9 zCG4o}7smAFZyYrF3`=giqeof>=sV=X%Ap_-aXW!QjasIm$(hi7wg-g+j4?yW0L4tJ z!8`Cf{+0^CYTZnRyI!#0{hJIXGv-23L?rCoDTzUA7&I<3Xa5xl(|Wr`-lbbN@a2Fu z#59)iuKY`8Hf8MsD-BQdi9e1`^%3~2dk96X>+sn}PmZF~B5J%<1Osf+;PmezXfs|x zZ}u4ynZ=SMS-M={-SSM9=I@4E19vz&S^jV&{2=`v-HD%tF3`lk5$pnmX)teJ0&F|3 zPH%rc3ST>_LG)t~nA_*DY1#v5qFl!uSapi&TpP(Q|Mrq~)+c1eFa*ji^%P zfX@Rf@xc^Nj!>8ky6t(#Xnie0*`If@B&>sZyfhn?1iiyD&TrVeOc>Uh&cprnYIJRS zGLwAbG=?v%2H%uqxHSDSHZ7K?Qql^7Tt*##F^?g8>lTc1pGW7IFCo9{pW~V7XeuRC zfiu-)&|R?zOrGDxqi2Okzpnz5A>@IpTwiiT#fLd(?VRb~`zv9N##Y?>%p6``D8s`yUZBWZ9en-90nWIw zm>n~ZslC1k4ybHE-MDm!9<7IwR%v?aeH8TW@d3%YyBIU{j9q!K3x;=GWppgP*#Q-1iiNGVaTN ziluyc>PY&~_slTx@7;h!twG>=q>_oMQN$@rj&q*wIgjUSCV`*wN65=O%50Rt$avy)b3xSd+84OYX$CMdIg5A>3tYV5IDAmV+%j^3LXWk^_ zHqT)W_~kPV?}eZtzZcc~8<>$)ZzeY@58uxmD?VeL1T&3RW4FnAXthkke~x<0cy3|&LaPZhui-J!e=Hl2 z^p`RD7l+PiO4Q+A4dX-8QT%2zlsuIqDaHlGGLvS)_zNeT4Hk^csRPhN?y`%`|FX_P z9n5#f!-bozhtWGt6ty(`@y2#Dcs*Mb%~npu*Ol_jwXF+is;mX9o1y_xA@_0N`gn+X ze-G#UXT{n@wZZ1mX#A$O2lD1LG5c4(K%vwUkXx~vd1YjZV=o8TjS=RUxr`6z^L&jy z^#wxYy?RuUi@<>O*EzFxe*yAKk(3)PVhS{`Kw3*PMxS(Gn^F=P^lb&nLnZJwL>rE5 zX~c>*%8+WSND5Xaam+$y;2(Eg7#v#)vTr^x&Kl34TJkeznu9%ZUtfn>>dib-{EC~K zr@-hxJLoL_!G4?i9)6m2z*tBzr~CI*?9BIr=2U&McgqOpy>v9QNofN+r_~2k+c<1{ z)^QL$^^=)CSC!sbdlRoNo++&yhv_(h{neH7B)bfwr z{U8$Kj~)kuJDHr3hr;O5+XPlEgF^kyEWRu-Z<8SYMk-^Ws)QhRYtR zZgm$-l|{$_{ccS3R>TuE_wmO5H?aQx1nQJK2_JshO-H}@!`|+{>`d!PL^@f7J*PJv zhn%%prvwdHa8MGyyP62@<0PoXb1+|LGP)QKz^w@?y!&-xl%vXLmKe$rpNe11q(Vg! 
zHM}3h?R3C%WD%CR^q`^OT|eCsi5l`ncciRNMS}D69)o+# z_#o>Q2fIUEFml-h8a%{8c5x^go8N@l7$RT+dzfeCt5LqhnAs@i4zqM0!Oct=x-v=+ z)k-IjMd2FcV;5piLp3j5wiBZR9${f}Eo}IG4wnj0481>VKs#2SB+ju!9{ON&XAC$j zPXm#sw{daQRI<9Vi5a~oLBCfwfVk5YFzR`SoT{}rbnp&)n07(=pG~Qm>^*7G@V-pG^ANF0ap!f#*{xCrEkJ$jWrgvNtPG}}s`bvQNj zTi9Y)9eW9lc156N&P0f8zs_5ykGSuN2=o@4!K0u^-jADAP?S0i!=m=Wz^EZe?VSp> zVn49Fw2wKzj)2l?%IE}%;^BrjoZ@>ok>|4+?OlVx=Z`03T?hiR4rT0j4aOPD(X4V` z0Ppkpc!o_mgm0EUhlrd&{CVpnv{?dv7p`W??eAkElY^~ht(*@(UjtXW9951^g@rat zV3a$6B`$XudyWAX+Jy@8h=)wt+?89oJ+e>$s`5 z8gw)?*;|S|$i1|IPFmM!N?Q5&{Y*1neWHv{&nJMBwIqJ83dPQ-TG(Zs4F<{mn~XBjFjNX93QYvF^!PoB<9Cydz`1YJtcnSXPS z!QI#h*44@cn(n2t%l})z@v_}Y&&*Q6mD2;EPQ2{IJO!W;qbw`Y){5R#==1YJEa$ptOcp~vmzIDmOV4gu{jEh z4VFXmmJ;}PHHZ0JC-6Fcm9pbw@?@TB0Dfr~1$PM#&gK(J@ZjTY>N(8^y^o7ftx5}c znGgwnmz!AIx9{=8^F+v9n+I%XFuOp|=Zc@(OF!@rvx}}~z?Zr(rlI93ir-FRuRo71 z-hTB6G{+DoOXDLWR6UC=Kl=!RMlx765n0fvcBjAggh8~Am!EwLti z`;sy@7C+`iN=yOWZHbU+KZjbe2GmusE)$fsz-`|?I4mvT!7pEjdFu)O+aiq5D}3;Y zM81GY3S(Zj$AaViI5vaF!)=R1Y4n4Su-Hot%i{v@RL5hutKo)Q9$x`jM?+%N`Np;Ws8=wKUj=@{&}?a5wCF(EzipBGB)3 zFGoC!Wh*s{P-jXL=hw_C)c&%c$!?CrrJ~VrJ<^SxMiuZ$N-8e>HkY2Ls=?K*B{0|V z5vN^_1A7d(Jiy*9QS<(O0E3UE(Y8Fw7*#hmgs zaLeh!MK4}6?^;ywNZTW583@C7iK=kTNe_;253&Cg3~A`cK}h)!0v#4};ZdRJrsK>xBW%J$6wK|Y;GM}6 zK%RR6tK!PQ>u#a|?3l&*Z~tCiVp=dWHtPUPUvdv3uL`_dH%AQA|HJ%zX^YKKl8nYo zIdXmbI9#ci4>r1q95pc?)Md7T@8&!xN#2agbIk-g_ypu`YJdd(Nt_;20y_#XvY#IJ zK%|Z#b1*ps?(b2AqyGuvq6e43h&zE@xxEh91}Er#X2;4K{bA?LX~ZRKen6>_Elg|m zWYeQW;QimD7_qMrZ9^16G;}qm;$|e;cwK`pLM52*^NTHr(FJwEZ%YIo#igA`;QXwE zQ1Y@B?=N48ua^bjvWt{az3eK`Nf+_XijPeG(*?v?NsJXOZ9$K(r!iq`4nCN_6(ja< zfK@aK+w<20SsZ~W$*)*5-7uJNaT1{G2=i6-IP?kkqDad9zKQ+0_8`i5%Q2e{?#9tiEsVLd8}`o3gvDovSpKqw>>``#%!2uv zFxVU=;7@!}_11NeU-TEJsF>iUGCk0gNe2JJKJY?Bha?=DO|GbFQnh#U=t;9Iu-I`6 zjQ^IRhkpoDd-4V;x>d=Z+!c(sd>%1VwrbH=?fOiutSPwp$P3ii|=E75d zxVb`(F30ESKs+&c;dABAE4!@DCe7y7->tpF7R;|qS%YctoMQCSfI2D_pfP&-DZ*S zrYxL&?X(7@wzAM{G=rVe{U2M{8;j$WGZ@`@GOWWDUmX87kxaKV0-K&toIX8(RV?+x 
zb)mNr#f8AHW;sM`z5-!Z7jdh%F`f!4W$rB=XUEU3=OoX!WWHXFhhyrGLCfzk7T2}n znZp84FmyFyswAE3bBo>e^*IbmMdE+<1 zAXh4kiisC+kApm|ND%DB`J&XhqzkR$4uZqOQ_S9LH(~B;d2VL$EdCIFV^X5GbGK;7 za$8T&;g=qo%q=8FS#aL?F+JCp4(ndLBUzp*dR^WrM<* z(*qa5d`2+-52pkgoFUDq=2Z zop+|+TOMEwX-3^}J&@vlV+zd$vrg`Ja30OXwd-;r(ISrxHC%#04wtC$co6=l`GCFG zY=WbYH1TlOLR>iGBmP{}j>aean8+pG0$!F6w&R=FuM;kFNaC{A->h6XkZJA|4#2^0{5FkTaOMgJj7!@D;&eV zn)9IWkj0B07jVcx9OHJ+f!`{BS<|JxIGG1TS6`XD(RH9-=i8#Ip9!Ha1UcA~Idtix ztuUW!#nLyKSYNyj^mhGaHvRdCvCJw`&CkNgKU2Uu_Z4b>EP-uNTcPFXe6nKBV~AAt z-~|sYWk;KgkoR~o46&i;L!N+Ro;axzZ{>{J%*L)o6PZu*Kf&l05xT*95d2TfB0e_l zyuER46C0)sE2 zSq+i z=)5?OtYN-GP{S(hW z3S2Jj!~G%WFl=`rxi`ZF;ubGrOkW-|T~{9mre7zK&ICV@hG6JjVTb&Kri6R{5!4N? zq7B8nsHs9Cr@_>dqf)t!w`|T!wqRrx?D?lgZL2<_e%3PlEnsdsw^DNcNdhTr4C8;h zwuF|Qkf3f|UbH*qHr9Cv-v4wF`r%U&v7KZAE}q+nx^$UfX08h)I&T&IBXs}C5wTpk8X zd%)y_U>59Z4Bi&6#y6!RSngR3h0V9wI063=eM1BW>yF~^&Aa%P`3Es?2u*YfWK$P9 zun|r32{-aQdG_cnIIByM?T0hzrYH}pX4;DfhAtD??^~F)+WA;@>@bzvupAEi8erCg z+2nccA?Cjaj#xfZoyK#fpx44El5lG$JK@l7Jg|5LF*ArKd*yxUx~wl)x_B17S+GT* z!MzyZP9`0iihR`^f100ZK)Mf^@>bhc;?Yg|Ont*Xh>0%)^#Vn>o@oKCYpNkuz`Fm< zC`37?ov+tuHIc%TZ(=u2tgwlCu?PBYo;Crw+&6w)c?N{ zD(m3CkpfJ26l9MD{lI?eZB-)br1I#ZSGpJ)aL z+oE8eLOJf4s)4F^l!&s(ZhR5H7`JpELj8w%Xg!&bX%phWZ-XZO_UI$?#lK_h8!oQu zzlXzJ6G==IEcs@jx_qZ<>jlgI#9EEFE-t^{4h)mU>Thn2b{ z2VUx8u)|1;ZT@7&Y4daisgaWqI8hUB-?o7_T{{@}yT4(IK`1+VaTBz+KV=px#`6p= zn&DtcB^e$Qkc83`F{oJ&II%sPHGX^Xtfw!Wa9_+b4VgsVCa5rFuIX^wp2xzJN@g@* z5PnBGFeT?5K)?0_yUC@F>CU{)D>(F?{jdg^grTFj_svw2r+EW^{N>|@tx|aZM7@BA ze8m2m@5uJWo03wAhcNH>O)x)LM&eFf!@C=NkUy`4P53QFw(hPL%xHV!O9AUQ9vX=8 ziP3CRO%MEVmL=zD0Z(PZd0b%g07eDe^rscpSX8ncMH+Z$889Cg{_A2(4$p%WZ5Erm z-JsvI1Yafj!T(ioChk~gZx|;_)+`w@iIPE)D3QG9IbV?sn#$6qNHjG~mQpH9AikVQ#@_XlRIM;cubMEK!xwqc=M>wQj0@MAULePLU ztuI%FirpTNxKELDD>0`#-ZIox85D+;-)8xT<>{xJk)V6D5(Z5<&`cMV#di3+uGVdXz9WgC;{IuzGrtWXtaY)ybd93sp}P+h+nA zHL+w$$vpN%SqJJm65!_j=OVU4G*etUsKi7j&TPC71uEH4GPQz@KWGADiQABye-^_F zD_Q$&f9z?zX`QB=Kq8C(huOY;BLBn#M=~nO(x?b*RNDjL{(X=UnT1=f948t7sG*wK 
zR3@)jjt~7O)BJba;K!MH7%?m$n^Fo?~tzD{K=QqP#{SQSY~t^v=v+tH{wfnCqaz?5JSQ}jcbZz=EwJ2h#3_4rgg zog0n6hF@XEFT~+|!Flws7yWm9eIZwCm%vag6J0EC2*oK=z{)TPrR~O;luHr#=?BAa zUaO$IUtg4MEQhw~i-|nB4C^-7V_D_^S`X}l-mgh)msBWDf82qgtvVEf_OrVuEYY!H zGKsBhBrVF;m@P4tlhcu);$cz{__Yux>s8TI3C9Q>>E@f~( z{SfbddnF`X*$6l0o}%5GF7r2o1MyJkFfH@s>AvVBPL{m~xwECP#7UD~t}7+#&w^0u zStr&FSHaWKVvxD^f;A{FXD&TLTyRqcbH02Ry|HCQ_rsknad4otUeZR+R+Fx_4TRif z0erqY!w35(Lw@%V`?%SR^NSSOuU})a&^sOsPR{44(F*>R&lJ)n*bXn*QG6A>INGmcul?<1miw9ET;_JV|!haz5mpER%Ko7ta-i^KZvAZT>q4WwTZm zm9(8Y6dj=-lmdBu?=#FQT7y3j_1Ic8zJyQ0KEqFy{jBZh)1d2V$=Y=cX@iOz^EfMV zzMN$tDW-=g9yB6O<=)iCbA)afyia4?3n1Otou-u4BHeU`t*!MSyEF5ID$>r(oEQo1 zKIWqJPF+Z7Sr4BaUJ&0TZ(N@~nY(TMgLH%&KxK>#Da|lr`d-7Ndy)bz;~Av0UC?b; zC20Mfg4nZTFu(naPz$Z-v|s;%OQN2YagSkYUiO1ak8*--VbU1YXG!->eFK5XwIFVg z1<{|2;NIH|EPeYNK4$I$lR|Bpc6=Dbw;6+~?P0Qk25~Q}#MqIdWw=j#7tY-mP0lYL zWD5+Y@YrXEA$u1?z0)<+U+0TkD)s0It?8(9T4djj^{}&P5rS{p4`@h{q|H!NK3_gp z3EZl`@WW&f5@nV`=s+cQxz=$_!yiemegg_#o~Eq>S~k7KZWOLk&g)V0%6djj9YjqQri)fJeLCA=2JnXLuqe+WuSor5jc;{ywNIC1m@9T7D zT?IKXU_#k7^LoxvRhsU-qEG$Tm6K`43%Q3OJ&lP$YikbRFwvo;rOnIg?y3(V&DwbO+Pvi9(&ecNo@Q0Y3Qi}t(%P~0ZEv!fwMTIx}K^o#gX4zgip`!y`g{fel z_ZX5}E36J#8L4V~a$^~XaW$X`lY%dMzpZc)ssU>tSxrz=i`k;8c5)Aj2 zqsbL_JXG@*U(V1GcGq=)@8lSfSK|mccpWnB>LIzNgMV(X4$&Q8nC!dHrmsES@G6Q)bjs^{qtoS`RB$&{yY zZ*IVw6;`}046(_la#-e(IF`2SJ?F7<89ARbQI~RO9u1IP$VxMk1owUk!8UhkY%pm6 z-w+3|+*JaPXK6!k>s0hpxx^;AUO;hk5Bd3M4BXJmLrIknY*pwvR8DEd+>&8v%{z)l zUlmb*&qwk`wEz8mQWT8)wPUJGATIgC66wK0mhAKap9L(z)Fd0QlKTc)_7lwEmul=@ zrV0zZ${`?lh_o4JQOAMzusffDy0QfOO|G;{dW=SuvqhWDW%Kb(wc1w!(r-RK{v z!JL~I3IiKhj=?C~xbi8Bcs<4jWCIC*vxB!)X=NKSmB2931Ty50qKeu8T-veOx}dt2 z7RY2^XMh^DXeOd8^9NfKBuO;m@}bOm3uK7h7GC$i!P*j9A;AawX0dD6{{qXSkLc}{ zLzwYEjQ=ZgE}Ck5#O9cboXRL=Sr_sV=XVOu1>M3s_Fink@;EZ4Dnt$0DAYBWz#}Fn z;l;Mcc&*wh z4wqcn6xU-=W;jlkhN*LHH~u2F8F#R5i8&}e(x#8MAL5pjrlPosD0fkd!SK`*@aM`Q zA~!0@<=pb7g4;E0#=;Ejs{VmlBn59isl{tskI|X^_m-G8bFjB~CE4+E5!}eBf$a+q zpz4Pj{B52A*Vb^8DR(vzYt3Nny3;})&56Ljk3C~LPMg_4RWKU#uO-Hp_mD=@5msdM zg?!IQqaQX{!iqokVy=-p~ 
zc&Pb`O>k;`-$jvU?m>mHv`Er#&FIY8rJ`@+3jhbBNOe!pFZ?;e$@Papk!^LXU~b$gM{>G9rx! z^BKAOR>V@BljB>BkCEVwB5vSf3a&h_#0Lhw7wol&rTQP0>AnBW!jWq;xu}jwHkwgm zEGSDC2bQbDYTpiWj~f`b8?B;}Z>10j^|q@mEsacAKni zegLZmb_%@nN=V{7YZ`w&8Q*zDGgtXf)ZRkf#{NVJ`fM&2{_=hYW)A!VrjdWb8gprg zeDjQWW$N>T(S0D4HX?t164FgJj{LFlR^nGtiblONxep<$Nr7${4sSdOd8)C*wbGmD z?(HO#Qzy~6HiWv^`GR815x%M83Fk8_3f#Cc>sS0Uc&Q`Fz7`Ad(WN+UuppLiZ#<7j zeAaM93o=Arwln)LbNDX{=%`kMzAN|eKu89(Scuc;r|Z$7I~50%kZp_;v6cS>3xA!m zh7;gUd@Ui1Ru^6&mmyD$_K4A4cYK&m`d9Fu$OP% +#include +#include +#include + +#include "Weights.h" + +// Utility to convert from the old dir-of-raw-data into a new .weights file. +// Should live only long enough for downstream users to convert existing data files +// to the new format. +int main(int argc, char **argv) { + if (argc != 3) { + std::cout << "Usage: weights_dir weights_file.weights\n"; + return -1; + } + + Halide::Internal::Weights w; + if (!w.load_from_dir(argv[1])) { + std::cerr << "Unable to read input dir: " << argv[1] << "\n"; + return -1; + } + + if (!w.save_to_file(argv[2])) { + std::cerr << "Unable to save output file: " << argv[2] << "\n"; + return -1; + } + + return 0; +} From d939ea5790269a946773a5e0aca1b32fbff51d2f Mon Sep 17 00:00:00 2001 From: aekul Date: Thu, 28 Jul 2022 01:29:17 -0400 Subject: [PATCH 02/63] clang-format --- .../anderson2021/AutoSchedule.cpp | 22 +- .../anderson2021/AutoSchedule.h | 13 +- src/autoschedulers/anderson2021/CostModel.h | 2 +- .../anderson2021/DefaultCostModel.cpp | 2 +- .../anderson2021/DefaultCostModel.h | 9 +- .../anderson2021/Featurization.h | 222 ++++++----------- .../anderson2021/FunctionDAG.cpp | 14 +- src/autoschedulers/anderson2021/FunctionDAG.h | 22 +- .../anderson2021/GPULoopInfo.cpp | 11 +- src/autoschedulers/anderson2021/GPULoopInfo.h | 27 +-- src/autoschedulers/anderson2021/GPUMemInfo.h | 83 +++---- src/autoschedulers/anderson2021/LoopNest.cpp | 223 ++++++++---------- src/autoschedulers/anderson2021/LoopNest.h | 217 +++++++++-------- 
.../anderson2021/LoopNestParser.h | 46 ++-- src/autoschedulers/anderson2021/NetworkSize.h | 4 +- .../anderson2021/SearchSpace.cpp | 80 +++---- src/autoschedulers/anderson2021/SearchSpace.h | 35 ++- .../anderson2021/SearchSpaceOptions.h | 7 +- src/autoschedulers/anderson2021/State.cpp | 122 +++++----- src/autoschedulers/anderson2021/State.h | 65 +++-- src/autoschedulers/anderson2021/Statistics.h | 24 +- src/autoschedulers/anderson2021/ThreadInfo.h | 34 ++- src/autoschedulers/anderson2021/Tiling.cpp | 7 +- src/autoschedulers/anderson2021/Tiling.h | 17 +- .../anderson2021/check_weights.cpp | 2 +- .../anderson2021/cost_model_generator.cpp | 6 +- src/autoschedulers/anderson2021/test.cpp | 2 - .../anderson2021/test/bounds.cpp | 101 ++++---- .../anderson2021/test/parser.cpp | 2 +- .../anderson2021/test/state.cpp | 10 +- .../anderson2021/test/storage_strides.cpp | 89 ++++--- src/autoschedulers/anderson2021/test/test.h | 10 +- .../anderson2021/test/thread_info.cpp | 5 +- .../anderson2021/test/tiling.cpp | 12 +- 34 files changed, 701 insertions(+), 846 deletions(-) diff --git a/src/autoschedulers/anderson2021/AutoSchedule.cpp b/src/autoschedulers/anderson2021/AutoSchedule.cpp index 70b84757bfd2..69b63ad1cdfb 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.cpp +++ b/src/autoschedulers/anderson2021/AutoSchedule.cpp @@ -104,8 +104,6 @@ namespace Halide { namespace Internal { namespace Autoscheduler { -using std::string; -using std::vector; using std::map; using std::pair; using std::set; @@ -161,22 +159,13 @@ void configure_pipeline_features(const FunctionDAG &dag, AutoSchedule::AutoSchedule(const FunctionDAG &dag, const MachineParams ¶ms, const Target &target, - const std::vector& outputs, + const std::vector &outputs, std::mt19937 &rng, CostModel *cost_model, Statistics &stats, SearchSpace &search_space, - const LoopNestParser* partial_schedule) - : dag{dag} - , params{params} - , target{target} - , outputs{outputs} - , rng{rng} - , cost_model{cost_model} - , 
stats{stats} - , search_space{search_space} - , partial_schedule{partial_schedule} -{ + const LoopNestParser *partial_schedule) + : dag{dag}, params{params}, target{target}, outputs{outputs}, rng{rng}, cost_model{cost_model}, stats{stats}, search_space{search_space}, partial_schedule{partial_schedule} { configure_pipeline_features(dag, params, cost_model); } @@ -283,7 +272,7 @@ IntrusivePtr AutoSchedule::optimal_schedule_pass(int beam_size, if (penalty > 1) { state->penalized = true; state->cost *= penalty; - for (auto& c : state->cost_per_stage) { + for (auto &c : state->cost_per_stage) { c *= penalty; } // After penalizing this state, if it's no @@ -401,7 +390,6 @@ IntrusivePtr AutoSchedule::optimal_schedule_pass(int beam_size, break; } } - } if (!cyos_from_file || !found) { @@ -680,7 +668,7 @@ void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, const MachineParams ¶ms, const Target &target, - CostModel* cost_model, + CostModel *cost_model, int beam_size, StageMap *schedule_features) { diff --git a/src/autoschedulers/anderson2021/AutoSchedule.h b/src/autoschedulers/anderson2021/AutoSchedule.h index a068265a8db1..fff5d96563a3 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.h +++ b/src/autoschedulers/anderson2021/AutoSchedule.h @@ -55,29 +55,28 @@ struct ProgressBar { const bool draw_progress_bar = isatty(2); }; - typedef PerfectHashMap StageMapOfScheduleFeatures; struct AutoSchedule { const FunctionDAG &dag; const MachineParams ¶ms; const Target ⌖ - const std::vector& outputs; + const std::vector &outputs; std::mt19937 &rng; CostModel *cost_model; Statistics &stats; SearchSpace &search_space; - const LoopNestParser* partial_schedule; + const LoopNestParser *partial_schedule; AutoSchedule(const FunctionDAG &dag, const MachineParams ¶ms, const Target &target, - const std::vector& outputs, + const std::vector &outputs, std::mt19937 &rng, CostModel *cost_model, Statistics &stats, SearchSpace &search_space, - const LoopNestParser* 
partial_schedule); + const LoopNestParser *partial_schedule); bool use_partial_schedule() const { return partial_schedule; @@ -93,10 +92,10 @@ struct AutoSchedule { IntrusivePtr optimal_schedule(int beam_size); }; -void find_and_apply_schedule(FunctionDAG& dag, const std::vector &outputs, const MachineParams ¶ms, const Target &target, CostModel* cost_model, int beam_size, StageMapOfScheduleFeatures* schedule_features); +void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, const MachineParams ¶ms, const Target &target, CostModel *cost_model, int beam_size, StageMapOfScheduleFeatures *schedule_features); } // namespace Autoscheduler } // namespace Internal } // namespace Halide -#endif // AUTO_SCHEDULE_H +#endif // AUTO_SCHEDULE_H diff --git a/src/autoschedulers/anderson2021/CostModel.h b/src/autoschedulers/anderson2021/CostModel.h index 784c292a9ccc..85d19caeb36f 100644 --- a/src/autoschedulers/anderson2021/CostModel.h +++ b/src/autoschedulers/anderson2021/CostModel.h @@ -29,7 +29,7 @@ class CostModel { virtual void enqueue(const Internal::Autoscheduler::FunctionDAG &dag, const Halide::Internal::Autoscheduler::StageMapOfScheduleFeatures &schedule_feats, double *cost_ptr, - std::vector* cost_per_stage_ptr) = 0; + std::vector *cost_per_stage_ptr) = 0; // Evaluate all schedules in the queue. 
virtual void evaluate_costs() = 0; diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.cpp b/src/autoschedulers/anderson2021/DefaultCostModel.cpp index e68f4ecca7aa..8f407b8130f2 100644 --- a/src/autoschedulers/anderson2021/DefaultCostModel.cpp +++ b/src/autoschedulers/anderson2021/DefaultCostModel.cpp @@ -391,7 +391,7 @@ void DefaultCostModel::reset() { cursor = 0; } -std::unique_ptr make_default_cost_model(Internal::Autoscheduler::Statistics& stats, +std::unique_ptr make_default_cost_model(Internal::Autoscheduler::Statistics &stats, const std::string &weights_in_path, const std::string &weights_out_path, bool randomize_weights) { diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.h b/src/autoschedulers/anderson2021/DefaultCostModel.h index 7f5b56327568..68fe0f6f4e1c 100644 --- a/src/autoschedulers/anderson2021/DefaultCostModel.h +++ b/src/autoschedulers/anderson2021/DefaultCostModel.h @@ -13,7 +13,7 @@ class DefaultCostModel : public CostModel { Internal::Weights weights; Runtime::Buffer schedule_feat_queue, pipeline_feat_queue, costs, costs_per_stage; Runtime::Buffer cost_ptrs; - std::vector*> cost_per_stage_ptrs; + std::vector *> cost_per_stage_ptrs; int cursor, num_stages, num_cores; int batch_id{0}; @@ -32,12 +32,11 @@ class DefaultCostModel : public CostModel { DefaultCostModel(const std::string &weights_in_path, const std::string &weights_out_path, bool randomize_weights, - Internal::Autoscheduler::Statistics& stats) + Internal::Autoscheduler::Statistics &stats) : weights_in_path(weights_in_path), weights_out_path(weights_out_path), randomize_weights(randomize_weights), - stats{stats} - { + stats{stats} { load_weights(); } virtual ~DefaultCostModel() = default; @@ -69,7 +68,7 @@ class DefaultCostModel : public CostModel { void load_weights(); }; -std::unique_ptr make_default_cost_model(Internal::Autoscheduler::Statistics& stats, +std::unique_ptr make_default_cost_model(Internal::Autoscheduler::Statistics &stats, const std::string 
&weights_in_dir = "", const std::string &weights_out_dir = "", bool randomize_weights = false); diff --git a/src/autoschedulers/anderson2021/Featurization.h b/src/autoschedulers/anderson2021/Featurization.h index 94c4cb00044a..780cf0eff197 100644 --- a/src/autoschedulers/anderson2021/Featurization.h +++ b/src/autoschedulers/anderson2021/Featurization.h @@ -344,79 +344,79 @@ struct ScheduleFeatures { template void dump(OS &os) const { - os << " num_realizations: " << num_realizations << "\n" - << " num_productions: " << num_productions << "\n" - << " points_computed_per_realization: " << points_computed_per_realization << "\n" - << " points_computed_per_production: " << points_computed_per_production << "\n" - << " points_computed_per_thread: " << points_computed_per_thread << "\n" - << " points_computed_total: " << points_computed_total << "\n" - << " points_computed_minimum: " << points_computed_minimum << "\n" - << " innermost_loop_extent: " << innermost_loop_extent << "\n" - << " innermost_pure_loop_extent: " << innermost_pure_loop_extent << "\n" - << " unrolled_loop_extent: " << unrolled_loop_extent << "\n" - << " inner_parallelism: " << inner_parallelism << "\n" - << " outer_parallelism: " << outer_parallelism << "\n" - << " bytes_at_realization: " << bytes_at_realization << "\n" - << " bytes_at_production: " << bytes_at_production << "\n" - << " bytes_at_root: " << bytes_at_root << "\n" - << " innermost_bytes_at_realization: " << innermost_bytes_at_realization << "\n" - << " innermost_bytes_at_production: " << innermost_bytes_at_production << "\n" - << " innermost_bytes_at_root: " << innermost_bytes_at_root << "\n" - << " inlined_calls: " << inlined_calls << "\n" - << " unique_global_bytes_read_per_realization: " << unique_global_bytes_read_per_realization << "\n" - << " unique_shared_bytes_read_per_realization: " << unique_shared_bytes_read_per_realization << "\n" - << " unique_register_bytes_read_per_realization: " << 
unique_register_bytes_read_per_realization << "\n" - << " unique_global_lines_read_per_realization: " << unique_global_lines_read_per_realization << "\n" - << " unique_shared_lines_read_per_realization: " << unique_shared_lines_read_per_realization << "\n" - << " unique_register_lines_read_per_realization: " << unique_register_lines_read_per_realization << "\n" - << " unique_global_bytes_read_per_thread: " << unique_global_bytes_read_per_thread << "\n" - << " unique_shared_bytes_read_per_thread: " << unique_shared_bytes_read_per_thread << "\n" - << " unique_register_bytes_read_per_thread: " << unique_register_bytes_read_per_thread << "\n" - << " unique_global_lines_read_per_thread: " << unique_global_lines_read_per_thread << "\n" - << " unique_shared_lines_read_per_thread: " << unique_shared_lines_read_per_thread << "\n" - << " unique_register_lines_read_per_thread: " << unique_register_lines_read_per_thread << "\n" - << " global_allocation_bytes_read_per_realization: " << global_allocation_bytes_read_per_realization << "\n" - << " shared_allocation_bytes_read_per_realization: " << shared_allocation_bytes_read_per_realization << "\n" - << " register_allocation_bytes_read_per_realization: " << register_allocation_bytes_read_per_realization << "\n" - << " working_set: " << working_set << "\n" - << " num_scalars: " << num_scalars << "\n" - << " global_bytes_at_task: " << global_bytes_at_task << "\n" - << " shared_bytes_at_task: " << shared_bytes_at_task << "\n" - << " register_bytes_at_task: " << register_bytes_at_task << "\n" - << " global_innermost_bytes_at_task: " << global_innermost_bytes_at_task << "\n" - << " shared_innermost_bytes_at_task: " << shared_innermost_bytes_at_task << "\n" - << " register_innermost_bytes_at_task: " << register_innermost_bytes_at_task << "\n" - << " unique_bytes_read_per_point: " << unique_bytes_read_per_point << "\n" - << " unique_lines_read_per_point: " << unique_lines_read_per_point << "\n" - << " unique_bytes_read_per_task: " << 
unique_bytes_read_per_task << "\n" - << " unique_lines_read_per_task: " << unique_lines_read_per_task << "\n" - << " working_set_at_task: " << working_set_at_task << "\n" - << " working_set_at_production: " << working_set_at_production << "\n" - << " working_set_at_realization: " << working_set_at_realization << "\n" - << " working_set_at_root: " << working_set_at_root << "\n" - << " num_blocks: " << num_blocks << "\n" - << " num_warps_per_block: " << num_warps_per_block << "\n" - << " block_occupancy: " << block_occupancy << "\n" - << " warp_lane_utilization: " << warp_lane_utilization << "\n" - << " num_active_warps_per_block: " << num_active_warps_per_block << "\n" - << " warp_lane_utilization_at_block_y: " << warp_lane_utilization_at_block_y << "\n" - << " warp_lane_utilization_at_block_z: " << warp_lane_utilization_at_block_z << "\n" - << " idle_lane_wastage: " << idle_lane_wastage << "\n" - << " num_shared_mem_loads_per_block: " << num_shared_mem_loads_per_block << "\n" - << " num_global_mem_loads_per_block: " << num_global_mem_loads_per_block << "\n" - << " num_shared_mem_stores_per_block: " << num_shared_mem_stores_per_block << "\n" - << " num_global_mem_stores_per_block: " << num_global_mem_stores_per_block << "\n" - << " shared_mem_store_efficiency: " << shared_mem_store_efficiency << "\n" - << " shared_mem_load_efficiency: " << shared_mem_load_efficiency << "\n" - << " global_mem_store_efficiency: " << global_mem_store_efficiency << "\n" - << " global_mem_load_efficiency: " << global_mem_load_efficiency << "\n" - << " working_set_at_thread: " << working_set_at_thread << "\n" - << " shared_mem_occupancy: " << shared_mem_occupancy << "\n" - << " shared_mem_block_limit_factor: " << shared_mem_block_limit_factor << "\n" - << " max_warp_occupancy: " << max_warp_occupancy << "\n" - << " max_block_occupancy: " << max_block_occupancy << "\n" - << " num_threads_per_block: " << num_threads_per_block << "\n" - << " expr_branching: " << expr_branching << "\n"; + os 
<< " num_realizations: " << num_realizations << "\n" + << " num_productions: " << num_productions << "\n" + << " points_computed_per_realization: " << points_computed_per_realization << "\n" + << " points_computed_per_production: " << points_computed_per_production << "\n" + << " points_computed_per_thread: " << points_computed_per_thread << "\n" + << " points_computed_total: " << points_computed_total << "\n" + << " points_computed_minimum: " << points_computed_minimum << "\n" + << " innermost_loop_extent: " << innermost_loop_extent << "\n" + << " innermost_pure_loop_extent: " << innermost_pure_loop_extent << "\n" + << " unrolled_loop_extent: " << unrolled_loop_extent << "\n" + << " inner_parallelism: " << inner_parallelism << "\n" + << " outer_parallelism: " << outer_parallelism << "\n" + << " bytes_at_realization: " << bytes_at_realization << "\n" + << " bytes_at_production: " << bytes_at_production << "\n" + << " bytes_at_root: " << bytes_at_root << "\n" + << " innermost_bytes_at_realization: " << innermost_bytes_at_realization << "\n" + << " innermost_bytes_at_production: " << innermost_bytes_at_production << "\n" + << " innermost_bytes_at_root: " << innermost_bytes_at_root << "\n" + << " inlined_calls: " << inlined_calls << "\n" + << " unique_global_bytes_read_per_realization: " << unique_global_bytes_read_per_realization << "\n" + << " unique_shared_bytes_read_per_realization: " << unique_shared_bytes_read_per_realization << "\n" + << " unique_register_bytes_read_per_realization: " << unique_register_bytes_read_per_realization << "\n" + << " unique_global_lines_read_per_realization: " << unique_global_lines_read_per_realization << "\n" + << " unique_shared_lines_read_per_realization: " << unique_shared_lines_read_per_realization << "\n" + << " unique_register_lines_read_per_realization: " << unique_register_lines_read_per_realization << "\n" + << " unique_global_bytes_read_per_thread: " << unique_global_bytes_read_per_thread << "\n" + << " 
unique_shared_bytes_read_per_thread: " << unique_shared_bytes_read_per_thread << "\n" + << " unique_register_bytes_read_per_thread: " << unique_register_bytes_read_per_thread << "\n" + << " unique_global_lines_read_per_thread: " << unique_global_lines_read_per_thread << "\n" + << " unique_shared_lines_read_per_thread: " << unique_shared_lines_read_per_thread << "\n" + << " unique_register_lines_read_per_thread: " << unique_register_lines_read_per_thread << "\n" + << " global_allocation_bytes_read_per_realization: " << global_allocation_bytes_read_per_realization << "\n" + << " shared_allocation_bytes_read_per_realization: " << shared_allocation_bytes_read_per_realization << "\n" + << " register_allocation_bytes_read_per_realization: " << register_allocation_bytes_read_per_realization << "\n" + << " working_set: " << working_set << "\n" + << " num_scalars: " << num_scalars << "\n" + << " global_bytes_at_task: " << global_bytes_at_task << "\n" + << " shared_bytes_at_task: " << shared_bytes_at_task << "\n" + << " register_bytes_at_task: " << register_bytes_at_task << "\n" + << " global_innermost_bytes_at_task: " << global_innermost_bytes_at_task << "\n" + << " shared_innermost_bytes_at_task: " << shared_innermost_bytes_at_task << "\n" + << " register_innermost_bytes_at_task: " << register_innermost_bytes_at_task << "\n" + << " unique_bytes_read_per_point: " << unique_bytes_read_per_point << "\n" + << " unique_lines_read_per_point: " << unique_lines_read_per_point << "\n" + << " unique_bytes_read_per_task: " << unique_bytes_read_per_task << "\n" + << " unique_lines_read_per_task: " << unique_lines_read_per_task << "\n" + << " working_set_at_task: " << working_set_at_task << "\n" + << " working_set_at_production: " << working_set_at_production << "\n" + << " working_set_at_realization: " << working_set_at_realization << "\n" + << " working_set_at_root: " << working_set_at_root << "\n" + << " num_blocks: " << num_blocks << "\n" + << " num_warps_per_block: " << 
num_warps_per_block << "\n" + << " block_occupancy: " << block_occupancy << "\n" + << " warp_lane_utilization: " << warp_lane_utilization << "\n" + << " num_active_warps_per_block: " << num_active_warps_per_block << "\n" + << " warp_lane_utilization_at_block_y: " << warp_lane_utilization_at_block_y << "\n" + << " warp_lane_utilization_at_block_z: " << warp_lane_utilization_at_block_z << "\n" + << " idle_lane_wastage: " << idle_lane_wastage << "\n" + << " num_shared_mem_loads_per_block: " << num_shared_mem_loads_per_block << "\n" + << " num_global_mem_loads_per_block: " << num_global_mem_loads_per_block << "\n" + << " num_shared_mem_stores_per_block: " << num_shared_mem_stores_per_block << "\n" + << " num_global_mem_stores_per_block: " << num_global_mem_stores_per_block << "\n" + << " shared_mem_store_efficiency: " << shared_mem_store_efficiency << "\n" + << " shared_mem_load_efficiency: " << shared_mem_load_efficiency << "\n" + << " global_mem_store_efficiency: " << global_mem_store_efficiency << "\n" + << " global_mem_load_efficiency: " << global_mem_load_efficiency << "\n" + << " working_set_at_thread: " << working_set_at_thread << "\n" + << " shared_mem_occupancy: " << shared_mem_occupancy << "\n" + << " shared_mem_block_limit_factor: " << shared_mem_block_limit_factor << "\n" + << " max_warp_occupancy: " << max_warp_occupancy << "\n" + << " max_block_occupancy: " << max_block_occupancy << "\n" + << " num_threads_per_block: " << num_threads_per_block << "\n" + << " expr_branching: " << expr_branching << "\n"; } void dump() const { @@ -424,80 +424,8 @@ struct ScheduleFeatures { dump(os); } - bool equal(const ScheduleFeatures& other) const { - return num_realizations == other.num_realizations - && num_productions == other.num_productions - && points_computed_per_realization == other.points_computed_per_realization - && points_computed_per_production == other.points_computed_per_production - && points_computed_per_thread == other.points_computed_per_thread - && 
points_computed_total == other.points_computed_total - && points_computed_minimum == other.points_computed_minimum - && innermost_loop_extent == other.innermost_loop_extent - && innermost_pure_loop_extent == other.innermost_pure_loop_extent - && unrolled_loop_extent == other.unrolled_loop_extent - && inner_parallelism == other.inner_parallelism - && outer_parallelism == other.outer_parallelism - && bytes_at_realization == other.bytes_at_realization - && bytes_at_production == other.bytes_at_production - && bytes_at_root == other.bytes_at_root - && innermost_bytes_at_realization == other.innermost_bytes_at_realization - && innermost_bytes_at_production == other.innermost_bytes_at_production - && innermost_bytes_at_root == other.innermost_bytes_at_root - && inlined_calls == other.inlined_calls - && unique_global_bytes_read_per_realization == other.unique_global_bytes_read_per_realization - && unique_shared_bytes_read_per_realization == other.unique_shared_bytes_read_per_realization - && unique_register_bytes_read_per_realization == other.unique_register_bytes_read_per_realization - && unique_global_lines_read_per_realization == other.unique_global_lines_read_per_realization - && unique_shared_lines_read_per_realization == other.unique_shared_lines_read_per_realization - && unique_register_lines_read_per_realization == other.unique_register_lines_read_per_realization - && unique_global_bytes_read_per_thread == other.unique_global_bytes_read_per_thread - && unique_shared_bytes_read_per_thread == other.unique_shared_bytes_read_per_thread - && unique_register_bytes_read_per_thread == other.unique_register_bytes_read_per_thread - && unique_global_lines_read_per_thread == other.unique_global_lines_read_per_thread - && unique_shared_lines_read_per_thread == other.unique_shared_lines_read_per_thread - && unique_register_lines_read_per_thread == other.unique_register_lines_read_per_thread - && global_allocation_bytes_read_per_realization == 
other.global_allocation_bytes_read_per_realization - && shared_allocation_bytes_read_per_realization == other.shared_allocation_bytes_read_per_realization - && register_allocation_bytes_read_per_realization == other.register_allocation_bytes_read_per_realization - && working_set == other.working_set - && num_scalars == other.num_scalars - && global_bytes_at_task == other.global_bytes_at_task - && shared_bytes_at_task == other.shared_bytes_at_task - && register_bytes_at_task == other.register_bytes_at_task - && global_innermost_bytes_at_task == other.global_innermost_bytes_at_task - && shared_innermost_bytes_at_task == other.shared_innermost_bytes_at_task - && register_innermost_bytes_at_task == other.register_innermost_bytes_at_task - && unique_bytes_read_per_point == other.unique_bytes_read_per_point - && unique_lines_read_per_point == other.unique_lines_read_per_point - && unique_bytes_read_per_task == other.unique_bytes_read_per_task - && unique_lines_read_per_task == other.unique_lines_read_per_task - && working_set_at_task == other.working_set_at_task - && working_set_at_production == other.working_set_at_production - && working_set_at_realization == other.working_set_at_realization - && working_set_at_root == other.working_set_at_root - && num_blocks == other.num_blocks - && num_warps_per_block == other.num_warps_per_block - && block_occupancy == other.block_occupancy - && warp_lane_utilization == other.warp_lane_utilization - && num_active_warps_per_block == other.num_active_warps_per_block - && warp_lane_utilization_at_block_y == other.warp_lane_utilization_at_block_y - && warp_lane_utilization_at_block_z == other.warp_lane_utilization_at_block_z - && idle_lane_wastage == other.idle_lane_wastage - && num_shared_mem_loads_per_block == other.num_shared_mem_loads_per_block - && num_global_mem_loads_per_block == other.num_global_mem_loads_per_block - && num_shared_mem_stores_per_block == other.num_shared_mem_stores_per_block - && num_global_mem_stores_per_block 
== other.num_global_mem_stores_per_block - && shared_mem_store_efficiency == other.shared_mem_store_efficiency - && shared_mem_load_efficiency == other.shared_mem_load_efficiency - && global_mem_store_efficiency == other.global_mem_store_efficiency - && global_mem_load_efficiency == other.global_mem_load_efficiency - && working_set_at_thread == other.working_set_at_thread - && shared_mem_occupancy == other.shared_mem_occupancy - && shared_mem_block_limit_factor == other.shared_mem_block_limit_factor - && max_warp_occupancy == other.max_warp_occupancy - && max_block_occupancy == other.max_block_occupancy - && num_threads_per_block == other.num_threads_per_block - && expr_branching == other.expr_branching; + bool equal(const ScheduleFeatures &other) const { + return num_realizations == other.num_realizations && num_productions == other.num_productions && points_computed_per_realization == other.points_computed_per_realization && points_computed_per_production == other.points_computed_per_production && points_computed_per_thread == other.points_computed_per_thread && points_computed_total == other.points_computed_total && points_computed_minimum == other.points_computed_minimum && innermost_loop_extent == other.innermost_loop_extent && innermost_pure_loop_extent == other.innermost_pure_loop_extent && unrolled_loop_extent == other.unrolled_loop_extent && inner_parallelism == other.inner_parallelism && outer_parallelism == other.outer_parallelism && bytes_at_realization == other.bytes_at_realization && bytes_at_production == other.bytes_at_production && bytes_at_root == other.bytes_at_root && innermost_bytes_at_realization == other.innermost_bytes_at_realization && innermost_bytes_at_production == other.innermost_bytes_at_production && innermost_bytes_at_root == other.innermost_bytes_at_root && inlined_calls == other.inlined_calls && unique_global_bytes_read_per_realization == other.unique_global_bytes_read_per_realization && unique_shared_bytes_read_per_realization == 
other.unique_shared_bytes_read_per_realization && unique_register_bytes_read_per_realization == other.unique_register_bytes_read_per_realization && unique_global_lines_read_per_realization == other.unique_global_lines_read_per_realization && unique_shared_lines_read_per_realization == other.unique_shared_lines_read_per_realization && unique_register_lines_read_per_realization == other.unique_register_lines_read_per_realization && unique_global_bytes_read_per_thread == other.unique_global_bytes_read_per_thread && unique_shared_bytes_read_per_thread == other.unique_shared_bytes_read_per_thread && unique_register_bytes_read_per_thread == other.unique_register_bytes_read_per_thread && unique_global_lines_read_per_thread == other.unique_global_lines_read_per_thread && unique_shared_lines_read_per_thread == other.unique_shared_lines_read_per_thread && unique_register_lines_read_per_thread == other.unique_register_lines_read_per_thread && global_allocation_bytes_read_per_realization == other.global_allocation_bytes_read_per_realization && shared_allocation_bytes_read_per_realization == other.shared_allocation_bytes_read_per_realization && register_allocation_bytes_read_per_realization == other.register_allocation_bytes_read_per_realization && working_set == other.working_set && num_scalars == other.num_scalars && global_bytes_at_task == other.global_bytes_at_task && shared_bytes_at_task == other.shared_bytes_at_task && register_bytes_at_task == other.register_bytes_at_task && global_innermost_bytes_at_task == other.global_innermost_bytes_at_task && shared_innermost_bytes_at_task == other.shared_innermost_bytes_at_task && register_innermost_bytes_at_task == other.register_innermost_bytes_at_task && unique_bytes_read_per_point == other.unique_bytes_read_per_point && unique_lines_read_per_point == other.unique_lines_read_per_point && unique_bytes_read_per_task == other.unique_bytes_read_per_task && unique_lines_read_per_task == other.unique_lines_read_per_task && 
working_set_at_task == other.working_set_at_task && working_set_at_production == other.working_set_at_production && working_set_at_realization == other.working_set_at_realization && working_set_at_root == other.working_set_at_root && num_blocks == other.num_blocks && num_warps_per_block == other.num_warps_per_block && block_occupancy == other.block_occupancy && warp_lane_utilization == other.warp_lane_utilization && num_active_warps_per_block == other.num_active_warps_per_block && warp_lane_utilization_at_block_y == other.warp_lane_utilization_at_block_y && warp_lane_utilization_at_block_z == other.warp_lane_utilization_at_block_z && idle_lane_wastage == other.idle_lane_wastage && num_shared_mem_loads_per_block == other.num_shared_mem_loads_per_block && num_global_mem_loads_per_block == other.num_global_mem_loads_per_block && num_shared_mem_stores_per_block == other.num_shared_mem_stores_per_block && num_global_mem_stores_per_block == other.num_global_mem_stores_per_block && shared_mem_store_efficiency == other.shared_mem_store_efficiency && shared_mem_load_efficiency == other.shared_mem_load_efficiency && global_mem_store_efficiency == other.global_mem_store_efficiency && global_mem_load_efficiency == other.global_mem_load_efficiency && working_set_at_thread == other.working_set_at_thread && shared_mem_occupancy == other.shared_mem_occupancy && shared_mem_block_limit_factor == other.shared_mem_block_limit_factor && max_warp_occupancy == other.max_warp_occupancy && max_block_occupancy == other.max_block_occupancy && num_threads_per_block == other.num_threads_per_block && expr_branching == other.expr_branching; } }; diff --git a/src/autoschedulers/anderson2021/FunctionDAG.cpp b/src/autoschedulers/anderson2021/FunctionDAG.cpp index b942d0cdc283..be811f622376 100644 --- a/src/autoschedulers/anderson2021/FunctionDAG.cpp +++ b/src/autoschedulers/anderson2021/FunctionDAG.cpp @@ -496,7 +496,7 @@ FunctionDAG::Edge::BoundInfo::BoundInfo(const Expr &e, const Node::Stage 
&consum } bool FunctionDAG::Edge::all_load_jacobian_coeffs_exist() const { - for (const auto& jac : load_jacobians) { + for (const auto &jac : load_jacobians) { if (!jac.all_coeffs_exist()) { return false; } @@ -1209,7 +1209,7 @@ int ExprBranching::visit(const Load *op) { return visit_binary(op->predicate, op->index); } -int ExprBranching::visit_nary(const std::vector& exprs) { +int ExprBranching::visit_nary(const std::vector &exprs) { int total_branching = 0; for (Expr e : exprs) { @@ -1229,7 +1229,7 @@ int ExprBranching::visit_nary(const std::vector& exprs) { } int ExprBranching::visit(const Call *op) { - for (const auto& i : inlined) { + for (const auto &i : inlined) { if (op->name == i.first->func.name()) { return compute(i.first->func); } @@ -1250,13 +1250,13 @@ int ExprBranching::visit(const VectorReduce *op) { return Super::dispatch(op->value); } -int ExprBranching::compute(const Function& f) { +int ExprBranching::compute(const Function &f) { Definition def = f.definition(); std::vector values; values.reserve(def.values().size()); for (auto v : def.values()) { - values.push_back(common_subexpression_elimination(simplify(v))); // Get things into canonical form + values.push_back(common_subexpression_elimination(simplify(v))); // Get things into canonical form } int branching = visit_nary(values); @@ -1264,13 +1264,13 @@ int ExprBranching::compute(const Function& f) { std::vector args; args.reserve(def.args().size()); for (auto v : def.args()) { - args.push_back(common_subexpression_elimination(simplify(v))); // Get things into canonical form + args.push_back(common_subexpression_elimination(simplify(v))); // Get things into canonical form } return std::max(branching, visit_nary(args)); } -void sanitize_names(std::string& str) { +void sanitize_names(std::string &str) { bool in_quotes = false; for (auto &c : str) { in_quotes ^= (c == '"'); diff --git a/src/autoschedulers/anderson2021/FunctionDAG.h b/src/autoschedulers/anderson2021/FunctionDAG.h index 
00d6313d8d17..ec884b525d3f 100644 --- a/src/autoschedulers/anderson2021/FunctionDAG.h +++ b/src/autoschedulers/anderson2021/FunctionDAG.h @@ -131,7 +131,7 @@ class LoadJacobian { } bool all_coeffs_exist() const { - for (const auto& coeff : coeffs) { + for (const auto &coeff : coeffs) { if (!coeff.exists()) { return false; } @@ -152,7 +152,7 @@ class LoadJacobian { } bool is_constant() const { - for (const auto& c : coeffs) { + for (const auto &c : coeffs) { if (!c.exists() || !(c == 0)) { return false; } @@ -496,9 +496,9 @@ struct FunctionDAG { : stage(s) { } - int get_loop_index_from_var(const std::string& var) const { + int get_loop_index_from_var(const std::string &var) const { int i = 0; - for (const auto& l : loop) { + for (const auto &l : loop) { if (l.var == var) { return i; } @@ -622,7 +622,7 @@ class ExprBranching : public VariadicVisitor { using Super = VariadicVisitor; private: - const NodeMap& inlined; + const NodeMap &inlined; public: int visit(const IntImm *op); @@ -656,16 +656,16 @@ class ExprBranching : public VariadicVisitor { int visit(const Let *op); int visit(const VectorReduce *op); int visit_binary(const Expr &a, const Expr &b); - int visit_nary(const std::vector& exprs); + int visit_nary(const std::vector &exprs); - ExprBranching(const NodeMap& inlined) - : inlined{inlined} - {} + ExprBranching(const NodeMap &inlined) + : inlined{inlined} { + } - int compute(const Function& f); + int compute(const Function &f); }; -void sanitize_names(std::string& str); +void sanitize_names(std::string &str); } // namespace Autoscheduler } // namespace Internal diff --git a/src/autoschedulers/anderson2021/GPULoopInfo.cpp b/src/autoschedulers/anderson2021/GPULoopInfo.cpp index f1c53ec8b9b0..283377f81334 100644 --- a/src/autoschedulers/anderson2021/GPULoopInfo.cpp +++ b/src/autoschedulers/anderson2021/GPULoopInfo.cpp @@ -6,7 +6,7 @@ namespace Halide { namespace Internal { namespace Autoscheduler { -void GPULoopInfo::update(const Target& target, const LoopNest* 
loop) { +void GPULoopInfo::update(const Target &target, const LoopNest *loop) { if (loop->is_gpu_block(target)) { current_block_loop = loop; num_blocks = loop->get_block_and_serial_extents(loop).first; @@ -45,7 +45,7 @@ bool GPULoopInfo::at_or_inside_thread() const { return current_thread_loop != nullptr; } -std::vector GPULoopInfo::get_inner_serial_loop_extents(const LoopNest* loop_nest) const { +std::vector GPULoopInfo::get_inner_serial_loop_extents(const LoopNest *loop_nest) const { internal_assert(at_or_inside_thread()); std::vector extents; @@ -72,10 +72,10 @@ std::vector GPULoopInfo::get_inner_serial_loop_extents(const LoopNest* // f 1 gpu_simd // This method will give the extents of the loops inside the thread level but // outside the given loop_nest's realization e.g. 8 for g above. -int64_t GPULoopInfo::get_total_inner_serial_extents_outside_realization(const LoopNest* loop_nest) const { +int64_t GPULoopInfo::get_total_inner_serial_extents_outside_realization(const LoopNest *loop_nest) const { int64_t extents = 1; - for (const auto* loop : inner_loop_stack) { + for (const auto *loop : inner_loop_stack) { if (loop->node == loop_nest->node) { break; } @@ -97,8 +97,7 @@ std::unique_ptr GPULoopInfo::create_thread_info() { current_thread_loop->vectorized_loop_index, current_thread_loop->size, current_thread_loop->stage->loop, - max_thread_counts - ); + max_thread_counts); thread_info = new_thread_info.get(); return new_thread_info; } diff --git a/src/autoschedulers/anderson2021/GPULoopInfo.h b/src/autoschedulers/anderson2021/GPULoopInfo.h index 62e669eb500f..50696544aac9 100644 --- a/src/autoschedulers/anderson2021/GPULoopInfo.h +++ b/src/autoschedulers/anderson2021/GPULoopInfo.h @@ -19,20 +19,20 @@ namespace Autoscheduler { struct LoopNest; struct GPULoopInfo { - GPULoopInfo(const LoopNest* root) - : root{root} - {} - - const LoopNest* root = nullptr; - const LoopNest* current_block_loop = nullptr; - const LoopNest* current_thread_loop = nullptr; - std::vector 
inner_loop_stack; + GPULoopInfo(const LoopNest *root) + : root{root} { + } + + const LoopNest *root = nullptr; + const LoopNest *current_block_loop = nullptr; + const LoopNest *current_thread_loop = nullptr; + std::vector inner_loop_stack; int64_t num_blocks = 1; int64_t total_outer_serial_extents = 1; int64_t total_inner_serial_extents = 1; - const ThreadInfo* thread_info = nullptr; + const ThreadInfo *thread_info = nullptr; - void update(const Target& target, const LoopNest* loop); + void update(const Target &target, const LoopNest *loop); int64_t total_serial_extents() const; @@ -40,16 +40,15 @@ struct GPULoopInfo { bool at_or_inside_thread() const; - std::vector get_inner_serial_loop_extents(const LoopNest* loop_nest) const; + std::vector get_inner_serial_loop_extents(const LoopNest *loop_nest) const; std::unique_ptr create_thread_info(); - int64_t get_total_inner_serial_extents_outside_realization(const LoopNest* loop_nest) const; - + int64_t get_total_inner_serial_extents_outside_realization(const LoopNest *loop_nest) const; }; } // namespace Autoscheduler } // namespace Internal } // namespace Halide -#endif // GPU_LOOP_INFO_H +#endif // GPU_LOOP_INFO_H diff --git a/src/autoschedulers/anderson2021/GPUMemInfo.h b/src/autoschedulers/anderson2021/GPUMemInfo.h index 03acff9301e9..22e6b9f029ba 100644 --- a/src/autoschedulers/anderson2021/GPUMemInfo.h +++ b/src/autoschedulers/anderson2021/GPUMemInfo.h @@ -24,34 +24,34 @@ struct SharedAccessAccumulator; struct LocalMem; struct LocalAccessAccumulator; -template +template struct MemTraits; -template <> +template<> struct MemTraits { static constexpr double bytes_per_transaction = 32; using MemInfoType = GlobalMem; using Accumulator = GlobalAccessAccumulator; }; -template <> +template<> struct MemTraits { static constexpr double bytes_per_transaction = 128; using MemInfoType = SharedMem; using Accumulator = SharedAccessAccumulator; }; -template <> +template<> struct MemTraits { static constexpr double 
bytes_per_transaction = 32; - using MemInfoType = GlobalMem; // Local mem behaves similarly to global mem + using MemInfoType = GlobalMem; // Local mem behaves similarly to global mem using Accumulator = LocalAccessAccumulator; }; -template +template using Accumulator = typename MemTraits::Accumulator; -template +template struct MemInfo { static constexpr double bytes_per_transaction = MemTraits::bytes_per_transaction; @@ -66,7 +66,7 @@ struct MemInfo { double total_bytes = total_transactions * bytes_per_transaction; double total_bytes_used = num_requests * num_bytes_used_per_request; - internal_assert(total_bytes_used <= total_bytes) + internal_assert(total_bytes_used <= total_bytes) << "\ntotal_bytes_used = " << total_bytes_used << "\ntotal_bytes = " << total_bytes << "\ntotal_transactions = " << total_transactions @@ -76,7 +76,7 @@ struct MemInfo { update_totals(total_transactions, total_bytes_used, total_bytes); } - void add(const MemInfo& other) { + void add(const MemInfo &other) { total_num_transactions += other.total_num_transactions; total_num_bytes_used += other.total_num_bytes_used; total_num_bytes += other.total_num_bytes; @@ -104,7 +104,7 @@ struct MemInfo { double total_num_bytes = 0; }; -template +template using MemInfoType = MemInfo::MemInfoType>; using GlobalMemInfo = MemInfoType; @@ -113,11 +113,11 @@ using LocalMemInfo = MemInfoType; struct Strides { public: - Strides(const std::vector& storage_strides) - : storage_strides{storage_strides} - {} + Strides(const std::vector &storage_strides) + : storage_strides{storage_strides} { + } - void add_valid(const std::vector& strides) { + void add_valid(const std::vector &strides) { add(strides, true); } @@ -140,7 +140,7 @@ struct Strides { return std::abs(result); } - void dump(bool verbose=false) { + void dump(bool verbose = false) { if (!verbose) { return; } @@ -162,7 +162,7 @@ struct Strides { } private: - void add(const std::vector& strides, bool e) { + void add(const std::vector &strides, bool e) { 
index_strides.push_back(strides); is_valid.push_back(e); } @@ -173,12 +173,9 @@ struct Strides { }; struct GlobalAccessAccumulator { - GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides& strides, bool verbose) - : bytes_per_access{bytes_per_access} - , dimensions{dimensions} - , strides{strides} - , verbose{verbose} - {} + GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose) + : bytes_per_access{bytes_per_access}, dimensions{dimensions}, strides{strides}, verbose{verbose} { + } void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) { if (!active) { @@ -186,7 +183,7 @@ struct GlobalAccessAccumulator { } if (verbose) { - aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n"; + aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n"; } int thread_ids[3] = {x, y, z}; @@ -218,7 +215,7 @@ struct GlobalAccessAccumulator { } } - void add_access_info(int num_requests, GlobalMemInfo& global_mem_info, bool is_tail_warp) const { + void add_access_info(int num_requests, GlobalMemInfo &global_mem_info, bool is_tail_warp) const { int num_transactions_per_request = sectors_accessed.size() + unknown_sectors; if (verbose) { @@ -229,7 +226,7 @@ struct GlobalAccessAccumulator { } int num_bytes_used_per_request = 0; - for (const auto& sector : sectors_accessed) { + for (const auto &sector : sectors_accessed) { num_bytes_used_per_request += sector.second.size(); } @@ -245,8 +242,7 @@ struct GlobalAccessAccumulator { global_mem_info.add_access_info( num_requests, num_transactions_per_request, - num_bytes_used_per_request - ); + num_bytes_used_per_request); } private: @@ -259,12 +255,9 @@ struct GlobalAccessAccumulator { }; struct SharedAccessAccumulator { - SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides& strides, bool verbose) - : bytes_per_access{bytes_per_access} - , dimensions{dimensions} -
, strides{strides} - , verbose{verbose} - {} + SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose) + : bytes_per_access{bytes_per_access}, dimensions{dimensions}, strides{strides}, verbose{verbose} { + } void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) { if (!active) { @@ -272,7 +265,7 @@ struct SharedAccessAccumulator { } if (verbose) { - aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n"; + aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n"; } int thread_ids[3] = {x, y, z}; @@ -310,9 +303,9 @@ struct SharedAccessAccumulator { } } - void add_access_info(int num_requests, SharedMemInfo& shared_mem_info, bool is_tail_warp) const { + void add_access_info(int num_requests, SharedMemInfo &shared_mem_info, bool is_tail_warp) const { int num_transactions_per_request = 0; - for (const auto& bank : bank_to_words_accessed) { + for (const auto &bank : bank_to_words_accessed) { num_transactions_per_request = std::max(num_transactions_per_request, (int)bank.size()); } @@ -339,8 +332,7 @@ struct SharedAccessAccumulator { shared_mem_info.add_access_info( num_requests, num_transactions_per_request, - num_bytes_used_per_request - ); + num_bytes_used_per_request); } private: @@ -355,9 +347,8 @@ struct SharedAccessAccumulator { struct LocalAccessAccumulator { LocalAccessAccumulator(int bytes_per_access, bool verbose) - : bytes_per_access{bytes_per_access} - , verbose{verbose} - {} + : bytes_per_access{bytes_per_access}, verbose{verbose} { + } void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) { if (!active) { @@ -367,11 +358,11 @@ struct LocalAccessAccumulator { ++thread_count; if (verbose) { - aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n"; + aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n"; } } - void add_access_info(int 
num_requests, LocalMemInfo& local_mem_info, bool is_tail_warp) const { + void add_access_info(int num_requests, LocalMemInfo &local_mem_info, bool is_tail_warp) const { int num_bytes_used_per_request = thread_count * bytes_per_access; int sectors_accessed = std::ceil((float)num_bytes_used_per_request / (float)LocalMemInfo::bytes_per_transaction); int num_transactions_per_request = sectors_accessed; @@ -393,8 +384,7 @@ struct LocalAccessAccumulator { local_mem_info.add_access_info( num_requests, num_transactions_per_request, - num_bytes_used_per_request - ); + num_bytes_used_per_request); } private: @@ -404,9 +394,8 @@ struct LocalAccessAccumulator { std::unordered_map> sectors_accessed; }; - } // namespace Autoscheduler } // namespace Internal } // namespace Halide -#endif // GPU_MEM_INFO_H +#endif // GPU_MEM_INFO_H diff --git a/src/autoschedulers/anderson2021/LoopNest.cpp b/src/autoschedulers/anderson2021/LoopNest.cpp index b13fedb282ab..fcdc9e3ff5ee 100644 --- a/src/autoschedulers/anderson2021/LoopNest.cpp +++ b/src/autoschedulers/anderson2021/LoopNest.cpp @@ -101,7 +101,7 @@ bool are_valid_thread_extents(const vector &counts) { return true; } -bool all(const vector& v) { +bool all(const vector &v) { for (auto x : v) { if (!x) { return false; @@ -322,7 +322,7 @@ GPUMemoryType LoopNest::get_gpu_memory_type(bool in_block, bool in_thread, bool return GPUMemoryType::global; } -std::vector LoopNest::unrolled_loops(const Target& target, const LoopNest* parent, const LoopNest* grandparent) const { +std::vector LoopNest::unrolled_loops(const Target &target, const LoopNest *parent, const LoopNest *grandparent) const { internal_assert(innermost); const auto &grandparent_bounds = grandparent->get_bounds(node); std::vector unrolled(parent->size.size(), 0); @@ -348,8 +348,8 @@ std::vector LoopNest::unrolled_loops(const Target& target, const LoopNest* return unrolled; } -bool accessed_at_constant_indices(const std::vector& unrolled, const FunctionDAG::Edge* e) { - for (const 
auto& jac : e->load_jacobians) { +bool accessed_at_constant_indices(const std::vector &unrolled, const FunctionDAG::Edge *e) { + for (const auto &jac : e->load_jacobians) { for (size_t loop_index = 0; loop_index < unrolled.size(); ++loop_index) { for (int i = 0; i < e->producer->dimensions; ++i) { // There are two ways for an index to be constant: @@ -372,8 +372,8 @@ void LoopNest::get_allocs_that_can_be_promoted_to_registers(const Target &target const LoopNest *grandparent, const LoopNest *parent) const { - for (const auto* alloc_node : store_at) { - const auto& store_site = sites.get(&alloc_node->stages[0]); + for (const auto *alloc_node : store_at) { + const auto &store_site = sites.get(&alloc_node->stages[0]); if (store_site.gpu_store_memory_type != GPUMemoryType::local) { continue; } @@ -388,7 +388,7 @@ void LoopNest::get_allocs_that_can_be_promoted_to_registers(const Target &target if (innermost) { auto unrolled = unrolled_loops(target, parent, grandparent); - for (const auto* e : stage->incoming_edges) { + for (const auto *e : stage->incoming_edges) { if (sites.get(&e->producer->stages[0]).gpu_store_memory_type != GPUMemoryType::local) { continue; } @@ -436,7 +436,7 @@ void LoopNest::get_sites(const Target &target, sites.get_or_create(&s).allocation_size = alloc.first; sites.get_or_create(&s).is_constant_allocation = alloc.second; - const LoopNest* store_site = sites.get_or_create(&s).store; + const LoopNest *store_site = sites.get_or_create(&s).store; if (store_site->gpu_label == block && s.index == 0) { total_shared_mem_alloc_sizes.get_or_create(store_site->stage) += alloc.first; } @@ -465,13 +465,12 @@ bool LoopNest::promote_allocs_to_registers(const Target &target, StageMap NodeMap can_be_promoted_to_registers; get_allocs_that_can_be_promoted_to_registers(target, sites, can_be_promoted_to_registers, nullptr, nullptr); - - for (auto& node : can_be_promoted_to_registers) { + for (auto &node : can_be_promoted_to_registers) { if (!node.second) { return 
false; } - for (auto& stage : node.first->stages) { + for (auto &stage : node.first->stages) { internal_assert(sites.get(&stage).gpu_store_memory_type == GPUMemoryType::local); sites.get(&stage).gpu_store_memory_type = GPUMemoryType::registers; } @@ -642,7 +641,7 @@ bool LoopNest::can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, c return true; } -bool LoopNest::can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType& mem_type) const { +bool LoopNest::can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType &mem_type) const { if (loop_index < 0 || mem_type != GPUMemoryType::shared) { return false; } @@ -651,7 +650,7 @@ bool LoopNest::can_vectorize_store_access(const LoadJacobian &jac, const Functio return can_vectorize_access_for_innermost_dim(jac, accessed, innermost_dim, loop_index); } -int LoopNest::vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType& mem_type, bool verbose) const { +int LoopNest::vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType &mem_type, bool verbose) const { int vector_size = 1; if (mem_type != GPUMemoryType::shared) { return vector_size; @@ -732,7 +731,7 @@ double LoopNest::compute_local_mem_stride(double stride, double bytes) const { // Get the stride over "node's" storage and its element-wise stride for a unit // increment in the given thread loops -Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo& thread_info, bool verbose) const { +Strides 
LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo &thread_info, bool verbose) const { internal_assert(innermost_storage_dim >= 0); if (verbose) { @@ -770,7 +769,7 @@ Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage } Strides strides{storage_strides}; - for (const auto& thread_loop_var : thread_info.loop_vars) { + for (const auto &thread_loop_var : thread_info.loop_vars) { int loop_index = stage->get_loop_index_from_var(thread_loop_var); bool loop_index_exists = loop_index >= 0; @@ -852,7 +851,7 @@ int LoopNest::get_actual_vector_dim(const Bound &store_bounds) const { return vector_dim; } -void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo& global_mem_loads, SharedMemInfo& shared_mem_loads, LocalMemInfo& local_mem_loads, bool verbose) const { +void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo &global_mem_loads, SharedMemInfo &shared_mem_loads, LocalMemInfo &local_mem_loads, bool verbose) const { if (consumer_site.is_stored_in_registers()) { return; } @@ -895,7 +894,7 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ mem_type = "local"; } aslog(2) << "BEGIN MEM ACCESS " << mem_type << "_mem_" << type; - aslog(2) << ". 
consumer: " << consumer_name << "_s" << stage->index << "; producer: " << consumer_name << "\n"; + aslog(2) << ". consumer: " << consumer_name << "_s" << stage->index << "; producer: " << consumer_name << "\n"; aslog(2) << "total_serial_loop_extents = " << total_serial_loop_extents << "\n"; } @@ -911,8 +910,7 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ consumer_store_bounds, thread_info, total_serial_loop_extents, - verbose - ); + verbose); feat.num_shared_mem_stores_per_block = shared_mem_info.num_transactions(); if (stage->index > 0) { @@ -934,8 +932,7 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ consumer_store_bounds, thread_info, total_serial_loop_extents, - verbose - ); + verbose); feat.num_global_mem_stores_per_block = global_mem_info.num_transactions(); if (stage->index > 0) { @@ -953,8 +950,7 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ consumer_store_bounds, thread_info, total_serial_loop_extents, - verbose - ); + verbose); //feat.num_local_mem_stores_per_block = local_mem_info.num_transactions(); if (stage->index > 0) { local_mem_loads.add(local_mem_info); @@ -978,13 +974,12 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ aslog(2) << "END MEM ACCESS " << mem_type << "_mem_" << type << ". 
consumer: " << consumer_name << "_s" << stage->index << "; producer: " << consumer_name; if (!jac.all_coeffs_exist()) { aslog(2) << " (not all coeffs exist)"; - } aslog(2) << "\n\n"; } } -template +template void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const { Strides strides = compute_strides(jac, innermost_dim, node, store_bounds, thread_info, verbose); @@ -1001,8 +996,7 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const accumulator.add_access_info( num_requests, mem_info, - false - ); + false); if (verbose) { aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n"; @@ -1025,21 +1019,18 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const accumulator.add_access_info( num_requests_per_warp, mem_info, - true - ); + true); if (verbose) { aslog(2) << "END tail warp\n\n"; } } -template -void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const; +template void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const; -template -void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const; +template void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound 
&store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const; -template <> +template<> void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const { int bytes_per_access = node->bytes_per_point; @@ -1051,8 +1042,7 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian & accumulator.add_access_info( num_requests, mem_info, - false - ); + false); if (verbose) { aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n"; @@ -1075,8 +1065,7 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian & accumulator.add_access_info( num_requests_per_warp, mem_info, - true - ); + true); if (verbose) { aslog(2) << "END tail warp\n\n"; @@ -1097,21 +1086,19 @@ std::pair LoopNest::compute_local_mem_store_features(const LoadJ return {accesses, 1.0 / stride}; } -template -MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian& jac, int consumer_innermost_dim, const FunctionDAG::Node* node, const Bound& consumer_store_bounds, const ThreadInfo& thread_info, double serial_loop_extents, bool verbose) const { +template +MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const { MemInfoType mem_info; compute_num_mem_accesses_per_block(jac, node, consumer_store_bounds, thread_info, consumer_innermost_dim, serial_loop_extents, mem_info, verbose); return mem_info; } -template -MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian& jac, int consumer_innermost_dim, const FunctionDAG::Node* node, const Bound& consumer_store_bounds, const ThreadInfo& thread_info, double 
serial_loop_extents, bool verbose) const; +template MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const; -template -MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian& jac, int consumer_innermost_dim, const FunctionDAG::Node* node, const Bound& consumer_store_bounds, const ThreadInfo& thread_info, double serial_loop_extents, bool verbose) const; +template MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const; -template +template void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const { if (producer_has_been_scheduled) { compute_num_mem_accesses_per_block(jac, node, producer_store_bounds, thread_info, producer_innermost_dim, points_accessed_per_thread, mem_info, verbose); @@ -1136,13 +1123,11 @@ void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_i mem_info.add(min_info); } -template -void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const; +template void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo 
&thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const; -template -void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const; +template void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const; -template <> +template<> void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const { compute_num_mem_accesses_per_block(jac, node, producer_store_bounds, thread_info, producer_innermost_dim, points_accessed_per_thread, mem_info, verbose); } @@ -1240,7 +1225,7 @@ void LoopNest::compute_warp_features(ScheduleFeatures &features, const GPULoopIn } // Assume that when a block is active, all its warps are active -void LoopNest::compute_warp_and_block_occupancy(const MachineParams& params, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const { +void LoopNest::compute_warp_and_block_occupancy(const MachineParams ¶ms, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const { // Only compute these features for stage's that actually have a block // loop if (node != gpu_loop_info.current_block_loop->node) { @@ -1310,7 +1295,7 @@ std::pair LoopNest::find_innermost_and_paren return {child, parent}; } -int64_t LoopNest::points_accessed_per_thread(const MachineParams& params, const Target& target, const 
GPULoopInfo &gpu_loop_info, const std::vector& edge_chain, const LoadJacobian& jac, const LoopNest* parent, const LoopNest* grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian& serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType& mem_type, bool verbose) const { +int64_t LoopNest::points_accessed_per_thread(const MachineParams ¶ms, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose) const { std::unique_ptr innermost_parent_clone = std::make_unique(); innermost_parent_clone->copy_from(*parent); @@ -1332,7 +1317,7 @@ int64_t LoopNest::points_accessed_per_thread(const MachineParams& params, const int64_t product_of_non_licm_non_unrolled_extents = 1; int64_t product_of_non_licm_extents = 1; int num_pure_loops = 0; - const FunctionDAG::Node* producer = edge_chain.back()->producer; + const FunctionDAG::Node *producer = edge_chain.back()->producer; for (size_t idx = 0; idx < parent->size.size(); idx++) { bool can_apply_licm = true; for (int i = 0; i < producer->dimensions; i++) { @@ -1369,14 +1354,14 @@ int64_t LoopNest::points_accessed_per_thread(const MachineParams& params, const } } } else if (verbose) { - // Case 1 + // Case 1 aslog(2) << "loop idx = " << idx << ": apply licm = " << parent->size[idx] << "\n"; } } IntrusivePtr innermost_parent = innermost_parent_clone->parallelize_in_tiles(params, tiling, grandparent, target, true, false, false, rvars_to_move_inward); - const auto& bounds = innermost_parent->get_bounds_along_edge_chain(producer, edge_chain); + const auto &bounds = innermost_parent->get_bounds_along_edge_chain(producer, edge_chain); int64_t num_points = 1; for (int i = 0; i < producer->dimensions; i++) { 
num_points *= bounds->region_required(i).extent(); @@ -1417,8 +1402,7 @@ int64_t LoopNest::points_accessed_per_thread(const MachineParams& params, const producer_has_been_scheduled, producer_innermost_dim, mem_type, - verbose - ); + verbose); if (verbose) { aslog(2) << "\n"; @@ -1495,7 +1479,7 @@ int64_t LoopNest::compute_licm_amortization(const LoopNest *innermost, const Loo return amortization; } -void LoopNest::memoize_points_computed_minimum(StageMap& memoized_features, const StageMap *features) const { +void LoopNest::memoize_points_computed_minimum(StageMap &memoized_features, const StageMap *features) const { for (auto it = inlined.begin(); it != inlined.end(); it++) { const auto *f = it.key(); const auto &inlined_feat = features->get(&(f->stages[0])); @@ -1532,9 +1516,9 @@ vector> LoopNest::collect_producers(const StageMap &sites) done.insert(e->producer); const auto &site = sites.get(&(e->producer->stages[0])); if (site.store->is_root()) { - int vector_dim = (e->producer->is_input ? 0 : - site.produce != nullptr ? site.produce->vector_dim : - -1); + int vector_dim = (e->producer->is_input ? 0 : + site.produce != nullptr ? site.produce->vector_dim : + -1); producers.push_back({e->producer->id, vector_dim}); } else if (site.produce != nullptr) { // Computation must be nested inside this task or inlined into it. 
@@ -1553,12 +1537,12 @@ uint64_t LoopNest::compute_hash_of_producers_stored_at_root(const StageMap> producers = collect_producers(sites); // Sort them according to node id - std::sort(producers.begin(), producers.end(), [](const pair& a, const pair& b) { + std::sort(producers.begin(), producers.end(), [](const pair &a, const pair &b) { return a.first < b.first; }); uint64_t store_root_hash = 0; - for (const auto& p : producers) { + for (const auto &p : producers) { hash_combine(store_root_hash, p.first); hash_combine(store_root_hash, p.second); } @@ -1566,7 +1550,7 @@ uint64_t LoopNest::compute_hash_of_producers_stored_at_root(const StageMap& stages) const { +void LoopNest::collect_stages(std::set &stages) const { stages.insert(stage); for (const auto &c : children) { @@ -1574,7 +1558,7 @@ void LoopNest::collect_stages(std::set& stages } } -void LoopNest::memoize_features(StageMap& memoized_features, const StageMap *features) const { +void LoopNest::memoize_features(StageMap &memoized_features, const StageMap *features) const { for (auto it = inlined.begin(); it != inlined.end(); it++) { const auto *f = it.key(); if (memoized_features.contains(&(f->stages[0]))) { @@ -1595,7 +1579,7 @@ void LoopNest::memoize_features(StageMap& memoized_features, c } void LoopNest::compute_working_set_from_features(int64_t *working_set, - const StageMap *features) const { + const StageMap *features) const { int64_t working_set_here = 0; for (const auto &c : children) { @@ -1625,8 +1609,8 @@ void LoopNest::recompute_inlined_features(const StageMap &sites, StageMap uint64_t hash_of_producers = sites.get(block->stage).hash_of_producers_stored_at_root; internal_assert(block->feature_intermediates.count(hash_of_producers) > 0); - auto& intermediate_map = block->feature_intermediates[hash_of_producers].get(&(f->stages[0])); - auto& intermediate = intermediate_map.get(stage); + auto &intermediate_map = block->feature_intermediates[hash_of_producers].get(&(f->stages[0])); + auto 
&intermediate = intermediate_map.get(stage); auto &inlined_feat = features->get(&(f->stages[0])); inlined_feat.inlined_calls += intermediate.inlined_calls; @@ -1678,7 +1662,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, GPULoopInfo gpu_loop_info, bool use_memoized_features, const StageMap &total_shared_mem_alloc_sizes, - Statistics& stats, + Statistics &stats, bool verbose) const { gpu_loop_info.update(target, this); @@ -1790,7 +1774,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, if (c->features.count(hash_of_producers) > 0) { ++stats.num_memoization_hits; - const auto& entry = c->features.at(hash_of_producers); + const auto &entry = c->features.at(hash_of_producers); for (auto it = entry.begin(); it != entry.end(); it++) { auto &stage = *(it.key()); const auto &feat = it.value(); @@ -2121,7 +2105,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, int64_t global_bytes_loaded = 0, shared_bytes_loaded = 0, local_bytes_loaded = 0, register_bytes_loaded = 0; int64_t global_lines_loaded = 0, shared_lines_loaded = 0, local_lines_loaded = 0, register_lines_loaded = 0; int64_t global_bytes_loaded_per_thread = 0, shared_bytes_loaded_per_thread = 0, register_bytes_loaded_per_thread = 0; - int64_t global_lines_loaded_per_thread = 0, shared_lines_loaded_per_thread = 0, register_lines_loaded_per_thread = 0;; + int64_t global_lines_loaded_per_thread = 0, shared_lines_loaded_per_thread = 0, register_lines_loaded_per_thread = 0; + ; int64_t global_allocation_bytes_loaded = 0, shared_allocation_bytes_loaded = 0; GlobalMemInfo global_mem_loads; SharedMemInfo shared_mem_loads; @@ -2155,8 +2140,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, global_mem_loads, shared_mem_loads, local_mem_loads, - verbose - ); + verbose); } // The parallel loop of the consumer @@ -2165,8 +2149,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, int64_t consumer_instances = innermost ? 
instances : feat.num_realizations; internal_assert(consumer_instances != 0); - vector>> pending; - vector edge_chain; + vector>> pending; + vector edge_chain; pending.emplace_back(stage, edge_chain); vector> jacobians; vector> thread_jacobians; @@ -2187,7 +2171,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, bool producer_has_been_scheduled = e->producer->is_input || (site.produce != nullptr); - std::vector edge_chain = p_pair.second; + std::vector edge_chain = p_pair.second; edge_chain.push_back(e); if (innermost) { @@ -2293,7 +2277,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, sanitize_names(consumer_name); std::string producer_name = e->producer->func.name(); sanitize_names(producer_name); - aslog(2) << "BEGIN MEM ACCESS shared_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name <<"\n"; + aslog(2) << "BEGIN MEM ACCESS shared_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; } int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::shared, verbose); @@ -2307,8 +2291,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, *gpu_loop_info.thread_info, shared_mem_loads, points_accessed, - verbose - ); + verbose); if (verbose) { aslog(2) << "num_blocks = " << gpu_loop_info.num_blocks << "\n"; aslog(2) << "END MEM ACCESS shared_mem_load. consumer: " << node->func.name() << "; producer: " << e->producer->func.name(); @@ -2325,7 +2308,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, sanitize_names(consumer_name); std::string producer_name = e->producer->func.name(); sanitize_names(producer_name); - aslog(2) << "BEGIN MEM ACCESS global_mem_load. 
consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name <<"\n"; + aslog(2) << "BEGIN MEM ACCESS global_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; } int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::global, verbose); @@ -2339,8 +2322,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, *gpu_loop_info.thread_info, global_mem_loads, points_accessed, - verbose - ); + verbose); if (verbose) { aslog(2) << "num_blocks = " << gpu_loop_info.num_blocks << "\n"; @@ -2364,7 +2346,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, sanitize_names(consumer_name); std::string producer_name = e->producer->func.name(); sanitize_names(producer_name); - aslog(2) << "BEGIN MEM ACCESS local_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name <<"\n"; + aslog(2) << "BEGIN MEM ACCESS local_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; } int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::local, verbose); @@ -2378,8 +2360,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, *gpu_loop_info.thread_info, local_mem_loads, points_accessed, - verbose - ); + verbose); if (verbose) { aslog(2) << "num_blocks = " << gpu_loop_info.num_blocks << "\n"; @@ -2400,7 +2381,6 @@ void LoopNest::compute_features(const FunctionDAG &dag, done.insert(e->producer); - // Now look at the shapes of the regions read from // the producer at various sites. 
int64_t max_extent = 1, max_thread_extent = 1, max_compute_extent = 1, max_store_extent = 1, max_task_extent = 1; @@ -2464,9 +2444,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, if (!e->producer->is_input) { const int64_t producer_store_instances = - producer_has_been_scheduled - ? features->get_or_create(&(e->producer->stages[0])).num_realizations - : site.num_realizations; + producer_has_been_scheduled ? features->get_or_create(&(e->producer->stages[0])).num_realizations : site.num_realizations; internal_assert(producer_store_instances > 0); @@ -2662,8 +2640,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, if (use_memoized_features) { const auto &block = sites.get(stage).task; uint64_t hash_of_producers = sites.get(block->stage).hash_of_producers_stored_at_root; - auto& intermediate_map = block->feature_intermediates[hash_of_producers].get_or_create(&(f->stages[0])); - auto& intermediate = intermediate_map.get_or_create(stage); + auto &intermediate_map = block->feature_intermediates[hash_of_producers].get_or_create(&(f->stages[0])); + auto &intermediate = intermediate_map.get_or_create(stage); intermediate.inlined_calls = it.value() * subinstances; intermediate.num_scalars = it.value() * feat.num_scalars; @@ -2700,7 +2678,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, // required of 'g' should be 1 point for each point of 'out' but get_bounds() // will also include the edge 'g' -> 'f' and give the result 201 points for every point // of 'out') -const Bound LoopNest::get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector& edge_chain) const { +const Bound LoopNest::get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector &edge_chain) const { internal_assert(edge_chain.size() >= 1); internal_assert(edge_chain[0]->consumer == stage) @@ -2712,17 +2690,17 @@ const Bound LoopNest::get_bounds_along_edge_chain(const FunctionDAG::Node *f, co << " not " << f->func.name(); vector bounds; - BoundContents* 
bound; + BoundContents *bound; // For the final consumer, we rely on get_bounds() (i.e. on the bounds for it to // satisfy all of its downstream consumers instead of just along a single edge). This should be // okay because it is computed in the current loop nest so its bounds need // to account for all its downstream consumers. - const auto& c_bounds = get_bounds(edge_chain[0]->consumer->node); + const auto &c_bounds = get_bounds(edge_chain[0]->consumer->node); Bound cur_consumer_bounds = c_bounds; - for (const auto* e : edge_chain) { - const auto* producer = e->producer; + for (const auto *e : edge_chain) { + const auto *producer = e->producer; bound = producer->make_bound(); auto init = Span::empty_span(); @@ -2834,8 +2812,8 @@ std::string LoopNest::to_string() const { } // Recursively print a loop nest representation to the given stream -template -void LoopNest::dump(T& stream, string prefix, const LoopNest *parent) const { +template +void LoopNest::dump(T &stream, string prefix, const LoopNest *parent) const { if (!is_root()) { // Non-root nodes always have parents. 
internal_assert(parent != nullptr); @@ -2913,11 +2891,9 @@ void LoopNest::dump(T& stream, string prefix, const LoopNest *parent) const { } } -template -void LoopNest::dump(aslog& stream, string prefix, const LoopNest *parent) const; +template void LoopNest::dump(aslog &stream, string prefix, const LoopNest *parent) const; -template -void LoopNest::dump(std::ostringstream& stream, string prefix, const LoopNest *parent) const; +template void LoopNest::dump(std::ostringstream &stream, string prefix, const LoopNest *parent) const; // Does this loop nest access the given Func bool LoopNest::calls(const FunctionDAG::Node *f) const { @@ -3396,7 +3372,8 @@ vector> LoopNest::compute_in_tiles(const FunctionDA (!in_realization || size.empty() || vector_dim == -1 || - size[vector_dim] == 1) && can_compute_here) { + size[vector_dim] == 1) && + can_compute_here) { std::unique_ptr r{new LoopNest}; r->copy_from(*this); @@ -3466,7 +3443,7 @@ int64_t LoopNest::product_of_self_and_descendants(int loop_index) const { int64_t LoopNest::product_of_descendants(int loop_index) const { int64_t prod = 1; - const LoopNest* cur = this; + const LoopNest *cur = this; while (!cur->innermost) { bool found = false; for (const auto &c : cur->children) { @@ -3486,8 +3463,8 @@ int64_t LoopNest::product_of_descendants(int loop_index) const { return prod; } -bool LoopNest::has_constant_region_computed(const FunctionDAG::Node* node) const { - const auto& bounds = get_bounds(node); +bool LoopNest::has_constant_region_computed(const FunctionDAG::Node *node) const { + const auto &bounds = get_bounds(node); for (int i = 0; i < node->dimensions; i++) { if (!bounds->region_computed(i).constant_extent()) { return false; @@ -3496,8 +3473,8 @@ bool LoopNest::has_constant_region_computed(const FunctionDAG::Node* node) const return true; } -bool LoopNest::has_constant_region_required(const FunctionDAG::Node* node) const { - const auto& bounds = get_bounds(node); +bool LoopNest::has_constant_region_required(const 
FunctionDAG::Node *node) const { + const auto &bounds = get_bounds(node); for (int i = 0; i < node->dimensions; i++) { if (!bounds->region_required(i).constant_extent()) { return false; @@ -3506,8 +3483,8 @@ bool LoopNest::has_constant_region_required(const FunctionDAG::Node* node) const return true; } -bool LoopNest::other_stage_has_same_producer(const FunctionDAG::Node* producer) const { - for (const auto& other_stage : node->stages) { +bool LoopNest::other_stage_has_same_producer(const FunctionDAG::Node *producer) const { + for (const auto &other_stage : node->stages) { if (stage->index == other_stage.index) { continue; } @@ -3521,7 +3498,7 @@ bool LoopNest::other_stage_has_same_producer(const FunctionDAG::Node* producer) return false; } -int LoopNest::num_serial_loops(const FunctionDAG::Node::Stage* stage) const { +int LoopNest::num_serial_loops(const FunctionDAG::Node::Stage *stage) const { int num_serial_loops = 0; for (const auto &child : children) { if (child->stage == stage) { @@ -3545,7 +3522,7 @@ int LoopNest::num_serial_loops() const { return num_serial_loops(stage); } -bool LoopNest::producer_computed_here_or_further_in(const FunctionDAG::Node* producer) const { +bool LoopNest::producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const { for (const auto &child : children) { if (child->node == producer) { return true; @@ -3588,7 +3565,7 @@ void LoopNest::apply(LoopLevel here, const LoopNest *compute_site, const Target &target, std::vector &ancestors, - const NodeMap& all_inlined) const { + const NodeMap &all_inlined) const { if (is_root()) { for (auto &c : children) { Func(c->node->func).compute_root(); @@ -3897,9 +3874,9 @@ void LoopNest::apply(LoopLevel here, } } -void LoopNest::update_producers_to_be_staged(StageScheduleState& state, const NodeMap& all_inlined) const { - std::vector>> pending; - std::vector edge_chain; +void LoopNest::update_producers_to_be_staged(StageScheduleState &state, const NodeMap &all_inlined) const { + 
std::vector>> pending; + std::vector edge_chain; pending.emplace_back(stage, edge_chain); NodeMap done; @@ -3907,10 +3884,10 @@ void LoopNest::update_producers_to_be_staged(StageScheduleState& state, const No auto cur_pair = pending.back(); pending.pop_back(); - auto* s = cur_pair.first; + auto *s = cur_pair.first; for (const auto *e : s->incoming_edges) { - std::vector edge_chain = cur_pair.second; + std::vector edge_chain = cur_pair.second; edge_chain.push_back(e); // If the producer is inlined, then its producers should potentially be @@ -3959,7 +3936,7 @@ double LoopNest::max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loo } bool LoopNest::has_valid_thread_extents() const { - for (const auto& c : children) { + for (const auto &c : children) { if (!are_valid_thread_extents(c->get_union_thread_counts(nullptr))) { return false; } @@ -3968,7 +3945,7 @@ bool LoopNest::has_valid_thread_extents() const { return true; } -void LoopNest::collect_nodes_that_should_be_inlined(const NodeMap& nodes_to_freeze, NodeMap& inlined_nodes) const { +void LoopNest::collect_nodes_that_should_be_inlined(const NodeMap &nodes_to_freeze, NodeMap &inlined_nodes) const { if (innermost) { for (auto it = inlined.begin(); it != inlined.end(); it++) { const auto *f = it.key(); @@ -3979,12 +3956,12 @@ void LoopNest::collect_nodes_that_should_be_inlined(const NodeMap& nodes_t } } - for (const auto& c : children) { + for (const auto &c : children) { c->collect_nodes_that_should_be_inlined(nodes_to_freeze, inlined_nodes); } } -void LoopNest::collect_all_inlined(NodeMap& all_inlined) const { +void LoopNest::collect_all_inlined(NodeMap &all_inlined) const { if (innermost) { for (auto it = inlined.begin(); it != inlined.end(); it++) { const auto *f = it.key(); @@ -3992,7 +3969,7 @@ void LoopNest::collect_all_inlined(NodeMap& all_inlined) const { } } - for (const auto& c : children) { + for (const auto &c : children) { c->collect_all_inlined(all_inlined); } } diff --git 
a/src/autoschedulers/anderson2021/LoopNest.h b/src/autoschedulers/anderson2021/LoopNest.h index de670214dc3b..d5fc9070e6c5 100644 --- a/src/autoschedulers/anderson2021/LoopNest.h +++ b/src/autoschedulers/anderson2021/LoopNest.h @@ -6,14 +6,14 @@ #ifndef LOOP_NEST_H #define LOOP_NEST_H +#include "ASLog.h" #include "FunctionDAG.h" -#include "GPUMemInfo.h" #include "GPULoopInfo.h" +#include "GPUMemInfo.h" #include "PerfectHashMap.h" #include "SearchSpaceOptions.h" #include "Statistics.h" #include "ThreadInfo.h" -#include "ASLog.h" #include "Tiling.h" #include #include @@ -28,10 +28,19 @@ using NodeMap = PerfectHashMap; template using StageMap = PerfectHashMap; -enum GPU_parallelism { block, thread, serial, simd, parallelized, none }; +enum GPU_parallelism { block, + thread, + serial, + simd, + parallelized, + none }; // inlined => func is inlined so has no memory store location -enum class GPUMemoryType { global, shared, local, registers, inlined }; +enum class GPUMemoryType { global, + shared, + local, + registers, + inlined }; bool may_subtile(); @@ -45,17 +54,17 @@ constexpr int64_t get_register_mem_alloc_limit() { return 128; } -int get_unroll_limit(const Target& target); +int get_unroll_limit(const Target &target); bool in_range_zero_one(double x); -bool are_valid_thread_extents(const vector& counts); +bool are_valid_thread_extents(const vector &counts); double get_idle_lane_wastage_limit_env_var(); double get_idle_lane_wastage_limit(); -bool all(const vector& v); -bool accessed_at_constant_indices(const std::vector& unrolled, const FunctionDAG::Edge* e); +bool all(const vector &v); +bool accessed_at_constant_indices(const std::vector &unrolled, const FunctionDAG::Edge *e); // We're going to do a tree search over possible schedules to find an // optimal one. 
A tree search requires a state, and a function that @@ -127,15 +136,15 @@ struct LoopNest { mutable std::map>> feature_intermediates; mutable std::map> features; - bool is_gpu_serial(const Target& target) const { + bool is_gpu_serial(const Target &target) const { return target.has_gpu_feature() && gpu_label == serial; } - bool is_gpu_thread(const Target& target) const { + bool is_gpu_thread(const Target &target) const { return target.has_gpu_feature() && gpu_label == thread; } - bool is_gpu_block(const Target& target) const { + bool is_gpu_block(const Target &target) const { return target.has_gpu_feature() && gpu_label == block; } @@ -194,29 +203,37 @@ struct LoopNest { // All of a stage's interesting locations in the loop nest. Used to help compute the featurization of a stage. struct Sites { - const LoopNest *compute = nullptr; // Its containing compute_at site - const LoopNest *store = nullptr; // Its containing store_at site - const LoopNest *produce = nullptr; // Its own outermost node - const LoopNest *innermost = nullptr; // Its innermost node - usually a SIMD loop - const LoopNest *task = nullptr; // The parallel for loop it belongs to - const LoopNest *thread = nullptr; // Its containing gpu_thread loop - GPUMemoryType gpu_store_memory_type; // global, local, shared? - int64_t allocation_size = 0; // Allocation size in bytes - bool is_constant_allocation = false; // Does the allocation have constant size? - int64_t num_realizations = 0; // Number of times this stage is realized. Only valid for unscheduled producers - bool inlined = false; // Is the Func inlined? - std::vector inlined_innermosts; // Is the Func inlined? 
+ const LoopNest *compute = nullptr; // Its containing compute_at site + const LoopNest *store = nullptr; // Its containing store_at site + const LoopNest *produce = nullptr; // Its own outermost node + const LoopNest *innermost = nullptr; // Its innermost node - usually a SIMD loop + const LoopNest *task = nullptr; // The parallel for loop it belongs to + const LoopNest *thread = nullptr; // Its containing gpu_thread loop + GPUMemoryType gpu_store_memory_type; // global, local, shared? + int64_t allocation_size = 0; // Allocation size in bytes + bool is_constant_allocation = false; // Does the allocation have constant size? + int64_t num_realizations = 0; // Number of times this stage is realized. Only valid for unscheduled producers + bool inlined = false; // Is the Func inlined? + std::vector inlined_innermosts; // Is the Func inlined? uint64_t hash_of_producers_stored_at_root; - bool is_stored_in_global_mem() const { return gpu_store_memory_type == GPUMemoryType::global; } - bool is_stored_in_shared_mem() const { return gpu_store_memory_type == GPUMemoryType::shared; } - bool is_stored_in_local_mem() const { return gpu_store_memory_type == GPUMemoryType::local; } - bool is_stored_in_registers() const { return gpu_store_memory_type == GPUMemoryType::registers; } + bool is_stored_in_global_mem() const { + return gpu_store_memory_type == GPUMemoryType::global; + } + bool is_stored_in_shared_mem() const { + return gpu_store_memory_type == GPUMemoryType::shared; + } + bool is_stored_in_local_mem() const { + return gpu_store_memory_type == GPUMemoryType::local; + } + bool is_stored_in_registers() const { + return gpu_store_memory_type == GPUMemoryType::registers; + } }; - GPUMemoryType get_gpu_memory_type(bool in_block, bool in_thread, bool is_inlined=false) const; + GPUMemoryType get_gpu_memory_type(bool in_block, bool in_thread, bool is_inlined = false) const; - std::vector unrolled_loops(const Target& target, const LoopNest* parent, const LoopNest* grandparent) 
const; + std::vector unrolled_loops(const Target &target, const LoopNest *parent, const LoopNest *grandparent) const; void get_allocs_that_can_be_promoted_to_registers(const Target &target, StageMap &sites, @@ -227,7 +244,7 @@ struct LoopNest { bool promote_allocs_to_registers(const Target &target, StageMap &sites) const; // Compute all the sites of interest for each pipeline stage - void get_sites(const Target& target, + void get_sites(const Target &target, StageMap &sites, StageMap &shared_mem_alloc_sizes, const LoopNest *task = nullptr, @@ -245,85 +262,85 @@ struct LoopNest { } } - bool exceeds_serial_extents_limit(const Target& target, const LoopNest* parent, bool in_threads_loop) const; + bool exceeds_serial_extents_limit(const Target &target, const LoopNest *parent, bool in_threads_loop) const; - bool node_has_dynamic_region_computed(const FunctionDAG::Node* f) const; + bool node_has_dynamic_region_computed(const FunctionDAG::Node *f) const; bool has_dynamic_allocation_inside_thread(bool in_thread_loop) const; - const LoopNest* find_pure_stage_loop_nest(const FunctionDAG::Node* node) const; + const LoopNest *find_pure_stage_loop_nest(const FunctionDAG::Node *node) const; - int get_pure_stage_vectorized_loop_index(const FunctionDAG::Node* node) const; + int get_pure_stage_vectorized_loop_index(const FunctionDAG::Node *node) const; - int get_vectorized_loop_index_from_pure_stage(const LoopNest& root) const; + int get_vectorized_loop_index_from_pure_stage(const LoopNest &root) const; // Get the stride over "node's" storage for a unit increment in the vectorized loop's // index - double storage_stride(const LoadJacobian& jac, int innermost_storage_dim, const FunctionDAG::Node* storage_node, const Bound& store_bounds, const LoopNest& root) const; + double storage_stride(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const LoopNest &root) const; - Strides compute_strides(const LoadJacobian &jac, 
int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo& thread_info, bool verbose=false) const; + Strides compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo &thread_info, bool verbose = false) const; - bool all_strides_exist(const LoadJacobian& jac, const FunctionDAG::Node* storage_node, const LoopNest& root) const; + bool all_strides_exist(const LoadJacobian &jac, const FunctionDAG::Node *storage_node, const LoopNest &root) const; int get_actual_vector_dim(const Bound &store_bounds) const; - void compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo& global_mem_loads, SharedMemInfo& shared_mem_loads, LocalMemInfo& local_mem_loads, bool verbose=false) const; + void compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo &global_mem_loads, SharedMemInfo &shared_mem_loads, LocalMemInfo &local_mem_loads, bool verbose = false) const; bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, const FunctionDAG::Node *accessed, int innermost_dim, int loop_index) const; - bool can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType& mem_type) const; + bool can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node 
*accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType &mem_type) const; - int vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType& mem_type, bool verbose=false) const; + int vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const; - int vectorized_access_size(size_t loop_index, bool verbose=false) const; + int vectorized_access_size(size_t loop_index, bool verbose = false) const; - template - void compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose=false) const; + template + void compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose = false) const; - std::pair compute_local_mem_store_features(const LoadJacobian& jac, int consumer_innermost_dim, const FunctionDAG::Node* node, const Bound& consumer_store_bounds, const LoopNest& root, double serial_loop_extents) const; + std::pair compute_local_mem_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const LoopNest &root, double serial_loop_extents) const; - template - MemInfoType compute_mem_store_info(const LoadJacobian& jac, int consumer_innermost_dim, const FunctionDAG::Node* node, const Bound& consumer_store_bounds, const ThreadInfo& thread_info, double serial_loop_extents, bool verbose) const; + template + MemInfoType compute_mem_store_info(const LoadJacobian &jac, int 
consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const; - template - void compute_mem_load_features(const LoadJacobian& jac, int producer_innermost_dim, const FunctionDAG::Node* node, const Bound& producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo& thread_info, MemInfoType& mem_info, double serial_loop_extents, bool verbose=false) const; + template + void compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType &mem_info, double serial_loop_extents, bool verbose = false) const; double compute_local_mem_stride(double stride, double bytes) const; // Assumes block, serial, thread or block, thread nesting - const LoopNest* get_enclosing_block(const LoopNest *parent, const LoopNest *grandparent) const; + const LoopNest *get_enclosing_block(const LoopNest *parent, const LoopNest *grandparent) const; - std::pair get_block_and_serial_extents(const LoopNest* block) const; + std::pair get_block_and_serial_extents(const LoopNest *block) const; bool all_paths_to_leaves_have_thread_loop() const; bool has_thread_loop_descendant() const; - void compute_warp_features(ScheduleFeatures& features, const GPULoopInfo& gpu_loop_info) const; + void compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const; // Assume that when a block is active, all its warps are active - void compute_warp_and_block_occupancy(const MachineParams& params, ScheduleFeatures &feat, const GPULoopInfo& gpu_loop_info) const; + void compute_warp_and_block_occupancy(const MachineParams ¶ms, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const; - void compute_shared_mem_occupancy(const Target& target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const; + void 
compute_shared_mem_occupancy(const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const; - std::pair find_innermost_and_parent() const; + std::pair find_innermost_and_parent() const; - int64_t points_accessed_per_thread(const MachineParams& params, const Target& target, const GPULoopInfo &gpu_loop_info, const std::vector& edge_chain, const LoadJacobian& jac, const LoopNest* parent, const LoopNest* grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian& serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType& mem_type, bool verbose=false) const; + int64_t points_accessed_per_thread(const MachineParams ¶ms, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const; - int64_t compute_licm_amortization(const LoopNest* innermost, const LoopNest* parent, const ScheduleFeatures& feat, const LoadJacobian& jac, int producer_dims) const; + int64_t compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const; - void memoize_points_computed_minimum(StageMap& memoized_features, const StageMap *features) const; + void memoize_points_computed_minimum(StageMap &memoized_features, const StageMap *features) const; vector> collect_producers(const StageMap &sites) const; uint64_t compute_hash_of_producers_stored_at_root(const StageMap &sites) const; - void collect_stages(std::set& stages) const; + void collect_stages(std::set &stages) const; - void memoize_features(StageMap& memoized_features, const StageMap *features) const; + void memoize_features(StageMap &memoized_features, const StageMap *features) const; 
void compute_working_set_from_features(int64_t *working_set, - const StageMap *features) const; + const StageMap *features) const; void recompute_inlined_features(const StageMap &sites, StageMap *features) const; @@ -332,7 +349,7 @@ struct LoopNest { // Do a recursive walk over the loop nest computing features to feed the cost model. void compute_features(const FunctionDAG &dag, const MachineParams ¶ms, - const Target& target, + const Target &target, const StageMap &sites, int64_t instances, int64_t parallelism, @@ -346,8 +363,8 @@ struct LoopNest { GPULoopInfo gpu_loop_info, bool use_memoized_features, const StageMap &total_shared_mem_alloc_sizes, - Statistics& stats, - bool verbose=false) const; + Statistics &stats, + bool verbose = false) const; bool is_root() const { // The root is the sole node without a Func associated with @@ -369,15 +386,15 @@ struct LoopNest { // consumers along the given edge chain), from which we know what region // would be computed if it were scheduled here and what its loop nest // would be. 
- const Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector& edge_chain) const; + const Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector &edge_chain) const; void dump() const; std::string to_string() const; // Recursively print a loop nest representation to stderr - template - void dump(T& stream, string prefix, const LoopNest *parent) const; + template + void dump(T &stream, string prefix, const LoopNest *parent) const; // Does this loop nest access the given Func bool calls(const FunctionDAG::Node *f) const; @@ -416,13 +433,13 @@ struct LoopNest { IntrusivePtr parallelize_in_tiles(const MachineParams ¶ms, const vector &tiling, const LoopNest *parent, - const Target& target, + const Target &target, bool inner_tiling, bool adjust_tiling, - bool move_all_rvars_inward=true, - const vector &rvars_to_move_inward={}) const; + bool move_all_rvars_inward = true, + const vector &rvars_to_move_inward = {}) const; - int64_t get_total_local_mem_alloc_size(bool constant_allocs_only=false, bool in_threads_loop=false) const; + int64_t get_total_local_mem_alloc_size(bool constant_allocs_only = false, bool in_threads_loop = false) const; int64_t get_total_constant_local_mem_alloc_size() const; // All store ats further in than the block level must be fixed @@ -443,7 +460,7 @@ struct LoopNest { bool in_realization, bool in_threads_loop, bool is_pre_pass, - vector union_counts=vector()) const; + vector union_counts = vector()) const; // Below here we have methods that apply a schedule to a Halide pipeline. @@ -481,19 +498,21 @@ struct LoopNest { // Some flags. 
bool innermost_pure_dim = false, - outermost = false, - parallel = false, - exists = false, - pure = false, - constant_extent = false; + outermost = false, + parallel = false, + exists = false, + pure = false, + constant_extent = false; bool vectorized = false; bool gpu_threads = false; - FuncVar() : orig(Var()), var(Var()) {} + FuncVar() + : orig(Var()), var(Var()) { + } }; - const FunctionDAG::Node* node; - const FunctionDAG::Node::Stage* stage; + const FunctionDAG::Node *node; + const FunctionDAG::Node::Stage *stage; bool parallel = false; bool vectorized = false; bool all_innermost_unrolled = false; @@ -506,22 +525,22 @@ struct LoopNest { vector ordered_vars; vector gpu_thread_extents; - NodeMap>>> producers_to_be_staged; + NodeMap>>> producers_to_be_staged; // From outermost in - vector ancestors; + vector ancestors; std::ostringstream schedule_source; }; - bool has_constant_region_computed(const FunctionDAG::Node* node) const; - bool has_constant_region_required(const FunctionDAG::Node* node) const; - bool other_stage_has_same_producer(const FunctionDAG::Node* producer) const; - int num_serial_loops(const FunctionDAG::Node::Stage* stage) const; + bool has_constant_region_computed(const FunctionDAG::Node *node) const; + bool has_constant_region_required(const FunctionDAG::Node *node) const; + bool other_stage_has_same_producer(const FunctionDAG::Node *producer) const; + int num_serial_loops(const FunctionDAG::Node::Stage *stage) const; int num_serial_loops() const; - bool producer_computed_here_or_further_in(const FunctionDAG::Node* producer) const; + bool producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const; - void update_producers_to_be_staged(StageScheduleState& state, const NodeMap& all_inlined) const; + void update_producers_to_be_staged(StageScheduleState &state, const NodeMap &all_inlined) const; bool region_computed_shrinks(const FunctionDAG::Node *f, const LoopNest *parent) const; // Apply the schedule represented by this loop 
nest to a Halide pipeline. @@ -531,32 +550,30 @@ struct LoopNest { int depth, const LoopNest *parent, const LoopNest *compute_site, - const Target& target, - std::vector& ancestors, - const NodeMap& all_inlined) const; + const Target &target, + std::vector &ancestors, + const NodeMap &all_inlined) const; - double max_idle_lane_wastage(const Target& target, GPULoopInfo gpu_loop_info) const; + double max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loop_info) const; bool has_valid_thread_extents() const; - void collect_nodes_that_should_be_inlined(const NodeMap& nodes_to_freeze, NodeMap& inlined_nodes) const; + void collect_nodes_that_should_be_inlined(const NodeMap &nodes_to_freeze, NodeMap &inlined_nodes) const; - void collect_all_inlined(NodeMap& all_inlined) const; + void collect_all_inlined(NodeMap &all_inlined) const; int64_t product_of_self_and_descendants(int loop_index) const; int64_t product_of_descendants(int loop_index) const; - void get_stages_computed_in_each_compute_root_loop(StageMap> &descendants, const LoopNest *compute_root_loop_nest=nullptr) const; + void get_stages_computed_in_each_compute_root_loop(StageMap> &descendants, const LoopNest *compute_root_loop_nest = nullptr) const; }; struct Filter { - const LoopNest* loop_nest; + const LoopNest *loop_nest; bool logging = false; - Filter(const LoopNest* loop_nest) - : loop_nest{loop_nest} - , logging{enable_filter_printing()} - { + Filter(const LoopNest *loop_nest) + : loop_nest{loop_nest}, logging{enable_filter_printing()} { if (logging) { std::cerr << "\nState filtered: \n"; loop_nest->dump(); diff --git a/src/autoschedulers/anderson2021/LoopNestParser.h b/src/autoschedulers/anderson2021/LoopNestParser.h index 799012d6cf20..4f3e5c1a1278 100644 --- a/src/autoschedulers/anderson2021/LoopNestParser.h +++ b/src/autoschedulers/anderson2021/LoopNestParser.h @@ -15,9 +15,9 @@ namespace Internal { namespace Autoscheduler { class LoopNestParser { - void parse(const std::vector& loop_nest) { + 
void parse(const std::vector &loop_nest) { std::unordered_map> stage_to_loop_nest; - for (const auto& line : loop_nest) { + for (const auto &line : loop_nest) { if (line.empty()) { continue; } @@ -29,8 +29,7 @@ class LoopNestParser { std::istringstream iss(line); std::vector tokens{ std::istream_iterator(iss), - std::istream_iterator() - }; + std::istream_iterator()}; std::string stage = tokens.at(0); bool is_inlined = tokens.at(0) == "inlined:"; @@ -68,9 +67,9 @@ class LoopNestParser { } } - for (const auto& entry : stage_to_loop_nest) { + for (const auto &entry : stage_to_loop_nest) { std::string loop_nest = ""; - for (const auto& line : entry.second) { + for (const auto &line : entry.second) { loop_nest += line + "\n"; } @@ -79,13 +78,13 @@ class LoopNestParser { // If a stage appears in a 'realize: ' line but nowhere else, remove it std::vector to_remove; - for (const auto& entry : compute_root_stages) { + for (const auto &entry : compute_root_stages) { if (entry.second == -1) { to_remove.push_back(entry.first); } } - for (const auto& key : to_remove) { + for (const auto &key : to_remove) { compute_root_stages.erase(key); partially_scheduled.erase(key); all_stages.erase(key); @@ -101,53 +100,52 @@ class LoopNestParser { std::unordered_set all_stages; public: - LoopNestParser(const std::vector& loop_nest) - : loop_nest{loop_nest} - { + LoopNestParser(const std::vector &loop_nest) + : loop_nest{loop_nest} { parse(loop_nest); } void dump() const { aslog(0) << "All stages:\n"; - for (const auto& s : all_stages) { + for (const auto &s : all_stages) { aslog(0) << s << "\n"; } aslog(0) << "\ncompute_root stages:\n"; - for (const auto& s : compute_root_stages) { + for (const auto &s : compute_root_stages) { aslog(0) << s.first << " with vector_dim = " << s.second << "\n"; } aslog(0) << "\nPartially scheduled stages:\n"; - for (const auto& s : partially_scheduled) { + for (const auto &s : partially_scheduled) { aslog(0) << s << " with vector_dim = " << 
compute_root_stages.at(s) << "\n"; } aslog(0) << "\nInlined stages:\n"; - for (const auto& s : inlined) { + for (const auto &s : inlined) { aslog(0) << s << "\n"; } aslog(0) << "\nFull loop nest:\n"; - for (const auto& s : loop_nest) { + for (const auto &s : loop_nest) { aslog(0) << s << "\n"; } aslog(0) << "\n"; } - bool is_in_partial_schedule(const FunctionDAG::Node* node) const { + bool is_in_partial_schedule(const FunctionDAG::Node *node) const { return node && all_stages.count(node->func.name()) > 0; } - bool contains_sub_loop_nest_for_shared_stages(const LoopNestParser& other) const { + bool contains_sub_loop_nest_for_shared_stages(const LoopNestParser &other) const { return contains_sub_loop_nest(other, true); } - // 'only_consider_shared_stages': check if 'other' is contained in this loop - // nest, but ignore stages that are present in 'other' but not present in + // 'only_consider_shared_stages': check if 'other' is contained in this loop + // nest, but ignore stages that are present in 'other' but not present in // this loop nest - bool contains_sub_loop_nest(const LoopNestParser& other, bool only_consider_shared_stages=false) const { - for (const auto& stage : other.all_stages) { + bool contains_sub_loop_nest(const LoopNestParser &other, bool only_consider_shared_stages = false) const { + for (const auto &stage : other.all_stages) { if (all_stages.count(stage) == 0) { if (only_consider_shared_stages) { continue; @@ -180,7 +178,7 @@ class LoopNestParser { return true; } - static LoopNestParser from_string(const std::string& str) { + static LoopNestParser from_string(const std::string &str) { std::istringstream in(str); std::string line; std::vector loop_nest; @@ -192,7 +190,7 @@ class LoopNestParser { return LoopNestParser(loop_nest); } - static std::unique_ptr from_file(const std::string& filename) { + static std::unique_ptr from_file(const std::string &filename) { std::ifstream file(filename); std::string line; std::vector loop_nest; diff --git 
a/src/autoschedulers/anderson2021/NetworkSize.h b/src/autoschedulers/anderson2021/NetworkSize.h index fa8f5566110f..036756b0af77 100644 --- a/src/autoschedulers/anderson2021/NetworkSize.h +++ b/src/autoschedulers/anderson2021/NetworkSize.h @@ -6,7 +6,7 @@ namespace Halide { // model and also the cost model training script. const int head1_channels = 8, head1_w = 40, head1_h = 7; const int head2_channels = 24, head2_w = 73; -const int conv1_channels = 32; // Only 30 are used (needs to be a multiple of 8 for vectorization in cost_model_generator.cpp) -} // namespace Halide +const int conv1_channels = 32; // Only 30 are used (needs to be a multiple of 8 for vectorization in cost_model_generator.cpp) +} // namespace Halide #endif // HALIDE_NETWORK_SIZE_H diff --git a/src/autoschedulers/anderson2021/SearchSpace.cpp b/src/autoschedulers/anderson2021/SearchSpace.cpp index c5de0e8f48a2..46e5e7110d60 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.cpp +++ b/src/autoschedulers/anderson2021/SearchSpace.cpp @@ -19,21 +19,12 @@ SearchSpace::SearchSpace(const FunctionDAG &dag, std::mt19937 &rng, CostModel *cost_model, Statistics &stats, - const LoopNestParser* partial_schedule) - : dag{dag} - , params{params} - , target{target} - , search_space_options{search_space_options} - , rng{rng} - , cost_model{cost_model} - , stats{stats} - , randomize_tilings{use_randomized_tilings()} - , partial_schedule{partial_schedule} -{ + const LoopNestParser *partial_schedule) + : dag{dag}, params{params}, target{target}, search_space_options{search_space_options}, rng{rng}, cost_model{cost_model}, stats{stats}, randomize_tilings{use_randomized_tilings()}, partial_schedule{partial_schedule} { memoized_compute_root_blocks.make_large(dag.nodes.size()); } -void SearchSpace::memoize_blocks(const FunctionDAG::Node *node, LoopNest* new_root) { +void SearchSpace::memoize_blocks(const FunctionDAG::Node *node, LoopNest *new_root) { int vector_dim = -1; bool loop_nest_found = false; for (auto &c : 
new_root->children) { @@ -46,7 +37,7 @@ void SearchSpace::memoize_blocks(const FunctionDAG::Node *node, LoopNest* new_ro internal_assert(loop_nest_found); - auto& blocks = memoized_compute_root_blocks.get_or_create(node)[vector_dim]; + auto &blocks = memoized_compute_root_blocks.get_or_create(node)[vector_dim]; for (auto &c : new_root->children) { if (c->node == node) { @@ -61,13 +52,13 @@ void SearchSpace::memoize_blocks(const FunctionDAG::Node *node, LoopNest* new_ro bool SearchSpace::add_states_from_memoized_blocks(IntrusivePtr state, std::function &&)> &accept_child, const FunctionDAG::Node *node, - int& num_children) const { + int &num_children) const { if (!memoized_compute_root_blocks.contains(node)) { return false; } int vector_dim = -1; - for (const auto& c : state->root->children) { + for (const auto &c : state->root->children) { if (c->node == node && c->stage->index == 0) { vector_dim = c->vector_dim; break; @@ -89,7 +80,7 @@ bool SearchSpace::add_states_from_memoized_blocks(IntrusivePtr state, child->num_decisions_made++; int block_index = 0; - for (const auto& c : new_root->children) { + for (const auto &c : new_root->children) { if (c->node == node) { break; } @@ -97,7 +88,7 @@ bool SearchSpace::add_states_from_memoized_blocks(IntrusivePtr state, } for (size_t j = 0; j < num_stages; ++j) { - LoopNest* new_block = new LoopNest; + LoopNest *new_block = new LoopNest; new_block->copy_from_including_features(*blocks[i + j]); new_root->children[block_index++] = new_block; } @@ -113,9 +104,9 @@ bool SearchSpace::add_states_from_memoized_blocks(IntrusivePtr state, } vector SearchSpace::filter_parallel_tile_options(IntrusivePtr state, - const FunctionDAG::Node *node, - vector>& inner_tilings, - const vector& pure_size) const { + const FunctionDAG::Node *node, + vector> &inner_tilings, + const vector &pure_size) const { vector options; vector insufficient_parallelism; for (size_t i = 0; i < inner_tilings.size(); i++) { @@ -182,7 +173,7 @@ vector 
SearchSpace::filter_parallel_tile_option int64_t parallelism_limit = params.parallelism; while (options.empty()) { - for (auto& o : insufficient_parallelism) { + for (auto &o : insufficient_parallelism) { if (o.min_parallelism >= parallelism_limit) { options.emplace_back(std::move(o)); } @@ -196,9 +187,9 @@ vector SearchSpace::filter_parallel_tile_option return options; } -vector SearchSpace::filter_thread_tile_options(vector>& loop_nests) const { +vector SearchSpace::filter_thread_tile_options(vector> &loop_nests) const { vector options; - for (const auto& loop_nest : loop_nests) { + for (const auto &loop_nest : loop_nests) { if (!loop_nest->has_valid_thread_extents()) { Filter(loop_nest.get()) << "Invalid thread extents\n"; continue; @@ -215,12 +206,12 @@ vector SearchSpace::filter_thread_tile_options(vector& primary_options, - std::unordered_map& secondary_options, +void SearchSpace::process_pending_states(std::unordered_map &primary_options, + std::unordered_map &secondary_options, int &num_children, std::function &&)> &accept_child, - const FunctionDAG::Node* node) { - for (auto& entry : primary_options) { + const FunctionDAG::Node *node) { + for (auto &entry : primary_options) { size_t N = entry.second.size(); if (N > 1 && !is_in_partial_schedule(node)) { N = std::log2(entry.second.size()); @@ -243,7 +234,7 @@ void SearchSpace::process_pending_states(std::unordered_mapcalculate_cost(dag, params, target, cost_model, stats)) { num_children++; @@ -317,7 +308,6 @@ void SearchSpace::generate_children(IntrusivePtr state, int num_children = 0; - if (phase == 0) { // Injecting realizations { @@ -352,8 +342,8 @@ void SearchSpace::generate_children(IntrusivePtr state, // inlining it is legal, just inline it. This saves time // on long chains of pointwise things. 
must_inline = (node->is_pointwise && - (num_children > 0) && - (node->outgoing_edges.size() == 1)); + (num_children > 0) && + (node->outgoing_edges.size() == 1)); if (must_inline) { for (const auto *e : node->stages[0].incoming_edges) { must_inline &= e->producer->is_pointwise; @@ -372,7 +362,7 @@ void SearchSpace::generate_children(IntrusivePtr state, new_root->copy_from(*root); const auto &nodes = compute_root_nodes.get(node); for (const auto &n : nodes) { - const auto* compute_root_loop = deep_copy_loop_nest(n.get(), NoOpMutator{}); + const auto *compute_root_loop = deep_copy_loop_nest(n.get(), NoOpMutator{}); new_root->children.push_back(compute_root_loop); } new_root->store_at.insert(node); @@ -417,10 +407,10 @@ void SearchSpace::generate_children(IntrusivePtr state, auto options = filter_thread_tile_options(tile_options); stats.filter_thread_tiles_time += timer.elapsed(); - for (const auto& o : options) { + for (const auto &o : options) { if (!randomize_tilings && num_children >= 1 && o.max_idle_lane_wastage > 0.5) { Filter(o.loop_nest.get()) << "Excess idle lane wastage\n" - << "max_idle_lane_wastage = " << o.max_idle_lane_wastage << "\n"; + << "max_idle_lane_wastage = " << o.max_idle_lane_wastage << "\n"; break; } @@ -490,7 +480,7 @@ void SearchSpace::generate_children(IntrusivePtr state, std::unordered_map>> primary_options; std::unordered_map>> secondary_options; - for (auto &parallel_t: parallel_tilings) { + for (auto &parallel_t : parallel_tilings) { LoopNest parallel_root; parallel_root.copy_from(*root); @@ -509,7 +499,7 @@ void SearchSpace::generate_children(IntrusivePtr state, // at root level sibling thread counts are in separate blocks, extents are irrelevant vector max_size((int)(stage_sizes[0].size()), 1); - auto block_tilings = generate_gpu_tilings(stage_sizes, pure_dims, max_size, node->dimensions-1, vectorized_indices, false, true); + auto block_tilings = generate_gpu_tilings(stage_sizes, pure_dims, max_size, node->dimensions - 1, vectorized_indices,
false, true); // If no options, create a thread tiling as large as possible with block size (1,1,1). // This can happen if the loops are too small to generate desired gpu tiles. @@ -591,7 +581,7 @@ void SearchSpace::generate_children(IntrusivePtr state, } struct ClearInlinedMutator { - void operator()(LoopNest* new_loop_nest) const { + void operator()(LoopNest *new_loop_nest) const { new_loop_nest->inlined = {}; } }; @@ -600,13 +590,13 @@ void SearchSpace::freeze_lowest_cost_stages(const IntrusivePtr best) { std::vector> node_ids_and_costs; NodeMap node_costs; size_t num_nodes = 0; - for (const auto& n : dag.nodes) { + for (const auto &n : dag.nodes) { if (n.is_input) { continue; } int i = 0; - for (const auto& s : n.stages) { + for (const auto &s : n.stages) { if (!node_costs.contains(dag.stage_id_to_node_map.at(s.id))) { node_costs.get_or_create(dag.stage_id_to_node_map.at(s.id)) = 0; } @@ -621,11 +611,11 @@ void SearchSpace::freeze_lowest_cost_stages(const IntrusivePtr best) { node_ids_and_costs.push_back({it.key()->id, it.value()}); } - for (const auto& n : node_ids_and_costs) { + for (const auto &n : node_ids_and_costs) { internal_assert(n.first >= 0); } - std::sort(node_ids_and_costs.begin(), node_ids_and_costs.end(), [](const std::pair& a, const std::pair& b) { + std::sort(node_ids_and_costs.begin(), node_ids_and_costs.end(), [](const std::pair &a, const std::pair &b) { return a.second < b.second; }); @@ -641,7 +631,7 @@ void SearchSpace::freeze_lowest_cost_stages(const IntrusivePtr best) { ClearInlinedMutator mutator{}; - for (const auto& c : best->root->children) { + for (const auto &c : best->root->children) { if (nodes_to_freeze.contains(c->node)) { auto new_loop_nest = deep_copy_loop_nest(c, mutator); compute_root_nodes.get_or_create(c->node).push_back(new_loop_nest); @@ -650,7 +640,7 @@ void SearchSpace::freeze_lowest_cost_stages(const IntrusivePtr best) { } } -vector> SearchSpace::generate_compute_root_serial_tilings(const IntrusivePtr& pure_stage, 
const FunctionDAG::Node *node) const { +vector> SearchSpace::generate_compute_root_serial_tilings(const IntrusivePtr &pure_stage, const FunctionDAG::Node *node) const { std::vector vec_dim_serial_sizes; pure_stage->generate_vec_dim_serial_tilings(vec_dim_serial_sizes); @@ -663,8 +653,8 @@ vector> SearchSpace::generate_compute_root_serial_tilings(const true); } -bool SearchSpace::add_child(const IntrusivePtr& state, - const IntrusivePtr& new_root, +bool SearchSpace::add_child(const IntrusivePtr &state, + const IntrusivePtr &new_root, std::function &&)> &accept_child) const { auto child = state->make_child(); child->root = std::move(new_root); diff --git a/src/autoschedulers/anderson2021/SearchSpace.h b/src/autoschedulers/anderson2021/SearchSpace.h index 538c441a42a2..49205a8e90c4 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.h +++ b/src/autoschedulers/anderson2021/SearchSpace.h @@ -1,6 +1,7 @@ #ifndef SEARCH_SPACE_H #define SEARCH_SPACE_H +#include "ASLog.h" #include "CostModel.h" #include "DefaultCostModel.h" #include "Featurization.h" @@ -8,12 +9,11 @@ #include "LoopNest.h" #include "LoopNestParser.h" #include "PerfectHashMap.h" -#include "ASLog.h" #include "SearchSpaceOptions.h" #include "State.h" #include -#include #include +#include #include namespace Halide { @@ -30,7 +30,7 @@ struct SearchSpace { CostModel *cost_model; Statistics &stats; bool randomize_tilings; - const LoopNestParser* partial_schedule; + const LoopNestParser *partial_schedule; NodeMap inlined_nodes; NodeMap>> compute_root_nodes; @@ -43,7 +43,7 @@ struct SearchSpace { std::mt19937 &rng, CostModel *cost_model, Statistics &stats, - const LoopNestParser* partial_schedule); + const LoopNestParser *partial_schedule); // Sort / filter parallel tile options struct ParallelTileOption { @@ -66,18 +66,17 @@ struct SearchSpace { vector filter_parallel_tile_options(IntrusivePtr state, const FunctionDAG::Node *node, - vector>& inner_tilings, - const vector& pure_size) const; + vector> 
&inner_tilings, + const vector &pure_size) const; - vector filter_thread_tile_options(vector>& loop_nests) const; + vector filter_thread_tile_options(vector> &loop_nests) const; - void memoize_blocks(const FunctionDAG::Node *node, LoopNest* new_root); + void memoize_blocks(const FunctionDAG::Node *node, LoopNest *new_root); bool add_states_from_memoized_blocks(IntrusivePtr state, std::function &&)> &accept_child, const FunctionDAG::Node *node, - int& num_children) const; - + int &num_children) const; // Generate successor states for given 'state' void generate_children(IntrusivePtr state, @@ -87,25 +86,23 @@ struct SearchSpace { void freeze_lowest_cost_stages(const IntrusivePtr best); - vector> generate_compute_root_serial_tilings(const IntrusivePtr& pure_stage, const FunctionDAG::Node *node) const; + vector> generate_compute_root_serial_tilings(const IntrusivePtr &pure_stage, const FunctionDAG::Node *node) const; - bool add_child(const IntrusivePtr& state, - const IntrusivePtr& new_root, + bool add_child(const IntrusivePtr &state, + const IntrusivePtr &new_root, std::function &&)> &accept_child) const; - void process_pending_states(std::unordered_map& primary_options, - std::unordered_map& secondary_options, + void process_pending_states(std::unordered_map &primary_options, + std::unordered_map &secondary_options, int &num_children, std::function &&)> &accept_child, - const FunctionDAG::Node* node); + const FunctionDAG::Node *node); bool is_in_partial_schedule(const FunctionDAG::Node *node) const; }; - - } // namespace Autoscheduler } // namespace Internal } // namespace Halide -#endif // SEARCH_SPACE_H +#endif // SEARCH_SPACE_H diff --git a/src/autoschedulers/anderson2021/SearchSpaceOptions.h b/src/autoschedulers/anderson2021/SearchSpaceOptions.h index 7d22e20b9072..ccb862085f1b 100644 --- a/src/autoschedulers/anderson2021/SearchSpaceOptions.h +++ b/src/autoschedulers/anderson2021/SearchSpaceOptions.h @@ -17,9 +17,8 @@ struct SearchSpaceOptions { std::bitset<4> 
options; - SearchSpaceOptions(const std::string& bit_str) - : options{bit_str} - { + SearchSpaceOptions(const std::string &bit_str) + : options{bit_str} { aslog(0) << "Search space options:\n"; aslog(0) << "Input string: " << bit_str << "\n"; aslog(0) << "Compute root: " << compute_root() << "\n"; @@ -65,4 +64,4 @@ struct SearchSpaceOptions { } // namespace Internal } // namespace Halide -#endif // SEARCH_SPACE_OPTIONS_H +#endif // SEARCH_SPACE_OPTIONS_H diff --git a/src/autoschedulers/anderson2021/State.cpp b/src/autoschedulers/anderson2021/State.cpp index aa1cd5d8c683..eeeb17650cf5 100644 --- a/src/autoschedulers/anderson2021/State.cpp +++ b/src/autoschedulers/anderson2021/State.cpp @@ -37,8 +37,8 @@ void State::compute_loop_nest_parents(map> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap& total_shared_mem_alloc_sizes) const { - std::vector ancestors; +const LoopNest *State::deepest_valid_compute_location(const map> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap &total_shared_mem_alloc_sizes) const { + std::vector ancestors; // Innermost loop nests are never considered as compute locations if (!loop->innermost) { @@ -168,12 +168,12 @@ const LoopNest *State::deepest_common_ancestor(const mapchildren) { + for (const auto &c : root->children) { if (c->gpu_label != block) { continue; } - for (const auto& block_c : c->children) { + for (const auto &block_c : c->children) { if (!block_c->all_paths_to_leaves_have_thread_loop()) { return true; } @@ -184,7 +184,7 @@ bool State::has_loop_nest_without_thread_loops() const { } bool State::has_compute_root_loops_without_blocks() const { - for (const auto& c : root->children) { + for (const auto &c : root->children) { if (c->gpu_label == none) { return true; } @@ -193,7 +193,7 @@ bool State::has_compute_root_loops_without_blocks() const { return false; } -void State::FeatureLoopNestMutator::operator()(LoopNest* new_loop_nest) const { 
+void State::FeatureLoopNestMutator::operator()(LoopNest *new_loop_nest) const { split_compute_root_loops(new_loop_nest); add_outer_thread_loops(new_loop_nest); } @@ -202,13 +202,13 @@ void State::FeatureLoopNestMutator::operator()(LoopNest* new_loop_nest) const { // blocks, threads, and serial loops. To enable the cost model to make a // meaningful prediction on these pre-split loops, we assume a split into // blocks and threads with a single full warp (if possible) -void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest* loop_nest) const { +void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest *loop_nest) const { if (!loop_nest || !loop_nest->is_root()) { return; } for (auto it = loop_nest->children.rbegin(); it != loop_nest->children.rend(); ++it) { - auto& c = *it; + auto &c = *it; if (c->gpu_label != none) { continue; } @@ -271,7 +271,7 @@ void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest* loop_nest // If a loop nest does not have thread loops, split the outermost serial // loops to create thread loops with extents 1 -void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest* loop_nest) const { +void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest *loop_nest) const { if (!loop_nest) { return; } @@ -283,7 +283,7 @@ void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest* loop_nest) // all serial descendants // // (a) should be surrounded by a thread loop - for (auto& c : loop_nest->children) { + for (auto &c : loop_nest->children) { if (c->has_thread_loop_descendant()) { continue; } @@ -304,7 +304,7 @@ void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest* loop_nest) if (loop_nest->gpu_label == serial) { bool has_child_with_thread_descendant = false; - for (const auto& c : loop_nest->children) { + for (const auto &c : loop_nest->children) { if (c->has_thread_loop_descendant()) { has_child_with_thread_descendant = true; break; @@ -325,7 +325,7 @@ void 
State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest* loop_nest) // serial (a) // // (a) should be surrounded by a thread loop - for (auto& c : loop_nest->children) { + for (auto &c : loop_nest->children) { if (c->has_thread_loop_descendant()) { continue; } @@ -338,11 +338,10 @@ void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest* loop_nest) c->gpu_label = thread; c = c->parallelize_in_tiles(params, tiling, loop_nest, target, false, true); } - } } -IntrusivePtr State::get_root_for_features(const MachineParams &params, const Target& target) const { +IntrusivePtr State::get_root_for_features(const MachineParams &params, const Target &target) const { if (!has_compute_root_loops_without_blocks() && !has_loop_nest_without_thread_loops()) { return root; } @@ -360,7 +359,7 @@ IntrusivePtr State::get_root_for_features(const MachineParams &p return new_root; } -void State::set_gpu_store_site(const map>& parent, const LoopNest* loop, LoopNest::Sites& site) const { +void State::set_gpu_store_site(const map> &parent, const LoopNest *loop, LoopNest::Sites &site) const { // If site.store is inside a block but outside a loop, the // GPU store site should instead be the block because the shared // mem allocation will be hoisted @@ -392,7 +391,7 @@ void State::set_gpu_store_site(const map *features, Statistics& stats, bool verbose) const { +bool State::compute_featurization(const FunctionDAG &dag, const MachineParams &params, const Target &target, StageMap *features, Statistics &stats, bool verbose) const { auto feature_root = get_root_for_features(params, target); StageMap sites; @@ -446,7 +445,7 @@ bool State::compute_featurization(const FunctionDAG &dag, const MachineParams &p if (consumer_site.inlined) { // If this func is inlined, find the deepest common ancestor // of all its inlined locations - for (const auto* innermost : consumer_site.inlined_innermosts) { + for (const auto *innermost : consumer_site.inlined_innermosts) { loop =
deepest_common_ancestor(parent, innermost, loop); } } else { @@ -457,7 +456,7 @@ bool State::compute_featurization(const FunctionDAG &dag, const MachineParams &p bool first = true; // If this func is inlined, find the deepest common ancestor // of all its inlined locations - for (const auto* innermost : consumer_site.inlined_innermosts) { + for (const auto *innermost : consumer_site.inlined_innermosts) { if (first) { first = false; loop = innermost; @@ -492,7 +491,7 @@ bool State::compute_featurization(const FunctionDAG &dag, const MachineParams &p } } - for (const auto& c : feature_root->children) { + for (const auto &c : feature_root->children) { sites.get(c->stage).hash_of_producers_stored_at_root = c->compute_hash_of_producers_stored_at_root(sites); } @@ -513,7 +512,7 @@ bool State::compute_featurization(const FunctionDAG &dag, const MachineParams &p return true; } -void State::save_featurization(const FunctionDAG &dag, const MachineParams &params, const Target& target, std::ostream &out) const { +void State::save_featurization(const FunctionDAG &dag, const MachineParams &params, const Target &target, std::ostream &out) const { StageMap features; Statistics stats; compute_featurization(dag, params, target, &features, stats); @@ -541,8 +540,8 @@ void State::save_featurization(const FunctionDAG &dag, const MachineParams &para } } -bool State::contains_store_at(const set& outermost_store_at, const IntrusivePtr& parent) const { - for (const auto& c : parent->children) { +bool State::contains_store_at(const set &outermost_store_at, const IntrusivePtr &parent) const { + for (const auto &c : parent->children) { if (c->store_at.size() > 0) { return true; } @@ -566,8 +565,8 @@ bool State::contains_store_at(const set& outermost_st // store_ats further in will be hoisted and expanded, increasing the // amount of shared memory required.
bool State::contains_store_at_further_in_than_outermost() const { - for (const auto& child : root->children) { - for (const auto& grandchild : child->children) { + for (const auto &child : root->children) { + for (const auto &grandchild : child->children) { if (contains_store_at(child->store_at, grandchild)) { return true; } @@ -576,7 +575,6 @@ bool State::contains_store_at_further_in_than_outermost() const { return false; } - bool State::has_dynamic_allocation_inside_thread() const { return root->has_dynamic_allocation_inside_thread(false); } @@ -589,7 +587,7 @@ bool State::exceeds_serial_extents_limit(const Target &target) const { return root->exceeds_serial_extents_limit(target, nullptr, false); } -int64_t State::get_shared_mem_alloc_size(const LoopNest* block, const LoopNest* loop) const { +int64_t State::get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const { int64_t result = 0; if (loop->gpu_label == thread) { @@ -610,7 +608,7 @@ int64_t State::get_shared_mem_alloc_size(const LoopNest* block, const LoopNest* } } - for (const auto& c : loop->children) { + for (const auto &c : loop->children) { result += get_shared_mem_alloc_size(block, c.get()); } @@ -628,7 +626,7 @@ bool State::exceeds_shared_memory_limit(const Target &target) const { return false; } - for (const auto& c : root->children) { + for (const auto &c : root->children) { // If the working set is too large on the GPU, shared memory will be // exhausted, so reject any such schedules if (get_shared_mem_alloc_size(c.get(), c.get()) > limit) { @@ -644,7 +642,7 @@ bool State::exceeds_local_memory_limit(const Target &target) const { return false; } - for (const auto& c : root->children) { + for (const auto &c : root->children) { if (c->get_total_constant_local_mem_alloc_size() > get_stack_memory_limit()) { return true; } @@ -657,7 +655,7 @@ bool State::exceeds_local_memory_limit(const Target &target) const { return false; } -bool State::calculate_cost(const FunctionDAG &dag, const 
MachineParams &params, const Target& target, CostModel *cost_model, Statistics& stats, bool verbose) { +bool State::calculate_cost(const FunctionDAG &dag, const MachineParams &params, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose) { Timer timer; if (!root->has_valid_thread_extents()) { Filter(root.get()) << "Invalid thread extents\n"; @@ -713,11 +711,11 @@ bool State::calculate_cost(const FunctionDAG &dag, const MachineParams &params, if (feat.points_computed_total + feat.inlined_calls > 10 * feat.points_computed_minimum) { Filter(root.get()) << "Excess recompute for " << it.key()->node->func.name() << " stage " << it.key()->index << "\n" - << "points_computed_total = " << feat.points_computed_total << "\n" - << "inlined_calls = " << feat.inlined_calls << "\n" - << "points_computed_total + inlined_calls = " << feat.points_computed_total + feat.inlined_calls << "\n" - << "points_computed_minimum = " << feat.points_computed_minimum << "\n" - << "8 * points_computed_minimum = " << 8 * feat.points_computed_minimum << "\n"; + << "points_computed_total = " << feat.points_computed_total << "\n" + << "inlined_calls = " << feat.inlined_calls << "\n" + << "points_computed_total + inlined_calls = " << feat.points_computed_total + feat.inlined_calls << "\n" + << "points_computed_minimum = " << feat.points_computed_minimum << "\n" + << "8 * points_computed_minimum = " << 8 * feat.points_computed_minimum << "\n"; cost = 1e50; return false; } @@ -761,10 +759,10 @@ void State::print_compute_locations() const { root->get_stages_computed_in_each_compute_root_loop(descendants); aslog(0) << "BEGIN compute locations\n"; - for (const auto& d : descendants) { + for (const auto &d : descendants) { aslog(0) << d.first->sanitized_name << " -> "; - for (const auto& descendant : d.second) { + for (const auto &descendant : d.second) { aslog(0) << descendant.first->sanitized_name << " "; } @@ -773,7 +771,7 @@ void State::print_compute_locations() const { aslog(0) << "END
compute locations\n"; } -void State::fuse_gpu_blocks(LoopNest::StageScheduleState* state, Stage& stage, const vector& parallel_vars, const vector& parallel_extents, const vector& constant_extents) const { +void State::fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector &parallel_vars, const vector &parallel_extents, const vector &constant_extents) const { if (parallel_vars.empty() || parallel_extents.empty()) { return; } @@ -810,8 +808,8 @@ void State::fuse_gpu_blocks(LoopNest::StageScheduleState* state, Stage& stage, c auto inner_i = block_var_assignments[block_i][0]; auto outer_i = block_var_assignments[block_i][i]; state->schedule_source << "\n .fuse(" << parallel_vars[inner_i].name() - << ", " << parallel_vars[outer_i].name() - << ", " << parallel_vars[inner_i].name() << ")"; + << ", " << parallel_vars[outer_i].name() + << ", " << parallel_vars[inner_i].name() << ")"; stage.fuse(parallel_vars[inner_i], parallel_vars[outer_i], parallel_vars[inner_i]); @@ -843,11 +841,11 @@ void State::fuse_gpu_blocks(LoopNest::StageScheduleState* state, Stage& stage, c } } -void State::mark_gpu_blocks(LoopNest::StageScheduleState* state, Stage& stage, const vector& parallel_vars, const vector& parallel_extents) const { +void State::mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector &parallel_vars, const vector &parallel_extents) const { int max_blocks[3] = {2147483647, 65535, 65535}; uint8_t n_loops_tagged_gpu_blocks = 0; - for (auto& v : parallel_vars) { + for (auto &v : parallel_vars) { if (n_loops_tagged_gpu_blocks >= 3 || parallel_extents[n_loops_tagged_gpu_blocks] > max_blocks[n_loops_tagged_gpu_blocks]) { break; } @@ -862,15 +860,15 @@ void State::mark_gpu_blocks(LoopNest::StageScheduleState* state, Stage& stage, c } } -bool State::mark_gpu_threads(LoopNest::StageScheduleState* state, Stage& stage, std::unordered_set& new_serial_vars, std::ostringstream& staged_funcs_schedule_source) const { +bool
State::mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, std::unordered_set &new_serial_vars, std::ostringstream &staged_funcs_schedule_source) const { uint8_t num_loops_tagged_gpu_thread = 0; int64_t total_threads = 1; int max_threads[3] = {1024, 1024, 64}; bool first = true; - for (const auto& v : state->vars) { - if (!v.exists || !v.gpu_threads || v.extent == 1) { + for (const auto &v : state->vars) { + if (!v.exists || !v.gpu_threads || v.extent == 1) { continue; } @@ -891,12 +889,12 @@ bool State::mark_gpu_threads(LoopNest::StageScheduleState* state, Stage& stage, Func func(state->node->func); - for (const auto& to_be_staged : state->producers_to_be_staged) { - const auto* producer_node = to_be_staged.first; + for (const auto &to_be_staged : state->producers_to_be_staged) { + const auto *producer_node = to_be_staged.first; - for (const auto& cur_pair : to_be_staged.second) { - const LoopNest* loop_nest = cur_pair.first; - const std::vector& edge_chain = cur_pair.second; + for (const auto &cur_pair : to_be_staged.second) { + const LoopNest *loop_nest = cur_pair.first; + const std::vector &edge_chain = cur_pair.second; internal_assert(edge_chain.at(0)->consumer == loop_nest->stage); internal_assert(edge_chain.back()->producer == producer_node); @@ -930,10 +928,10 @@ bool State::mark_gpu_threads(LoopNest::StageScheduleState* state, Stage& stage, << v.var.var.name() << ")"; - const auto& bounds = loop_nest->get_bounds_along_edge_chain(producer_node, edge_chain); + const auto &bounds = loop_nest->get_bounds_along_edge_chain(producer_node, edge_chain); int i = 0; - for (const auto& l : producer_node->stages[0].loop) { + for (const auto &l : producer_node->stages[0].loop) { Var unrolled_var(l.var); int extent = bounds->region_required(i++).extent(); @@ -957,7 +955,7 @@ bool State::mark_gpu_threads(LoopNest::StageScheduleState* state, Stage& stage, return num_loops_tagged_gpu_thread > 0; } -bool State::can_fuse_gpu(const vector& parallel_extents) 
const { +bool State::can_fuse_gpu(const vector &parallel_extents) const { int64_t total = 1; for (auto extent : parallel_extents) { total *= extent; @@ -973,7 +971,7 @@ bool State::can_fuse_gpu(const vector& parallel_extents) const { // user to copy-paste to freeze this schedule as permanent artifact. void State::apply_schedule(const FunctionDAG &dag, const MachineParams &params, const Target &target) { StageMap> state_map; - std::vector ancestors; + std::vector ancestors; NodeMap all_inlined; root->collect_all_inlined(all_inlined); @@ -1108,7 +1106,7 @@ void State::apply_schedule(const FunctionDAG &dag, const MachineParams &params, std::ostringstream staged_funcs_schedule_source; if (target.has_gpu_feature()) { - std::set invalid; + std::set invalid; // Iterate from output backwards for (const auto &n : dag.nodes) { for (auto &p : state_map) { @@ -1126,7 +1124,7 @@ void State::apply_schedule(const FunctionDAG &dag, const MachineParams &params, bool has_enclosing_parallel = p.second->parallel; if (!has_enclosing_parallel) { - for (auto* ancestor : p.second->ancestors) { + for (auto *ancestor : p.second->ancestors) { if (ancestor->parallel) { has_enclosing_parallel = true; break; @@ -1150,9 +1148,9 @@ void State::apply_schedule(const FunctionDAG &dag, const MachineParams &params, // There is no outer loop marked as gpu_block.
// Split the outer loop to create a new outer var with // extent = 1 and mark it gpu_blocks() - const auto& outer_var = p.second->ordered_vars.back(); + const auto &outer_var = p.second->ordered_vars.back(); vector vars; - for (const auto& v : p.second->ordered_vars) { + for (const auto &v : p.second->ordered_vars) { vars.push_back(v.var); } @@ -1175,7 +1173,7 @@ void State::apply_schedule(const FunctionDAG &dag, const MachineParams &params, p.second->schedule_source << "\n .reorder("; bool first = true; - for (const auto& v : vars) { + for (const auto &v : vars) { if (!first) { p.second->schedule_source << ", "; } @@ -1196,7 +1194,7 @@ void State::apply_schedule(const FunctionDAG &dag, const MachineParams &params, } } - for (const auto& v : new_serial_vars) { + for (const auto &v : new_serial_vars) { src << "Var " << v << "(\"" << v << "\");\n"; } @@ -1242,9 +1240,9 @@ void State::update_always_consider_inline_options(const FunctionDAG::Node *node) NodeMap currently_inlined; root->collect_all_inlined(currently_inlined); - std::unordered_set non_inlined_consumers; + std::unordered_set non_inlined_consumers; std::unordered_set done; - std::vector pending; + std::vector pending; pending.push_back(node); while (!pending.empty()) { diff --git a/src/autoschedulers/anderson2021/State.h b/src/autoschedulers/anderson2021/State.h index 33c05fa419f5..88aa69225f1b 100644 --- a/src/autoschedulers/anderson2021/State.h +++ b/src/autoschedulers/anderson2021/State.h @@ -1,13 +1,13 @@ #ifndef STATE_H #define STATE_H +#include "ASLog.h" #include "CostModel.h" #include "DefaultCostModel.h" #include "Featurization.h" #include "FunctionDAG.h" #include "LoopNest.h" #include "PerfectHashMap.h" -#include "ASLog.h" #include #include #include @@ -16,13 +16,11 @@ namespace Halide { namespace Internal { namespace Autoscheduler { -using std::string; -using std::vector; using std::map; using std::pair; using std::set; -using std::unordered_set; using std::string; +using std::unordered_set; using
std::vector; bool verify_memoized_features(); @@ -31,7 +29,7 @@ bool is_memoize_blocks_enabled(); double get_stack_memory_adjustment_factor(); -constexpr int kLocalMemoryLimit = 524288; // 512 KB +constexpr int kLocalMemoryLimit = 524288; // 512 KB // Stack memory limit = Total GPU Memory / (# of SMs * maximum threads per SM) // = 103232 bytes @@ -44,15 +42,16 @@ bool use_adjusted_tilings(); bool compute_root_and_inline_only(); struct NoOpMutator { - void operator()(LoopNest* new_loop_nest) const {} + void operator()(LoopNest *new_loop_nest) const { + } }; -template -void deep_copy_loop_nest(LoopNest* new_loop_nest, const LoopNest* new_loop_nest_parent, const IntrusivePtr& existing_loop_nest, const PostCreateMutator& post_create_mutator) { +template +void deep_copy_loop_nest(LoopNest *new_loop_nest, const LoopNest *new_loop_nest_parent, const IntrusivePtr &existing_loop_nest, const PostCreateMutator &post_create_mutator) { new_loop_nest->copy_from(*existing_loop_nest); for (std::size_t i = 0, N = new_loop_nest->children.size(); i < N; ++i) { - LoopNest* new_child = new LoopNest; + LoopNest *new_child = new LoopNest; new_loop_nest->children[i] = new_child; deep_copy_loop_nest(new_child, new_loop_nest, existing_loop_nest->children[i], post_create_mutator); } @@ -60,9 +59,9 @@ void deep_copy_loop_nest(LoopNest* new_loop_nest, const LoopNest* new_loop_nest_ post_create_mutator(new_loop_nest); } -template -LoopNest* deep_copy_loop_nest(const IntrusivePtr& loop_nest, const PostCreateMutator& post_create_mutator) { - LoopNest* new_loop_nest = new LoopNest; +template +LoopNest *deep_copy_loop_nest(const IntrusivePtr &loop_nest, const PostCreateMutator &post_create_mutator) { + LoopNest *new_loop_nest = new LoopNest; deep_copy_loop_nest(new_loop_nest, nullptr, loop_nest, post_create_mutator); return new_loop_nest; } @@ -95,9 +94,9 @@ struct State { // We use the post_create_mutator so that the loop nests can be modified // before they become IntrusivePtr as children and 
cannot be modified - template - LoopNest* create_feature_root(const PostCreateMutator& post_create_mutator) const { - LoopNest* new_root = new LoopNest; + template + LoopNest *create_feature_root(const PostCreateMutator &post_create_mutator) const { + LoopNest *new_root = new LoopNest; deep_copy_loop_nest(new_root, nullptr, root, post_create_mutator); return new_root; } @@ -107,31 +106,31 @@ struct State { bool has_compute_root_loops_without_blocks() const; struct FeatureLoopNestMutator { - const MachineParams& params; - const Target& target; + const MachineParams &params; + const Target &target; - void operator()(LoopNest* new_loop_nest) const; + void operator()(LoopNest *new_loop_nest) const; // In phase 2, any compute_root loop marked 'none' will be split into // blocks, threads, and serial loops. To enable the cost model to make a // meaningful prediction on these pre-split loops, we assume a split into // blocks and threads with a single full warp (if possible) - void split_compute_root_loops(LoopNest* loop_nest) const; + void split_compute_root_loops(LoopNest *loop_nest) const; // If a loop nest does not have thread loops, split the outermost serial // loops to create thread loops with extents 1 - void add_outer_thread_loops(LoopNest* loop_nest) const; + void add_outer_thread_loops(LoopNest *loop_nest) const; }; - IntrusivePtr get_root_for_features(const MachineParams &params, const Target& target) const; + IntrusivePtr get_root_for_features(const MachineParams &params, const Target &target) const; - void set_gpu_store_site(const map>& parent, const LoopNest* loop, LoopNest::Sites& site) const; + void set_gpu_store_site(const map> &parent, const LoopNest *loop, LoopNest::Sites &site) const; - bool compute_featurization(const FunctionDAG &dag, const MachineParams &params, const Target& target, StageMap *features, Statistics& stats, bool verbose=false) const; + bool compute_featurization(const FunctionDAG &dag, const MachineParams &params, const Target &target, StageMap *features,
Statistics &stats, bool verbose = false) const; - void save_featurization(const FunctionDAG &dag, const MachineParams &params, const Target& target, std::ostream &out) const; + void save_featurization(const FunctionDAG &dag, const MachineParams &params, const Target &target, std::ostream &out) const; - bool contains_store_at(const set& outermost_store_at, const IntrusivePtr& parent) const; + bool contains_store_at(const set &outermost_store_at, const IntrusivePtr &parent) const; // For GPU, only allow store_at root or inside the outermost loop nest. Any // store_ats further in will be hoisted and expanded, increasing the @@ -142,13 +141,13 @@ struct State { bool exceeds_serial_extents_limit(const Target &target) const; - int64_t get_shared_mem_alloc_size(const LoopNest* block, const LoopNest* loop) const; + int64_t get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const; bool exceeds_shared_memory_limit(const Target &target) const; bool exceeds_local_memory_limit(const Target &target) const; - bool calculate_cost(const FunctionDAG &dag, const MachineParams &params, const Target& target, CostModel *cost_model, Statistics& stats, bool verbose = false); + bool calculate_cost(const FunctionDAG &dag, const MachineParams &params, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose = false); // Make a child copy of this state.
The loop nest is const (we // make mutated copies of it, rather than mutating it), so we can @@ -160,13 +159,13 @@ struct State { void print_compute_locations() const; - void fuse_gpu_blocks(LoopNest::StageScheduleState* state, Stage& stage, const vector& parallel_vars, const vector& parallel_extents, const vector& constant_extents) const; + void fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector &parallel_vars, const vector &parallel_extents, const vector &constant_extents) const; - void mark_gpu_blocks(LoopNest::StageScheduleState* state, Stage& stage, const vector& parallel_vars, const vector& parallel_extents) const; + void mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector &parallel_vars, const vector &parallel_extents) const; - bool mark_gpu_threads(LoopNest::StageScheduleState* state, Stage& stage, std::unordered_set& new_serial_vars, std::ostringstream& staged_funcs_schedule_source) const; + bool mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, std::unordered_set &new_serial_vars, std::ostringstream &staged_funcs_schedule_source) const; - bool can_fuse_gpu(const vector& parallel_extents) const; + bool can_fuse_gpu(const vector &parallel_extents) const; // Apply the schedule represented by this state to a Halide // Pipeline.
Also generate source code for the schedule for the @@ -177,7 +176,7 @@ struct State { void add_to_always_consider_inline_options(const FunctionDAG::Node *node); void update_always_consider_inline_options(const FunctionDAG::Node *node); - const LoopNest *deepest_valid_compute_location(const map> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap& total_shared_mem_alloc_sizes) const; + const LoopNest *deepest_valid_compute_location(const map> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap &total_shared_mem_alloc_sizes) const; int64_t total_loop_extents_of_ancestors(const map> &parent, const LoopNest *loop) const; }; @@ -250,4 +249,4 @@ class StateQueue { } // namespace Internal } // namespace Halide -#endif // STATE_H +#endif // STATE_H diff --git a/src/autoschedulers/anderson2021/Statistics.h b/src/autoschedulers/anderson2021/Statistics.h index e8efee149fa2..e0ccdd1bbb57 100644 --- a/src/autoschedulers/anderson2021/Statistics.h +++ b/src/autoschedulers/anderson2021/Statistics.h @@ -10,18 +10,17 @@ namespace Autoscheduler { using Clock = std::chrono::high_resolution_clock; -template +template struct ScopedStatistic { - const T& value; + const T &value; std::string msg; - ScopedStatistic(const T& value, const std::string& msg) - : value{value} - , msg{msg} - {} + ScopedStatistic(const T &value, const std::string &msg) + : value{value}, msg{msg} { + } ~ScopedStatistic() { - aslog(0) << msg << " = " << value << "\n"; + aslog(0) << msg << " = " << value << "\n"; } }; @@ -29,10 +28,8 @@ struct ScopedTimer { std::chrono::time_point start; std::string msg; - ScopedTimer(const std::string& msg) - : start{Clock::now()} - , msg{msg} - { + ScopedTimer(const std::string &msg) + : start{Clock::now()}, msg{msg} { aslog(0) << "Start: " << msg << "\n"; } @@ -47,8 +44,7 @@ struct Timer { std::chrono::time_point start; Timer() - : start{Clock::now()} - { + : start{Clock::now()} { } void restart() { 
@@ -130,4 +126,4 @@ struct Statistics { } // namespace Internal } // namespace Halide -#endif // STATISTICS_H +#endif // STATISTICS_H diff --git a/src/autoschedulers/anderson2021/ThreadInfo.h b/src/autoschedulers/anderson2021/ThreadInfo.h index a6d990adbb29..83a81c7d4723 100644 --- a/src/autoschedulers/anderson2021/ThreadInfo.h +++ b/src/autoschedulers/anderson2021/ThreadInfo.h @@ -38,7 +38,7 @@ struct ThreadTileOption { }; struct ThreadInfo { - ThreadInfo(int vectorized_loop_index, const std::vector& size, const std::vector& loop, const std::vector& max_thread_counts) { + ThreadInfo(int vectorized_loop_index, const std::vector &size, const std::vector &loop, const std::vector &max_thread_counts) { init_threads_in_this_block(max_thread_counts); std::size_t num_thread_loops = 0; @@ -83,8 +83,8 @@ struct ThreadInfo { count_num_active_warps_per_block(); } - template - void for_each_thread_id(const Fn& fn) const { + template + void for_each_thread_id(const Fn &fn) const { int thread_id = 0; for (int z = 0; z < threads_in_this_block[2]; z++) { for (int y = 0; y < threads_in_this_block[1]; y++) { @@ -96,9 +96,7 @@ struct ThreadInfo { // for thread.x in [0, 5]: // ... // For the 2nd loop, skip threads with x id >= 5 - bool active = x < threads[0] - && y < threads[1] - && z < threads[2]; + bool active = x < threads[0] && y < threads[1] && z < threads[2]; fn(thread_id, active, thread_id == num_threads_in_this_block - 1); ++thread_id; @@ -107,8 +105,8 @@ struct ThreadInfo { } } - template - void for_each_thread_id_in_first_warp(Fn& fn) const { + template + void for_each_thread_id_in_first_warp(Fn &fn) const { int thread_id = 0; for (int z = 0; z < threads_in_this_block[2]; z++) { for (int y = 0; y < threads_in_this_block[1]; y++) { @@ -120,9 +118,7 @@ struct ThreadInfo { // for thread.x in [0, 5]: // ... 
// For the 2nd loop, skip threads with x id >= 5 - bool active = x < threads[0] - && y < threads[1] - && z < threads[2]; + bool active = x < threads[0] && y < threads[1] && z < threads[2]; bool last_thread = thread_id == 31; fn(thread_id, x, y, z, active, last_thread); @@ -136,8 +132,8 @@ struct ThreadInfo { } } - template - void for_each_thread_id_in_tail_warp(Fn& fn) const { + template + void for_each_thread_id_in_tail_warp(Fn &fn) const { int thread_id = final_warp_initial_thread_id; int last_thread_id = thread_id + num_threads_in_final_warp - 1; @@ -150,16 +146,14 @@ struct ThreadInfo { internal_assert(y < threads_in_this_block[1]); internal_assert(x < threads_in_this_block[0]); - bool active = x < threads[0] - && y < threads[1] - && z < threads[2]; + bool active = x < threads[0] && y < threads[1] && z < threads[2]; fn(thread_id, x, y, z, active, thread_id == last_thread_id); } } - template - void for_each_active_thread_id(const Fn& fn) const { + template + void for_each_active_thread_id(const Fn &fn) const { for_each_thread_id([&](int thread_id, bool is_active, bool is_last_thread) { if (!is_active) { return; @@ -199,7 +193,7 @@ struct ThreadInfo { std::vector loop_vars; private: - void init_threads_in_this_block(const std::vector& max_thread_counts) { + void init_threads_in_this_block(const std::vector &max_thread_counts) { int num_thread_loops = 0; for (auto c : max_thread_counts) { if (c == 1) { @@ -272,4 +266,4 @@ struct ThreadInfo { } // namespace Internal } // namespace Halide -#endif // THREAD_INFO_H +#endif // THREAD_INFO_H diff --git a/src/autoschedulers/anderson2021/Tiling.cpp b/src/autoschedulers/anderson2021/Tiling.cpp index 14b5c8f3f54d..7171a726f293 100644 --- a/src/autoschedulers/anderson2021/Tiling.cpp +++ b/src/autoschedulers/anderson2021/Tiling.cpp @@ -6,8 +6,8 @@ namespace Halide { namespace Internal { namespace Autoscheduler { -bool all_ones(const std::vector& nums) { - for (const auto& n : nums) { +bool all_ones(const std::vector &nums) { 
+ for (const auto &n : nums) { if (n != 1) { return false; } @@ -15,7 +15,7 @@ bool all_ones(const std::vector& nums) { return true; } -bool equal_to_existing_size(const std::vector& s, const std::vector& nums) { +bool equal_to_existing_size(const std::vector &s, const std::vector &nums) { for (size_t i = 0; i < s.size(); ++i) { if (s[i] != nums[i]) { return false; @@ -328,4 +328,3 @@ vector> generate_gpu_tilings(const vector> &stag } // namespace Autoscheduler } // namespace Internal } // namespace Halide - diff --git a/src/autoschedulers/anderson2021/Tiling.h b/src/autoschedulers/anderson2021/Tiling.h index 031f91ab9049..80ce5fcd684d 100644 --- a/src/autoschedulers/anderson2021/Tiling.h +++ b/src/autoschedulers/anderson2021/Tiling.h @@ -10,16 +10,16 @@ namespace Halide { namespace Internal { namespace Autoscheduler { -bool all_ones(const std::vector& nums); +bool all_ones(const std::vector &nums); -bool equal_to_existing_size(const std::vector& s, const std::vector& nums); +bool equal_to_existing_size(const std::vector &s, const std::vector &nums); vector> generate_serial_tilings(const vector &s, int d, int last_d, int vectorized_index, const vector &vec_dim_serial_sizes, - bool filter_small_outer_extents=false, - bool allow_inner_ones=false); + bool filter_small_outer_extents = false, + bool allow_inner_ones = false); // Given a multi-dimensional box of dimensionality d, generate a list // of candidate tile sizes for it, logarithmically spacing the sizes @@ -33,7 +33,6 @@ vector> generate_tilings(const vector &s, int d, int fa bool allow_splits, const vector &inner_sizes = vector()); - /** moves vectorized dimension first and also removes dimensions with size 1 to reflect actual thread dimensions when loop nests are lowered **/ void lowered_dims(const vector &size, int vector_loop_i, vector &lowered_size); @@ -47,12 +46,12 @@ void lowered_dims(const vector &size, int vector_loop_i, vector> generate_gpu_tilings(const vector> &stage_sizes, - const vector> 
&pure_dims, - const vector &max_s, - int d, const vector &vectorized_indices, bool serial_inner, bool is_compute_root_stage); + const vector> &pure_dims, + const vector &max_s, + int d, const vector &vectorized_indices, bool serial_inner, bool is_compute_root_stage); } // namespace Autoscheduler } // namespace Internal } // namespace Halide -#endif // TILING_H +#endif // TILING_H diff --git a/src/autoschedulers/anderson2021/check_weights.cpp b/src/autoschedulers/anderson2021/check_weights.cpp index 9e8ea5ec9b2e..a9ac923336b4 100644 --- a/src/autoschedulers/anderson2021/check_weights.cpp +++ b/src/autoschedulers/anderson2021/check_weights.cpp @@ -1,7 +1,7 @@ #include #include -#include #include +#include #include "CostModel.h" #include "NetworkSize.h" diff --git a/src/autoschedulers/anderson2021/cost_model_generator.cpp b/src/autoschedulers/anderson2021/cost_model_generator.cpp index c6608a412d0a..ee863015e15c 100644 --- a/src/autoschedulers/anderson2021/cost_model_generator.cpp +++ b/src/autoschedulers/anderson2021/cost_model_generator.cpp @@ -185,7 +185,7 @@ class CostModel : public Generator> { return 1 / (1 + exp(-e)); } - Expr print_wrap(Expr e, const std::string& out, const Var& n, const Var& w) { + Expr print_wrap(Expr e, const std::string &out, const Var &n, const Var &w) { if (training || !enable_debug_output) { return e; } @@ -344,8 +344,8 @@ class CostModel : public Generator> { // different cost to vectors and scalars, and a different cost // depending on whether we were inlined. 
Expr compute_cost = select(inlined_calls == 0, - num_scalars * relu1(1, w, n), - num_scalars * relu1(3, w, n)); + num_scalars * relu1(1, w, n), + num_scalars * relu1(3, w, n)); compute_cost = print_wrap(compute_cost, "compute_cost_initial", n, w); diff --git a/src/autoschedulers/anderson2021/test.cpp b/src/autoschedulers/anderson2021/test.cpp index 4e20ee0808fc..cfaa3052dbc0 100644 --- a/src/autoschedulers/anderson2021/test.cpp +++ b/src/autoschedulers/anderson2021/test.cpp @@ -164,7 +164,6 @@ int main(int argc, char **argv) { Pipeline(after[4]).auto_schedule(target, params); } - if (1) { Buffer im_a(1024, 1024, "a"), im_b(1024, 1024, "b"); im_a.fill(0.0f); @@ -249,7 +248,6 @@ int main(int argc, char **argv) { Pipeline(h).auto_schedule(target, params); } - if (1) { // A no-win scenario in which a Func is going to be read from // lots of times using a vector gather no matter how it is diff --git a/src/autoschedulers/anderson2021/test/bounds.cpp b/src/autoschedulers/anderson2021/test/bounds.cpp index d9bc5c599dc2..86ca90431bae 100644 --- a/src/autoschedulers/anderson2021/test/bounds.cpp +++ b/src/autoschedulers/anderson2021/test/bounds.cpp @@ -1,6 +1,6 @@ -#include "test.h" -#include "Tiling.h" #include "LoopNest.h" +#include "Tiling.h" +#include "test.h" using namespace Halide; using namespace Halide::Internal; @@ -23,9 +23,9 @@ void test_bounds() { outputs.push_back(h.function()); FunctionDAG dag(outputs, params, target); - const FunctionDAG::Node* node_h = &dag.nodes[0]; - const FunctionDAG::Node* node_g = &dag.nodes[1]; - const FunctionDAG::Node* node_f = &dag.nodes[2]; + const FunctionDAG::Node *node_h = &dag.nodes[0]; + const FunctionDAG::Node *node_g = &dag.nodes[1]; + const FunctionDAG::Node *node_f = &dag.nodes[2]; EXPECT_EQ(node_h->func.name(), std::string("h")); EXPECT_EQ(node_f->func.name(), std::string("f")); @@ -45,9 +45,9 @@ void test_bounds() { // Thread loop root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), 
target, true, false); - const auto& thread = root->children[0]->children[0]; - const auto& thread_bounds_g = thread->get_bounds(node_g); - const auto& thread_bounds_f = thread->get_bounds(node_f); + const auto &thread = root->children[0]->children[0]; + const auto &thread_bounds_g = thread->get_bounds(node_g); + const auto &thread_bounds_f = thread->get_bounds(node_f); EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 1); @@ -67,9 +67,9 @@ void test_bounds() { outputs.push_back(out.function()); FunctionDAG dag(outputs, params, target); - const FunctionDAG::Node* node_out = &dag.nodes[0]; - const FunctionDAG::Node* node_f = &dag.nodes[2]; - const FunctionDAG::Node* node_g = &dag.nodes[3]; + const FunctionDAG::Node *node_out = &dag.nodes[0]; + const FunctionDAG::Node *node_f = &dag.nodes[2]; + const FunctionDAG::Node *node_g = &dag.nodes[3]; std::unique_ptr root = std::make_unique(); @@ -85,9 +85,9 @@ void test_bounds() { // Thread loop root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); - const auto& thread = root->children[0]->children[0]; - const auto& thread_bounds_g = thread->get_bounds(node_g); - const auto& thread_bounds_f = thread->get_bounds(node_f); + const auto &thread = root->children[0]->children[0]; + const auto &thread_bounds_g = thread->get_bounds(node_g); + const auto &thread_bounds_f = thread->get_bounds(node_f); EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 515); @@ -111,9 +111,9 @@ void test_bounds() { outputs.push_back(out.function()); FunctionDAG dag(outputs, params, target); - const FunctionDAG::Node* node_out = &dag.nodes[0]; - const FunctionDAG::Node* node_h = &dag.nodes[1]; - const FunctionDAG::Node* node_g = &dag.nodes[2]; + const FunctionDAG::Node *node_out = &dag.nodes[0]; + const FunctionDAG::Node *node_h = &dag.nodes[1]; + const FunctionDAG::Node *node_g = &dag.nodes[2]; EXPECT_EQ(node_out->func.name(), out.name()); EXPECT_EQ(node_h->func.name(), h.name()); @@ 
-136,9 +136,9 @@ void test_bounds() { std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); - const auto& thread = root->children[0]->children[0]; - const auto& thread_bounds_g = thread->get_bounds(node_g); - const auto& thread_bounds_h = thread->get_bounds(node_h); + const auto &thread = root->children[0]->children[0]; + const auto &thread_bounds_g = thread->get_bounds(node_g); + const auto &thread_bounds_h = thread->get_bounds(node_h); EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 3); @@ -147,9 +147,9 @@ void test_bounds() { // If 'h' is inlined, the region_required should not change root_copy->inline_func(node_h); { - const auto& thread = root_copy->children[0]->children[0]; - const auto& thread_bounds_g = thread->get_bounds(node_g); - const auto& thread_bounds_h = thread->get_bounds(node_h); + const auto &thread = root_copy->children[0]->children[0]; + const auto &thread_bounds_g = thread->get_bounds(node_g); + const auto &thread_bounds_h = thread->get_bounds(node_h); EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 3); @@ -160,8 +160,8 @@ void test_bounds() { { Func f("f"), g("g"), out("out"); g(x) = x; - f(x) = g(x - 100) + g(x + 100); // 201 points of g required for each point of f - out(x) = f(x) + g(x); // 1 point of g required for each point of out + f(x) = g(x - 100) + g(x + 100); // 201 points of g required for each point of f + out(x) = f(x) + g(x); // 1 point of g required for each point of out out.set_estimate(x, 0, 1024); @@ -169,9 +169,9 @@ void test_bounds() { outputs.push_back(out.function()); FunctionDAG dag(outputs, params, target); - const FunctionDAG::Node* node_out = &dag.nodes[0]; - const FunctionDAG::Node* node_f = &dag.nodes[1]; - const FunctionDAG::Node* node_g = &dag.nodes[2]; + const FunctionDAG::Node *node_out = &dag.nodes[0]; + const FunctionDAG::Node *node_f = &dag.nodes[1]; + const FunctionDAG::Node *node_g = &dag.nodes[2]; EXPECT_EQ(node_out->func.name(), out.name()); EXPECT_EQ(node_g->func.name(), 
g.name()); @@ -194,10 +194,10 @@ void test_bounds() { std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); - const auto& thread = root->children[0]->children[0]; - const auto& thread_bounds_g = thread->get_bounds(node_g); - const auto& thread_bounds_f = thread->get_bounds(node_f); - const auto& thread_bounds_out = thread->get_bounds(node_out); + const auto &thread = root->children[0]->children[0]; + const auto &thread_bounds_g = thread->get_bounds(node_g); + const auto &thread_bounds_f = thread->get_bounds(node_f); + const auto &thread_bounds_out = thread->get_bounds(node_out); EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 201); EXPECT_EQ(thread_bounds_g->loops(0, 0).extent(), 201); @@ -206,8 +206,7 @@ void test_bounds() { EXPECT_EQ(thread_bounds_f->region_required(0).extent(), 1); - - vector out_g_edge_chain; + vector out_g_edge_chain; for (const auto *e : node_g->outgoing_edges) { if (e->consumer != thread->stage) { continue; @@ -218,7 +217,7 @@ void test_bounds() { EXPECT_EQ((int)out_g_edge_chain.size(), 1); - vector out_f_g_edge_chain; + vector out_f_g_edge_chain; for (const auto *e : node_f->outgoing_edges) { if (e->consumer != thread->stage) { continue; @@ -230,29 +229,29 @@ void test_bounds() { out_f_g_edge_chain.push_back(node_f->stages[0].incoming_edges.front()); EXPECT_EQ((int)out_f_g_edge_chain.size(), 2); - const auto& thread_bounds_g_edge = thread->get_bounds_along_edge_chain(node_g, out_g_edge_chain); + const auto &thread_bounds_g_edge = thread->get_bounds_along_edge_chain(node_g, out_g_edge_chain); // This should only account for the edge from 'g' -> 'out' (and ignore the // edge from 'g' -> 'f') EXPECT_EQ(thread_bounds_g_edge->region_required(0).extent(), 1); - const auto& thread_bounds_f_g_edge = thread->get_bounds_along_edge_chain(node_g, out_f_g_edge_chain); + const auto &thread_bounds_f_g_edge = thread->get_bounds_along_edge_chain(node_g, out_f_g_edge_chain); 
EXPECT_EQ(thread_bounds_f_g_edge->region_required(0).extent(), 201); // If 'f' is inlined, the region_required should still produce valid results root_copy->inline_func(node_f); { - const auto& thread = root_copy->children[0]->children[0]; - const auto& thread_bounds_g = thread->get_bounds(node_g); + const auto &thread = root_copy->children[0]->children[0]; + const auto &thread_bounds_g = thread->get_bounds(node_g); EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 201); - const auto& thread_bounds_g_edge = thread->get_bounds_along_edge_chain(node_g, out_g_edge_chain); + const auto &thread_bounds_g_edge = thread->get_bounds_along_edge_chain(node_g, out_g_edge_chain); EXPECT_EQ(thread_bounds_g_edge->region_required(0).extent(), 1); - const auto& thread_bounds_f_g_edge = thread->get_bounds_along_edge_chain(node_g, out_f_g_edge_chain); + const auto &thread_bounds_f_g_edge = thread->get_bounds_along_edge_chain(node_g, out_f_g_edge_chain); EXPECT_EQ(thread_bounds_f_g_edge->region_required(0).extent(), 201); } @@ -261,8 +260,8 @@ void test_bounds() { { Func f("f"), g("g"), out("out"); g(x) = x; - f(x) = g(x); // 1 point of g required for each point of f - out(x) = f(x) + g(x); // 1 point of g required for each point of out + f(x) = g(x); // 1 point of g required for each point of f + out(x) = f(x) + g(x); // 1 point of g required for each point of out out.set_estimate(x, 0, 1024); @@ -270,9 +269,9 @@ void test_bounds() { outputs.push_back(out.function()); FunctionDAG dag(outputs, params, target); - const FunctionDAG::Node* node_out = &dag.nodes[0]; - const FunctionDAG::Node* node_f = &dag.nodes[1]; - const FunctionDAG::Node* node_g = &dag.nodes[2]; + const FunctionDAG::Node *node_out = &dag.nodes[0]; + const FunctionDAG::Node *node_f = &dag.nodes[1]; + const FunctionDAG::Node *node_g = &dag.nodes[2]; EXPECT_EQ(node_out->func.name(), out.name()); EXPECT_EQ(node_g->func.name(), g.name()); @@ -295,17 +294,17 @@ void test_bounds() { std::unique_ptr root_copy{new 
LoopNest}; root_copy->copy_from(*root); - const auto& thread = root->children[0]->children[0]; - const auto& thread_bounds_g = thread->get_bounds(node_g); - const auto& thread_bounds_f = thread->get_bounds(node_f); + const auto &thread = root->children[0]->children[0]; + const auto &thread_bounds_g = thread->get_bounds(node_g); + const auto &thread_bounds_f = thread->get_bounds(node_f); EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 1); EXPECT_EQ(thread_bounds_f->region_required(0).extent(), 1); root_copy->inline_func(node_f); { - const auto& thread = root_copy->children[0]->children[0]; - const auto& thread_bounds_g = thread->get_bounds(node_g); + const auto &thread = root_copy->children[0]->children[0]; + const auto &thread_bounds_g = thread->get_bounds(node_g); EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 1); } diff --git a/src/autoschedulers/anderson2021/test/parser.cpp b/src/autoschedulers/anderson2021/test/parser.cpp index 53d635c65ae2..e29f221e86b3 100644 --- a/src/autoschedulers/anderson2021/test/parser.cpp +++ b/src/autoschedulers/anderson2021/test/parser.cpp @@ -1,5 +1,5 @@ -#include "test.h" #include "LoopNestParser.h" +#include "test.h" using namespace Halide; using namespace Halide::Internal; diff --git a/src/autoschedulers/anderson2021/test/state.cpp b/src/autoschedulers/anderson2021/test/state.cpp index ac43ced6974a..0cc266857e47 100644 --- a/src/autoschedulers/anderson2021/test/state.cpp +++ b/src/autoschedulers/anderson2021/test/state.cpp @@ -1,6 +1,6 @@ -#include "test.h" -#include "LoopNest.h" #include "State.h" +#include "LoopNest.h" +#include "test.h" using namespace Halide; using namespace Halide::Internal; @@ -24,9 +24,9 @@ void test_state() { outputs.push_back(h.function()); FunctionDAG dag(outputs, params, target); - const FunctionDAG::Node* node_h = &dag.nodes[0]; - const FunctionDAG::Node* node_g = &dag.nodes[1]; - const FunctionDAG::Node* node_f = &dag.nodes[2]; + const FunctionDAG::Node *node_h = &dag.nodes[0]; + 
const FunctionDAG::Node *node_g = &dag.nodes[1]; + const FunctionDAG::Node *node_f = &dag.nodes[2]; EXPECT_EQ(node_h->func.name(), std::string("h")); EXPECT_EQ(node_f->func.name(), std::string("f")); diff --git a/src/autoschedulers/anderson2021/test/storage_strides.cpp b/src/autoschedulers/anderson2021/test/storage_strides.cpp index 6b43914f1fd0..dedb79a45cdf 100644 --- a/src/autoschedulers/anderson2021/test/storage_strides.cpp +++ b/src/autoschedulers/anderson2021/test/storage_strides.cpp @@ -1,5 +1,5 @@ -#include "test.h" #include "LoopNest.h" +#include "test.h" using namespace Halide; using namespace Halide::Internal; @@ -24,9 +24,9 @@ void test_bounds() { outputs.push_back(h.function()); FunctionDAG dag(outputs, params, target); - const FunctionDAG::Node* node_h = &dag.nodes[0]; - const FunctionDAG::Node* node_g = &dag.nodes[1]; - const FunctionDAG::Node* node_f = &dag.nodes[2]; + const FunctionDAG::Node *node_h = &dag.nodes[0]; + const FunctionDAG::Node *node_g = &dag.nodes[1]; + const FunctionDAG::Node *node_f = &dag.nodes[2]; EXPECT_EQ(node_h->func.name(), std::string("h")); EXPECT_EQ(node_f->func.name(), std::string("f")); @@ -46,10 +46,9 @@ void test_bounds() { // Thread loop root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); - - const auto& thread = root->children[0]->children[0]; - const auto& thread_bounds_g = thread->get_bounds(node_g); - const auto& thread_bounds_f = thread->get_bounds(node_f); + const auto &thread = root->children[0]->children[0]; + const auto &thread_bounds_g = thread->get_bounds(node_g); + const auto &thread_bounds_f = thread->get_bounds(node_f); EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 1); EXPECT_EQ(thread_bounds_g->region_required(1).extent(), 1); @@ -77,9 +76,9 @@ void test_bounds() { outputs.push_back(out.function()); FunctionDAG dag(outputs, params, target); - const FunctionDAG::Node* node_out = &dag.nodes[0]; - const FunctionDAG::Node* node_h = 
&dag.nodes[1]; - const FunctionDAG::Node* node_g = &dag.nodes[2]; + const FunctionDAG::Node *node_out = &dag.nodes[0]; + const FunctionDAG::Node *node_h = &dag.nodes[1]; + const FunctionDAG::Node *node_g = &dag.nodes[2]; EXPECT_EQ(node_out->func.name(), out.name()); EXPECT_EQ(node_h->func.name(), h.name()); @@ -102,9 +101,9 @@ void test_bounds() { std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); - const auto& thread = root->children[0]->children[0]; - const auto& thread_bounds_g = thread->get_bounds(node_g); - const auto& thread_bounds_h = thread->get_bounds(node_h); + const auto &thread = root->children[0]->children[0]; + const auto &thread_bounds_g = thread->get_bounds(node_g); + const auto &thread_bounds_h = thread->get_bounds(node_h); EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 3); @@ -113,9 +112,9 @@ void test_bounds() { // If 'h' is inlined, the region_required should not change root_copy->inline_func(node_h); { - const auto& thread = root_copy->children[0]->children[0]; - const auto& thread_bounds_g = thread->get_bounds(node_g); - const auto& thread_bounds_h = thread->get_bounds(node_h); + const auto &thread = root_copy->children[0]->children[0]; + const auto &thread_bounds_g = thread->get_bounds(node_g); + const auto &thread_bounds_h = thread->get_bounds(node_h); EXPECT_EQ(thread_bounds_g->region_required(0).extent(), 3); @@ -135,8 +134,8 @@ void test_bounds() { outputs.push_back(out.function()); FunctionDAG dag(outputs, params, target); - const FunctionDAG::Node* node_out = &dag.nodes[0]; - const FunctionDAG::Node* node_f = &dag.nodes[1]; + const FunctionDAG::Node *node_out = &dag.nodes[0]; + const FunctionDAG::Node *node_f = &dag.nodes[1]; EXPECT_EQ(node_out->func.name(), out.name()); EXPECT_EQ(node_f->func.name(), f.name()); @@ -158,16 +157,16 @@ void test_bounds() { std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); - const auto& root_bounds_f = root->get_bounds(node_f); + const auto &root_bounds_f = 
root->get_bounds(node_f); EXPECT_EQ(root_bounds_f->region_required(0).extent(), 1024); EXPECT_EQ(1, (int)node_f->outgoing_edges.size()); EXPECT_EQ(1, (int)node_f->outgoing_edges.front()->load_jacobians.size()); ThreadInfo thread_info{0, {32}, node_out->stages[0].loop, {32}}; - const auto& jac = node_f->outgoing_edges.front()->load_jacobians.front(); + const auto &jac = node_f->outgoing_edges.front()->load_jacobians.front(); - const auto& thread = root->children[0]->children[0]; + const auto &thread = root->children[0]->children[0]; Strides strides = thread->compute_strides(jac, 0, node_f, root_bounds_f, thread_info, verbose); GlobalAccessAccumulator accumulator{bytes_per_point, 1, strides, verbose}; @@ -178,8 +177,7 @@ void test_bounds() { accumulator.add_access_info( num_requests, mem_info, - false - ); + false); EXPECT_EQ(4, mem_info.num_transactions()); } @@ -196,8 +194,8 @@ void test_bounds() { outputs.push_back(out.function()); FunctionDAG dag(outputs, params, target); - const FunctionDAG::Node* node_out = &dag.nodes[0]; - const FunctionDAG::Node* node_f = &dag.nodes[1]; + const FunctionDAG::Node *node_out = &dag.nodes[0]; + const FunctionDAG::Node *node_f = &dag.nodes[1]; EXPECT_EQ(node_out->func.name(), out.name()); EXPECT_EQ(node_f->func.name(), f.name()); @@ -219,16 +217,16 @@ void test_bounds() { std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); - const auto& root_bounds_f = root->get_bounds(node_f); + const auto &root_bounds_f = root->get_bounds(node_f); EXPECT_EQ(root_bounds_f->region_required(0).extent(), 512); EXPECT_EQ(1, (int)node_f->outgoing_edges.size()); EXPECT_EQ(1, (int)node_f->outgoing_edges.front()->load_jacobians.size()); ThreadInfo thread_info{0, {32}, node_out->stages[0].loop, {32}}; - const auto& jac = node_f->outgoing_edges.front()->load_jacobians.front(); + const auto &jac = node_f->outgoing_edges.front()->load_jacobians.front(); - const auto& thread = root->children[0]->children[0]; + const auto &thread = 
root->children[0]->children[0]; Strides strides = thread->compute_strides(jac, 0, node_f, root_bounds_f, thread_info, verbose); GlobalAccessAccumulator accumulator{bytes_per_point, 1, strides, verbose}; @@ -239,8 +237,7 @@ void test_bounds() { accumulator.add_access_info( num_requests, mem_info, - false - ); + false); EXPECT_EQ(2, mem_info.num_transactions()); } @@ -258,8 +255,8 @@ void test_bounds() { outputs.push_back(out.function()); FunctionDAG dag(outputs, params, target); - const FunctionDAG::Node* node_out = &dag.nodes[0]; - const FunctionDAG::Node* node_f = &dag.nodes[1]; + const FunctionDAG::Node *node_out = &dag.nodes[0]; + const FunctionDAG::Node *node_f = &dag.nodes[1]; EXPECT_EQ(node_out->func.name(), out.name()); EXPECT_EQ(node_f->func.name(), f.name()); @@ -284,7 +281,7 @@ void test_bounds() { std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); - const auto& root_bounds_f = root->get_bounds(node_f); + const auto &root_bounds_f = root->get_bounds(node_f); EXPECT_EQ(root_bounds_f->region_required(0).extent(), 1024); EXPECT_EQ(root_bounds_f->region_required(1).extent(), 512); @@ -293,9 +290,9 @@ void test_bounds() { EXPECT_EQ(1, (int)node_f->outgoing_edges.front()->load_jacobians.size()); ThreadInfo thread_info{1, {1, 32}, node_out->stages[0].loop, {1, 32}}; - const auto& jac = node_f->outgoing_edges.front()->load_jacobians.front(); + const auto &jac = node_f->outgoing_edges.front()->load_jacobians.front(); - const auto& thread = root->children[0]->children[0]; + const auto &thread = root->children[0]->children[0]; Strides strides = thread->compute_strides(jac, 0, node_f, root_bounds_f, thread_info, verbose); strides.dump(true); @@ -307,8 +304,7 @@ void test_bounds() { accumulator.add_access_info( num_requests, mem_info, - false - ); + false); EXPECT_EQ(16, mem_info.num_transactions()); } @@ -327,9 +323,9 @@ void test_bounds() { outputs.push_back(out.function()); FunctionDAG dag(outputs, params, target); - const FunctionDAG::Node* 
node_out = &dag.nodes[0]; - const FunctionDAG::Node* node_f = &dag.nodes[1]; - const FunctionDAG::Node* node_g = &dag.nodes[2]; + const FunctionDAG::Node *node_out = &dag.nodes[0]; + const FunctionDAG::Node *node_f = &dag.nodes[1]; + const FunctionDAG::Node *node_g = &dag.nodes[2]; EXPECT_EQ(node_out->func.name(), out.name()); EXPECT_EQ(node_f->func.name(), f.name()); @@ -359,7 +355,7 @@ void test_bounds() { std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); - const auto& root_bounds_f = root->get_bounds(node_f); + const auto &root_bounds_f = root->get_bounds(node_f); EXPECT_EQ(root_bounds_f->region_required(0).extent(), 1024); @@ -367,10 +363,10 @@ void test_bounds() { EXPECT_EQ(1, (int)node_g->outgoing_edges.front()->load_jacobians.size()); ThreadInfo thread_info{1, {32, 1}, node_out->stages[0].loop, {32, 1}}; - const auto& jac = node_g->outgoing_edges.front()->load_jacobians.front(); + const auto &jac = node_g->outgoing_edges.front()->load_jacobians.front(); - const auto& thread = root->children[0]->children[0]; - const auto& thread_bounds_g = thread->get_bounds(node_g); + const auto &thread = root->children[0]->children[0]; + const auto &thread_bounds_g = thread->get_bounds(node_g); Strides strides = thread->compute_strides(jac, 0, node_g, thread_bounds_g, thread_info, verbose); GlobalAccessAccumulator accumulator{bytes_per_point, 1, strides, verbose}; @@ -381,8 +377,7 @@ void test_bounds() { accumulator.add_access_info( num_requests, mem_info, - false - ); + false); EXPECT_EQ(4, mem_info.num_transactions()); } diff --git a/src/autoschedulers/anderson2021/test/test.h b/src/autoschedulers/anderson2021/test/test.h index d28796029866..a5489de57837 100644 --- a/src/autoschedulers/anderson2021/test/test.h +++ b/src/autoschedulers/anderson2021/test/test.h @@ -11,16 +11,16 @@ namespace Autoscheduler { #define EXPECT_EQ(expected, actual) expect_eq(__LINE__, expected, actual) #define EXPECT(expected) expect(__LINE__, expected) -template -void 
expect_eq(int line, const A& expected, const B& actual) { +template +void expect_eq(int line, const A &expected, const B &actual) { user_assert(expected == actual) << "Assert failed on line " << line << "." << "\nExpected value = " << expected << "\nActual value = " << actual; } -template -void expect(int line, const A& expected) { +template +void expect(int line, const A &expected) { user_assert(expected) << "Assert failed on line " << line << "." << "\nExpected value to be true\n"; @@ -30,4 +30,4 @@ void expect(int line, const A& expected) { } // namespace Internal } // namespace Halide -#endif // TEST_H +#endif // TEST_H diff --git a/src/autoschedulers/anderson2021/test/thread_info.cpp b/src/autoschedulers/anderson2021/test/thread_info.cpp index ef4dc91cabd6..149322068ff4 100644 --- a/src/autoschedulers/anderson2021/test/thread_info.cpp +++ b/src/autoschedulers/anderson2021/test/thread_info.cpp @@ -1,6 +1,6 @@ -#include "test.h" -#include "ThreadInfo.h" #include "LoopNest.h" +#include "ThreadInfo.h" +#include "test.h" using namespace Halide; using namespace Halide::Internal; @@ -100,7 +100,6 @@ void test_thread_info() { } } - int main(int argc, char **argv) { test_thread_info(); printf("All tests passed.\n"); diff --git a/src/autoschedulers/anderson2021/test/tiling.cpp b/src/autoschedulers/anderson2021/test/tiling.cpp index 93d664c9f2cf..6e0827c9ede8 100644 --- a/src/autoschedulers/anderson2021/test/tiling.cpp +++ b/src/autoschedulers/anderson2021/test/tiling.cpp @@ -1,7 +1,7 @@ #include -#include "test.h" #include "Tiling.h" +#include "test.h" using namespace Halide; using namespace Halide::Internal; @@ -9,17 +9,17 @@ using namespace Halide::Internal::Autoscheduler; using tilings_t = vector>; -std::string to_string(const tilings_t& tilings) { +std::string to_string(const tilings_t &tilings) { std::ostringstream s; s << "[\n"; bool first_tiling = true; - for (const auto& t : tilings) { + for (const auto &t : tilings) { if (!first_tiling) { s << ",\n"; } s << " 
["; bool first = true; - for (const auto& x : t) { + for (const auto &x : t) { if (!first) { s << ", "; } @@ -34,8 +34,8 @@ std::string to_string(const tilings_t& tilings) { return s.str(); } -template <> -void Halide::Internal::Autoscheduler::expect_eq(int line, const tilings_t& expected, const tilings_t& actual) { +template<> +void Halide::Internal::Autoscheduler::expect_eq(int line, const tilings_t &expected, const tilings_t &actual) { expect_eq(line, to_string(expected), to_string(actual)); } From 73b055a0413893253fc710fec4afc64cc8b4a53f Mon Sep 17 00:00:00 2001 From: aekul Date: Thu, 28 Jul 2022 01:42:23 -0400 Subject: [PATCH 03/63] clang-format 13 --- src/autoschedulers/anderson2021/GPULoopInfo.h | 2 +- src/autoschedulers/anderson2021/LoopNest.cpp | 20 +++++++++---------- .../anderson2021/SearchSpace.cpp | 2 +- .../anderson2021/cost_model_generator.cpp | 2 +- .../anderson2021/cost_model_schedule.h | 4 ---- .../anderson2021/test_function_dag.cpp | 2 +- 6 files changed, 14 insertions(+), 18 deletions(-) diff --git a/src/autoschedulers/anderson2021/GPULoopInfo.h b/src/autoschedulers/anderson2021/GPULoopInfo.h index 50696544aac9..b74ebd9a229a 100644 --- a/src/autoschedulers/anderson2021/GPULoopInfo.h +++ b/src/autoschedulers/anderson2021/GPULoopInfo.h @@ -3,7 +3,7 @@ /** \file * - * Data structure containing information about the current GPU loop nest + * Data structure containing information about the current GPU loop nest * hierarchy of blocks, threads, etc. 
Useful when computing GPU features */ diff --git a/src/autoschedulers/anderson2021/LoopNest.cpp b/src/autoschedulers/anderson2021/LoopNest.cpp index fcdc9e3ff5ee..e9abc8e02a1c 100644 --- a/src/autoschedulers/anderson2021/LoopNest.cpp +++ b/src/autoschedulers/anderson2021/LoopNest.cpp @@ -951,13 +951,13 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ thread_info, total_serial_loop_extents, verbose); - //feat.num_local_mem_stores_per_block = local_mem_info.num_transactions(); + // feat.num_local_mem_stores_per_block = local_mem_info.num_transactions(); if (stage->index > 0) { local_mem_loads.add(local_mem_info); } - //feat.local_mem_store_efficiency = local_mem_info.efficiency(); + // feat.local_mem_store_efficiency = local_mem_info.efficiency(); - //internal_assert(in_range_zero_one(feat.local_mem_store_efficiency)) << "Invalid local mem store coalesce efficiency: " << feat.local_mem_store_efficiency << " for " << node->func.name(); + // internal_assert(in_range_zero_one(feat.local_mem_store_efficiency)) << "Invalid local mem store coalesce efficiency: " << feat.local_mem_store_efficiency << " for " << node->func.name(); } if (verbose) { @@ -1754,8 +1754,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, feat.shared_bytes_at_task = feat.bytes_at_realization; feat.shared_innermost_bytes_at_task = feat.innermost_bytes_at_realization; } else if (site.is_stored_in_local_mem()) { - //feat.local_bytes_at_task = feat.bytes_at_realization; - //feat.local_innermost_bytes_at_task = feat.innermost_bytes_at_realization; + // feat.local_bytes_at_task = feat.bytes_at_realization; + // feat.local_innermost_bytes_at_task = feat.innermost_bytes_at_realization; } else if (site.is_stored_in_registers()) { feat.register_bytes_at_task = feat.bytes_at_realization; feat.register_innermost_bytes_at_task = feat.innermost_bytes_at_realization; @@ -1963,8 +1963,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, feat.shared_bytes_at_task 
= bytes_at_task; feat.shared_innermost_bytes_at_task = innermost_bytes_at_task; } else if (site.is_stored_in_local_mem()) { - //feat.local_bytes_at_task = bytes_at_task; - //feat.local_innermost_bytes_at_task = innermost_bytes_at_task; + // feat.local_bytes_at_task = bytes_at_task; + // feat.local_innermost_bytes_at_task = innermost_bytes_at_task; } else { internal_assert(false); } @@ -2572,9 +2572,9 @@ void LoopNest::compute_features(const FunctionDAG &dag, feat.unique_shared_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; feat.shared_allocation_bytes_read_per_realization += feat.bytes_at_production; } else if (consumer_site.is_stored_in_local_mem()) { - //feat.unique_local_bytes_read_per_realization += feat.bytes_at_production; - //feat.unique_local_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; - //feat.local_allocation_bytes_read_per_realization += feat.bytes_at_production; + // feat.unique_local_bytes_read_per_realization += feat.bytes_at_production; + // feat.unique_local_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; + // feat.local_allocation_bytes_read_per_realization += feat.bytes_at_production; } else if (consumer_site.is_stored_in_registers()) { feat.unique_register_bytes_read_per_realization += feat.bytes_at_production; feat.unique_register_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; diff --git a/src/autoschedulers/anderson2021/SearchSpace.cpp b/src/autoschedulers/anderson2021/SearchSpace.cpp index 46e5e7110d60..5e82955af630 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.cpp +++ b/src/autoschedulers/anderson2021/SearchSpace.cpp @@ -277,7 +277,7 @@ void SearchSpace::generate_children(IntrusivePtr state, << ", which is one of the consumers of " << node->func.name(); } - //ScopedTimer scoped_timer{"generate_children() for " + node->func.name()}; + // ScopedTimer 
scoped_timer{"generate_children() for " + node->func.name()}; bool must_inline = inlined_nodes.contains(node); bool must_compute_root = compute_root_nodes.contains(node); diff --git a/src/autoschedulers/anderson2021/cost_model_generator.cpp b/src/autoschedulers/anderson2021/cost_model_generator.cpp index ee863015e15c..64da7eb27383 100644 --- a/src/autoschedulers/anderson2021/cost_model_generator.cpp +++ b/src/autoschedulers/anderson2021/cost_model_generator.cpp @@ -380,7 +380,7 @@ class CostModel : public Generator> { Expr register_block_occupancy = print_wrap(select(inlined_calls == 0, max_active_blocks / 32.f, 1.f), "register_block_occupancy", n, w); - //compute_cost *= select(inlined_calls == 0, 1.f / register_block_occupancy, 1.f); + // compute_cost *= select(inlined_calls == 0, 1.f / register_block_occupancy, 1.f); compute_cost = print_wrap(compute_cost, "compute_cost_after_register_block_occupancy", n, w); // Next comes a long list of plausible terms to capture the cost of loads. diff --git a/src/autoschedulers/anderson2021/cost_model_schedule.h b/src/autoschedulers/anderson2021/cost_model_schedule.h index 090cd672f465..8e560e49dcd7 100644 --- a/src/autoschedulers/anderson2021/cost_model_schedule.h +++ b/src/autoschedulers/anderson2021/cost_model_schedule.h @@ -5,10 +5,6 @@ using namespace Halide; inline void do_cost_model_schedule(Halide::Pipeline pipeline) { // Generated by autoscheduler, manually remove unrolls. // Also manually replaced all RoundUp and ShiftInwards with GuardWithIf. 
- //for (int i = 0; i < 58; i++) { - //pipeline.get_func(i).compute_root(); - //return; - //} using ::Halide::Func; using ::Halide::MemoryType; diff --git a/src/autoschedulers/anderson2021/test_function_dag.cpp b/src/autoschedulers/anderson2021/test_function_dag.cpp index 933c7f9d5027..253307321ecc 100644 --- a/src/autoschedulers/anderson2021/test_function_dag.cpp +++ b/src/autoschedulers/anderson2021/test_function_dag.cpp @@ -76,7 +76,7 @@ void test_coeff_wise(const MachineParams ¶ms, const Target &target) { } // Disabled for now: there is still work to do to populate the jacobian - //assert(with_extern.str() == without_extern.str()); + // assert(with_extern.str() == without_extern.str()); } extern "C" int matmul( From dfe18b84e603d6b607c790eaf4bdc3bda5442862 Mon Sep 17 00:00:00 2001 From: aekul Date: Sun, 14 Aug 2022 02:05:42 -0400 Subject: [PATCH 04/63] run clang-tidy, remove MachineParams, use new autoscheduler params and entry point --- apps/cuda_mat_mul/mat_mul_generator.cpp | 2 +- src/autoschedulers/anderson2021/ASLog.cpp | 4 +- .../anderson2021/AutoSchedule.cpp | 110 ++++++--- .../anderson2021/AutoSchedule.h | 14 +- src/autoschedulers/anderson2021/CostModel.h | 2 +- .../anderson2021/DefaultCostModel.cpp | 6 +- .../anderson2021/DefaultCostModel.h | 4 +- .../anderson2021/Featurization.h | 6 +- .../anderson2021/FunctionDAG.cpp | 42 ++-- src/autoschedulers/anderson2021/FunctionDAG.h | 46 ++-- src/autoschedulers/anderson2021/LoopNest.cpp | 216 ++++++++++-------- src/autoschedulers/anderson2021/LoopNest.h | 18 +- src/autoschedulers/anderson2021/Makefile | 6 +- .../anderson2021/PerfectHashMap.h | 24 +- .../anderson2021/SearchSpace.cpp | 78 +++---- src/autoschedulers/anderson2021/SearchSpace.h | 12 +- src/autoschedulers/anderson2021/State.cpp | 111 +++++---- src/autoschedulers/anderson2021/State.h | 12 +- src/autoschedulers/anderson2021/ThreadInfo.h | 8 +- src/autoschedulers/anderson2021/Tiling.cpp | 54 +++-- src/autoschedulers/anderson2021/Tiling.h | 2 +- 
src/autoschedulers/anderson2021/Weights.cpp | 162 +++++++++---- .../anderson2021/autotune_loop.sh | 5 +- .../anderson2021/cost_model_generator.cpp | 39 ++-- .../anderson2021/generate_data.sh | 2 +- .../anderson2021/retrain_cost_model.cpp | 29 ++- src/autoschedulers/anderson2021/test.cpp | 46 ++-- .../anderson2021/test/bounds.cpp | 31 ++- .../anderson2021/test/state.cpp | 3 +- .../anderson2021/test/storage_strides.cpp | 37 ++- .../anderson2021/test/thread_info.cpp | 49 ++-- .../anderson2021/test_perfect_hash_map.cpp | 2 +- 32 files changed, 709 insertions(+), 473 deletions(-) diff --git a/apps/cuda_mat_mul/mat_mul_generator.cpp b/apps/cuda_mat_mul/mat_mul_generator.cpp index 68001ce7a135..6f2cb17c8cd6 100644 --- a/apps/cuda_mat_mul/mat_mul_generator.cpp +++ b/apps/cuda_mat_mul/mat_mul_generator.cpp @@ -34,7 +34,7 @@ class MatMul : public Halide::Generator { Var xi, yi, xio, xii, yii, xo, yo, x_pair, xiio, ty; RVar rxo, rxi; - if (!auto_schedule) { + if (!using_autoscheduler()) { out.bound(x, 0, size) .bound(y, 0, size) .tile(x, y, xi, yi, 64, 16) diff --git a/src/autoschedulers/anderson2021/ASLog.cpp b/src/autoschedulers/anderson2021/ASLog.cpp index 601ceabd3c71..51ccd9cf72c1 100644 --- a/src/autoschedulers/anderson2021/ASLog.cpp +++ b/src/autoschedulers/anderson2021/ASLog.cpp @@ -26,7 +26,9 @@ std::string get_env_variable(char const *env_var_name) { return lvl; #else char *lvl = getenv(env_var_name); - if (lvl) return std::string(lvl); + if (lvl) { + return std::string(lvl); + } #endif return ""; diff --git a/src/autoschedulers/anderson2021/AutoSchedule.cpp b/src/autoschedulers/anderson2021/AutoSchedule.cpp index 69b63ad1cdfb..a80c7222f81f 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.cpp +++ b/src/autoschedulers/anderson2021/AutoSchedule.cpp @@ -32,7 +32,7 @@ Needs to be converted to a sample file with the runtime using featurization_to_sample before it can be used to train. HL_MACHINE_PARAMS - An architecture description string. 
Used by Halide master to configure the cost model. We only use the first term. Set it to the number of cores to target. + An architecture description string. Used by Halide master to configure the cost model. We only use the first term. Set it to the number of SMs on the target GPU. HL_PERMIT_FAILED_UNROLL Set to 1 to tell Halide not to freak out if we try to unroll a loop that doesn't have a constant extent. Should generally not be necessary, but sometimes the autoscheduler's model for what will and will not turn into a constant during lowering is inaccurate, because Halide isn't perfect at constant-folding. @@ -89,9 +89,11 @@ #include "Featurization.h" #include "FunctionDAG.h" #include "Halide.h" +#include "HalidePlugin.h" #include "LoopNest.h" #include "LoopNestParser.h" #include "NetworkSize.h" +#include "ParamParser.h" #include "PerfectHashMap.h" #include "State.h" @@ -104,9 +106,11 @@ namespace Halide { namespace Internal { namespace Autoscheduler { -using std::map; -using std::pair; -using std::set; +struct Anderson2021Params { + /* Maximum level of parallelism available i.e. number of SMs on target GPU */ + int parallelism = 80; +}; + using std::string; using std::vector; @@ -125,7 +129,9 @@ double get_dropout_threshold() { // training data. bool random_dropout(std::mt19937 &rng, size_t num_decisions) { static double random_dropout_threshold = std::max(0.0, get_dropout_threshold()); - if (random_dropout_threshold >= 100) return false; + if (random_dropout_threshold >= 100) { + return false; + } // The random dropout threshold is the chance that we operate // entirely greedily and never discard anything. @@ -150,14 +156,14 @@ std::string get_search_space_options() { // Configure a cost model to process a specific pipeline. 
void configure_pipeline_features(const FunctionDAG &dag, - const MachineParams ¶ms, + int hardware_parallelism, CostModel *cost_model) { cost_model->reset(); - cost_model->set_pipeline_features(dag, params); + cost_model->set_pipeline_features(dag, hardware_parallelism); } AutoSchedule::AutoSchedule(const FunctionDAG &dag, - const MachineParams ¶ms, + int hardware_parallelism, const Target &target, const std::vector &outputs, std::mt19937 &rng, @@ -165,8 +171,8 @@ AutoSchedule::AutoSchedule(const FunctionDAG &dag, Statistics &stats, SearchSpace &search_space, const LoopNestParser *partial_schedule) - : dag{dag}, params{params}, target{target}, outputs{outputs}, rng{rng}, cost_model{cost_model}, stats{stats}, search_space{search_space}, partial_schedule{partial_schedule} { - configure_pipeline_features(dag, params, cost_model); + : dag{dag}, hardware_parallelism{hardware_parallelism}, target{target}, outputs{outputs}, rng{rng}, cost_model{cost_model}, stats{stats}, search_space{search_space}, partial_schedule{partial_schedule} { + configure_pipeline_features(dag, hardware_parallelism, cost_model); } // A single pass of coarse-to-fine beam search. @@ -312,7 +318,9 @@ IntrusivePtr AutoSchedule::optimal_schedule_pass(int beam_size, permitted_hashes.insert(h1); s = s->parent.get(); } - if (pending.empty()) break; + if (pending.empty()) { + break; + } state = pending.pop(); blessed++; } @@ -499,12 +507,13 @@ IntrusivePtr AutoSchedule::optimal_schedule(int beam_size) { // The main entrypoint to generate a schedule for a pipeline. 
void generate_schedule(const std::vector &outputs, const Target &target, - const MachineParams ¶ms, + int hardware_parallelism, AutoSchedulerResults *auto_scheduler_results) { internal_assert(target.has_gpu_feature()) << "Specified target (" << target.to_string() << ") does not support GPU"; Timer timer; aslog(0) << "generate_schedule for target=" << target.to_string() << "\n"; + aslog(0) << "hardware_parallelism = " << hardware_parallelism << "\n"; // Start a timer HALIDE_TIC; @@ -512,7 +521,7 @@ void generate_schedule(const std::vector &outputs, // Get the seed for random dropout string seed_str = get_env_variable("HL_SEED"); // Or use the time, if not set. - int seed = (int)time(NULL); + int seed = (int)time(nullptr); if (!seed_str.empty()) { seed = atoi(seed_str.c_str()); } @@ -534,7 +543,7 @@ void generate_schedule(const std::vector &outputs, bool randomize_weights = randomize_weights_str == "1"; // Analyse the Halide algorithm and construct our abstract representation of it - FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); if (aslog::aslog_level() > 0) { dag.dump(); } @@ -560,9 +569,9 @@ void generate_schedule(const std::vector &outputs, } std::mt19937 rng{(uint32_t)seed}; - SearchSpace search_space{dag, params, target, get_search_space_options(), rng, cost_model.get(), stats, partial_schedule.get()}; + SearchSpace search_space{dag, hardware_parallelism, target, get_search_space_options(), rng, cost_model.get(), stats, partial_schedule.get()}; - AutoSchedule autoschedule{dag, params, target, outputs, rng, cost_model.get(), stats, search_space, partial_schedule.get()}; + AutoSchedule autoschedule{dag, hardware_parallelism, target, outputs, rng, cost_model.get(), stats, search_space, partial_schedule.get()}; // Run beam search optimal = autoschedule.optimal_schedule(beam_size); @@ -573,10 +582,10 @@ void generate_schedule(const std::vector &outputs, aslog(1) << "** Optimal schedule:\n"; // Just to get the debugging prints to fire - 
optimal->calculate_cost(dag, params, target, cost_model.get(), stats, aslog::aslog_level() > 0); + optimal->calculate_cost(dag, hardware_parallelism, target, cost_model.get(), stats, aslog::aslog_level() > 0); // Apply the schedules to the pipeline - optimal->apply_schedule(dag, params, target); + optimal->apply_schedule(dag, hardware_parallelism, target); // Print out the schedule if (aslog::aslog_level() > 0) { @@ -604,17 +613,19 @@ void generate_schedule(const std::vector &outputs, if (!feature_file.empty()) { user_warning << "HL_FEATURE_FILE is deprecated; use the featurization output from Generator instead\n"; std::ofstream binfile(feature_file, std::ios::binary | std::ios_base::trunc); - optimal->save_featurization(dag, params, target, binfile); + optimal->save_featurization(dag, hardware_parallelism, target, binfile); binfile.close(); internal_assert(!binfile.fail()) << "Failed to write " << feature_file; } if (auto_scheduler_results) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto_scheduler_results->scheduler_name = "Anderson2021"; +#endif auto_scheduler_results->schedule_source = optimal->schedule_source; { std::ostringstream out; - optimal->save_featurization(dag, params, target, out); + optimal->save_featurization(dag, hardware_parallelism, target, out); auto_scheduler_results->featurization.resize(out.str().size()); memcpy(auto_scheduler_results->featurization.data(), out.str().data(), out.str().size()); } @@ -648,25 +659,58 @@ void generate_schedule(const std::vector &outputs, // Halide uses a plugin architecture for registering custom // autoschedulers. We register our autoscheduler using a static // constructor. 
-struct RegisterAutoscheduler { - RegisterAutoscheduler() { - aslog(1) << "Registering autoscheduler 'Anderson2021'...\n"; - Pipeline::add_autoscheduler("Anderson2021", *this); +//struct RegisterAutoscheduler { + //RegisterAutoscheduler() { + //aslog(1) << "Registering autoscheduler 'Anderson2021'...\n"; + //Pipeline::add_autoscheduler("Anderson2021", *this); + //} + + //void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms, AutoSchedulerResults *results) { + //std::vector outputs; + //for (const Func& f : p.outputs()) { + //outputs.push_back(f.function()); + //} + //Autoscheduler::generate_schedule(outputs, target, params.parallelism, results); + //} +//} register_auto_scheduler; + +struct Anderson2021 { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms_in, AutoSchedulerResults *results) { + std::vector outputs; + for (const Func &f : p.outputs()) { + outputs.push_back(f.function()); + } + Anderson2021Params params; + params.parallelism = params_in.parallelism; + Autoscheduler::generate_schedule(outputs, target, params.parallelism, results); } +#else + void operator()(const Pipeline &p, const Target &target, const AutoschedulerParams ¶ms_in, AutoSchedulerResults *results) { + internal_assert(params_in.name == "Anderson2021"); - void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms, AutoSchedulerResults *results) { std::vector outputs; - for (Func f : p.outputs()) { + for (const Func &f : p.outputs()) { outputs.push_back(f.function()); } - Autoscheduler::generate_schedule(outputs, target, params, results); + Anderson2021Params params; + { + ParamParser parser(params_in.extra); + parser.parse("parallelism", ¶ms.parallelism); + parser.finish(); + } + Autoscheduler::generate_schedule(outputs, target, params.parallelism, results); + results->autoscheduler_params = params_in; } -} register_auto_scheduler; +#endif +}; + 
+REGISTER_AUTOSCHEDULER(Anderson2021) // An alternative entrypoint for other uses void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, - const MachineParams ¶ms, + int hardware_parallelism, const Target &target, CostModel *cost_model, int beam_size, @@ -685,16 +729,16 @@ void find_and_apply_schedule(FunctionDAG &dag, aslog(0) << "\n"; } - SearchSpace search_space{dag, params, target, get_env_variable("HL_SEARCH_SPACE_OPTIONS"), rng, cost_model, stats, partial_schedule.get()}; - AutoSchedule autoschedule{dag, params, target, outputs, rng, cost_model, stats, search_space, partial_schedule.get()}; + SearchSpace search_space{dag, hardware_parallelism, target, get_env_variable("HL_SEARCH_SPACE_OPTIONS"), rng, cost_model, stats, partial_schedule.get()}; + AutoSchedule autoschedule{dag, hardware_parallelism, target, outputs, rng, cost_model, stats, search_space, partial_schedule.get()}; IntrusivePtr optimal = autoschedule.optimal_schedule(beam_size); // Apply the schedules - optimal->apply_schedule(dag, params, target); + optimal->apply_schedule(dag, hardware_parallelism, target); if (schedule_features) { - optimal->compute_featurization(dag, params, target, schedule_features, stats); + optimal->compute_featurization(dag, hardware_parallelism, target, schedule_features, stats); } } diff --git a/src/autoschedulers/anderson2021/AutoSchedule.h b/src/autoschedulers/anderson2021/AutoSchedule.h index fff5d96563a3..be0fd865ccb9 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.h +++ b/src/autoschedulers/anderson2021/AutoSchedule.h @@ -18,10 +18,14 @@ namespace Autoscheduler { struct ProgressBar { void set(double progress) { - if (!draw_progress_bar) return; + if (!draw_progress_bar) { + return; + } counter++; const int bits = 11; - if (counter & ((1 << bits) - 1)) return; + if (counter & ((1 << bits) - 1)) { + return; + } const int pos = (int)(progress * 78); aslog(0) << '['; for (int j = 0; j < 78; j++) { @@ -59,7 +63,7 @@ typedef PerfectHashMap 
StageMapOfSch struct AutoSchedule { const FunctionDAG &dag; - const MachineParams ¶ms; + int hardware_parallelism; const Target ⌖ const std::vector &outputs; std::mt19937 &rng; @@ -69,7 +73,7 @@ struct AutoSchedule { const LoopNestParser *partial_schedule; AutoSchedule(const FunctionDAG &dag, - const MachineParams ¶ms, + int hardware_parallelism, const Target &target, const std::vector &outputs, std::mt19937 &rng, @@ -92,7 +96,7 @@ struct AutoSchedule { IntrusivePtr optimal_schedule(int beam_size); }; -void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, const MachineParams ¶ms, const Target &target, CostModel *cost_model, int beam_size, StageMapOfScheduleFeatures *schedule_features); +void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, int hardware_parallelism, const Target &target, CostModel *cost_model, int beam_size, StageMapOfScheduleFeatures *schedule_features); } // namespace Autoscheduler } // namespace Internal diff --git a/src/autoschedulers/anderson2021/CostModel.h b/src/autoschedulers/anderson2021/CostModel.h index 85d19caeb36f..db85072d9171 100644 --- a/src/autoschedulers/anderson2021/CostModel.h +++ b/src/autoschedulers/anderson2021/CostModel.h @@ -22,7 +22,7 @@ class CostModel { // Configure the cost model for the algorithm to be scheduled. virtual void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, - const MachineParams ¶ms) = 0; + int hardware_parallelism) = 0; // Enqueue a schedule to be evaluated. Will annotate the value located at cost_ptr when the evaluation takes place. // Note that the dag argument should correspond to the dag specified previously when calling set_pipeline_features. 
diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.cpp b/src/autoschedulers/anderson2021/DefaultCostModel.cpp index 8f407b8130f2..b453c4789055 100644 --- a/src/autoschedulers/anderson2021/DefaultCostModel.cpp +++ b/src/autoschedulers/anderson2021/DefaultCostModel.cpp @@ -44,7 +44,7 @@ bool ends_with(const std::string &str, const std::string &suffix) { } // namespace void DefaultCostModel::set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, - const MachineParams ¶ms) { + int hardware_parallelism) { const int pipeline_feat_size = head1_w * head1_h; // We ignore the first seven pipeline features in the cost @@ -74,8 +74,8 @@ void DefaultCostModel::set_pipeline_features(const Internal::Autoscheduler::Func } internal_assert(stage == num_stages); pipeline_feat_queue = pipeline_features; - internal_assert(params.parallelism > 0); - num_cores = params.parallelism; + internal_assert(hardware_parallelism > 0); + num_cores = hardware_parallelism; } void DefaultCostModel::set_pipeline_features(const Runtime::Buffer &pipeline_feats, int n) { diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.h b/src/autoschedulers/anderson2021/DefaultCostModel.h index 68fe0f6f4e1c..d5ff40fc5ccf 100644 --- a/src/autoschedulers/anderson2021/DefaultCostModel.h +++ b/src/autoschedulers/anderson2021/DefaultCostModel.h @@ -39,11 +39,11 @@ class DefaultCostModel : public CostModel { stats{stats} { load_weights(); } - virtual ~DefaultCostModel() = default; + ~DefaultCostModel() override = default; // Configure the cost model for the algorithm to be scheduled. void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, - const MachineParams ¶ms) override; + int hardware_parallelism) override; void set_pipeline_features(const Runtime::Buffer &, int n); // Enqueue a schedule to be evaluated. 
The second version of this method returns a buffer of diff --git a/src/autoschedulers/anderson2021/Featurization.h b/src/autoschedulers/anderson2021/Featurization.h index 780cf0eff197..66cb476bfc88 100644 --- a/src/autoschedulers/anderson2021/Featurization.h +++ b/src/autoschedulers/anderson2021/Featurization.h @@ -1,9 +1,9 @@ #ifndef FEATURIZATION_H #define FEATURIZATION_H +#include #include #include -#include #include "ASLog.h" @@ -104,7 +104,9 @@ struct PipelineFeatures { for (int i = 0; i < (int)ScalarType::NumScalarTypes; i++) { const char *type_names[] = {"Bool", "UInt8", "UInt16", "UInt32", "UInt64", "Float", "Double"}; // Skip printing for types not used - if (!types_in_use[i]) continue; + if (!types_in_use[i]) { + continue; + } os << " Featurization for type " << type_names[i] << "\n" << " Op histogram:\n" diff --git a/src/autoschedulers/anderson2021/FunctionDAG.cpp b/src/autoschedulers/anderson2021/FunctionDAG.cpp index be811f622376..b2e4f0211ee7 100644 --- a/src/autoschedulers/anderson2021/FunctionDAG.cpp +++ b/src/autoschedulers/anderson2021/FunctionDAG.cpp @@ -1,5 +1,7 @@ #include "FunctionDAG.h" +#include + #include "ASLog.h" namespace Halide { @@ -353,7 +355,7 @@ BoundContents::Layout::~Layout() { for (auto *b : pool) { b->~BoundContents(); } - for (auto b : blocks) { + for (auto *b : blocks) { free(b); } } @@ -506,7 +508,9 @@ bool FunctionDAG::Edge::all_load_jacobian_coeffs_exist() const { void FunctionDAG::Edge::add_load_jacobian(LoadJacobian j1) { for (auto &j2 : load_jacobians) { - if (j2.merge(j1)) return; + if (j2.merge(j1)) { + return; + } } load_jacobians.emplace_back(std::move(j1)); } @@ -557,7 +561,7 @@ void FunctionDAG::Edge::expand_footprint(const Span *consumer_loop, Span *produc } } -FunctionDAG::FunctionDAG(const vector &outputs, const MachineParams ¶ms, const Target &target) { +FunctionDAG::FunctionDAG(const vector &outputs, const Target &target) { map env = build_environment(outputs); // A mutator to apply parameter estimates to the 
expressions @@ -710,7 +714,9 @@ FunctionDAG::FunctionDAG(const vector &outputs, const MachineParams &p for (size_t i = 0; i < sched.dims().size(); i++) { const auto &d = sched.dims()[i]; // Skip synthetic loops like "__outermost" - if (!stage_scope_with_symbolic_rvar_bounds.contains(d.var)) continue; + if (!stage_scope_with_symbolic_rvar_bounds.contains(d.var)) { + continue; + } Node::Loop l; l.var = d.var; @@ -818,7 +824,7 @@ FunctionDAG::FunctionDAG(const vector &outputs, const MachineParams &p int leaves = 0; Type narrowest_type; map calls; - CheckTypes(Function f) + CheckTypes(const Function &f) : func(f) { } }; @@ -858,7 +864,7 @@ FunctionDAG::FunctionDAG(const vector &outputs, const MachineParams &p if (node.is_output) { // Get the bounds estimate map estimates; - for (auto b : consumer.schedule().estimates()) { + for (const auto &b : consumer.schedule().estimates()) { int64_t i_min = *as_const_int(b.min); int64_t i_extent = *as_const_int(b.extent); @@ -881,7 +887,7 @@ FunctionDAG::FunctionDAG(const vector &outputs, const MachineParams &p estimates[b.var] = Span(i_min, i_min + i_extent - 1, false); } } - for (auto b : consumer.schedule().bounds()) { + for (const auto &b : consumer.schedule().bounds()) { const int64_t *i_min = as_const_int(b.min); const int64_t *i_extent = as_const_int(b.extent); if (i_min && i_extent) { @@ -952,7 +958,7 @@ FunctionDAG::FunctionDAG(const vector &outputs, const MachineParams &p // Initialize the memory layouts for the bounds structs for (auto &n : nodes) { - n.bounds_memory_layout.reset(new BoundContents::Layout); + n.bounds_memory_layout = std::make_unique(); auto &l = *(n.bounds_memory_layout); l.computed_offset = n.func.dimensions(); l.total_size = l.computed_offset + n.func.dimensions(); @@ -975,9 +981,9 @@ FunctionDAG::FunctionDAG(const vector &outputs, const MachineParams &p } } - for (size_t i = 0; i < edges.size(); i++) { - edges[i].producer->outgoing_edges.push_back(&(edges[i])); - 
edges[i].consumer->incoming_edges.push_back(&(edges[i])); + for (auto &edge : edges) { + edge.producer->outgoing_edges.push_back(&edge); + edge.consumer->incoming_edges.push_back(&edge); } // Compute transitive dependencies @@ -1092,6 +1098,10 @@ std::ostream &FunctionDAG::dump(std::ostream &os) const { return os; } +int ExprBranching::visit(const Reinterpret *op) { + return Super::dispatch(op->value); +} + int ExprBranching::visit(const IntImm *op) { return 1; } @@ -1212,7 +1222,7 @@ int ExprBranching::visit(const Load *op) { int ExprBranching::visit_nary(const std::vector &exprs) { int total_branching = 0; - for (Expr e : exprs) { + for (const Expr &e : exprs) { int branching = Super::dispatch(e); if (branching == 0) { continue; @@ -1255,7 +1265,7 @@ int ExprBranching::compute(const Function &f) { std::vector values; values.reserve(def.values().size()); - for (auto v : def.values()) { + for (const auto &v : def.values()) { values.push_back(common_subexpression_elimination(simplify(v))); // Get things into canonical form } @@ -1263,7 +1273,7 @@ int ExprBranching::compute(const Function &f) { std::vector args; args.reserve(def.args().size()); - for (auto v : def.args()) { + for (const auto &v : def.args()) { args.push_back(common_subexpression_elimination(simplify(v))); // Get things into canonical form } @@ -1274,7 +1284,9 @@ void sanitize_names(std::string &str) { bool in_quotes = false; for (auto &c : str) { in_quotes ^= (c == '"'); - if (!in_quotes && c == '$') c = '_'; + if (!in_quotes && c == '$') { + c = '_'; + } } } diff --git a/src/autoschedulers/anderson2021/FunctionDAG.h b/src/autoschedulers/anderson2021/FunctionDAG.h index ec884b525d3f..7b0b60905d7a 100644 --- a/src/autoschedulers/anderson2021/FunctionDAG.h +++ b/src/autoschedulers/anderson2021/FunctionDAG.h @@ -6,9 +6,10 @@ #define FUNCTION_DAG_H #include +#include #include -#include #include +#include #include #include "Errors.h" @@ -61,14 +62,20 @@ struct OptionalRational { } OptionalRational 
operator*(int64_t factor) const { - if ((*this) == 0) return *this; + if ((*this) == 0) { + return *this; + } int64_t num = numerator * factor; return OptionalRational{num, denominator}; } OptionalRational operator*(const OptionalRational &other) const { - if ((*this) == 0) return *this; - if (other == 0) return other; + if ((*this) == 0) { + return *this; + } + if (other == 0) { + return other; + } int64_t num = numerator * other.numerator; int64_t den = denominator * other.denominator; return OptionalRational{num, den}; @@ -99,12 +106,16 @@ struct OptionalRational { } bool operator>(int x) const { - if (!exists()) return false; + if (!exists()) { + return false; + } return !((*this) <= x); } bool operator>=(int x) const { - if (!exists()) return false; + if (!exists()) { + return false; + } return !((*this) < x); } @@ -183,9 +194,13 @@ class LoadJacobian { // Try to merge another LoadJacobian into this one, increasing the // count if the coefficients match. bool merge(const LoadJacobian &other) { - if (other.rows != rows || other.cols != cols) return false; + if (other.rows != rows || other.cols != cols) { + return false; + } for (size_t i = 0; i < rows * cols; i++) { - if (!(other.coeffs[i] == coeffs[i])) return false; + if (!(other.coeffs[i] == coeffs[i])) { + return false; + } } c += other.count(); return true; @@ -318,7 +333,7 @@ struct BoundContents { } BoundContents *make_copy() const { - auto b = layout->make(); + auto *b = layout->make(); size_t bytes = sizeof(data()[0]) * layout->total_size; memcpy(b->data(), data(), bytes); return b; @@ -493,7 +508,7 @@ struct FunctionDAG { }; Stage(Halide::Stage s) - : stage(s) { + : stage(std::move(s)) { } int get_loop_index_from_var(const std::string &var) const { @@ -598,19 +613,19 @@ struct FunctionDAG { // Create the function DAG, and do all the dependency and cost // analysis. This is done once up-front before the tree search. 
- FunctionDAG(const vector &outputs, const MachineParams ¶ms, const Target &target); + FunctionDAG(const vector &outputs, const Target &target); void dump() const; std::ostream &dump(std::ostream &os) const; -private: - // Compute the featurization for the entire DAG - void featurize(); - // This class uses a lot of internal pointers, so we'll hide the copy constructor. FunctionDAG(const FunctionDAG &other) = delete; void operator=(const FunctionDAG &other) = delete; +private: + // Compute the featurization for the entire DAG + void featurize(); + template void dump_internal(OS &os) const; }; @@ -625,6 +640,7 @@ class ExprBranching : public VariadicVisitor { const NodeMap &inlined; public: + int visit(const Reinterpret *op); int visit(const IntImm *op); int visit(const UIntImm *op); int visit(const FloatImm *op); diff --git a/src/autoschedulers/anderson2021/LoopNest.cpp b/src/autoschedulers/anderson2021/LoopNest.cpp index e9abc8e02a1c..d486c34ba0d6 100644 --- a/src/autoschedulers/anderson2021/LoopNest.cpp +++ b/src/autoschedulers/anderson2021/LoopNest.cpp @@ -115,7 +115,7 @@ bool all(const vector &v) { vector LoopNest::get_union_thread_counts(const FunctionDAG::Node *f) const { vector max_size{1, 1, 1}; // find the loop nests we just created and get max gpu_thread extents of other children - for (auto &c : children) { + for (const auto &c : children) { if (c->node != f) { if (c->gpu_label == thread) { vector lowered_size; @@ -127,7 +127,7 @@ vector LoopNest::get_union_thread_counts(const FunctionDAG::Node *f) co max_size[dim] = std::max(max_size[dim], lowered_size[dim]); } } - } else if (c->children.size() > 0) { // descend into children for thread blocks in serial loops + } else if (!c->children.empty()) { // descend into children for thread blocks in serial loops vector child_max_sizes = c->get_union_thread_counts(f); for (int dim = 0; dim < (int)(child_max_sizes.size()); dim++) { if (dim >= (int)(max_size.size())) { @@ -151,7 +151,7 @@ void 
LoopNest::get_stage_sizes(const FunctionDAG::Node *f, stage_sizes.resize(f->stages.size()); pure_dims.resize(f->stages.size()); vectorized_indices.resize(f->stages.size()); - for (auto &c : children) { + for (const auto &c : children) { if (c->node == f && f->dimensions > 0) { vectorized_indices[c->stage->index] = c->vectorized_loop_index; stage_sizes[c->stage->index] = c->size; @@ -182,16 +182,15 @@ void LoopNest::generate_vec_dim_serial_tilings(vector &serial_sizes) const // the newly inserted loop nests of f into a threads loop outside a serial loop. // V is the vectorized dimension of f. Adds loopnests created from each tiling option in result. bool LoopNest::add_gpu_thread_tilings(const FunctionDAG::Node *f, - const MachineParams ¶ms, const Target &target, int v, vector> &result, - vector max_size) { + const vector &max_size) { vector> stage_sizes; vector> pure_dims; vector vectorized_indices; this->get_stage_sizes(f, stage_sizes, pure_dims, vectorized_indices); - internal_assert(stage_sizes.size() != 0); + internal_assert(!stage_sizes.empty()); auto tilings = generate_gpu_tilings(stage_sizes, pure_dims, max_size, (int)(stage_sizes[0].size() - 1), vectorized_indices, true, false); bool made_child = false; for (const auto &t : tilings) { @@ -199,7 +198,7 @@ bool LoopNest::add_gpu_thread_tilings(const FunctionDAG::Node *f, new_parent->copy_from(*(this)); for (auto &c : new_parent->children) { if (c->node == f) { - c = c->parallelize_in_tiles(params, t, new_parent, target, false, false); + c = c->parallelize_in_tiles(t, new_parent, target, false, false); } } result.emplace_back(new_parent); @@ -207,8 +206,9 @@ bool LoopNest::add_gpu_thread_tilings(const FunctionDAG::Node *f, } if (!made_child) { // if we can't tile into gpu threads the inserted node, make it serial for (auto &c : children) { - if (c->node == f) + if (c->node == f) { c->gpu_label = serial; + } } } return made_child; @@ -253,7 +253,9 @@ void LoopNest::copy_from_including_features(const LoopNest &n) 
{ // used as the hash function for the coarse-to-fine beam search in // the paper. void LoopNest::structural_hash(uint64_t &h, int depth) const { - if (depth < 0) return; + if (depth < 0) { + return; + } // Which Funcs are store_at this level? for (const auto *n : store_at) { @@ -426,7 +428,7 @@ void LoopNest::get_sites(const Target &target, bool in_block = task != nullptr; bool in_thread = current_thread_loop != nullptr; - for (auto f : store_at) { + for (const auto *f : store_at) { auto store_gpu_memory_type = get_gpu_memory_type(in_block, in_thread); for (const auto &s : f->stages) { @@ -470,7 +472,7 @@ bool LoopNest::promote_allocs_to_registers(const Target &target, StageMap return false; } - for (auto &stage : node.first->stages) { + for (const auto &stage : node.first->stages) { internal_assert(sites.get(&stage).gpu_store_memory_type == GPUMemoryType::local); sites.get(&stage).gpu_store_memory_type = GPUMemoryType::registers; } @@ -489,12 +491,12 @@ bool LoopNest::exceeds_serial_extents_limit(const Target &target, const LoopNest if (gpu_label == serial && stage->index == 0) { int64_t serial_loop_extents = 1; - for (size_t i = 0; i < stage->loop.size(); i++) { - if (!stage->loop[i].pure) { + for (const auto &i : stage->loop) { + if (!i.pure) { continue; } - serial_loop_extents *= size[stage->loop[i].pure_dim]; + serial_loop_extents *= size[i.pure_dim]; } if (parent_of_innermost) { @@ -607,9 +609,9 @@ double LoopNest::storage_stride(const LoadJacobian &jac, int innermost_storage_d std::vector storage_strides; int64_t storage_stride = 1; - for (std::size_t i = 0; i < storage_dims.size(); i++) { + for (long storage_dim : storage_dims) { storage_strides.push_back(storage_stride); - storage_stride *= store_bounds->region_required(storage_dims[i]).extent(); + storage_stride *= store_bounds->region_required(storage_dim).extent(); } int v = get_vectorized_loop_index_from_pure_stage(root); @@ -757,12 +759,12 @@ Strides LoopNest::compute_strides(const LoadJacobian &jac, 
int innermost_storage if (verbose) { aslog(2) << "Storage stride: "; } - for (std::size_t i = 0; i < storage_dims.size(); i++) { + for (long storage_dim : storage_dims) { storage_strides.push_back(storage_stride); if (verbose) { aslog(2) << storage_stride << " "; } - storage_stride *= store_bounds->region_required(storage_dims[i]).extent(); + storage_stride *= store_bounds->region_required(storage_dim).extent(); } if (verbose) { aslog(2) << "\n"; @@ -1183,7 +1185,7 @@ bool LoopNest::all_paths_to_leaves_have_thread_loop() const { return true; } - if (children.size() == 0) { + if (children.empty()) { return false; } @@ -1225,7 +1227,7 @@ void LoopNest::compute_warp_features(ScheduleFeatures &features, const GPULoopIn } // Assume that when a block is active, all its warps are active -void LoopNest::compute_warp_and_block_occupancy(const MachineParams ¶ms, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const { +void LoopNest::compute_warp_and_block_occupancy(int hardware_parallelism, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const { // Only compute these features for stage's that actually have a block // loop if (node != gpu_loop_info.current_block_loop->node) { @@ -1237,7 +1239,7 @@ void LoopNest::compute_warp_and_block_occupancy(const MachineParams ¶ms, Sch int64_t num_warps_per_block = gpu_loop_info.thread_info->num_warps_per_block; - int64_t num_blocks = std::ceil(gpu_loop_info.num_blocks / (double)params.parallelism); + int64_t num_blocks = std::ceil(gpu_loop_info.num_blocks / (double)hardware_parallelism); auto max_theoretical_active_blocks = std::min(active_block_hardware_limit, num_blocks); auto max_active_warps = std::min(active_warp_hardware_limit, max_theoretical_active_blocks * num_warps_per_block); @@ -1295,7 +1297,7 @@ std::pair LoopNest::find_innermost_and_paren return {child, parent}; } -int64_t LoopNest::points_accessed_per_thread(const MachineParams ¶ms, const Target &target, const GPULoopInfo &gpu_loop_info, const 
std::vector &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose) const { +int64_t LoopNest::points_accessed_per_thread(const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose) const { std::unique_ptr innermost_parent_clone = std::make_unique(); innermost_parent_clone->copy_from(*parent); @@ -1359,7 +1361,7 @@ int64_t LoopNest::points_accessed_per_thread(const MachineParams ¶ms, const } } - IntrusivePtr innermost_parent = innermost_parent_clone->parallelize_in_tiles(params, tiling, grandparent, target, true, false, false, rvars_to_move_inward); + IntrusivePtr innermost_parent = innermost_parent_clone->parallelize_in_tiles(tiling, grandparent, target, true, false, false, rvars_to_move_inward); const auto &bounds = innermost_parent->get_bounds_along_edge_chain(producer, edge_chain); int64_t num_points = 1; @@ -1512,14 +1514,16 @@ vector> LoopNest::collect_producers(const StageMap &sites) while (!pending.empty()) { const auto *e = pending.back(); pending.pop_back(); - if (done.count(e->producer)) continue; + if (done.count(e->producer)) { + continue; + } done.insert(e->producer); const auto &site = sites.get(&(e->producer->stages[0])); if (site.store->is_root()) { int vector_dim = (e->producer->is_input ? 0 : site.produce != nullptr ? 
site.produce->vector_dim : -1); - producers.push_back({e->producer->id, vector_dim}); + producers.emplace_back(e->producer->id, vector_dim); } else if (site.produce != nullptr) { // Computation must be nested inside this task or inlined into it. for (const auto &s : e->producer->stages) { @@ -1587,7 +1591,7 @@ void LoopNest::compute_working_set_from_features(int64_t *working_set, } for (const auto *node : store_at) { - auto &feat = features->get(&(node->stages[0])); + const auto &feat = features->get(&(node->stages[0])); working_set_here += feat.bytes_at_production; } @@ -1647,7 +1651,7 @@ std::pair LoopNest::compute_alloc_size_of_node_here(const Functio // Do a recursive walk over the loop nest computing features to feed the cost model. void LoopNest::compute_features(const FunctionDAG &dag, - const MachineParams ¶ms, + int hardware_parallelism, const Target &target, const StageMap &sites, int64_t instances, @@ -1682,8 +1686,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, size_t i = size[idx]; loop_instances *= i; if (stage->loop[idx].pure && !in_impure) { - if (params.parallelism > 1 && - (parallel || (parent->is_root() && parallel_tasks < params.parallelism))) { + if (hardware_parallelism > 1 && + (parallel || (parent->is_root() && parallel_tasks < hardware_parallelism))) { // Either we've picked our parallel tiling, or // it's not yet determined. Assume we'll not split // any loops and just stop after we hit the @@ -1692,9 +1696,9 @@ void LoopNest::compute_features(const FunctionDAG &dag, // If we haven't picked out parallel tiling yet, // assume that we'll target 8*cores when we do, // which is a common rule of thumb. 
- if (!parallel && parallel_tasks > params.parallelism * 8) { + if (!parallel && parallel_tasks > hardware_parallelism * 8) { // We would split this loop - parallel_tasks = params.parallelism * 8; + parallel_tasks = hardware_parallelism * 8; } } } else if (i != 1) { @@ -1746,7 +1750,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, feat.innermost_bytes_at_realization = node->bytes_per_point * innermost_storage_extent; if (!is_root()) { - auto site = sites.get(&(node->stages[0])); + const auto &site = sites.get(&(node->stages[0])); if (site.is_stored_in_global_mem()) { feat.global_bytes_at_task = feat.bytes_at_realization; feat.global_innermost_bytes_at_task = feat.innermost_bytes_at_realization; @@ -1776,7 +1780,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, const auto &entry = c->features.at(hash_of_producers); for (auto it = entry.begin(); it != entry.end(); it++) { - auto &stage = *(it.key()); + const auto &stage = *(it.key()); const auto &feat = it.value(); features->insert(&stage, feat); @@ -1795,7 +1799,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, ++stats.num_memoization_misses; } - c->compute_features(dag, params, target, sites, subinstances, parallelism, this, parent, root, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, stats, verbose); + c->compute_features(dag, hardware_parallelism, target, sites, subinstances, parallelism, this, parent, root, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, stats, verbose); if (use_memoized_features) { c->features[hash_of_producers].make_large(dag.nodes[0].stages[0].max_id); @@ -1835,7 +1839,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, feat.working_set_at_root = working_set_here; - auto *p = sites.get(stage).produce; + const auto *p = 
sites.get(stage).produce; if (p) { // Extent of the innermost dimension in the storage layout int64_t innermost_storage_extent = 1; @@ -1951,7 +1955,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, } else { // How this loop will be parallelized is not yet // determined. Use optimistic values for the features. - bytes_at_task = (feat.bytes_at_realization + params.parallelism - 1) / params.parallelism; + bytes_at_task = (feat.bytes_at_realization + hardware_parallelism - 1) / hardware_parallelism; innermost_bytes_at_task = std::min(bytes_at_task, feat.innermost_bytes_at_realization); } @@ -1984,7 +1988,9 @@ void LoopNest::compute_features(const FunctionDAG &dag, while (!pending.empty()) { const auto *e = pending.back(); pending.pop_back(); - if (done.count(e->producer)) continue; + if (done.count(e->producer)) { + continue; + } done.insert(e->producer); const auto &site = sites.get(&(e->producer->stages[0])); if (site.store->is_root()) { @@ -2045,7 +2051,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, // Recurse inwards for (const auto &c : children) { - c->compute_features(dag, params, target, sites, subinstances, subparallelism, this, parent, root, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, stats, verbose); + c->compute_features(dag, hardware_parallelism, target, sites, subinstances, subparallelism, this, parent, root, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, stats, verbose); } for (const auto *node : store_at) { auto &feat = features->get(&(node->stages[0])); @@ -2160,7 +2166,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, auto p_pair = pending.back(); pending.pop_back(); - auto p = p_pair.first; + const auto *p = p_pair.first; const auto &next_edges = p->incoming_edges; for (const auto 
*e : next_edges) { @@ -2176,7 +2182,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, if (innermost) { if (e->consumer == stage) { - for (auto &j : e->load_jacobians) { + for (const auto &j : e->load_jacobians) { jacobians.emplace_back(j, e->producer); // Thread loops may not be innermost so in the @@ -2189,7 +2195,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, decltype(jacobians) new_jacobians; for (auto &j1 : jacobians) { if (e->consumer->node == j1.second) { - for (auto &j2 : e->load_jacobians) { + for (const auto &j2 : e->load_jacobians) { LoadJacobian j = j2 * j1.first; new_jacobians.emplace_back(j, e->producer); } @@ -2203,7 +2209,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, decltype(jacobians) new_thread_jacobians; for (auto &j1 : thread_jacobians) { if (e->consumer->node == j1.second) { - for (auto &j2 : e->load_jacobians) { + for (const auto &j2 : e->load_jacobians) { LoadJacobian j = j2 * j1.first; new_thread_jacobians.emplace_back(j, e->producer); } @@ -2268,7 +2274,9 @@ void LoopNest::compute_features(const FunctionDAG &dag, const auto &jac = thread_jacobians[i]; const auto &serial_jac = jacobians[i]; internal_assert(jac.second == serial_jac.second); - if (jac.second != e->producer) continue; + if (jac.second != e->producer) { + continue; + } int64_t n = jac.first.count(); if (is_shared_mem) { @@ -2280,7 +2288,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, aslog(2) << "BEGIN MEM ACCESS shared_mem_load. 
consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::shared, verbose); + int64_t points_accessed = points_accessed_per_thread(target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::shared, verbose); compute_mem_load_features( jac.first, @@ -2311,7 +2319,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, aslog(2) << "BEGIN MEM ACCESS global_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::global, verbose); + int64_t points_accessed = points_accessed_per_thread(target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::global, verbose); compute_mem_load_features( jac.first, @@ -2338,7 +2346,9 @@ void LoopNest::compute_features(const FunctionDAG &dag, if (site.gpu_store_memory_type == GPUMemoryType::local) { internal_assert(false) << "Loop nest contains local_mem_load"; for (const auto &jac : jacobians) { - if (jac.second != e->producer) continue; + if (jac.second != e->producer) { + continue; + } int64_t n = jac.first.count(); if (verbose) { @@ -2349,7 +2359,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, aslog(2) << "BEGIN MEM ACCESS local_mem_load. 
consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::local, verbose); + int64_t points_accessed = points_accessed_per_thread(target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::local, verbose); compute_mem_load_features( jac.first, @@ -2659,7 +2669,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, if (innermost && !is_scalar()) { compute_warp_features(feat, gpu_loop_info); - compute_warp_and_block_occupancy(params, feat, gpu_loop_info); + compute_warp_and_block_occupancy(hardware_parallelism, feat, gpu_loop_info); } } @@ -2678,8 +2688,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, // required of 'g' should be 1 point for each point of 'out' but get_bounds() // will also include the edge 'g' -> 'f' and give the result 201 points for every point // of 'out') -const Bound LoopNest::get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector &edge_chain) const { - internal_assert(edge_chain.size() >= 1); +Bound LoopNest::get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector &edge_chain) const { + internal_assert(!edge_chain.empty()); internal_assert(edge_chain[0]->consumer == stage) << "get_bounds_along_edge_chain must be called with an edge chain that begins from the current loop nest's node. 
But the given edge chain begins with " << edge_chain[0]->consumer->node->func.name() @@ -2729,7 +2739,7 @@ const Bound LoopNest::get_bounds_along_edge_chain(const FunctionDAG::Node *f, co producer->loop_nest_for_region(i, &(bound->region_computed(0)), &(bound->loops(i, 0))); } - bounds.push_back(bound); + bounds.emplace_back(bound); cur_consumer_bounds = bound; } @@ -2746,7 +2756,7 @@ const Bound &LoopNest::get_bounds(const FunctionDAG::Node *f) const { // b->validate(); return b; } - auto bound = f->make_bound(); + auto *bound = f->make_bound(); // Compute the region required if (f->is_output && is_root()) { @@ -2869,7 +2879,7 @@ void LoopNest::dump(T &stream, string prefix, const LoopNest *parent) const { } else { stream << "\n"; } - for (auto p : store_at) { + for (const auto *p : store_at) { stream << prefix << "realize: " << p->func.name() << " ["; for (int i = 0; i < p->dimensions; i++) { if (i > 0) { @@ -2898,7 +2908,9 @@ template void LoopNest::dump(std::ostringstream &stream, string prefix, const Lo // Does this loop nest access the given Func bool LoopNest::calls(const FunctionDAG::Node *f) const { for (const auto &c : children) { - if (c->calls(f)) return true; + if (c->calls(f)) { + return true; + } } for (const auto *e : f->outgoing_edges) { if (e->consumer == stage) { @@ -2934,24 +2946,36 @@ int64_t LoopNest::max_inlined_calls() const { // out the bounds so that it won't fault. 
bool LoopNest::accesses_input_buffer() const { for (const auto &c : children) { - if (c->accesses_input_buffer()) return true; + if (c->accesses_input_buffer()) { + return true; + } + } + if (is_root()) { + return false; } - if (is_root()) return false; auto check = [&](const FunctionDAG::Node::Stage *s) { for (const auto *e : s->incoming_edges) { - if (e->producer->is_input) return true; + if (e->producer->is_input) { + return true; + } } for (int t = 0; t < (int)PipelineFeatures::ScalarType::NumScalarTypes; t++) { - if (s->features.op_histogram[(int)PipelineFeatures::OpType::ImageCall][t] > 0) return true; + if (s->features.op_histogram[(int)PipelineFeatures::OpType::ImageCall][t] > 0) { + return true; + } } return false; }; - if (check(stage)) return true; + if (check(stage)) { + return true; + } for (auto it = inlined.begin(); it != inlined.end(); it++) { - if (check(&(it.key()->stages[0]))) return true; + if (check(&(it.key()->stages[0]))) { + return true; + } } return false; } @@ -2965,7 +2989,9 @@ bool LoopNest::computes(const FunctionDAG::Node *f) const { return true; } for (const auto &c : children) { - if (c->computes(f)) return true; + if (c->computes(f)) { + return true; + } } return false; } @@ -2976,12 +3002,12 @@ bool LoopNest::computes(const FunctionDAG::Node *f) const { // Inline a Func into all consumers within this loop. void LoopNest::inline_func(const FunctionDAG::Node *f) { // Inline it into the children - for (size_t i = 0; i < children.size(); i++) { - if (children[i]->calls(f)) { + for (auto &i : children) { + if (i->calls(f)) { std::unique_ptr new_child{new LoopNest}; - new_child->copy_from(*children[i]); + new_child->copy_from(*i); new_child->inline_func(f); - children[i] = new_child.release(); + i = new_child.release(); } } @@ -3044,7 +3070,7 @@ bool LoopNest::compute_here(const FunctionDAG::Node *f, // Set up a bound for the inside of the // loop. 
computed/required is still the full region, but // the loop nest will be a single representative point. - auto single_point = bounds->make_copy(); + auto *single_point = bounds->make_copy(); size_t loop_dim = f->stages[s].loop.size(); node->size.resize(loop_dim); @@ -3085,7 +3111,7 @@ bool LoopNest::compute_here(const FunctionDAG::Node *f, } // Leave region required blank inside the computation of a Func - node->set_bounds(f, std::move(single_point)); + node->set_bounds(f, single_point); node->vector_dim = v; if (s == 0) { @@ -3104,7 +3130,7 @@ bool LoopNest::compute_here(const FunctionDAG::Node *f, one_vector->size.resize(loop_dim, 1); one_vector->innermost = true; one_vector->gpu_label = simd; - auto b = node->get_bounds(f)->make_copy(); + auto *b = node->get_bounds(f)->make_copy(); // Set the region computed inside this node to be the first vector lane if (node->vectorized_loop_index >= 0) { b->loops(s, node->vectorized_loop_index).set_extent(1); @@ -3128,8 +3154,7 @@ bool LoopNest::compute_here(const FunctionDAG::Node *f, } // Parallelize this loop according to the given tiling. 
-IntrusivePtr LoopNest::parallelize_in_tiles(const MachineParams ¶ms, - const vector &tiling, +IntrusivePtr LoopNest::parallelize_in_tiles(const vector &tiling, const LoopNest *parent, const Target &target, bool inner_tiling, @@ -3170,8 +3195,9 @@ IntrusivePtr LoopNest::parallelize_in_tiles(const MachineParams outer->size = size; outer->innermost = false; - if (!target.has_gpu_feature()) + if (!target.has_gpu_feature()) { outer->parallel = true; + } outer->tileable = may_subtile(); @@ -3183,10 +3209,10 @@ IntrusivePtr LoopNest::parallelize_in_tiles(const MachineParams inner->bounds = bounds; inner->store_at = store_at; - auto b = inner->get_bounds(node)->make_copy(); + auto *b = inner->get_bounds(node)->make_copy(); // Then move factors from the outer loop to the inner loop - auto parent_bounds = parent->get_bounds(node); + const auto &parent_bounds = parent->get_bounds(node); for (size_t i = 0; i < stage->loop.size(); i++) { int l = stage->loop[i].pure_dim; @@ -3315,7 +3341,7 @@ bool LoopNest::region_computed_shrinks(const FunctionDAG::Node *f, const LoopNes // loop marked gpu_threads, in which case f's loops cannot be gpu_threads vector> LoopNest::compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, - const MachineParams ¶ms, + int hardware_parallelism, const Target &target, const SearchSpaceOptions &search_space_options, int v, @@ -3338,11 +3364,15 @@ vector> LoopNest::compute_in_tiles(const FunctionDA const auto &p_parent = bounds_at_parent->region_computed(v); int64_t e = p.extent(); int64_t ep = p_parent.extent(); - if (ep >= f->vector_size && e < f->vector_size) return result; + if (ep >= f->vector_size && e < f->vector_size) { + return result; + } // Don't descend into loops if the bounds required don't // shrink. 
- if (!region_computed_shrinks(f, parent)) return result; + if (!region_computed_shrinks(f, parent)) { + return result; + } } // Figure out which child we can fuse this into @@ -3386,7 +3416,7 @@ vector> LoopNest::compute_in_tiles(const FunctionDA // if GPU and creating a threads loop INSIDE a block loop, create child for each thread tiling if (!is_root() && !in_threads_loop && target.has_gpu_feature()) { - bool made_child = r->add_gpu_thread_tilings(f, params, target, v, result, union_counts); + bool made_child = r->add_gpu_thread_tilings(f, target, v, result, union_counts); if (!made_child) { // no good thread tilings, just keep r with the untiled loop inserted as serial result.emplace_back(r.release()); } @@ -3407,20 +3437,19 @@ vector> LoopNest::compute_in_tiles(const FunctionDA const auto &c = children[child]; int num_ones = 0; - for (size_t i = 0; i < c->size.size(); i++) { - int64_t s = c->size[i]; + for (long s : c->size) { num_ones += (s == 1) ? 1 : 0; } for (int store_here = 0; store_here < 1; store_here++) { - if (is_root() && num_ones == (int)c->size.size() && params.parallelism > 1) { + if (is_root() && num_ones == (int)c->size.size() && hardware_parallelism > 1) { // Don't fuse into serial loops, or we could never parallelize this Func. continue; } in_threads_loop |= (children[child]->gpu_label == thread); // we must pass down union thread count constraints computed at block level when computing further in - auto opts = children[child]->compute_in_tiles(f, this, params, target, search_space_options, v, store_here, in_threads_loop, false, union_counts); + auto opts = children[child]->compute_in_tiles(f, this, hardware_parallelism, target, search_space_options, v, store_here, in_threads_loop, false, union_counts); for (IntrusivePtr &n : opts) { // (Only valid if one child calls f) Push the // computation into the child. 
Possibly leaving @@ -3538,11 +3567,11 @@ bool LoopNest::producer_computed_here_or_further_in(const FunctionDAG::Node *pro void LoopNest::get_stages_computed_in_each_compute_root_loop(StageMap> &descendants, const LoopNest *compute_root_loop_nest) const { if (is_root()) { - for (auto &c : children) { + for (const auto &c : children) { descendants.emplace(c->stage, {}); } - for (auto &c : children) { + for (const auto &c : children) { c->get_stages_computed_in_each_compute_root_loop(descendants, c.get()); } @@ -3551,7 +3580,7 @@ void LoopNest::get_stages_computed_in_each_compute_root_loop(StageMapstage).emplace(stage, true); - for (auto &c : children) { + for (const auto &c : children) { c->get_stages_computed_in_each_compute_root_loop(descendants, compute_root_loop_nest); } } @@ -3567,7 +3596,7 @@ void LoopNest::apply(LoopLevel here, std::vector &ancestors, const NodeMap &all_inlined) const { if (is_root()) { - for (auto &c : children) { + for (const auto &c : children) { Func(c->node->func).compute_root(); c->apply(LoopLevel::root(), state_map, num_cores, 1, this, c.get(), target, ancestors, all_inlined); if (c->stage->index == 0) { @@ -3674,8 +3703,9 @@ void LoopNest::apply(LoopLevel here, if (vectorized_loop_index >= 0) { size_t i = 0; - while (!state.vars[i].innermost_pure_dim) + while (!state.vars[i].innermost_pure_dim) { i++; + } auto &v = state.vars[i]; internal_assert(v.innermost_pure_dim && v.exists) << v.var.name() << "\n"; // Is the result of a split @@ -3809,7 +3839,9 @@ void LoopNest::apply(LoopLevel here, bool found = false; for (const auto &v : state.vars) { - if (!v.exists) continue; + if (!v.exists) { + continue; + } here = LoopLevel(node->func, v.var); found = true; break; @@ -3827,7 +3859,7 @@ void LoopNest::apply(LoopLevel here, return; } - for (auto f : store_at) { + for (const auto *f : store_at) { Func(f->func).store_at(here); } for (auto s : size) { @@ -3841,7 +3873,7 @@ void LoopNest::apply(LoopLevel here, loop_level = "_at(" + here.func() + 
", " + here.var().name() + ")"; } - for (auto &c : children) { + for (const auto &c : children) { if (c->node != node) { Func(c->node->func).compute_at(here); } @@ -3858,9 +3890,9 @@ void LoopNest::apply(LoopLevel here, update_producers_to_be_staged(state, all_inlined); } - for (auto f : store_at) { + for (const auto *f : store_at) { bool computed_here = false; - for (auto &c : children) { + for (const auto &c : children) { if (c->node == f) { computed_here = true; break; @@ -3884,7 +3916,7 @@ void LoopNest::update_producers_to_be_staged(StageScheduleState &state, const No auto cur_pair = pending.back(); pending.pop_back(); - auto *s = cur_pair.first; + const auto *s = cur_pair.first; for (const auto *e : s->incoming_edges) { std::vector edge_chain = cur_pair.second; diff --git a/src/autoschedulers/anderson2021/LoopNest.h b/src/autoschedulers/anderson2021/LoopNest.h index d5fc9070e6c5..9d668f51bfa5 100644 --- a/src/autoschedulers/anderson2021/LoopNest.h +++ b/src/autoschedulers/anderson2021/LoopNest.h @@ -149,7 +149,7 @@ struct LoopNest { } bool is_scalar() const { - return size.size() == 0; + return size.empty(); } // given a newly inserted node f into this LoopNest, get union of thread counts in each dimension @@ -172,11 +172,10 @@ struct LoopNest { // the newly inserted loop nests of f into a threads loop outside a serial loop. // V is the vectorized dimension of f. Adds loopnests created from each tiling option in result. 
bool add_gpu_thread_tilings(const FunctionDAG::Node *f, - const MachineParams ¶ms, const Target &target, int v, vector> &result, - vector max_size); + const vector &max_size); void copy_from(const LoopNest &n); void copy_from_including_features(const LoopNest &n); @@ -319,13 +318,13 @@ struct LoopNest { void compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const; // Assume that when a block is active, all its warps are active - void compute_warp_and_block_occupancy(const MachineParams ¶ms, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const; + void compute_warp_and_block_occupancy(int parallelism, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const; void compute_shared_mem_occupancy(const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const; std::pair find_innermost_and_parent() const; - int64_t points_accessed_per_thread(const MachineParams ¶ms, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const; + int64_t points_accessed_per_thread(const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const; int64_t compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const; @@ -348,7 +347,7 @@ struct LoopNest { // Do a recursive walk over the loop nest computing features to feed the cost model. 
void compute_features(const FunctionDAG &dag, - const MachineParams ¶ms, + int hardware_parallelism, const Target &target, const StageMap &sites, int64_t instances, @@ -386,7 +385,7 @@ struct LoopNest { // consumers along the given edge chain), from which we know what region // would be computed if it were scheduled here and what its loop nest // would be. - const Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector &edge_chain) const; + Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector &edge_chain) const; void dump() const; @@ -430,8 +429,7 @@ struct LoopNest { const Target &target); // Parallelize this loop according to the given tiling. - IntrusivePtr parallelize_in_tiles(const MachineParams ¶ms, - const vector &tiling, + IntrusivePtr parallelize_in_tiles(const vector &tiling, const LoopNest *parent, const Target &target, bool inner_tiling, @@ -453,7 +451,7 @@ struct LoopNest { // loop marked gpu_threads, in which case f's loops cannot be gpu_threads vector> compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, - const MachineParams ¶ms, + int hardware_parallelism, const Target &target, const SearchSpaceOptions &search_space_options, int v, diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index 0e54bccbabca..b1d36a9fa136 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -57,7 +57,7 @@ $(BIN)/auto_schedule_runtime.a: $(BIN)/cost_model.generator $(BIN)/cost_model/%.a: $(BIN)/cost_model.generator @mkdir -p $(@D) - $^ -g $* -o $(BIN)/cost_model -f $* target=$(HL_TARGET)-no_runtime auto_schedule=false enable_debug_output=$(ENABLE_DEBUG_OUTPUT) -e stmt,static_library,h,assembly + $^ -g $* -o $(BIN)/cost_model -f $* target=$(HL_TARGET)-no_runtime enable_debug_output=$(ENABLE_DEBUG_OUTPUT) -e stmt,static_library,h,assembly # It's important to use dynamic lookups for undefined symbols here: all of libHalide # 
is expected to be present (in the loading binary), so we explicitly make the symbols @@ -137,7 +137,7 @@ $(GENERATOR_BIN)/demo.generator: demo_generator.cpp $(GENERATOR_DEPS) $(BIN)/%/demo.a: $(GENERATOR_BIN)/demo.generator $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) @mkdir -p $(@D) HL_WEIGHTS_DIR=$(SRC)/baseline.weights \ - $(GENERATOR_BIN)/demo.generator -g demo -o $(@D) -f demo target=$* auto_schedule=true -p $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) -s Anderson2021 + $(GENERATOR_BIN)/demo.generator -g demo -o $(@D) -f demo target=$* -p $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) autoscheduler=Anderson2021 $(BIN)/%/demo.rungen: $(BIN)/%/RunGenMain.o $(BIN)/%/demo.registration.cpp $(BIN)/%/demo.a @mkdir -p $(@D) @@ -265,7 +265,7 @@ $(GENERATOR_BIN)/included_schedule_file_none.generator: included_schedule_file_g $(BIN)/%/included_schedule_file.schedule.h: $(GENERATOR_BIN)/included_schedule_file_none.generator $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) @mkdir -p $(@D) HL_WEIGHTS_DIR=$(SRC)/baseline.weights \ - $< -g included_schedule_file -o $(@D) -f included_schedule_file target=$* auto_schedule=true -p $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) -s Anderson2021 -e schedule + $< -g included_schedule_file -o $(@D) -f included_schedule_file target=$* -p $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) autoscheduler=Anderson2021 -e schedule # Note that this depends on included_schedule_file.schedule.h rather than $(BIN)/%/included_schedule_file.schedule.h -- # the former should be generated by something like diff --git a/src/autoschedulers/anderson2021/PerfectHashMap.h b/src/autoschedulers/anderson2021/PerfectHashMap.h index c6315a7009e8..44fb25682b48 100644 --- a/src/autoschedulers/anderson2021/PerfectHashMap.h +++ b/src/autoschedulers/anderson2021/PerfectHashMap.h @@ -121,7 +121,9 @@ class PerfectHashMap { int find_index_small(const K *n) const { int i; for (i = 0; i < (int)occupied; i++) { - if (storage_bucket(i).first == n) 
return i; + if (storage_bucket(i).first == n) { + return i; + } } return i; } @@ -173,7 +175,9 @@ class PerfectHashMap { // Methods when the map is in the large state T &emplace_large(const K *n, T &&t) { auto &p = storage_bucket(n->id); - if (!p.first) occupied++; + if (!p.first) { + occupied++; + } p.first = n; p.second = std::move(t); return p.second; @@ -380,11 +384,15 @@ class PerfectHashMap { }; iterator begin() { - if (state == Empty) return end(); + if (state == Empty) { + return end(); + } iterator it; it.iter = storage.data(); it.end = it.iter + storage.size(); - if (it.key() == nullptr) it++; + if (it.key() == nullptr) { + it++; + } phm_assert(it.iter == it.end || it.key()); return it; } @@ -396,11 +404,15 @@ class PerfectHashMap { } const_iterator begin() const { - if (storage.empty()) return end(); + if (storage.empty()) { + return end(); + } const_iterator it; it.iter = storage.data(); it.end = it.iter + storage.size(); - if (it.key() == nullptr) it++; + if (it.key() == nullptr) { + it++; + } phm_assert(it.iter == it.end || it.key()); return it; } diff --git a/src/autoschedulers/anderson2021/SearchSpace.cpp b/src/autoschedulers/anderson2021/SearchSpace.cpp index 5e82955af630..a0018a55c98c 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.cpp +++ b/src/autoschedulers/anderson2021/SearchSpace.cpp @@ -1,6 +1,5 @@ #include "SearchSpace.h" -using std::set; using std::vector; namespace Halide { @@ -13,14 +12,14 @@ bool use_randomized_tilings() { } SearchSpace::SearchSpace(const FunctionDAG &dag, - const MachineParams ¶ms, + int hardware_parallelism, const Target &target, const std::string &search_space_options, std::mt19937 &rng, CostModel *cost_model, Statistics &stats, const LoopNestParser *partial_schedule) - : dag{dag}, params{params}, target{target}, search_space_options{search_space_options}, rng{rng}, cost_model{cost_model}, stats{stats}, randomize_tilings{use_randomized_tilings()}, partial_schedule{partial_schedule} { + : dag{dag}, 
hardware_parallelism{hardware_parallelism}, target{target}, search_space_options{search_space_options}, rng{rng}, cost_model{cost_model}, stats{stats}, randomize_tilings{use_randomized_tilings()}, partial_schedule{partial_schedule} { memoized_compute_root_blocks.make_large(dag.nodes.size()); } @@ -42,14 +41,14 @@ void SearchSpace::memoize_blocks(const FunctionDAG::Node *node, LoopNest *new_ro for (auto &c : new_root->children) { if (c->node == node) { LoopNest *new_block = new LoopNest; - new_block->copy_from_including_features(*c.get()); - blocks.push_back(new_block); + new_block->copy_from_including_features(*c); + blocks.emplace_back(new_block); ++stats.num_block_memoization_misses; } } } -bool SearchSpace::add_states_from_memoized_blocks(IntrusivePtr state, +bool SearchSpace::add_states_from_memoized_blocks(const IntrusivePtr &state, std::function &&)> &accept_child, const FunctionDAG::Node *node, int &num_children) const { @@ -93,7 +92,7 @@ bool SearchSpace::add_states_from_memoized_blocks(IntrusivePtr state, new_root->children[block_index++] = new_block; } - if (child->calculate_cost(dag, params, target, cost_model, stats)) { + if (child->calculate_cost(dag, hardware_parallelism, target, cost_model, stats)) { num_children++; accept_child(std::move(child)); ++stats.num_block_memoization_hits; @@ -103,14 +102,13 @@ bool SearchSpace::add_states_from_memoized_blocks(IntrusivePtr state, return true; } -vector SearchSpace::filter_parallel_tile_options(IntrusivePtr state, +vector SearchSpace::filter_parallel_tile_options(const IntrusivePtr &state, const FunctionDAG::Node *node, vector> &inner_tilings, const vector &pure_size) const { vector options; vector insufficient_parallelism; - for (size_t i = 0; i < inner_tilings.size(); i++) { - auto &t = inner_tilings[i]; + for (auto &t : inner_tilings) { SearchSpace::ParallelTileOption o; o.inner_tiling = t; @@ -127,7 +125,7 @@ vector SearchSpace::filter_parallel_tile_option if (c->node == node) { int64_t total = 1; 
int64_t max_available = 1; - for (auto &l : c->stage->loop) { + for (const auto &l : c->stage->loop) { if (!l.rvar) { total *= o.outer_tiling[l.pure_dim]; max_available *= c->size[l.pure_dim]; @@ -136,18 +134,18 @@ vector SearchSpace::filter_parallel_tile_option max_total = std::max(max_total, total); // If a stage does not have enough parallelism regardless of the - // tiling (i.e. its size is < params.parallelism * 2 before + // tiling (i.e. its size is < hardware_parallelism * 2 before // splitting), then the only tiling worth considering is the // one that retains the full extent in this dimension // (outer_tiling == size). In that case, skip over updating // min_total, otherwise it will be filtered out below - if (max_available >= params.parallelism * 2 || total != max_available) { + if (max_available >= hardware_parallelism * 2 || total != max_available) { if (min_total != 0) { min_total = std::min(min_total, total); } else { min_total = total; } - const double tasks_per_core = ((double)total) / params.parallelism; + const double tasks_per_core = ((double)total) / hardware_parallelism; o.idle_core_wastage = std::max(o.idle_core_wastage, std::ceil(tasks_per_core) / tasks_per_core); @@ -160,8 +158,8 @@ vector SearchSpace::filter_parallel_tile_option // Filter out the less useful options bool ok = - (min_total >= params.parallelism * 2 && - (max_total <= params.parallelism * 16 || target.has_gpu_feature())); + (min_total >= hardware_parallelism * 2 && + (max_total <= hardware_parallelism * 16 || target.has_gpu_feature())); if (!ok) { insufficient_parallelism.emplace_back(std::move(o)); @@ -171,7 +169,7 @@ vector SearchSpace::filter_parallel_tile_option options.emplace_back(std::move(o)); } - int64_t parallelism_limit = params.parallelism; + int64_t parallelism_limit = hardware_parallelism; while (options.empty()) { for (auto &o : insufficient_parallelism) { if (o.min_parallelism >= parallelism_limit) { @@ -221,7 +219,7 @@ void 
SearchSpace::process_pending_states(std::unordered_mapcalculate_cost(dag, params, target, cost_model, stats)) { + if (entry.second[i]->calculate_cost(dag, hardware_parallelism, target, cost_model, stats)) { num_children++; accept_child(std::move(entry.second[i])); accepted++; @@ -235,10 +233,10 @@ void SearchSpace::process_pending_states(std::unordered_mapcalculate_cost(dag, params, target, cost_model, stats)) { + for (auto &state : entry.second) { + if (state->calculate_cost(dag, hardware_parallelism, target, cost_model, stats)) { num_children++; - accept_child(std::move(entry.second[i])); + accept_child(std::move(state)); stats.num_tilings_accepted++; break; } @@ -246,7 +244,7 @@ void SearchSpace::process_pending_states(std::unordered_map state, +void SearchSpace::generate_children(const IntrusivePtr &state, std::function &&)> &accept_child, int pass_idx, bool is_pre_pass) { @@ -352,9 +350,9 @@ void SearchSpace::generate_children(IntrusivePtr state, must_inline &= (e->consumer->node->is_pointwise || e->consumer->node->is_boundary_condition); } - if (must_inline) { + return; - } + } if (must_compute_root) { @@ -363,7 +361,7 @@ void SearchSpace::generate_children(IntrusivePtr state, const auto &nodes = compute_root_nodes.get(node); for (const auto &n : nodes) { const auto *compute_root_loop = deep_copy_loop_nest(n.get(), NoOpMutator{}); - new_root->children.push_back(compute_root_loop); + new_root->children.emplace_back(compute_root_loop); } new_root->store_at.insert(node); @@ -400,7 +398,7 @@ void SearchSpace::generate_children(IntrusivePtr state, std::unordered_map secondary_options; for (int vector_dim : vector_dims) { Timer timer; - auto tile_options = root->compute_in_tiles(node, nullptr, params, target, search_space_options, vector_dim, false, false, is_pre_pass); + auto tile_options = root->compute_in_tiles(node, nullptr, hardware_parallelism, target, search_space_options, vector_dim, false, false, is_pre_pass); stats.compute_in_tiles_time += 
timer.elapsed(); timer.restart(); @@ -424,7 +422,7 @@ void SearchSpace::generate_children(IntrusivePtr state, } auto child = state->make_child(); - child->root = std::move(o.loop_nest); + child->root = o.loop_nest; child->num_decisions_made++; uint64_t h = child->structural_hash(pass_idx); @@ -446,8 +444,8 @@ void SearchSpace::generate_children(IntrusivePtr state, bool should_parallelize = false; IntrusivePtr pure_stage; - if (params.parallelism > 1) { - for (auto &c : root->children) { + if (hardware_parallelism > 1) { + for (const auto &c : root->children) { if (c->node == node && node->dimensions > 0) { if (c->stage->index == 0) { pure_stage = c; @@ -476,7 +474,7 @@ void SearchSpace::generate_children(IntrusivePtr state, // step 1) convert (none, SIMD) loops to (parallel, serial, SIMD) loops with specialized serial sizes auto parallel_tilings = generate_compute_root_serial_tilings(pure_stage, node); - internal_assert(parallel_tilings.size() > 0) << " zero parallel tilings\n"; + internal_assert(!parallel_tilings.empty()) << " zero parallel tilings\n"; std::unordered_map>> primary_options; std::unordered_map>> secondary_options; @@ -487,7 +485,7 @@ void SearchSpace::generate_children(IntrusivePtr state, // step 1) parallelize all loop nests for this node into (parallel, serial) with given serial tiles for (auto &c : parallel_root.children) { if (c->node == node) { - c = c->parallelize_in_tiles(params, parallel_t, ¶llel_root, target, false, true); + c = c->parallelize_in_tiles(parallel_t, ¶llel_root, target, false, true); } } @@ -509,7 +507,7 @@ void SearchSpace::generate_children(IntrusivePtr state, for (auto &c : new_root->children) { if (c->node == node) { vector tiling((int)(c->size.size()), 1); - c = c->parallelize_in_tiles(params, tiling, new_root, target, false, true); + c = c->parallelize_in_tiles(tiling, new_root, target, false, true); } } if (add_child(state, new_root, accept_child)) { @@ -540,7 +538,7 @@ void SearchSpace::generate_children(IntrusivePtr 
state, for (auto &c : new_root->children) { if (c->node == node) { - c = c->parallelize_in_tiles(params, o.inner_tiling, new_root, target, true, false); + c = c->parallelize_in_tiles(o.inner_tiling, new_root, target, true, false); } } @@ -553,7 +551,7 @@ void SearchSpace::generate_children(IntrusivePtr state, } auto child = state->make_child(); - child->root = std::move(new_root); + child->root = new_root; child->num_decisions_made++; uint64_t h = child->structural_hash(pass_idx); @@ -586,7 +584,7 @@ struct ClearInlinedMutator { } }; -void SearchSpace::freeze_lowest_cost_stages(const IntrusivePtr best) { +void SearchSpace::freeze_lowest_cost_stages(const IntrusivePtr &best) { std::vector> node_ids_and_costs; NodeMap node_costs; size_t num_nodes = 0; @@ -608,7 +606,7 @@ void SearchSpace::freeze_lowest_cost_stages(const IntrusivePtr best) { } for (auto it = node_costs.begin(); it != node_costs.end(); it++) { - node_ids_and_costs.push_back({it.key()->id, it.value()}); + node_ids_and_costs.emplace_back(it.key()->id, it.value()); } for (const auto &n : node_ids_and_costs) { @@ -633,8 +631,8 @@ void SearchSpace::freeze_lowest_cost_stages(const IntrusivePtr best) { for (const auto &c : best->root->children) { if (nodes_to_freeze.contains(c->node)) { - auto new_loop_nest = deep_copy_loop_nest(c, mutator); - compute_root_nodes.get_or_create(c->node).push_back(new_loop_nest); + auto *new_loop_nest = deep_copy_loop_nest(c, mutator); + compute_root_nodes.get_or_create(c->node).emplace_back(new_loop_nest); std::cerr << "Freezing as compute_root: " << c->node->func.name() << "\n"; } } @@ -657,9 +655,9 @@ bool SearchSpace::add_child(const IntrusivePtr &state, const IntrusivePtr &new_root, std::function &&)> &accept_child) const { auto child = state->make_child(); - child->root = std::move(new_root); + child->root = new_root; child->num_decisions_made++; - if (child->calculate_cost(dag, params, target, cost_model, stats)) { + if (child->calculate_cost(dag, hardware_parallelism, 
target, cost_model, stats)) { accept_child(std::move(child)); return true; } diff --git a/src/autoschedulers/anderson2021/SearchSpace.h b/src/autoschedulers/anderson2021/SearchSpace.h index 49205a8e90c4..8688d0a90598 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.h +++ b/src/autoschedulers/anderson2021/SearchSpace.h @@ -23,7 +23,7 @@ namespace Autoscheduler { struct SearchSpace { using StateVector = std::vector>; const FunctionDAG &dag; - const MachineParams ¶ms; + int hardware_parallelism; const Target ⌖ SearchSpaceOptions search_space_options; std::mt19937 &rng; @@ -37,7 +37,7 @@ struct SearchSpace { NodeMap>>> memoized_compute_root_blocks; SearchSpace(const FunctionDAG &dag, - const MachineParams ¶ms, + int hardware_parallelism, const Target &target, const std::string &search_space_options, std::mt19937 &rng, @@ -64,7 +64,7 @@ struct SearchSpace { ParallelTileOption &operator=(const ParallelTileOption &) = delete; }; - vector filter_parallel_tile_options(IntrusivePtr state, + vector filter_parallel_tile_options(const IntrusivePtr &state, const FunctionDAG::Node *node, vector> &inner_tilings, const vector &pure_size) const; @@ -73,18 +73,18 @@ struct SearchSpace { void memoize_blocks(const FunctionDAG::Node *node, LoopNest *new_root); - bool add_states_from_memoized_blocks(IntrusivePtr state, + bool add_states_from_memoized_blocks(const IntrusivePtr &state, std::function &&)> &accept_child, const FunctionDAG::Node *node, int &num_children) const; // Generate successor states for given 'state' - void generate_children(IntrusivePtr state, + void generate_children(const IntrusivePtr &state, std::function &&)> &accept_child, int pass_idx, bool is_pre_pass); - void freeze_lowest_cost_stages(const IntrusivePtr best); + void freeze_lowest_cost_stages(const IntrusivePtr &best); vector> generate_compute_root_serial_tilings(const IntrusivePtr &pure_stage, const FunctionDAG::Node *node) const; diff --git a/src/autoschedulers/anderson2021/State.cpp 
b/src/autoschedulers/anderson2021/State.cpp index eeeb17650cf5..65145c7633cc 100644 --- a/src/autoschedulers/anderson2021/State.cpp +++ b/src/autoschedulers/anderson2021/State.cpp @@ -51,7 +51,7 @@ const LoopNest *State::deepest_valid_compute_location(const mapsize.size(); ++i) { - total *= cur_loop->size[i]; + for (long i : cur_loop->size) { + total *= i; } if (parent.count(cur_loop) == 0) { @@ -136,9 +136,15 @@ int64_t State::total_loop_extents_of_ancestors(const map> &parent, const LoopNest *a, const LoopNest *b) const { - if (a->is_root()) return a; - if (b->is_root()) return b; - if (a == b) return a; + if (a->is_root()) { + return a; + } + if (b->is_root()) { + return b; + } + if (a == b) { + return a; + } // Walk the deeper one up until they're at the same depth auto it_a = parent.find(a); @@ -153,11 +159,13 @@ const LoopNest *State::deepest_common_ancestor(const mapsecond.first; b = it_b->second.first; - if (a == b) return a; + if (a == b) { + return a; + } it_a = parent.find(a); it_b = parent.find(b); internal_assert(it_a != parent.end() && it_b != parent.end()); @@ -215,7 +223,7 @@ void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest *loop_nest int vectorized_loop_index = c->vectorized_loop_index; - if (c->size.size() == 0) { + if (c->size.empty()) { continue; } @@ -227,13 +235,13 @@ void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest *loop_nest vector tiling(c->node->dimensions, 1); // Split into parallelized and serial - c = c->parallelize_in_tiles(params, tiling, loop_nest, target, true, false); + c = c->parallelize_in_tiles(tiling, loop_nest, target, true, false); if (vectorized_loop_index >= 0) { tiling[vectorized_loop_index] = inner_extent; } // Split parallelized into blocks and threads - c = c->parallelize_in_tiles(params, tiling, loop_nest, target, true, false); + c = c->parallelize_in_tiles(tiling, loop_nest, target, true, false); } else { // An update stage may have more or fewer dimensions than // the pure 
stage, but the tiling requires its dimensions to @@ -251,7 +259,7 @@ void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest *loop_nest // For update stages, split into parallelized and serial // (parallelize_in_tiles will move any RVars inwards and // make them serial) - c = c->parallelize_in_tiles(params, tiling, loop_nest, target, false, true); + c = c->parallelize_in_tiles(tiling, loop_nest, target, false, true); // If vectorized_loop_index < 0, then this update stage // likely does not loop over the vectorized loop of the @@ -264,7 +272,7 @@ void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest *loop_nest // Now that the RVars have been moved inwards, we can // split the outer loop into blocks and threads - c = c->parallelize_in_tiles(params, thread_tiling, loop_nest, target, true, false); + c = c->parallelize_in_tiles(thread_tiling, loop_nest, target, true, false); } } } @@ -296,7 +304,7 @@ void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest *loop_nest) // Mark as 'thread' so this loop is split into threads and // serial c->gpu_label = thread; - c = c->parallelize_in_tiles(params, tiling, loop_nest, target, false, true); + c = c->parallelize_in_tiles(tiling, loop_nest, target, false, true); } return; } @@ -336,17 +344,17 @@ void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest *loop_nest) // Mark as 'thread' so this loop is split into threads and // serial c->gpu_label = thread; - c = c->parallelize_in_tiles(params, tiling, loop_nest, target, false, true); + c = c->parallelize_in_tiles(tiling, loop_nest, target, false, true); } } } -IntrusivePtr State::get_root_for_features(const MachineParams ¶ms, const Target &target) const { +IntrusivePtr State::get_root_for_features(int hardware_parallelism, const Target &target) const { if (!has_compute_root_loops_without_blocks() && !has_loop_nest_without_thread_loops()) { return root; } - FeatureLoopNestMutator mutator{params, target}; + FeatureLoopNestMutator 
mutator{hardware_parallelism, target}; // We copy the loop nest in 2 cases: // - If the current loop nest has compute root loops without blocks (it is @@ -355,7 +363,7 @@ IntrusivePtr State::get_root_for_features(const MachineParams &p // thread loop nest, we create a surrounding thread loop nest with // extents 1 (which Halide will do when the schedule is compiled) so // that we can more easily compute features - auto new_root = create_feature_root(mutator); + auto *new_root = create_feature_root(mutator); return new_root; } @@ -391,8 +399,8 @@ void State::set_gpu_store_site(const map *features, Statistics &stats, bool verbose) const { - auto feature_root = get_root_for_features(params, target); +bool State::compute_featurization(const FunctionDAG &dag, int hardware_parallelism, const Target &target, StageMap *features, Statistics &stats, bool verbose) const { + auto feature_root = get_root_for_features(hardware_parallelism, target); StageMap sites; sites.make_large(dag.nodes[0].stages[0].max_id); @@ -434,7 +442,9 @@ bool State::compute_featurization(const FunctionDAG &dag, const MachineParams &p for (const auto *e : n.outgoing_edges) { const auto &consumer_site = sites.get(e->consumer); const LoopNest *l = consumer_site.innermost; - if (!l) l = consumer_site.compute; + if (!l) { + l = consumer_site.compute; + } if (!l) { if (aslog::aslog_level() > 0) { dump(); @@ -480,7 +490,7 @@ bool State::compute_featurization(const FunctionDAG &dag, const MachineParams &p loop = deepest_valid_compute_location(parent, n, loop, feature_root.get(), total_shared_mem_alloc_sizes); int64_t num_realizations = total_loop_extents_of_ancestors(parent, loop); - for (auto &stage : n.stages) { + for (const auto &stage : n.stages) { auto &site = sites.get_or_create(&stage); site.compute = loop; site.store = loop; @@ -496,7 +506,7 @@ bool State::compute_featurization(const FunctionDAG &dag, const MachineParams &p } Timer timer; - feature_root->compute_features(dag, params, target, sites, 1, 
1, nullptr, nullptr, *feature_root, nullptr, nullptr, nullptr, features, {feature_root.get()}, true, total_shared_mem_alloc_sizes, stats, verbose); + feature_root->compute_features(dag, hardware_parallelism, target, sites, 1, 1, nullptr, nullptr, *feature_root, nullptr, nullptr, nullptr, features, {feature_root.get()}, true, total_shared_mem_alloc_sizes, stats, verbose); stats.featurization_time += timer.elapsed(); ++stats.num_featurizations; @@ -512,13 +522,15 @@ bool State::compute_featurization(const FunctionDAG &dag, const MachineParams &p return true; } -void State::save_featurization(const FunctionDAG &dag, const MachineParams ¶ms, const Target &target, std::ostream &out) const { +void State::save_featurization(const FunctionDAG &dag, int hardware_parallelism, const Target &target, std::ostream &out) const { StageMap features; Statistics stats; - compute_featurization(dag, params, target, &features, stats); + compute_featurization(dag, hardware_parallelism, target, &features, stats); for (const auto &n : dag.nodes) { - if (n.is_input) continue; + if (n.is_input) { + continue; + } for (size_t stage_idx = n.stages.size(); stage_idx > 0; stage_idx--) { const auto &s = n.stages[stage_idx - 1]; const size_t num_schedule_features = ScheduleFeatures::num_features(); @@ -542,7 +554,7 @@ void State::save_featurization(const FunctionDAG &dag, const MachineParams ¶ bool State::contains_store_at(const set &outermost_store_at, const IntrusivePtr &parent) const { for (const auto &c : parent->children) { - if (c->store_at.size() > 0) { + if (!c->store_at.empty()) { return true; } @@ -655,7 +667,7 @@ bool State::exceeds_local_memory_limit(const Target &target) const { return false; } -bool State::calculate_cost(const FunctionDAG &dag, const MachineParams ¶ms, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose) { +bool State::calculate_cost(const FunctionDAG &dag, int hardware_parallelism, const Target &target, CostModel *cost_model, Statistics 
&stats, bool verbose) { Timer timer; if (!root->has_valid_thread_extents()) { Filter(root.get()) << "Invalid thread extents\n"; @@ -681,7 +693,7 @@ bool State::calculate_cost(const FunctionDAG &dag, const MachineParams ¶ms, StageMap features; - if (!compute_featurization(dag, params, target, &features, stats, verbose)) { + if (!compute_featurization(dag, hardware_parallelism, target, &features, stats, verbose)) { Filter(root.get()) << "Contains a local allocation that likely cannot be promoted to registers\n"; return false; } @@ -690,7 +702,7 @@ bool State::calculate_cost(const FunctionDAG &dag, const MachineParams ¶ms, if (verbose) { for (auto it = features.begin(); it != features.end(); it++) { - auto &stage = *(it.key()); + const auto &stage = *(it.key()); const auto &feat = it.value(); std::string name = stage.node->func.name(); sanitize_names(name); @@ -803,10 +815,10 @@ void State::fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, c } bool marked = false; - for (size_t block_i = 0; block_i < 3; ++block_i) { - for (size_t i = 1; i < block_var_assignments[block_i].size(); ++i) { - auto inner_i = block_var_assignments[block_i][0]; - auto outer_i = block_var_assignments[block_i][i]; + for (auto & block_var_assignment : block_var_assignments) { + for (size_t i = 1; i < block_var_assignment.size(); ++i) { + auto inner_i = block_var_assignment[0]; + auto outer_i = block_var_assignment[i]; state->schedule_source << "\n .fuse(" << parallel_vars[inner_i].name() << ", " << parallel_vars[outer_i].name() << ", " << parallel_vars[inner_i].name() << ")"; @@ -815,8 +827,8 @@ void State::fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, c parallel_vars[inner_i]); } - if (block_var_assignments[block_i].size() > 0) { - auto inner_i = block_var_assignments[block_i][0]; + if (!block_var_assignment.empty()) { + auto inner_i = block_var_assignment[0]; state->schedule_source << "\n .gpu_blocks(" << parallel_vars[inner_i].name() << ")"; 
stage.gpu_blocks(parallel_vars[inner_i]); state->parallel = true; @@ -845,7 +857,7 @@ void State::mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, c int max_blocks[3] = {2147483647, 65535, 65535}; uint8_t n_loops_tagged_gpu_blocks = 0; - for (auto &v : parallel_vars) { + for (const auto &v : parallel_vars) { if (n_loops_tagged_gpu_blocks >= 3 || parallel_extents[n_loops_tagged_gpu_blocks] > max_blocks[n_loops_tagged_gpu_blocks]) { break; } @@ -969,13 +981,13 @@ bool State::can_fuse_gpu(const vector ¶llel_extents) const { // Apply the schedule represented by this state to a Halide // Pipeline. Also generate source code for the schedule for the // user to copy-paste to freeze this schedule as permanent artifact. -void State::apply_schedule(const FunctionDAG &dag, const MachineParams ¶ms, const Target &target) { +void State::apply_schedule(const FunctionDAG &dag, int hardware_parallelism, const Target &target) { StageMap> state_map; std::vector ancestors; NodeMap all_inlined; root->collect_all_inlined(all_inlined); - root->apply(LoopLevel::root(), state_map, params.parallelism, 0, nullptr, nullptr, target, ancestors, all_inlined); + root->apply(LoopLevel::root(), state_map, hardware_parallelism, 0, nullptr, nullptr, target, ancestors, all_inlined); std::ostringstream src; std::unordered_set new_serial_vars; @@ -1024,7 +1036,8 @@ void State::apply_schedule(const FunctionDAG &dag, const MachineParams ¶ms, } for (auto &p : state_map) { - if (p.first->node->is_input) continue; + if (p.first->node->is_input) { continue; +} Stage stage(p.first->stage); @@ -1036,8 +1049,10 @@ void State::apply_schedule(const FunctionDAG &dag, const MachineParams ¶ms, vector constant_extents; bool any_parallel_vars = false, any_parallel_rvars = false; for (auto it = p.second->vars.rbegin(); it != p.second->vars.rend(); it++) { - if (!it->exists) continue; - if (!it->parallel) break; + if (!it->exists) { continue; +} + if (!it->parallel) { break; +} any_parallel_rvars |= 
it->var.is_rvar; any_parallel_vars |= !it->var.is_rvar; parallel_extents.push_back(it->extent); @@ -1091,7 +1106,7 @@ void State::apply_schedule(const FunctionDAG &dag, const MachineParams ¶ms, } p.second->schedule_source << "\n .reorder_storage("; bool first = true; - for (auto v : storage_vars) { + for (const auto& v : storage_vars) { if (!first) { p.second->schedule_source << ", "; } @@ -1114,7 +1129,8 @@ void State::apply_schedule(const FunctionDAG &dag, const MachineParams ¶ms, continue; } - if (p.first->node->is_input) continue; + if (p.first->node->is_input) { continue; +} Stage stage(p.first->stage); @@ -1167,8 +1183,8 @@ void State::apply_schedule(const FunctionDAG &dag, const MachineParams ¶ms, // If there are store_ats at Var::outermost(), we need to ensure // that those store_ats are retained at the Var::outermost level - vars.push_back(new_outer); - vars.push_back(Var::outermost()); + vars.emplace_back(new_outer); + vars.emplace_back(Var::outermost()); p.second->schedule_source << "\n .reorder("; bool first = true; @@ -1199,7 +1215,8 @@ void State::apply_schedule(const FunctionDAG &dag, const MachineParams ¶ms, } for (auto &p : state_map) { - if (p.first->node->is_input) continue; + if (p.first->node->is_input) { continue; +} // Dump the schedule source string src << p.first->name diff --git a/src/autoschedulers/anderson2021/State.h b/src/autoschedulers/anderson2021/State.h index 88aa69225f1b..f3574507aa3d 100644 --- a/src/autoschedulers/anderson2021/State.h +++ b/src/autoschedulers/anderson2021/State.h @@ -106,7 +106,7 @@ struct State { bool has_compute_root_loops_without_blocks() const; struct FeatureLoopNestMutator { - const MachineParams ¶ms; + int hardware_parallelism; const Target ⌖ void operator()(LoopNest *new_loop_nest) const; @@ -122,13 +122,13 @@ struct State { void add_outer_thread_loops(LoopNest *loop_nest) const; }; - IntrusivePtr get_root_for_features(const MachineParams ¶ms, const Target &target) const; + IntrusivePtr 
get_root_for_features(int hardware_parallelism, const Target &target) const; void set_gpu_store_site(const map> &parent, const LoopNest *loop, LoopNest::Sites &site) const; - bool compute_featurization(const FunctionDAG &dag, const MachineParams ¶ms, const Target &target, StageMap *features, Statistics &stats, bool verbose = false) const; + bool compute_featurization(const FunctionDAG &dag, int hardware_parallelism, const Target &target, StageMap *features, Statistics &stats, bool verbose = false) const; - void save_featurization(const FunctionDAG &dag, const MachineParams ¶ms, const Target &target, std::ostream &out) const; + void save_featurization(const FunctionDAG &dag, int hardware_parallelism, const Target &target, std::ostream &out) const; bool contains_store_at(const set &outermost_store_at, const IntrusivePtr &parent) const; @@ -147,7 +147,7 @@ struct State { bool exceeds_local_memory_limit(const Target &target) const; - bool calculate_cost(const FunctionDAG &dag, const MachineParams ¶ms, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose = false); + bool calculate_cost(const FunctionDAG &dag, int hardware_parallelism, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose = false); // Make a child copy of this state. The loop nest is const (we // make mutated copies of it, rather than mutating it), so we can @@ -170,7 +170,7 @@ struct State { // Apply the schedule represented by this state to a Halide // Pipeline. Also generate source code for the schedule for the // user to copy-paste to freeze this schedule as permanent artifact. 
- void apply_schedule(const FunctionDAG &dag, const MachineParams ¶ms, const Target &target); + void apply_schedule(const FunctionDAG &dag, int hardware_parallelism, const Target &target); bool should_always_consider_inline(const FunctionDAG::Node *node) const; void add_to_always_consider_inline_options(const FunctionDAG::Node *node); diff --git a/src/autoschedulers/anderson2021/ThreadInfo.h b/src/autoschedulers/anderson2021/ThreadInfo.h index 83a81c7d4723..803c51003667 100644 --- a/src/autoschedulers/anderson2021/ThreadInfo.h +++ b/src/autoschedulers/anderson2021/ThreadInfo.h @@ -67,8 +67,8 @@ struct ThreadInfo { loop_vars.push_back(loop[i].var); } - if (loop_indices.size() == 0) { - internal_assert(size.size() > 0); + if (loop_indices.empty()) { + internal_assert(!size.empty()); ++num_thread_loops; loop_indices.push_back(0); loop_vars.push_back(loop[0].var); @@ -77,8 +77,8 @@ struct ThreadInfo { internal_assert(num_threads <= num_threads_in_this_block); internal_assert(loop_indices.size() == num_thread_loops); internal_assert(loop_vars.size() == num_thread_loops); - internal_assert(loop_indices.size() > 0 && loop_indices.size() <= 3); - internal_assert(loop_vars.size() > 0 && loop_vars.size() <= 3); + internal_assert(!loop_indices.empty() && loop_indices.size() <= 3); + internal_assert(!loop_vars.empty() && loop_vars.size() <= 3); count_num_active_warps_per_block(); } diff --git a/src/autoschedulers/anderson2021/Tiling.cpp b/src/autoschedulers/anderson2021/Tiling.cpp index 7171a726f293..e3dae9ba72e9 100644 --- a/src/autoschedulers/anderson2021/Tiling.cpp +++ b/src/autoschedulers/anderson2021/Tiling.cpp @@ -32,7 +32,7 @@ vector> generate_serial_tilings(const vector &s, int d, bool allow_inner_ones) { vector> result; if (d == -1) { - result.push_back(vector()); + result.emplace_back(); } else { vector> v; v = generate_serial_tilings(s, d - 1, last_d, vectorized_index, vec_dim_serial_sizes, filter_small_outer_extents, allow_inner_ones); @@ -40,7 +40,7 @@ vector> 
generate_serial_tilings(const vector &s, int d, t.push_back(0); bool used_full_extent = false; // include odd serial sizes that encourage multiples of 16 as thread tile size - if (vec_dim_serial_sizes.size() > 0 && d == vectorized_index) { + if (!vec_dim_serial_sizes.empty() && d == vectorized_index) { for (int inner : vec_dim_serial_sizes) { int outer = (s[d] + inner - 1) / inner; if (filter_small_outer_extents && outer < 16) { @@ -95,7 +95,7 @@ vector> generate_tilings(const vector &s, int d, int fa const vector &inner_sizes) { vector> result; if (d == -1) { - result.push_back(vector()); + result.emplace_back(); } else { vector> v; v = generate_tilings(s, d - 1, factor, allow_splits); @@ -130,8 +130,12 @@ vector> generate_tilings(const vector &s, int d, int fa if (!inner_sizes.empty()) { // using fixed set of inner loop extents for (int inner : inner_sizes) { int outer = (s[d] + inner - 1) / inner; - if (is_one && outer == 1) continue; - if (is_full && outer == s[d]) continue; + if (is_one && outer == 1) { + continue; + } + if (is_full && outer == s[d]) { + continue; + } t.back() = outer; result.push_back(t); } @@ -139,10 +143,16 @@ vector> generate_tilings(const vector &s, int d, int fa int max_inner = 0; for (int inner = 1; inner < s[d]; inner *= factor) { int outer = (s[d] + inner - 1) / inner; - if (is_one && outer == 1) continue; - if (is_full && outer == s[d]) continue; + if (is_one && outer == 1) { + continue; + } + if (is_full && outer == s[d]) { + continue; + } // Stop when we hit inner sizes that would do too much recompute - if (inner > 1 && inner * outer * 7 > s[d] * 8) break; + if (inner > 1 && inner * outer * 7 > s[d] * 8) { + break; + } max_inner = inner; t.back() = outer; result.push_back(t); @@ -150,12 +160,20 @@ vector> generate_tilings(const vector &s, int d, int fa for (int outer = 1; outer <= s[d]; outer *= factor) { int inner = (s[d] + outer - 1) / outer; - if (is_one && outer == 1) continue; - if (is_full && outer == s[d]) continue; + if 
(is_one && outer == 1) { + continue; + } + if (is_full && outer == s[d]) { + continue; + } // Stop when we get into the regime covered by the loop above. - if (outer > 1 && inner < max_inner * 2) break; + if (outer > 1 && inner < max_inner * 2) { + break; + } // Or when the wasted compute gets too bad. - if (inner * outer * 7 > s[d] * 8) break; + if (inner * outer * 7 > s[d] * 8) { + break; + } t.back() = outer; result.push_back(t); } @@ -206,7 +224,7 @@ vector> generate_gpu_tilings(const vector> &stag int d, const vector &vectorized_indices, bool serial_inner, bool is_compute_root_stage) { vector> result; if (d == -1) { - result.push_back(vector()); + result.emplace_back(); } else { // set max thread count 64 for now in all dims int64_t max_threads_extent = 64, total_threads_limit = 1024; // less than 1024 to limit states @@ -235,9 +253,9 @@ vector> generate_gpu_tilings(const vector> &stag vector new_max_s = max_s; for (size_t stage = 0; stage < pure_dims.size(); stage++) { vector stage_thread_t, stage_lowered_size; - for (size_t i = 0; i < pure_dims[stage].size(); i++) { - if (pure_dims[stage][i] >= 0) { - stage_thread_t.push_back(thread_t[pure_dims[stage][i]]); + for (int i : pure_dims[stage]) { + if (i >= 0) { + stage_thread_t.push_back(thread_t[i]); } else { // impure dims have extent 1 stage_thread_t.push_back(1); } @@ -297,7 +315,9 @@ vector> generate_gpu_tilings(const vector> &stag break; } int64_t other_ext = (stage_sizes[0][d] + threads_ext - 1) / threads_ext; - if (d != vectorized_indices[0] && threads_ext > 1 && threads_ext * other_ext * 7 > stage_sizes[0][d] * 8) break; + if (d != vectorized_indices[0] && threads_ext > 1 && threads_ext * other_ext * 7 > stage_sizes[0][d] * 8) { + break; + } t.back() = threads_ext; validity valid_result = is_valid_tiling(); if (valid_result == serial_count_err) { diff --git a/src/autoschedulers/anderson2021/Tiling.h b/src/autoschedulers/anderson2021/Tiling.h index 80ce5fcd684d..3753cffc59bf 100644 --- 
a/src/autoschedulers/anderson2021/Tiling.h +++ b/src/autoschedulers/anderson2021/Tiling.h @@ -1,7 +1,7 @@ #ifndef TILING_H #define TILING_H -#include +#include #include using std::vector; diff --git a/src/autoschedulers/anderson2021/Weights.cpp b/src/autoschedulers/anderson2021/Weights.cpp index adf419f0f7ea..8206410a397f 100644 --- a/src/autoschedulers/anderson2021/Weights.cpp +++ b/src/autoschedulers/anderson2021/Weights.cpp @@ -19,7 +19,7 @@ void Weights::randomize(uint32_t seed) { // Fill the weights with random values for_each_buffer([&rng](Buffer &w) { w.for_each_value([&rng](float &f) { - f = ((float)rng()) / ((float)rng.max()) - 0.5f; + f = ((float)rng()) / ((float)std::mt19937::max()) - 0.5f; }); }); } @@ -41,38 +41,64 @@ void Weights::randomize(uint32_t seed) { bool Weights::load(std::istream &i) { uint32_t signature; i.read((char *)&signature, sizeof(signature)); - if (i.fail() || signature != kSignature) return false; + if (i.fail() || signature != kSignature) { + return false; + } i.read((char *)&pipeline_features_version, sizeof(pipeline_features_version)); - if (i.fail()) return false; + if (i.fail()) { + return false; + } i.read((char *)&schedule_features_version, sizeof(schedule_features_version)); - if (i.fail()) return false; + if (i.fail()) { + return false; + } uint32_t buffer_count; i.read((char *)&buffer_count, sizeof(buffer_count)); - if (i.fail() || buffer_count != 6) return false; + if (i.fail() || buffer_count != 6) { + return false; + } const auto load_one = [&i](Buffer &buf) -> bool { uint32_t dimension_count; i.read((char *)&dimension_count, sizeof(dimension_count)); - if (i.fail() || dimension_count != (uint32_t)buf.dimensions()) return false; + if (i.fail() || dimension_count != (uint32_t)buf.dimensions()) { + return false; + } for (uint32_t d = 0; d < dimension_count; d++) { uint32_t extent; i.read((char *)&extent, sizeof(extent)); - if (i.fail() || (int)extent != (int)buf.extent(d)) return false; + if (i.fail() || (int)extent != 
(int)buf.extent(d)) { + return false; + } } i.read((char *)(buf.data()), buf.size_in_bytes()); - if (i.fail()) return false; + if (i.fail()) { + return false; + } return true; }; - if (!load_one(head1_filter)) return false; - if (!load_one(head1_bias)) return false; - if (!load_one(head2_filter)) return false; - if (!load_one(head2_bias)) return false; - if (!load_one(conv1_filter)) return false; - if (!load_one(conv1_bias)) return false; + if (!load_one(head1_filter)) { + return false; + } + if (!load_one(head1_bias)) { + return false; + } + if (!load_one(head2_filter)) { + return false; + } + if (!load_one(head2_bias)) { + return false; + } + if (!load_one(conv1_filter)) { + return false; + } + if (!load_one(conv1_bias)) { + return false; + } return true; } @@ -84,38 +110,64 @@ bool Weights::load_from_file(const std::string &filename) { bool Weights::save(std::ostream &o) const { const uint32_t signature = kSignature; o.write((const char *)&signature, sizeof(signature)); - if (o.fail()) return false; + if (o.fail()) { + return false; + } o.write((const char *)&pipeline_features_version, sizeof(pipeline_features_version)); - if (o.fail()) return false; + if (o.fail()) { + return false; + } o.write((const char *)&schedule_features_version, sizeof(schedule_features_version)); - if (o.fail()) return false; + if (o.fail()) { + return false; + } const uint32_t buffer_count = 6; o.write((const char *)&buffer_count, sizeof(buffer_count)); - if (o.fail()) return false; + if (o.fail()) { + return false; + } const auto save_one = [&o](const Buffer &buf) -> bool { const uint32_t dimension_count = buf.dimensions(); o.write((const char *)&dimension_count, sizeof(dimension_count)); - if (o.fail()) return false; + if (o.fail()) { + return false; + } for (uint32_t d = 0; d < dimension_count; d++) { uint32_t extent = buf.extent(d); o.write((const char *)&extent, sizeof(extent)); - if (o.fail()) return false; + if (o.fail()) { + return false; + } } o.write((const char 
*)(buf.data()), buf.size_in_bytes()); - if (o.fail()) return false; + if (o.fail()) { + return false; + } return true; }; - if (!save_one(head1_filter)) return false; - if (!save_one(head1_bias)) return false; - if (!save_one(head2_filter)) return false; - if (!save_one(head2_bias)) return false; - if (!save_one(conv1_filter)) return false; - if (!save_one(conv1_bias)) return false; + if (!save_one(head1_filter)) { + return false; + } + if (!save_one(head1_bias)) { + return false; + } + if (!save_one(head2_filter)) { + return false; + } + if (!save_one(head2_bias)) { + return false; + } + if (!save_one(conv1_filter)) { + return false; + } + if (!save_one(conv1_bias)) { + return false; + } return true; } @@ -130,16 +182,30 @@ bool Weights::load_from_dir(const std::string &dir) { std::ifstream i(filename, std::ios_base::binary); i.read((char *)(buf.data()), buf.size_in_bytes()); i.close(); - if (i.fail()) return false; + if (i.fail()) { + return false; + } return true; }; - if (!buffer_from_file(dir + "/head1_conv1_weight.data", head1_filter)) return false; - if (!buffer_from_file(dir + "/head1_conv1_bias.data", head1_bias)) return false; - if (!buffer_from_file(dir + "/head2_conv1_weight.data", head2_filter)) return false; - if (!buffer_from_file(dir + "/head2_conv1_bias.data", head2_bias)) return false; - if (!buffer_from_file(dir + "/trunk_conv1_weight.data", conv1_filter)) return false; - if (!buffer_from_file(dir + "/trunk_conv1_bias.data", conv1_bias)) return false; + if (!buffer_from_file(dir + "/head1_conv1_weight.data", head1_filter)) { + return false; + } + if (!buffer_from_file(dir + "/head1_conv1_bias.data", head1_bias)) { + return false; + } + if (!buffer_from_file(dir + "/head2_conv1_weight.data", head2_filter)) { + return false; + } + if (!buffer_from_file(dir + "/head2_conv1_bias.data", head2_bias)) { + return false; + } + if (!buffer_from_file(dir + "/trunk_conv1_weight.data", conv1_filter)) { + return false; + } + if (!buffer_from_file(dir + 
"/trunk_conv1_bias.data", conv1_bias)) { + return false; + } // Old style data doesn't record the versions, so just assume they are current pipeline_features_version = PipelineFeatures::version(); @@ -153,16 +219,30 @@ bool Weights::save_to_dir(const std::string &dir) const { std::ofstream o(filename, std::ios_base::trunc | std::ios_base::binary); o.write((const char *)(buf.data()), buf.size_in_bytes()); o.close(); - if (o.fail()) return false; + if (o.fail()) { + return false; + } return true; }; - if (!buffer_to_file(head1_filter, dir + "/head1_conv1_weight.data")) return false; - if (!buffer_to_file(head1_bias, dir + "/head1_conv1_bias.data")) return false; - if (!buffer_to_file(head2_filter, dir + "/head2_conv1_weight.data")) return false; - if (!buffer_to_file(head2_bias, dir + "/head2_conv1_bias.data")) return false; - if (!buffer_to_file(conv1_filter, dir + "/trunk_conv1_weight.data")) return false; - if (!buffer_to_file(conv1_bias, dir + "/trunk_conv1_bias.data")) return false; + if (!buffer_to_file(head1_filter, dir + "/head1_conv1_weight.data")) { + return false; + } + if (!buffer_to_file(head1_bias, dir + "/head1_conv1_bias.data")) { + return false; + } + if (!buffer_to_file(head2_filter, dir + "/head2_conv1_weight.data")) { + return false; + } + if (!buffer_to_file(head2_bias, dir + "/head2_conv1_bias.data")) { + return false; + } + if (!buffer_to_file(conv1_filter, dir + "/trunk_conv1_weight.data")) { + return false; + } + if (!buffer_to_file(conv1_bias, dir + "/trunk_conv1_bias.data")) { + return false; + } return true; } diff --git a/src/autoschedulers/anderson2021/autotune_loop.sh b/src/autoschedulers/anderson2021/autotune_loop.sh index a9e5c23fa886..5e3196afdebd 100644 --- a/src/autoschedulers/anderson2021/autotune_loop.sh +++ b/src/autoschedulers/anderson2021/autotune_loop.sh @@ -191,7 +191,6 @@ make_featurization() { HL_BEAM_SIZE=${beam} \ HL_SHARED_MEMORY_LIMIT=${shared_memory_limit} \ HL_SHARED_MEMORY_SM_LIMIT=${shared_memory_sm_limit} \ - 
HL_MACHINE_PARAMS=${HL_MACHINE_PARAMS} \ HL_DEBUG_AUTOSCHEDULE=1 \ HL_DEBUG_CODEGEN=1 \ /bin/time -f 'Compile time (s): %e' ${TIMEOUT_CMD} -k ${COMPILATION_TIMEOUT} ${COMPILATION_TIMEOUT} \ @@ -201,10 +200,10 @@ make_featurization() { -o ${D} \ -e stmt,assembly,static_library,c_header,registration,schedule,featurization \ target=${HL_TARGET} \ - auto_schedule=true \ ${EXTRA_GENERATOR_ARGS} \ -p ${AUTOSCHED_BIN}/libautoschedule_anderson2021.so \ - -s Anderson2021 \ + autoscheduler=Anderson2021 \ + autoscheduler.parallelism=${HARDWARE_PARALLELISM} 2> ${D}/compile_err.txt > ${D}/compile_log.txt" FAILED=0 diff --git a/src/autoschedulers/anderson2021/cost_model_generator.cpp b/src/autoschedulers/anderson2021/cost_model_generator.cpp index 64da7eb27383..e8b5e84cfc75 100644 --- a/src/autoschedulers/anderson2021/cost_model_generator.cpp +++ b/src/autoschedulers/anderson2021/cost_model_generator.cpp @@ -2,6 +2,8 @@ // templated such that it can be compiled in either forward or // backwards mode, for inference or training respectively. 
+#include + #include "Halide.h" #include "NetworkSize.h" @@ -21,12 +23,18 @@ struct ModelWeight : public GeneratorInput> { ModelWeight(const std::string &name, int dim) : GeneratorInput>(name, dim) { } - void backprop(const Derivative &d, Expr learning_rate, Expr timestep) { + void backprop(const Derivative &d, const Expr& learning_rate, const Expr& timestep) { } void set_shape(int s0 = 0, int s1 = 0, int s2 = 0) { - if (s0) dim(0).set_bounds(0, s0); - if (s1) dim(1).set_bounds(0, s1); - if (s2) dim(2).set_bounds(0, s2); + if (s0) { + dim(0).set_bounds(0, s0); + } + if (s1) { + dim(1).set_bounds(0, s1); + } + if (s2) { + dim(2).set_bounds(0, s2); + } } }; @@ -37,10 +45,11 @@ struct ModelWeight : public GeneratorInput> { ModelWeight(const std::string &name, int dim) : GeneratorInput>(name, dim), grad("updated_" + name, dim + 1) { } - void backprop(const Derivative &d, Expr learning_rate, Expr timestep) { + void backprop(const Derivative &d, Expr learning_rate, const Expr& timestep) { std::vector args(dimensions() + 1); - for (auto &e : args) + for (auto &e : args) { e = Var(); +} grad(args) = undef(); // We'll report back the new weights and the loss gradients, @@ -71,7 +80,7 @@ struct ModelWeight : public GeneratorInput> { Expr smoothed_second_moment_correction = 1 / (1 - pow(0.999f, timestep + 1)); // Update the weights - Expr step = learning_rate * smoothed_deriv * smoothed_deriv_correction; + Expr step = std::move(learning_rate) * smoothed_deriv * smoothed_deriv_correction; step /= sqrt(smoothed_second_moment * smoothed_second_moment_correction) + 1e-5f; new_weight = current_weight - step; @@ -114,7 +123,7 @@ class CostModel : public Generator> { using Input = GeneratorInput; template using Output = GeneratorOutput; - using Generator>::auto_schedule; + using Generator>::using_autoscheduler; using Generator>::get_pipeline; // Number of pipeline stages @@ -169,20 +178,20 @@ class CostModel : public Generator> { Output> loss_output{"loss_output", 0}; // Zero pad 
alone the last dimension of a Func - Func pad_stages(Func f, Expr stages) { + Func pad_stages(const Func& f, Expr stages) { Halide::Region bounds(f.dimensions()); bounds[1].min = 0; - bounds[1].extent = stages; + bounds[1].extent = std::move(stages); return BoundaryConditions::constant_exterior(f, cast(f.value().type(), 0), bounds); } - Expr activation(Expr e) { + Expr activation(const Expr& e) { // leaky relu return max(e, 0) + min(e, 0) * 1e-10f; } Expr sigmoid(Expr e) { - return 1 / (1 + exp(-e)); + return 1 / (1 + exp(-std::move(e))); } Expr print_wrap(Expr e, const std::string &out, const Var &n, const Var &w) { @@ -592,9 +601,9 @@ class CostModel : public Generator> { true_runtime.set_estimates({{0, 80}}); // SCHEDULE - if (training && !auto_schedule) { + if (training && !using_autoscheduler()) { do_cost_model_schedule(get_pipeline()); - } else if (auto_schedule) { + } else if (using_autoscheduler()) { // Do nothing. } else { // We just write down a good schedule for @@ -612,7 +621,7 @@ class CostModel : public Generator> { const int vec = 8; // A helper function for scheduling conv layers - auto schedule_conv = [&](Func conv, Func relu, RVar r_channels) { + auto schedule_conv = [&](Func conv, Func relu, const RVar& r_channels) { Var ci("ci"), wi("wi"); if (!training) { relu diff --git a/src/autoschedulers/anderson2021/generate_data.sh b/src/autoschedulers/anderson2021/generate_data.sh index 6f8cf28d9142..8ff3d9fbd402 100644 --- a/src/autoschedulers/anderson2021/generate_data.sh +++ b/src/autoschedulers/anderson2021/generate_data.sh @@ -183,7 +183,7 @@ for app in $APPS; do NUM_BATCHES=${MAX_ITERATIONS} \ TRAIN_ONLY=${TRAIN_ONLY} \ SAMPLES_DIR=${SAMPLES_DIR} \ - HL_MACHINE_PARAMS=80,1,1 \ + HARDWARE_PARALLELISM=80 \ SAMPLES_DIR=${SAMPLES_DIR} \ HL_DEBUG_CODEGEN=0 \ HL_SHARED_MEMORY_LIMIT=48 \ diff --git a/src/autoschedulers/anderson2021/retrain_cost_model.cpp b/src/autoschedulers/anderson2021/retrain_cost_model.cpp index 7f40b318f961..e688448d8ee1 100644 --- 
a/src/autoschedulers/anderson2021/retrain_cost_model.cpp +++ b/src/autoschedulers/anderson2021/retrain_cost_model.cpp @@ -102,8 +102,9 @@ struct Flags { std::vector parse_floats(const std::string &s) { const char *c = s.c_str(); std::vector v; - while (isspace(*c)) + while (isspace(*c)) { ++c; + } while (*c) { string f; while (*c && !isspace(*c)) { @@ -153,10 +154,14 @@ uint64_t hash_floats(uint64_t h, const float *begin, const float *end) { } bool ends_with(const string &str, const string &suffix) { - if (str.size() < suffix.size()) return false; + if (str.size() < suffix.size()) { + return false; + } size_t off = str.size() - suffix.size(); for (size_t i = 0; i < suffix.size(); i++) { - if (str[off + i] != suffix[i]) return false; + if (str[off + i] != suffix[i]) { + return false; + } } return true; } @@ -297,8 +302,8 @@ size_t load_samples(map &training_set, map(head2_w, num_stages); @@ -488,7 +493,7 @@ int main(int argc, char **argv) { std::cout.setf(std::ios::fixed, std::ios::floatfield); std::cout.precision(4); - auto seed = time(NULL); + auto seed = time(nullptr); std::mt19937 rng((uint32_t)seed); std::cout << "Iterating over " << samples.size() << " pipelines using seed = " << seed << "\n"; @@ -612,10 +617,12 @@ int main(int argc, char **argv) { int good = 0, bad = 0; for (auto &sched : sample.schedules) { auto &ref = sample.schedules[sample.fastest_schedule_hash]; - if (sched.second.prediction[model] == 0) continue; + if (sched.second.prediction[model] == 0) { continue; +} assert(sched.second.runtimes[0] >= ref.runtimes[0]); float runtime_ratio = sched.second.runtimes[0] / ref.runtimes[0]; - if (runtime_ratio <= 1.3f) continue; // Within 30% of the runtime of the best + if (runtime_ratio <= 1.3f) { continue; // Within 30% of the runtime of the best +} if (sched.second.prediction[model] >= ref.prediction[model]) { good++; } else { @@ -654,7 +661,8 @@ int main(int argc, char **argv) { for (int model = 0; model < kModels; model++) { std::cout << 
loss_sum[model] / loss_sum_counter[model] << " "; } - if (kModels > 1) std::cout << "\n"; + if (kModels > 1) { std::cout << "\n"; +} std::cout << " Rate: "; int best_model = 0; float best_rate = 0; @@ -678,7 +686,8 @@ int main(int argc, char **argv) { } } - if (kModels > 1) std::cout << "\n"; + if (kModels > 1) { std::cout << "\n"; +} if (!predict_only && samples.count(worst_miss_pipeline_id)) { std::cout << " Worst: " << worst_miss << " " << leaf(samples[worst_miss_pipeline_id].schedules[worst_miss_schedule_id].filename) << " "; } diff --git a/src/autoschedulers/anderson2021/test.cpp b/src/autoschedulers/anderson2021/test.cpp index cfaa3052dbc0..cd8b9dd86c6b 100644 --- a/src/autoschedulers/anderson2021/test.cpp +++ b/src/autoschedulers/anderson2021/test.cpp @@ -14,7 +14,7 @@ int main(int argc, char **argv) { Var x("x"), y("y"); - if (1) { + if (true) { // In a point-wise pipeline, everything should be fully fused. Func f("f"), g("g"), h("h"); f(x, y) = (x + y) * (x + y); @@ -26,7 +26,7 @@ int main(int argc, char **argv) { Pipeline(h).auto_schedule(target, params); } - if (1) { + if (true) { // In a pipeline with huge expensive stencils and low memory costs, nothing should be fused Func f("f"), g("g"), h("h"); f(x, y) = (x + y) * (x + 2 * y) * (x + 3 * y) * (x + 4 * y) * (x + 5 * y); @@ -46,7 +46,7 @@ int main(int argc, char **argv) { Pipeline(h).auto_schedule(target, params); } - if (1) { + if (true) { // In a pipeline with moderate isotropic stencils, there should be some square tiling Func f("f"), h("h"); f(x, y) = (x + y) * (x + 2 * y) * (x + 3 * y); @@ -60,7 +60,7 @@ int main(int argc, char **argv) { } // Smaller footprint stencil -> smaller tiles - if (1) { + if (true) { Func f("f"), g("g"), h("h"); f(x, y) = (x + y) * (x + 2 * y) * (x + 3 * y); h(x, y) = (f(x - 1, y - 1) + f(x, y - 1) + f(x + 1, y - 1) + @@ -73,7 +73,7 @@ int main(int argc, char **argv) { } // A stencil chain - if (1) { + if (true) { const int N = 8; Func f[N]; f[0](x, y) = (x + y) * (x + 2 
* y) * (x + 3 * y); @@ -92,7 +92,7 @@ int main(int argc, char **argv) { } // An outer product - if (1) { + if (true) { Buffer a(2048), b(2048); Func f; f(x, y) = a(x) * b(y); @@ -103,7 +103,7 @@ int main(int argc, char **argv) { } // A separable downsample that models the start of local_laplacian - if (1) { + if (true) { Buffer in(2048, 2048); Var k; Func orig("orig"), expensive("expensive"), downy("downy"), downx("downx"); @@ -122,7 +122,7 @@ int main(int argc, char **argv) { } // A Func with multiple stages, some of which include additional loops - if (1) { + if (true) { Buffer a(1024, 1024); Func f("multiple_stages"), g("g"), h("h"); Var x, y; @@ -140,7 +140,7 @@ int main(int argc, char **argv) { Pipeline(g).auto_schedule(target, params); } - if (1) { + if (true) { // A scan with pointwise stages before and after Buffer a(1024, 1024); Func before[5]; @@ -164,7 +164,7 @@ int main(int argc, char **argv) { Pipeline(after[4]).auto_schedule(target, params); } - if (1) { + if (true) { Buffer im_a(1024, 1024, "a"), im_b(1024, 1024, "b"); im_a.fill(0.0f); im_b.fill(0.0f); @@ -183,7 +183,7 @@ int main(int argc, char **argv) { Pipeline(out).auto_schedule(target, params); } - if (1) { + if (true) { // A scan in x followed by a downsample in y, with pointwise stuff in between const int N = 3; Buffer a(1024, 1024); @@ -213,7 +213,7 @@ int main(int argc, char **argv) { Pipeline(p3[N - 1]).auto_schedule(target, params); } - if (1) { + if (true) { // A gather that only uses a small portion of a potentially // large LUT. 
The number of points computed should be less // than points computed minimum, and the LUT should be @@ -233,7 +233,7 @@ int main(int argc, char **argv) { Pipeline(out).auto_schedule(target, params); } - if (1) { + if (true) { // A pipeline where the vectorized dimension should alternate index Func f("f"), g("g"), h("h"); f(x, y) = x * y; @@ -248,7 +248,7 @@ int main(int argc, char **argv) { Pipeline(h).auto_schedule(target, params); } - if (1) { + if (true) { // A no-win scenario in which a Func is going to be read from // lots of times using a vector gather no matter how it is // scheduled. @@ -266,7 +266,7 @@ int main(int argc, char **argv) { Pipeline({a, b}).auto_schedule(target, params); } - if (1) { + if (true) { // Boring memcpy ImageParam im(Float(32), 2); Func f("f"), g("g"); @@ -277,7 +277,7 @@ int main(int argc, char **argv) { Pipeline(g).auto_schedule(target, params); } - if (1) { + if (true) { // A load from a tiny input image ImageParam im(Float(32), 2); Func f("f"); @@ -287,7 +287,7 @@ int main(int argc, char **argv) { Pipeline(f).auto_schedule(target, params); } - if (1) { + if (true) { // Lots of dimensions ImageParam im(Float(32), 7); Func f("f"); @@ -304,7 +304,7 @@ int main(int argc, char **argv) { Pipeline(f).auto_schedule(target, params); } - if (1) { + if (true) { // Long transpose chain. ImageParam im(Float(32), 2); Func f("f"), g("g"), h("h"); @@ -323,7 +323,7 @@ int main(int argc, char **argv) { Pipeline({out1, out2}).auto_schedule(target, params); } - if (1) { + if (true) { ImageParam im(Float(32), 2); // An inlinable Func used at the start and at the end of a long stencil chain. 
const int N = 8; @@ -348,7 +348,7 @@ int main(int argc, char **argv) { Pipeline(g).auto_schedule(target, params); } - if (1) { + if (true) { // Vectorizing a pure var in an update using RoundUp Func f("f"), g("g"); @@ -363,7 +363,7 @@ int main(int argc, char **argv) { Pipeline(g).auto_schedule(target, params); } - if (1) { + if (true) { ImageParam im(Float(32), 2); // A convolution pyramid @@ -395,7 +395,7 @@ int main(int argc, char **argv) { Pipeline(out).auto_schedule(target, params); } - if (1) { + if (true) { ImageParam im(Float(32), 2); Func f("f"); @@ -413,7 +413,7 @@ int main(int argc, char **argv) { Pipeline(casted).auto_schedule(target, params); } - if (1) { + if (true) { ImageParam im(Int(32), 2); Func f("f"), hist("hist"), output("output"); diff --git a/src/autoschedulers/anderson2021/test/bounds.cpp b/src/autoschedulers/anderson2021/test/bounds.cpp index 86ca90431bae..a6ff1361e1a8 100644 --- a/src/autoschedulers/anderson2021/test/bounds.cpp +++ b/src/autoschedulers/anderson2021/test/bounds.cpp @@ -7,7 +7,6 @@ using namespace Halide::Internal; using namespace Halide::Internal::Autoscheduler; void test_bounds() { - MachineParams params(80, 16000000, 40); Target target("host-cuda"); Var x("x"), y("y"); @@ -21,7 +20,7 @@ void test_bounds() { std::vector outputs; outputs.push_back(h.function()); - FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); const FunctionDAG::Node *node_h = &dag.nodes[0]; const FunctionDAG::Node *node_g = &dag.nodes[1]; @@ -40,10 +39,10 @@ void test_bounds() { std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = 
root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); const auto &thread = root->children[0]->children[0]; const auto &thread_bounds_g = thread->get_bounds(node_g); @@ -65,7 +64,7 @@ void test_bounds() { std::vector outputs; outputs.push_back(out.function()); - FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); const FunctionDAG::Node *node_out = &dag.nodes[0]; const FunctionDAG::Node *node_f = &dag.nodes[2]; @@ -80,10 +79,10 @@ void test_bounds() { std::vector tiling; tiling.push_back(2); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); const auto &thread = root->children[0]->children[0]; const auto &thread_bounds_g = thread->get_bounds(node_g); @@ -109,7 +108,7 @@ void test_bounds() { std::vector outputs; outputs.push_back(out.function()); - FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); const FunctionDAG::Node *node_out = &dag.nodes[0]; const FunctionDAG::Node *node_h = &dag.nodes[1]; @@ -128,10 +127,10 @@ void test_bounds() { std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); 
std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); @@ -167,7 +166,7 @@ void test_bounds() { std::vector outputs; outputs.push_back(out.function()); - FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); const FunctionDAG::Node *node_out = &dag.nodes[0]; const FunctionDAG::Node *node_f = &dag.nodes[1]; @@ -186,10 +185,10 @@ void test_bounds() { std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); @@ -267,7 +266,7 @@ void test_bounds() { std::vector outputs; outputs.push_back(out.function()); - FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); const FunctionDAG::Node *node_out = &dag.nodes[0]; const FunctionDAG::Node *node_f = &dag.nodes[1]; @@ -286,10 +285,10 @@ void test_bounds() { std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); diff --git a/src/autoschedulers/anderson2021/test/state.cpp 
b/src/autoschedulers/anderson2021/test/state.cpp index 0cc266857e47..b283243cc55a 100644 --- a/src/autoschedulers/anderson2021/test/state.cpp +++ b/src/autoschedulers/anderson2021/test/state.cpp @@ -7,7 +7,6 @@ using namespace Halide::Internal; using namespace Halide::Internal::Autoscheduler; void test_state() { - MachineParams params(80, 16000000, 40); Target target("host-cuda"); // Test update_always_consider_inline_options @@ -22,7 +21,7 @@ void test_state() { std::vector outputs; outputs.push_back(h.function()); - FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); const FunctionDAG::Node *node_h = &dag.nodes[0]; const FunctionDAG::Node *node_g = &dag.nodes[1]; diff --git a/src/autoschedulers/anderson2021/test/storage_strides.cpp b/src/autoschedulers/anderson2021/test/storage_strides.cpp index dedb79a45cdf..b1b4ed4c83db 100644 --- a/src/autoschedulers/anderson2021/test/storage_strides.cpp +++ b/src/autoschedulers/anderson2021/test/storage_strides.cpp @@ -6,7 +6,6 @@ using namespace Halide::Internal; using namespace Halide::Internal::Autoscheduler; void test_bounds() { - MachineParams params(80, 16000000, 40); Target target("host-cuda"); bool verbose = false; int bytes_per_point = 4; @@ -22,7 +21,7 @@ void test_bounds() { std::vector outputs; outputs.push_back(h.function()); - FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); const FunctionDAG::Node *node_h = &dag.nodes[0]; const FunctionDAG::Node *node_g = &dag.nodes[1]; @@ -41,10 +40,10 @@ void test_bounds() { std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = 
root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); const auto &thread = root->children[0]->children[0]; const auto &thread_bounds_g = thread->get_bounds(node_g); @@ -74,7 +73,7 @@ void test_bounds() { std::vector outputs; outputs.push_back(out.function()); - FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); const FunctionDAG::Node *node_out = &dag.nodes[0]; const FunctionDAG::Node *node_h = &dag.nodes[1]; @@ -93,10 +92,10 @@ void test_bounds() { std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); @@ -132,7 +131,7 @@ void test_bounds() { std::vector outputs; outputs.push_back(out.function()); - FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); const FunctionDAG::Node *node_out = &dag.nodes[0]; const FunctionDAG::Node *node_f = &dag.nodes[1]; @@ -149,10 +148,10 @@ void test_bounds() { std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); std::unique_ptr root_copy{new LoopNest}; 
root_copy->copy_from(*root); @@ -192,7 +191,7 @@ void test_bounds() { std::vector outputs; outputs.push_back(out.function()); - FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); const FunctionDAG::Node *node_out = &dag.nodes[0]; const FunctionDAG::Node *node_f = &dag.nodes[1]; @@ -209,10 +208,10 @@ void test_bounds() { std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); @@ -253,7 +252,7 @@ void test_bounds() { std::vector outputs; outputs.push_back(out.function()); - FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); const FunctionDAG::Node *node_out = &dag.nodes[0]; const FunctionDAG::Node *node_f = &dag.nodes[1]; @@ -271,12 +270,12 @@ void test_bounds() { tiling.push_back(1); tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); tiling.clear(); tiling.push_back(1); tiling.push_back(32); // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); @@ -321,7 +320,7 @@ void test_bounds() { std::vector outputs; outputs.push_back(out.function()); - 
FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); const FunctionDAG::Node *node_out = &dag.nodes[0]; const FunctionDAG::Node *node_f = &dag.nodes[1]; @@ -341,7 +340,7 @@ void test_bounds() { tiling.push_back(1); tiling.push_back(1); // Serial loop - auto thread_loop = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + auto thread_loop = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); std::unique_ptr thread_loop_copy{new LoopNest}; thread_loop_copy->copy_from(*thread_loop); thread_loop_copy->compute_here(node_f, true, 0, false, target); @@ -350,7 +349,7 @@ void test_bounds() { tiling.push_back(1); // Thread loop root->children[0] = thread_loop_copy.release(); - root->children[0] = root->children[0]->parallelize_in_tiles(params, tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); diff --git a/src/autoschedulers/anderson2021/test/thread_info.cpp b/src/autoschedulers/anderson2021/test/thread_info.cpp index 149322068ff4..697abea28bf5 100644 --- a/src/autoschedulers/anderson2021/test/thread_info.cpp +++ b/src/autoschedulers/anderson2021/test/thread_info.cpp @@ -7,7 +7,6 @@ using namespace Halide::Internal; using namespace Halide::Internal::Autoscheduler; void test_thread_info() { - MachineParams params(80, 16000000, 40); Target target("host-cuda"); Var x("x"), y("y"); @@ -25,63 +24,53 @@ void test_thread_info() { size.push_back(16); size.push_back(8); - loop_extents.push_back(16); - loop_extents.push_back(8); + //loop_extents.push_back(16); + //loop_extents.push_back(8); // 16x8 max_thread_counts.push_back(16); max_thread_counts.push_back(8); { - ThreadInfo info{vectorized_loop_index, size, loop, loop_extents, max_thread_counts}; + ThreadInfo info{vectorized_loop_index, size, loop, max_thread_counts}; 
EXPECT_EQ(128, info.num_threads); - EXPECT_EQ(1.0, info.max_theoretical_warp_lane_utilization); EXPECT_EQ(1.0, info.warp_lane_utilization()); - EXPECT_EQ(1.0, info.warp_lane_utilization_at_block_x()); - EXPECT_EQ(1.0, info.warp_lane_utilization_at_block_y()); } - // Smaller stage: test that its max_theoretical_warp_lane_utilization is - // penalized because its 'size' is smaller than its loop_extents, - // indicating that it has been split: it could achieve better utilization if it had not been split + // Smaller stage: its 'size' is smaller than its loop_extents, + // indicating that it has been split; it could achieve better + // utilization if it had not been split size.clear(); size.push_back(8); size.push_back(8); { - ThreadInfo info{vectorized_loop_index, size, loop, loop_extents, max_thread_counts}; + ThreadInfo info{vectorized_loop_index, size, loop, max_thread_counts}; EXPECT_EQ(64, info.num_threads); - EXPECT_EQ(0.5, info.max_theoretical_warp_lane_utilization); EXPECT_EQ(0.5, info.warp_lane_utilization()); - EXPECT_EQ(0.5, info.warp_lane_utilization_at_block_x()); - EXPECT_EQ(1.0, info.warp_lane_utilization_at_block_y()); } - // Smaller stage: test that its max_theoretical_warp_lane_utilization is not - // penalized because its loop is smaller than the max thread loop and + // Smaller stage: its loop is smaller than the max thread loop and // cannot possibly achieve better utilization - loop_extents.clear(); - loop_extents.push_back(8); - loop_extents.push_back(8); + //loop_extents.clear(); + //loop_extents.push_back(8); + //loop_extents.push_back(8); { - ThreadInfo info{vectorized_loop_index, size, loop, loop_extents, max_thread_counts}; + ThreadInfo info{vectorized_loop_index, size, loop, max_thread_counts}; EXPECT_EQ(64, info.num_threads); - EXPECT_EQ(1.0, info.max_theoretical_warp_lane_utilization); EXPECT_EQ(0.5, info.warp_lane_utilization()); - EXPECT_EQ(0.5, info.warp_lane_utilization_at_block_x()); - EXPECT_EQ(1.0, 
info.warp_lane_utilization_at_block_y()); } size.clear(); size.push_back(11); size.push_back(11); size.push_back(2); - loop_extents.clear(); - loop_extents.push_back(11); - loop_extents.push_back(11); - loop_extents.push_back(2); + //loop_extents.clear(); + //loop_extents.push_back(11); + //loop_extents.push_back(11); + //loop_extents.push_back(2); max_thread_counts.clear(); max_thread_counts.push_back(16); max_thread_counts.push_back(16); @@ -89,13 +78,9 @@ void test_thread_info() { loop.push_back({}); { - ThreadInfo info{vectorized_loop_index, size, loop, loop_extents, max_thread_counts}; + ThreadInfo info{vectorized_loop_index, size, loop, max_thread_counts}; EXPECT_EQ(242, info.num_threads); - EXPECT_EQ(1.0, info.max_theoretical_warp_lane_utilization); EXPECT_EQ(0.472656, info.warp_lane_utilization()); - EXPECT_EQ(0.6875, info.warp_lane_utilization_at_block_x()); - EXPECT_EQ(0.6875, info.warp_lane_utilization_at_block_y()); - EXPECT_EQ(1, info.warp_lane_utilization_at_block_z()); } } } diff --git a/src/autoschedulers/anderson2021/test_perfect_hash_map.cpp b/src/autoschedulers/anderson2021/test_perfect_hash_map.cpp index 9a21c6a96e58..1f81e298d3a1 100644 --- a/src/autoschedulers/anderson2021/test_perfect_hash_map.cpp +++ b/src/autoschedulers/anderson2021/test_perfect_hash_map.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include From 4ad51a22e35222a875f04f9b35679d7a3ee80001 Mon Sep 17 00:00:00 2001 From: aekul Date: Sun, 14 Aug 2022 02:06:31 -0400 Subject: [PATCH 05/63] remove commented code --- .../anderson2021/AutoSchedule.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src/autoschedulers/anderson2021/AutoSchedule.cpp b/src/autoschedulers/anderson2021/AutoSchedule.cpp index a80c7222f81f..74b701bee937 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.cpp +++ b/src/autoschedulers/anderson2021/AutoSchedule.cpp @@ -656,24 +656,6 @@ void generate_schedule(const std::vector &outputs, aslog(1) << "Time taken for autoscheduler (s): 
" << std::chrono::duration_cast(total_time).count() / 1000.0 << '\n'; } -// Halide uses a plugin architecture for registering custom -// autoschedulers. We register our autoscheduler using a static -// constructor. -//struct RegisterAutoscheduler { - //RegisterAutoscheduler() { - //aslog(1) << "Registering autoscheduler 'Anderson2021'...\n"; - //Pipeline::add_autoscheduler("Anderson2021", *this); - //} - - //void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms, AutoSchedulerResults *results) { - //std::vector outputs; - //for (const Func& f : p.outputs()) { - //outputs.push_back(f.function()); - //} - //Autoscheduler::generate_schedule(outputs, target, params.parallelism, results); - //} -//} register_auto_scheduler; - struct Anderson2021 { #ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms_in, AutoSchedulerResults *results) { From 9932266d0f154cd9fd23e92ff5079e14b6d1700e Mon Sep 17 00:00:00 2001 From: aekul Date: Mon, 15 Aug 2022 23:03:50 -0400 Subject: [PATCH 06/63] use updated api --- src/autoschedulers/anderson2021/test.cpp | 101 +++++++++++++++++- .../anderson2021/test_function_dag.cpp | 17 ++- 2 files changed, 107 insertions(+), 11 deletions(-) diff --git a/src/autoschedulers/anderson2021/test.cpp b/src/autoschedulers/anderson2021/test.cpp index cd8b9dd86c6b..7235d50f5ef8 100644 --- a/src/autoschedulers/anderson2021/test.cpp +++ b/src/autoschedulers/anderson2021/test.cpp @@ -6,9 +6,14 @@ int main(int argc, char **argv) { // Loads libautoschedule_anderson2021.so // which is presumed to be in current library search path load_plugin("autoschedule_anderson2021"); - Pipeline::set_default_autoscheduler_name("Anderson2021"); - MachineParams params(80, 1, 1); + constexpr int hardware_parallelism = 80; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + MachineParams params(hardware_parallelism, 1, 1); +#else + AutoschedulerParams params = {"Anderson2021", 
{{"parallelism", std::to_string(hardware_parallelism)}}}; +#endif + // Use a fixed target for the analysis to get consistent results from this test. Target target("x86-64-linux-sse41-avx-avx2-cuda"); @@ -23,7 +28,11 @@ int main(int argc, char **argv) { h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(h).auto_schedule(target, params); +#else + Pipeline(h).apply_autoscheduler(target, params); +#endif } if (true) { @@ -43,7 +52,11 @@ int main(int argc, char **argv) { h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(h).auto_schedule(target, params); +#else + Pipeline(h).apply_autoscheduler(target, params); +#endif } if (true) { @@ -56,7 +69,11 @@ int main(int argc, char **argv) { h.set_estimate(x, 0, 2048).set_estimate(y, 0, 2048); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(h).auto_schedule(target, params); +#else + Pipeline(h).apply_autoscheduler(target, params); +#endif } // Smaller footprint stencil -> smaller tiles @@ -69,7 +86,11 @@ int main(int argc, char **argv) { h.set_estimate(x, 0, 2048).set_estimate(y, 0, 2048); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(h).auto_schedule(target, params); +#else + Pipeline(h).apply_autoscheduler(target, params); +#endif } // A stencil chain @@ -88,7 +109,11 @@ int main(int argc, char **argv) { } f[N - 1].set_estimate(x, 0, 2048).set_estimate(y, 0, 2048); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(f[N - 1]).auto_schedule(target, params); +#else + Pipeline(f[N - 1]).apply_autoscheduler(target, params); +#endif } // An outer product @@ -99,7 +124,11 @@ int main(int argc, char **argv) { f.set_estimate(x, 0, 2048).set_estimate(y, 0, 2048); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(f).auto_schedule(target, params); +#else + Pipeline(f).apply_autoscheduler(target, params); +#endif } // A separable downsample that models the start of local_laplacian @@ -118,7 +147,11 @@ 
int main(int argc, char **argv) { downx(x, y, k) = downy(2 * x - 1, y, k) + downy(2 * x, y, k) + downy(2 * x + 1, y, k) + downy(2 * x + 2, y, k); downx.set_estimate(x, 1, 1022).set_estimate(y, 1, 1022).set_estimate(k, 0, 256); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(downx).auto_schedule(target, params); +#else + Pipeline(downx).apply_autoscheduler(target, params); +#endif } // A Func with multiple stages, some of which include additional loops @@ -137,7 +170,11 @@ int main(int argc, char **argv) { g.set_estimate(x, 1, 1022).set_estimate(y, 1, 1022); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(g).auto_schedule(target, params); +#else + Pipeline(g).apply_autoscheduler(target, params); +#endif } if (true) { @@ -161,7 +198,11 @@ int main(int argc, char **argv) { after[4].set_estimate(x, 0, 1024).set_estimate(y, 0, 1024); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(after[4]).auto_schedule(target, params); +#else + Pipeline(after[4]).apply_autoscheduler(target, params); +#endif } if (true) { @@ -180,7 +221,11 @@ int main(int argc, char **argv) { out.set_estimate(j, 0, 1024).set_estimate(i, 0, 1024); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(out).auto_schedule(target, params); +#else + Pipeline(out).apply_autoscheduler(target, params); +#endif } if (true) { @@ -210,7 +255,11 @@ int main(int argc, char **argv) { p3[N - 1].set_estimate(x, 0, 1024).set_estimate(y, 0, 1024); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(p3[N - 1]).auto_schedule(target, params); +#else + Pipeline(p3[N - 1]).apply_autoscheduler(target, params); +#endif } if (true) { @@ -230,7 +279,11 @@ int main(int argc, char **argv) { out.set_estimate(x, 0, 10); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(out).auto_schedule(target, params); +#else + Pipeline(out).apply_autoscheduler(target, params); +#endif } if (true) { @@ -245,7 +298,11 @@ int main(int argc, char **argv) { h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); +#ifdef 
HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(h).auto_schedule(target, params); +#else + Pipeline(h).apply_autoscheduler(target, params); +#endif } if (true) { @@ -263,7 +320,11 @@ int main(int argc, char **argv) { a.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); b.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline({a, b}).auto_schedule(target, params); +#else + Pipeline({a, b}).apply_autoscheduler(target, params); +#endif } if (true) { @@ -274,7 +335,11 @@ int main(int argc, char **argv) { g(x, y) = f(x, y); g.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(g).auto_schedule(target, params); +#else + Pipeline(g).apply_autoscheduler(target, params); +#endif } if (true) { @@ -284,7 +349,11 @@ int main(int argc, char **argv) { f(x, y) = im(x, y) * 7; f.set_estimate(x, 0, 3).set_estimate(y, 0, 5); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(f).auto_schedule(target, params); +#else + Pipeline(f).apply_autoscheduler(target, params); +#endif } if (true) { @@ -301,7 +370,11 @@ int main(int argc, char **argv) { .set_estimate(t, 0, 3) .set_estimate(u, 0, 2) .set_estimate(v, 0, 6); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(f).auto_schedule(target, params); +#else + Pipeline(f).apply_autoscheduler(target, params); +#endif } if (true) { @@ -320,7 +393,11 @@ int main(int argc, char **argv) { out1.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); out2.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline({out1, out2}).auto_schedule(target, params); +#else + Pipeline({out1, out2}).apply_autoscheduler(target, params); +#endif } if (true) { @@ -345,7 +422,11 @@ int main(int argc, char **argv) { g(x, y) = f[N - 1](x, y) + f[0](clamp(cast(sin(x) * 10000), 0, 100000), clamp(cast(sin(x * y) * 10000), 0, 100000)); g.set_estimate(x, 0, 2048).set_estimate(y, 0, 2048); +#ifdef 
HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(g).auto_schedule(target, params); +#else + Pipeline(g).apply_autoscheduler(target, params); +#endif } if (true) { @@ -360,7 +441,11 @@ int main(int argc, char **argv) { g(x, y) = f(x, y); g.set_estimate(x, 0, 10).set_estimate(y, 0, 2048); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(g).auto_schedule(target, params); +#else + Pipeline(g).apply_autoscheduler(target, params); +#endif } if (true) { @@ -392,7 +477,11 @@ int main(int argc, char **argv) { out(x, y) = up[0](x, y); out.set_estimate(x, 0, 2048).set_estimate(y, 0, 2048); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(out).auto_schedule(target, params); +#else + Pipeline(out).apply_autoscheduler(target, params); +#endif } if (true) { @@ -410,7 +499,11 @@ int main(int argc, char **argv) { casted(x, y) = scan(x, y); casted.set_estimate(x, 0, 2000).set_estimate(y, 0, 2000); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(casted).auto_schedule(target, params); +#else + Pipeline(casted).apply_autoscheduler(target, params); +#endif } if (true) { @@ -426,7 +519,11 @@ int main(int argc, char **argv) { f.set_estimate(x, 0, 2000).set_estimate(y, 0, 2000); output.set_estimate(i, 0, 256); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API Pipeline(output).auto_schedule(target, params); +#else + Pipeline(output).apply_autoscheduler(target, params); +#endif } return 0; diff --git a/src/autoschedulers/anderson2021/test_function_dag.cpp b/src/autoschedulers/anderson2021/test_function_dag.cpp index 253307321ecc..af41b06c7752 100644 --- a/src/autoschedulers/anderson2021/test_function_dag.cpp +++ b/src/autoschedulers/anderson2021/test_function_dag.cpp @@ -31,7 +31,7 @@ extern "C" int mul_by_two( return 0; } -void test_coeff_wise(const MachineParams &params, const Target &target) { +void test_coeff_wise(const Target &target) { Var x("x"), y("y"); std::ostringstream with_extern; @@ -55,7 +55,7 @@ void test_coeff_wise(const MachineParams &params, const Target &target) {
h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); std::vector v; v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + Halide::Internal::Autoscheduler::FunctionDAG d(v, target); d.dump(with_extern); } @@ -70,7 +70,7 @@ void test_coeff_wise(const MachineParams &params, const Target &target) { h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); std::vector v; v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + Halide::Internal::Autoscheduler::FunctionDAG d(v, target); d.dump(without_extern); } @@ -113,7 +113,7 @@ extern "C" int matmul( return 0; } -void test_matmul(const MachineParams &params, const Target &target) { +void test_matmul(const Target &target) { Var x("x"), y("y"), k("k"); RDom r(0, 200); Halide::Buffer input1(200, 200); @@ -140,7 +140,7 @@ void test_matmul(const MachineParams &params, const Target &target) { h.set_estimate(x, 0, 200).set_estimate(y, 0, 200); std::vector v; v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + Halide::Internal::Autoscheduler::FunctionDAG d(v, target); d.dump(with_extern); } @@ -153,7 +153,7 @@ void test_matmul(const MachineParams &params, const Target &target) { h.set_estimate(x, 0, 200).set_estimate(y, 0, 200); std::vector v; v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + Halide::Internal::Autoscheduler::FunctionDAG d(v, target); d.dump(without_extern); } @@ -164,11 +164,10 @@ void test_matmul(const MachineParams &params, const Target &target) { int main(int argc, char **argv) { // Use a fixed target for the analysis to get consistent results from this test.
- MachineParams params(32, 16000000, 40); Target target("x86-64-linux-sse41-avx-avx2"); - test_coeff_wise(params, target); - test_matmul(params, target); + test_coeff_wise(target); + test_matmul(target); return 0; } From e4672de67c71237213581e55bb592b0edda56715 Mon Sep 17 00:00:00 2001 From: aekul Date: Mon, 15 Aug 2022 23:04:36 -0400 Subject: [PATCH 07/63] use updated api --- .../anderson2021/included_schedule_file_generator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/autoschedulers/anderson2021/included_schedule_file_generator.cpp b/src/autoschedulers/anderson2021/included_schedule_file_generator.cpp index 1a5cb99a784f..7103ec6c80e8 100644 --- a/src/autoschedulers/anderson2021/included_schedule_file_generator.cpp +++ b/src/autoschedulers/anderson2021/included_schedule_file_generator.cpp @@ -37,7 +37,7 @@ struct IncludedScheduleFile : public Halide::Generator { relu.set_estimates({{0, CO}, {0, W}, {0, H}, {0, N}}); // Schedule - if (auto_schedule) { + if (using_autoscheduler()) { // nothing } else { #if defined(GENERATING_SCHEDULE) From efa1d83b97f3e594ee3ee36da0e5a1aad06263f0 Mon Sep 17 00:00:00 2001 From: aekul Date: Mon, 15 Aug 2022 23:04:53 -0400 Subject: [PATCH 08/63] clang-format --- .../anderson2021/SearchSpace.cpp | 3 +- src/autoschedulers/anderson2021/State.cpp | 29 +++++++++++-------- .../anderson2021/cost_model_generator.cpp | 12 ++++---- .../anderson2021/retrain_cost_model.cpp | 20 ++++++++----- 4 files changed, 36 insertions(+), 28 deletions(-) diff --git a/src/autoschedulers/anderson2021/SearchSpace.cpp b/src/autoschedulers/anderson2021/SearchSpace.cpp index a0018a55c98c..ad4ac831528a 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.cpp +++ b/src/autoschedulers/anderson2021/SearchSpace.cpp @@ -351,8 +351,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, e->consumer->node->is_boundary_condition); } - return; - + return; } if (must_compute_root) { diff --git 
a/src/autoschedulers/anderson2021/State.cpp b/src/autoschedulers/anderson2021/State.cpp index 65145c7633cc..217b3956acfb 100644 --- a/src/autoschedulers/anderson2021/State.cpp +++ b/src/autoschedulers/anderson2021/State.cpp @@ -815,7 +815,7 @@ void State::fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, c } bool marked = false; - for (auto & block_var_assignment : block_var_assignments) { + for (auto &block_var_assignment : block_var_assignments) { for (size_t i = 1; i < block_var_assignment.size(); ++i) { auto inner_i = block_var_assignment[0]; auto outer_i = block_var_assignment[i]; @@ -1036,8 +1036,9 @@ void State::apply_schedule(const FunctionDAG &dag, int hardware_parallelism, con } for (auto &p : state_map) { - if (p.first->node->is_input) { continue; -} + if (p.first->node->is_input) { + continue; + } Stage stage(p.first->stage); @@ -1049,10 +1050,12 @@ void State::apply_schedule(const FunctionDAG &dag, int hardware_parallelism, con vector constant_extents; bool any_parallel_vars = false, any_parallel_rvars = false; for (auto it = p.second->vars.rbegin(); it != p.second->vars.rend(); it++) { - if (!it->exists) { continue; -} - if (!it->parallel) { break; -} + if (!it->exists) { + continue; + } + if (!it->parallel) { + break; + } any_parallel_rvars |= it->var.is_rvar; any_parallel_vars |= !it->var.is_rvar; parallel_extents.push_back(it->extent); @@ -1106,7 +1109,7 @@ void State::apply_schedule(const FunctionDAG &dag, int hardware_parallelism, con } p.second->schedule_source << "\n .reorder_storage("; bool first = true; - for (const auto& v : storage_vars) { + for (const auto &v : storage_vars) { if (!first) { p.second->schedule_source << ", "; } @@ -1129,8 +1132,9 @@ void State::apply_schedule(const FunctionDAG &dag, int hardware_parallelism, con continue; } - if (p.first->node->is_input) { continue; -} + if (p.first->node->is_input) { + continue; + } Stage stage(p.first->stage); @@ -1215,8 +1219,9 @@ void State::apply_schedule(const 
FunctionDAG &dag, int hardware_parallelism, con } for (auto &p : state_map) { - if (p.first->node->is_input) { continue; -} + if (p.first->node->is_input) { + continue; + } // Dump the schedule source string src << p.first->name diff --git a/src/autoschedulers/anderson2021/cost_model_generator.cpp b/src/autoschedulers/anderson2021/cost_model_generator.cpp index e8b5e84cfc75..8deb2aef807e 100644 --- a/src/autoschedulers/anderson2021/cost_model_generator.cpp +++ b/src/autoschedulers/anderson2021/cost_model_generator.cpp @@ -23,7 +23,7 @@ struct ModelWeight : public GeneratorInput> { ModelWeight(const std::string &name, int dim) : GeneratorInput>(name, dim) { } - void backprop(const Derivative &d, const Expr& learning_rate, const Expr& timestep) { + void backprop(const Derivative &d, const Expr &learning_rate, const Expr &timestep) { } void set_shape(int s0 = 0, int s1 = 0, int s2 = 0) { if (s0) { @@ -45,11 +45,11 @@ struct ModelWeight : public GeneratorInput> { ModelWeight(const std::string &name, int dim) : GeneratorInput>(name, dim), grad("updated_" + name, dim + 1) { } - void backprop(const Derivative &d, Expr learning_rate, const Expr& timestep) { + void backprop(const Derivative &d, Expr learning_rate, const Expr &timestep) { std::vector args(dimensions() + 1); for (auto &e : args) { e = Var(); -} + } grad(args) = undef(); // We'll report back the new weights and the loss gradients, @@ -178,14 +178,14 @@ class CostModel : public Generator> { Output> loss_output{"loss_output", 0}; // Zero pad alone the last dimension of a Func - Func pad_stages(const Func& f, Expr stages) { + Func pad_stages(const Func &f, Expr stages) { Halide::Region bounds(f.dimensions()); bounds[1].min = 0; bounds[1].extent = std::move(stages); return BoundaryConditions::constant_exterior(f, cast(f.value().type(), 0), bounds); } - Expr activation(const Expr& e) { + Expr activation(const Expr &e) { // leaky relu return max(e, 0) + min(e, 0) * 1e-10f; } @@ -621,7 +621,7 @@ class CostModel : public
Generator> { const int vec = 8; // A helper function for scheduling conv layers - auto schedule_conv = [&](Func conv, Func relu, const RVar& r_channels) { + auto schedule_conv = [&](Func conv, Func relu, const RVar &r_channels) { Var ci("ci"), wi("wi"); if (!training) { relu diff --git a/src/autoschedulers/anderson2021/retrain_cost_model.cpp b/src/autoschedulers/anderson2021/retrain_cost_model.cpp index e688448d8ee1..89ef78bebff0 100644 --- a/src/autoschedulers/anderson2021/retrain_cost_model.cpp +++ b/src/autoschedulers/anderson2021/retrain_cost_model.cpp @@ -617,12 +617,14 @@ int main(int argc, char **argv) { int good = 0, bad = 0; for (auto &sched : sample.schedules) { auto &ref = sample.schedules[sample.fastest_schedule_hash]; - if (sched.second.prediction[model] == 0) { continue; -} + if (sched.second.prediction[model] == 0) { + continue; + } assert(sched.second.runtimes[0] >= ref.runtimes[0]); float runtime_ratio = sched.second.runtimes[0] / ref.runtimes[0]; - if (runtime_ratio <= 1.3f) { continue; // Within 30% of the runtime of the best -} + if (runtime_ratio <= 1.3f) { + continue; // Within 30% of the runtime of the best + } if (sched.second.prediction[model] >= ref.prediction[model]) { good++; } else { @@ -661,8 +663,9 @@ int main(int argc, char **argv) { for (int model = 0; model < kModels; model++) { std::cout << loss_sum[model] / loss_sum_counter[model] << " "; } - if (kModels > 1) { std::cout << "\n"; -} + if (kModels > 1) { + std::cout << "\n"; + } std::cout << " Rate: "; int best_model = 0; float best_rate = 0; @@ -686,8 +689,9 @@ int main(int argc, char **argv) { } } - if (kModels > 1) { std::cout << "\n"; -} + if (kModels > 1) { + std::cout << "\n"; + } if (!predict_only && samples.count(worst_miss_pipeline_id)) { std::cout << " Worst: " << worst_miss << " " << leaf(samples[worst_miss_pipeline_id].schedules[worst_miss_schedule_id].filename) << " "; } From 937c730cd8f2ea5a558ed8c56eadce94b4976168 Mon Sep 17 00:00:00 2001 From: aekul Date: Tue, 16 
Aug 2022 23:08:50 -0400 Subject: [PATCH 09/63] remove MachineParams and fix parallelism parameter --- src/autoschedulers/anderson2021/Makefile | 1 - src/autoschedulers/anderson2021/autotune_loop.sh | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index b1d36a9fa136..bed3d09c3c94 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -150,7 +150,6 @@ demo: $(BIN)/$(HL_TARGET)/demo.rungen $(BIN)/libautoschedule_anderson2021.$(SHAR # demonstrates an autotuning loop # (using $(BIN) and $(SRC) here seems overkill, but makes copy-n-paste elsewhere easier) autotune: $(GENERATOR_BIN)/demo.generator $(BIN)/featurization_to_sample $(BIN)/get_host_target $(BIN)/retrain_cost_model $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) $(SRC)/autotune_loop.sh - HL_MACHINE_PARAMS=80,1,1 \ SAMPLES_DIR=test_autotuned_samples \ bash $(SRC)/autotune_loop.sh \ $(GENERATOR_BIN)/demo.generator \ diff --git a/src/autoschedulers/anderson2021/autotune_loop.sh b/src/autoschedulers/anderson2021/autotune_loop.sh index 5e3196afdebd..fc627030cfea 100644 --- a/src/autoschedulers/anderson2021/autotune_loop.sh +++ b/src/autoschedulers/anderson2021/autotune_loop.sh @@ -203,7 +203,7 @@ make_featurization() { ${EXTRA_GENERATOR_ARGS} \ -p ${AUTOSCHED_BIN}/libautoschedule_anderson2021.so \ autoscheduler=Anderson2021 \ - autoscheduler.parallelism=${HARDWARE_PARALLELISM} + autoscheduler.parallelism=${NUM_CORES} 2> ${D}/compile_err.txt > ${D}/compile_log.txt" FAILED=0 From bb571dd2142b5add31d1504e1956429a431dfa9e Mon Sep 17 00:00:00 2001 From: aekul Date: Thu, 18 Aug 2022 01:08:45 -0400 Subject: [PATCH 10/63] fix test --- src/autoschedulers/anderson2021/Makefile | 9 ++++++++- src/autoschedulers/anderson2021/test/test.h | 9 +++++++++ .../anderson2021/test/thread_info.cpp | 14 ++------------ 3 files changed, 19 insertions(+), 13 deletions(-) diff --git 
a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index bed3d09c3c94..977fb7138ffc 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -203,7 +203,7 @@ build: $(BIN)/$(HL_TARGET)/test \ $(BIN)/retrain_cost_model \ $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) -test: test_bounds test_tiling test_storage_strides test_parser test_state run_test test_perfect_hash_map test_function_dag demo included_schedule_file autotune +test: test_bounds test_tiling test_storage_strides test_parser test_state test_thread_info run_test test_perfect_hash_map test_function_dag demo included_schedule_file autotune TEST_DIR=$(SRC)/test @@ -242,6 +242,13 @@ $(BIN)/test_state: $(TEST_DIR)/state.cpp State.h State.cpp LoopNest.h LoopNest.c test_state: $(BIN)/test_state $^ +$(BIN)/test_thread_info: $(TEST_DIR)/thread_info.cpp LoopNest.h LoopNest.cpp FunctionDAG.cpp FunctionDAG.h ASLog.h ASLog.cpp GPULoopInfo.cpp GPULoopInfo.h GPUMemInfo.h Tiling.h Tiling.cpp + @mkdir -p $(@D) + $(CXX) $(OPTIMIZE) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) -I$(SRC) + +test_thread_info: $(BIN)/test_thread_info + $^ + clean: rm -rf $(BIN) diff --git a/src/autoschedulers/anderson2021/test/test.h b/src/autoschedulers/anderson2021/test/test.h index a5489de57837..e43b390f9d5a 100644 --- a/src/autoschedulers/anderson2021/test/test.h +++ b/src/autoschedulers/anderson2021/test/test.h @@ -9,6 +9,7 @@ namespace Autoscheduler { #define user_assert(c) _halide_internal_assertion(c, Halide::Internal::ErrorReport::User) #define EXPECT_EQ(expected, actual) expect_eq(__LINE__, expected, actual) +#define APPROX_EQ(expected, actual, epsilon) approx_eq(__LINE__, expected, actual, epsilon) #define EXPECT(expected) expect(__LINE__, expected) template @@ -19,6 +20,14 @@ void expect_eq(int line, const A &expected, const B &actual) { << "\nActual value = " << actual; } +template +void 
approx_eq(int line, const A &expected, const B &actual, float epsilon) { + user_assert(std::abs(expected - actual) < epsilon) + << "Assert failed on line " << line << "." + << "\nExpected value = " << expected + << "\nActual value = " << actual; +} + template void expect(int line, const A &expected) { user_assert(expected) diff --git a/src/autoschedulers/anderson2021/test/thread_info.cpp b/src/autoschedulers/anderson2021/test/thread_info.cpp index 697abea28bf5..0cab2ed95757 100644 --- a/src/autoschedulers/anderson2021/test/thread_info.cpp +++ b/src/autoschedulers/anderson2021/test/thread_info.cpp @@ -1,6 +1,7 @@ #include "LoopNest.h" #include "ThreadInfo.h" #include "test.h" +#include using namespace Halide; using namespace Halide::Internal; @@ -24,9 +25,6 @@ void test_thread_info() { size.push_back(16); size.push_back(8); - //loop_extents.push_back(16); - //loop_extents.push_back(8); - // 16x8 max_thread_counts.push_back(16); max_thread_counts.push_back(8); @@ -53,10 +51,6 @@ void test_thread_info() { // Smaller stage: its loop is smaller than the max thread loop and // cannot possibly achieve better utilization - //loop_extents.clear(); - //loop_extents.push_back(8); - //loop_extents.push_back(8); - { ThreadInfo info{vectorized_loop_index, size, loop, max_thread_counts}; EXPECT_EQ(64, info.num_threads); @@ -67,10 +61,6 @@ void test_thread_info() { size.push_back(11); size.push_back(11); size.push_back(2); - //loop_extents.clear(); - //loop_extents.push_back(11); - //loop_extents.push_back(11); - //loop_extents.push_back(2); max_thread_counts.clear(); max_thread_counts.push_back(16); max_thread_counts.push_back(16); @@ -80,7 +70,7 @@ void test_thread_info() { { ThreadInfo info{vectorized_loop_index, size, loop, max_thread_counts}; EXPECT_EQ(242, info.num_threads); - EXPECT_EQ(0.472656, info.warp_lane_utilization()); + APPROX_EQ(0.630208, info.warp_lane_utilization(), 0.00001); } } } From bddfc57301ccfd98700692db5b1f91dcfd336d15 Mon Sep 17 00:00:00 2001 From: aekul 
Date: Thu, 18 Aug 2022 01:26:49 -0400 Subject: [PATCH 11/63] add CMakeLists.txt --- .../anderson2021/CMakeLists.txt | 203 ++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100644 src/autoschedulers/anderson2021/CMakeLists.txt diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt new file mode 100644 index 000000000000..19cfcdb1c657 --- /dev/null +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -0,0 +1,203 @@ +## +# Resources for the autoscheduler library +## + +add_compile_definitions(HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API) + +include_directories("${Halide_BINARY_DIR}/include/") +include_directories("${Halide_SOURCE_DIR}/src/autoschedulers/common/") +include_directories("${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021/") + +# weights +set(WF_CPP baseline.cpp) +configure_file(baseline.weights baseline.weights COPYONLY) +add_custom_command(OUTPUT ${WF_CPP} + COMMAND binary2cpp baseline_weights < baseline.weights > ${WF_CPP} + DEPENDS baseline.weights binary2cpp + VERBATIM) + +# cost_model, train_cost_model +add_executable(anderson2021-cost_model.generator cost_model_generator.cpp) +target_link_libraries(cost_model.generator PRIVATE Halide::Halide Halide::Generator) + +add_halide_library(anderson2021-cost_model FROM cost_model.generator + TARGETS cmake) +add_halide_library(anderson2021-train_cost_model FROM cost_model.generator + TARGETS cmake + USE_RUNTIME cost_model.runtime) + +## retrain_cost_model +add_executable(anderson2021-retrain_cost_model + DefaultCostModel.cpp + Weights.cpp + retrain_cost_model.cpp + ${WF_CPP}) +target_link_libraries(retrain_cost_model PRIVATE ASLog cost_model train_cost_model Halide::Halide Halide::Plugin) + +### +## Main autoscheduler library +### + +add_autoscheduler(NAME Anderson2021 + SOURCES + AutoSchedule.cpp + DefaultCostModel.cpp + FunctionDAG.cpp + GPULoopInfo.cpp + LoopNest.cpp + SearchSpace.cpp + State.cpp + Tiling.cpp + Weights.cpp + ${WF_CPP}) + 
+target_link_libraries(Halide_Anderson2021 PRIVATE ASLog ParamParser cost_model train_cost_model) + +## +# Tests and demos +# TODO(#4053): move these to a separate folder since they're tests. +## + +# ================================================================= + +add_executable(anderson2021-demo.generator demo_generator.cpp) +target_link_libraries(demo.generator PRIVATE Halide::Generator) + +add_halide_library(anderson2021-demo FROM demo.generator + TARGETS cmake + AUTOSCHEDULER Halide::Anderson2021 + REGISTRATION DEMO_REGISTRATION_FILE) + +add_executable(anderson2021-demo_apps_autoscheduler ${DEMO_REGISTRATION_FILE}) +target_link_libraries(demo_apps_autoscheduler PRIVATE demo Halide::RunGenMain) + +add_test(NAME demo_apps_autoscheduler + COMMAND demo_apps_autoscheduler --benchmarks=all --benchmark_min_time=1 --estimate_all) + +set_tests_properties(demo_apps_autoscheduler + PROPERTIES + LABELS Anderson2021 + ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + +## ================================================================= + +add_executable(anderson2021-included_schedule_file.generator included_schedule_file_generator.cpp) +target_link_libraries(included_schedule_file.generator PRIVATE Halide::Generator) + +add_halide_library(anderson2021-included_schedule_file FROM included_schedule_file.generator + TARGETS cmake + AUTOSCHEDULER Halide::Anderson2021 + REGISTRATION included_schedule_reg) + +add_executable(anderson2021-demo_included_schedule_file ${included_schedule_reg}) +target_link_libraries(demo_included_schedule_file PRIVATE included_schedule_file Halide::RunGenMain) + +add_test(NAME demo_included_schedule_file + COMMAND demo_included_schedule_file --benchmarks=all --benchmark_min_time=1 --estimate_all) + +set_tests_properties(demo_included_schedule_file + PROPERTIES + LABELS Anderson2021 + ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + +## ==================================================== +## Auto-tuning support utilities. 
+## TODO(#4053): implement auto-tuning support in CMake? + +add_executable(anderson2021-featurization_to_sample featurization_to_sample.cpp) + +add_executable(anderson2021-get_host_target get_host_target.cpp) +target_link_libraries(get_host_target PRIVATE Halide::Halide) + +add_executable(anderson2021-weightsdir_to_weightsfile weightsdir_to_weightsfile.cpp Weights.cpp) +target_link_libraries(weightsdir_to_weightsfile PRIVATE Halide::Runtime) + +# ================================================================= +# Smaller tests + +if (BUILD_SHARED_LIBS) + add_executable(anderson2021-test_apps_autoscheduler test.cpp) + target_link_libraries(test_apps_autoscheduler PRIVATE Halide::Halide Halide::Tools ${CMAKE_DL_LIBS}) + + add_test(NAME test_apps_autoscheduler + COMMAND test_apps_autoscheduler $) + + set_tests_properties(test_apps_autoscheduler PROPERTIES + LABELS "Anderson2021;multithreaded" + ENVIRONMENT "LD_LIBRARY_PATH=$:$ENV{LD_LIBRARY_PATH};HL_TARGET=${Halide_TARGET}") +endif () + +## + +add_executable(anderson2021-test_perfect_hash_map test_perfect_hash_map.cpp) + +add_test(NAME test_perfect_hash_map COMMAND test_perfect_hash_map) +set_tests_properties(test_perfect_hash_map + PROPERTIES + LABELS Anderson2021 + ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + +## + +add_executable(anderson2021-test_function_dag test_function_dag.cpp FunctionDAG.cpp) +target_link_libraries(anderson2021-test_function_dag PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) + +add_test(NAME test_function_dag COMMAND test_function_dag) +set_tests_properties(test_function_dag + PROPERTIES + LABELS Anderson2021 + ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + +add_executable(anderson2021-test_bounds test/bounds.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp Tiling.cpp) +target_link_libraries(anderson2021-test_bounds PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) + +add_test(NAME anderson2021-test_bounds COMMAND test_bounds) +set_tests_properties(anderson2021-test_bounds + 
PROPERTIES + LABELS Anderson2021 + ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + +add_executable(anderson2021-test_parser test/parser.cpp ASLog.cpp) +target_link_libraries(anderson2021-test_parser PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) + +add_test(NAME test_parser COMMAND anderson2021-test_parser) +set_tests_properties(test_parser + PROPERTIES + LABELS Anderson2021 + ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + +add_executable(anderson2021-test_state test/state.cpp ASLog.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp State.cpp Tiling.cpp) +target_link_libraries(anderson2021-test_state PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) + +add_test(NAME test_state COMMAND anderson2021-test_state) +set_tests_properties(test_state + PROPERTIES + LABELS Anderson2021 + ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + +add_executable(anderson2021-test_storage_strides test/storage_strides.cpp ASLog.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp State.cpp Tiling.cpp) +target_link_libraries(anderson2021-test_storage_strides PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) + +add_test(NAME test_storage_strides COMMAND anderson2021-test_storage_strides) +set_tests_properties(test_storage_strides + PROPERTIES + LABELS Anderson2021 + ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + +add_executable(anderson2021-test_thread_info test/thread_info.cpp LoopNest.cpp) +target_link_libraries(anderson2021-test_thread_info PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) + +add_test(NAME test_thread_info COMMAND anderson2021-test_thread_info) +set_tests_properties(test_thread_info + PROPERTIES + LABELS Anderson2021 + ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + +add_executable(anderson2021-test_tiling test/tiling.cpp Tiling.cpp) +target_link_libraries(anderson2021-test_tiling PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) + +add_test(NAME test_tiling COMMAND anderson2021-test_tiling) +set_tests_properties(test_tiling + PROPERTIES + LABELS 
Anderson2021 + ENVIRONMENT "HL_TARGET=${Halide_TARGET}") From 59617c205bbf0ce9be50d0a3aa5ce926c8cf0173 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 19 Aug 2022 01:00:02 -0400 Subject: [PATCH 12/63] remove ASLog.h/cpp --- src/autoschedulers/anderson2021/ASLog.cpp | 54 ------------------- src/autoschedulers/anderson2021/ASLog.h | 37 ------------- .../anderson2021/CMakeLists.txt | 6 +-- src/autoschedulers/anderson2021/Makefile | 16 +++--- 4 files changed, 11 insertions(+), 102 deletions(-) delete mode 100644 src/autoschedulers/anderson2021/ASLog.cpp delete mode 100644 src/autoschedulers/anderson2021/ASLog.h diff --git a/src/autoschedulers/anderson2021/ASLog.cpp b/src/autoschedulers/anderson2021/ASLog.cpp deleted file mode 100644 index 51ccd9cf72c1..000000000000 --- a/src/autoschedulers/anderson2021/ASLog.cpp +++ /dev/null @@ -1,54 +0,0 @@ -#include "ASLog.h" - -namespace Halide { -namespace Internal { - -namespace { - -std::string get_env_variable(char const *env_var_name) { - if (!env_var_name) { - return ""; - } - -#ifdef _MSC_VER - // call getenv_s without a buffer to determine the correct string length: - size_t length = 0; - if ((getenv_s(&length, NULL, 0, env_var_name) != 0) || (length == 0)) { - return ""; - } - // call it again to retrieve the value of the environment variable; - // note that 'length' already accounts for the null-terminator - std::string lvl(length - 1, '@'); - size_t read = 0; - if ((getenv_s(&read, &lvl[0], length, env_var_name) != 0) || (read != length)) { - return ""; - } - return lvl; -#else - char *lvl = getenv(env_var_name); - if (lvl) { - return std::string(lvl); - } -#endif - - return ""; -} - -} // namespace - -int aslog::aslog_level() { - static int cached_aslog_level = ([]() -> int { - // If HL_DEBUG_AUTOSCHEDULE is defined, use that value. - std::string lvl = get_env_variable("HL_DEBUG_AUTOSCHEDULE"); - if (!lvl.empty()) { - return atoi(lvl.c_str()); - } - // Otherwise, use HL_DEBUG_CODEGEN. 
- lvl = get_env_variable("HL_DEBUG_CODEGEN"); - return !lvl.empty() ? atoi(lvl.c_str()) : 0; - })(); - return cached_aslog_level; -} - -} // namespace Internal -} // namespace Halide diff --git a/src/autoschedulers/anderson2021/ASLog.h b/src/autoschedulers/anderson2021/ASLog.h deleted file mode 100644 index 9ba9844ce342..000000000000 --- a/src/autoschedulers/anderson2021/ASLog.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef ASLOG_H -#define ASLOG_H - -// This class is used by train_cost_model, which doesn't link to -// libHalide, so (despite the namespace) we are better off not -// including Halide.h, lest we reference something we won't have available - -#include -#include -#include - -namespace Halide { -namespace Internal { - -class aslog { - const bool logging; - -public: - aslog(int verbosity) - : logging(verbosity <= aslog_level()) { - } - - template - aslog &operator<<(T &&x) { - if (logging) { - std::cerr << std::forward(x); - } - return *this; - } - - static int aslog_level(); -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt index 19cfcdb1c657..354ed028a0db 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -157,7 +157,7 @@ set_tests_properties(anderson2021-test_bounds LABELS Anderson2021 ENVIRONMENT "HL_TARGET=${Halide_TARGET}") -add_executable(anderson2021-test_parser test/parser.cpp ASLog.cpp) +add_executable(anderson2021-test_parser test/parser.cpp) target_link_libraries(anderson2021-test_parser PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) add_test(NAME test_parser COMMAND anderson2021-test_parser) @@ -166,7 +166,7 @@ set_tests_properties(test_parser LABELS Anderson2021 ENVIRONMENT "HL_TARGET=${Halide_TARGET}") -add_executable(anderson2021-test_state test/state.cpp ASLog.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp State.cpp Tiling.cpp) 
+add_executable(anderson2021-test_state test/state.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp State.cpp Tiling.cpp) target_link_libraries(anderson2021-test_state PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) add_test(NAME test_state COMMAND anderson2021-test_state) @@ -175,7 +175,7 @@ set_tests_properties(test_state LABELS Anderson2021 ENVIRONMENT "HL_TARGET=${Halide_TARGET}") -add_executable(anderson2021-test_storage_strides test/storage_strides.cpp ASLog.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp State.cpp Tiling.cpp) +add_executable(anderson2021-test_storage_strides test/storage_strides.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp State.cpp Tiling.cpp) target_link_libraries(anderson2021-test_storage_strides PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) add_test(NAME test_storage_strides COMMAND anderson2021-test_storage_strides) diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index 977fb7138ffc..3c57c4f99ed2 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -64,7 +64,7 @@ $(BIN)/cost_model/%.a: $(BIN)/cost_model.generator # undefined rather than dependent on libHalide.so. 
$(BIN)/libautoschedule_anderson2021.$(SHARED_EXT): $(SRC)/AutoSchedule.cpp \ $(SRC)/AutoSchedule.h \ - $(SRC)/ASLog.cpp \ + $(COMMON_DIR)/ASLog.cpp \ $(SRC)/DefaultCostModel.h \ $(SRC)/DefaultCostModel.cpp \ $(SRC)/Weights.h \ @@ -97,7 +97,7 @@ $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT): $(SRC)/AutoSchedule.cpp \ $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) -I $(BIN)/cost_model $(filter-out %.h $(LIBHALIDE_LDFLAGS),$^) -o $@ $(HALIDE_SYSTEM_LIBS) $(BIN)/retrain_cost_model: $(SRC)/retrain_cost_model.cpp \ - $(SRC)/ASLog.cpp \ + $(COMMON_DIR)/ASLog.cpp \ $(SRC)/DefaultCostModel.h \ $(SRC)/DefaultCostModel.cpp \ $(SRC)/Weights.h \ @@ -163,7 +163,7 @@ $(BIN)/test_perfect_hash_map: test_perfect_hash_map.cpp PerfectHashMap.h @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $< -o $@ -$(BIN)/test_function_dag: test_function_dag.cpp FunctionDAG.h FunctionDAG.cpp ASLog.h ASLog.cpp +$(BIN)/test_function_dag: test_function_dag.cpp FunctionDAG.h FunctionDAG.cpp $(COMMON_DIR)/ASLog.h $(COMMON_DIR)/ASLog.cpp @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) @@ -207,7 +207,7 @@ test: test_bounds test_tiling test_storage_strides test_parser test_state test_t TEST_DIR=$(SRC)/test -$(BIN)/test_bounds: $(TEST_DIR)/bounds.cpp LoopNest.h LoopNest.cpp FunctionDAG.cpp FunctionDAG.h ASLog.h ASLog.cpp GPULoopInfo.cpp GPULoopInfo.h GPUMemInfo.h Tiling.h Tiling.cpp +$(BIN)/test_bounds: $(TEST_DIR)/bounds.cpp LoopNest.h LoopNest.cpp FunctionDAG.cpp FunctionDAG.h $(COMMON_DIR)/ASLog.h $(COMMON_DIR)/ASLog.cpp GPULoopInfo.cpp GPULoopInfo.h GPUMemInfo.h Tiling.h Tiling.cpp @mkdir -p $(@D) $(CXX) $(OPTIMIZE) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) -I$(SRC) @@ -221,28 +221,28 @@ $(BIN)/test_tiling: $(TEST_DIR)/tiling.cpp Tiling.h Tiling.cpp test_tiling: $(BIN)/test_tiling $^ 
-$(BIN)/test_storage_strides: $(TEST_DIR)/storage_strides.cpp LoopNest.h LoopNest.cpp FunctionDAG.cpp FunctionDAG.h ASLog.h ASLog.cpp GPULoopInfo.cpp GPULoopInfo.h GPUMemInfo.h Tiling.h Tiling.cpp +$(BIN)/test_storage_strides: $(TEST_DIR)/storage_strides.cpp LoopNest.h LoopNest.cpp FunctionDAG.cpp FunctionDAG.h $(COMMON_DIR)/ASLog.h $(COMMON_DIR)/ASLog.cpp GPULoopInfo.cpp GPULoopInfo.h GPUMemInfo.h Tiling.h Tiling.cpp @mkdir -p $(@D) $(CXX) $(OPTIMIZE) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) -I$(SRC) test_storage_strides: $(BIN)/test_storage_strides $^ -$(BIN)/test_parser: $(TEST_DIR)/parser.cpp LoopNestParser.h ASLog.h ASLog.cpp +$(BIN)/test_parser: $(TEST_DIR)/parser.cpp LoopNestParser.h $(COMMON_DIR)/ASLog.h $(COMMON_DIR)/ASLog.cpp @mkdir -p $(@D) $(CXX) $(OPTIMIZE) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) -I$(SRC) test_parser: $(BIN)/test_parser $^ -$(BIN)/test_state: $(TEST_DIR)/state.cpp State.h State.cpp LoopNest.h LoopNest.cpp FunctionDAG.cpp FunctionDAG.h ASLog.h ASLog.cpp GPULoopInfo.cpp GPULoopInfo.h GPUMemInfo.h Tiling.h Tiling.cpp +$(BIN)/test_state: $(TEST_DIR)/state.cpp State.h State.cpp LoopNest.h LoopNest.cpp FunctionDAG.cpp FunctionDAG.h $(COMMON_DIR)/ASLog.h $(COMMON_DIR)/ASLog.cpp GPULoopInfo.cpp GPULoopInfo.h GPUMemInfo.h Tiling.h Tiling.cpp @mkdir -p $(@D) $(CXX) $(OPTIMIZE) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) -I$(SRC) test_state: $(BIN)/test_state $^ -$(BIN)/test_thread_info: $(TEST_DIR)/thread_info.cpp LoopNest.h LoopNest.cpp FunctionDAG.cpp FunctionDAG.h ASLog.h ASLog.cpp GPULoopInfo.cpp GPULoopInfo.h GPUMemInfo.h Tiling.h Tiling.cpp +$(BIN)/test_thread_info: $(TEST_DIR)/thread_info.cpp LoopNest.h LoopNest.cpp FunctionDAG.cpp FunctionDAG.h $(COMMON_DIR)/ASLog.h $(COMMON_DIR)/ASLog.cpp GPULoopInfo.cpp GPULoopInfo.h GPUMemInfo.h Tiling.h Tiling.cpp 
@mkdir -p $(@D) $(CXX) $(OPTIMIZE) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) -I$(SRC) From ec5a6ee102cfd5cb4d2456a34a93c564dea29a23 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 19 Aug 2022 01:13:48 -0400 Subject: [PATCH 13/63] move PerfectHashMap.h to common/ --- src/autoschedulers/adams2019/Makefile | 4 ++-- src/autoschedulers/anderson2021/Makefile | 4 ++-- src/autoschedulers/{anderson2021 => common}/PerfectHashMap.h | 0 3 files changed, 4 insertions(+), 4 deletions(-) rename src/autoschedulers/{anderson2021 => common}/PerfectHashMap.h (100%) diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index c63b95c0c856..7bf5b838da15 100644 --- a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -74,7 +74,7 @@ $(BIN)/libautoschedule_adams2019.$(SHARED_EXT): $(SRC)/AutoSchedule.cpp \ $(SRC)/State.h \ $(SRC)/State.cpp \ $(SRC)/Timer.h \ - $(SRC)/PerfectHashMap.h \ + $(COMMON_DIR)/PerfectHashMap.h \ $(AUTOSCHED_WEIGHT_OBJECTS) \ $(AUTOSCHED_COST_MODEL_LIBS) \ $(GENERATOR_DEPS) \ @@ -144,7 +144,7 @@ autotune: $(GENERATOR_BIN)/demo.generator $(BIN)/featurization_to_sample $(BIN)/ $(HALIDE_DISTRIB_PATH) \ $(BIN)/samples -$(BIN)/test_perfect_hash_map: $(SRC)/test_perfect_hash_map.cpp $(SRC)/PerfectHashMap.h +$(BIN)/test_perfect_hash_map: $(SRC)/test_perfect_hash_map.cpp $(COMMON_DIR)/PerfectHashMap.h @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $< -o $@ diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index 3c57c4f99ed2..d150cdb55db6 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -80,7 +80,7 @@ $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT): $(SRC)/AutoSchedule.cpp \ $(SRC)/ThreadInfo.h \ $(SRC)/Featurization.h \ $(SRC)/CostModel.h \ - $(SRC)/PerfectHashMap.h \ + $(COMMON_DIR)/PerfectHashMap.h \ $(SRC)/SearchSpace.h \ $(SRC)/SearchSpace.cpp \ 
$(SRC)/SearchSpaceOptions.h \ @@ -159,7 +159,7 @@ autotune: $(GENERATOR_BIN)/demo.generator $(BIN)/featurization_to_sample $(BIN)/ $(BIN) \ 0 -$(BIN)/test_perfect_hash_map: test_perfect_hash_map.cpp PerfectHashMap.h +$(BIN)/test_perfect_hash_map: test_perfect_hash_map.cpp $(COMMON_DIR)/PerfectHashMap.h @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $< -o $@ diff --git a/src/autoschedulers/anderson2021/PerfectHashMap.h b/src/autoschedulers/common/PerfectHashMap.h similarity index 100% rename from src/autoschedulers/anderson2021/PerfectHashMap.h rename to src/autoschedulers/common/PerfectHashMap.h From 56d5e71b21ee5491da369a5c12b5d23311b3f080 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 19 Aug 2022 01:38:00 -0400 Subject: [PATCH 14/63] move test_function_dag.cpp to common/ --- src/autoschedulers/adams2019/CMakeLists.txt | 4 +- src/autoschedulers/adams2019/Makefile | 2 +- .../adams2019/test_function_dag.cpp | 174 ------------------ .../anderson2021/CMakeLists.txt | 5 +- src/autoschedulers/anderson2021/Makefile | 2 +- .../test_function_dag.cpp | 0 6 files changed, 8 insertions(+), 179 deletions(-) delete mode 100644 src/autoschedulers/adams2019/test_function_dag.cpp rename src/autoschedulers/{anderson2021 => common}/test_function_dag.cpp (100%) diff --git a/src/autoschedulers/adams2019/CMakeLists.txt b/src/autoschedulers/adams2019/CMakeLists.txt index 0d219922bc37..76c106382519 100644 --- a/src/autoschedulers/adams2019/CMakeLists.txt +++ b/src/autoschedulers/adams2019/CMakeLists.txt @@ -2,6 +2,8 @@ # Resources for the autoscheduler library ## +set(COMMON_DIR "${Halide_SOURCE_DIR}/src/autoschedulers/common/") + # weights set(WF_CPP baseline.cpp) configure_file(baseline.weights baseline.weights COPYONLY) @@ -131,7 +133,7 @@ set_tests_properties(test_perfect_hash_map ## -add_executable(test_function_dag test_function_dag.cpp FunctionDAG.cpp) +add_executable(test_function_dag ${COMMON_DIR}/test_function_dag.cpp FunctionDAG.cpp) target_link_libraries(test_function_dag PRIVATE ASLog 
Halide::Halide Halide::Tools Halide::Plugin) add_test(NAME test_function_dag COMMAND test_function_dag) diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index 7bf5b838da15..ba1c17b7f4d1 100644 --- a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -148,7 +148,7 @@ $(BIN)/test_perfect_hash_map: $(SRC)/test_perfect_hash_map.cpp $(COMMON_DIR)/Per @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $< -o $@ -$(BIN)/test_function_dag: $(SRC)/test_function_dag.cpp $(SRC)/FunctionDAG.h $(SRC)/FunctionDAG.cpp $(COMMON_DIR)/ASLog.h $(COMMON_DIR)/ASLog.cpp +$(BIN)/test_function_dag: $(COMMON_DIR)/test_function_dag.cpp $(SRC)/FunctionDAG.h $(SRC)/FunctionDAG.cpp $(COMMON_DIR)/ASLog.h $(COMMON_DIR)/ASLog.cpp @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) diff --git a/src/autoschedulers/adams2019/test_function_dag.cpp b/src/autoschedulers/adams2019/test_function_dag.cpp deleted file mode 100644 index 0b4604b9500d..000000000000 --- a/src/autoschedulers/adams2019/test_function_dag.cpp +++ /dev/null @@ -1,174 +0,0 @@ -#include "Featurization.h" -#include "FunctionDAG.h" -#include "Halide.h" -#include - -using namespace Halide; - -extern "C" int mul_by_two( - halide_buffer_t *input, - halide_buffer_t *output) { - if (input->is_bounds_query()) { - // Bounds query: infer the input dimensions from the output dimensions. In - // this example, the dimensions are exactly the same - for (int i = 0; i < 2; ++i) { - input->dim[i] = output->dim[i]; - } - return 0; - } - - // Actual computation: return 2 times x as an example. The first dimension is - // the innermost, so iterate over it last to avoid inefficient memory access - // patterns. 
- for (int j = 0; j < input->dim[1].extent; ++j) { - for (int i = 0; i < input->dim[0].extent; ++i) { - float *out = (float *)output->host + i * output->dim[0].stride + - j * output->dim[1].stride; - float *in = (float *)input->host + i * input->dim[0].stride + - j * input->dim[1].stride; - (*out) = 2 * (*in); - } - } - return 0; -} - -void test_coeff_wise(const Target &target) { - Var x("x"), y("y"); - - std::ostringstream with_extern; - { - Func f("f"), g("g"), h("h"); - f(x, y) = (x + y) * (x + y); - - Halide::ExternFuncArgument arg = f; - std::vector vars = {x, y}; - Halide::Type input_type = Halide::Float(32); - g.define_extern( - "mul_by_two", - {arg}, - input_type, - vars, - Halide::NameMangling::C); - g.function().extern_definition_proxy_expr() = f(x, y) * 2.0f; - - h(x, y) = g(x, y) * 2 + 1; - - h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); - std::vector v; - v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, target); - - d.dump(with_extern); - } - - std::ostringstream without_extern; - { - Func f("f"), g("g"), h("h"); - f(x, y) = (x + y) * (x + y); - g(x, y) = f(x, y) * 2.0f; - h(x, y) = g(x, y) * 2 + 1; - - h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); - std::vector v; - v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, target); - - d.dump(without_extern); - } - - // Disabled for now: there is still work to do to populate the jacobian - // assert(with_extern.str() == without_extern.str()); -} - -extern "C" int matmul( - halide_buffer_t *input1, - halide_buffer_t *input2, - halide_buffer_t *output) { - if (input1->is_bounds_query() || input2->is_bounds_query()) { - // Bounds query: infer the input dimensions from the output dimensions. - // We leave the k dimension alone since we can't infer it from the output dimensions. 
- input1->dim[0].min = output->dim[0].min; - input1->dim[0].extent = output->dim[0].extent; - input2->dim[1].min = output->dim[1].min; - input2->dim[1].extent = output->dim[1].extent; - return 0; - } - - // Actual computation: return input1 * input2. - const int max_i = output->dim[0].min + output->dim[0].extent; - const int max_j = output->dim[1].min + output->dim[1].extent; - for (int i = output->dim[0].min; i < max_i; ++i) { - for (int j = output->dim[1].min; j < max_j; ++j) { - int pos[2] = {i, j}; - float *out = (float *)output->address_of(pos); - *out = 0.0f; - for (int k = 0; k < input1->dim[1].extent; ++k) { - int pos1[2] = {i, k}; - float *in1 = (float *)input1->address_of(pos1); - int pos2[2] = {k, j}; - float *in2 = (float *)input2->address_of(pos2); - (*out) += (*in1) * (*in2); - } - } - } - return 0; -} - -void test_matmul(const Target &target) { - Var x("x"), y("y"), k("k"); - RDom r(0, 200); - Halide::Buffer input1(200, 200); - Halide::Buffer input2(200, 200); - - std::ostringstream with_extern; - { - Func mm("mm"), h("h"); - - Halide::ExternFuncArgument arg1 = input1; - Halide::ExternFuncArgument arg2 = input2; - std::vector vars = {x, y}; - Halide::Type input_type = Halide::Float(32); - mm.define_extern( - "matmul", - {arg1, arg2}, - {input_type, input_type}, - vars, - Halide::NameMangling::C); - mm.function().extern_definition_proxy_expr() = Halide::sum(input1(x, r) * input2(r, y)); - - h(x, y) = mm(x, y); - - h.set_estimate(x, 0, 200).set_estimate(y, 0, 200); - std::vector v; - v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, target); - - d.dump(with_extern); - } - std::ostringstream without_extern; - { - Func mm("mm"), h("h"); - mm(x, y) = Halide::sum(input1(x, r) * input2(r, y)); - h(x, y) = mm(x, y); - - h.set_estimate(x, 0, 200).set_estimate(y, 0, 200); - std::vector v; - v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, target); - - d.dump(without_extern); - } - - std::cout << 
"with_extern:\n " << with_extern.str() - << "\n\nwithout_extern:\n " << without_extern.str() << "\n"; -} - -int main(int argc, char **argv) { - // Use a fixed target for the analysis to get consistent results from this test. - Target target("x86-64-linux-sse41-avx-avx2"); - - test_coeff_wise(target); - test_matmul(target); - - return 0; -} diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt index 354ed028a0db..4403da5ecdf2 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -4,8 +4,9 @@ add_compile_definitions(HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API) +set(COMMON_DIR "${Halide_SOURCE_DIR}/src/autoschedulers/common/") include_directories("${Halide_BINARY_DIR}/include/") -include_directories("${Halide_SOURCE_DIR}/src/autoschedulers/common/") +include_directories(${COMMON_DIR}) include_directories("${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021/") # weights @@ -139,7 +140,7 @@ set_tests_properties(test_perfect_hash_map ## -add_executable(anderson2021-test_function_dag test_function_dag.cpp FunctionDAG.cpp) +add_executable(anderson2021-test_function_dag ${COMMON_DIR}/test_function_dag.cpp FunctionDAG.cpp) target_link_libraries(anderson2021-test_function_dag PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) add_test(NAME test_function_dag COMMAND test_function_dag) diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index d150cdb55db6..9d29b1d5e4a3 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -163,7 +163,7 @@ $(BIN)/test_perfect_hash_map: test_perfect_hash_map.cpp $(COMMON_DIR)/PerfectHas @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $< -o $@ -$(BIN)/test_function_dag: test_function_dag.cpp FunctionDAG.h FunctionDAG.cpp $(COMMON_DIR)/ASLog.h $(COMMON_DIR)/ASLog.cpp +$(BIN)/test_function_dag: $(COMMON_DIR)/test_function_dag.cpp FunctionDAG.h 
FunctionDAG.cpp $(COMMON_DIR)/ASLog.h $(COMMON_DIR)/ASLog.cpp @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) diff --git a/src/autoschedulers/anderson2021/test_function_dag.cpp b/src/autoschedulers/common/test_function_dag.cpp similarity index 100% rename from src/autoschedulers/anderson2021/test_function_dag.cpp rename to src/autoschedulers/common/test_function_dag.cpp From 6e68cd496c9f9301641d9ee0f49fff7add3d5194 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 19 Aug 2022 01:41:43 -0400 Subject: [PATCH 15/63] move featurization_to_sample.cpp to common/ --- src/autoschedulers/adams2019/CMakeLists.txt | 4 ++-- src/autoschedulers/adams2019/Makefile | 2 +- src/autoschedulers/anderson2021/CMakeLists.txt | 4 ++-- src/autoschedulers/anderson2021/Makefile | 2 +- .../{anderson2021 => common}/featurization_to_sample.cpp | 0 5 files changed, 6 insertions(+), 6 deletions(-) rename src/autoschedulers/{anderson2021 => common}/featurization_to_sample.cpp (100%) diff --git a/src/autoschedulers/adams2019/CMakeLists.txt b/src/autoschedulers/adams2019/CMakeLists.txt index 76c106382519..d279e2285ecd 100644 --- a/src/autoschedulers/adams2019/CMakeLists.txt +++ b/src/autoschedulers/adams2019/CMakeLists.txt @@ -2,7 +2,7 @@ # Resources for the autoscheduler library ## -set(COMMON_DIR "${Halide_SOURCE_DIR}/src/autoschedulers/common/") +set(COMMON_DIR "${Halide_SOURCE_DIR}/src/autoschedulers/common") # weights set(WF_CPP baseline.cpp) @@ -98,7 +98,7 @@ set_tests_properties(demo_included_schedule_file # Auto-tuning support utilities. # TODO(#4053): implement auto-tuning support in CMake? 
-add_executable(featurization_to_sample featurization_to_sample.cpp) +add_executable(featurization_to_sample ${COMMON_DIR}/featurization_to_sample.cpp) add_executable(get_host_target get_host_target.cpp) target_link_libraries(get_host_target PRIVATE Halide::Halide) diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index ba1c17b7f4d1..f6b0399a30fe 100644 --- a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -96,7 +96,7 @@ $(BIN)/retrain_cost_model: $(SRC)/retrain_cost_model.cpp \ @mkdir -p $(@D) $(CXX) $(CXXFLAGS) -frtti -Wall -I ../support -I $(BIN)/cost_model $(OPTIMIZE) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(USE_OPEN_MP) $(HALIDE_RPATH_FOR_BIN) -$(BIN)/featurization_to_sample: $(SRC)/featurization_to_sample.cpp +$(BIN)/featurization_to_sample: $(COMMON_DIR)/featurization_to_sample.cpp @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $< $(OPTIMIZE) -o $@ diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt index 4403da5ecdf2..ff34b00210c0 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -4,7 +4,7 @@ add_compile_definitions(HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API) -set(COMMON_DIR "${Halide_SOURCE_DIR}/src/autoschedulers/common/") +set(COMMON_DIR "${Halide_SOURCE_DIR}/src/autoschedulers/common") include_directories("${Halide_BINARY_DIR}/include/") include_directories(${COMMON_DIR}) include_directories("${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021/") @@ -105,7 +105,7 @@ set_tests_properties(demo_included_schedule_file ## Auto-tuning support utilities. ## TODO(#4053): implement auto-tuning support in CMake? 
-add_executable(anderson2021-featurization_to_sample featurization_to_sample.cpp) +add_executable(anderson2021-featurization_to_sample ${COMMON_DIR}/featurization_to_sample.cpp) add_executable(anderson2021-get_host_target get_host_target.cpp) target_link_libraries(get_host_target PRIVATE Halide::Halide) diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index 9d29b1d5e4a3..2b2c73ba6157 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -110,7 +110,7 @@ $(BIN)/retrain_cost_model: $(SRC)/retrain_cost_model.cpp \ @mkdir -p $(@D) $(CXX) $(CXXFLAGS) -frtti -Wall -I ../support -I $(BIN)/cost_model $(OPTIMIZE) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(USE_OPEN_MP) -$(BIN)/featurization_to_sample: $(SRC)/featurization_to_sample.cpp +$(BIN)/featurization_to_sample: $(COMMON_DIR)/featurization_to_sample.cpp @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $< $(OPTIMIZE) -o $@ diff --git a/src/autoschedulers/anderson2021/featurization_to_sample.cpp b/src/autoschedulers/common/featurization_to_sample.cpp similarity index 100% rename from src/autoschedulers/anderson2021/featurization_to_sample.cpp rename to src/autoschedulers/common/featurization_to_sample.cpp From 7d3fbb43f790ff482e355d9323db385ac5351562 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 19 Aug 2022 01:43:47 -0400 Subject: [PATCH 16/63] move test_perfect_hash_map.cpp to common/ --- src/autoschedulers/adams2019/CMakeLists.txt | 2 +- src/autoschedulers/adams2019/Makefile | 2 +- src/autoschedulers/anderson2021/CMakeLists.txt | 2 +- src/autoschedulers/anderson2021/Makefile | 2 +- .../{anderson2021 => common}/test_perfect_hash_map.cpp | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename src/autoschedulers/{anderson2021 => common}/test_perfect_hash_map.cpp (100%) diff --git a/src/autoschedulers/adams2019/CMakeLists.txt b/src/autoschedulers/adams2019/CMakeLists.txt index d279e2285ecd..189818905363 100644 --- 
a/src/autoschedulers/adams2019/CMakeLists.txt +++ b/src/autoschedulers/adams2019/CMakeLists.txt @@ -123,7 +123,7 @@ endif () ## -add_executable(test_perfect_hash_map test_perfect_hash_map.cpp) +add_executable(test_perfect_hash_map ${COMMON_DIR}/test_perfect_hash_map.cpp) add_test(NAME test_perfect_hash_map COMMAND test_perfect_hash_map) set_tests_properties(test_perfect_hash_map diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index f6b0399a30fe..1eec1ef0ca04 100644 --- a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -144,7 +144,7 @@ autotune: $(GENERATOR_BIN)/demo.generator $(BIN)/featurization_to_sample $(BIN)/ $(HALIDE_DISTRIB_PATH) \ $(BIN)/samples -$(BIN)/test_perfect_hash_map: $(SRC)/test_perfect_hash_map.cpp $(COMMON_DIR)/PerfectHashMap.h +$(BIN)/test_perfect_hash_map: $(COMMON_DIR)/test_perfect_hash_map.cpp $(COMMON_DIR)/PerfectHashMap.h @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $< -o $@ diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt index ff34b00210c0..7549b1170a79 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -130,7 +130,7 @@ endif () ## -add_executable(anderson2021-test_perfect_hash_map test_perfect_hash_map.cpp) +add_executable(anderson2021-test_perfect_hash_map ${COMMON_DIR}/test_perfect_hash_map.cpp) add_test(NAME test_perfect_hash_map COMMAND test_perfect_hash_map) set_tests_properties(test_perfect_hash_map diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index 2b2c73ba6157..f7e25ee11104 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -159,7 +159,7 @@ autotune: $(GENERATOR_BIN)/demo.generator $(BIN)/featurization_to_sample $(BIN)/ $(BIN) \ 0 -$(BIN)/test_perfect_hash_map: test_perfect_hash_map.cpp $(COMMON_DIR)/PerfectHashMap.h 
+$(BIN)/test_perfect_hash_map: $(COMMON_DIR)/test_perfect_hash_map.cpp $(COMMON_DIR)/PerfectHashMap.h @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $< -o $@ diff --git a/src/autoschedulers/anderson2021/test_perfect_hash_map.cpp b/src/autoschedulers/common/test_perfect_hash_map.cpp similarity index 100% rename from src/autoschedulers/anderson2021/test_perfect_hash_map.cpp rename to src/autoschedulers/common/test_perfect_hash_map.cpp From ae9b216f49d5c567ac49d579b967e966693b3244 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 19 Aug 2022 01:45:30 -0400 Subject: [PATCH 17/63] remove Errors.h --- src/autoschedulers/anderson2021/Errors.h | 26 ------------------------ 1 file changed, 26 deletions(-) delete mode 100644 src/autoschedulers/anderson2021/Errors.h diff --git a/src/autoschedulers/anderson2021/Errors.h b/src/autoschedulers/anderson2021/Errors.h deleted file mode 100644 index 0057b2fbc3a9..000000000000 --- a/src/autoschedulers/anderson2021/Errors.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef ERRORS_H -#define ERRORS_H - -#include "Halide.h" - -#ifndef user_error -#define user_error Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, Halide::Internal::ErrorReport::User) -#endif - -#ifndef user_warning -#define user_warning Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, Halide::Internal::ErrorReport::User | Halide::Internal::ErrorReport::Warning) -#endif - -#ifndef user_assert -#define user_assert(c) _halide_internal_assertion(c, Halide::Internal::ErrorReport::User) -#endif - -#ifndef internal_assert -#define internal_assert(c) _halide_internal_assertion(c, 0) -#endif - -#ifndef internal_error -#define internal_error Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, 0) -#endif - -#endif From d7c7e5a889ab7351cfb462e9eb9f0b11e539f5eb Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 19 Aug 2022 01:47:44 -0400 Subject: [PATCH 18/63] move get_host_target.cpp to common/ --- src/autoschedulers/adams2019/CMakeLists.txt | 2 +- 
src/autoschedulers/adams2019/Makefile | 2 +- src/autoschedulers/anderson2021/CMakeLists.txt | 2 +- src/autoschedulers/anderson2021/Makefile | 2 +- src/autoschedulers/{anderson2021 => common}/get_host_target.cpp | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename src/autoschedulers/{anderson2021 => common}/get_host_target.cpp (100%) diff --git a/src/autoschedulers/adams2019/CMakeLists.txt b/src/autoschedulers/adams2019/CMakeLists.txt index 189818905363..fae384761293 100644 --- a/src/autoschedulers/adams2019/CMakeLists.txt +++ b/src/autoschedulers/adams2019/CMakeLists.txt @@ -100,7 +100,7 @@ set_tests_properties(demo_included_schedule_file add_executable(featurization_to_sample ${COMMON_DIR}/featurization_to_sample.cpp) -add_executable(get_host_target get_host_target.cpp) +add_executable(get_host_target ${COMMON_DIR}/get_host_target.cpp) target_link_libraries(get_host_target PRIVATE Halide::Halide) add_executable(weightsdir_to_weightsfile weightsdir_to_weightsfile.cpp Weights.cpp) diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index 1eec1ef0ca04..109e2db49014 100644 --- a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -100,7 +100,7 @@ $(BIN)/featurization_to_sample: $(COMMON_DIR)/featurization_to_sample.cpp @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $< $(OPTIMIZE) -o $@ -$(BIN)/get_host_target: $(SRC)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE_DISTRIB_PATH)/include/Halide.h +$(BIN)/get_host_target: $(COMMON_DIR)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE_DISTRIB_PATH)/include/Halide.h @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) $(LIBHALIDE_LDFLAGS) $(OPTIMIZE) -o $@ $(HALIDE_RPATH_FOR_BIN) $(BIN)/weightsdir_to_weightsfile: $(SRC)/weightsdir_to_weightsfile.cpp $(SRC)/Weights.cpp diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt index 7549b1170a79..905f68d68a5e 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt 
+++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -107,7 +107,7 @@ set_tests_properties(demo_included_schedule_file add_executable(anderson2021-featurization_to_sample ${COMMON_DIR}/featurization_to_sample.cpp) -add_executable(anderson2021-get_host_target get_host_target.cpp) +add_executable(anderson2021-get_host_target ${COMMON_DIR}/get_host_target.cpp) target_link_libraries(get_host_target PRIVATE Halide::Halide) add_executable(anderson2021-weightsdir_to_weightsfile weightsdir_to_weightsfile.cpp Weights.cpp) diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index f7e25ee11104..5b1113a8b644 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -114,7 +114,7 @@ $(BIN)/featurization_to_sample: $(COMMON_DIR)/featurization_to_sample.cpp @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $< $(OPTIMIZE) -o $@ -$(BIN)/get_host_target: $(SRC)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE_DISTRIB_PATH)/include/Halide.h +$(BIN)/get_host_target: $(COMMON_DIR)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE_DISTRIB_PATH)/include/Halide.h @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) $(LIBHALIDE_LDFLAGS) $(OPTIMIZE) -o $@ diff --git a/src/autoschedulers/anderson2021/get_host_target.cpp b/src/autoschedulers/common/get_host_target.cpp similarity index 100% rename from src/autoschedulers/anderson2021/get_host_target.cpp rename to src/autoschedulers/common/get_host_target.cpp From b529aeb06e7619352bd9b25f0fe3b7ab7c12d682 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 19 Aug 2022 01:50:19 -0400 Subject: [PATCH 19/63] move weightsdir_to_weightsfile.cpp to common/ --- src/autoschedulers/adams2019/CMakeLists.txt | 2 +- src/autoschedulers/adams2019/Makefile | 2 +- src/autoschedulers/anderson2021/CMakeLists.txt | 2 +- src/autoschedulers/anderson2021/Makefile | 2 +- .../{anderson2021 => common}/weightsdir_to_weightsfile.cpp | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename 
src/autoschedulers/{anderson2021 => common}/weightsdir_to_weightsfile.cpp (100%) diff --git a/src/autoschedulers/adams2019/CMakeLists.txt b/src/autoschedulers/adams2019/CMakeLists.txt index fae384761293..b1903d85b233 100644 --- a/src/autoschedulers/adams2019/CMakeLists.txt +++ b/src/autoschedulers/adams2019/CMakeLists.txt @@ -103,7 +103,7 @@ add_executable(featurization_to_sample ${COMMON_DIR}/featurization_to_sample.cpp add_executable(get_host_target ${COMMON_DIR}/get_host_target.cpp) target_link_libraries(get_host_target PRIVATE Halide::Halide) -add_executable(weightsdir_to_weightsfile weightsdir_to_weightsfile.cpp Weights.cpp) +add_executable(weightsdir_to_weightsfile ${COMMON_DIR}/weightsdir_to_weightsfile.cpp Weights.cpp) target_link_libraries(weightsdir_to_weightsfile PRIVATE Halide::Runtime) # ================================================================= diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index 109e2db49014..ff3ecc82ac9c 100644 --- a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -103,7 +103,7 @@ $(BIN)/featurization_to_sample: $(COMMON_DIR)/featurization_to_sample.cpp $(BIN)/get_host_target: $(COMMON_DIR)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE_DISTRIB_PATH)/include/Halide.h @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) $(LIBHALIDE_LDFLAGS) $(OPTIMIZE) -o $@ $(HALIDE_RPATH_FOR_BIN) -$(BIN)/weightsdir_to_weightsfile: $(SRC)/weightsdir_to_weightsfile.cpp $(SRC)/Weights.cpp +$(BIN)/weightsdir_to_weightsfile: $(COMMON_DIR)/weightsdir_to_weightsfile.cpp $(SRC)/Weights.cpp @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@ diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt index 905f68d68a5e..b47b757d28f6 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -110,7 +110,7 @@ add_executable(anderson2021-featurization_to_sample 
${COMMON_DIR}/featurization_ add_executable(anderson2021-get_host_target ${COMMON_DIR}/get_host_target.cpp) target_link_libraries(get_host_target PRIVATE Halide::Halide) -add_executable(anderson2021-weightsdir_to_weightsfile weightsdir_to_weightsfile.cpp Weights.cpp) +add_executable(anderson2021-weightsdir_to_weightsfile ${COMMON_DIR}/weightsdir_to_weightsfile.cpp Weights.cpp) target_link_libraries(weightsdir_to_weightsfile PRIVATE Halide::Runtime) # ================================================================= diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index 5b1113a8b644..18e9129e1af5 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -118,7 +118,7 @@ $(BIN)/get_host_target: $(COMMON_DIR)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) $(LIBHALIDE_LDFLAGS) $(OPTIMIZE) -o $@ -$(BIN)/weightsdir_to_weightsfile: $(SRC)/weightsdir_to_weightsfile.cpp $(SRC)/Weights.cpp +$(BIN)/weightsdir_to_weightsfile: $(COMMON_DIR)/weightsdir_to_weightsfile.cpp $(SRC)/Weights.cpp @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@ diff --git a/src/autoschedulers/anderson2021/weightsdir_to_weightsfile.cpp b/src/autoschedulers/common/weightsdir_to_weightsfile.cpp similarity index 100% rename from src/autoschedulers/anderson2021/weightsdir_to_weightsfile.cpp rename to src/autoschedulers/common/weightsdir_to_weightsfile.cpp From 0a99054e5377d4bd558d8fdaa9b37e59ad62e7ac Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 19 Aug 2022 01:51:09 -0400 Subject: [PATCH 20/63] remove MACHINE_PARAMS --- src/autoschedulers/anderson2021/AutoSchedule.cpp | 3 --- src/autoschedulers/anderson2021/Makefile | 4 ---- 2 files changed, 7 deletions(-) diff --git a/src/autoschedulers/anderson2021/AutoSchedule.cpp b/src/autoschedulers/anderson2021/AutoSchedule.cpp index 74b701bee937..640d21823d64 100644 --- 
a/src/autoschedulers/anderson2021/AutoSchedule.cpp +++ b/src/autoschedulers/anderson2021/AutoSchedule.cpp @@ -31,9 +31,6 @@ Write out a training featurization for the selected schedule into this file. Needs to be converted to a sample file with the runtime using featurization_to_sample before it can be used to train. - HL_MACHINE_PARAMS - An architecture description string. Used by Halide master to configure the cost model. We only use the first term. Set it to the number of SMs on the target GPU. - HL_PERMIT_FAILED_UNROLL Set to 1 to tell Halide not to freak out if we try to unroll a loop that doesn't have a constant extent. Should generally not be necessary, but sometimes the autoscheduler's model for what will and will not turn into a constant during lowering is inaccurate, because Halide isn't perfect at constant-folding. diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index 18e9129e1af5..7893947c34b5 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -122,10 +122,6 @@ $(BIN)/weightsdir_to_weightsfile: $(COMMON_DIR)/weightsdir_to_weightsfile.cpp $( @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@ -# This is the value that machine_params defaults to if no custom value is specified; -# see MachineParams::generic() -HL_MACHINE_PARAMS ?= 80,25165824,160 - # A sample generator to autoschedule. Note that if it statically links # to libHalide, then it must be build with $(USE_EXPORT_DYNAMIC), or the # autoscheduler can't find the libHalide symbols that it needs. 
From 8a7923b95544b01150f72e90952d05e58829fa04 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 19 Aug 2022 01:56:05 -0400 Subject: [PATCH 21/63] move demo_generator.cpp to common/ --- src/autoschedulers/adams2019/CMakeLists.txt | 2 +- src/autoschedulers/adams2019/Makefile | 2 +- .../adams2019/demo_generator.cpp | 51 ------------------- .../anderson2021/CMakeLists.txt | 2 +- src/autoschedulers/anderson2021/Makefile | 2 +- .../demo_generator.cpp | 0 6 files changed, 4 insertions(+), 55 deletions(-) delete mode 100644 src/autoschedulers/adams2019/demo_generator.cpp rename src/autoschedulers/{anderson2021 => common}/demo_generator.cpp (100%) diff --git a/src/autoschedulers/adams2019/CMakeLists.txt b/src/autoschedulers/adams2019/CMakeLists.txt index b1903d85b233..ff80dd480219 100644 --- a/src/autoschedulers/adams2019/CMakeLists.txt +++ b/src/autoschedulers/adams2019/CMakeLists.txt @@ -54,7 +54,7 @@ target_link_libraries(Halide_Adams2019 PRIVATE ASLog ParamParser cost_model trai # ================================================================= -add_executable(demo.generator demo_generator.cpp) +add_executable(demo.generator ${COMMON_DIR}/demo_generator.cpp) target_link_libraries(demo.generator PRIVATE Halide::Generator) add_halide_library(demo FROM demo.generator diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index ff3ecc82ac9c..19ac584d43a2 100644 --- a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -110,7 +110,7 @@ $(BIN)/weightsdir_to_weightsfile: $(COMMON_DIR)/weightsdir_to_weightsfile.cpp $( # A sample generator to autoschedule. Note that if it statically links # to libHalide, then it must be build with $(USE_EXPORT_DYNAMIC), or the # autoscheduler can't find the libHalide symbols that it needs. 
-$(GENERATOR_BIN)/demo.generator: $(SRC)/demo_generator.cpp $(GENERATOR_DEPS) +$(GENERATOR_BIN)/demo.generator: $(COMMON_DIR)/demo_generator.cpp $(GENERATOR_DEPS) @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) -g $(filter %.cpp,$^) -o $@ $(LIBHALIDE_LDFLAGS) diff --git a/src/autoschedulers/adams2019/demo_generator.cpp b/src/autoschedulers/adams2019/demo_generator.cpp deleted file mode 100644 index 026c2394f3f5..000000000000 --- a/src/autoschedulers/adams2019/demo_generator.cpp +++ /dev/null @@ -1,51 +0,0 @@ -#include "Halide.h" - -namespace { - -using namespace Halide; - -class ConvRelu : public Halide::Generator { -public: - Input> input{"input"}; - Input> filter{"filter"}; - Input> bias{"bias"}; - Output> relu{"relu"}; - - void generate() { - const int N = 5, CI = 120, CO = 24, W = 100, H = 80; - - Var x("x"), y("y"), c("c"), n("n"); - - Func conv("conv"); - RDom r(0, CI, 0, 3, 0, 3); - conv(c, x, y, n) = bias(c); - conv(c, x, y, n) += filter(c, r.y, r.z, r.x) * input(r.x, x + r.y, y + r.z, n); - relu(c, x, y, n) = max(0, conv(c, x, y, n)); - - relu.bound(c, 0, CO) - .bound(x, 0, W) - .bound(y, 0, H) - .bound(n, 0, N); - - relu.dim(0).set_bounds(0, CO).set_stride(1); - relu.dim(1).set_bounds(0, W).set_stride(CO); - relu.dim(2).set_bounds(0, H).set_stride(CO * W); - relu.dim(3).set_bounds(0, N).set_stride(CO * H * W); - - input.dim(0).set_bounds(0, CI).set_stride(1); - input.dim(1).set_bounds(0, W + 2).set_stride(CI); - input.dim(2).set_bounds(0, H + 2).set_stride(CI * (W + 2)); - input.dim(3).set_bounds(0, N).set_stride(CI * (W + 2) * (H + 2)); - - filter.dim(0).set_bounds(0, CO).set_stride(1); - filter.dim(1).set_bounds(0, 3).set_stride(CO); - filter.dim(2).set_bounds(0, 3).set_stride(CO * 3); - filter.dim(3).set_bounds(0, CI).set_stride(CO * 3 * 3); - - bias.dim(0).set_bounds(0, CO).set_stride(1); - } -}; - -} // namespace - -HALIDE_REGISTER_GENERATOR(ConvRelu, demo) diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt 
b/src/autoschedulers/anderson2021/CMakeLists.txt index b47b757d28f6..f4ac9a6ff830 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -61,7 +61,7 @@ target_link_libraries(Halide_Anderson2021 PRIVATE ASLog ParamParser cost_model t # ================================================================= -add_executable(anderson2021-demo.generator demo_generator.cpp) +add_executable(anderson2021-demo.generator ${COMMON_DIR}/demo_generator.cpp) target_link_libraries(demo.generator PRIVATE Halide::Generator) add_halide_library(anderson2021-demo FROM demo.generator diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index 7893947c34b5..8f86c0fd4425 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -125,7 +125,7 @@ $(BIN)/weightsdir_to_weightsfile: $(COMMON_DIR)/weightsdir_to_weightsfile.cpp $( # A sample generator to autoschedule. Note that if it statically links # to libHalide, then it must be build with $(USE_EXPORT_DYNAMIC), or the # autoscheduler can't find the libHalide symbols that it needs. 
-$(GENERATOR_BIN)/demo.generator: demo_generator.cpp $(GENERATOR_DEPS) +$(GENERATOR_BIN)/demo.generator: $(COMMON_DIR)/demo_generator.cpp $(GENERATOR_DEPS) @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) -g $(filter %.cpp,$^) -o $@ $(LIBHALIDE_LDFLAGS) diff --git a/src/autoschedulers/anderson2021/demo_generator.cpp b/src/autoschedulers/common/demo_generator.cpp similarity index 100% rename from src/autoschedulers/anderson2021/demo_generator.cpp rename to src/autoschedulers/common/demo_generator.cpp From bee009ea3c0e75762b6863ff99da7aea271b1df3 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 19 Aug 2022 01:56:53 -0400 Subject: [PATCH 22/63] remove files from Adams2019 --- .../adams2019/featurization_to_sample.cpp | 42 ----------- .../adams2019/get_host_target.cpp | 21 ------ .../adams2019/test_perfect_hash_map.cpp | 71 ------------------- .../adams2019/weightsdir_to_weightsfile.cpp | 29 -------- 4 files changed, 163 deletions(-) delete mode 100644 src/autoschedulers/adams2019/featurization_to_sample.cpp delete mode 100644 src/autoschedulers/adams2019/get_host_target.cpp delete mode 100644 src/autoschedulers/adams2019/test_perfect_hash_map.cpp delete mode 100644 src/autoschedulers/adams2019/weightsdir_to_weightsfile.cpp diff --git a/src/autoschedulers/adams2019/featurization_to_sample.cpp b/src/autoschedulers/adams2019/featurization_to_sample.cpp deleted file mode 100644 index fa94cb840cb9..000000000000 --- a/src/autoschedulers/adams2019/featurization_to_sample.cpp +++ /dev/null @@ -1,42 +0,0 @@ -#include -#include -#include -#include - -// A sample is a featurization + a runtime + some ids, all together in one file. -// This utility concats the runtime and ids onto a featurization to produce a sample. 
-int main(int argc, char **argv) { - if (argc != 6) { - std::cout << "Usage: featurization_to_sample in.featurization runtime pipeline_id schedule_id out.sample\n"; - return -1; - } - - std::ifstream src(argv[1], std::ios::binary); - if (!src) { - std::cerr << "Unable to open input file: " << argv[1] << "\n"; - return -1; - } - - std::ofstream dst(argv[5], std::ios::binary); - if (!dst) { - std::cerr << "Unable to open output file: " << argv[5] << "\n"; - return -1; - } - - dst << src.rdbuf(); - - // Input runtime value is presumed to be in seconds, - // but sample file stores times in milliseconds. - float r = atof(argv[2]) * 1000.f; - int32_t pid = atoi(argv[3]); - int32_t sid = atoi(argv[4]); - - dst.write((const char *)&r, 4); - dst.write((const char *)&pid, 4); - dst.write((const char *)&sid, 4); - - src.close(); - dst.close(); - - return 0; -} diff --git a/src/autoschedulers/adams2019/get_host_target.cpp b/src/autoschedulers/adams2019/get_host_target.cpp deleted file mode 100644 index 5d0062def489..000000000000 --- a/src/autoschedulers/adams2019/get_host_target.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include "Halide.h" - -using namespace Halide; - -// Print the host target to stdout. -// Any extra arguments are assumed to be features that should be stripped from -// the target (as a convenience for use in Makefiles, where string manipulation -// can be painful). 
-int main(int argc, char **argv) { - Target t = get_host_target(); - for (int i = 1; i < argc; ++i) { - auto f = Target::feature_from_name(argv[i]); - if (f == Target::FeatureEnd) { - fprintf(stderr, "Unknown feature: %s\n", argv[i]); - exit(1); - } - t = t.without_feature(f); - } - printf("%s", t.to_string().c_str()); - return 0; -} diff --git a/src/autoschedulers/adams2019/test_perfect_hash_map.cpp b/src/autoschedulers/adams2019/test_perfect_hash_map.cpp deleted file mode 100644 index 1f81e298d3a1..000000000000 --- a/src/autoschedulers/adams2019/test_perfect_hash_map.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include - -#include -#include -#include -#include - -#include "PerfectHashMap.h" - -using std::map; -using std::vector; - -struct Key { - int id, max_id; - Key(int i, int m) - : id(i), max_id(m) { - } -}; - -int main(int argc, char **argv) { - std::mt19937 rng(0); - int seed = argc > 1 ? atoi(argv[1]) : time(nullptr); - rng.seed(seed); - printf("seed: %d\n", seed); - - PerfectHashMap h; - std::map ref; - - std::vector keys; - const int N = 100; - - for (int i = 0; i < N; i++) { - keys.emplace_back(i, N); - } - std::shuffle(keys.begin(), keys.end(), rng); - - for (int i = 0; i < 10000; i++) { - // Insert. Possibly a duplicate of an existing item. 
- int next = rng() % N; - h.insert(&keys[next], next); - ref.insert({&keys[next], next}); - - // Check the map and hash map contain the same stuff in the same order - if (h.size() != ref.size()) { - fprintf(stderr, "Size mismatch: %d vs %d\n", (int)h.size(), (int)ref.size()); - return -1; - } - // Use iterators to convert PerfectHashMap to map and compare to reference map - decltype(ref) h_map; - for (auto it = h.begin(); it != h.end(); it++) { - h_map.insert({it.key(), it.value()}); - } - - auto it = h_map.begin(); - auto ref_it = ref.begin(); - while (it != h_map.end()) { - if (it->first != ref_it->first) { - fprintf(stderr, "Key mismatch: %p vs %p\n", (const void *)it->first, (const void *)ref_it->first); - return -1; - } - if (it->second != ref_it->second) { - fprintf(stderr, "Value mismatch: %d vs %d\n", it->second, ref_it->second); - return -1; - } - it++; - ref_it++; - } - } - printf("Perfect hash map test passed\n"); - return 0; -} diff --git a/src/autoschedulers/adams2019/weightsdir_to_weightsfile.cpp b/src/autoschedulers/adams2019/weightsdir_to_weightsfile.cpp deleted file mode 100644 index f266aa702c94..000000000000 --- a/src/autoschedulers/adams2019/weightsdir_to_weightsfile.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include -#include -#include -#include - -#include "Weights.h" - -// Utility to convert from the old dir-of-raw-data into a new .weights file. -// Should live only long enough for downstream users to convert existing data files -// to the new format. 
-int main(int argc, char **argv) { - if (argc != 3) { - std::cout << "Usage: weights_dir weights_file.weights\n"; - return -1; - } - - Halide::Internal::Weights w; - if (!w.load_from_dir(argv[1])) { - std::cerr << "Unable to read input dir: " << argv[1] << "\n"; - return -1; - } - - if (!w.save_to_file(argv[2])) { - std::cerr << "Unable to save output file: " << argv[2] << "\n"; - return -1; - } - - return 0; -} From 1e3fa1106c6f6b9931cc673e0c2a9c6a36af064e Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 19 Aug 2022 02:00:28 -0400 Subject: [PATCH 23/63] move included_schedule_file_generator.cpp to common/ --- src/autoschedulers/adams2019/CMakeLists.txt | 2 +- src/autoschedulers/adams2019/Makefile | 2 +- .../included_schedule_file_generator.cpp | 54 ------------------- .../anderson2021/CMakeLists.txt | 2 +- src/autoschedulers/anderson2021/Makefile | 2 +- .../included_schedule_file_generator.cpp | 0 6 files changed, 4 insertions(+), 58 deletions(-) delete mode 100644 src/autoschedulers/adams2019/included_schedule_file_generator.cpp rename src/autoschedulers/{anderson2021 => common}/included_schedule_file_generator.cpp (100%) diff --git a/src/autoschedulers/adams2019/CMakeLists.txt b/src/autoschedulers/adams2019/CMakeLists.txt index ff80dd480219..c78687a4d2dd 100644 --- a/src/autoschedulers/adams2019/CMakeLists.txt +++ b/src/autoschedulers/adams2019/CMakeLists.txt @@ -75,7 +75,7 @@ set_tests_properties(demo_apps_autoscheduler # ================================================================= -add_executable(included_schedule_file.generator included_schedule_file_generator.cpp) +add_executable(included_schedule_file.generator ${COMMON_DIR}/included_schedule_file_generator.cpp) target_link_libraries(included_schedule_file.generator PRIVATE Halide::Generator) add_halide_library(included_schedule_file FROM included_schedule_file.generator diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index 19ac584d43a2..8341cd82d77a 100644 --- 
a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -195,7 +195,7 @@ clean: # # We'll use the preprocessor (GENERATING_SCHEDULE) to distinguish between these two. -$(GENERATOR_BIN)/included_schedule_file_none.generator: $(SRC)/included_schedule_file_generator.cpp $(GENERATOR_DEPS) +$(GENERATOR_BIN)/included_schedule_file_none.generator: $(COMMON_DIR)/included_schedule_file_generator.cpp $(GENERATOR_DEPS) @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) -DGENERATING_SCHEDULE -g $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) diff --git a/src/autoschedulers/adams2019/included_schedule_file_generator.cpp b/src/autoschedulers/adams2019/included_schedule_file_generator.cpp deleted file mode 100644 index cdd2bc7f6bf3..000000000000 --- a/src/autoschedulers/adams2019/included_schedule_file_generator.cpp +++ /dev/null @@ -1,54 +0,0 @@ -#include "Halide.h" - -#if defined(GENERATING_SCHEDULE) -// nothing -#else -#include "included_schedule_file.schedule.h" -#endif - -namespace { - -// Trivial Generator for testing (and demonstrating) use of .schedule.h -// files produced by the autoschedulers; this is very similar to -// demo_generator.cpp, but packaged separately to avoid confusion for -// newcomers. 
-struct IncludedScheduleFile : public Halide::Generator { - Input> input{"input"}; - Input> filter{"filter"}; - Input> bias{"bias"}; - Output> relu{"relu"}; - - void generate() { - const int N = 5, CI = 120, CO = 24, W = 100, H = 80; - - Var x("x"), y("y"), c("c"), n("n"); - - // Algorithm - Func conv("conv"); - RDom r(0, CI, 0, 3, 0, 3); - conv(c, x, y, n) = bias(c); - conv(c, x, y, n) += filter(c, r.y, r.z, r.x) * input(r.x, x + r.y, y + r.z, n); - relu(c, x, y, n) = max(0, conv(c, x, y, n)); - - // Estimates (for autoscheduler and/or RunGen) - input.set_estimates({{0, CI}, {0, W + 2}, {0, H + 2}, {0, N}}); - filter.set_estimates({{0, CO}, {0, 3}, {0, 3}, {0, CI}}); - bias.set_estimates({{0, CO}}); - relu.set_estimates({{0, CO}, {0, W}, {0, H}, {0, N}}); - - // Schedule - if (using_autoscheduler()) { - // nothing - } else { -#if defined(GENERATING_SCHEDULE) - abort(); -#else - apply_schedule_included_schedule_file(get_pipeline(), get_target()); -#endif - } - } -}; - -} // namespace - -HALIDE_REGISTER_GENERATOR(IncludedScheduleFile, included_schedule_file) diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt index f4ac9a6ff830..73eb6cf80f4c 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -82,7 +82,7 @@ set_tests_properties(demo_apps_autoscheduler ## ================================================================= -add_executable(anderson2021-included_schedule_file.generator included_schedule_file_generator.cpp) +add_executable(anderson2021-included_schedule_file.generator ${COMMON_DIR}/included_schedule_file_generator.cpp) target_link_libraries(included_schedule_file.generator PRIVATE Halide::Generator) add_halide_library(anderson2021-included_schedule_file FROM included_schedule_file.generator diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index 8f86c0fd4425..1567efd1a741 100644 --- 
a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -257,7 +257,7 @@ clean: # # We'll use the preprocessor (GENERATING_SCHEDULE) to distinguish between these two. -$(GENERATOR_BIN)/included_schedule_file_none.generator: included_schedule_file_generator.cpp $(GENERATOR_DEPS) +$(GENERATOR_BIN)/included_schedule_file_none.generator: $(COMMON_DIR)/included_schedule_file_generator.cpp $(GENERATOR_DEPS) @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) -DGENERATING_SCHEDULE -g $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) diff --git a/src/autoschedulers/anderson2021/included_schedule_file_generator.cpp b/src/autoschedulers/common/included_schedule_file_generator.cpp similarity index 100% rename from src/autoschedulers/anderson2021/included_schedule_file_generator.cpp rename to src/autoschedulers/common/included_schedule_file_generator.cpp From 489509bb496c5bb2917f66f50fc9fa98b1cdbf1b Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 19 Aug 2022 23:07:45 -0400 Subject: [PATCH 24/63] move Weights.h/cpp to common/ --- src/autoschedulers/adams2019/CMakeLists.txt | 6 +- src/autoschedulers/adams2019/Makefile | 10 +- .../anderson2021/CMakeLists.txt | 6 +- src/autoschedulers/anderson2021/Makefile | 10 +- src/autoschedulers/anderson2021/Weights.cpp | 250 ------------------ src/autoschedulers/anderson2021/Weights.h | 54 ---- .../{adams2019 => common}/Weights.cpp | 0 .../{adams2019 => common}/Weights.h | 0 8 files changed, 16 insertions(+), 320 deletions(-) delete mode 100644 src/autoschedulers/anderson2021/Weights.cpp delete mode 100644 src/autoschedulers/anderson2021/Weights.h rename src/autoschedulers/{adams2019 => common}/Weights.cpp (100%) rename src/autoschedulers/{adams2019 => common}/Weights.h (100%) diff --git a/src/autoschedulers/adams2019/CMakeLists.txt b/src/autoschedulers/adams2019/CMakeLists.txt index c78687a4d2dd..1c04d01379df 100644 --- a/src/autoschedulers/adams2019/CMakeLists.txt +++ 
b/src/autoschedulers/adams2019/CMakeLists.txt @@ -25,7 +25,7 @@ add_halide_library(train_cost_model FROM cost_model.generator # retrain_cost_model add_executable(retrain_cost_model DefaultCostModel.cpp - Weights.cpp + ${COMMON_DIR}/Weights.cpp retrain_cost_model.cpp ${WF_CPP}) target_link_libraries(retrain_cost_model PRIVATE ASLog cost_model train_cost_model Halide::Halide Halide::Plugin) @@ -42,7 +42,7 @@ add_autoscheduler(NAME Adams2019 FunctionDAG.cpp LoopNest.cpp State.cpp - Weights.cpp + ${COMMON_DIR}/Weights.cpp ${WF_CPP}) target_link_libraries(Halide_Adams2019 PRIVATE ASLog ParamParser cost_model train_cost_model) @@ -103,7 +103,7 @@ add_executable(featurization_to_sample ${COMMON_DIR}/featurization_to_sample.cpp add_executable(get_host_target ${COMMON_DIR}/get_host_target.cpp) target_link_libraries(get_host_target PRIVATE Halide::Halide) -add_executable(weightsdir_to_weightsfile ${COMMON_DIR}/weightsdir_to_weightsfile.cpp Weights.cpp) +add_executable(weightsdir_to_weightsfile ${COMMON_DIR}/weightsdir_to_weightsfile.cpp ${COMMON_DIR}/Weights.cpp) target_link_libraries(weightsdir_to_weightsfile PRIVATE Halide::Runtime) # ================================================================= diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index 8341cd82d77a..18824b848170 100644 --- a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -63,8 +63,8 @@ $(BIN)/libautoschedule_adams2019.$(SHARED_EXT): $(SRC)/AutoSchedule.cpp \ $(SRC)/Cache.cpp \ $(SRC)/DefaultCostModel.h \ $(SRC)/DefaultCostModel.cpp \ - $(SRC)/Weights.h \ - $(SRC)/Weights.cpp \ + $(COMMON_DIR)/Weights.h \ + $(COMMON_DIR)/Weights.cpp \ $(SRC)/FunctionDAG.h \ $(SRC)/FunctionDAG.cpp \ $(SRC)/LoopNest.h \ @@ -86,8 +86,8 @@ $(BIN)/retrain_cost_model: $(SRC)/retrain_cost_model.cpp \ $(COMMON_DIR)/ASLog.cpp \ $(SRC)/DefaultCostModel.h \ $(SRC)/DefaultCostModel.cpp \ - $(SRC)/Weights.h \ - $(SRC)/Weights.cpp \ + $(COMMON_DIR)/Weights.h 
\ + $(COMMON_DIR)/Weights.cpp \ $(SRC)/CostModel.h \ $(SRC)/NetworkSize.h \ $(AUTOSCHED_COST_MODEL_LIBS) \ @@ -103,7 +103,7 @@ $(BIN)/featurization_to_sample: $(COMMON_DIR)/featurization_to_sample.cpp $(BIN)/get_host_target: $(COMMON_DIR)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE_DISTRIB_PATH)/include/Halide.h @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) $(LIBHALIDE_LDFLAGS) $(OPTIMIZE) -o $@ $(HALIDE_RPATH_FOR_BIN) -$(BIN)/weightsdir_to_weightsfile: $(COMMON_DIR)/weightsdir_to_weightsfile.cpp $(SRC)/Weights.cpp +$(BIN)/weightsdir_to_weightsfile: $(COMMON_DIR)/weightsdir_to_weightsfile.cpp $(COMMON_DIR)/Weights.cpp @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@ diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt index 73eb6cf80f4c..bba944b4b238 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -30,7 +30,7 @@ add_halide_library(anderson2021-train_cost_model FROM cost_model.generator ## retrain_cost_model add_executable(anderson2021-retrain_cost_model DefaultCostModel.cpp - Weights.cpp + ${COMMON_DIR}/Weights.cpp retrain_cost_model.cpp ${WF_CPP}) target_link_libraries(retrain_cost_model PRIVATE ASLog cost_model train_cost_model Halide::Halide Halide::Plugin) @@ -49,7 +49,7 @@ add_autoscheduler(NAME Anderson2021 SearchSpace.cpp State.cpp Tiling.cpp - Weights.cpp + ${COMMON_DIR}/Weights.cpp ${WF_CPP}) target_link_libraries(Halide_Anderson2021 PRIVATE ASLog ParamParser cost_model train_cost_model) @@ -110,7 +110,7 @@ add_executable(anderson2021-featurization_to_sample ${COMMON_DIR}/featurization_ add_executable(anderson2021-get_host_target ${COMMON_DIR}/get_host_target.cpp) target_link_libraries(get_host_target PRIVATE Halide::Halide) -add_executable(anderson2021-weightsdir_to_weightsfile ${COMMON_DIR}/weightsdir_to_weightsfile.cpp Weights.cpp) +add_executable(anderson2021-weightsdir_to_weightsfile 
${COMMON_DIR}/weightsdir_to_weightsfile.cpp ${COMMON_DIR}/Weights.cpp) target_link_libraries(weightsdir_to_weightsfile PRIVATE Halide::Runtime) # ================================================================= diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index 1567efd1a741..cddd62229f59 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -67,8 +67,8 @@ $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT): $(SRC)/AutoSchedule.cpp \ $(COMMON_DIR)/ASLog.cpp \ $(SRC)/DefaultCostModel.h \ $(SRC)/DefaultCostModel.cpp \ - $(SRC)/Weights.h \ - $(SRC)/Weights.cpp \ + $(COMMON_DIR)/Weights.h \ + $(COMMON_DIR)/Weights.cpp \ $(SRC)/FunctionDAG.h \ $(SRC)/FunctionDAG.cpp \ $(SRC)/LoopNest.h \ @@ -100,8 +100,8 @@ $(BIN)/retrain_cost_model: $(SRC)/retrain_cost_model.cpp \ $(COMMON_DIR)/ASLog.cpp \ $(SRC)/DefaultCostModel.h \ $(SRC)/DefaultCostModel.cpp \ - $(SRC)/Weights.h \ - $(SRC)/Weights.cpp \ + $(COMMON_DIR)/Weights.h \ + $(COMMON_DIR)/Weights.cpp \ $(SRC)/CostModel.h \ $(SRC)/NetworkSize.h \ $(AUTOSCHED_COST_MODEL_LIBS) \ @@ -118,7 +118,7 @@ $(BIN)/get_host_target: $(COMMON_DIR)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) $(LIBHALIDE_LDFLAGS) $(OPTIMIZE) -o $@ -$(BIN)/weightsdir_to_weightsfile: $(COMMON_DIR)/weightsdir_to_weightsfile.cpp $(SRC)/Weights.cpp +$(BIN)/weightsdir_to_weightsfile: $(COMMON_DIR)/weightsdir_to_weightsfile.cpp $(COMMON_DIR)/Weights.cpp @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@ diff --git a/src/autoschedulers/anderson2021/Weights.cpp b/src/autoschedulers/anderson2021/Weights.cpp deleted file mode 100644 index 8206410a397f..000000000000 --- a/src/autoschedulers/anderson2021/Weights.cpp +++ /dev/null @@ -1,250 +0,0 @@ -#include -#include -#include - -#include "Featurization.h" -#include "HalideBuffer.h" -#include "NetworkSize.h" -#include "Weights.h" - -namespace Halide { -namespace 
Internal { - -using Halide::Runtime::Buffer; - -constexpr uint32_t kSignature = 0x68776631; - -void Weights::randomize(uint32_t seed) { - std::mt19937 rng(seed); - // Fill the weights with random values - for_each_buffer([&rng](Buffer &w) { - w.for_each_value([&rng](float &f) { - f = ((float)rng()) / ((float)std::mt19937::max()) - 0.5f; - }); - }); -} - -/* - Structure of the .weights file format: - - uint32 signature always 0x68776631 ('hwf1') - uint32 PipelineFeatures::version - uint32 ScheduleFeatures::version - uint32 buffer-count - uint32 dimension-count - uint32x(dimension-count) dimension-extent - float32x(element-count) data - - (all values little-endian) -*/ - -bool Weights::load(std::istream &i) { - uint32_t signature; - i.read((char *)&signature, sizeof(signature)); - if (i.fail() || signature != kSignature) { - return false; - } - - i.read((char *)&pipeline_features_version, sizeof(pipeline_features_version)); - if (i.fail()) { - return false; - } - - i.read((char *)&schedule_features_version, sizeof(schedule_features_version)); - if (i.fail()) { - return false; - } - - uint32_t buffer_count; - i.read((char *)&buffer_count, sizeof(buffer_count)); - if (i.fail() || buffer_count != 6) { - return false; - } - - const auto load_one = [&i](Buffer &buf) -> bool { - uint32_t dimension_count; - i.read((char *)&dimension_count, sizeof(dimension_count)); - if (i.fail() || dimension_count != (uint32_t)buf.dimensions()) { - return false; - } - for (uint32_t d = 0; d < dimension_count; d++) { - uint32_t extent; - i.read((char *)&extent, sizeof(extent)); - if (i.fail() || (int)extent != (int)buf.extent(d)) { - return false; - } - } - i.read((char *)(buf.data()), buf.size_in_bytes()); - if (i.fail()) { - return false; - } - return true; - }; - - if (!load_one(head1_filter)) { - return false; - } - if (!load_one(head1_bias)) { - return false; - } - if (!load_one(head2_filter)) { - return false; - } - if (!load_one(head2_bias)) { - return false; - } - if 
(!load_one(conv1_filter)) { - return false; - } - if (!load_one(conv1_bias)) { - return false; - } - - return true; -} -bool Weights::load_from_file(const std::string &filename) { - std::ifstream i(filename, std::ios_base::binary); - return load(i); -} - -bool Weights::save(std::ostream &o) const { - const uint32_t signature = kSignature; - o.write((const char *)&signature, sizeof(signature)); - if (o.fail()) { - return false; - } - - o.write((const char *)&pipeline_features_version, sizeof(pipeline_features_version)); - if (o.fail()) { - return false; - } - - o.write((const char *)&schedule_features_version, sizeof(schedule_features_version)); - if (o.fail()) { - return false; - } - - const uint32_t buffer_count = 6; - o.write((const char *)&buffer_count, sizeof(buffer_count)); - if (o.fail()) { - return false; - } - - const auto save_one = [&o](const Buffer &buf) -> bool { - const uint32_t dimension_count = buf.dimensions(); - o.write((const char *)&dimension_count, sizeof(dimension_count)); - if (o.fail()) { - return false; - } - for (uint32_t d = 0; d < dimension_count; d++) { - uint32_t extent = buf.extent(d); - o.write((const char *)&extent, sizeof(extent)); - if (o.fail()) { - return false; - } - } - o.write((const char *)(buf.data()), buf.size_in_bytes()); - if (o.fail()) { - return false; - } - return true; - }; - - if (!save_one(head1_filter)) { - return false; - } - if (!save_one(head1_bias)) { - return false; - } - if (!save_one(head2_filter)) { - return false; - } - if (!save_one(head2_bias)) { - return false; - } - if (!save_one(conv1_filter)) { - return false; - } - if (!save_one(conv1_bias)) { - return false; - } - - return true; -} - -bool Weights::save_to_file(const std::string &filename) const { - std::ofstream o(filename, std::ios_base::trunc | std::ios_base::binary); - return save(o); -} - -bool Weights::load_from_dir(const std::string &dir) { - const auto buffer_from_file = [](const std::string &filename, Buffer &buf) -> bool { - std::ifstream 
i(filename, std::ios_base::binary); - i.read((char *)(buf.data()), buf.size_in_bytes()); - i.close(); - if (i.fail()) { - return false; - } - return true; - }; - - if (!buffer_from_file(dir + "/head1_conv1_weight.data", head1_filter)) { - return false; - } - if (!buffer_from_file(dir + "/head1_conv1_bias.data", head1_bias)) { - return false; - } - if (!buffer_from_file(dir + "/head2_conv1_weight.data", head2_filter)) { - return false; - } - if (!buffer_from_file(dir + "/head2_conv1_bias.data", head2_bias)) { - return false; - } - if (!buffer_from_file(dir + "/trunk_conv1_weight.data", conv1_filter)) { - return false; - } - if (!buffer_from_file(dir + "/trunk_conv1_bias.data", conv1_bias)) { - return false; - } - - // Old style data doesn't record the versions, so just assume they are current - pipeline_features_version = PipelineFeatures::version(); - schedule_features_version = ScheduleFeatures::version(); - - return true; -} - -bool Weights::save_to_dir(const std::string &dir) const { - const auto buffer_to_file = [](const Buffer &buf, const std::string &filename) -> bool { - std::ofstream o(filename, std::ios_base::trunc | std::ios_base::binary); - o.write((const char *)(buf.data()), buf.size_in_bytes()); - o.close(); - if (o.fail()) { - return false; - } - return true; - }; - - if (!buffer_to_file(head1_filter, dir + "/head1_conv1_weight.data")) { - return false; - } - if (!buffer_to_file(head1_bias, dir + "/head1_conv1_bias.data")) { - return false; - } - if (!buffer_to_file(head2_filter, dir + "/head2_conv1_weight.data")) { - return false; - } - if (!buffer_to_file(head2_bias, dir + "/head2_conv1_bias.data")) { - return false; - } - if (!buffer_to_file(conv1_filter, dir + "/trunk_conv1_weight.data")) { - return false; - } - if (!buffer_to_file(conv1_bias, dir + "/trunk_conv1_bias.data")) { - return false; - } - return true; -} - -} // namespace Internal -} // namespace Halide diff --git a/src/autoschedulers/anderson2021/Weights.h 
b/src/autoschedulers/anderson2021/Weights.h deleted file mode 100644 index c2d2220a03c2..000000000000 --- a/src/autoschedulers/anderson2021/Weights.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef _WEIGHTS -#define _WEIGHTS - -#include -#include -#include - -#include "Featurization.h" -#include "HalideBuffer.h" -#include "NetworkSize.h" - -namespace Halide { -namespace Internal { - -struct Weights { - uint32_t pipeline_features_version = PipelineFeatures::version(); - uint32_t schedule_features_version = ScheduleFeatures::version(); - - Halide::Runtime::Buffer head1_filter{head1_channels, head1_w, head1_h}; - Halide::Runtime::Buffer head1_bias{head1_channels}; - - Halide::Runtime::Buffer head2_filter{head2_channels, head2_w}; - Halide::Runtime::Buffer head2_bias{head2_channels}; - - Halide::Runtime::Buffer conv1_filter{conv1_channels, head1_channels + head2_channels}; - Halide::Runtime::Buffer conv1_bias{conv1_channels}; - - template - void for_each_buffer(F f) { - f(head1_filter); - f(head1_bias); - f(head2_filter); - f(head2_bias); - f(conv1_filter); - f(conv1_bias); - } - - void randomize(uint32_t seed); - - bool load(std::istream &i); - bool save(std::ostream &o) const; - - bool load_from_file(const std::string &filename); - bool save_to_file(const std::string &filename) const; - - // Load/save from the 'classic' form of six raw data files - bool load_from_dir(const std::string &dir); - bool save_to_dir(const std::string &dir) const; -}; - -} // namespace Internal -} // namespace Halide - -#endif // _WEIGHTS diff --git a/src/autoschedulers/adams2019/Weights.cpp b/src/autoschedulers/common/Weights.cpp similarity index 100% rename from src/autoschedulers/adams2019/Weights.cpp rename to src/autoschedulers/common/Weights.cpp diff --git a/src/autoschedulers/adams2019/Weights.h b/src/autoschedulers/common/Weights.h similarity index 100% rename from src/autoschedulers/adams2019/Weights.h rename to src/autoschedulers/common/Weights.h From 
6e8e4c1da423718dd54c7de3f2e698ee0a490232 Mon Sep 17 00:00:00 2001 From: aekul Date: Sun, 21 Aug 2022 12:46:30 -0400 Subject: [PATCH 25/63] tidy up --- src/autoschedulers/adams2019/CMakeLists.txt | 1 + src/autoschedulers/adams2019/Makefile | 2 +- src/autoschedulers/anderson2021/CMakeLists.txt | 4 ++-- src/autoschedulers/anderson2021/Makefile | 4 ++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/autoschedulers/adams2019/CMakeLists.txt b/src/autoschedulers/adams2019/CMakeLists.txt index 1c04d01379df..6399a3990cff 100644 --- a/src/autoschedulers/adams2019/CMakeLists.txt +++ b/src/autoschedulers/adams2019/CMakeLists.txt @@ -3,6 +3,7 @@ ## set(COMMON_DIR "${Halide_SOURCE_DIR}/src/autoschedulers/common") +include_directories("${Halide_SOURCE_DIR}/src/autoschedulers/adams2019") # weights set(WF_CPP baseline.cpp) diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index 18824b848170..22541eba77a8 100644 --- a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -216,7 +216,7 @@ $(BIN)/%/included_schedule_file.schedule.h: $(GENERATOR_BIN)/included_schedule_f # make bin/host/included_schedule_file.schedule.h # cp bin/host/included_schedule_file.schedule.h included_schedule_file.schedule.h # -$(GENERATOR_BIN)/included_schedule_file.generator: $(SRC)/included_schedule_file_generator.cpp $(SRC)/included_schedule_file.schedule.h $(GENERATOR_DEPS) +$(GENERATOR_BIN)/included_schedule_file.generator: $(COMMON_DIR)/included_schedule_file_generator.cpp $(SRC)/included_schedule_file.schedule.h $(GENERATOR_DEPS) @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) -g $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt index bba944b4b238..f6db09312582 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ 
-5,9 +5,9 @@ add_compile_definitions(HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API) set(COMMON_DIR "${Halide_SOURCE_DIR}/src/autoschedulers/common") -include_directories("${Halide_BINARY_DIR}/include/") +include_directories("${Halide_BINARY_DIR}/include") include_directories(${COMMON_DIR}) -include_directories("${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021/") +include_directories("${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021") # weights set(WF_CPP baseline.cpp) diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index cddd62229f59..536cd088116c 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -267,7 +267,7 @@ $(GENERATOR_BIN)/included_schedule_file_none.generator: $(COMMON_DIR)/included_s $(BIN)/%/included_schedule_file.schedule.h: $(GENERATOR_BIN)/included_schedule_file_none.generator $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) @mkdir -p $(@D) HL_WEIGHTS_DIR=$(SRC)/baseline.weights \ - $< -g included_schedule_file -o $(@D) -f included_schedule_file target=$* -p $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) autoscheduler=Anderson2021 -e schedule + $< -g included_schedule_file -o $(@D) -f included_schedule_file target=$* -p $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) autoscheduler=Anderson2021 autoscheduler.parallelism=80 -e schedule # Note that this depends on included_schedule_file.schedule.h rather than $(BIN)/%/included_schedule_file.schedule.h -- # the former should be generated by something like @@ -275,7 +275,7 @@ $(BIN)/%/included_schedule_file.schedule.h: $(GENERATOR_BIN)/included_schedule_f # make bin/host/included_schedule_file.schedule.h # cp bin/host/included_schedule_file.schedule.h included_schedule_file.schedule.h # -$(GENERATOR_BIN)/included_schedule_file.generator: included_schedule_file_generator.cpp included_schedule_file.schedule.h $(GENERATOR_DEPS) +$(GENERATOR_BIN)/included_schedule_file.generator: 
$(COMMON_DIR)/included_schedule_file_generator.cpp included_schedule_file.schedule.h $(GENERATOR_DEPS) @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) -g $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) From 1c4e2a66ec5a9cd62042e41404b46ba29110bd70 Mon Sep 17 00:00:00 2001 From: aekul Date: Sun, 4 Sep 2022 02:13:52 -0400 Subject: [PATCH 26/63] add input images --- apps/images/low_res_in.png | Bin 0 -> 313206 bytes apps/images/matrix_3200.mat | Bin 0 -> 248 bytes apps/images/matrix_7000.mat | Bin 0 -> 248 bytes 3 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 apps/images/low_res_in.png create mode 100644 apps/images/matrix_3200.mat create mode 100644 apps/images/matrix_7000.mat diff --git a/apps/images/low_res_in.png b/apps/images/low_res_in.png new file mode 100644 index 0000000000000000000000000000000000000000..63941881205b511abb7e5d4ac87b58802e0449ea GIT binary patch literal 313206 zcmV)jK%u{hP)9ZMP%t}UCAPuVv@}kQzmoCWF;`ndB`-Do~{j4u+qE(g!oc+$^-Hby(Nuo z=L=n@HO>xal_apVt$d$2l(?Q)qU~Ab>`xRpC!N>qcWZ_x=G1&-r#g=FW1?>24`*LO z^cl&b6Xo`m#1C3pbKZ9AS!YyYG;Y1-{LBJfnD|$-Oc#BLxUP3z2l^$(;Ma0{Jz?U` zOYD|hbCB@?Me!8CtC^YO1!LK7BL2CecXg~Eo^AM9=7llubNSk(8W4dm$TCBl)YrBOSj>- zmpIKOOZ(7XSIZ*l$tqpuyrFYtJp-h(_Ru{H*B_jdte5L%s8*1peFzfQ&cx*(ViYri z71%7`Njy51SBc@(nPf4*X+luv(uoj{lZ{_jJ1-F-r0wx?Uf<#<%~*oXQRf7uco;?o zanqbM()1KI7Jh0t&oXRM$e}J#vN=K(d3ebrKpOi9bC!1c9&NDn8vCV0lE~A1=LxnF zrV|rN) zIz$GtTN}wMSVUwaK2q683L|wkr?gNCiRcqLm^PHsjxx5Ai%lD1q!VQxacmaR3`=r} zlSL1pRF`my7`KpyTW52b^K7M@Ji3ubhBPCa4EnJ_EUm=WVoLE4q8zWR_faDG9CRh!Soe&vT{l=C%U{VaCm)jeNz<(uh1Q!%snWUDh*~Yp zzqL9hPc&VVrX_7PrRJKeQvPx+OMXKhvU(*yV%2snx6;i4YmRG}L|wO9oy}o$v#Y;6 z%ImDsVLT%5>IKun-0Awwd~4n?--Hpnmhlo-<2gYgA8Z3vnKIT&BVk5&aTA7#L>hyC+z&!Y2d`1b~=&w zv_Nl^G%94j2J{lOoUP6m4Aj^4H=bpw{g@7vkbZ>CYfe)xQLUfRoN_0|U-FHP#3L(p zDz)`R=Pi!%2hVa9c$Wwb^#Bby$2p?>%^G%bhBs+UUv9-u5kC-SFMDw?7~GuXBo}e; zkW7SXV)UdZJ-LaS_?zZ5Cy0@IFfxugRLf>*Nkf`T5J!HO7DO3jw$Rq}H@oN{ea+)^ 
zl!IoU%y)g^T0uL~jhB3x>3W9Qvd=_hv%DrL+%0#=<1E$&jL^2Ul`hnn6{Z1obT3if z(GyJID)UTxHt1NUQOv`7T3?jz(g?V$-5957TFBk}BtMW(j;8Um%;RGsRC5~_WfC7# zZJL`RPH1Ola7DZ7Q0j4s*;LY4j}eiDQk#70=#NxzH-Ax0JsHO-TIe#8fs-_#0ZmEf zq{N95(*i=&m4(#fVMbv1Nn3J~`sOR5T+mNBM+;fVRbtGGsC{fL?pS;JMbNXF(KDdq+)=|}t{Kbw~^ zdV+eKVTp{V8F$N3uId1CNx>%{VY8dv9On`pX(F?+IU^ zrm^g$4(oJ1M`$F|@c?hqk{tPti=@ey)S} zg3VRF!kAeYq9Qy%FBan=#Ax6TYH>3bchQ=wEZ{*J(w0kvQg9t336OIa7O;35}$lZCFRVPq1?na{-q8+xkht2 zgF}?7TqMAMcnnJdBy&;D$v_-(Wg8aDxs8kb!R>fl$qzNyXnuF)T0h`QdeQYWnbvQ~ z=mBU326bZ@a!W z$+E!pqtumC@(#DjTXJ50l|g2?c~D+wq`u63<^Uz8x5+lovqlQc+xmh$$`)-)u9>6v z(3c;aJsftDou~C6(-KK`we5ADb)HN7px5L@`X_F2A_@UzUQ>md|jfS$|PsAy_9oYbL!)lzFI@F{Gh*clsV2H zBQYTL_sJwFXBiQG<{V#}0EV8tM5ginwf>vlD*87 zN7%tp7UITBC+?y*!0s%r_v>)7`HFf(^gS*TBf@$v z$`!c=k|GUgtbel=7rDF!^wV5U%X(RXGM15ea5-;tiA!`KF7I)T*4(9;B;l2rL^#7s zT*Pn{4{1Dyp{a!M@sEB=HK(bjDfRh-s~n{>S4n0&O;FmAOci4}Np1O&ZrI#J1UE-G zP80gjoPF{L)x@=kb2MNf4t2DYWCGaO451#D+`}Cl<}ib(qwmW~F3EecfVRA^Gx&w8 zaw|=A0*f#-VF(^-$wDIfoV-q)xqL-^O1OuADFPZYnY(dFqaPh~G}j2Rh4xs~mM3UM znM}fvL_coe9EG@PBzpH4|zIMZ`SUl>k0Wi zvBq&~+9w(~&nNzIMmq1XLR%-=I9;_$KhUSOpRaa!S>!XYJX}Z_ecFr|n{A zaN6{pCXYs0Ydrb{RT4r0ijDX`1lZTv!l;hT=c%9~n z+N82RQI`#Lb{-^!N~bGFi0g8KY}HLbhPK7cRHowLB^^cw!mK76j}D-OII}rRoc)CU z4>th<1SqDMV*L2=)0xh6rVCxjlqoWWRoK|Xi86r<(xovTelbsB$#lu1CB>}8CS-Lb zN5ifb_++O%j7?`Lmr3SLYcn4*$jl-p^QE24Fip%H+37s%T+%+WQ_|?kMD431rHd3v z77)kMC2Z6pZpM;q=V2X2%;}>ibfTVh?qr#)<|Wc~g0oLoYj^#MAf-$v6|XKPgdaaa zNk}n1$z>yPDj9)?IIj}bBI4M1@p6iZi33=CFNHMF%Zc~MVF?SUz^7BNvGL-eOivJ_ zjh^K^r}QXMVzPuNA)QN{2k?*zWD(;i!-!H_rxT#FqyZIrf)FQg107^BUgFL$;>4u^ zHd`4?m<&COMV_|BONp+=2dp7NE?z3|&>24&GM{qkK`{dvN`J~FjD=S|qW~{%`5U+P z;%n;ChN+xjA2!`|w$x@T<0T(273AY3&LV7GqnC*jHDA$+-IP&A2U?NFR#tEjAAVA4 zKz~{hCyN;8S;~1@>J(ZLXAS$g%w-~^5h0ZbQ9@+vG`bLH6_*J!h$voW;U$N3+(aoQ zL{t~!Az!9pisJgpdpe z+F|47gqCA5MiO`l>19%I%Q3DI(({y(K>#lsfFQNCDR5R!lR-D)#DIC&6i|#u-qdN> ze9Z?yhzcsTR7*)EEE)JrQi;`9nk6q!x*~Oxo=b}Jn>EXIjiX6v$={o)Nmr7}P5a~& zSAqQQF0a)h>F=avwSKZ*usS7mc9powT#e0Zu1;p2ypq({)l?3+`k8Zb%zPsMNUqd2 
zC(WCB1M{43PIH!G5!G5$@F5jg+ErcbQ+1vr#s|(0ia_gQ<0qA)l<9IpoYfgXmM+0Z zlpHD<%y?|Plyj6UHn52i6mg1f_$Vhth!Cly;v-6wDA{C_O*-kM0}z)&DWpn+8e|MZ z7>$q5Wx;9&z9gxIIG2xF0jO_dxY z%sM?wDUeDg&4>{o>@1*?2wziyrRVX{#yNpK(nTxqP(_R|Y2qi2EyJ;;uPosdA#6(V%XPej zB?*fpnM)3C(+V#SNIq`DG8(UpWfAF8DF2YkK?%~13g!@JGOy8DdP_E0WN0yaT=m?I zy{>UdZIY&#RBM}+ZZfSelg^kv)@9c-W{m5I^@eM{nPj~n4;!rS&3qHL-esNfxn`LU zl{F--52H0K}cNR?h-2dy1LJ~#zLv{7Or`zUgf^l3hFHe-v|`J9uoSO;(? z|LPjb80vJPDfen4;?i2P3Gd}Z*RB}#dlSP1MFr1^5tAtsC;X1v!%I|FD z95yx$@!+8zoe2=9f)Lf*&D}Jo6P>8TBqkA-JES8a{$eDCm@J?Ut)v^rxy#I@rJOK@ z$3tZ$6 zuF)Nb)^aD%S+e+-`><)lT78VOw3X8|;))LBDy{ef4|Qb-Uc&r=p&Jk2p_1lYB+k2B zB85z%#HBADCQy&=oTDpe*ozy7OoF6xieAL&sUKkz=2qhDkpg1!Gxu?obV~4Y8`XH_ zA~z8tMh(@{SX$AEZ&=O|DItfLEEnMxjNC$%M&xdi%m6vhE&3G&9LJW+c;tSmB86wz z$VK@;ig1`lTihIwe`$uB9@M24?TKPDmX^fj4o+~DYSOr-?Kw+hIZZWJWh7~YfP?rM z!&PZSKjQKtUy;f6D28Ay~CT;wpwKC-!!#09O}@9`vD_210_u25}jy6vMi7U>9Sfr!Y#{q z6d&Knw!PGdp@xkxo*2y=(*r=cw5Rbp5?${8-naxT)HyMYlrKvc@)I(;m2Ptso9 zVKqwX=UO0>+2<5n?X41%Y#wrTHxI~9uAB8+6Ew#pVGg?1QOE#uN~R~qF-oJ(trW{9 zc^?l&u0gJba#bGJs1$1M+7;$! 
zy0bg6pMEUWQ_g7}%c{gq+H03Yu2zzf_*9-?yYn}NJg(EJ(0+8sL)0lGicfE*4e5H2 zC~lcal5TVk5GO~E6Bj?7sL;XG0xVu7n>cUcr5O_ek50$p6s3TLg@*_cO2{IMEW$u4 zed$Y_E_7ii*<`a{0uo?7+qpnz)|tA*=_!BVrMoPmv!uy7ynOF!f-MuIl`fGtTn|&q z0Qr|Nxu&@k>O^Lf&sQosTk<89(atVsJN;z4=|x+efuG%pEIqHCNYz@bH^=!Jw|=O9 z(_I#GgNE^IR7Mk1#UiT5wM3>-NmwJ!JR*dNYJ_qgVI^;CcS00001tP`Jbe5l|B{VI z_psg!GDFGG%{q;!<2Y|pAz9eO7{Uvr62*;MC&@S>M0uVl77c-6qh7h8q z45ySzU4{oQt8rt=3{tVQ4|TbZHB{l(D%|X6FmY_$RB}{j;-xNQ@kxLa9L3Kn;yM-| z7GZ`Er!GOltd|~G_!xl?kFEqll7o$pI9^g2L6B5gO*Sv+Bx3C3d-l_r3cAVn@)@@F zpa&5)Y7ZL7BB{$|e7b=k@C`O~Ifktjyvt>Z*-8rK`UzD;u!)dO5-CI|;1v5<0JM~B z+%(YHWa0+SWAhUY@M}H+qSB8j*^)&YnZ_D?lpj_TZgRCx zDrP*t$W997W8O3)7=&Mw=%GKzxAL30%}h4UUF8HdP4`erq3dZ^ZRsv8zY?UtBOP^0nPJF0e>In{;Q#zUZ z%s}zWRNa_(J+aCf5%6y2J*M?&^yUar}%E?wa?c8)s-wO8VtGn8`YK;k`$<*F)3 zLi-To0!xYOoz4lMoH3kW4-04@t9hTZxV1jbv>XdtI%CsCi*z<_HZu`8&j^B~Q9(D( z>te2PmUTo45+sO)g@q3vK2k{~l{C_@$YBB%(pP#@&QwNGDJ|tm^7%zN5aw2Siw;bZ zO1!#O{)@#zX+|mmJ|ti6k%p|4R%RrfWt^Om0=;Z%4x|r1%Ph*Z zg+9b47V{lZLfl3dUCBfi>$~KTDqcQkGBfpQO7*&Ysf(SEUKO{x^cCiFvp!CMn|W4? 
z7(##6OPCk%^R$_UUx#U3!jj1tLb6Eu0)B>*qb?mlluoR}%Li;B$Z$E0m$)7z%w7T< zm8Xf*j7swGvQ{ea(p|H$2y-(&-6qYkS+Bnl;5_R{!%KaRkgcV(VFkCa3Ljq5so)54 z+VCxz9MQ`}s02JTpa+(g(ua7vHaU_@J|T_q1X1yd8#hB3jm0)L z;sxS($kQoQ%J0&bE>f38xLG7SiEA&NPYf?ZsKP5ViAszY=|F-kBC?laT4^COu{p2x zslqEgv5Cl7>QYVx4T!LXbi90k8;fR~z;@*(vl~Vcx=XnA*W5^@ML z#@we7Ns~O8W^GCO$h38BwFbC;G^5NbvRn?EwysCnBrDD1+K5z{!EpIR-dC4)*B^B= z1@f}#Cbybo=LCK5(p@*2$FwP{%==c{wZb*aS>fE_yf3Q>6P81KMuA?_8|D;b8v=38=IX~-qMmlfl$;4l@bm}|D6D2jz+56*9UH`6T z)^)3T%k^KYkJg+>tgSv*Gmls8=WFiNjoKpdsf^~y#B4&&uEf*2UN1Taa1(THB~@z@ zZ73y9FWkD`nMDO_oS_^gu1!gmvCaT2jq6BCn5~!baGsa&5~qSzv?R)9_UT|s$igO@ z|G(PQ{y!9z3?eKST!V4*x%rjeJRv=0f>tqxcFqc7?5Bb(ybJ^4x`q&OS;HHI=_a2OlqoC% z{A?gB{~?1omhJ}Uq;nUxb3w~=52d=13|?S7w)AEh5sm5qDv1#$CcUxnYk|6`WQhDp zRDR=6{E{YXiOUlD;?=Q|Po>_kM~O*Y7U9t)I*t&V^m#lwS6u`s(!M~9K7@$sO7`nu zt)d;J`UN(dG#{Io*5?!{`ZI3av?4~f*5DzCpGr}mGdwQu(gJj*tFsTUJl|W!fIaPGM(ifT56$O z!Da_BEK=BygVbnQo+l92tUJI+Kf~Gqn+w zoMkR?@^n4{na``Zbu2L|$fkOWC0%tDS}>ErnDvymV%=8#R4zJz3kT)|?3{7EYvNH1)Pw3N$01UH>!J?)5V z78_)}tYHi#TE!kZGYFee{@-j_T%Egg;Fiu90;=3&>{ zGJ_0xS+>xFZRVh~HMdJK4@e)m;QFum!#Sy&T=z>RVW}(kX&?U5pv=%3-f-U02ral- zcR06dD`zB?2zSt(K|C$jX~|{cnf$Ad;uE)S#jj&&ODSj-ZiWydBBQCo3mnEx1Q*>n#S^T< zM=D)tr_*qgO`IN7Q3RB7gmU`R0Y3#4U=t)ph72T3SSw^Z9z6U>4qc@)A!_qEZgNO) zkRh^@Rx;nbPn2WwDIW8_xeMETVtyu~Mdnd5q_4@rr%UCiWD(&_#^^ep=ajVNN&0Jz z6cQqXIV_M#>Lt~jk!L8?r^zH+Z`1cFl|xb&FCS2et+Vx%E|Y77@QI6ZBAP3639&~P zQ>xQ-38@UDj4ZnFGEwcNN}Md&Ne)Gn5mbxyY?2_mX(_d(8>uv43^CeplBjmow?G!N zmdnItJgM}i6I*p4m9!+P+wpQ%`Y?i9bQPO$%m3&?D}9YPA^LKb6T}JAjiscsiCSzX zOau#LD>90001BWNklS}VC>NtAT zF}hJ#Yd_PFZ(Y}1_fqWoNV>`2)=1Y8J~R8Qh>S4f%tR(L!rU#*c*j5KHm6&T6sz zW43FN6mUu_uw@J(xyRgYl9M`Hl~$vqLTkHgpE+lh$}t(@x=R+~)q9+VL_uru00p$b zCPpa{{PdzoW~d8GvUC@K&LE3Qr!|{3rlo|4Vo`>NAZ{Aa44Zv2mK~JH7FNk3nMsUA zyh%C%a%d?zl0!@~7PT(o?ql_b$KXtPfJN29obiQ_SKE}s9{hA8aIXfsN#c4_eO9zud4n6|% zJ@*l!H=BvcXR;RyAEyZ8#*K$UJXlzCz|#5b!6rx*7L$o#$rhdhc?igq24n!;bTr+_ zpg(RbS%`-yA5ubwwB$)rC6zd3OywZy+LkbxT1bS0_^7~QFIl+RLVz|50&HTWQ_d*D 
zq{{#AT3COmfK}PiN}{b`TJfyK9|$fWwN}0hk=rUO;D~7(XjkW)Yafw=sTVWF$Wc&wQ` z#v}4Lhv~sC{hnLom@}I1WkljOC#>IUBc74NvPY-L!?IRVTuaR!=_tss*sCnHfyPdG;sy%PVk&m~Sd=MuH- zPqmRIC6;lw^J}6tVNG^kA;gW^mMrQfzTgNSGMpRu+UdXrea^`w#BnU5%*9JCLC#Qw zpA~rM#P4LwBYeskz6J6ye8XS-M|Y7X`Eoxer9w)nBOjSH++=o{Rg%pD8O7t~5!Vz(>U0^+d%Bdp zEK7W&etwk|GM|>xo{&5!1=3kA>yxB#zt-17T-HhaKy&?@e^^I4sWgth^bv=q{jECzCs7%woFKe(O4H0H1t;-?9JVrU`X5@3hUCoY5J8xm5? zU0juRa*ZbXmJY|!$Jj<4v&z(>9@W|zulz$#+^pbB;yRivV%$o1V!Y3m0nM*f!XxJm>4 zf+Xe=Axt+}NK7^o;iN2}I}dPG-sdPN*B=alvsfUNN{K;DW;txsS z=we=>4kvX!hJWN5P@8W^<^-*YV>Fj2QEnqZ9r|N&i7fu$qBh5&7(a0`d5t=6_G+$vzVLZ2SJ5xwv7pFN&YiWvuTi&OktKqjT`qw@p0d}cN+7NSw zGLjdi+-aZ-h zqf8eDOC4R$uXN){=WbnMI?4bui{|pD5zFnet^c|9OPLHZ#oQrf`WRQ75BW|T>3J-D zSJU-7y+O0|PMyPRnyxAOp7WqxlK4Wua8Ak)ouj{U-0b9lKCO2qF4T~CHZjSWTOF=( zJ0qM>BB8^a4K=;EyrIn1+hp59arz2JL(MLI@Kk^(F$8`!p3YbL`j_YL3 z@E5-llN@bCIy3Y=qCCo@G{Ma{+G2Q&HUO-X^|G3iG^YvaG^RO!F@{kXI+BW!`LdYC z;*kq@xP>o4+L!}0mD;8?F?rAoqX`GiztRz8EE9CFS-^CE~1{zOX*?TJ#KyNF`T0t)ymw162NM!)$xhgZM#RU3uMefpF+$gWhEx6f1N0MpC zQey079p!{&EDiCHK?Ue{G^7p}v2pV(2dLKBoTGy*<5p@*1(&$UAu2e-2%@|u4vlF| zls>P&WWrwlhMxzTdIcW$cbBe(09`ZedU zoCZ#|Y_La24{2sr$RWv>BpGE!@gsg3N=LgxVy^8mRj#wH8;I-E`i0J*LjTrVWg7GJ zDQAr{LB4iPHxsd(XPy08sTXyJzTi}wCz6I*-@8^N?UZe%D>vX~A}YV@kKE)K^I75@ zX6n6~z>-gh>02BYTVB!kWQ>0493z#De85_s)+sdBZ}ll>MB*2x+L`UFl-4z;oJ^gm zJDq>HPtz0s#gTNqof64mFWsokP@<&keUxF*g=WlQ2wo!2IzkjMo&Y|bNt6XVPguUx zl>{_S2^Ew|54_Tw3feK6$tJn`cvQ}>qbeEOqr=CoasCd1j%8C9@aIKab5b-&MDKCQpha& z>a%QPy@n~mO+<%~AzApvrvoU~8EoVfow-k**Cq7URRn26kT`h~0{Kin$0O7Dk`gQ# zO{vb*0{jveAyq~*96z}dB&v;ALqrR;1u-^DZQ_J<2X5Ib`>}P76C;(NK0_Hk$pKPX zh#xPlsZCtF0Wn$25dw^*4KWrHp-hXg$dZ}(i7=cfc{&;|8yG+-mIScy%4}>R)Fwuz zzD5ofUUGD%45F?KVGR-4DF!$1>U!d`jbDh$)D-b+sm)7z4?WT%I8dhflabA$y1dG0Jq3PGSf#93qr(lrnON;+Gy2 z5zrC}@#EKxgsoQcU)-{olbqJp5+WcCt!|{qyHZ4k^%&2|bZe0$B-ho|>O+oepB$px zHQgG(b7r$y%6_@X!$f46-cQKtBvW*joDm->`k8rx!`3S4ApK0GX~J@y!YjI*0)1Jh z%kw&v7zL(@uF;NkB*i>$dQqxX<_;V>Y85H8(c38`Q$-xRe4Xw2FglAqDV|N(!ZHqnr~unK-Q|!U4TX7@ygUhZqNO 
z2$4q`gBXpEF4$z#f(){_Ec1xrmm=cCiEvuS6D6umbqDEeU=`>IJVfM>B$FBnvu;^P7@@ALqAzeIyPQBY}DI{vQct~6QVazOprKUe7N!P zIjuOYV~K!HA(euEHQF^GQypSN2;s+}HMJ=;Zxh5#M`F~*4cIydFU_O}J~DJXUScc+ zqI!+Ccy&0Au8~976cDEZx2z+syNFTds&Tl{Me80@)3h;fyWWwgREf}u3+61N_}JW~ zHTYeRX}MX#6~4qGBojr5;j!M6WHEM6D=Zx>BTwml=6CBJ{f#nNt^r-9Xl;5i2v}sz zvbNa|*-uzy_CwYw>paiNbgj;_RvP`RpLs*O;-!<`s;|(*JZyZ_)<4W2raaL-@g47J zSX>149@)g6#AAt3=BGpr)5`h92~be^a@AMPJBe$RZgYRtgH`=a$Hd@74yk5#VlE+l zmu{4@NbkW-C8MY%LGGX)Ax@!ETMftM1c zT57eU%JqV^UvlMD8Az$?E!QI4))m(cI&+?-?6Y2wW4t5CUXmyvMMlV5)++gh_A*Gy zIH&^&Tbr%vJi$wBq$$%F$#?pcHsKMu!I#veiN44wHfT20n4qJG$&0ij#wOZRhi|z; zgb?d#N0z=yCV$HS+&WDk?P4H8V-qhp=5=3YsgRvP&3b!y&N^#SUAdRr-Ojw5U zK9#&dB?gatOfSlKiAu8QLqzVED&khQHHxk>oEUB8FS*PW`eAX>Dwi8{qB;SL%;E|b zJM|7O$y;0jKH&};$bY3dHMor~wABIh;5v)g&Q&sLz;*eWRWxHduMp%Do*+US?!-c5 zHUo!&m%O$mNj@tw`S0aGCt!v zrzqnR4P^_hILQg(T$g3kVYoD<2|hBpLI%yLq<}V5;|x9`G>|P+!zPZ8tNcJE=j9NW z_=6od^q2Nn(wS4V=LvZnw|q((hsb6KxAOx}Q7(Bhh#Ib%GaEl4^<@*E@PT}w4_b?e zXaP$S%}q77>3lPk3;L_RCQs3y2|8NxxLaz-^L*o)Eqkm_TmgR4r7YwQU8DcC+&YhB zZJ-0hG!@I6I*se!h)vXdMlP=uMZ)ziE}ZP5YWmXNf+?Uh@OCsbiY3zT)K@wGu}wf4ec(Ia2vc)fQdm z9IPtWr7ET!J~?{KJM7_DT8^#{q) zQHB$9{o&eAfQgJDU20ezxySm*bw+-%*0??+?mFgjag+6{b%wS&Lhj~ks<@Nk@(8`~ zTM6quV(e!IO|%LB$9XA~bGnW}k}hWwz1775U1YB4rxMXLQsq6ffnRw*`%#ClG@Vpg zqvQFFCwK?sFyE5Gk9rqw&eD(|)&A8!kKko2ui_z~bLq@rzT;MI78h6K4t;}t(n#(l zEXAy%g*+;UaP)fu-R+sl{<_mgQ`wgFGaD zTFYwLM|Jw~1~K`X7;cHkcKmXj&YWigoe9gGWK)wdl1>Y`!G5mGT=||OvVxtomgP*Q zhWt)7>PScV5{LPW;%`3X2u*l}I6eaW1)SwREbf!(l*wI$xl=!7Ip5KUU-9x2rCj1> zZlVq@+GFvv{D zDBI01y1(+4s)&=NADc~yae6Q@+xevGUT2E)gl;!qYwJY9i8IpNrAO&iQTN7A6~W5y zD*HH2?B|M&iJW-RjUUa-#2KeT+vwBg2MQ=KV@PF#smTgTbS~*S$Na$s*}-mXhUkBA z=&t2NNYe;$(quF##Iz+5vNQvSE~F4;5BX$~$5zVl;Uh*WsdS@&0w6?)3;fJ%%J9ns z`iqAEr};|$p_%o!HJN}kls=SMkGU!c=t64~j&;f!DAQf1T%Xd9E>?XUS98~C=4&mU zp(_*RA~}>2rImSHtMfV?q(n<~E;fZ^SpSvjTC8ORbu5d(yrWz3nVN}r@$!&X=obCf z>?EX3O(RNmyee^`+M575gmFu6UZhm>B$*y8*9nxcR@;-R3m8a@IPX)$R!04Ar-h$v z*+E?P$`SlhBEJ)(n+zgsO|^a^2OlY<(hom7i0O2q^w;JTOS1gPW9EJH4F#$U!L7?# 
zi%$mYEQ%;)AU3(gNYfGK043T~uL9f5!(70t*Jwyg`%+GvE%@;33Bbo-f|L@$CKC@K znI%FWd~CsyANhnTV(h1_PGJ{89i!bjDx2gES`ovG2QRO1+4>s~af&sMnEY;aB3Ul7 zgd9HCHk3#=Yz=;n$VA zh7fV-%{3WME)gxD9-Z*>J#O;ND2@`uN00 zM2O0VxIxnKaRDE;{3!`MWxbuWQCo6Ac1l}< zx+bZEJnPzHy=5g!H~EtrGTv3)^5sajkRBtp14I)Vj^O&9y?llozbCl7?4@ zn)>=kqDG>pX_xrR@i;l=in&?6=0|f%|4)x5eoQ2rD~Xfl?2Z38A?LYxPQ{M+{&?Pv zlg_*wQ!1yLqm|n$zo)AcOZ3xis%8a6lB>UB>l*VVaf0dsdYF4SD4FEbRvYUzHZYav zNRb^fnp8GwbxL)k?!v~ax8o;UQVB9#?+1F4&LE1}PMAJ893%q|QCUYHddl6hiwgag zJfb`-H}F_zB^|6gtfh3Im$eT^XSrU(Yj1b8Cd+Cr4JBY5aw)51uQi64hGeV`w~osY z38;%gElMml2k>B94JC!C@&P_d$kmPdg)Ek_+5-+UJwe=T zGL}x#=K29Ac#f&G!p90?GM05@>pJblW{UWPPFhz7P(*|2texmrq? zeCaKMPa0b1DbaCk#>WOSh{~UG3tnj}QwUohNhfS3$j_wcn`Sj7>?VK*58H|9ex~5p z;kukMeA<^xw&|yo%2)>BH@BHn#3^GdVa+$!sUXGtO+9vyMmc4Al@K|MB!FKlD8)k$ zvLpi!aUcear8O~WhliM|yi8kGYkS=Ek&VQ0jB9{e&wGAY;Tx{Mn5DB-lsk^)-kO6^TtM#vg2<6|o6l;Dt}1*|0wl(JvE z45JUJ+Lr^mg;7Lgr!?eYybr*+s6S)+X03>qR?HdN78uEZ1dn%yreOlQdTzkbW}F`rNvXEhptk z*2y8fl4Q0z>k?10S-ZH~}zEDkcae8QBB9Yj{Vm>y1;>DwmU&(>Z3Z^bwbw5 zI9;F{wLqI%zsT6cZc>TsBNXa9?J7Z{rlx#_U+!bK4CZ_Do$hctBxVt2e_|M^+Sqi& zZANK#HsY5QqS$6TjtnM-N89K%V&rj@6msMYL7k@u$R?((DPR|2ykrr@jh8UlY$S_q zGRqncWXTl#6iR=**n}yPU*uh~tcN5)RFY*O9-YQxlat-w!Em6+Btor&Tw4?jCx%Qz0IUlKUX)MeBoLq6xK=4op};wOlY`BF+l+~jjX7D_tF z@&O@m4IrO1`9il5#wPMP001BWNklNKG;6ghrKS$Ouw|$GPL%ED1V`zELkF$J zOaPZaXuq~=Q?}4SFOWitp3r%GulJHdn2^jPg>3|J$dMz2$S0FjJh}}Bw011~(BRx(r8_iI9O!n2;`0CC)B3Q!b0;BK7D^9#uL^2a+V2 zvYr4oZ*W2<=@rTtD@W)+h!9tC2!r&~={R`tQ%`p=lf(FRBb|t81Ilq&gu{Dmqlh7_ z$0lD-5FndOf~4>^DFkt7OA&pj-~zCNW~30nBlYm%mKu1-U?6UM@+@J(3?@#I%)*bK z86@MkhH%+cbw>UGjleaF=Y>&-qnanXN==!9z4B z%w;YT)N+tse2v9TI-Y8BSYGBjOSBnLeEfq)3h9HNH2x-thZKCoI7>BV&{B~UDqAfWJZZt5h|@F)_r7JUt14y*6MEUr;%jKom{c! 
zyGBbNdD&{g1!l-Y^kR_Y%i}Ib#?ek!XhI%msdT~5`*Mvd+CbJYfDBT(Rq9zm8Olp$ zC{cCka#?0xGeIVCTC1d;ey_V3!D$t)b6FeGNZ#cOZk2lSJ(oDH^XNw|zfz5L6cLrP zSa`UHsl+*}0cdQFl7#F{CiksO< zv2@0Vk*0EnQT&JFoR_y~h|Myl(#ed`Tz;n}In>0#!*%&o=5Uj|tJC?7TFhh&jb$St z_S2gy)RgU1a!Sq-rJ=aU1bK^^bd(Aj^D>W6iH+e;Zk5NW#bDyZWDfsOQ$CYlI4m{g z40R~P#^DYgqk+`5YH%y{Sxu1n+(iRfBWI{f7E#Xexjx7huJRW_Im;FNTB4mfF1N@= z+RF~6^QSbHmvN}cJ}z-hf1we`Be=QF$ApMd4-eJo2K+`h{^m~>;UUZzE&&$xIV+EI zowm}E_B4{E#AE=!5|vXjgWKg@0)*rTn$eW;T<5rWm$d!W)z@`ga^*kPK*G|D1E!Il zpo4};(Fb+Ab~ktGKFKhZW+o5oEPW80)n+X=59?9J%QERLBdnz|l3-Fl>m#Us&{fNt zY(2|g(kbbjHQTkv8bzUOlum>xm5of%HrT3q)cS*kI?DQ+Pk6{&;&z#1HfvOpO&9aN zo=6NbbIo_G)Qvi-s<{)<(}{+OS!7i-h>uG=d*euD8-4ypi^{uop_c1bRuVMTxTuB%uk7P`1P7jmlU0&6}nt^8I;Rs1VApz9JD4nItNJQdoGj4RFZj8 zju7BY?M^WT(t{)*jcdAvPSgW^l+%(c=jf?BOeNc~*-R8KF)}5KIAu5tqnnm#dy0v$ zg*ZF3D`mu4OEwoMB2Ftt667-bv57DeKk3?$AaVJS6jDgV=7gLi#td0WC-D>Ipq9!% zw8g=}ku<)+VS=2*W&wTzmf5ZCS?_c(=fQlFXs9jqVB#GjiRDgE#yTgR4C`Sh;_To< z-Jhr-d2~0gNEgRrhUi2yRX?C0k(_u~8z(+=dg?4)<{VP<^C$~>df39^_qWN|NPx=kOT6|ZPV zPBT!OvsEYQ2^_q-gP5!W(j=L87$Sd(5TFm4cqNZAF31`jlI0i~v|uWJ+;R>tUPfaR zC6!_Hr$DB$oi#+bsuS4-MERZ~vL%BkhphDsp)3ColZLM6iCUKRA2Ot&wSg2O)_ICV ztS0z*NT$jxw(&kA(L8DHGL@v@ zCL;5>S5@Yjj)@xMhpfW+<8D;L%$mSg&DmnThr7~ETknOrlU#7sSBubjzt9v9}*O~$1)25oM3-xx!;L*kUCT_M-gAD5! 
z`(<3xUTG#@>O9kuM_uJOI;mP6>vvgWHNm5!?YP`Zjue=7bhN|PD*2WUI)e6+PN`N0 zeLz>rB(0Kstsq;6CWcYM>*g18fw*IvhT7HqZ4Tg6jc@|gF!yOYMpwR3d53PVTI%f5 zx=!2~PeePKT6lE4zD1Y=TAgy8rlSbSIKaaaSwet;+KVX9(1)ljWIS!9mCWJ-QSxb} zt#v34Jqh4tGK)B@c{&z1lL=D74o+ih1V5SLqXngWBNOOkjpQpF`AwP;;6>{lkYlc< z*w$TESG@LPuI~7(wytuby3%z42anZ3+|O`BrlmNoeKShF}lCtjozac#q4(&Px8WR`Us9yw&aj;$+ML`1f@ z4wI>awS;Yy;*d!W-9i2&0M;sNI8i!DJ|3=`CkW7=73`pge1cCcv!5`w492U=v;iJA z5TTenZAvRr2;gS6j>V_<>OF+0YpxO0EwsUw1=<#@v0o-o!64}aM71R~=tnnf z`CJwg=ct(syu@~{N+vI1^NlRSEeqsF>Y2B78RZnoELt#%UHEmL4rC9GPNEs(SXZ^ANLG#oXfgQrV(K~UZ8`n zAV#JSK>Qu8`b7cb)wNPtmlAgBa=uMJk-6kK(Cbr7rY74*TUK#9{5qWZbmWDYTFrGC=20 zOs1|Q4mflmu0aCWRx5^bRcgvK>RH{a_wegLt34&wD^_1}U9Y)rB_d*N$JVf%k`d~X zN)5?0X(1`5nv7+HWn=(-^jjVxS-&Ea71DyANwr$ggLF-z2NB)C(+rlznn!;U`Wjwc zZ`$cJ%JiCiWrioFv4bo6AOT`}8@W0~UF4HuexppXt%m?xYtfK28Al3!h7+Mwr*n|` zcxa2m3R1A88>teItz_bJ&9rJWwMKpWuiDe=WYl@s*85iXLJ2;Yn*hKRXb|;pbUlQL}UAi&4Vq-KCeWoh;Mxv^zJjv(en7`!+Mdml1#Q_Rs zJY9(C?Q|nU0z~lXU_yM%6Qsymb`#g1*+zvF>J~Z>($8>Hsug@sHyuYh2edi|aVWyk z#rig3da)HR9>&lg8!ur}iF2Ao>;h)!H>8n4CLVT4Lt>;d6r0xM5!D-Vj)-)p9@+d( zDluI`LxR>S>jP>t(-abxE__H@;v*eJiK~SakfvL-8yV^%PM+4qVI<4QU_V(BB2I{U zbYdj4ff7;(Q7Sv|;o&99@bCf-hvghmY|?P3Aye=XVKyGZyhsH;%?BdX#vz4KcqEk= z4zu+rX$)ZlW!Q8gUDvUL04;Sqd-MsG;gvKQjH4}e3@7M;ix^|I7U_g}kHgY~*>uu* z6ys2$wE&0dAU7n9XK1Z+*rUr?NsKUkiOY0x6VPoGQKAL9k#ZSK9x-BCg~K2zpaU?Q zgZS8mmo%QFgc4bdU;FB3IOJI^Cx-lhc(iAL)+*y9W80{k=;P{SsPq8@s;$nXIfM7 z*&gd7YlD5aoVUKUZk25`=D&%KRpr(k+Uv{KTJ1+fR`CJH~UG+u}F3I=N(Ou`ZR-x=6?AFf+vj@Nz{*Fqm!nElPoG&pp=U_qz(*zm5GZ z{al~&!>1m&oO$lp+2Ivs*Xze0(>bO#g*t>U@Z)0)5qi)9FZqn5P+G`HGB~3zkV1$T zL4RZcF1BbqR2{`f%G#<)BP5ubHQ{ zgyDQdkY^=IR6f>z#B@LXX~87+bCey-=L9cONJuvVZkZ^#-p;KgPW zW$eL^j}DBd1p!IIW)E#C#!V4!d>q23K^n7P{*jURS))zy6Xi_~$aFrzgF}c;Wb+}1 zHOgWJ=~WrPF3IIidP@%9kTg&J~ga|DN6VotmHWR`}P?r(LD?k4G2^u90H-5r85r?=uPYv?O z<}#@|frbR}aG9ZElVRg`2I&aeYHb?mMQVs!{?teKM(XHt4N{H1GSqcI(h18~9A${r z$LeX-(Ykyp-&nWGdFf^UN-Etk3R+2|l|IiAG zX+~GPY-0l+Rh__UtG2A9r>-YRx?bQY2c-k4_-KrqRN9h;BO*WJd9&A 
zF8tPgat`vvx}%XEZ=OaZxX579G`vZJsv2v0NMLPsGip#8cW)TbkiSd5XID z$&y+)jFjcNj;G~ka4m2zaW$>8zWyV%*VI4XWQN~+%hA@pv_2U})7G|mxn-}Wt6P@_ z3X^x=)TsW~wePF5*ws2|ZnaymWK>cuSASPWYn^*^^{ds|Gu1lmYT?8ZC5Z=}c;)L= zpSoJtx?Fpm?^FA{<*T+L>GdRab*8Os>d>zJlXtvy&z?S2S0DWIQn>KUjgpU#cm1tw zf9E583d@eYbuxPDtxNqcW?q<3QTA8uYZX>AGflhdaNSOs`Y0fx*T^MEK0n~&G67C7 zL0U0{QawdXcCwC`?$YI)W)YL|XemKTWj7ZHu)!>)lmhvOk>tuW*2)A%5aT)PH9ByR zvy{u9l7yQ#r31}4#~V0$muW!2JSEd8#7{ClozByENh1X>X(HH(hjgk=mbUU19{EGx zCq?@39zh+AhYQk7CUBMM+M2f7o38js<2k@@4I<874Pi?I5p2zqyYR7t7m4CX4f+sc zJ*PQAh|?U<3gQg27Se(cA$*d_JmUED3_f-0&OBMUV_8 zvz@rEB!>`%JdIoY6yn#Q{EkgfTN9)PFJa?n9T1?5Gz!TkM5(%PsKAGp2rqLO4;6Ts zLMENqC0Se}L<(W4QKNZ#*Eam>VrkN=@~(WQkDDV}CAH}(>#cjN8B)k&45owC$a+iK zT2t9=4Y9tno|E4Q;3K8lOxFl|i$qyUnro~4M^4(A))MO`S2iz6Znei;?=o8+mY&*_ zmsoDzu`12#L?hBFHYDCct*$d{3MRUa5_rA>{KCK66-` zvQ(zXcv_IETR^MG<1`+?mO}*b(v56_oFGby=I}ZVWfHSFOpc786-#)A4C@V7UwTWj zq?2#`Ae#u=Io7QVW{1@dbeZcu@e#FtP{&o}3Q4xsl^eRno@~{W3|CWY34`Tjd0dVq zrtzq{OeKEaA*{0~Gl%smAIls$BPSDo>8~2BYHH@{Wj5;VlBq>zI9_wzoYY-rYGNma z6ye1q`>dK|)LL5a)fyL@4N6T5cruG``!VC`?iIJyzwOi1H&ddiUF%M&o!syXH#J|X zx60Mr?qBmhQmty$o0<QQU%1gS*bEuFoIwuvK^$DMfP-FAfay1+bF zdHzQ4NZUVqU(Sw%u1vmC?eAyfYh!atCY(JL%>1L>u_k|9KYr7RHb-B)xHq&b^h`V> zI_vNMQ&(k!L_M~d-R$Rxwx%0C^8_y)p~OoY7vybu>G0>osl40XC3L68Z87K0K0xM^7-H6XvMi zPCed~aqOWXtMG7@B0Q8zK4JRv7=y@^V-ypYiwwg{j64EbmsG2dbOX)RYed_!Ly5_sa)xyJ@IJQwD!nOZC7+X|E6o7F!z)w~(}R?eq3!X=B%Op$mk}jIPk1tth35Zs zi`-@vSwatOtN%!EV%E1DV-=}#J}KFn>nh_f^8ts+lOK4|jNz(jFRzgy`(>mxOux~t zJS5{~BQ2zr^`i8#hPxhg-JxBy4bNyT)0^Fii*i`IkSWR9T|UgoK6D8gHjwKAmSf5ZtI*;5Z-bmNk<_=vQFV~IwYvLCA>Kv-e%krQOsQ#sVX(~;(L@RmsujeXXt-O;SY>>89 z2b^C_Uni`aO*^}{_-S|TblmWJ^@jEr_FFagBz1A!Zg;R=v6s~ zP%GvcSFK^q|7YpmA9F1H$B#c>*L~Q%xi@3OFbt#7DU`}NNm8VfQ>92M zry?m7Qj|m9QN*0Gm@I~27`wOK+wN`m;kvHZ_lNK2-+2D;e4LBjUt?X}?G0udAsOiC z++j$@sfX5Do5SPGLH^XTXZ#P}ZCJFZVB&=yd1wFoEUV4mk7f=(@xs~9PB!;V%1gW( z$46BK;TojrbfBO0CwZiE6O#aK@oFBa9F#ROo>V&XDsV#HCWklKPa@0hKAhD_a+f3q za2cO`L@YkW%40ZrQ-*`yA(_StD6^wWC%ehvl55T#rfxHDoLw0DW0UTOC3QJ{dp{?l51| 
z8G=Xu$5DE-pK?6fjb`{sB!HhA1UbzTvdJck6yh~P0jcyLmC>xGE@eaAa6@6 z8tX!i;>4qy@lhaW@ZljqKe}^@dh86&l%rx7Q! zjT^49ve~cIWF_O(N!87+Z^vo zs-_t)kgO{yuy1P}GsnCVIcq;@=Gx=z{dO0UBi-yz^rrcexr#QX*Yq(gXhzCyBYY^b zN|#zotv!69%dMdV?UD8l0%o#(PIIj}_HS&LHmu`3DV(M|lj%f2d+A^bB%W?~b*iLd zG7ukkWYYH5aR2}y07*naR6QAt*3|?lV4jX;3oss!4rd;wF@m;uMetCU|H)m_SSJ>y zv^1LIB~k8@YZ%5qbdzavi)_}&=j6yl$56%_O^k2wIxahQW6BN3zmlPaEYj7EBgR0< zl)C&xGrJ-CFs)`%3+$8Hp66(zRisHLz97e1XXQkX z9VfMZym5`kDmzwdS1--q*m~gk#ICjNdbgeF(W|RfFFj%1jXqq~JY47+qx==8ofMDhdK|1-R4AV3_&{_ZYM36s~g~ZE#5T-~?VLp903oUEcIo3}*tPXpobtK+Bk}e*ecHaTq29DI6Me*{UEWt8Yv z`v>w#>~upm2CttH32U<_{nDgAL5nSB+`v6-9?Z#GL~#iJOnvP0jau250fu5 zOJUm+?v`sldtWvYH0mlx-xl zk0cI>yJr3Mjy;Z!3^r=JD;+Hz4;f9Qg;rP9Yx<-NmBz*~V~E@`&M{Xv@{!|5S#6w= z@986@j^naa8oL%6bL3lsbg)0^0UI9yinNE4s?MWliBw@cGaVyT19BwN&ebuZC;$ zg%xXkptH@t?9X^jYg%97vmewCbf0}pkKnW$>LjxDNse<|nsb`YvWGmb*@H-<1@p)x zo--tJTr7Grf&zT(;yK+;B0)SjnJmd%Ca4GT7;f20T>>%%uXdGN^fnqg7K8DL)L@U! z=OC#vQ~Hr*d~Xbs`TXG+sd1)_>g*EnBQt{ij zYRG<)fyjyiZnl zRIkI!?uQbcRZ|?zYh2Mmq>K1Fgkxl2)w|Y}=+4fd@uW-4EBd#qqWo0oJLg+g2m4{Y zXb!N3+P_(^My7_tWQLzD-%xQS{6?g`wU32TWX;llX(P)VbzI-tgLvI|$m*a0Z$wX)m*uugtQK}0PX^bApACmgE(X1!p^+u_ z96cE`H2NL)?u2#T;nlj-{y46GTszPC=ydOXxj;|`JMyf$q5b-(NyJ}VSydiv9jIkB zv1)j>yJOtvqVF2D9n)efm>K!gb<6ZrzE*gs=#RVa{PoGjZ*r3VC_D1hv6dG$XKnqb zsI+$BcE3mKMAli&^%`(f7RqKIku>u4D7)x@lT>~qo1UbIOTMJLaXy-6!!S&Oph0Vj`q-*N zWfW*Hp5_$YV75hQ!iYxAH!hxsNP@B#Mt(`XbE;(t%VWKq9roGPaS*5!#YS z7Rx!vHWEo@yH+DeB2G*?ag;P!sQ0m#?&MR8Pe~z_WK!9!o%AwoIjiFdvWsC9lCInF zGg=qmNIGFN(1gA**`CcU&5`GAuk88QhkdE}5v9(fcnogL)sIKCl~ zOq_V}lS~0l@`=YMw@AdK08=)~KR9`snRxXn9>$4}b>zuTc5;S&SK~6~1jBvBbW|RO?;4l7L-o zU)9rAv|W>IKGmfp+P~^Wz9GO7;4h9+i+Q>WkJbmS5i7l^D{bjXyw>?YImW<{h$Lxcd`6m_kTc{tx*F^7@Vs$`6bU(& zkm#s#1j&_dMgt0@NN&*6enIBqW0%gA6wTHs&2+Rgy31VqQ{E@ZH1!$XsLFn_6O-!f zbTuBZrE{qM@iF)MyxBRo<?fRas_HmK}XRL`amnz0uU&V#Z z$@GeHlsyrC+*Rt_=lIT^%k#s7Gt*}CC%6nE;RaR`hU(8*VCRv&#>rQ(M8S^5*_!4 z;~R6F&Xx|wy~q~wU27d5y0$x=buN2OXv7(C4|FaMU)2ovl7QDtalTi6J~YDWA3hgZ 
z9-J8NVl_5TyBcWyNQtY+UJyE?=PSAgvx6({*1cWR*Z$76ylMHa!b2t1kEWhmo;~yL z=GomYT#2*^91IO)CePt#tFFSwH!L9?kJO?-&fuh#tTb-pkq$DFLgP)^(7^a!{vlQV zW-wl(o(v$3=X4Mr-AER>pl{QLjyg+vYqqY^efBy1nbR_!SEaXgflOU!uao)g)vg3- zLo%;YOD{`X%1Mc{*2ik&1_doF$zujLGR{Ky7UDZr1PWD+qshz}ph z_zBRPPl#u`Y{y9&xg_#1-7u-50e-g1Xq@D8ka#j=GmuF?JQ|cf)Nsmdb;ud*&Unq? zihd)L^(TV#)!q7&=2D{VjefdMN{tS3kbR7bWZ8e{qjG{lq-%XXvVM1b&M^i#;v8p< zi}q0ai2f$095tn>^_o>|*W(;}wVJeMfxIAP^pq7;lFxjtZ;!TCNTmj}qdsE~()Kz* zCutWtaE6n7s2#K^=`vGu@XKWxE{l!U@|P^LztBapO_z}`7xZ08=ViXq)pjplq=2<_ zrH`KGAV0|CnuAG@TwoH()ZjN=Pm+$M6KC}r*_g&~*@##7kWaa`)*Mb5BV+&_G>ai5 z>M|Jxj3yJmY|*djhF5kIEz0pW>jSNX8Q|Ib_a>+18^KhMngr1JV;jcVW)TrO^`45{N_SgS2=~{!Xp47P3 z3E$JrlN$SZ`KZVm{n!qL%EDfwjt+IiRN6dm4A2ZmSC@olN9N1lrF*M>ieyK+S`{Ie zz1(ac*&B>E^LQZAO5b!0)xB0LyF*cfz|mlzs)F$H;N2*PbE|fUc_FHG%yO3KVcxHr z7oHO8X+}jpG{^Xg%O49h2$;d66-~XbJE!Qsacg6{d#2f!>^a(}@?3CZWKJL{T+Gdq zBjG>HSMC0~*^a9Hd2DazU(uH17iVU~Y5o)$5g1@%rr1qnOl2@UB{DCxwRC6r5#u4x zuTlMC&KVP9@3N=H@R*Ut_hdSAsy#?OM#fL@9CZ)(yz1z0jNp9mq2Rr+?O#&yM&xM3 zQTjmHOGQ_TP2YBZn_F%Ey;Qx*-em4<`*GUFX6`moX`An>cH*7%`&wFi%Kzwh_@nE2 zS6;g{R;yX3G*u5WfqdzyM@Z3qyw4q4=pwv?IERyLVvZI*| zz_&7%APcnsn0r zNv`cS7TLd9E`HRG_Mf`m-fjJD*CRzI*|+sF>#Yy;re4;|_JjH^z3jj2VxDG=%#{7c zF}Wbi=p#FIyxcN&a!US^QhJj_3rzO#jb_UqB$F;K2IJK%64{LtCmub^R${dghwzYu zkMk@dpHn1qlTIQ@jOGZZB}6ka zaOyztByig*%3-R=I4EhVjkeYMWr>y^z-^WM$S74wX_R3n^@ z`7>&ytH4T-HRNIrQVdAD`)5-lM8|k@Q~JzG17jGc!h= zA9kR7gU;LAU3I_Y@wk5_lXb)cBXG)g*-s+tbs7bn<{$xbbvDU-K@#g^lq|y|zmOsE zB$7@lQ*|i;J|PjOPSnRR`BLJ^lkGe&ooGuZlB8JH%K#Z64^zWF#94wmo-~{UaB^P$ zkyUu`=uKKkPf~H>m#+!ZSZ?9PLtoIPWaGoBZFpFw0bW_bWS}Rf3CKPg5X#$Bh=+Kb1W6~JRMKRHtR_jv>TL2fPxoMA>L`%jbR|eKKKyKuSdt0g!AA!9n0V=l zSC;W557S#l;g!{NBZ*v{OOSZkN?n5dLr~l|6IMC)88sd6IzBK?8kd=Fv~!#?b{Ip9 zucfXWkXgn##}7t#qb6zgUfnB$?f+RfSY%HXk$#daUpw=R8BBCsGfv9aMwxt|^^K1l zv&djLCR1I%$V&RTZy4=lnoK6B9Ubq9a8z@(%KBS3(Nqta{j8tuu-(qCr_bqH`+fPt z`otvwCi9<+CmfZtlI5_rhcv(g>K>8VFn9|v}55MubWZ^Y-%QMnj&x(s6ZH%z4GU_=VB~4QJmQ?!( znF)?^XDa6XWvSDg>tFwA?7>>kKisclwFly&m&Li8w9uy3qDZfb@2XllJr%BSuee{s 
z-^nL6Kd)@(T$s32r9e!uy?t`O!Ch66;z$a8SGp2N-(GqnP7~mwl{im%o*n-=TnZuA<@Auv0jdR z5xH2kr*c!p>-vn{g~9GOoS6=d`_+BYxaa6<)U?}`tG!b-^oZ%zi`GVKzdH1gJgx6l zuj9#fF0EE4YF^9%*IdKaYL0;QS=q|Sho-OcWBZ(a%(<%S2a(uBYoPtJ{Y8vho{sd0 z9c{!|14Cb$d2*qm1LIwtJ*I1jqueviX*#`i5)v*a^-T6B-K@7bYGibcxTh<3o3Zwj z$EEA!z?qfH|3EfDV%w`y;$kz5WAc^Pk z=xplap_`rt5(#o1lTRs-Wm-iNIl79VHU+%wqdhtQXRdW3og^Kr-T42ExRYG603W~M zr+^Nam^zgtN+g3kM)1-9)gY5Jx{)ZuWg=dD`0)Ti{6HsyAb*goM|Bqg>CI`JERtvn zSi%M}D3Bsd^6`;}PyQrXU(<^OsVm)ycgWbNE%J)~Wxu9dcwP5uKF9TADtVZi)Rs5s zM0=F%Wa=MM0e^D zb?9b2z$+RtqNSC#XC&RVndazqUeR^x;D7c?J6*rhUv;STwny7P>P)?-4*A%`D%G{t zL)wSW%pa_Ww6#uDH+6KVosO*!GeO?9C+e>>w{I|uGxj#Fl7mSTyt;rt2+*1PsK+Jx zV6ufR6w`rz)aEJjxxqc^lcznIh#wCR5R_%|ISGu^gWRT-oT4dfWe&$BO(s%DM`CkH zy2~pvR7cT}DYTVM{4U?gZ<48R$SVC^O7Y;8>#V_$G`WvIqaTU+21cImQ^Wb$$7`nK z)F06P&kl8C9nR*_Pu8zpRlj_1<@U0;vgaZ{RxJ*iPNQm+`G9A9z|dvUQgzrUn5;IkEbtNy9lQPJ<-eg0kL%>wRHy{Q$|BhQ4s z_J@P{<}(!+0}qxT;kIqtOG(#eIwUgMsuAv1_IX8{;2Zvo@)s-J;Y)#*W#gr;U8Z}j zFXXHNR8BrxxuthE9X(_*$X4Xb%pgINldaIb=x{2G{Uo1j|9H5)|hXH z2fI&^6gU<;$$75q_vnVMT8_H{r&?p|j*-XA`^s*@70``kkKgN=0_Ectl_>*CEwmjsrN+unYh8}M;b3nof-db z+^gPT&dKY?uXhRkQ+YXbS_)(q_ZY%kw2Y6j zeSvGtU@Jd~K@$;%Xf3@%b-uQDSP9lKJHZm2qkq{?XoLj2jUCYWBxzW_)2?n#_h<|bODWma`N01nr(+Lli%)ylZ@>5y3$4#@Ek!SLwZuv$dElG)=#NhBB2j+CWYPaOs_Ngooa;% z&o_Ln>1G;79Pu+s<`*4^I#nFw|Hb{F`KoozH8I@DtZr15_lsV z-DYDBS365ysI{v-E3Nm8wT2y~kqSG-jt{I2HmrE6^h#*Cy`bWS&|8teLhpn=35}@u z!#Zvst9aVp!DrU<{LG@zB71e9Kt7VXfx*(i7#7)WZNf#H>i`IY08b*HSqxzL`b zuj(C1k-Dz)#%kMZJz?yQ{+v|TsrVJfd#;yaU-48r5*)EH2c&*5PFq?1Dq_n!M0%OM z1DDLDj$deQ9CeI{|1;{ECdB;}^}6e<@joNpT^yVr`Py1pIyHEpG9mbT_|4!o!t#W? 
zXnbc3GWHwEGMK~ebI#9QizV7I%DplAa?I^&b!vT-Fu%_D@&;CCxssQ1qsAqp=GvHK z_rJ|ABtMaItm+kQ7T%asGq>v67(HZl(c5ffIVZJ0wP~#Bq_IO=(;JIFs3eiI8UdQH zh5~~608V*>he+c+rjRZTm_~CQ%vlcTW;uYHZ2I8ioYW>BThcLj-uR4~x>Gij$Z*ZZ zD~shDu5gLV_?au+h^8T3NTD{p@yd6+MlzN9ELW*3Z_t$9QkR-WKUs`H4%t|=B!*P} zVjeenmH^FU24ko}ea_L6Hq_*>j3I!Z-Z~XS-jMm!<2JvLE621O$Wr!`M{hdPTvqTc z9%^%#ddy-UsoaonDVEpxh$ehM5~d{6oOnF)9mP74rzxgFmJ_BiD@deLrW2+)0h}bV z216F`9Zuq?Fn{{HAR25YHGUew_n!^y`>kmqYjcY ze1nfs`W5eMJPyuSzuIwHDDByVX+Nlgw0k7Y`k$`XK3a{J>>-wehMH&%l38}DRZe%i z%<9Kp>zv&~J6bEWEjiW#tDhdW8`@7YoPTwhIxN8H^{2>2i$fyw5HIv?ZR~Boa$|Ng$1@xJe{WK9jGB$4e(* z5f4yPW=Jdg$Q3z+lQgMdBwrGZU-O8elhl!0{2jH_eU^uwn9)Pzx6jk(+>V{y?NrZ? zV5~B{W$#9gg$9H#TCbJ$cI}htRbSRvDZc8x4ITAB}4&bE$N`*oo)K)y6$ z%yjDn^~`R1-T6egrul~TlG9`M4{VM7#aUogvonp}t}m-X_7p}H2SPtY)&<){ehZDH z!2GjfcBrmZ7-?+A$Te9KUEkd^$})dus^fS#!yIGp<(!>uUyi;LbJLxX{ArzTb$3pU$?u|9N^yNx>C9iVE?>F(@U8jxdflB}(o_D_g?Jf1`XR!TW#grz<)vXM`Sr=={Vfzy%TY`XPMy8r+n z07*naRO>ikAF~}=Q^Hcs9-?3C4<*tCp@(wemEmiPjnNtU~8?`?lR#o|Wx}$PW3B zGuu=DDb-T6EkQ&pJw)`bKbUYd6NSrd?c#{dboAHvN zgY7OFPZB$czj-d>g z(d3Db$bY5vjAF)D`>V;&#AZ0Hn)^^BfWL@&C4{eIHkqeP=k!SQP z`?6h=dzJ3ci}Z}Nuqr~2g)6O<{1|iBF}2#5NMC!rIk0e0piB9^+r_0_ioYsfRyLqA zujG-+E>gdIm31I`w%&5R?>ufdkrC_*H;SCq!^QimO#La;y?kEDUfCu^j^*^=3EA)J z>&$We?HZ{!D|bfKF+R3y%JZhbVzU+NN{Dtvb(P(6-E$<8UN+QS$?MkN;gP%*`PUW2 zIUx* zk(Dw!;X_A-F*$m)u|`fr4T`EVzL&n%k49O=S!<<9D6{;@%8}M!+M1ua9=4h}w!3Bq zDy$v4CG=D1MsfS98=7ZbjH&bSQleS;kx8NKtj-&uL@Y-w&` zmq5O_I6;Q=Wf0A@seK2h@tKwYojHw{0N|Ax9K@98>5YpV0<_VgnD}*~9^@ci29V5f zS&oOdm<)_!JZI^~Fg$ePIG1HM`{_W4H}Pni##18Wr7<0Jk2c~*5GRc#Lsl__zWO!L z8Iu4`e#66I0(i)k{iLAwlia~WUrdQ6le*+-C(6lV2^VFW93`N=vZI-Mknf!&H>{OY)Cxd8^g>E|3K9v6A+kzc4z zzT9Ot@5pYsz&Y_qO#-r+KD3rw(wPi0j2`66SsBL-UgatBoR=IkDH!!t?`qg@C(L=k zxL57x`x-Tkq9K7mJUK3f= z`>i)*v%sIuCE7PsUk^lUk1sjHUm;V{|N2-@T6KWb@*NaiXxFvyoKGL{EuOigYZuXCLzWUIVJqE03s zc!dINd4_>9)c8U&S*U4DAWxBUlO=R?M2(1RC<)|LH)z(O`=w5XTn_2tBjFnn94}w3$kU3%rH(dj&xd4tmpZZpk#8871B`J zN|tex>XfmDE_gJLuXVlMP+#GqW!T>`&t7d$(Din2dlJL#XgiBc`;^_9jdo{UL0F6J 
z_Zg;#7^f@k6Y61$=aJd!p2iPP8>h5m2BOlhnY$YevWfP){>=b z>BQsmoqW&VjHNdB5h2VC$)Oc)mQz4JW%4!+i%V zQC3?whW`#_I2M=es2CUi@9w_JmrHk)AB?mPEwu`*SDojq2P5tIu6RsfZSmn7M~m9# zJ$-XXQDN~rk(Fj{w6;BgPgaz20NR-gx{`sEi@*a6xw6{UG-V% zRo7GI>WY~$T^*A(w%WsvUn)O#&Xtd<7FE0x`L^Qi@Ii)(rn$ z^*`UR@ZaW%a9Vi3{ZquJtF4l%iT0OvhB?Hrt%{(JRgu_mr;5qu9=oh!ycy?=)9=i7 zRsWO?tK1uSs-S5_R%qY#_7zV$+UM^K4fAZfdw=C(*R5cm(3^TFR5#Mzv~9P&i?F?t z8}eJ}-r$W$!u6`sK~n!35$+W+X8b%Decu zKrHoS5Q+S)mwBB(a7aEaI7l8%^pwuwLAij7eCZ)gNtZgbB9}|#5u?evg6rxfk8`p| zx>H{tk!|?pA-RKtDw^XXAR8%`csWihDsgg&4ot&Mm3F5lLHtCsm1ns~m^Res3mr&x zxl0~y-XN8D%827QJK2tx3Qlob#_%BVWbhV=IJplu*YRRG4bWgf(%pEHAd_~kbOJRtipxh=8y znL!q*MnIa#*IWnMaawLumlPV)kR;hem`2hFzmrvT=UdsL71D)O@+7YrbBx8Tw#w|g zdR$Uv1Zli&Jjg;lLnRKGrPJ|hduueAoYoc`(jkvguW*VrmUWq_{aHqBkF7@SJ{W*QBR8#4aY& zep%nu)Alrb1OYA4Ba+NV`XtZUgEbkSj$}7}-oir*tUhl|jmJ2M&(7ERw2?{l#YrE@z^4m2Pnxu4I402qxXCn~PqfzOZ7xcGNu!_E z;%Rb?R!pGVhz?yJmuc@j@zQy1UGq?j#&J9G+b4^&Fw1)ty~63TOO4J|ObfJawBPq= zacyxG)hhc?*AyKLWCt!5%&mO3^uh28)*1Ky&~g2S>A|2i(AkW+x-L*pdTEvYtZ!50 z<&vtx6%{X(*|wnxjy@DP9;@~{)94}FLT$~7k@ex$q4&$TST%K>@rL=2b&79;nZZ$^ zPwnqR=SwyiEgbV436Vc|ExJ{trjGZVkarw+T+8f!krBoMJ0|jp(Z90HdCfW4anjjW zOEi&oEHE1ANn@Er>=&%-W>z>{`JPp7jSfApmmM!j|EfvG$5D+cpI}#1dALM=@kfe( zl>g}{XM%B-nf6uVk~=1R&yg8D-)#(>>#82poQ+YN9C3~wuI?Hyk44&7 zlm(kr+BX)K)U+#I)uQ`Ew+>E|H0Q(7jUw;a^{d6kyc|2;ebarv=kX}j)T$fdBJ&}C zlW2+QRox#|!!t83Rc6+H$D8Plk9x~9d}4P0Eq8n5CFkwFR_&kq|Gwk9sRq?KBzdf- zA0OyxkZ;*X0bk2zCeevc$m1nm2j1rskl*AZ{Dfo$ZOD*lymlE$x*n89-kDELkgSm z$X%&LU7Qr@Dkks|K3%AzIFCn!6mpq~Df^@)UWYp+zK(0O^pcg_lO?R;aowu@$dMHs z#Uz2XJV2%nmP?YuZCNC31ZD zpJ0-}J%SXI!##53SK?@_v&jTmDZi78Nq0Q59vh!rlGC_(o@o4XpIoBY$dClON(IAs zw7~~;Ymxu?1FtFF8qDf=CCX1QYGuRmR94i>UrxAmqPNK0P_ec9g6jkB&8RuxPI=KE z&g?4_e3hl|XXh4NEqo=voqub1PszuXW2MKvFUoqCT`h^N=vBJ-R@3qi3p*AZEdQyv ztgK7c`Rt zE4Nn-D*E2uXdMasDB1SZs$Zh!SjpvgtfI;;B}eV8;q0>CNiy4oPC8!DAEMhkn^;fS zTcoe`qUm>Ta3s2#d(TT7Yf0@>&K~8XV)sk4iWwTQ`UghqHFHy7ypeCO4=uL(u)x@C zUzIDeD)NFo%PNxp8K!fqG0Kq;wbJ#eb5*s!s=X8fPX_;vWMSKJvO8vn 
zY>;cNQGs{Drz2;B#g<=QbieKxAN8$Nxwb~li?v+;IsSKJ)9ttZTYLW3OS^C1PU4WhoMizsbO+g;aCz*ayJ5D<`#3ta9!=e)l^eE*ThBLewwM;0c*l`1e0 zEijnUk&Ya39|pXK2^vz61qbXXM|IT1Hx!^L%|#ZbiX-r%Z2A*ju~YO#I&`=Un;0*8 z;w^&E814Y}!44h9BLMd>0}P=cFi6m$0(-CraTH67;Egdf1{MlJT|7htYT+$irZ{v! zU$lo676ih~Ca~pD=sMLxTg0D8Z*64Z}Gx=)MB;zygXUs9g zNF$}Y`nT*d>nV-LO0=ghqCFmB5?;`6Sc538>+TvUUBo!Ch^mPg?RPOz6lj}7B;}wO z>69VH(m`^=DGa5PYzKa%OH_$y_JX^j9+#wO^x)zA8=Jvy@g%fiE!i46EuE1zQUL`s zj?pZNoum1hoDn6671fZ0T(Ju6@EjQkf+l)k8hns0&cTOv(PSiIIW2;O zcQQ79T9911c~f&hfUm5$PNSm`TTLV_7NNXIPfLpQdV&cTP)Q7~q) zj>v$*l9>-)&aXdgH*{Tkc6*2m2M(%4&<=0DU6+Sw2c{Ak9~$^(tnaS-^*L#U#+ipk}Bb^`<`Qf@`6GFp?V&wL zJYqJ=W&tikb?2lkNhdYqT~&dPn6DLzzp#rNsTTXBJ<~FgU%9cWzx`9GZ>8C`xU!q$ z0P?F8$2BZ3Dl2m;pKl9wysOGIKGMa=*?42v=-R;NsLzPN(Oy64=QwJl=rZ4aEwN|W z4ya@48=4>&%jpm_$`e;%rSS+rjA)D|xQ^Mh1pagaZcwn9%CQd1VaEZgMjmK_L^Of| z2Q5P#(Ny%r6AVCixUp9hiacx(en`YU($EjBk$@J+!BkwwK19J6D=-3Q#7(G3r**iC z8mPcNNQe^GkpmC7z#UBxghaTb5INA}9zqd_x{xpk-I0ZwI1C*P5(AM#2f(0V7S2P1 z1)ER@rRb05&_Rbt+C_`-g8qdkO3?*TFrgOeArRAOCp0>R98|FV{0WBYTO0mjiI&|K zPlH=Prtd_HBYalC6R!)ke1bOnkF0+nY+_(s-Bw{O11HqW2wq;}P}KVx;{qybO!I#0 z_1!1dwYj0I@wPr*9;U6Nqsky!$A(^OwYSh0_0LhjHEy4wAe=R*h+T6ge!*ARn$c~-s2@K;*s#fU3lXPVl)>m3fJH& z8p2bQi2LZFCWy6Syk-{-aSE@{A2!jGIw1``;DTbTKyBop0SxE}hTcd=6-J=}+JGS# zv+x?Ov|D#x%~cRW^Ni~sW?y`zYi`_bKjTku53}c#OiO~Y`awMV_R5dq6Bp~|3z(UyAN+_QQ6ULvx|$g z#J_{bR+nF@e>SBXEM60AA@(@=t5;Lk5j^0SxGxmRGEXEW1jpE7T6Lw(*gcrcb1tPXv} zgDSEDFG^bJ`OqrehKdQTUYc(ft*C1;SJ>(XUo%T;JO6pk!|~p?ler;UxKA~7L5TZu z$rC20N7PxQ%D*s4TW-&ycgEvY-NXRB#1_yA%4W%Yq{|z1IA+UhD<`Uz^lR0xVjOQR z8t}$W6HFJRxpIO2DBD6yy!FO{|AE4&(?A-PMxISqFE z)zs7wYZ;=;5wo0vbe(v-LDQG%^3jcdkZ#(8wU4TmBc5dwU+)H@7_~t|m3u+a3-&-@Y z%7ZD<>7TxCFa2nBaWq4)n2BN{YKK5NNy%`>G@1!34vM?DgnwuYTGBg`P>L}69TN7S zIdWmbEL5PrIF2mT!Fwpwp7x-n=z!@+Lp^LlBU*&^V3Z7hyhTf#M{nGw`FM{C@fdo{ zMiQD*j(C7fj24d(f!fH09(56fDr!QpaDxR6AmJI}P>Nhwk&n6vfsMwBRwxF;U+@-j z;!k+O7qMWlV;HP7kOra_TA>y4F%?r$8zZP1Qb@*fgwR!bhn9S_d;;Cv2fJ(fZ=oY= zymEiq=tkIz(7cBIY9g$t!K!fQP_=%YnlD1T)~gq84T-5;EjTE+Pt?zWVS$FA*}g0M 
z{QSmQnz;D82Rb<$cDn=^pPS;1J>&)YDnlRnhZ% zk7iRW{Uc;tg^v7KGUkvgy+&7>%XhF|w3uDOTIR|J(f}s$QLHhK6{^gzM6{{}g z;4>O96aB^f`5AUce8CejlNs3|I?8^98Sbn%O{bkGr6Gu7R_aeq!c8=wo?3_~M}5sK z?qZg@OIrgk@lw3Q3vpD~a9zEk)e|R0Bk>x!waTgshg4SYh{p^Xflp{k7g30DXaXyG!G=Vdj~o=xC+dU`Gy}2F(Q&xL zhRx`MVzj_q7-$Fmh+LAHAHt_@>3I*=x@>Dn@Tp8M^+&+F2MKQMR#xYnv4-t$ho{}H zNW0Ph^~gm37a?EwW{>#3w0K2vMA_K#(DLQwovbay_{wn=7pc1LCmN;Stjl2zb0?2` z#_(z#v}#I}E{G;+5$1x7VZ~a!%?prDQku=pe8$&MIPm~1IPOes4=2i z8<#}S9o1viRUak?J<(^V)oR{#uCArk&ahzqoxyy^xt@XK-`?er+TC1VAFjolU+C=SKCbU{P0UC19?~a6jAEeb z276Ist-EfgeKkwb*OOl|Cck5W!Xf6cImp-MO8eO|^^NGtJ$OgQ3D&`|p>jPn(q}94 zs7U{f|4K%7(&?7;k&U&?HMpSAxLLoJ74u-}2R2(dfwBB2wVCzhqp_+9*++qZmu zWl^3-`Ecuj;`!w}N`6N()mXKd?||7cK+F)aVv6m(IKb|fRaH4D|5N`#AkCy6?3(n6 zuQPpi&JS+jn-W+jV!l@mX>rlJlIZX8SugV+lrO}3@ecjb9nE0J9<%~OKbnj>$VV&W z&_oh2VHh=nr*OwZC}M*+0vk3^6!PJWr;srdLy>?{^bn1(9AEGp&oBpG_(U8MmWj65 z4`+m<8*XC<;^{3;BMT-3p$2BrAus^Z2!cdCk%>-X1A^cUKTN`LoWn)D5YJ$xyI2F0 zh@+3NAeB;)ixP~388zt)YEvvtMv7P`)&Vq-2BH|7un8||F1{{gjY;Z#GjF&4UUHO4yhmM8$79cQP{yiOSNe=y7iY2(tifA$<(MemWd6QXaViE<4Qe?7u)D(M# zvsOrZM3UNEe4u|Y89OzDu%U<2MZF^yX%lb~r%??nVHKgsL=3vY6K2s1$FLkD@dBPO z!wgBh5HHXL$+(Lg45xpv9|72m)^HVfQ3nAu3{S*yctVC5K`@{f3^34KYy*T|OhjWO z(tH?bE?z^yV7i2SZ5}!x((R>rFLm$MqTwZRQ7N*2L~_=_>;?$Sm9p1rGg1u&Q*-|O zxHEsE*qBn0yIo`DkE+f5<8{T$}Y;D=F`G;lJh3F#jeL(JAD25+uocfiGR)*2B81|AOJ~3K~#M) zeBM#7jaQJVjoaTLA>KEwLCUr7n)4_+;JDyvch5yF=UhX3B-1=?6@2*(aS+8+M>WA$ zIKdbBq=@e5pd}yz55-d4g`%pW8S?cBEEkIZ8e;(z|2`h+rETmB>D&8aTd1Rk^Wn0& zs;6IWRi#$FZkJL37r@T>pD{oZPtr%7`&CAWK=vTRslYuAjj=E1y34DtDKv!7wsd6p; zQ5UF-L}z5u&*)mFRt>Mp)_Jm~)LYrBT&-&B;$u>c^IdY~{`!uN#`be+NJR_9QLd_M z>;?8Q);X?&EF1l^J@>m0Hf?ar4LTDsv-zg(+V9(a6WliK9sFPA<^5-^x0@w3bF5P= zxQSudgJc|r1r`8_kVHSR9>v&=)u>MjVgS5>4M8GJTtotnBNlapCu-q2`iYi!4+|9d z!576)uoC+bLkDOpKxa@NzGAqjha6Ex-O(I9#5{mvC>HtR2!I^qh;8_cDVTu=XoWDm zf+s96z(Uao5Wk5M13@e6;`;h+x#-^ zsaED$VX6_iKI~KVN6|S!>uPikKJ1_CJJ@%RS7(pwmO|&pE`s; zHm2&O{Ro$jy8b9{VP~02RlEtU!&@%%KVjk*nMVJzz0@3$+{*mP#O&+_2C(;(1S31k zjX&V7xIj#t)Lbx2 
z)J8A1mCn%>Orz7H0sJUkEyqfls_hW*w2bN^j83RN!WFOSx|oa#F%O674>3o?U_AcA zVick#!te<1F$?wKjYN3TO!@^cFdk#!LfhznFw##V6B?~XGc?5_NbrV%_TW0A(HHfo zFPb2Q8dDQ^;b#QFN*BljxpWAjxQ|J62reka00dw?Y(S$QYc`Xgj=$HnQoOgVS9>8g zW4TPF}@>?CWDz=xr;42*OM7*=B*8{g{HB%n} z`eE`vZsVQWI1lsiGn-tP*YN7=JbG43tTZ})QH~fcuwvb-;_QkQN(14+uJe--S3{Qg zw-QBETO&MSZ;kalzPms0Y#8s6xb)M_xVg_3Joc;HRFqMW|Lt1m1wfiH><+oxlp_Pv4&f1H6g1ibgU&A&5Y9w?~x)>)q9AcJC2qBTYz17 zLf83zsC<&H1q?Jq+KRVWLh0}pzH|bT8mxK3*EY;K~ID(l34DUEm`g&wX5MJOFY5>AmfmBE0Y(GRr{fLqWZ0ySv4m?@$WLX}vAj@X5% zh{H6>M{Nv&8FnP%GSU$#l3_(>{DgMUArl(@5U)`T7r4L-3x?n*t)Mdqa8X>m^hwd( zgFE@wuU!x%YMc%K?AO`nctD}oJ@-nFTCN`~H(fMyj?*!x22R51SA)&?r?E<}>W}E! z%CbCIDwTX$DIZAvQJyCD$!Ma?Ddro z5_>~?u$v8~F|bN9|B=GQW|4we97Y$KgWc3fBr$hnil*pIs}RnbQ*)HF&Roa3Qe)am zekf+QahQjAO#UKC%@tT4lQGM|O0XTv<+(oe{Mg*NBGd7|>Vi162 zL?fJLq8TKy31-;n2vs2##qbu{+LDd~K)pomNkn|V-)_QG|g)W2aI?NRt-A_7|8WJZ|n2dKY7v&J;Dq z!)Pk4>QHq!amA;?qAyxar&m&M3UYeSmx<5e<3w*{yBsyPMS$cbha-otlv^T_ZDDD! zh!5I*Tpbu*<{)Pf07NJ;7 zM04beL83jH!-n?Aqj|IfK@?5}P*5P6!Ga&q43X%Jhv+6MR3L z2U&Z<#tBz0vt4`=?WKE2Wq;9A=_Za~HERYlox?aR6|VS+=2IK=Rv3$(vgCyAAil#h-Dj^$dSmajdAr>H}lX}-1=iU_5dypoQK`PwJ3jV?=>WW*h< z9;VVkaYr0i-)MF5f<}rpxFBw6-_Z(JaR3TtU<%CQCe=qWeiFYS27_o2mf|!lXa)(< z$j2)b$G((3V_n&@)Q#Eu^QJk+JKgu_ zultvq`1zO5QUeq9wTAh*cqBx;iTYH#jj^jG*sYJtqiXM}zYm~*$bj5zkyR(FGOx|o z*NLIlvEMh9tyLB(RxqbD`8?iaTv5!J=rRitXsv6*bV#Jy@~E*$>tnS(Snc3^%HN*-L2oX5xkyhJl|sfMALXJ zdS@T5ot8cnWjHR$XR8J{MmUyYwE9$QnA4?frX#P|-CnOE*cf6Mug&NCd7io&6WK!D z4~n~JByD8gLN69^gVw@csqcup4$kU95kuuC8WWlPiD)T5RR_Q-MerKDmoZCDR;@j1p)I5!*ul@?#od z1NPEF{#;b?PJD|roJa5xe3|ru_mbvIH@Gi%@KkP*cFPq~vfNSH$^H08iTy~iDCQED z&{R4it_lhN(k0P|C$JVw)-*9pOJgT+3-#y=b)c&-Q**>oPqqcS*crZ_Irv4IM(t@l zC1EjN&I0*t)}FuN7qzbPADZtCrNf;m7efCtr*=!S9V_sU>>}! 
z{6d*Q>&TsQ9nUCO91_8F7l*Vr$}~#E4$+0R!z$@G&7}sS5lzEJ(S@#~xpq+Wq{cju zrixACrWS}0Q5UyGu2>=Jpco}+j2z)BE+PR*^bk8R0cnsBgoDU|4acCsiUE*tfCVET z0ALV%pdy=+s4*0PM!|qEss(F>R<*&@CB0vMu-~4{^(D6*K2J7&IG>(gHng;x$Y8!S z0ymBSnLawTHO|o~mU#DL9@)O_%C{;ZT18&H+&Q^FdQA0R;|aI*=2vD@<$mpkNOKKg zyKunP)6vhCY{_?7V7eS}Iq=``uHTH`>*Q@$_p1Bs13iS>anEt`4qb)Q$oRxmk57I5 zJ{sn7ecg|dt3r0mW6jl_1YWS&a_jPxs_*ubUwCz(>H!s*AG-k{eMDlAXfkc;xuiF~3nl@i0r2K{>Li=C2P5V_FX?P^`%timFZVNAz3wTq_q`etY!eRZ zCI=XrDPOAmwW5Tr-71c?)3r?^X9kV73XPxKRWv5r=tnV5=^xPnB~M?QLqZm^Jqc!5-|uP}nd4Gms}{!V^|ZfiHZ; zDB*)Ecu2jGDUOIun1>knB8<#5@&5_3|Nl+Vfu5omzN{&Wqyy4w{z@|I!r4@=M?8h# zfCv)p#X;3i^-~vXOVwa?jkuy+al8_7>PmH+_LKTZtzvhjY^fO0bc$|bHT}zm;4juk z8%y758Qo$ZnG27jCoG6xp;v4^tAK+t*-)*OxGAoxe%c`IvZU@&w~JR4%}io8@6Cjm z$-dA~il!CfFiz49#8WmK$?Za|HZccB)k)$Uo6E+i zk5Nd^PdajcX5xT7;t#0#xZy`?hkvDl>bKrgz)JkgAn!bOaxdg2s)6?wD;exfIf z#Z=8*Ybf4{DA5F&s4GJ8TKg*6!73ueJJg32F|Z;GDFD*&9+~2y7z2rhVHkoi00Usg zNTeYRMyi0X_8gaxFWIDTm}EZYe9>1C4|J&wu6-L*cC74GYDMWE_S+J43kz2o`Wq(E zR8zVuxBP%jraInR45^k;?$_**MQuudAwQl~aW8*o>Aa$z#x(s`y|o~z*tu**&^OO_ zVcGIEDVp6$Igy)@dBA+o_)Q-gSsuU5H9Ly|B0H8W9n)3i*gM%YOR?? 
z5N1l9g-#vd+=czAKIGnV9g-_+vMOn%mdUbe*3pIbO zj(SGhphaozplZu71Zmm=VbnILjnt#s68l2sonwRThbq-}zoMb-Vnq~}KbqgOt!x=> z7ZL0u-KJPpjrr104cI1r5qfk%G@rxYv%xHo22mn$>On0KgboP8GWvs^VK3+qKgZly z6I#vuX&8G;MRbclLl@S9yR$3oFUD9FbD=TFq&&)GDQpkh$TZr@8dFVr$3o~8wd6(G zR=i*mV=RXMjV!i?57wqo97VI9d@&x;UcARgb{b77Pef51>LZw{(6-~+!sA@9-hz`AZoo`&l|DkUB zZ>Cme;q}4gs%N?XcjIRd;Z@E4zI<*~Xh}=!snU7cW92{l1m_L9)$;nvD|YWHpVG%A z7fL$^f2m;%@X^1KBjtO!ON#AzJDd`9_2sVCimLXuKT^H2)@RjyJvQl|#K6StFV4x$ zBQFMX|MQVM!poyxGK0RrRR2li_YD$y-E93}%Dwd=%Pn_xn`2uxYTjr`>sw80w|EiR zIplt*Kit(y;iwv3=2B8+onG3jbfc7l7_F&$zF~;)bNi~Ffk)2W3~310ePy%ZiFe{V z%##TDLE|8|K~yU`x^M?96!Hu;oJR|_mtDrzZ9M5v<};~g2oNvyQBpxr!4d&~S; z5R%nYCK-2BPSUoqTt_`6S{cIkYnQbn1`7{o!=2aYf~B>_wvt`?!+2g^DU+_I^g`cL zXO+L|{iNf30b8wig+=}96sq&&*|@_}xRdRP`Xi1i?bRd^s?}1`G>Cr?$9HRE*+*%d z^q3o^>Bc8gZ>gD~Sd!RE{b1c)p2=NVJ#?}Uam=%)S30Se=#li%_}Tfs>AHI>zxlp< z{I!6zFT0b&-u_yAs^EM{Z8}C#)K1)lMLa}Xc;F!&p_3Sa2MEDS?H=kP2ft$-%wRAf z5ARWkXRsm%-eBm2bnKv?;Dp5(hX=4hK@D6HuDC~WsDTD(kKWXd>eCGjB91vw#3FRS zXMCcc;U^}F(|Aaos5eZQi&{`%6H8Ev9{2Myk_`KE|iB$L?6w@X0qS- zJ**Jj$(yCJp6mph#m4XuQt2wr(|c5-iF68{v{ej63`@gCilC3^f{*MmeWFMvAsAsu zW;$xic8cS0h~wHk923=8G2hL{%Q80e|1p`WYYU`#6fFMbZFp_oN}Z}UWpR!V(ilgP z=UU!b^lh!aw+ohggd=JY;R*j1Fu%TGKwBhQ4JE$1rWZ zs0kLsg6E_-j!b1kY@vg=m6 zS6AM8r;q!(EMs>}P?#ZTQ12^kYP2|C{ax_zVA=TE)ytz_V*J<9>E8F7J)HM&Q~mcf zLaSx9*IK8yJna(Wb<+P$!P64A@|HKd#NUkD^{!rA((}svy}4%!o-19eW{ZoT&*tuv zHzi*xzM}2!RT^eg0?ljWUcy&@gWp0LkK|vGFNU!|NZK|GLUV1sc!PX#opnYf8Ku#P zhPVO)`5|1c&O>4Fi`KUh+!#iSi*45BT0413*;VD1M}bQN)RT|c z`%|!fY2{B<2P@08-F&Zj;gZ0cI#M)eN>{_nPOFpUJGo8l1}VNglNCzGtA55np3VMa zb>*MMaQd6g(^{&Nw0-P5K8Q+g!UMI?SW|LUChNbjnRFNy^#>)b;!x#RWrfy`w>2!n zB&Oq0;(v~Yj_uM~dXA~O`)oFb>63IzrGo}f*Sl(l+D^S0RaQScLCLMkQ{bKC;Px{@4R6)}j|YVL-l!Lowp;6MO;M22h0f z3Hb;>A|hae1s0JA8)Eq%>@j9J4U`W_W%MJ}q$X+$B};4RsO?Bn+u27roE)iYruw&X zz`j)}bbR5S_HMk3dJ+-z!oE`DVi%5A{b&1I*{SZWdTa};SWsE3VoP~R#lPjdDxX)J zt{752+4`bnx%H-XM!}@wsioPuZwqf1gnp;OngxC(HOmKC|KRdaV;0X46GVpi0d&?owfQs@EY(o9TYoymip;A=#&Sb~XEMX@YT)Wt)gAOqI`n#GPt 
zl`zv&#zZP|X)`sT{j89d;WIl+c@)Z1D3;b?KgPmBBiK;-kNeUWdd>f&G=yOdTf)pt z6Svt%mVt-N#=27tE;GhM*9yHZ(BPcT=wW za<75a;$7TboGqV}7FvK}qX}w}5@sv5``dpEEvhyzAj2)yd6r9D>hfIQoE6S(O`gUn za<(|glvl=d?~HXB@p)$o7COfmG-K14$C0}u*3>De^Q3l#Z;KkWYIJilx=wT(kUKW7 zZt3a!iO)>W270-g6XehB=d=!L(Zah!Ky=VY-4eO6VeFgKgf|~MT}rs{@XFYN=h;*9 z8(5||x#`rv#A*eeuZ`oJ|8nYOedw62`^UPevbplmb+p_;ndG!cw*a8C=m`VMr&L6< zDX`%qPsMd4@O$ha8t`l$jx6buTm=&g=L1m+PdbiVX#581QA=qRB6Q>AX()5D=%3&% zX>`gKV4oyf>Id37s(nmR+6!J{TvswnTO_S@3=oZ(r}U9UGplKYyqF@LKJ#AcKWYQ5 zI}SShFv7T0jFS&Lt=D{|p3dd=4%%@#<2bB3tNYdV%6nUsmZ_H8XN!18ql!Q|NgJlV z5;ZWFJ<_toa5meq&T&FLD`q>meUoX4)ExlVQ~&?~AOJ~3K~xsEpJiA2D($1OVkw&q zi`oY**i^QVJM$!Vm>r@9te^U$c%nJ$#_~F{8*^cMd8E2kxh4|i!J?&FX#9=W!(z5v z?ymdObCr)=tx>2xWKm7uq-$TtCdHKQ&c9sXM193gnu}gE2BXlH4#9>}cq13{Xbcog zrE^F}C6=LL|l62uB4t^{1J5EmF`IHR*qFMh`TB1*OFBGnyhFN6;Du7yw!V4Vr$kemdXjpX&D6 z`G2meInQae;Sc$^6s|MLL!_s2EolH{K$^b`8$=nbKZT2qq8{nh6KZYors{@0+P<)K zl+{|&KL1(Xy&Qd(E$d;%%CyJnjuvPnIto2nU>|zN$FYF5CLhY8GV-9qR7x%RS=N`&Lfc)aPu;ccWvXAY@8ztLzl|) z6@#p;?3wob3dv)uCDyes&7!Sz#;H<1Fa7A>#j?%POaDL@z#I9rwhVE;){sSJM~$iH z7WKMrGvC#Ij=kz-_D`0K#@SO68l}ahE2;aFT|U`e=IgIvP3*5tE1O*N*%DAc zsJGOHw>9LX4A1D7@$1XP=P%+NDZZcMzc`HNOy}i=-41ko-tubKUTss`UWvI=_eGrr zA#cJC)VNpDSluUIde}ad=k+U#weD5#>76kML6$syTf~{7bXH_Yz4>9(lF#Z^qfqCm zOF=Zd&+0;=uWS=M=`*_uXT;HWWa1``haHWiA+WO+ybnGb)=K{3tyBj+OQ+~}^Pj~C z<3lcttMxmm%J7Hg0Vd7Vc8bRIR6C;eBv*Ab8cGjKKG=`);mRk?s+*u}rOrB&0&z=x z(CnJ0@}Fc;Rl8X{SG%cS*?+W`EmDKUM(Smss+O=gbrkNi#bhU9tKlMAYR6d)+ruVH z+xa6YNAJg5iW&L|%z@)lebH7Os`b?h)!(#zT9i_*t1tg0nf0lXSqk9QU`I3kBwmAe zsj6nT*#FYErSXo|&X0^Qjc(<4)D$E-QemQMfgSwj`gM3nBlN-~yuxfG!X0(RKIG7B>V-BKfdj~;x6oih zI%XgQ?dTkUa14hX_i-O7Xbn65qS<0LDrhMkr%~+40As`wQAfQb zda9$;3yuS}$(3y?SCy?U{Zu+WYs&YP-+xY@`Yk?lNM@(3Tbcg3rMcO;hf0Q*EGYe0 zv7~%ZWubkbqrT%8^}Di6IVAqXGi?BS#+FO1<;8Nj93m%44S=uA+u|_zJ7%GR4Xvo&nd73UxiNV?=oT6?di+dsoSx5t@EuM-# zsE55MSlxot$VU zBqFN9Kz5ARWsBJWrfvuq}{308n9i%w))D-O$ zj^Q2J;wa4GPZ39dXiG%~@=-{I=!ouc5&K}mG^~Y&FhnAr?}S;bV8yV*Op}q1m(&3| 
zeB`Yl!AjNPizdt$0cAf|q*r`@mzRAuW4+Tj*S+p5sxSAsUL!TLUFpS=C#jRu{{HgS z9B)o`$#FmD*4}-!>?-Y(`a7mM{2UqZwENm4vyK);<~J{&RH0eliveP|*#EUj(WU(4 z{9#s8X+N)q?khYG>erY9-M+Dh=t-lqUVPu3GxovRc&`_OlTDvKCiJMBUplq$md7pE zIZjSd$&tfrl?HAPve*1g{w}*27Jm4WJpap`L%*M|c79C4hSzTI{&pGe{>`~Vm%SY< zt<9}&x5#bL#H*;9N8m0O#XZQqmFGe)SFg|h%e;1b?ET)TG){f`aZqx~_lx6>#M6iO@Q3`FAz`wdna%_kwBH>(E z2Y!J8Bh*u9&N!P5Kixf6s6o$YYvBv`-MZdirq`Hb{_R`Gptq(;d0B?NxU1inx5E}@ zbW6*!`br0EnzD(v6(ypdT1!NbhkAxR#tPdn@~>=#wm}V7UgD5=r3RxeZ%z}~Rb3(v zl{?DSrIYeLaZ;rqqfrhU?SYOS;;s!=YNym=$z-@1p=IAgf(iZsB~UpG+~s6^Oz+NYUU%bxtX`$wm9 zPDd-twPf>>1U!z9p+i-qx&kIRR;1)81ebu8EE!4%d+{(h-7O zoIp4#z>tQ<7={Yuz>{_Xv<}WF!XkJg0As`foS}0x2>E!52Y3%gOHhF*l){F1(EnI6#Y#i?CFzai#m@60IbKeY zBBaT3UEPTPW8uCZb3EU_fxnOQysmrY&X8>fu~+O}dsC}EMU~o{7`4?_RIS>3)~r!# z?-@IY9TAZb5s{sHT-SMge|VnPAM%oCMK#e-%Br@v)Rmv|9j?r`R5!F$gSZz@ps`dE zk*YTj!6g!ug7VZ4)#w6M;z#@m&FDTQm<7C-tC=(TX9`gP`~n-*eNIt!-ork07)kts zd*L^nL@(vx=hP6XbWhZWi!$U7)Repwja;0gZrF#pC=G!$dXI9n2Q5&ZmeOUa34uV2 z#Z=6MA5TCW>f=AuL;*6mG3Ik2RfBp{0~AsHq4RQ8ng^x#F-V37mFOVc7(su*#p|dSiXdJTLp)xIzwiN|-YAQ4R0R&&EPp}(RTV9$ zlTk(0H>1lQQ9iHQwHnVVh{9bmvH1tT|g+eFv z?Q)_F5x>Z_2y{O)f{i@|Hw!{s^*s&U?0KF&#Qn*AK|M1@(f<4a`KkF)d2@4$=T(0D zE$N5kP0t@BsD$7*es6z#ovhV14|%uM`>%|`dwW?0)o1<-t!*w@3Q zPt?oe2`|76e2|B=foP>pE%21N^P)QqSky5eJKZ}u|WlJRrcW*m_uQrlPE1};v9XWE}r_v zWIp7IH5aO41#@^5xgF)T(Ryp^64^+L75}3;vZ$6M`)fV97K+gceB|CVMJ%Nea+JD6 z4QPY>ODOpkE*YZ5`0sJ_x37JFI-^(C*%FofPWn`~+T~(&@*eV>^t3Sg(GZFcj0#*A zd?F%0;Aw1|gdkBN>hO0#AU8 zrcykfQ3^fAeB4DWzM&N|kOn_WK?v%>hU?S>|HDL_#6%==6AHp}9*j=-hj!6sl!n0# z5JoRB0g;#vg;f}g5{7YD$&Si|6qP5|oAn0zIJe)CGY!ip8{-vQP?9^cPFC z!z!iJG!=(yc!_K3gz{4kb(hDeKI)^YsY;^*Y@CI%2;{fQqZ+GGP$)u&)i|D|rqKix zp=0V3x8kGt3Dx)wu3;cRo#+9Ipf}FK1t+DW7=_6h(9n)TV8BOcsDW6rBb3I=yU3?o zvJ}ljEApZpYRT)Afn04jEh_$3=5EW|6>nhIM(YKL*IQT?;R9Rg4bpgxaH2dl7Nrp{rqU}|sH^rHa;d4~A#Rh` zehAsr!_on_=sJ~x1@$Q!zWI6X@yJe3&)Z?WpVm2Ve?XkOC~j%ldRO+zAHWN}9do;I zBlEhqd+t?tDY-Du*BDF>JbvE7vNcaNCHeC{E<--lStN+A>KY>8@&17g^hWifLsT3$ 
zXdnG-{!5?b0ryvv)yKS=stsakJ9X3?^o*`jNpEwrk*bCrw3!;I#xzXJ_SEM;`PaO& zo=9_8!FA6q?_J9#YnbIPZKBl0LvK+tOtp9;znPi6%0&m}`~IV?LMs(V1B{>C#c4N6 za8^i-puE5nRVPp6%XLTOFTy^ea1&5Bb>@09uw6G)S=b%7m|?)fjbyNeUMJW;Xx*DzyS*s zJ;H0g02|V1FxKNE6~#hSMlfn27Nw~(l0y0h^tY`J>gb;rP}SFKon}e4#p!dUua>4g z6b@aNn`IS!gnnFeSQcn~w1;|_v}zOdi+UB^ZXD+GYL%ynC(<=qt0S_tX4*O}RCnuA zU#Q3F_4J*3Wm!?n*N$mIYpZ9R2=ev_Se@S?F{#EDq&(X-T(Al9x zrNXJI#J5yWZb6u*ETalBvWii-c3_*3l0~K!zZr3?@SKoqVI6J1+Z)@~qdn#z zliToq{=^yh2P?ryMFrfTui~0k%yQB8hHTPbo~Y)nFtp;{kDrq!#MR5%>uMLbE^}VN zY*p9)lE)W@*46SWlC45tjVh=j?jaTJ~GeDRTSg7 zym?<=XH-?~MO_)H*#ZV@Lpjv;gR6w|N%}oY8(NFUVx@e` zuepv`qK2y@=%k(3UTUxX$5;(ZT^Xm3)9p!{3V!vBweRvgptqJj<`VB`$2rRx&Cti# zH;YSJygI6JXs-2|g z;*H!&3G$$JP^%&Rw3)IH_0?Z!0rH9#uW9;4>lgjD_JwbW$3j^4nPrXUw$`>Cc1f?) zZmdNd4W{klE*+$yVvLZ&Ar1$yv_r$MZ2Wc5x*-34^>;u2Q-P_V<)Cl zC|5@WZB}P_i8&L=h{6pX$lcX;2tL6tXb%oyAdgU&l|QxP(|iI>UWPp=jssZD_nA>0 zy=WO`!@zl-Lscmcr$i~SlqRYLR2v~wlPXdf6vYxgi7WUWRj3nn$4UMXx6l~xX(S@i zi}t~Vo#HisD6*h7G@6GR;-LIQTeSXiDp~MYHl=@Dqg?;wkIgdqJQa5rHd4(MfuTXI zLptz25vnbFS0*(zwZWUCAJ)Hrp3~Md&e$FmS23yb@$wxb)|XmZ=u2>+!Xx}Ds!8UM zu!t7S_P-HeIidfJ0PAhrBe+DGRv(F|K#2$!L2^A}#W;BbUvSRKNQYDVABw>ci}4M) z^cpt6y9M`=s(JvP1lLlO%CGD>hs>M`?veJ-pR2j;err=6d;ZZETAt`dEDf|Y@f1;( zde&&!JZqk3kk#w{$(W*5;YT9TW^=652hjJf55`ut)clU4c(RHzmC8V>_o#Wm(oU8Y z--}ZEf42MPX*C>iMh&WgbeyFmF;J{xC+;~K%6)Q1PPA)TeyC-*97$tDqL^TbMx3gH zIpR0GHk<3cu^fNfLLJiauUXkFp&A;&g3P+qO@vb6vJ`T$P_B1|`)$t9hcZe9pAwP)48`f&Ws3%3-A=mEb=t`m%`ywrwpkvvRJvvWj;mLT*wL= z!BiWKaUY{_0XCdL05TDQHvsN)4B`+@GZD#6k%}0aO*aAFz$f9N{n!mB(h-aLNJKpT zLM)}AIJWQz+@@pbhIEdht%$-OxPUnH*#xx z2tOfEnmXYrPbLrhsi$bkSNIv8DR7MHNIATXUWoQQ182oWZjX=9csf?8pE-*k^K2~O zrs$yl;&FVJ*K@A9PCe&bHH}+iHgwfr<-))M{9v9kU-BN#!Ew}9`D|Au)KRWu*5W5B z)r{ir*o%o6rq1&#?=jO*CAl#BqZ?;n9A|JC?&B1|0r9Gy_0jWwWj@V+gQg5+YZM9y|QJEEz>sD+SzAjkhjq3LL0-ognNQM7da7j zBj{hhvd(D#7{3RBTS8{}6%L3Cu4D>P(`^qpq7g_w1R%&4{)I*p3)y;rcpQVeBSZ&o+(|Pe{?+6J{I`- zwX^%8LvW&hf8^Sa*sbuhWb6MS&9c*S0_kF?^(@ZP&pxwI0#2W&a6;HRpgb&^2YGO6 
zp)jD)J18~4-2hE;L|!cv`SN4#6Yct!iv=5t%=);%Q_c5jRu^-aeJGERhb)%-49^7f zQNcoQZ*9)EZk}joa9&Y0)Y3~`63?{KB9tWkVWb+n%m>DLHCFXfjVOycD-sX&|6Kif z6OZv;K|91bj$0~Os!0bO;y=B^+z$$l>E&e}od|es*{|jM9JG|xf6^WZw>0TL+A3nm z5X0po?{1@)vC|dhK3=d^8!cQk#@bW&vF=mNFbcOVhxIzLtM5JQSMe>ljbpeq)#k9x zwCvPh>KAppHd(9?3Ho+14Li;K-at=(ts3oBn|%BDbq_dOrC6CIJEo66g^Iz(xPUfQeXK zfCuw28+A|wH}L}Lv>vrE44)BCy-*PbG50|x(r_Hw|Eu!x8+AsomSeq2hXOL}ras1J zm2IcxuzkFKKo8QJ>Rq&3`e}Wu^tbw3D(J;!f4N9&DF=yD;+Fc8|KN89%=z9pbGP}c z`Br|@2FtP9L75_E$baMrX_cj9YdKZU(vFDP+W+*ndXfy1wWL>0k=w-z%##zfX(CSB zre#ZUQ?E+Z5zgt{ho*2p4CIsi6-n5~%Ms6Kc?aUuPW3b5cqey5oEoKUh{1Zkgbe=I ztclBLslMYiYAu)KL*~z%%z0dbi2CwkzKf$g7){k5h{hZGg{EOXI^r4^R>ipoU&dPe zY4+pd)CO&E&bX)An7a^xXryy>m9G}^2ky;@{Fqm%?dqY*RKccZ7FI*l9~{k5MmzJ3 z`CJ95-Rhuu(d=ym@I_TljWye-RceLVPJQHiynv&58_(uNc(0DAG;YX!a0@QPpb88Q z!xKcHAI`zRK-!E9dLz>C5zTNB`P5a0P$Fv5P)rlO#b%L8PO+5+K!`=+RN!&nlz@*S z-mFMvvY+S2X%(c5$>yL#+I>j+uKw!4b zJ>T_VEa!8|f3#l9QptK+7Sf_@J4K{eE`87$|0Ao25>Mn28Y4DQY0*h+7HJ%X zr~HDvn2nL>ga4rlEv7MuSFM$wN;mE1HZxD{Hv1_9t+0bDyN=}E$S(I{((7|iP6g-c z{`r;VRlN&M4_xZIgj(|f4+&pFkR;5?B3FCIGA0?p28o{XN~oRs zZyaL81>CnZljW^3q8bgCL9&kA>bRxfw5+g=wLj5wedp+_=!Rc;i=Y_lC{Bof^#V&j z>vF#U$3n+p-&*!U_8dJ;576IgCA7sdSO&|l`dWO|$y!k9X_)OS`Mh%)wjcH7dQ~l7+p4vZ9mNgHIW0kR>91st@RMWd4TW%HC=N5qn+4u~&2MIw z@-^R^{}}&jcV(V@Eq|2tWC!_9p4CRk0y$iJBKnHK!Yd%%GDKdN=e3pMpnNO>wB^#2 zPvu0(k~RLu(R`R}c!PL8gBaKl#^E^4m3cK>ypkuuz#ue&i_h>NBvXCvg$SO3-6#u* zN_eMAn}abMOH>Kf5lQ9*)fJ~y1h=6{+As84mO=m#P2g!%+$0>sO1$E~a1-N{MWvv= zY()dj?^zh8`H&7$AF4=`j7!p=-iyMxj1b;{I6Q$rhEN2jBbjGlg*wLns-MjNc)U5- z9Lh2LFE*=NJc)zMJl?=Tyc)x}t;%Ev>#850RyE9uypON*2TWvtu7^atLohV1i&$i$ zf;bBez34p};|hBaiwn4n2&`faFDXmBr%0}fNF3pCI!=4cQ}`2WGs?Th=CD@8c1?ug z3=P3y*DQAh*X6te1?>8(;DoEMSm^)4HpW^@gv!^rmOUU><=_9Dm-_b0?^)-wx@PZ= zZymos;b?ff5{JWkRrxz|aJfyzDaaC1ORI`Q+&%wO`r)+q9B-~QYXmwA4=u4#uVoqO z+mTPBo#-!5${=xt{Ad;Z&uDGD^#tqfd8MTRZ4m3Itb9a;=|_sfcT~uHVwP4Z_#bLh z88KJfMLY3IER>qsYZl{`W(jk+xxwpnfA1Qnda5&Ox@v?>&Y?P%c=c)W?1eMv(KEJH&b{bRj(ur>*OaeA6LrxqlHO^}4Pm 
z@Uh+CQSy7uVp(O>rA3CHdyUapD=D6eAr_^r))qOxIA#b-!42c8nrAK*1NkRwwAR&n z!v3AK%W)KgT(QoZ=vm}GfIj>bMZ{+FzGstMtR8wD*7s!PPvWrtRff?jTE^W17umZx zZu_3mm&qbpG%jPQSRp2gL1tCntx6l=rq#Q{=<2>i9tzcCEj4k54gC+f8?ijw5-gj_ zXZ9e?Eg^4Xzv=H@Vl?)a@ou8ars@9GSZ%vb7ZE|e;L`;p9mF`y^xG8%>z!xEkN&DQx8}RFT$` zOOz0y;<@Nf4xFZ3QX*MwmeKMJy_Ge^BWg>5G@gs%E9D`DQs|}z1G}jg7vKdS#&Sxf zaPESic@O*G299zMUV#8qM-XfX*7!A26sdO^+g4^cmcOV1nP5rJf-7&0GHKA z-faHNP~)hp3Q}FfQEsIzHLI!qNarYZU8GWD^p>yHDcXxMayh1OlDNbP7{Z3C%|RT) zw`rxQO;>SEwdBR9g;bVEz!{39&uGF2*{Uj=Bk_`ps)c-uXYfJQkxx?{+(j!@)X&9^pU6d1`~xR;(`BTi1~Y^o}IaD2`th+6}E$i|M@@)mpJe>>J z)!M>7>zvE%m+U)jFEiFwJn6yLEk-jgR+wImVY_?IsMC%gy@(5z3E&w zJ>qmk;iBV0MuweEKAiIL(>-@<_vgGitvA-|Y$+bnT#M9uijP_ai<4$sURxq~fbwu7 zW41fR6XczXTgG%ts_ndW913$37HczQW!VEi^K7m|7cmu;&9~+P^BNsRI3}T`TBXj> zTmBhNdW?Mf%w^1`>Xz!Rlv-3U&i%waCOs`LpT^`Dc3<}y=F`O4S#`|%wO|cm!nTQv z2)4GhPC_o_=}k};z9jJlFiW8Tn%N9BpcxvzLG#4p1v+|$!B()nz>A8xncgfkb{XaY z#9DUgn|;dA7k5`nC@&(CMd6a`>~+kIVvg3vYmw4c9h*h6mMLoDBuc2W#xKSaeWPg0 z=gp;V;fa>T9p6cR4m3|2M;#aScrC&^TTZaDC?}rlTf3UyN#VFz`?6L46nL|3cxCgJnP~C0mBZ_eaGr@=vli7<_##A08 zo)-kU@?5c62Yr>Y7zxw|cfAhpeotDVwE@WiOJ-gk)NN>|ip5J^D4qHH`QInb+nBp5 zsd@6RGzl~D0$E7M2n^?D*o^;>4jUqACSs{JVo(7#*pP}y&O`vJ(htxu7cPV#f@Z@> ztq=ek&*8yH#}6om2wFqYsNu+$UE~(3gL>*mbED}}L%hFvQ&py?g!Y~1yyu~JGIiI& zNpN*B3#ocvUL`%$nP2cz)fH8EqWO&ep+RIqqeIw-X#B-dG)&Z2FDRG7L`yM9o~D8H zS)8Kl^1S>ac8R58y1Y(Hs6BnA8Nx$_rFd$IrInNKrCFy<)et;V(JKf!Wb2( zfF0^Td__lgQxJ-gKU#^Sh~&a($6Gl^)mPwcxQQtkqM}rWs?7(@CQ9dQbw>?BwCbfY z%&+E5wbCpw_nY&0wi(9h%3+jMf_AZ+XQ&6}3sqb7SF^dVS-~vF4b@7n3{rRa5l=@d zd*PtQycz~d(;PfNDa61Bxd=xIxM&3u5x@hGgiaI&3vHr3R82eA)|;&Ey&BC=AKt*mGiaZkoPe+F`MMo z$@g=`V2U{rz4;3|vu^%wT2zJ+>Tc-1{Gq)iohR&lP#;POC3qHN$YP}LqX!O7U)^;Wu%+i%{P2bbXR$57^1Km z)vyv8Ulzq^82yAZh(MfbpxT+2xuvq1L(EaCqw&3Jri2QI18HKZ$}sDgKbvjM_iDqd zoOfT|mwNH_#qFnKRg~&VekH#Zx}_Bfyy95R{|h=6a9NfKSsifN>ku`q_3;V$S^^Yu z=^0{RQ^S!Ai#r&C=j66R^Mbz%R-;V8bfX2dyddL81g6}~UTE8!{hKki#NaRU%{Kwp z3yvC-ggy75r-*D&P>Aa|imSzR)>@W3W0M$YWSX7T5$|?y9dCcyiGgN)ZT$%w@jo~3@1RxMV4Uns=qjl<7g!?IFa 
z>HMFiwkYp+(t1UEWNRl!(k;s=kwnw!t2&{s8!e3T-X&&9wSxO=D=Z)FpJYedfrE1&z-JeE0Dp4>m zBQT>J?Lr%*@z3arqS!)z!cCu02@aG%HDvN~UIPnak&5CtLB{|zK_Xrwk=G#`zo8dm z@Cb=W1H*x0@`?1tw17OvVlCPKi|v7S%W^|B77gWf2oWX6&|%!u&d5F3O`m8MeHFDS zgb1Br;4l+=*{eRP=PJ~kr0SZLQ5jv4j%ri@A?J%FB7maB1aU|tir3jdkP9Gzx}z-=?IA`uJ*UG6 zM-=^sP~4~OsDVg~fI=L3kjY1|57%&oHUszt5lDoK&%;5(IT00jzbeXM>LY&U6dtXn z^Ec6272`HMohFL|h(HG(s=6q}-ysbaUdQ=nDRo~JQ|adSYMW6{{bwFF(#=k4gL70XSJMoaxzv1QzE{spR@-@_idP0r;PvV+MDPT23$Efb z?!$w!NI`Q{;=%|<8l8p<*@%J%PLj}YTWmrLDlE=Y5K7Z8G>`U(4D1v>vV2;ZIqrIuCJ?f6xRE2vSp8H;1Of;sNPkE{N(5#a2C1Y-WtFJ?|Lo<_$ z&I}Is4}lxl!> zGz4F;ji&nX{QG(PvwpcKD5-ZNxcW+McGB9IA9ivf^tI8YAho`)#w z?c%bawDD4%LVCe(UI(%>-@9HyWL$MQ16L=k$+H%&_{G;;>*H&z<0e`Iv7FY+QsmWO zE1~YnH;yv})jiYIB%?OZEWA74k8>fgQ8CI}ZJ?GA5ps1vO$bW8LMl&Ox zmy7%E6mNBi$2Q% z@@Fcqr-<*xM#m(*k~}Hhi05bir>$?abAH7v9{DvuvrOd+R=WsLp;!!$3OD}8Px*@F zp?=Xi+!kyR)|Z-7yCf5|gW6^NgRPRqv>9@sc2oa~qNqaCl`uohqUL2|u4j3Ma9jP9c(~BN!SwVFKdm5#rDmo3IM;$VEQVTEl;+ z7}|>tR9@T=abh`@AxVuyu2@NL!tP4O3NQ*J^8zQEwkiK zDGpe|Xdq(vGcCsmoJL)`!nI-GBTvLf6rsb&M`^JZ_0Sx@qb0JbBZ^`ewMH|<(gm7@ zX0#9cXbW9K0maj2RHxOn5M>d9x`@S3xCI~(PmzY|ya?@)z%Ousw{r*n!|cSS8mHos zh>m7q`0)i5!4*+NE##w!RA-cveazwN1$LN&%?CVyd-FT2WnZ(G+H00DYw|;*tNBp% zRO8J&eyo5EB z-4vJ_wA`}H60Z%(IGP)iJAxk@J&ka^m6oJAjP7QjYUm#6ZfE=|zN44qPmQ=AYSVt4 z$FC@dM_2_5_vARajXq(Dt)(_xmUj;{EXLd%cTTVTX~p}NYFe^uxoc%_R?=+ns~fll zUA0@fl`=3DF8zV5EH}YI2a&=aexshlqH3t05U3KBhemQ!^$su9kLGBlnGd|}Ras+_ zQIYGYUd9%qgtxqFC9?`Qm#F>fM>RQi$*Wm!Mtt}#!Tx@^Z(-|RO`}mViVG@EEVAEl z)>RSHGS)fH*?XY^>T9cz2n+2(A{?|C0j#?+P~9Dr`vi}@rcnX~rWcnGZXQJ^WaKRO zw5K|sixxC5swLfZTLZr@=t6&5i;F!}NUQCL)A!;({flg_CA*5LH0uht-54pto$)r< zG&)XJk*s|c`(@c2yZenhT70n%aIEE4)Idx%Uell2CgTc|I3qgACfeb^jgAOg*T8>m zxz#Z{C_XG?GADeamA}n-WDFvI=W6fRi-L}t;SxXiz=Fm^cg5)G=isls2Id6kg#YlF<%Nu!kC= z8X}+&OS|BJ4UK67oT!TqxKD*p4|NfS5h$(2YlFlM$9!$7_}&(#-PHaOv7*1|EWQ(6 zxC>=cTin4W8p&Sy8I8m$WKuJ(BO<9O#&cQz4dcCJZZ^vsOU)(fHI)(>5Hd$z6@5f) z>MG~UP2wp1DE+i3EnAk9hs8oIL%X2u)^fGiS|@G4+$taF!}KLu8%Z-PMQ9Pyn6UwS 
z`2fQ4GdU53ZPXljNT4T3KntwEAnXxy5e|pgMBm{EHAPqSCq>VZOj*Y}=WaqO?!Fspv%boNoB)ft|KNEN{=@r3iZxLR+1#|5gc(y)rN)IeC&LbDzF;h5@x zyWG-Tj8phQb-@cv#(Y%AUha>ZcufTeKsL@I4#XP}gyY1>#5dUx&uAc(z<7Et*U>Ks z5ElF>7RYVXQM?fvy?~R7l2;T zbo4f_8&gz_S=}t7imAo?^W*NXuf46FeDYnHWO91!kF0%y*ZGy8;8y#}m(1B(x?kv6 z*BI+yJVs%RlMRpwmsSX=aH-eupt>g!{@yR1yJ%T(&wC8P-gW8&;<=(&37486EcC(s zN|p*Q^z|lvExgt?-Fn%|&U4n|)|W*l`79LcWPdHmS7voHvTfzGBU-A(hGVLsx+|Wb zK9j7>`_)L7%bnrAl{epY*ejii($4`A?w{tsB1#b^{P!=ySh1Q`0 z>hU_1M|IRh8Kh$%!e}Y&LN=X4C*-3#GEfl{tqCp%=w?lD)+}7tHtSn93E;sTQ{vkVnN* zw4zLUMt_Q1;)&&n94{8i5Y1OBCF^O6#Cz?W=FtafjkRi8sP?Z0IbGvPTn{f1%|($& zDI$|@Ad-jTK1EY|oWUeMhGrBh8X${zAszpr3YOpjrr<06feRRc-grr?kP0{bMlmSH zO<}_oC~8HCe4b;mmbV}kFOiN&5FSJ0#`qDBI1-KF;y&sX_MwJKSEKN&noPf9kTF8t zq2;0>MT(kKQA}gWrT81)<~!8}Gq4zE9VYOp_3fT;9Vz zn1OOgMGEiWcDxR$?BP&u!$nnR^_|L452zK*5<|o(^*!H9yOkc1x+n0n&u+ha{vo!t z)`9M72ooF3_ePB2=O-LX?2WQp`G4@S=IqQUl6^qU=Q%18Yt1JhYDZOYhTK$}mzvGF zt+M3!+VgYvJgZ{TjmfKSjQ;H|=OH5Dh`0lLtJMPc7WGn5y{K+y+`$-#WH}xgf zL2{BfEk}_@juY!JS`b|}o4H?m`kPfTkbP+s`SL&-fum{>-J~q^<1y;FnLxWlH8P}^ znv3U|zL`VPCx6|MR{HY-V)2%4p}si?LlErhWh5d(mXq~3B6zQJj>|vH2T|Iy zZ$Fqvv`FtG-fnH^j;7*%&9f?Tm!L0nz%tO*QQPR1UVl~AbIO~*4c%;|X$vRAAVFB$mn!C!$dh(rS=rE?bXRCGJcIS)~ z&(%6_4tEim@)jjH#s)MLHMQnAtu7gN41cpH<%xIBwl;t3BA5H~PyRAQ#`?YxekUT1@7+uYgh;!fY)-vM6MDbMm(ZJbQl7`jjYyH#y%+^eW8*7chUoU08&zNWl!3DE- zeoK)f_WB+OvWA_T@^$dc;l&Ckh0O?#*<4{&rxly?$0vt>9Du1@1xHbXD#FQau?KI^ zoFniO8lOZiGyt7(4^{98tr3nYT#OI!G$X~F&tF{qy=Bb%)(h_O@X_kfDOHPctlgD6h-$xFdhlD>-1R1f{gpMnrZVe}1)c?&GKC=OsDV&On6 zY*n6_5)HO@JL5d67wDgaxH|4Z0y35%MxgJfN?*j^1K1 zHd7xOOC_le9T!W4gEMiLvbii<4>_fv|yr{vVg z&Q~9ea^|1<36UX&TenybS<8qRIxC8(cy&n)G=q#@-l|-nVpK12i8jik{FS%ju8KAP zF=g6<&-Kz;-G2Od>+_bXraQ`;W6O8;w`bH|QKFo;Vawtbn`D+Pw7{o40>oVT6}Kr% ztAtcIXaOp#6mJqzUE>N2_!;F?6jG3(dGOVoBK|=UZzOh7C)Wcm8Me@TAa<4*R6x8r zG%afd2Kz403NqRRR8KA9`8!}{emd=R7NIw^o|=mdI78zxfp_v}W1z9fJnQc0?Tym; zX&#&E8}9J?!Edr9LpbF;;8RaPu5bmw7Q-N`427e>bduU zzE@T8{31=QoHmf#({}30kI+H%@$N>du>|d(*7uio-;j?Q~j-$dmTt+8o(O|6Ts7 
zKh$bybG7QSi8vrTicR!RzL&Sf5$%F#Ohc(;-U&}7vrK{A*sQjx+VqW1YuB_EvKykQ zsN8|R^pG|g;mXFZ&Wb0yWf?eu}_lUW~ zv)jF1mBRrfz$H#=Bdl)6L`TtD_Q);e{}=f~<;hi>B-G9*{5kS#hmX0*-}sxkjL$)! z6gqPV&A~OAO4E@G4>=k(3P5F7B!|<)EJ#G1;3&- z;!z5TxQBSIg3@S>r5J=a!Xg@qUEG#;@kzv>D_)`tR$~Aju|zul#3_`dzBqznyplTd z4eaNwxXSk ziczcfC^c%&qV}e!+SIIFdvAi+Awdw?<0f~VbKdWH?m?t+0IX=x%&Q3a(K+tz^fv7oH^D$h}ff zSd`{J+HaKqYQGde$>z!l&2OmxS>|!&dF_f*IWHN$eSMbOp5^z=yHp(LNyoq1A)bNp zSgOt8kFbuCu%EV*m9;@-y_@lXOx6r*u4T2QMEPi%k-6;sn2(9s+X@;LxmZ7D&_QDi zFV8PC6Ll88@tI`}RZm(6`DNKRS5V3yjcilBsN$)FGcjL+3nFvPU9A&r^Qez-Lb~N` zrrmMR!58yW zV}~9TmRFkX9Ty5UgGkJ@?xHKQ;W{Wy6$d+Yw^f^Bza`vqowc17^084HEbUf(iypJL zYIAzepCMBoD72B6+Y;36W(V7EH0HyR$i7Fiwt?kKQ#F;YQU`WQujpw^<>-ZBCO=%K z2mCZPvM}Q$J1<|*8;Qv#3Awx_Kd7J9?i<^*lll~op&vjB4Yj;gdP$YJk5R$t{U|^k7oe^X!1o5N&ag>C#kjopeAtDy|XRn3^il!Zq)j*oxvj0q+g0d7Tb_g%J{ECE#c2EwE}D%$q2fN$kYdc@BF=dGX;XOz=TRi-r`Sm9 zhnwD>#!Y@eOVJKfb<0W&vhSFI~#3_XGQr3c6(@L5|PRwU}$qyaqJhh{DT43)@ z4Ol#;QAe!9Ai6?tXgl4cqja01DU9aQcC^5s^dG&(9yGuh%wjh%2)QuBMEx)lpV1d8 zKGA#{NFQKF2z+@ImILTSHBlE8xeu1pUAm53eihfzlQts*&(V_~Kp48C1h44{{NX_} zFjS@&AXsq=-B1;?aR?eRaTfpaoBS}MXga3iI5JR%ENn&!%(#jiJb@JvaG?`?a2p=v zq6_ZgDGW>jLm@KgEWUw6SK&Z5E}$w%>jjGlZE&ohenZ%8oAyb;UA3WS$DcD8eG^m*V@XiK%7 zP+rJ&@(N22ID0t%bjG>{_}#F<*3EDjYqh4O!(2J8#oia*D%x`+i>DidO|Rs)%q2Ob zLCip18cwbGabAU1gslwtH^3*nVPLXW{EybR6EkXLB!BfUO!!=^ztLv8C84Iq^jX56lrGFeF7Io! 
z(9dX#+?|av+88Y@cT#>zevc>rzC81`k+#-&&X-%qSXE1#3bXt*ZA$G`;Zuqp$cLyL z?DR&z0gtdoOooB}@=Jsm#oE7cX&3beNTk7v2^p-Hnt^cXi#!Wv$`;D`4fCd8dPLKL zpFO^T1HHBMma32U0lu*p*gMHn)ZpNWCO-@aE>xO%>)E=B-Nc85ZQYfrndcWHOE+1Y zt3FB{9292BM{(Kc!iUpaWFi-F!cOU|HW4S-I8R-@oWZ!)J5B3Za>Mbm}h#Z6mxaSF~<=om1@92YjnT1TB=J)=~RYMPVO zPEtp6f8~G5_u@x!xEQO9mN%-w(pjm4^c_tTCbMXvzpzypFBh`aJU~s9DiGQFn%k@4 z>K#J;k#XjH6c;Z<1^=!5kKk#HkBbyFDCMW8_A1)Lk13Rz-onXatxQv0kJqoq0 zyb*t-rRo#-BhNH_pi$A=Nq=f=)vFo)ysfK+XNdkvTcuAkZh6b=v-Rzy&<}Kqou}vM zA~g{132{tCo7q~?O;6iI`M=G9R5VCL^hf$Y~(qY`k1aw9r@{xjvh(r?F@Pm9Q zTo^&~!Dt(r;Cr-02Y4V;Z3NLuT7g2^hejyI6lxDM(ol>-I9M};!BY!`KZVTvziz{J#Yr2AmRiag32RMi$3E|c8qW117RSK{|#S|;g1-;&sa>c z{I1^Cn4?|RzvKPF3Vf6N@5pT>AE^{okk3yF(|M>7%)R`nF~J++J?{P9ImOl1vq7(I z)G)IAT=uD!t3H2OI-6Fzqx41kF8xQtjI4;>A!7no#y*Q^7uCl|<<<55pC@Fm$$zdf zPlomZk$ev?Ppfzfo=o}F5Ea=E)RPTSBH3A1#c1XF+jAkWO3uE*z8|!YeLu8$k^X*C zMw7y|j_O4%j9#i^D zEE}j72s^zw#&6U}JFl0}Kl%z@k2$=fy{ENotv56_&YOp};`h#3#uI)>Uw~{}S7wN@0?p`jfI<9xg-f8;7 zr-EsYy2JX5+RIel9;J3PJwp-Rp_LLKE?_%p6DrXFSn-l9p|LivrJ}X1YE=~LNM)^3 zU6I6I;z92`eV#VZNaJ7mI_Ft!8ILl|^g!t9%IA~lSm_aOmQk!up#Vdqo%}ykG`?fW zVt-r`#tDmsg<_0UP`*`oK-8exVKtSyr?U?mF|O>Kn_ul;BD5&iXLnV)$fx2pI>U>O zXoN;^PzDvk#qZ!a(u`)jE2?Ob#!)`NGeS4;i`QQt!_zgV{)xBO?R-5N@anuCZs0eH zKvN16l4(EN!2V-Na*AkYS#l@2g`6$-ms8|-ayNOtoUHVfPD+Y2K{_p!is9mBah&)> z>?ZyzcN0U!wc-!rUZEBVV*`9-J2DZ|&FL`p&+?~S^c!Rzx~R0|Gd@(6wyV+{j-ID`9qJ?}*Wj17D`0{JPd;XeEb z&hWv;J-!q%{1Vb=KJFnBU5!~d&ikVm0`VQ%As6lVAXwpxEcoIU=D~psWWYhM=nj71 zi}-f@jy0HuYIunZfV<#;Mv2&u41O7PkwJ;n8`bzhjE0H?G)Ey)DGv#_%zi>HB5?r@ zWKbkDM4|{WNEVx+7E-7@3X#a(p)QUJ?U9KNtPcQ{wjh)enTNij5S5|f81<#|REiC# zhc){5oC`Uu9+elo=)hhhfkz0-6V-&YxX3`&x3^EumqkSymD8U!~+XvM28Dx(wOm(P^wES-h5=uwatXkn=Y|n;Eszy{f z;5woG;F)x$*1d)IkEY*ySN1Vko26|x7PHPQmdcaF8qzZs%|=oKX&433GZu^#w52?> zf|a6JW!h@|WBhJBr3ca?_LC{iCqz9K@g;11$kX!iVMoK#l+#9k_S;!{V0*q7*QHT% z?ebBPk-k}NPDRG#hV~E>t)K5k3=IlHhGn~7D_EJV=3y^xQ%8J@RHG+=ozxfU_*)(b z7r@%U3YBFesOV1Fx`b=5P8O%P)zXq&)5Dt@{m5qSscPo&`(8kefs*cdY1Xnd- 
zP{|4{kvB5tGA0do>~kd<2th5*FitUth^qw_j`^C0mTP00iE`O`cLSvkycw;Ngpz59# z{4-sk!Mp=kcn|!E7rZ5Jh4Sur?`y9j?S`LGtPeMSGyK^?+D-4}redLRlco@ni9HvW z%4=M;ygQAl1(jT>?z!ezQv*3#xF&29t{MMQMIp!C*VxVc%T~KrYrcNfgi88jv%feN zE7ZSD+sp-0Thm0#Aa+YAC(L7anT3M%3tEgeE;sRu&xhD|GhV-YTIp2kt*5v8+^>46 zH`W zm6mL=>4qFkHq!&SJ5~2lESsf6K7U&8n?41USRI!8_9wPu_SUxXwzK9czAmfFs+m@q zzM9r6E!APB#!5dqTp1~kl!D}7d5khrjwbPBa5woQBKZ^=CoF^&bGVoPf?QsKci|y) zgO1W7s?5vbC|2PF&%!h#oj>KFm<(U|Vi!cL;Ei|)t4Q@I8G$^VkJ8^7{dqH^5ATFU z?3FPTr%5-K^JYeWz6I6!LgOqiKo)mG#cuRPCN<{mk-#lzj|_ekF5JUk{0q|P7?RK& zE(9VIzQ{!&vTzv=h)96SNAYBcD8L)2*oS=(k%S~TAff>rxQ@X9-$GlFiB_}`p*WAT z0M5}kWFU&7kcf|1j4X7dlZd4AEFZbZWo^)v&!$p@;yT-e`*1NCU(lP4MLp_4LDZWr zuwuN2#?K*(O36k&h4x_*hOTaK~6~i zi^5ca^EZ5nZ=~(KZHRNPR!tv~v$W*rl9He;KHcqQL9cvOzxQ$#IwT$`+T$2kHYhqM zpi;oxYHMSo%S{Mt5ZKl~`{DZ6ZPMnS47=9l>a{O%Uso1Xp|$)3Pr-1SLcQ5p>Phc~ z9yA}rrNK0mif{~(7>8*XggSb>@rPkIRv-~WFpC0tfSAF8>4cao9%8eUbklaVQQl9E zE#68#y@PBayv~Q%Vs+Vw+KEfw2sQe~uJ>NDXWPQytCX|t#wBbOzMwqcs}F`qe&XNA zL0$PF90(K#p#em6#3N%hk5JRn<`p07{wTGzdtoBV)_c?KKg#~>N6MqEtXSym`a_$- zR}}Ygo#B0OS$JxW^}M1a;g%lGtKo84zT-sMhl1ZrsuqpbXXrN6%{x?DM zX1ebHd52FG(<n`PvU%`5$c+cNRP>F?I1JutG$-#0=sGpvg{M8+G|_yslNtRvnE(J`{vrS z?QQJa>^H6Zd@5LeH;*vyv79!yG`%;4nRdxN zO}=6h4P<7%)~INtqa~&xjgP{1IH)6Ei9yr~fw;iW^QxG@IbY8QAc2oUASUAkI>V0- z;v8+^;1hW{^uatM-x$dUvDSuxC@eNQ@d-vJ<3ChFe{6sQX+|@?lI8-4LY$4g`ZSc9jy$Gh;lQ1KK`;Xo>`Lxcm35raUC zhQ`kz1!>5n6Nsb6^a)mo2!xB@rw2$wE~X*|4z?J;OQ?uP2bv2j#?pD@qAx3l7#yK^ zxX_XLLxqFBA{23S6e)C&PSQTaV-;d3kJ`aSThI(H&h^!N3#>?k6*p)jwN<{C?g#-v zO+tMG7nkoB{A1`n_X(|^>MEX`)aQl*w$~OUzq!VR)b7* z7XRZ;WE$V_`iRtvsEe?~dOY}A=(5E5<(6A+&NH{Q^X;&#-7 zNID96A8dpbgV;B4@YBLfq~jQy1sA;)(h=_+NslA4UkoZ**K^*NKE~>b7d*`KknK@6 zFoWMJ%EL;9yVn`7&0jsTYpuIBTEUmDyGh%qck>3|yqLir;xeB=9r*zxUr^WzHdNWm zhHHyzJq|zZ|2+KPpjUof{W^w?3(J$6SibYwqa~G%b%;;nUI?$Ym5g*0IehP8;mPQ*5v5Pv& z|Cv5nbA=alNH|rz!=-xe%Fo5N(!kH=!YKtI;$^ze$NKM3M=Mq62A#zaQ#IwG3iYJ9 zqIE9@aVNKkQQ}$ge|STS>3qqHvc;vJJh9%1TBi4lKHTWVN6{uK?+rjE+v*KLW7gj| 
zfL!6C{)~6W-+C66N}=x6{70TEP2|ZQ!*-XZYTt?Vg#u~4ctKri9&Xc3q!d`Mt5Ccp zm7LvObEGbs%a!XI?pf??nIG%i;<-u3v;lg%k5}K%dA~oU;{(6zp%?F7cyKB3W1qK8 z(>LqS+(90H0R3S_EmTK&+`|hzC5a@6szazl#mOhsGLZHtzYnHOm zzl}9i=^QxDmSP=mf9-S6w~>9WZGpL;t)t~1Youwsx>P-*_^9_y@k)QCL_I2xkoHT} zN$3@lD^Ftm=@YZFis(ZpnLl!o%DWkc(FPhsgkk|-iw2kg4Vj!9W`37H<(>Ezvf(W2 z%I~8;hT|YKUY~#F-x)XgC>+ED3^Km(Osq4?r~@tH=lDuvkWTy_Qus|?69c${R2s&c z!vPI0IJm-d0MtSvUf?et0Z<+yVZ{YXL}hg5PH227?+yq0;|!kiEVvMfX}lS3BNq;+ zD1-+Jd&$8j5`7EC1^X)u>-=J4LD+Y zEJZ0o_UDrJg)X$BJA8uugLKT?NtrB)qUf{J)JARY)7chi`6FUd$dRxa>}Nn&b1?Ge zsjGiJzxe3UgMFUj5?*?zFs1aAyL`#@vUl!flm}HmEbL(Ai4cnYG}t(RK;De!K;%=9 zgapL#n!G3fg=O>eh-Xje88x7K%wK3IrUVs*G^!FCQN!w(KBY;=XJ4ydiR+*rwsiI> zhg3Qw)I*9{mlC(hW1ZHC`k6;z0dalO$8C^I3stBfo za3GbZsej8y)J9*E3ttxLJ~gd{wwbCZ|EPRqPuRCKTYH5pYR&(`5I&ps@%F|6uh-km z`;o0-r)U8*yrg8MoEm3|k+Y@B%3a=%lIfFnLwF;uGS8HLk^k1F7`ydO&PuKY&H(3W z&ob?ZOVeWXp;|TLGY{~V;})XQS6UeQ7(;Q1!gvA87gma`*i%|$+d@~o=V%1x8MpWg zalFjj%k+&{>fgX}*)%6&i(is$qO-vL&U2G}@7`2;G53DS&yJV5YaMMJCxjK4j!5%= zILbffFaGk~=b;ZrzFwZ1^7QG8Kc2O?+w%6?dkxEstm8S|`On54u8p|3^jtyvAs6!U`_oF+J}q(3wUZ9ZK8lbt-j{r_7*+`cC)W)bK7RviY!$vE3Cd2 zr+JJyT4S>&=_-&4-r{NL|r5sqtSt8q6e;0 z5UquZ+t`T@sA~*@i58&*R{Y3U;GNz{OEFX-N@~rPOFDk$jV*qbEb9&HL7ADi6zy{F zD_g1GKy&?rd8lQi+QDanrICGy?T9+cRAed=W+(w{j4+ZV^ZooRUnXu8bHpC9CiRu) z2UHDO5cHttplVw6#i1i38bx<~S2t_n$Ih;}lJt^C@;v31lun14jyU%%&pvN9+8cKu z7}t#b{JC+1FUJ$(B`@T$D8@9bVTaKPt;Odok6jTT2vu34_=gz9M&Ns8mdO%vI-sqz zsO6iuak+VZGkn(YD9Nj?z&mzQ{0uYgVf$e*s!#wPe}3-RRBlZA2*>%>^IxwhIb8R| z>kD}$VU+p1s3%_) z=ZSkv5mJA7oGK|r>O5(SbW(aG{6=djn8!os6;Tmo=z)s3i7#?BDPEpwxub?C0w#*_ z%1QkYOB81XuJP&X!);&8P|sU(#MQz^yvCpW6Sd?OjmK`cXPXx3x$my)i7{GvmwGM> zn`s$>rPbmHVTPeD>|F@^%<6mg(9nVl1N2#cg%!fY`| zc`JSp$V}!!el?s?ieqL=R~?j^j7FPf^s2hw>zZq81&(G!)X`Jb;hFIQZZxmLL+dh$x8< z<1aYJpNQa%kc&;c4u0jGa2siu#`mECX%M-Sk3=XM0C3?nZXpKKcryT5xP?OgGX}tk zr#ONfY{YRSkPFoy(R4gVV_wMjAr~X@Gj78dzHlK0=i!UiXoVrz02P1HK?I^2&%{f- zL@uuYl^iq$Y0!{_P(<<{Pz@QJBMom54G}@q6ovSLTF6BvTL_IWrCJd2lGlI;7s}%{ 
zPQ!&aXvc!lfNREVXlO}4BbQe*qG8227|6g&7|{4I<0)R0_?9en{}{fceQTX*xb%!iH% zmsxnR001BWNkl;PM~%F< z`7KbNn0~n^#L>v{m`~FO8W(vDzL#Gk73-)P)7h_dLI_|hXr>U(cGFtANhcv0t&MnI z$1~m&tvwPNxm4!>A%Ybv)2!*X8^&R&x7rwEbT7Yy>&(PAAdq~85@_z<^yakieR*fK z2D|Plh3(^y7e937iLHXBn0}D;pf2W#_VVf!xr&;sypa9nVEKSaSFh44+KA0sx_5=g zLz{(e!d;A%!o*ATGut3%@)BCWzVlSq5;P4pS%L6b&XZOu`z-O&R~l=oRW{_az0_Hp zo!iBDpszPt6vP(H&h`KL^8L#9i!EI(Qe?9ny%EBn>PgnO;o4vRY-mys^d+{da03e$~bsFSzod z`0L2rk4BKTQePS?hph~3X&>Vo?)%*Lgf~Tdnd>O7Q0(tqtgY83y8qT^VqD1zqY_Q= z$uT?4L#tP>ajUkg^80v8)k@E!nr&`9_)^tNmCh}?HTXvB%VSICWf0ra6?XfB#jO?ri&_&`1oQ)nD9KAx9g z0u|6Lxn4aPSZuq!9?iOnl}GX$4tvk-ozx45QEVFIM3-0%^7} zhmS=KT!0<^*vSvz9q-CJARignifiOY1PLu6A{#UK09X-&%?KeAKA?n`pdvE)6!;?( zPw)gPF5*9kNJK+K!UJF2M?At1iTen|UHnKRPz5olhek9T%TdS=BMBblQya*r%MZgJ zApiOXHKJ(Q`g@Wd&3SQ9dh=rQj1s5!Q`ujIzm?^dW?J0V7XF)KGNKBiriRZC{x5|4 zboH;|H`e-%^{6#Qt|<4GMR}+sNn@2hN}~EiJRseb6w@#2MYG{k*&c51?bA8nUcg>| zYeZW4=TRTS%0)E~HKLb9O~@?Haeh7Soltz$5$@UM`QUcx>vfayLR=(FqA;a_d_@cq zW5vIO7~!xuoZVt~;7>)~KXk6&D?Q-7u6@#m=$_#wfEdPR>8PL>+A8&EB}=ojlM zYh9$!1R)c(g=dlpXN`9-)8fyqOAjU9ywNUuaE-ya{;t}V)&Xy=cIzp>Ap3p$RrODG zrK!EVU-eZ=#ii;)>6!3M>?#Os8|y+o;$CT)R8@$SMwp%oe&R57rBGFvENrFm!ml#p zD@kFi&2Q9jWuVK)IL!8ycs+Zx9cr#vS6&c&B%mbl7dA~^Vh+?NiB@w%$!4zRKauxP z%>n0165Ipy06`HHv6I+deQkNgUo(dgqHrlsK1sW>V*Sm`uM4Zq`lgDmD<>vO!WD zdn?WtIenq}LV4kta9I39yd&-uS4rpP&z>K6A3j67Y&_ws^hI1ZCK~r3>;EvjP@fiR zr*%b}soJGlay99zv`wmzU8lIvQPc9DJXu+&AJ(I^cES?jk+2y*^6&UkDM%Q_LhZLr zQR*x78@)N!a?d02n zMd-~S3=;s1evX5ox}qIdtn_R{KMK|5!GZWE@LVc&?&UROyp80J`U%R zfvxle*ZFL!gT?$9n&TiG7?0`jM@O2CG(L%U;3|*gB?!blJ{~`Df!0$uT&Mdy4GhIt z%#%=&Z$t{S_2bB@bxIBbovaLZ^Z%>QUq$ij0VVs z13UR|c*nQnC}MCEwULcXJjVxqf}cP*B~vnL@{N2o?$dmlix|{~3x%kQjxbX=e#BL1 zv3(Asz;@@dPe_mIE;OOWcKutuP}V17Xk(tRoA;xd#rE@o>2C2v6}6 zIeZwhP|S;vif34iLipny81C|+cn%#!P|+WEXjREq*Cx+#p`E#*Qm1-p{Dvx-^#;|b zR;^)$Yvr6VukDxpruzNn(@GHcSxGpMY(J}8Z$X!dAX}KUsu~%I!PGSivJS6CK5a%RH_T&LUSu*Ic8DZqhP~?fMNKr-n-_6~XEG)?>@ZV_V;!$%tj+&eu9?#_k-;Pv5) 
z^V}lE3oF@1??>%`_KhxSEwqD3$3EVVD$oq}PMsy|@*8QLvRn-puSqMUbaqh?#caC9 zf?0n`$63gltUuPmlxxaL+5}~ zn~d&8HFsORI)BPmiI?PAd>X>YA0lS3zj0ohtZfnt^`E^BmEp!x_i4K-uVDE!iFw2e z=qIdJ_M0!O$Cc6EOZpSNuDiW{K#S#bcmn@UYvFzC_{L=}oly`~G{m{0?69}J-dLDv zyJh{+W(j{D)+lnnHO~IkublkOG+mzfDXw67?mNv{I=w7GZpc2+1$m#c2JgfKp_bq$ zwH3pKbm5cmD~%EkiKC^CwCZHQ*Ytg~2K5KdcHu+#iH?qenL_!yoJnGWJI z&SN(+(G1xLhA&#+2}zWWNwgf3k&VGThZ@o^&@mFVcmf_HnqTHU@S5-Ce*C7<4e$72 zUIF#l0m?&Xw83V2i!=ONnnDYy0G*jm9ceAiqlI*oJQ$A+@W6vAbcL289=Ap1_3^ zWMC8$_$lni0qjO9`XG&tq5>jt5HIl*ckvt_WHL! zw)v;Q=~|EL21orIaV=_?RkIv1tCp_Hc;&KW5{HTL=*sfY7Z5tpGa3-00H%4n@VF)u@wGt}7Nn|`TZ_Oz~}vj*#}WA^zx zvh22)eEzb9nN#T+3l(SaJ6uO~_L?oAP10`RsQ6A$q+jF(-toK@wc>u1BkprG);H4V z;$9xBW)$^u_jb20OY*eWJ{uqSPiXC#sej`4oNLOG9b0&mcH6U>-k=qqAnudiOL^}8 zR8M;9mt*RutZ-g3GI{gjE-rY}wOPhXUg8$LRkcaZsoHBJ)cpznLDnZ?t(YcQght}; z@>H>vv`(|c&0d@7k= zTmE(})Xy5z*nB$A=6b&O_SD09YaT?5ELXP5&6G)H3q9Q1Koh}4D<`3xY8dlzKxk@A z)S9~Ay1m{v?xSV7T0Rn$Po{~cw(>F4HcR)idPZe!zNM|SQ;f;6=VoP%c7G~s=!zG& z3d3lb@*OkS3pq^uT`0$!E)DhD$llI~&Mq&sG=?lH5nmCWP)CCoB zo%SIbnV7^jQ93H&E^eX%enubEK?r5QhA6Z{D%zn3{Lv2$5s5Rni-!zB4qVW|XUc&%`iqB|+cQB(CfM7ZZGc=xxclZ}EVEBMI zSP=#{JSapIauI>s$b^e$z#l4JqYT}+4~kLB+oBAWPzzPjhgQNf#+zPR`JXR`kU-J%h#_O-NTyK$4Tg&%oWeH`n<1hDTSCVsvFi7$}a$sVPj)Kl6oYT_i8!A?q>gl`3}u#p{R8~JQ4M$^PVA(UOqKVLW`=l=fB z=kA;kP98fH^pCCBRn(-o1NTE;WMZh?QhH@?=<}Oh4L;-7%YSH4tgVyT2#_r+<-OK$ zrM4L6<;H0J`PbSd$)%${Wfn{<)ir+>z>~gfRjmwvMwAaaYWRuS>SUJkMXPGxj`BGE0gX9)}C@u1q_a-~o$|Kx zmTIj$ON}(Wi)*5BUwGyY)EXK+#N*7L&awCGTj7rWj4wAf>Jhxu2y^z-a`l;dcOUL?2EIW5;2eeyKB{5qbQTC1Zvevn*wznbgCibH7?3VeyLh3DgOgpHvK1ZLSr5J_! 
zM{i%g+wj%PQ%yXiaYlPC^Dah19)iutrWZ6q&*LZfT=r17&xUv_YGK-+!WLRY3jfwP zg_>nOJcW9o=eCxn4MvJl-ngt?;|=+G?{Q-Q50L|AbR8ZJRZA{roz;t+{?Ttg%R zDGtD2h{QQGAqn?!4k-ZYq7_n69x5V{j=oS~qhz>{$Wx%9IUJ~qTx3DReKG-lqaynR z-v4Lp>K*G@xjo@OMfpCv`z8HLkGF3>I^QjEL>Bics_dJkPE@?g1ZlDG2v)q*dY9SV zKe%qwRGw~Jm0yc7;xIK;2{HX-bK8o1L*+K&d1=4U3FG)Sshcp6l~I&2$as{qE$?hz z^%s#JkG#M0j-^k2n^(9lw^d%1=!)UNkuRbzM2DBF8vY@)TEyhA$3asAr<*64YuJ)< zO1_TDYH+URrRf*wTGcy$UmKS_t)QkOvUH^@LBHV{>21t^V<%ZS7Fc0#E&j2g=AdT0 zQmcFQwrUN$jm@^27f1{JZ=KcGHr4W4-DrNLck&i^zW9Ao=FuT}kMM``o41eg9y5gJ@*_1;d{14C z|FYNTPA@3XS9;I6mWumXKcNQeD5^q%K9_VXBT4v&J!i2(A2Gq))pF0LzIsgk*>p#! zVEp0zMaZF1JfvW9Y0uJVqlUItgP>3mZKA$pV_i{`Hqm;Xh!a$v(x|o20-;3GS1rT1 zXhaL+*Z~^j-_y3=`j>J<)NqGaGGyb4vyv;(b4uzUEs*W03sb~M zSLd=PrENYQ{1E%`W6_PWHm;bGk8YFqUddk9E@%JzFU~P8YvD*|n&&@fOD#l;Gz@+a zYxz^5lIXIXw2d|o3~A=u+jm9SV*iJ}G1SCeT6Wj9&-JpTi|`#YkYM;@DSP7fGh(SD ze?dRc|FG03GRm-2>!@9pKJj$DW7fg1i!-y_9gD8yf0KPPSS4A?U`t@H#vIFIaVo#d}_XyRebBMO$3w-BAf45XenOaf$!R6VVn5 z(l8BQcxNo6%b1R9SOk?;;sL7;74>+1{t+S{gc6*kYPd_c`F(0FY=$p%J`*mwD0HDK zxj)(a5&-@)`C>ai%u{$X9*Ru3=pZuj2E)(*mH8{YqziN#VZ1NT!AF`xQ;^Fy z@{PzvDpC=OfA9~|&=`#|8im-47pM#sfQzrhJ$%G8yue4k2!2SQafkx=RAeF%_2EDk zGB5|%=^*|J^@&?=~731>QBDN4%V#r_J;PI6pYQeQ$s3e!1we`<`+CbeX+0qO?VY zhJkKOJ~_$<<)M{d9U?sj}bzM;U3(b;x5IZNO36+#ogWA zi@QUM7Wd%6iUkrvoMdO~BWs-x=NEa4rUqueEyH59#JfMar@HH< zuJW6G3y*id!Ec?-$;@4%ZVbGRO1OhZ2Nkpw!f1r!IHR5tTa7$w9g!en zSbnidpGpIGURIXXkO%oeQ&*}^7ri5WH=J8NJ$)CvK}r$Jgn+Su@n$pM6=by)3E1!1 zqI$L6z7=YMS{UPH3pr0Z1sA(yQE^C)kq;1K+-K$4HiIjJmGb6fjHFDqJ~!20Lfb+Q z`DNQL#!#6in@T(Hz#^UZ{RfOc15)@SqZ)6m^wbNp<#Z8-KHSiZXk?(fOf$BMAIPNP z@~XTnw;)2k5Z#PqZ%uPDFUZ^4Z<}_TKYPyWQ;k?&iFM_%fqw_y3|bfw75*x3t0M3D zYU*K(5;x^#&q(cq5#_OJ{bWf`wBJzQy6>uk)bj2~-%wxA^y99To@=?cwNly;eX6&t z`(bi}6p!~|P_DUxd4G6;zia<|<)wmnv(nF}Pex-6Kjw+}Efg8Air+9z|cz_5* zz=v;m3J-4KEbid{i2x9SV01Z_|rI+{& zff!sxArwXw+-Q#EpTmCUI$vL}`EuN&v#+klXFR`;zx;`6ufSM)Xu$f&qLI_1-$pOZ z|0=3L1?G zo)s=d{xlwnDLEZoA+8&rkAE+a+~aHJ_uS;Q&Mape&zPKn`e1P;cl|1|H#RLBM7?=Jf 
zqcK~C!7SdWq89X5w6wQ=v>gaYwn=ke_(Xy)!9K3ASHtEPI?O1O+p2Faar1bgJ}sz0W#+E0Cicxt>Lx1t)g zjqgUEjCNUdGe*mH{5RHI4^xZzfI%29o5?nG4pH)gtSTQ0pV%tz8$)GxaaXRz0G6gT z6!++Zf2n3Mg5AfoQO0ZIqAXA2mAy0=0UR9sVn#*N+ zlYq76Bo?Y%XBGGmib7AS%3QpL(v}DEp1eO#Qa1BgUW#T>Ga4$J8kzW@&(upQ+3I4? zM_T4S@7k>N(&K!(JxqSka!ifsgPhLm%Wg)hbtN4avmMphe@1#xd($fE%u`v}AjXBQ zFkd7Yea#-JEGrfj(A#V-blUO5va?{6IxsgLMor1G}XYmBlqGF?+zQz=VX>n41*#yJ96W5=5S;TRg+GbD&DwUo67IXVP< zceqWhEt_ql(TH1=by&{xD<4H`+QawS7drw2mj)(>ObP44>2F7UmV*eiz-hFgXq3iX zSpk)>7#p#aI#E;niaKbD@0dj=sT=yrRZwUF{e{`+hQU+;8FUb{kWT;692!J@=qFoD zUoZ{@Q3@JGQZf36c2h7trFLu}KCz|bKqTd-8FZ9xQ+-UN*R&T&bcw#BKP$j4vKG>d zC)A7n? z9yCBhxX_zMBOE%qp(w%-1_f{M23ZI~5Q4zKz>tM3WFZ;JNJcRfLMh}T4sN(H8t)(w zj!+~}JUA)lEuqQ(j8d_5?TsCI;@bG_tsriAoi0_tvA~ zw1!sd14M$@D7RCZDMu?K-%=a@Pw#~6^S%HrHEWs}BqP;7Ec=x9C~TU^*YIOhnMQD% z94Z?ru^yi>g6jL?MP;_c9LG5uMjQ=fq-N4mwKf^C*=bpoG?Ra~ce<~yruhTq9#p|B zY@jE!+8Bs~a-6Iphf_t1Vk%}Dr`3`d*bF7WGQzUhvD{X`G03>0ho}Qs1FA_Ul^A5m zS>AU3{Q3)zQ|)bVS6kl~b*?@`tYnwH*Y#vMmPRS3EIY(h^q^_-g0#sSB3a~>xq4fp zqwFq6$c^$DJ8Rg~GZ8~fq*V?-Z0%!IWWO2(yhXg*w71$gd0p9O?##z2cGD+Qq@pPK zXgD3993IVT@fX>()7z#T|6cLyiO&F8K&HQX>euhL5^ZUF(xQ?lCKXF;l8~C5FU9ra z)%PLE$w|K@`;wogj81Knax(Q~%DR-TsRh$ZrH{|pmnpNlWYx>g$hPG6bd}B>>b~n- z=I-p7xsze;wtXxm*LD#$^DUe%sIsMD6^q6)fJmnCHJ=bS*DSl;XIbr{^!b^ z896sRcRUxf-Z+cqF3e2NF6^9_+&-gY=IWnSa{M`YGfw1ob-l?Jo^I~yIpux5v}4&t zv{u?GcXts-+q3d%b8yAeR%o;WGxcg>hH+f@L?zDVIEJzjn2+;#ibHafJd6=^kCp*Q zKqFM85;PL)0W@c`s5(`~H*~{Q*$Jg_9H0l-Cqpq6t#FzWXgym?jp;j%u(R|P6{t9! 
zV4*yoCD0|NPzPGh!s#}P^iNZ0knXcuv`{RIV_ghiCAB@Ty94c zM&dfFLhI0omeDmjLp|7H>PilJP8HZrl%!qkHnm3@=3xTrQ357Y7vivC6t3cTe55S2 zKn&U;5lL8uBJj{Fe1;v>5QG$@AO$*f=*UJkvJr(SMBy_2##MwM2qAFc4U&+BmvEve z!eK@Lw!?=sT7Y+`h!|8rJ~{&z;_wjxsEY^`Mi^Z{eFR}Y4ruxO^^Jt`rApo^Vf^y3 zyiuWjtgk}RD%+G8>d*bJHhy09;nH`1`opAKk!$l*&YRzH+n#I>gH@~&8TvT&v;VtV zLcQYK<}2f2?gOqz*%i`{xE8t#<$gs;e##tHwp6jyQhO_$imqQkrnSt;>x93m{~y=t zdaTB)n)L%yt!CTuOQ)`l`p4@J^!wa3^N$NJ8ZqBAho-P|2K?QUutePc?9mzMx-zxv@#%I4QbzY>^axESyf7vT3 z3bTiP3)&W%kgq_98X9U=>7l=v$)QZ=BdlhdDR{TFrxHYW#6WGaYdv1d65>~Foall% zT6JtUwKXDX8sB0(QFnQUxx0E=<}Q@4;es&#rc9z-egW@kHw~gh(**nm%jI&r%*8kLss7O@&5txA=oTuz&queK*|qrJF5R{=zN*Mm-|8rt3iNboYAq4_Ao4ns!LNXppf{zpt+|J{U`dOZ1iB z)r)G9{?T}E6c#Hr(i`X#J@LLcb)kE*|C3tW^Hhyf%elJv9{4xA5Bb8?Y|nrGM%uaD zOJ43P?jNjnQsewp)mqwhUu*wVHN%zUJMZ(mr}`@ROL^P*7yF0$T5EyE7k?qmtMAoA z^lEw?V~z2r+$wqth;DKWl4ul$$=g`s?Wr9$F6DC1Z@wCyqu!hTiu$h_^hHQgw`ey1 z7&StR=2Xi5f?c6ya6+NSSV`aLB|NZVK5`I=#xPSMI*w)V;2Jfd6?jV9zjH&AL;0^3!T6MtRYK zk6=T25MRU3@|kiv+l%6Gk%hjpdNhtSHkk*rQ?!7haRfivLpn=O_`l3dMOYPjh(**8 z+r?~(rcyM7y3uVEkh2g@(`huDiXa@vUX;RGN}(x8Lj#OJcO)YlqoKk@+fWk)aSD}@ zf{#c*EJ9HfR#;*E-!sZU1~QO>6nOrpCnub6A`4{@fKb{GJ31f;9$J9;$bgj&L%@on zc!O-Jfsbf{0f>N|9w9uuXv~H19St`$y4ZYqfZ6um7V}Vh+v25ocmDmp_|^r6hXv)o zUvNdt+Jc46!cx{c#edM>+q)-wZPts75TDbV*Yn0x+WXo&A!m6`zKjO{T}(Rub&I-1 zdutSlo*MDAKvm-orlHQmL$5}^yyt1Ik1@(mxYoN*zw@;sDh#X?o%kiYcFuXNa@w)9 z*g|E4ng?hevu}&LerC_iKhm4NdHLD#Uw7LJ(>l}QvWy47^f{N45s>9V}`B>P5gkt`SfWX;T1jWP7v6721&&qS)2L|)Ut+!yZi&MxlX z-R<3}##}vF3_~sEGri|!6j)lxSJL1^eM018LyYz8x;fUIWS!+$sZ~|~^}gckv5g;f zO;QJH=G-5uOYMhS+-7<#=X1BYk~qpQn7bGgsE_iSd90$Ev#sZqe#);ui}{3h+n-Ng zCRBfa!!ItW$Bmsh=F?>%K1)RTf4bf5o@`}Qmid&v=I1of9A_z{C#xxbw|dLJ*dMLc zRWtmnjD2dTx3Q%;&yVwAtAe5ehK0QgUK}{z%FG*0z0@FmxBQD%TB`-r(aQVxdctyY za;`h0eW%q@h#8}s0aPySQWm8PnE4q zWGiM#4MNyW^JPbepk=ttBCy=M#b48RQ`=)4Gj{s>7@LjSuD$;GdN<>RC@i1)CmX%= zoxXZ{ma)tDA_~fl{y=@MxaIAxP8A+qH5Q0Y+HUO^y|G$LwQ4!|LzY4f{)w)kpw?Bt zr6-FA#_z(6r}C~`ES`%nIfiYZMyO@Z;Ag0j!(u9BN~CeJB0f_AT8s>1yis5L>Mx;v 
z*F)Tff4X+T`O;U%e<63Bf2f)!2N=tYAN&{bS$xoSmY?#|8OEp!ckxJ8k%qCQ z>?oaJ^~r(R><4w?kN7J#oph$Nr8I|qX5ZL-7Dcb95|d0|6>Gs_aDmNb|4}rHpq}hH zFT}rN3%iE(au$0)53zt|(eHGXU1SzM61`{^y8 z{ZBprzbdGLSd>68!qE{M5JYWh1I%(4?jZsm?8HaZB#we;Lkm!prqOo9&{I=>cBfm& zZ-aWwYL~xG`v$|}8z;Q`{BqBTvss6KFVrGzd3a!%t|e>4mW-$rzCZF6#-lQ=&uyDK z$Q_mzm!_o7{4q84Kx)^Fu|G#;g=MeGXp!lDw)4Nxk1xHcYN*!{6k{G~9{u>A_Z{AQ zuh{NBzCZ2D$gdlJ%+B8-c6gzQ4ck;aSgn5k(Rn|G2kcsRE%KlH$|Hoy3xOAv%cj*R zpdHeNB)81Ioc+(Hc9*W4znZZudq{Ts*ZV&G_o?2G>M6+?HB6tPRz^3dG^y6T7Vm;~)0x0sHazM43* zvrx9tm<|`6l2h4PKHKk*>u9GdM4N29&{r8h#X8M_vwWFnp4wEu?pmitY2BRVe0_Aq zF_5*EKaAJ3L)ka+p8c?>M@PRfcA)*E(yFT4}1qHc@B4RIBL4QwL{{ zcHT(4@$*%7C(kP17vJBOJZ6|WnQoc~*z)Vcly$a1&usU9xrf{vJjdNN{T=kZMs@#c zZGrJha~MVR;+n&lWDM}XRtu;>B1<^M5Px}ft{UZyRC{@^`&(mMhJh^cW)&BV|{)SwS`%4jngyT^>eta?>EhP!2+<8I{2qsMH8nx`1o60oN%N z)o>9L5r96}jW9lhRiIs1%Wl$FxL6%jr|N75zAL*~8m}lX$RyMw8!Jc(YGMcV!agWC zf`jrSKJqVA9RH9+D(%5VD#+4Ut}HGc>@Ediw7iXWxFl4WU-pv)#WQgZRpl~~$bOe) zFp$o(*Z4*D6*r*EwXBtRYkV*^$b!5iyCv?)QpN@9fIh}Nc@w=6BNiiAGK|3!wneUy z(V`Sp#W>Lmgm?0@Y=@6>2o50t?@$I|G8(NAB1hpi5|DrZgdq%J@WBTkLJ)!o03YGQ z4Jh!*EtrWARKgV`!G}XAjtoR014(iP6j-qzDbZUBtco1aqh{T~jlNo4{1>JED@)D#r#?LuA^+g_`s|TWPVNm6L;lmwpguUBw>Hn-#HP^#_r7? 
z;Tq=7lk@s#aytJ$FS8ms^f_{HMzsR%vr1;4Pq(DSieKD=eVz5vp0;ZHWuuq=*ifR> z(Zah*+$ep+>IuxjJ6AK`l<5VR_Ft7-Wp1^$R@c+=F&zAmc`{jIf&l`Ocj+4(k#JK9kQpp1;X>n%&TQKlqe6K-*)Bv>h=;GL^NUd3qr&um3QO5X+1Y zqKT}Jd@SBRHE@wu*E_^_O0)Q~eLGQFw!>AW1sg;?EiIM9rqSMq{?ERX2|bd1Kjz~B z`=Mm#U6W^2=sbrvPw}YH-Z3f1zUJt)Tpv}z`IuQC`dsjxoD_=GqfOcxN(1{mr33RYA;GZq>fQJ>9{ zQTV7>{r!zt@yy>+ri%i;N#5%EU{h1RiY_o>DtpDF`FOrtT2L9aO1II5?n(!^iQ}eu~!6McNM+ zrP6$g$7&3acUUr2qqn>t)+2`MU^|`1JOtr0#!*B3Ms27N+d-q)I|`vcSTe=YO#YZk z@zZQHJ5CPzl^@|2mI*7m(G|=_W1PZT+K)6!WyR?QEs!(t3GG=M?4d<;6{YAYdeSR82s^4#MRp7& z(F1kx8A}n2H|RwdQ5}Kk2s{48zj%b&sEyh%qYeb_Qfd5$Qm{frHg2FGj=_Tjw4wk6 z;T!x&rv0=RQMSE~g%&mJO6c5>kbmdB=<{Uyw~*9-l3Vx6(=1=R0!5z|+Fvvx-@Ncc z`EMf_Rd6DWvMy%EYyCWv-1R<>`%xxkmfF&N-+RP=#CReay5Bmxxh{V0kUS#wiSL|t zTss`vHti?oYx!vl)k^AoFS3gbvIO6`Y#}A(_ zare)b-n>a2zC2F0-#GZHTU>)Yrh;z^%&6O>@Wevb>R%|=x5E3(30hw7rEizB9%aw% ze5y@wyX4Z9V)jS2nb&oEF(oSFE%{6)+jQ}iPVm~= z2boLtb9;N|dW(wDaz69YpR76?$V<>8rHQSGIorC_u2?o$%3FvpqZYCRf@q{M!PqCu zc>VtC{#V9wY~#f|Ewwz@hGW*v_PK1Y>5}ECb&?Wlox#iN71WU^sCQJynIG^BtYW)( zW7B);#ab%wlo(}&$;b2YcAT>qUQ;R0SDQvy&dSZyj7`_}i3YMgrDFt(qEyUdBg6_l zK&FQuwl?N6|6!|LoaCYOyJ##bi8qFWzT%!}&Dyec%69P$e?b}p(26}USFpzf*3@^a zk9?{#+r8OS++S92Yoz8jQP-&zy%)4++Cuh{_Tq-=2yx2m`rG@D_eAa`-wic^=eOk8 zRd2K0<<4??Nm&Sjj`6ihY3yawlp%7rd?lvoHH==yTw{+YEH8jTV_}op-(aJeAaj z#yHnh-xmK*ZJ2I05_u~7jFH)%T+NxB^TZdbm-H;w_Y1dvUoe?LM`$biO_Y@P@sW;V z30k5ciqj=3i!ipIrz$=!Wq)>qa#5STV>_rlKg0giPRLX8gH~O{Nt;MPEeh1_#$<6Y z>t;@R&M$g`XN&(@NI%OL>s5A|Ua?M`y|&L~9!x-4xOlq){_2W1(^p8*)cW(dGJ_jREO@dINC^w7(kWiB8}sR z_#AH;Ehj9sJ znun^eqd%hI!w=*j7;fZ;6+rWHm?z3z!+ZC^ zpl8*e*8TQ1bK#HQ*Zeh}O&eD9O3b=~_2pH2Gu!WlhexDD-uISN7pjkQ{>}L-yQ-(I z^J#9yr+q$}-|aU2YBa$(Srk|4hAMp@wQ*m%`L#Y$-UbCpTl#$I^)uF zvx++p`FCkavKroC3tZml+$r8_-|{3_k{;`uy;Zf6G?PZ+1)gIl@;I;m?B^0UkKNsR zZ(7<(Wry5a3q|h~sVt9q)4gr@%IL27A{(Bq5K+Z0Ix8VozptyYRiv4hnH$?M?MR_IcDOft)C_fsT1Nzk4MtV5$mpKE%2P`n>;CCNF6mSBfAs+(36+%3s!L9#nY;+k z&%dZOv`hX|YICu{*sb-^Ylz?V=f+Coi4iP3!mbxL8i{bDzH!^Q=9{8c^GEwCX#>>= 
zEw52l-u2WoIv5K@tfhEB6fJhN4Y`-OPfZci(%O3W$>q5P5Nmyzb4pvqOQ=JXsrDPj zU~@S~6KxMC+X1Z|W0u{%8HnM{{LS$fv-|GKV1yf+s5A3v8)RkFQe$NSl=DX!-R?gI78`jEFGa6L|6zfCL)Z*vprOvJ>>^ja}=h= z)Q)@zqpr-uM{pg(s4u(6i?D_!qDZ!fm!=%_!e8tut--IdoHRs~oWNGI>r%yCaN0rH z7*1p89CKm^?O`v-Mn(8C_BZ=iK0_#yP>YS0tuRVt3cpd89x|)A#6F=ZKP|xUGKZxyFR^2a`E6&PaP9jd)E2=@zkzK=L`3W z8XZ&1GRPci`P;VMTEem+Jjj;MG5kjdf24mvhAY$Yv!LVefP~<(wk5V60k3jmDD&ql zqk^a{PU?!Uk;r&I=gasX7u7J4C5x#!{!~#LO%R0L*a84@ne2u8Xhii8fjjgL>G&d( zP+yEj60Tq|I^eDPpQpdJ|HQgI8%~aWkoX`iu4JiS>fEVzx=WLWx10B+HUSZN#+lml zrj{P|bEes*146-y$jH>=#Y9VyDWz;L zm&yG`QC5_-WV1~jEgx8U_Kh#&3-Qnx#yEQD#D*wcELDT626eUei|A`qrZ4O}%E50I zR9$L?KIEBh(HwbgMeN^Pt$Y{chOD!mQEo#F5Q@yuNK6)f?4Zx0G#-ez?h0apS{yrA zXT`)P=bdc%)Be?T-c(fCq`w!{q36E!lyP^G3!T3uO}A*aW9C`xy7J0CL{36QmQNob z%%ZypmA&L_G{<>1p8k@LMNMO&h@}f^ThRbrWmUFLe3d`681V@sZHd9T!7QR_j3f8( z*T_$ah!Jg!wR*CC$oPt3%q`;3lC@^-wW|7G>MEJ8{$~7Rs>ufM%gP*a)=0`?t}OK< zR(MpcbKv*jvBBN7(I!7n!yr8Z?e&&oFWTt$WmUNXvr&YK^Fve}4UD`pPH$|ik#>|+ zuZelOk2cU&)>FNx7SfZ{RdOrlN{u@5d6dJ;o1B56fg7y})Pp%`+4oJUD}SD%HZmLg ztZmq9+R68_QLGA^$%@iW`c9d2Nx9EgVl$<))^x!5Mf@jJ{(uR#861D2k~j*NET#{Y zKSWiEVB6`ap2v73H?sD0n|d-vu^6ayVZU>BIxJYVU^$Fdfz2p`Rwv6HL{6t;-+&~BjK;E13}E@#e71s} zW(#FmD#iZAOfJ}L8bFV6maFVEnP>qWrL$Czui|$_b#Vqx*^k;_o9HN;$#nS}-ryPL zVVGMtzMg+RCA!E}TJm_=Psk;3r_&(h*DPi9PiwCzLp_2+~qgz zp{;{z1(l9E9x*v8%%9&A!u_t9;<$o6WD)ZgvH%p^3LJ#M2TF+X_SeyR$ z?OUHOxrtA|rX_VX_8Ln>Gi{E3Ufk2F>NDj%X~AkFqY8dUdD?+Ac&H{#gI(^B@la?F z#zU36unmvE=~vk0V0jOA83h+MVJe=YzZeuy%VzX1^;@O>m-F5Zdt6|r|GQp5EV2|Z zTP+6^O=-jX>)nJygs|3>DE8>zj8;Z9{a^7;sK!27Pl1e%-v9t007*naRP4|dgURJW z6(40~ks?!NU96=XO0|tK)ihORJ&<6u<#pM=`oC-!`ujRjdGfLwya;b;-RoFqS!PQK z`p&Ot@u69kPh<+6V(X>6Ges-Cl;IeNCdlCl$`Vr!RvBH5y0VM>ASTFGauYnF7WIc& zgdj`y*XoMF0^a)Cc%!bXx%xu$xCg4QRb9)gTl7he4~}8M#+$yC=r z**)3)P+g*RHG0ZkvXLw=AL|YDgG^x+s5`c@N&LDzOda?vd}m?2tE^6`thD@{?c$rr z!T#hAu?ubK6-#0l**N|?f6r6((=t(RHg@8YNXql4B??$97Bv$Do#c|_|@kE?5+F&oL(Fg3uH2Ozg zgo-YP4@0m?>7s=3D;BHir@6H`N;#p-37BN{S>nygzp)c+GEd~uTv9wMNoNtu>;yKGJ*Af1%i@&E%0?x{R6@>V>)9P%iO=V& 
zSgbOJmuAaYEdPgc*lrfXU+@#`x?*Kjl$U%913$*2cq*UFPp~BBW3PF%a)*YqLo|fq zco7!Oeq&SESGJxJb)+dYnPyWb_Lge%U3@m5Leb2H5wx3DA(l$gSGHQ5kS`H{dFV)e zX$-ZdhO~~|#u^z#d1O0TMS5|KF3P`=pO%pw6(~PyV*#8n(;V1Q5iJmp{kVuK(9s)Z zQ3026mpUN>7QBE9E-;v(<2nRPFyR|Yq6*p}0}0@Cp0*(yAy|wMNHj$}7SSRk`;L07 z{(IjCeYGd&roH&_F#W@Wzu#rO$f$9oMAMR!zGwV6nE3pN^Wv8C;U8UZzq~7(bTvg! zDjC@`GBGCHy2ISo-1}+3`+aXN=KP&8A!j)oMJ>@y?m#$-<139uO%z5HUZMc{As>3e zheG%haY#lTG(|F6!wfskq(O+HcyvRGOvFD>a2UrCpgastbU22O9erc8t=x(t)yw2~ zPpQz{ESVlr=b%6A(;StR`Fy1E4Z+OL%Oae-4E!N2MnQF#=*9}lR@7X{qC#|xt;c-C zQbB8od4;)f$VGdarF&3I`!s9C0IOAJW_^ZH)xVc7pg(w5xxc=rfSqAR$J>14>v3PcmW}pupq}SS8@rq_qDYSs9_RyxO3RP#BY!&~+XY#zJ z>B>_D<}78bX{B|MX`ZPnD@X_EjNhRRHLhwujl80gFF=25ES33q0oy)nw&|03*UuN; zxvp})e*QqUAHRfSXsDc2?52upk`8UCueUC=EB@VDFZo)lZ zYmNSg=r1BzX}*UEi_JXEGRi@wifA45m*b07QHt>Ce6omQRp}Qbv+qiKd4@`{N31v7 z#NSa~DnlI=m9JHP!y3F{9yWj_m~LvCC`cdZ68o3^$DT7gTfnO-vF64`KG}%=FxrTw zMuxsk&k{Rm1VZ=^lbLtsb@&W6o?6KrvN}BYgWAynISo~02dc@^XrO6~Qi(5TY3w4q zXZnx3sjk&(nXfD|x3nBHhq4UzD{CJz(q7PhIi!fKjblT0NB1-T#_wN}mZgWd$~nj9 zJ~O+hryNd2j6B*TPm;Hmvp7F0T587>JjO-1UO4OlCxJ8@E4?EptS74uZl_B;Q8~?Cu}|cp z9k@Zm={q)&pO(`cbi_ygAA7?)us+yMiztYjS!pQPfoN8gJ!gaD8vrpfNLFN%<==RY zN_fl{AcIcIqA-#}?8Y!!H`> z{qaIp!y(oHEoCZyOI%q%WwAvji*E9g=qU%_5Y?B7G>A5ewem2$u%I@^qahij(h4l0 z9Jy4UmpAbotKotRwW$^w;+?#T8+e6x$Uq{_;UDBfI~0&((Gs= zEvu;r8_Z_uw9Tx(2}k(!)T{RX%E;t zV))fmjaJJWJU`-%7U*a6@h=mD*i5=rIh;owSqY_O8~IIgZKiQf{$TB7DeR_8Vu6?_ zD^da8fI<-~`Tg7`sK|yubTTC=I!V4Uj|FRDQ-dA-b?f*o`4nM$9l;iRJ1+QPAi`Ww_Tw)CWDp zzu7Zfmz)bj59VtdJvd#sclpCItnNMTBQ#P=6YFgKWJh$dFgAo&WG9gZL$nsz8uz?c z$7`#6s_{&$mXB@Le37~1)Nhs_RNN?Pr1H0Nz0s53l)-983gr7(6#nJ?SZR95=J8}P zmm2ck=3S;vrW$%3@t>HiR8YG1td`9xFrV{MDC z!W$75T-aXRDcy(MHAGDrMCHwP)RGlx5|zOUwvi6tis>QG!*=l1$|WU9Mx!Z?DlYz( z^MH}IBbLYJbxJnxBA=_P^c|@aGB3HNe6apJmLBYAuG})!at-!Rr%0!d{^n$p;6s!3zne0GaF z`3AO&591E%$_&;_e$ z>NK9spjRxBIq5u4$$g8+3Ii4bh=Ao~wf$4A}0}oIWCoq=g z;2z@81Ua|~J5mt`C;FlYnjr{*sEz7Kg9{l@Am|?AX)InM910Rp75Ok3r?3cBpd*f! 
zp&)?L$P1tRiNH^NzV7*!kSvp8k`=Y9$D$oqAE;II7J8!buNZ2r8&ur9v~rE2$;C6W zn&wt@*84Q~%h)evMMqH$yJZ$q4(S64l_QWdK|_inB;TIW$HZ+S6zdX3!dB zBOXf-05k5u#Wrxxf;z74)3Hy}{+Zn}dVXLn`9kwvG zj^})|@|sVhT2vDo_y;;ix#kwUlreGGoU#u_`t zDc^CmmNr}Vk)vcO{iAFroO})c&Ud=k>N~`&47cCV-nva{mhV0q>vIfmz6k;0fx)I- zyor)x?q`m%yfq6;j^&WOk~QAeE@+l*g#D#sqvLzv5v8!@j3wQ&#}ui24tRuF)Yg6X z=gQnu?i1&# z7=`HqJ%Nw*a1CKJo+qL>Tg0BAJf*UBv;rSlD(gvp7EbHf-^zVfia%0H^Rj$C@6JoH zf>fT{se$594)L-)T3Micnd!fwc8bz{=bQ3q|1>PY5@$wFy;}jCmOXi|Ht{{L)qbC}n2U73> zVTi*Sl*ePLg%60s9sEQ90?;23P*4U<;T^4Edl^2aG?OgV1)M< z62xe+{#(W@Em$igW}^}&B1Hs=Z+M3!F(1}fZ(l^cTYCFSe3$rl^w59F7h?~%&9q#h zUr`P|`~w#fu#^U&5~a{p1X5#4#b9Kj8Zr<81u1flJP#dXWCp6sB{+=$dY~v=Hm%pU zdGDPg9}GM=WTADteO}O3dd*9jneDsdU|>iv&vQ7$o9AvGQ^-I|cUuW-OVuuamrb3$ zj0@s2J7)f7&dNI=uxrq!paJ&nfhBAQZF7U7l*yJT+c>3{5@Gp4cUd9xOXZHKk2Q=Z zDSOQ86B2F)W^zaVv#sX zYtaR5wGh3CNY+m2W5qnRie}OOkEySGvg&HSUcL7@_ues>ff*QJ1_*9}!6gvfg1ZC= z!2$#VJdhxP5C{Yb?iw_>y9Kud_rV7rbnNOmXYYPL{PN+guIgIVe?Zq-UDa!qP$6eh zXrAE}J8+C@88bZH!gEDr<$kEDb0fJo^vu}@uhWei$um|DdK;GT;hDdzl|So3psJnI ziRLnLm0l~J%ZJz{ei9F)TQs6B>WM0<|Frf5dwL_ID}~RC3|2W%pjpCi#b!m%jVPY4 zU34(IwQNj%T|cCzrx#3*eH@cI;_X7~fU8Qjw(jMT+hW4Z?&fZnU$hl(WtOOClsCu8 z`L6Ru8JXRvCLXB4sKnB~01q|rme*gReflBYlA69N9_pZ;zsVh1s;<1t7W!NN^lGp- zI%KAo364;;URLt15B=!<&T8uz{&$Wc;y&c@A6MNoKLwlEzi>de)s2M9nPHcfCqj9w z-*vceg1=PO*Po`RjZ7>4{NGm#(|&#&_WXIU`LkDVyJU5GbK%oN@&#Wims}3e=SB)W z!&ppJhn+GqJ1*+qMP6OSX%hBMbQJz&ap&%t&~5C!NErciOzpcP^*t)hFPjxdG5&$OEgi0L$1JP<|1 z6WJ6iU~yjFf^@Dd=8IwC95J1y!!)1Xi29;9DvL<+5oik~(PDZl7T^>WCPimxv?xXO zNl_KrN;&8j_dp7tpw_eiUV0;nAsLPE8BL`LVm4~iRQyFp>2q-do#+W_(pf4k&cTCd zl#-|D7XkWSJQK_G7>vQ^R9@7jY5agEBi7l6MLdV{&(_`{5x#ddy`8dlXEn-f=RYO>pfl7}m&PH) z;Ryr^&`P|7;3{~I+=!uBFljYCK_N<`dpPLI8T&Xof9I(~_7BKCs8Y8RJu5^NjtpiS zW_QtT^c`AZY>=O4`!zC8SXcQ!^RTfQUx=#WggQt$WiN3{&Jcf@8;yywGS|gD{-5nQ z<<${UlX~*pY^B1!@|24z7ycsRrCG$~c&;0#WdZrjENm7vrK^bRv{^H(hdEL%knic1 zCR)OMt^WF~?rUlN7yqEQ=pNXD4YUTyBrL6ls9NfPuF1RA6@3-Y*@t;DOZOH@W<&as z{wF_A@0?jJtEu$_BWQ?Du{&4`Tse^JEaP$_o6(vc(`O>qdBBVDGW6WJqgvRzm0z{e 
zk-CLWbxa-49n~n^QjZg@Wyp*ZU0vtH_UNyi@6=K&n+oHNT5~p9&igS{%u{7BLX>oV z;#T7KklQ)$>ybwt=9n19`UaijBo%97gPu132us_(v{?W>(?(qCQS6)BE79wQoyQ9}3Tr~Fjs)zA3>C2}2f z!w^+o|0{d3o2!bO{DzMpQs3u^Tv0#3uezR^z>V>k58$#m&XS+fPZ*(RV?18!9Xc<1 zt1Z-m&vI3C)Aw{SQ%Q@6fl#}0!l_=r* zB)-Q&Pjk_ZD!7)3Ec)CmWu%L2t_sHA@{KE(QAaG7g^h8d0?iQXu%6z^Ya$U|@*zkY zL?LlN452(?GF8Vk8bq%!8)@jwRq+UY;Q?qAVv$Nss2kET0ZVZYb*T=LaSwIig$Ev- zfd?MEM-V|2q4v}s=}1BnDj^-Ic!_F=heOq=KN7@Elz@Q_FmRnlpdcEGACZI>GJwq} zgL8;Q67FFN-=;V?`0%kr>W?2Py;=YE&f8^n_Mj0=rv?-c4`=5hyh#j35-N)WxDAI_ z@JV>6Jl%%}L3D;qUs5R=Rp@n%$~CM-pDk{;bpNoIUmxwY-SwrXuIHFl1?Mn8ba1UT zAEPJ-)G4Ddk5^aRO$Htg@1X`X_v7M>aCpgv~n;g+(w zr?VMD-9EhZ9}iBmJ7E~P!j74va7S#RSNgnIh*s7*d{D1Km7VMM!Qg0nmtEUB>=@1= zHB@)h&GE?r6LGILzxswcpx;j(!EUUEro4TUI`3qdu^SGJ1 zffZ=w?4ol>K(Z50|G4r7w&;OIGTqlbRHDwN$bR7rQXSM9vy^NgBV2ooqGmT9&)?8Y zr;ci)3kP8xx6fqu@;1rXY~@u!r+c^BOl6Kau;3m7u$GwXZn~o|XL6GX1ufy7ZmxNbwuR<&l=GbX!u-Xt$VaYRl8rBnPV~FDN!P^>yeV>qr?czd@b|8+W@ULmtQEhw z)Ig8xw{(uMN9J^nI-!fZA39g zAzs(h=V=gsL&M~1QNu_vCKSkHfqvqItLY-z(8$N7sW?x=8(NC1c+WG$UA{wKpdlB>YtbFI zco@EcEk=s17*5?pNj@Sf=zr-W;zUDks?I@aHwoTNVYHQ+&@;M#QK-w6kVvP{n-YW zc!>r&2gl-(SVysN@FOZy8EVb7P$baF@3&vvxc|82^O>rWs?WuE8K)wJmLQ0NXoG8b z#06j=8kcYx;M$yybPS?CD4Eo}@#*@VW~EN}ermV)WAVpwf9d- zoziC-_=tt0a3A9QaS2`TzJD+4#JEX#TMu9UZ6puQ8X zyXv~?U}#Zrp6i@gXteTNH$NIzqC2^Nb{}&6V+;-d)B0CGaHggYwQAd0aiiS@!$GrR-QIQa^A~opHJpzm~gDhVyePG~=99OT5;atLi`XM6@L4wHysWD`_xg4@|P3 zsXQ1iF5@M&L?_CS1F5g8BT8{j98-0;982pK@6@wGk)gA}tO=3j zFn6OfC{6olDR*HXKA;sp!vh2{7ei?Q66g||^Lg6I;kZxvbu>qyylzSzAN8VhXv05X6E){FPNT_CbeLY#bWBonb-I4blIvoZO4OD4Fa44W zabZ0TEwG9E@oAj2cB)%yqjKrzIw=0&Q}Cm;s>)GZkPmPf-mDsMH@?Wfa!wrKGwh?^ zIY0NJqe!G>bdybLiTU^zGq4sF@CMOb7gyn6H6qam6_5xE7AoK_&SM8Y!Ga$NNWcRm zBN+~@rL}mCRVa;Q1n4@v;tm$0Hr7xo@=^g*K_RL_%PENzWzh<8lx`w{uHq2i#Yp@} ziO7Lfc&$e0vAC?Ba0EV)kKSPxex=h?0*7cL9e~DU^ykM^0Ga4X>kyx>PSUP2BbPsz z8?$g}e0co3_(=bBXR<1Jm}8p`pQ!x*iA*@n~#TmBLgp~jd+5;b$hI&W!7`P zhc)zy28v;3d5RW;EsvdHkJr<u@c! 
z;JNghKizVvX_QkYnoZ3SuIHX&*-C|73y(~zl{WO9UtH&+I>GEN3rU54(S?gULJ^my zHsYGxEIS!ry07SEx+b;9Wv-1w7$i!I@y;-9srA-D^|Rh0W(gr8y^+>pXAn>0dNkl& zx=-3!5q-lCM9kG?d7kqFo{MBTSm&ToqLg!h7xIrflOJJ@+RXL&NcwV1=%xCE=q5|5 zT6S?vq8n-Q$+oUV;SW5w!(z;5?vVSjJHn-1u@MEr?}RsU9Wd*9dhuLwm703uT@&3?R0cH@ zBh@BuNK;gCEI}VViSp6L@K5gA?mW30XWt(4ReXt<$}yv(YlM%AY?||&dC=&W`+L>c zx*D_2-OOn2eklJ!f4xcl;q;3d7WT|d*%QOtdYXopjc5@u-Mr-~q%^@ne0O@%Akk%L_v~N zTz?5aRkSczF>0U6M#^qz9iGm%SPl2f^&GhKP)S4M7`$Q@_^dQoNB0z*U# zDha`LxeI@bEaagYJe&=DhXtta9Ofx_#1Z-~hG96naRwoz;uBc4*F$v$excs#!|JND zSJf27Ih+@WZ5*o$=v}-~0CYf$za8WCJYH|*@3i18+zhSwAv&Q6PsSzGLJYdmD5}Jz zxvzeM6kJDZ)aQHn1I;-ckKo04#PSTBL_r!#z(bzIjS-C?f+&HV*okC#;YCAKLp40* zvwRk@)R{WNKwCbCXr3ziBNf|_fD?Fx805inKE}KF4ppOW`YOlsS*lB^NaqTuh~Zq6 zgM1xxX%`g~4RHq^{UtLVVIshX^lylTK~1PRV#&ZvN{T3#FCy2O@%4VsKW%0HO?eLG zJ&<+FI&O7zqMfhwatfZ7$tXkkJIDFcbOSYqP zyjS={sn8NNR##Pd5k`-!8TMS>X7r-sjx7Ubsw+;Pr_Jh5RYsTOBT;Y7G3I^mD!aJK zPD$KBReTc)+{aYBc1D<^@(zpY=V@gA?kvzVjXJJg_6h!49a9tS?0T8r>~s)`R#X3L zZ}Gqo_fze1?p1AFfb*A?^9@s;_>dm_mYRP0O4r~QIU?v_y2vlw-O|>*9&2Ys*75!2 z95vnqhS{4ifkMV1Gl;j?#>K^1YA5zZ=a##~#GE~{g<@`~zqvBkW%o0S zl5i~VbnjhX`OG%H=!`3GAEp1UCuaO$OWnz{$~E6zJG`o}$P8QvuJF4e99KPak^L7B z#x~kTgUv_LyFBB}9rjiJTwZdjnX`-p|48eRBC`tR5DV#w9O-lx19&U1Rjdc9I34Gd z(NDyEDk5uh4}QU)8`ChDc2F7Gg~{alB{pRAK@2zItrWwz(1gPDCG6uYe2)ZihbmKb z?uD)NdK;yWp2KvWk0 zh=uwqF-tsG6HyJjTwBB{Ia1CN4Jkuz6MJQL@`>GYD+-Fcw8iKxv*;tW5F5l@abCLR zBKngi&=ql$dWpQ^3iZbg)Z+JO$5ptuZm9<%j$7;6d=ss30;Bj1?}dl=>394MzgH!l zRDDjpRsD1;-9vX#ADt!k&(>hOm3n2*QFokSI*0OwR;Zpl!nv&;b8h{!uEA+4MlHf} z+tz}I>OEM+tbb%1)$~t@K`P%vHF$Ud9#KImh#+6*>v(~ec!@-uzzJN#ZQMrcf6uE~ z$U+v1q9~$hHLXS(DI7*g`bY!=lW9Mmiou9QQS3%FI21*7kU%$34Mpe`#-Skk0~AmD zX%Z^&RRA7zz#&BQ77p+Vt+^ahxD47L6-7`7Mr3+EckY%;Q&%-y8>)FBX(oNre z-)i3g|EIvytV_O^zN;$E9;@>j-%t##iHpge6>-Trp>{hfLOv^pZHr7=AntmGnE#q< z#8$CP)CwwR4stnN?kSg0bqbF@x&zUe|f?8vE>kGgYk&9JO55 zm|#!4oWASS)K$2m-Hb1Za4Jfl8NZT=1+)-ju@4E>M73YVIwz5UahOVMjr zhhzY6gA1IQIwp8X^|wdZKU=e%+xjMDpt(^^zNZ_WeWD)K^6mGn^i|Ku@GlGGx4K%> 
zt%rfxft^-SPjgo;*PBn7zPP|a`L(=26jjpI)s<pdxEjNuWhD*5Kg)8eBNw1VYArg79XbaO z=laG{S=A`WjZ{&6T6_|Jit~o}j`m|N=JRdzM>ftvJFU4G=jKLy7Ui&)4nb0o=HUbX ztCF|}mecRLiu00F_;Unl2KVM=bO&eYoG6LE_+Q6D*(LRDc|tBy$7Mg-Z>*DUV-od4K~c*|5#!_n6p^vwo|tRI>Zd}AApIdu zh+W1Ud_rs7!C=nPFCftfo9z$WRXyhnp2S)%!8z>H%eCP5`W1hr^YR6~lk4bax&=n* z>CP$VFK4d`)0b2;b)OeIWz+~=R{y{|by;dm#n6;5lM!-M{l|TbtzzeVq%ds5k2qO4jcVNJPgZFrkd;hsEmPAQ3nwlNz8oHOCtq zqeK)TgMUL`K8bQTifE%!R807hnQ`+@%za#YPx*RP3#85Qt@K5t)yiD!`%Hbbhgf5c z*_bP~h;X9@&SgspJMZpK*JQxR>E3G8cDY=(D-EBaCM2)6TRCIYL*K~IO#5rym2Y8V zdb!{e=c`~5^*fg~E8;DhyOYdc!dJRl8STOf`mTE$WNQdily6LB4 z871FsmPKZfbA;EjO-fHu$jz=k3Ez0zMcAr6s@7&uK2|1)J-4_AdL4s;*XA6ZLOMQiM_SLA>i;cwtfQqxsk92PZ< zBs8aB`5*IkjiVb&on(D=Q=Oa3AMNULqv+ZA2H|&%{&I-1hGVT#&ORO~p5{0d)i%7B zu}Azyht()Ole0M?XSW)i?UEjwo&H3q=)P zmTgw1*LxDtVJqJk@b_?6&_ee^IH!`l_btzFOl>kc&k>?dKP0W+Z#sM|_uiQh02(lXg(m z$B%NoF0bEt-}>riJ@gH=r#r-x@SO7njyY$YI>j;nZ zOy_#^i9S*n9VUCCM6iuD(CQt1*T^mF$Rpg$8WUU{81LMY8~8epan6S7d40iGj*U8Y z8gFz>v~;M0wbyS2rh4y@#}~U)roy8>!*EDdoyY1L{1ZyE&8;z1_~|m$KtYbjBg#W3 zX)Kq)BldD(K1Q2SLW~nDQ5?1K0cFHABx4%oK_y|)Kye0_=p=2YFj1OHijhKyvNRXK zKKhmR(Iko!e!fVR#VSz?gT)cqOumpkOl79X2;)~8C5(F53xjp|SA;`RC+w}OE` zFpdUs0mSntu8D0(MlznlfrCW6zze+i&r>l9QSicngJ>8q5X1-!LUk$vAevrL1H9yP zRD(e`=oJo%0Vt2^lm{L}iml|OJ{XCUynq|PKoRbT>oE9x)C9O1&LSB}=nOBl#xx}1 zHnj$LjwjkPzi;Z;I^&*=zSyJDx5NCW?Nv@vW+`9Dci4$_is)0Kx0oO*(tRw&Aa^_Q zm2B&K>$GxiyRW&lyK3l;6UR+58wE}W&(M9`$5J!Km?|#?4XdoP+;=7Lzu=)@vi;tv zX|#4PaBbvyG)KI)o;xZ0rF!mkv0rjCXNKL5{-LH6V-}>z;!8DAH?_ajUNE-$W`99QF7n1YDRhcFmWa|-@?%J<6SXcep>Fm3k zG0(lq=q7$QIvXC>SHVeEM^!oSjkU$DK`VK-E=@iRQlHzaoJ)32p09`SJ#|Vq(*2wT z`d{v)2J3fvC(p)4fM%j34c6iM4(~;fchNK706mbuPH*kc8>*#CIYYIG>z?ykj4%Fd z?8Vrj-b234YTU8dYXy?4r8Ic=CUt1t3B@K9IFV;q_*%)@6^aY?HlK-7;*7hMIn%w| zwL{)AmfIVhf;wCu&{gbs8s=^swlZsZ$VAUzPt}lThoyrYsLQ@!PIKB&U<*ZhlZt6I9ZeamU!Ty`Q+ z5YZ?j-yjx4 z>MVCuqiH+s5xqDcf2k|Tbu`SF8%Px=Lgkwx8TgGi)NVxhQ8S1C?xr%KeA z{CYq2Vdzo%ns)O=bmGzMc2b=?d`(Z~^?Il7N;P?ezO3WaKWc_rs>5^*-Ae!FEafd! 
zR%Pi=n5x_IaP-tAc_5GFV!Q~QP#j+&lEe5Q24D!5AOXWM8!veQhVyDpLtpH}KBOZa z@i1@!7ygrAFHs%UQ5gVWpa_bR>E@SiGb6=GCRY45 zsdZ$#bLm0I(bxQ#}*8`3Ln6KgKY7m44t9 zQQe#(x)j3j8BWp_ia~cC;_Tui=a_R_^$Paat*N$i(z&Z%#FUF}oIQK|^4OC(XHi!w zDi67?nq8b$&6Xz>EE0C7*7M%kBhFTTv_8eC6j^k1;^XWQB5(N7sQyT{t7HZIdjk7n zDu%^)_Jmb;J#aH^q1~?Ba+dfV({&wAqnskY^Ow#=Pwf;vm6~FX_GeWIW~pnRR*4!B zRbL1Fot)yibGZt6tHma`^O;GR%MsKhIb6r+{=#$DXJ=&s;S zkqa;dl0rzw2s%f9tIqa#zI*?t^j2wU)_HqS=%Q;m`Z!IEQpQVHclXAy{o&+EGKa{M z^0K^SzH)Um`?_+w#%7d1-=P}az+A*rNz|lCm<9t) z@ePjQCFevmn)4+Du@42QE(T#HP69L*FGUM%63@_vf5m95qdZ~<4a7R?LhHFT)^mUc zp(ifWJ8_l5#Ao6zMpJpYOe~ZEaa1%X*~sVm{my4(m8ZJ zXQ%E?Mfd{u=kHVo|H%!UrR+vKUZ&geE`5Z5q0YJnui(a<8<)9{UXFXXh@3Fchf~o3 z20kDKKVv?w{by_ce;(5Rhijo0YT-TJ;{`p&07T;$qHz(?l!Rz9Q4*X2UH84M+h2*nORi>cZ11JZKt7KT3F~$aqgV%9-b9+#PGNe1QV>q z>eKtOnf<(nGWYuL1izLQL^zccJCR3s3V&xFlM~Z&24v6){A^va7kC=D3b{9CFAz4@ zQwP_1x(?6E=iB4;C{yPbnS6mO;Ib2`wf^0iq^GD1y$rKyDVm5@G#b13J8modxUBv) z){6?d2Zz&2@!bEyDMI`0Rw|E}XN;1$MXtDKuH#lmW2DS2YuI&Eccl2c+QT{E+zwpI zs7Em<;am_$jCeC-^mhsOZQ4xao*$fKe@#oMC^e^Fg2`4@XSTS?IqkFl2f?wyp!M2% zV1=_=-Ej=8M-a`qfX<=sIt!d?YGvRS2V`3QUU|Na9~7}TA|k3%z-{HXE+Ii(v8t=j zRFu`stSN2dGUX>z-t|`r?&E#`G<;O)`M(jDT=UFH=61Exnw{;bJfjap{3|!gJ!T9| zz~so2?z^5j`~`}NEux=TChG;NI!Rp6TPJkK`a!(rC+Ed$0P( zNH*IT&E2`hJoPBYpGInMnz6#QHuAdvIp1zFkp-|BHfpa)}oa7O3tPms;|o96fvUFhZ~1BAf0OA4U&1CySO>PNDY*B zMyW)speWJSQ^M8POjLdC+p2E(Y4dM0EYQihCZ6+?124; zw{%A~6A!SEI#M1=q`IQI@S9EOJVuH_^ortSntVV}=pclVA_j=>=nn0mkGx4<6*JL^ zMq`4oaEhWhrxtpL>YzT?)A&7C(RcNH{Ym+?<;>-YI$Ryn0&{o?e&s#-Tb-8+sCioO zYV}dCqJroI4-ZvWIUjA}8koX8xixpgd~SuaJcz(M?hYyU~D6r>;pB~St- z{$oJVh(`2(^ERlCNVG>R22)S!gNraw2@mNleuM{Y5Q*EgiTa>A&)|!AN=xVlB2fzt zBGCi}66r9a;iVDO015gMr{fydQ5`hQd8Xu%;yi22tZIw8RqkJGaj|e-fDb&!d8uyb zna(=ifG$o4yR@@dk5hLXPsZ!Oh(Ld58<(aQt_kMfZZ{^2xkf(!pH?vy6FO`CX1%Zz ztg@k3G@6zYaVyTJiw3Sai|u@lSGVInYK^YK59I~%T>cw5Dg2XXbmsH)+iA=7231}! 
z6X#{X)g^3z=a@S!^pABe^ei9)si9eRKK)L$K{xpu&Ex_|qFOL?Hk#>rXN+gJo+#hQ z+2&5q|J<<0;wsf}SCrp~Eh=QSwfAPr;U4bV z5mD9S4WH%IRG-@w+_TLeU7teZoQuxpz?o2aJ3>D zCvscZcH@HhO8;b!$~`_}u^Cb~+y^7F{2kRZInSzR7f@GezwV$Why2c4H8~V-Sx$fd zX?v&bHg<_B;w$tCP>cpq+dT+n!6xf@2z;~F|aEiNJ{k zok;^|Av}7K3Um6?ZeETAx=Ga$OLO4Cdg?+S=?2}PE-1sZc?f?`Mkw1+@jpCLad=5H z^i5ukVO#)S{anRi3Wn=&q~inD!^6Aq9v_f^1&GEjc96;yX$^`{A6kuZe3Fl01s-7q zE+7GsC=V}UVNh#2iJE*J<4^?2)R~%4b4=#d@M1QACX|6PcuC8s0p^MW*e|!pW^yl2 zqU~^~pfN?vrA2gAd?~b;DSG33Qdq~Gr~*sMFN@2bC_`guz3e59h$o2DnkB~ZHh|Og zG<}n2>HNBsvtMo2d$@|~u0HV$U6t42g+9y=*w%Tu30G5#xG)~!8>~h(e$QW1d#;1C z{DHs535@5XsE!mahifR0lPHR<*ov)){QoK$$w)>#;t`L!sEfM);Swl;rznCV0HR^w z12w14h@`i43T+V0k8qUk&|&1^su+TM^a}qV4`w1BZ>T>dp(y^w8Z@DvIDuruA`vxX zo0dLa`o!!n=bc;Fq{h1vyGuP$oz*Kn!QUzLG1SXnHq^%c!o9~N*Lmx-wIx(CZH15g z{!AUZZ`F#J@4o6;swV4lG=txu63#jM^p|`hIMzCC@8ze8oP)Y72kh0#F4Mst~w>0v4{?|3k>(Jq8q5DzjGgTb#+zXRwA$bM*oW? zM$XW*(6eAewVBVWUB0^3V)d_zu?sjxc&7M0aKT;J^T9LTY-l$1cmmn&-jrV+lq1Ds z^IIMsnk6cjNv`w0#8906%iUIgopw3=g?QrEPH|N8LPrI|=pG4hh|b8q9il;&=V5wFd< zda&w_L!o8*xOk%4km>5`q++Cdap;*-6!AF5J81yD6&)!?`M5VVP}49Q-|F+aJ_ezT zyySY|x+-Uftx4OO?n&?VasT_AZ`=EGg<9ChoX+|Q9X2ZQ7ivzuhh`8lU(xu?BFF4no5#~5y3E|Y#Z8?X#w-`td{If!L?B7%y z-BPdNgY+2(!~O31@@eMu;0(K!U09j+d>le^w6l&n@v25}qdmpiU<@!xUJ9i7vj?up zvQC=a%=JEEW903qC{KLYQr%rwfR`n&qqp|=8KLwgPK>vXw;;8qrI@4N=`ZyL+Kow= zV>F;HVh4Z0h4~;&raV-h+8~7&^Ex_*vm}!vXOcyC*iDgKT5sozG(t_pNq)~A;pHW| zA0DCz)*}`vybpQM4z2K>m+%toLnTzgKUhJrNJdSxh6g9PIrbro6ylML-gt=#*bf_L zu@M7Mgr4C8QsKe(xPaK(a=ZWlAOJ~3K~z*YD2i|4K_r=I2rs3enpi4I(;wB~2eX&uT#$5UvgT-kWbeksO7E<{nSL1Hn zQdQSl{ZGa4L)}ro(>E}c2kI$!s4C+-Uce`if?fJ&z6FDi^H?xVkNe!lZQTAZKcg@DA{KQ}37{sFgm_*8FT$uZPT@TYQcrw90?Htk zHlrZY$)tY}jaYO+b;P0pF5z!lf;^` z@~iW2t>{^_Zo-I!BKBTAfV0GQktsI@-dLTSotatwLZL*nJ&n+3oI}n({8d)x;GAG$ zMjL<2;2VCAGU!4>#a-iaSWC06xmt}=bM4E%s9=%6C3}^MC;CjnzVK zY?tGQ;yL-`*RHZ$713Cc`5^FzwcmA<8rYr1M0-xwQGOw-x`cHcLF1&Fz?a06$lIMAH6H0(3ZZq;1p;9gE$ZRvHv zg7!%4)8W`9hr64(Rz~F2JFI+Jt^H$yMFL+30>OCa^4q#;Lwx@juMESSo40SwtL)2p 
zxU<^nllDG+PI|Y`rsW)x^QL?w*BPv$>N0h+WZ63xD7-j0W~br?WlWUnO`w*g9%W zM83%OZt1?}z7+ab7s6gX!p)Fqz1B5Ucc)U;#!MZI{J8zoXn1)mM{<(>iETcwm+@@# zvHZt}9?RhHy9@@kM zf&nxLOQ{;FQxvVGI9#V+v6Y7qVH|EF77otx2>gtiTn;IGLJ#02x}K_`%i=pejW2i- z*Wq?Z(wh6g;fhp=2GIyk!7lwJzJ`Mtc*I`zq9&GL2};5XFC5fC9i$-@MF4nc4bl;f z&WNOQbP7dLjb`8t&XNyayucbnqYKUgG>7IO4rfsbPpK&;Ba(inJNS%RQ-8!$W%?Ed zH{t$B5jVtAB(R6wD1r<0KX~|pyhULZDm440Me?k}KUbVQqQHn;{c?{8uCvFeQ*VCF zy5PGV_R(x(9`KzCZMU-p{s`T5JlT7BW*D!$r-HG;+q$iKtUk;7-ZvreKCMTl_BFBh z@f0;Le2>c#cj-sU!&&y%x~nyj{}t2Gjeek4BCj!D zo0H8ZMy#>P{Krf*=9{~XBIYdlMwWBs#YJ4Czg>llbhEn=Y5wirs-CI2x|IAzj(7i( zzVH2u7iqq@K)}0(Oz~XKH_Mw{jP~-qsDVo6V7ZYVi#u>I0#_)HC>z<|J92TskkBIP3z5E?Q6TAxo1w%fZBp+@YkBrN%KSLYz zJdTh*h$?(AHr3MyBf^H6mE|h`bAD*Hu-B2yMx)=9Sb+Jj^R2_I9 zeTzuGi`LYTd@yjs1uF+P;HQ>f zPs9tP`?{o0`{&AK&vhe5c=+_F&9qc>6uIR8qw7DTtSG+s5A@kp)qT#H!~jDcV30H@ z;*deI`N}`L26!)tg$? zyZ7lg=d+$2s(UrgP-EAR{H}Q8`o(x8Zi#(VTt8t4Xemb08oa=E6`@9}xx~+eJZC1( zs|cNdgD8M8=&36s1{35Fc=SkJ2-o1~Rfxr8v_yu!u6Ls%t|K0Aunp&M3D6WkA`W^1E1YG++_BQncA2{q6jvB*F)av_XnBN{`o2D_*a^+$cwr(S4> ze^48KOk(3`4czo4`qLEDgB$J1L;dlF)}T0wQ8-FN&{#xLAM~J!sHZv3vX+d$LqqFfHSCz&yj?Bh=IUntfR5IDI7#27H+)=rI18JX$BmWrh;gU z)l?SOFq8@*FYTbdsG-->ecYq5tS=nIVKCz85#@qIqmhCbk|>5G+=2r?zsSZR5})G? 
z+-QfcRGn^6O`U??n2sbikEYQ?jL>^gT(9Os`5$#vw_Cj$EV?|#H)Tf12G_5ypL`uH zW{po%z8Q9T)V(usZTBwZ<0RkbW1@)SUK+4w#KtG0H#Ix~d+eyvTUt zTkG^-`CTy)PYo%eWrRw@Dt?VEku|6%xdRL6cOGXl3P%Pl4zv!OvP;@O>2frQT}3ID z7&^;+g1=UCWfN;FV%Y_8R9)5SRMURwcc+~dhj<+IMGE?eGxlUP7=!c^RZ@;}`dB~u z{;~GTX0kDEVGng;iR>a(WZ(02eu5nr>s(9dEsYlO6rg+ptPoqmtFzOTojguP=F1Ny zed{u7q%^Wu%3t&@{Ty4^5&e$#pf!2B{7 z!XMZpSt50!+rC>eAJ%0~m5=Z~Ifp-%=xNUnuYC!+Wqt;S+ngGlD3 z!}_2<5}~?-6Dzl(XwH(MP0ek(mRhSTi;4UiMft8dgJg-!g4Q$pU}jb86DKTXRbZ(- zAu~LEN=DZ^yY4@_wfPHI&S{a&qD7A3IS=Y*)SPwJ4V~rsCNIio8NXp7b<|Gf%wVLJS+sWWNUp!yYw~ui8y^1MRZrW7YC^{HX%b_*H4fa zdG%%#!**;(I1b_P50g+EWob6G!#WH`7~*L)qVyTG*Q+T& z=V?C5>*}bF>Us&rV5#1WD2&hra8D1y7y$q1iE!`+k$O2A(`@WQG#Vp`{-w`R0S+0Z=}SgMgsr z$Y4iwbwsifNCD_Q?gCVgc@c~1bP9g8PnAPFn~FHPp&yD|d`07`ox*wzpWkZqfGNey zwZ=2Ej3ZPXc`EgocbhfJo}o`*4OZzqBym(%(;wv5j@#)VVliJ%GY=b|i=!%3+j{ov zl}Q~^u0QRWxGU+Wz+hFz9%R>4Z31QWe7IG9nn~T*Me@^Ix`1(rqyRNWGrq*SBd6l2 z?4);6bMGhCPr86iRV#E$XSQst|3sv^>TjpNm5(y}nrY&gYZHwV8KxzE;Sa?a(BIWT#L|10%gD#-W3w)-j>=r>Q=IY7w}&_r?YjQB)WgQV=2fFA+sg;Ki?Rm> zxnAfzh!K4#yF4WW{E{l?#H$9Vi@T0n_L2=`9sLXn`DC_`U7;(qfEJPmv)KdQ zg>tdqu%6b?aKzF}sz{UAMfQy9vqxe&-slAU%|GyST#E|)3-ecbU*%^T=spstjqxuv zqDkJhsXJ4Dv#Qx+oIbdQB;7q^r%^}m3wb3XRrZ{hTu<;jUxs4(TxKI*$-w>V7v9}; z`kP;b7mupvIzXjZMAGc^^63vlL&Y!RUiK}a{XB=$K1oZ@XeE{#kHmh@)3AGyf7&bU zoA$ko)#<+UEa6nn#@JuTTpx}mmx*un`07(Cw>E!B{%@!Odu&K^hX7*4NNUbDrsU0>=B%fw zJR5&#jDm1B(+<(wxZxx?N!Ai1;V&8{unoWI40NKeC<>4DPCXy%wZcu>O#Kmt>iRVt zT91K9!EB0xU(eV3;YR`z5Dx(Xk%f$mtT4raleH6sORz!|-1ldll!`@cmR1T@j)F)! 
zgEKgjW%K{4OED0-G8{}oTdb!Fr~tn{qbuMY5)gvjh@qPL9u2_-`1MIV!fu2i94B=G zYUoH@XX8*AZe1Spv0oj>HcUibOhf`&Apu2bDE>h=48u99fH?r>@M{={q5Lo|^J#o2 z%F=7r7sV(}G)0CErBEEBX1p97!z5NiAL9i?Gjd~-UZk&M1bQPc_pzTSuVy%?Q}kxF z5c_dd*G3YnOC2$Yf2SwlIxWNt8mjLh1Ce^T9)`=@jSq;T`Yeo{Xjq{0f_^6#bs05k zDy1Fw4+wbuiS~ECLLW-{#s*SV2eiU(uJ)p@`HlaA{i*XS9_eINKjed%-zeo&kjM0Q zzNY@$w)WSucGw>hm!xE-{+n{%d(3~`{#m_pNbQARC8|XD)gjTHO;MNmJ4!_?daA-I zI(?phpp!rIbJ?4Ui^lv9v4HiZ+eUjmL`~73@Gi&qh9g`V^jwRKB< z3SsO!l9WwXMkq}EvwDdPRY#@kHhMPYr!A@hylgz3#b#bz8|oygh~2yad!+BMpUk}h zruT8}43mDHOxf*i{_H7l(ya7N`akuls-WKClI|@8zr_Db-4K|f{>Yf@oaa38dw*$N z4%aD-_rVPI(p^5}hwOdz#`F=seS!W`sTXiZIpOug)_iD``IYtQz4*jDX|rO!j2aYK z#(UCNIiNFAlIN%Vn|)@)#_)>Si$@lUeD80T5t-VF>N+8Ie|JvTRAVpRU@i;jy@+Hk z?@cGgIm+wO7PiwGZtuc!zKiW;g`5fU3SA3KcKp=WXeY*Uw|BVniQZ@J4YZf{?VGZm zo*3w5pKy|#ea?PWz~{5~>PuF%?5Vc;qB93)7P4-nyh}Oti5b;A*Ftl&`%c)etTg+D z521y89S?V^s75*>{B5@Jbr`uHpCIgkSkDM_j;X1CWdexak@5 z!bAb~2?CfY9wUtZYyN{0#Kj0Cp}V<@nuvC;Z0K(+bh*$&>^3_ggkR!>yk;TeYt$2C z_*g6vUU3+G_$mGz&)7Kr5J|j~7>pb=l6Ax_9x(2rH2Z-sp^79jt*d+7{H3Ex&QSIfRYgmGB`d}L^~0mg-ri{SO4rmcJx`5}W(()OeajkRH?s>{ z_4y3e8h;5Nf69}6Uc0~kAtTD_qwcFAn8eOnAx;77;=6gNsVPe{{|@Z&*Ouw3iu~RA zRHdm0)*@%G>~2kyA5<$}7yD0Ij4xTZYgzb&kZ0jC^l#Tw_XabPwZdg%6ScG!hFmsY zvi>2ZT|BgltEGEbXdAvlEHKyUZ|DgR4e@*Ohiwdh?%841G?%l9I-Rc)m&IXwm7YpN zy<45RY!NQ1xlX^(6UIo@$TPup*qvY&Gro2o7l(vp)vNec3Ya+KufU0mI3!-=m>jO6F3p z@E-;AB{cVCdDk=`!DZtl>MJ}KDx9hes(>Ue1pV;H=j(1hI>3vC~hN*H4|F%xL4GsHAX!({7qr$7!7xN?L?Z@|&C){fTCT&U5T7Zg zE;=hje*T>83@nw?)sK&V_;54*M2;`RMurxqKD34o`?Ci|_z####!q~_yT5y*du~W0 zqbO@>*UP-0+}Scdb)S>_Ph!oh8%R={hKI(on{rz5kr@e10`J33=fMZ zO&zGW|34=|_RJV+x$UEA<$MzYzozx@%?kXD+G0JAmmIU{k}9uDqN7#U>g;?H+25SN zL`>J*OY#KxN}OR|neEKuq7v6UAOFP|XcQL>j4gZ+i#1b>J>t9=Y77v^gxBcFSBCBP zRCb**e>QH59|dCQC8BXc55_t8^`FRvH~Kp6>Les&fuw>K=m9qb1fIizgOn_+1U1gb zZFnD#AZUUZ3IO|*1y!;d_}m}OV7)2mIXHs96&y1#12eMxIN1Mx zVckjmjD1;mt;c%oN1TpB95P@d1D%kBPB@2XJcl3mA+QgzNI^I)haYvs22`PYq6(tK zDc3{9iqb|wB(Nbo2F;8WW@mj~j5ZPwk5pV9)J=pTO7BG#DvmHD=r?+let~XCp$a12^wV}Y8f$=((IiM|+b 
zVV_UraNhcUk9cT~c8_#D(A#lTUa+djN-|Zqle2XJ_LbE5 zVIUfXlrR^F<*W+*;tZ8@<+u7ZHD*J_1l0{*_QF-!Q^B2wqG@s9NA{b4kJCptluJ~1 zr;i=(e5#hPbY=3NDZ2_0DJ+lJDGOR3{9*Po|NPW=c7`YNbrCIcz5G=PSrtYYs+QV+~+sH}Ti~(YVyh;Xr zr~6U_a*JEkQ*LDCP+A}16`$L=uq2w&2rW7HS?|YHem;7 zo$em?##7ZS>Mn1lD~o2(08|M)a}uP-J|{D^mP>VI9U6FTzjKb^mYyoZO^aKqi05^f zV-{v5#8%Hfr-R%e2ly8`YxPk(*(zrr$gx9)yC&qCn#U9F&09TZn;Zvx15!Qy1-?%* zr~1q5{5oAvVvDF08)U@ryR4d8p--!R#yAnmZ_!MeL!FG@_&@wP%3z_oCNL8j2*W+S z9tCw5Y{WyHL`!5Ko`xb8k*X>DG#2x)S#L*IR6zm~{x7E!L`*P@av*6WW9W6M5{)66v{)5@2U?KeDX9(H^;aC9`Pyz9X zM|{>@=Wq_^5QWd-Mhq^%jVf#=q7cJl5ktyMLseeb7>g+J!u*`7xV|tJ=>+Z=FR)s0 zV=ao2n|%sU9bQLYSCQz8B-WEAQE6UGKho=|C#}Z|y2|6wN|n_^@CI{m2T{1ELXe^k z=wu`yNsp&hctrJZKIe+)*rE+59UeJk!Q1*#P<9+Pg6$yP_ zhrO+A?X*Xvk8oWWCwJ1G!b)Dnrh?uTMRX_h4oZprD3;H z$7DTLT-z+C(LfH;cUS~sWkD2XmQ^&Zw`ZzY?V9DbbGFO3&%Y=z$6F}S(I1)lFI}>S ztNe0-G^}3sTRAcFnra~H_@z<6%q>=_&sA+)((~CFd>)yWb3&e7a+2C5zx3S?O!77n zeK^>B`B;A@H>hhWhi=0r(Pez9AIoHiQ7FD;;jEXw%m2|io!inwl6}WNgF{8s47P!K z>F+QEpQzx{O4O0oP^x*AUtxFX9o%XYeMMDhJZ*qqf2;2^i-yQ> zRZjlx>1iAl^<;bbmwG@|sVvS>d$y1YJAUW99ON0m87rM{Tx6u_%NG-QJ%9U`dy{vj z9k#1Gb$ru8r8zewm$L%fX(bOwM@$NxW}fD~U9&^ZhNh*yPaovXZ{~3|SKIkx-i~)+ z=lCVMuJ7wn>Knd}-GUq0upTp97xf|bI4s_C-i*+cKRlD!=lp(x}Fk z&=#FjRFd(5hwN-r-q4~R)0540)!o^6%LnB9B6qH+%87RpD!d89A6SIOD!MG+9D3;^$cX-APyn}tFapAv;O2KaS|v0NBcy5G=oqGu7jDKAgY4;B{+hB z2tp(Xl%QqsuHb$UUO~%`gSenF3a*3ge<>f$pnVloM{U%{V7O6_o&q$Im4JX-)P*0T zL^0GbnWqJ{=+i9%R1Ecp?6x)>y8Wpd`hJIcy+5V{dX|A1X(&_pHRm6TVg=$b8 zUY@qFtLzUdF7BCM(|exH#htEJFL^)lcw(pLHr~L|CD~|pkd+a$#eCjYF3_))$*Zv? 
z{FN?AweeKf#TwQb>ybefS#JIv8&BCOKO3zl=+96A#V~`Ki}idZKcTb1jVP;yQ(HeV z3JL#am#_c;AOJ~3K~#sI(?<}`rs++p4f5)ncDh|2yVcj~5r0kXk%|oGjsF+3fw9z_ zBMynp>?bu|FVkaKAqueXtWC1GBx{c&?15pz%x(5!zv8;CrvK5q&`$k`7c|p9EHKKx zW}Fug*Oby-bc%Cb4OVGJeO66`sb0#|1H~IYkA1@O(`__U-{=X-!YwSKT-I6Hj_zj^ z^LFR(j%p_UT&~zT4hB&GtD){b~!0_ zQ>TjhRkiSRb4_s7b(QzLbf4D)`CW80%JSE|F8Zi)`U_P;9guP2o=9Pn)m}MDUJ+yI zH(b`Wbh=uocggooAGcdn=WXSqmroP(v%R{SN|YYEgOlRSx0^b`Vse0Nu4l=`s+{vi zc`?@S56o0)@-LOfUU4?pNT-JIsCmfLJ24p!wbIpb714Nt5G3Li5|OBf!^C5pMGc(B zY23+zLJ$)sOqf}^BZ!7)c!pKS6%&p?>5@eT>h_!!Dw@&_z7VmLOApb-(oDqKwgA5roUK}_!;eZO;HS=vKYRa zCNdwtiplIW?+(Q~h*UbozGRb7ln(L=7|gcu&D2?paG!8rYyfCLm%mh~!@Qv-mxhd~ z@k@=UE)ivNv`CNf{_6kDU)fG`zP89&<1BVkWjXzY`8R(eE(dtDc^=}|VBz4&w8Ksfwou{5oV zKiRf@dHoHne=|IRqxNy1ZJls>`?6baWFY;nlZv6Q?qw7Yym7v9+Fe>Yn3`Ta;qgybuc)=XnQyjjB>}){$*y9oQmW zKyOAlc7)xb`s5>@7!m01n252jIeu)FTwme2^uMUWKbJ$D{(*FSp=Tzx^66w~vN z8>Nk-;uxQ##_RVwUH_oQt2?|P6+xJ`^&YtoBN45R>nFOZF7EGR=W^Ea2`q)(RKw*@ zs!X3?x7>{ zEsG2h^AeJg18g4xw zH?u$!41$7~2+sE&l^MF3o5N^(E>gYgOP(e(=C>VcgJt6F^ysiupeD; zi@Et4%C6h%Gq}eZV+)@_!-U(tr9*{YOQv-m)1+hDLHorpddc#piOiSY((06&A}^}E zdIE;aeQFx6hd*$`otuqPg;m-I@~%lO;O!q^)_=;P)hLRW!Vxc64W84;=e!`rCtIcU zK2(u=Wj8rDH_vTzJX@^m(p0k$y%0&F6kEvd zqrddXyY_7hGG5luHxbQ4?P)Tvvs2ebUV2Z_dOI_9L*}Arwt}tK=bYyHKV(=Fd?Ehs zF0YY4>nX^Fu+P0|{-KT| zZs^LYuKOL`b}G1Tg#3|hcwoJi%ib?q8B_TL#9<`r1f+e?nMdVt2tCDlo)a%rHT9I* z$i8x~Ea%KtrrfGO=tBA@x`NvjYIk==*kkQV@`@_Krm@=8K5!vW${L70)IyJ-E~1l| zBTpgU2+c}@Bp`;8sEH-Dh3Aq@HTboPMiTYJ?pW%7|?ohU-@ zbT}62!}OOfDZ08A7*|3&8AC;7KAAmd6;v|DQ*$d>cDFAF%GjfH6SWT$^pwC~&J~<= zJ?HCKJ(=J61((^sM3hh7){(k+V38_EReaHbBTloVTgkVQM~26nr}(-;bt9vURl1e) zi>{n;%-h-8_pV{;Meo5h=9?=c0{NWV`lPS1eV3NlkMWNPMLinL7jk-OyrcGdte%Yq z)D_Dh@trPE+!;5q{h`gwK-_9E3v`hawuwBmaH?>+zgWY1M*dU8JGvx4h z=abKWxZ(Reu*%myFvZqZg6~eCW8ie=CTpH8KP>cx1QvaG>m42VCnbOSh0NCSLuU8) z`JFb-5!-N%I2rP=zm-+fZtXvAt#IxIE;-kALpscdn_H|NDowAnKT~JvU*81v108?U 
zCVi;yQrM5~(P5`pn659sP2QVvI^zciDoG{FzB){Apu^&fIhKDb`nguA54dLgdfMT7UHW9-bnl9gDdH7HT3^~dGiy49us~f8JhG31I*CKP8!giv`AVh?mu{~E;#()oH_~3jds^QH z>V(7ve)BzI9lUYg?~oLb)?L>alI)FK>Gap6=h1X#v7x-E@tlgV6cMLSU?S_L*6Qup zI_yn39<)-{gaa{hgg0mNj~D~Y(05Yn9iiu`ZYag#aKC(o7>h|dyG>? z7En>Tri#G;s_3{?lJ4zvamvZH`Y7TN=Zuy2Rj6v9JHXK|sXlW%!}UsPV*RR;@U7f} zag@z2uArwoT~$DTuI}k?aoQ0&pqHsM8iVHgh+aqKog<3T&cH1DSGiCZr*6ov_TwZq za1vDr{<03JC3LCAdI3B4M7vxFaWps6U5P0wg#~jpc?Sg zFW90lqc|l~SNXZR4M#krY z-#aF7+sUI^>s^6gEXSTq2I{i*_E0BGJxF?)8kttuTgMxlHce;h|D5gqzpZd+wElZ8n zV7wA9DHRo5%fvEX5<76%Q`t9*O_tlOA^ukWBKVAbBl?@4h)Z-PaMf<2W`^w#YaUS{ zwN9qY_(mmCJvLSShcf(2Z~nkLnIUPd@*~wqn4tGxSB;%2>5Zkbb zT@bUyEs@Y7fE7+_q9JG{Xg4uq5`+lN9cpRv?szmDyn3T zv)bwuhjjkH1!E1L#u4(TAwBDT6P!QQ4}3j)hZbS=LVB7f?P=nbJ#`5zw6EbM9R=r(1MrLRrPOZhI%Yq z9aPWce^v>nk{U!l)|7Vexm>XGvbo*Txn=KGrF5F8&21ynDxtQjAF!UXV_fJ!1Lg;A zv8PnpZjQXF8vW-K*T2hO^(4HcP`XSxR0CZFjaanKjSY4U^`HFJzu#)52IvhCR6Zlj zU)Z{29S`)kiU-0|MyEY|5TDTP)#{K=Y_Ml>*edrU<2T<-_7%&Xa+&oq=kke^9evD( zBAK^(k z9>PHaUf`Op3qMk@2yO^HA2CS6K_p}aK0yoxTL*(l;~-XoAo#d5CJ;SC(b z{NIZ#K1N$m0R{a8*TMWza2-6}5e(LXIQuyH{V)DSb?U+PA_1dBJ+!B{#u|jN66SBX z&X0-|dSe(yak?#{jYoQ-`McQw4UMIRq>6nQa#9<)D;E;cm zwZb`?b~baZ|ICNB-e=wjRZ*{{x$+0a^cQJ8Gi&-n0af1Z(X;n%8 zT8;o zm~#KgwS?vO*Qh(ZH_yfjiLu60^SIeWOhg5?mOnAaoB7SoB1&u)Cr!(B*R{!g-SxA3 zx!F$)66?`jcF2$P@WZX*|{5n=uQp&z$loI2_ zEpeUq7CoFuOi|zQmfSW1sxy}JS?=@hR<5EETU{UA>(bZx^V%+56vgO)cp)~iHu@kY zQD0dE4yBq)jHjMGnHMq!d9NhzeZTnqURh91Rm$@v|5NSFwc344AC1~=NX&Ol7steM z|1WZYNchm**(~N;-#8iAEPt0()sK3h6Rl1=FP#|K)xR%L)!#F-ps%g>B0pi|Gjr>q z{53ynbTtpS26|S6w+{K#ykV9x6ID$)N!GVo$STfAIo0{xzC>%}VrO%Ra<6i~z%5i~ zcdUK(CZ`q4MHi^6<9F`ax9n{yLQWO?&Ec*Sc4fPk%Hch3Ke5j0ru3Bm8R%;lP+Rq7 zWav=gHumrjYMe7j_6>BCSCkS7Vv+ck^68scEQ;};_%(xgVV=iY2kgc zs+?-%7R5w$HcEei>!KC4K(tuJk5E~?9s9LrRpFqXI;YRU&#A5%NgeP=O~xBEg%`!K zUk^Zn{tqTz;1%3RK_VF5pfDz(CF-Ce+<1%>ya14lFj%l)Wg*6b1(gMtV3T1mXbRc{ zL#Lqj35Ha`tWYulf0j+~-j56FL7U*!pt=cq2<`{*7t}t%@GYpDYM=&c&~kbU2envU zv_vH9jSDQE=b$0vW6_j?`_vij(Vs5jdr^olL~wv;dZ^PW;8C`H`|iXaebD5TI%z>w`207E^;6* 
zMqoG21cp0Pq@8xeJKZ|?{6_MCl*BZ@|C}|fhuT-wQoIsdoD4%r#DbRbB5%IpE`VTW?oym9%6hQaR4=-u-sbz{~Wh z{_U9$y&J7K|5g8S=dyLfZmpKdTY-v>aJE<-?fELkNsvEcnK~l-<7LDF_hHW&gR?`b zvOU25$!Zqx*#F9=%8?J9-5(mHuT05M4Uxi*t9Z6bM9OsJ5Tn%uddfOFZ`52BZMBmH zpjHw)d~KI$%nb@pKqaWAaAN|LyudV)83H!2`g0XUo)yVhl2M?O4cq?B0 zg0r}y8ek1BI4Qb_`ju5?%Xm@$Y`dupSWNDdlYBSrTJlQD>_9%dkWH!)B?me=XXR`9 zfPd+#daQ2gqtsAz6+4`(x*#jTYKpq%*3@e$k!fCa3k&Ir^9Q+jL+?16Z1#VbDG!=; zQ=(N1<6U|u{R6D<34D-UB7Bc4T#by}?;gs_;VWZ;aN|D_!cF&9_i8iE)!H@6)tvQW z$?P8U$O)=}nr=O?FQqMgKkCD|w0?N(PfwyUE^HSLlzo&+&MzCnIp~No*ll|RukOLoNcG8WzIMpXB~aJWG?(_ zx74Gknu?=T{-+gYAJo^=P2aGWKW z2C-S8Npu8`%7-^JTMt1o?NbdgT~E+oBLfBXEkq$k|C5!@1!B<>_i$ZpLo=k{4GQ83 zp5O^qVHH+ofgm`8s^sJ3@yGh*-(9Zn(g!i2euR?c~$M4)dAW`|D9%Ka5^8Zg-c$ z!xr!ay2C1Eg`W-_IDz}lY&Ge_Kj|a9erX`_9yKA**$K2CLSIXkCp|Lk#+J0&vQFw2RUY}rD$g|$mNp5agH=dI}v z%ZQ|<)Cr4d0Qd1RyaPYVyNh@+gJ2QlfSmF*86eKJ0#5ZulBx5ujQYE4E6_g6k}a& zsVpDEx4AdFdJA{xXCYffq=(!|IOJY09P4}0j!Mx2amF1Rew&S_#_svPYR(~cnkBNc z;%8P#PuKCzA$eHkb6V-5^n+83ZtxCHoZiQ#`9l5Gom;jo@5@t>t3n%wl@}w`kAc&= ztrHU%M}Hw&-gf#s6|6GOdUZq&_A;l6YGI#Huhd-TqfzWJtHZuw7iE2&4YBCSa`4_# z>bf-3xNM}l(QRoMU1uw>-#!yLn|&!eg&x!ookhld z_N8lj+C}>)vwW5PDNYZ&ZlJj}SDlvOiaD2^>vmiD$PrQojysL4(|V_DuCH331d0SY zXO_wQJasVND2qG$t+|0Ynce({GcTq+aQ&(-SkpYySS`C&{@=3C4|767a{W-G)Pm5HJ~XpfPJZJVL6S$C_O}9LrM9iBxceeM6(~L8*NaZ>8`4UDunX7s9bJum!(7s zAI+G+Khz%rep-kGJwg}4L#(G`XpF}CJ#JDXT7@X&rMvJWufB_0dX*l8ya0tG19ve6 zmylhzg7}{fo#6W8pecxSXm)rywOuvjm|N1W<72V+9nvH@FT$=i>@m&|C00 z>rLFm&8)yL39pfW>d1tH{<<(qqx+ERZ$zpgYBBYYwbfV1aJD)N zP?{a%-y@D5uohIB{md-trW>giJO{p|2=B439G%u;NRXQ_C~QgoNlXwNvKmH392kwSQWMZNgNFM5^boyL@Py@U!lI?pm(Jd>)kz6UG&)gxBay;oj%= zI|ueAXP>(18){Yc#yOj*HqvsU(AgqyUz~X8y3>P7$-Z)ccB%2s4B1gv(3R~kSWe2G zDgEsz!Fh-b^IE3COS&tt0!Cvj3(3W%>)(wHtg`LnH(3K$2EK&Ui=1YIKeUvoCtBiVnnBdvfvLRG>t_J?tUGHE9NMRwPx@ko|Y z8LAvF&qAm=p3+0+!bJj-WE({)5%*aMN`b9sVFA4q4*SYUm(yimPOV*>9YlFl$FDRM zBk&Mk>&@`fXpgZREx`sEb4#LL4H{6veUvq-ca71QA)# z`4pexQ&dDnl)@Xl$*PAXXN6`TpPmoa!agq22Q7mD3Rbv+cnbCq1?yiQHy;Ll2GJ%U 
z;Lr)$hgx)#Qc(c0bQ5>55OpC@mTf@@J4d@1pZrO{Z&T`Y+|R0oXrA#rV6cJK_^G(NM(wGn@apV?aW@5h;U{R3R6 zn$$l%-s^k+S$fUPF@bS*d3o16!Qb2YJN@fGsPoFGiv7+P5qm-xdNvwWjH1SK_BCI| zMrO|P3+quvpTKv{7+;=1Nqe^YjxKAjitU)^ZMN^T^#~c~o~LWlb84C#mRRF$XJ7k_ zVyOqLKK>HEPXqJ(O?*CUyEVgKNZpdf?c$=GF_S78-PmT<+{j~8qHS`5)83hqk|Q&Z zw@}JsZ#nN{=c-ddHnz#i?VliR{ak*|>+_$@L#|zBlItBz(Z`sXvO0C&hl2L;BxIie z03ZNKL_t(`wN@=;^=TY(XsyrbkxmuaRy_^Wu%9~p)b})yY`UPDVu;>^*?3PA#VmfF%{R6tXC$q%R=%D5 zX12_gwlQO)uV(V@^n*UPPg^O@pR%HQM(;#-^DE>KZ{5T6Ofk}EEyHLDuVz=$*Jzp! zMFrm1YAoY)4%LyK&|KA=9#XhILfzOgtjAX@)_>Aji95dV%%_>lL;f)9iGkLa-a)AY zDFhMHao7I;(Ds&5R$Sk==v=$1&N+R~X@dqDcXuba1b2cIYR60d{4V+FJCqP!7*B7t;Ysl!oBs5#zXssJ%fw# zQu>icQ3pMnW1x!A{?Ug~oblQ@FTD&ua`rX{$^*1gRM1j5d)fkfftH?{$*f{ev{GbF zdx#n@(zQ-%z1_mBqDA6}nM?k}o7D{Yml~&XaC7d+L&X_w0o`=h_AGGRkiS@3V6wPL ze_ED4-qGDT!vC|%GyaIjxPYE0j4rAIKH@PuKetA)++IzGj^}8N_bAQ#pd$(i$cF?Z ze7+U&-;VQb#71oV3?dn=Rt9inyfgf)5g36HpZQBhdzHb1GT#3+ZIr>SG63kmj$)KY zd6dUH06vVM;fSL3L^zFdGzR(Tu2_TK*eG%!M)<{A+ONGcQfLz@X(LfiEY(X<0Wn5< zj9$8~bt0GZol$@ije3slbl0`X`B9tYnUSqQo=igyj{9=vr`~qGQY|92`DBWf|AXsy z1IY-cUk!b-k`jugmGG|#T{M^4O_Gw*KLr>2E(8yn5!zBT*KW1 zdMT@n?17)t!-H{FQc9~dr*C=Uy`)Z`(!GoInj9HX#{0rOO*ZA}^eIqMK1HtNeaTl+ z9>%qZZ}o1vas#MC(OOU8kj1&Rx-X81vD!xUR{SkYt*?H^F*ax755FSEW1^8qYpuIzi7LX2_;!5%cL(FXjCUmb z9=}k_DU>)xduR;rWR1c_9z;?XQGkZ(Ewl!_RTL0z3TYm#fiq{=c;`}mxTBA2h}c78 zwCTp5!Y{@;M{2FLfZm=Sa!&C#)e|O_*RDG%i*xu=e4le;nwBu#-{$f)SIHcv06Ys=8DxH=;8iDy*YYImUg`o*HaUD#;D7~|&ptTP+k@>l-+^@>u z8xb$!wacmkH5Gg0Q;MRtp&wOa>{G+&r5J0Rr8kr%>Pz*^j&n8;YmkswJ6Ouu z4}B3@A6jk|4Xv<>nq|0xJZv?hH{4epL^at#9T2yWtcqxh^chrMTVyo|ZFg#_zg@~* z!gD_I_0ufz$NjZwftKX#X;-p?_C)Pp-o_qYDGhrSt(PA{L%rQSe|raN^TZAsVP*}z z38kYD@5C`V#aJe^cI)mSwPx49F%Y7ZCW zQJOYCJd#>dX8T_xq6j}nE*uf7;pd%LiCC1!CRm8)iAce8c#w==a0bQr5O07RzabL@ zs&X4NhK@&YA`Wp#rd6~G*Dx8AF&=xd7kiNhd5{N1Q4~c#YeN}HpNuq6hC-CVdNPtk z8DCz(6 zhRI0e#pHz`n;sG4Ev8T^6~|?oDVkFs^rQfnMHDro6zU-wi#pm)$2?<+qf6A={7p+n zEo4WGj+fn`6D`|%=OJrUza#E?Opo#Fismvk{{zxSZq;YyHoc&EKbwWB0e z%4~0&WXf4gP1kOUDR!z=kDA*X0;AGa 
zrv01z$&=U7PB+CPx=R(+H4gA;xs#*0Jk7*R8Y7=`61TB`Rn6o@ZM1M`CAchZaTiqu ziQJH8(>~NdF*T1DSijq+tl8>-N>;1XVAV@qQzJN9t>tL+QWI$)J0d+7!Bae8MI z)z=s;(>YioDAMNn_XHZ7&twC$l$IYCLLIbl%;w+ZYaU|9*sE20`H#$FHVxL6etFsc z##=3Hk*i5qYS;&_E`HS8>0hcj^fP}ij_OVH-zl1IVv~3vN{NoNo3@jtHmYCbZmK3G z(QJ{QmZ6f+Nr-(~9~y>b7))c)L>R}6FzQZi#c}8uB;s*{>)-`m&^Ve4P)+25jTg9$ z@X!8J5IzKP4BK!BUmzJzP>4EzHp>g4CThaVC-?+%As1q>7A?^bRxv&rwgSsx8j)x3{U=5#g{_1>?fU z=*j4!*VQhIhQT(LW$V_zDv^5mR{3W8dp~6jl6*L{ER^5epem|Xyp~%d%GKGq*ZExSv8t*Ap%3OGer;By@{YHbB}%x4 z`!A`vMhr^n<(*&YzdG-`b8u0y*(m({UD{WHv~!tWJ$qC6rg-=I{fw9z5ep**JD)pW zYkBn{`Vd-;NzC%09;;8%w&5v-VLf(hos8(1jBZzRQ)qJTzX```FbgtJl48A=Dwd;jOH)ZH&t}>nW(8$g)W^ zOW7VK(IU{9YluA}D-CtLa`bl{bKi42yvfEx*M#WFq5M|aOda3b@2ng(KGb`4E9d?6 zi>ZCw&jK~lcDf#j{@A3|!8ol#*j?ib*9~fk(fR}JsFD}H}x~Bx?NMBB=cJp`H&oA?F;-SN7%VOEYW_}C%?&VR*+q+ z8$5s(2*1(K*sQJ96O7%Wzd?=#Vx}X9<0tr>^PETJPvV_kOjgl{7>Ciqb;6x!jndy6 zVeYn3ws&-Pucx4EQfzGaZtt>4Gi<*Lo{kRTS{A;r{`nT zE*|K(l(}c1roJ_@s#(R+G3s|&$+06M-lsciMb%Ayt~K^HiTf_)X&~p5eQ%8SS3WjM z>KflFm@jxT809IW*VU%G*6GKLi($(hLO!R27}O3pQG&%3$&uz75at#L`y0uE{X->Yw9l`&O2A=DSDLaSLYj} zXZ7KYS~P07e&D=1Yi<=ti5VR`_CG@=|MS{}lK76xa4k%chvX!=%c^OwlFz+So}=#d z+)h?gX{l=h7lLuVa)HR;-)Yf-rB<=jp#PiT2>H=efzfxL#@&0Jl6F5Z&{s3!S@@~& z!rDo#M%Y6CEBgdr;CEt{BPLi%PM}{u9`siUm2d`Vxww?LC$7ZHfU4l@pV~F@EmCm` zzf(3Gu{-m4>{6MmIam^m30a}0_9?ry{Uq=)c)^z`aZY;4l(kXk-5VS^62ZMPDZI+{WDfgD?yqcyki$|J=Mrmb@?^JaxL?*{-=NFz%39XV}q~37f z(1kHIusyL+a>G!k&{?w;P14RI6{*^NDvJVQvG50LtES?I59#(==hFCP5bjFauOg31 zjH*e!{q-FY`V4+*=MzCk1ztp*jH6ujQyWZUOlPkgb$WhPvt5sB*+=cIRHfn8^l8v#LpI{S>R_{0owQ+?0pjT!q z^{;YUcX=k~59PLJsC}WE)_(O?OBY2n5AMN7A6etIoQ?0o=CWKAn<$LK@fS`=Rt4l! 
zC3u8V+EeW#s>?rR1F9gitM}?aXoa=a9vhm*5-Zc0T5GlF8~(&$Ve|BJdLBE~J|&;Z z;#`_92VcpM9c7?Dx2A^87Sm~b z#Gj50+U^$~dF@@@(g;zOTMscspNY~!D zHyDrgjm{0mPrB*2sWsFx>&>*AdVOseDREuY7gw-K*9vNV=o@X5IEe)NS9->ikR}cx z9~V>vn>9Lv%jm|bctFF^8>L|*i08Bx!|?@o#(g}7&38~5b#Vvh(F9G<5(!Aa`_Bv| z!|RGc3}XM2+=;`J&ka-t8_Jl4e}X3{#M!tiR$?mFq7dQ`_a6iPIRpcOL~JQ^do@I6b2t{=d-^sO9!ikrl=40 zez{p};MaC-^C#IsMFuy^e)jj_C7m;j9%(K8!!TO?ZFdhfv~JkD(-)eF=Ce@doGYW+ zM)bGR)E;?J=25pOw;kkuG}o?4o5j1-k3p|&8*L~%;$h_QOq6MUOr@-&^F%~mcdqnqd+G4)O7m%Mlv66UWO(ne zki3RL(r*g;j-3aMHJ98>qx3uHvYczIVG}eErlAe3v=8obMyEc}jVs5_+YaNxCPhmBHP-1YYkjH+#ngZ}@);M9Na;wNSKpsy(K0+C&$^16dnbIGSv{Syg=zOwuDZ5rV~u;kIL_%Yf?d=AO4mxD z12vG}NS|_GtBS>2dSm62Uk54m!>8@(tKa|izFg=7gkCW$Jd-DCoHtjdq|CqZ9&yd( zw`Zzm)_K&WyP`QoW0M+9hR8)@txB?=d}=>Y1(4V3qa1t-w^T!PQ^(bH`wwHLsDfvX z?fM0M8`tMCJSz0YnrD~Re$lUJnrtu6szzK~rEn5Pu*FYBIX;VCL6;n1&obuIAZfu z+AOv>g7!*#pKEi`nL)4qZia;VJBf2lFt5vO^Qe1>G2&(lyG;XE5Q9;-z7aEHqx zjoYiOh(<2-#Fy~$@7M@0Yy_z@b%xE`cpJ*29LnK3CSbyU){OnvO^Jm*U(Evk+s5e1 za3ME#;t9@wmYFk}tPF9wFbbnEI-@f>e^!X9p&H6jAu2?rFa$&R1d5|Lp3n`tf*3TV zUi>Q7nM-j;qNJtbg;@|cO%Yk%#~J!@h5kl?t$xS-c;Wd3UB=QPe1 zY++?q4gB{)hiyIekKpN0z0{)V*Ze=G_75yg&!ciEzZx8Hn(k0|x?#4n8_-R9Ddw?L z6mUHA*H)?M{H{*2?fYF+#|=G~*3B1__(MwemnR>z`IO08DMOzg_#;A3wdamz&fl_) z%Um&fgvDk9vuk7^@?7+87a~_iHqibwdKmRo85Bcb-by35l0BYhb2g)z(a<@SUXc_F zwS}U(t~)pC?VUNJS~%zF-cR|1zQAexuAgv5`jX?Xy`AImJGVMAsbTt;j#2y#_0#i( z7f-vNl0Twyu-B&^@jnEz$*Zpq20Su5@71g6>zt9=Y|+fvU@S3SawCA&b0i-{IjTkL z>7-Uvv=W1@aq^|yjBYr@okS6uDaKmuRRJEQmT)J!X^)X_;f0sli#_y|-q325$a%3% zZI(B=s+`3GC`|LHbHU_%)3r^ZG2!{O>S~I)MGt!K*p>KOV>3^ZVR9xNVhdl=uRNXS zS+951h75{A=~&}tcNU*Yqp%UakNeHl*Z@3pUJJWUgqL=H5@4Voc;RV$Q+ z`PfHUl%G3M=|DDHmkp)l0KN;|WQ!m0D6|*#y-~&o$IImC(681qtFfv=!;MXNAgb!W zhe`(@&=@Xje`76@o6TWrHiGD`R~3&iQL8}@X*G}Ir7BOB7b>gDk;(1bW)72sc$0W5 z|Bx@REAUM)yLm*t#YWt)%c=vk!1_%N#WyOe*dcCf0qvvVf)`h?N;z=VXvhP^X6-gb z(@FY~+1@3?WpC?_eNY-|kgUL^B@B`69}h@v*s;W zKH8uSfQD#GqvB6Vf@-rz-lxn$OI-f$O2~L%Vzl z*6dJL^Rm@RRhIiWO$_DsSSQ9uMnqhS)SS89v)to--Gcv^$5SUIMtrJgy}^DyqXw({ 
zXsr6kKdnF1KD>0)%vvHmDjL?~*WWy87~GZC^V7aagqM!Y>%78ORUTteR1JMd=pJ83 zch%3E6#gUCHor1wC4KGhlYS~{0uFFHN21FWW@s0~Qapbd*bx=jau6I9vO-2jc&IU8B&i*}l%! zDR>ak`Z9IgyDRdEBw<=dqIxBVUOWu{$5T6ecfvjGhp^aSDf^6CV9%9BrPPXQ-L$;w z72P1OeHAvY*a^y^ezLydvZ@v-EQ6B{BdE&od(lHQRd3a5-i!BoS-R*bqwAu2Fx}cJ z4{!mbiP1vnGbj$n`MvldxJwocd-p&Wru!+KmH(L8v(=G9f^V|@tDOzJg)}OAirFm? zW3*Ip5^^Wi!%2iYde9{8Ms%2Sx?`}nqHB*UDi~v43vK=|{`siak#Tz;)_(RP{Aq;g z-5OOT>!ZvcM10hytOsNdksRrS#3}WU!R2b zVwR{OM$!k_LOzt!0B)^S)W%Ub_eXh7R+^e9%ggTSHa!(3`I`2c|3p)Ig10=4_t;A@ zRut8`Y2OJc%3&%yWo1>sKCMhVho7!;J9X7w>WI@T>IH>I%R?=UiL?Mg(E%lK0t2b7 zmQ_m1s-F}qL}f}rIEHA7pDE3FqfT=b{+>LobLJEWZI{^!)9pBGB_8lZ)r_BDsceNS zD8@DL0^?OC!~(n*3HY8j;01KG6|vAU06uQdm+=}~ISoNn<|~N$949=16UdG0xQ@%H zg<2?%z1WNUpLL%M5&HAeS5!i!&oC5@aD*cbX-NC*c4Yw5Kn%n{%0;;-*MGlB%739Q z>e4{y(2(p7j-6E#te7SK(s!bvKPH#D4@iORw+@`$Mp)0nSsbf3<0H8T&N{!_Cb zei&YVsm9MN`iHj!1dqJH{sdLip))t6_hODLo@3w#$mmcA$D zaA>=ICaFgH%uwOL05jFrt?$%vwZju|C`Ys4i~Ijxy=x7ynwx9X5u>TYLFe4@o~7x( zsM=Im)fZ#QmyjGfYbAM_cx#4_O>dvvB&njBE|-|cwUHv8Mu=RaCdz;jo`3;R#RP zo(laD_&!*yK%HELau?@T>=X~#ncD2rsbH>mYZD#D`t&&ohm7sUO}j->Ib)1xRA{lY zLDZp8A1lpVVK3qdY=u7Ym;BCE$g$oC4_>ib*eC1->bTs`N3ajJwv-Hd>bRzz*RHFv zY7G|iQq<(0GF+WP8|p`Q#T4-hx#$aZfLCIfERJJzSoYzy_{J`T?m}7B)hU!SGx4`} za_kH>%kR!!K;K8wSf;&pPqRAFX5$Y&YM+r;lu<#z|$2W?f@{>3V*pA5eldEFQul|9PkNsDQatz;;SqO}2m)7EXXk-gbY zuzhwuc~j0XDrtFXhu%aRpw*$Xcu#Nq|JvK^aY)dcz$+(mfTzlOY8LOLK2%@)sP17A zo?7=*ZFq5lx`=Sw#S^ey6;UJQMg4_Zp&mK51jnUcR8#C^>nm$2PKam50j(&7I|l=* zPjo1qE zt)}%FQ(G}hdoJR|Is2uWCU1(u)R5k&GAKq(?cFL5PmpKC&4auE03ZNKL_t*53i$^n z@Ge=HM^O{bN$dGr{z~+utYR4D*IDn5)-;~R;~{=QIaOIMu_sb*bcR<&C{rEL9F-z! 
z({S0$PQ_fTmbZ|MWDGz@I!xzKgjaE9+@K-U8D11c86@BWFUA`bg}{3;LkplO&r@IF z7Czu7C89C(CB2qGtPA}5~WDbhdxv>6GYXLyE3D1ZXUhda1~d&r8c$olzv z4H~r1{3sT&h@~o2g{p8eCv!4gp(}I+l~9S!VFD)bpR|)^qcfIZGp(cF$cs*tjeeq{ z`hM*zZHVzk8?1k4^mbM^vX@w2w_UAX8&lU#U%#F$0 zyf6&MQ@g9|pjzSqmDek#98S-iyj2b7R`xT`OlJur%y=p;i3eeQ^hC<++Ty67t+b2a z02!q57ot&G`IK*yj)q=Jr<`Mdpw?Pz_r9>^VH?x0Tea-%stxDmw$xng6A|tbp2^+{ zLAU>JUkTMhMer!GPx$FKw4wXF**>9WSd0800&`MpnOW5;wcWakJLqiY;nvt-SVn98 z2O1>GP;0w@%Et#)bBv@q5&t-QyUh6dq43bU*PlL>Ph6SfMdZ7vmcd2l6)RdB#;uVh z)fGIa-wLFu{zlG-;lW#pUmEX1>1l_tF3cIdIGBeFtr8cf??sWoUYTFsmTf783R)pm zMa|_Qw3V*Iq&ZZ_7TlOGb6L?H_jrRSKuzVZ+#4h1NFKyXjK4%HF$ptKf+GNdKz_gJ zUvUam(3F0sLHq`LaESL%3hD;0;u6VVIb(g;gpZ|YuQAV5cFo{D*Dck~A7g~{ZF(7b zUAt{umcJnv?x>FV3328f`!D&G+9I3qYq5j+i#wtqPSSqIKE1!*%laxbDzLNz#%Jhg?}W+#av*aU2X^=P#?>Fg65MTLmy6nAPg7n)u7ff4GX~*UlTI zlPlZLxN1rp8LhIac+xOh4yLA*!kI~mcY*#`jJ{?H95~AJxTq@S$R}orVw~2^bf)irNj(eakBBz8>{|uUbnggb#=xX$d7rt*;Nj(&u|~OxQ4h# zKOqVoU~&Vs1^d(iUPe_#0W=p^$l?$^#Bh?N(RbVw?%~%*QmGY>K_GaF}L*zi3OO3e_I@59ZkO%j|6U$&6U zlh}a-By(mw!hWg_0g1Wz0y*di5)px}$d0?Xiw~dEJu9#RD?X2m#v&Fu;DZnT&-c_b zc06Z67G(JhMj39`Gd#mHIzmSX8b z??}+1#UQ(moS{0taU^B-Y3VL8C*mwtcpb%ZtF1LHCc=g!3CMw>ZiZKDgkGbw_$#yrbLqP8 zUo}t+^R>73nrVTFR7+cHE#X~s$;^pF`a+Zzsaky_yS3SW=|gS3fHB0CkN0AtI_Nmy z>=*r&{deG`vwL8K%43Bm#01A$zXS`I%kVcEs3=TECRqa)kx8}S4%AWoWNqi#W~}{C z4Yk+MGI5-;z{GDVfy?ptvL_M!h!fgw?HhPGJ2ojVuE4KGQc0~F61gS1V4*#OXWO+^ zZS_osqo+K_Z)iLn!Xx}hr@0Q-;NoZ|wo`Wc4hN7+b+h*H1R2e-c)<NI?5x} zV9l{8@)(VjWHqF}&{sX-KD1i&Lv8rbkt*?A^gP3yA z+R4nX3LnV7dG5LuZC2s?2VF1r?C-3h57E#6FAVknow9L;Kky=TC%vNow_w)vtJYRI z(r;QnS$TLxXs7?h%ZjgtzgigoAmyDeS7Kb+oRsY59lI-U_Z74HS=mwwh873E4i#4I z_%^SkG2%ohWS^DK{cY{nvTSIk+Q73yckR1sXIgsDZ~Yh;9bn%RtF3Hl&lX?1=Xx6i zwx%NGq2IQKTBX%4Y``P62wtk^N^rmT{^HMMdPBRdMq-zEp|y4{chALHRNx7GQwXtG z4n+m3ry|jaoIIa;h&{GJJH-n#m78)BE##uKnaa_2QOfn$`LnBunaM7v68WkciDSka z<6Esma#pjUwM0A2nYoi#MhEp>DfLpc#Czs)t%m-!{(PPR8AXbj)+yIxf>U>>c9nmiU&`0SHc$U3)hLz%FyNXFI+1*ue}V5jQpN4)+8}b zyJ^?g&Qprrfi}@$J4H@czgWaQ`Kf#eh&7?Qss(hjJ(f^jtEBeHdC(f5jQ9P 
zi=39;{KNT7nF4bXqSCFzn@P8m_b1!QUpm%kz3FgNsfc@-vjxJ`SZtG#Di)>eR`N7@ z$kw!3w6a?uhnOOxF;aZe-2P|2Fy|fflz)D(nB(uTuk>ChMmNnIX?IMAW07k=)p9Mj z3d#?>SLNZgN>FvRMgEDbDu;?ij7nC5mT@EsiAxx6S*o&ps_J2fYQ)`nulyM<{Hy$Y zLR977?8dxH?xh(>P?b=dE}?+fVq_D4+8fnBY7%c!t<(;z!#dt78j1JNRb3ve3drI7 zP`1?8iZ$W_{Y#H=9A5D?KJpVRVvWzDFE52qQ5-@WtWz&E{c89j3dtn*%^dl^&mF0HV>WaI5MHkRFS zuR+yvH7j(cZ0LYd|HptQqefvGxA$}KVzB3@6yE^+BncUS^#8)Tq)L$Zaz!&3FX{A%fr4LOnpSHyAj!JqlcD*1uGamp5fUdZ#@Nd+r|X`93hkTA)(pODe3rv{JBzW}0>B zpnlpai*{Pb&ZJVA&=BWn3HPJx+GF*HxcTutQgQyfN#h1CUDYRo@b5ty9>blwzG=eS6A-tLw zMy`*%nCbM>-N~!03BH~QUw&%qyYkZcbd7UMuzgy&sF%*C`Y`(+r^oY!?1y|>d2B`n zYOK!DbWtuCCoj@cBRx4Peu5tBsOfy7cjEQj&6zFK!AegY_u*!;koTw&T{gnCJd{oo z#8l31ec;~Q&8~#M`JM6*b3OH!`i8&7R{DSvT6J25J+hW+Ay3(tFq;=hAD*$r2C?d@ z3PGt^)_JUwldb3Kch!+^sVMa|ir^9NkVo)DmEeC-7rM%YU22<}fLxrJN8=B~`zJ|{ zm7U*-N#tP0}iyyqF}5nT9LUFL24 z4R69eT%bfmqqk^?FqD(8kU*XB8ZmqxBe4-hkr^>a=4~j1+xQmA=z>b{;SP==3%u~+ zHFW5R`X3U{8OfcDsj!T_hZ)z~snaYXDG!Y5Zk0chO zF14h>)PQnRS0h@RA>zd)1LtJmHhu_|UmzkEZo)XHl8pT!)$j8&?*%&lJ8 z>&anChSVc9v>KiH=*^K~YL{ zQ5ri$GyceRF;k|i6ZR_YUBuhSHdeOej>+5XE?T@jF8HU}$6O#gTC)<*d1iSYM-A|J z-3L4s(-*1w)(&$SzjNHc3~sI#Qavup-x^-6sADVwbVbDSaC`!cE26b1Bc9O$F#z*b zs=5@JN%!gfXek#JeLgl03TunrQQxI~6Zu5*lO60N+n8AcjR7<Q?A3m3LPA&?GD@Zx>^Q{lGUQlZs#S z{9f2A$A+*h$yRjkTuJHUMQyFWoGp7>4MHPjd2X%`(^fb~sVuCct{l(I-zzugg_m=P z>Ef^|hbjoOZ*zO@r46GPyg*lcLOxkQ3AuoF&|&%jC*t4}LfoK>{4H-1W1VSKgP+hz zk%bnhG!v;>4pBniAqsJWPzUh} ztE?>6RZN%1s0}?=2UW*lRqcAzH=%7bkWRXns0N|!?lJ!BzV6X2!!CJ|xX0Y>zGip9 zQ}jtY?Q3sWPfJJ&J!`5p)+1iIocANHNBEVB*%;&1w?yZTs;0_#R(p0MN2qG%G&$o% z&v&nqeouSuAM2SV=I95VC5;{WdwUmel0PQSw+hK^S{Cx~hKNKXr(=)0YHi{hrm2>w z19THRxdsnVIb<(+mPezS_87gHRRkC0X@b>G{?YnE_L0N%mg16#S1+hDt>($_su40y zR*p?7pRI!b`dVYA^R^b!$r-Up&p?Ec+DO02bWiw z)j0Z{TA-$D9}lKz&7mUd(qv>-E8*w%>IhO{D<4dr$bRI)1ug+M9Mp)F6sV{}j* z0OaD6c!6s?4doDv#s~taiMMcL67s=7739QKT*dj%=L9oihZ#yx1_R3Qzj7ic{LrDp z^V!+TIK)#3g%D4t=oGy`d6ef>G=-+2J$=CUm_|oLIWoCEX5u?hQ_G@#GTu4XJ0=%; zUA0DeA+?8gOo2?)lxB*SS9wCL2n_lRs>#yGZH~ 
z->pwq?R{pOP;{`C)zs=|AK*FoA@VQR1NUX8!`SMKnq`mYi|pnTqK9x;_t*>F*BZ5rACsROha(4w)!Y+ulP+&vtzM=SE7$NCGJ=)P(cX! z&N0@#3Trer7FrX{;lVRuzq@w1uO-L$=UZRkqV|P84B@nw{y~gz=ngK<+ezBKP)oCz z)IWtKjJ2}8yz(lKdJ^C5!|%aCpFXCHlk=>q?9uvA1icoWX##F>6)wvU={Tz4g^EEw zUKy&v_mwL(hiO|wcntb-L)TF4fJhFFu=klwLYGvOY9x#DX#OJfK^B%t_G)FQ8{CNI zqaFQ?cvVdf<5IF=V2WHW>jitr*S6oBuCB?x<~i9^F7cnRuUa|fGH%AFrB_u~$ZdKH2A!^Y?@drIbd9Fu= z@jcd(;0Nph@E*x7?1b3j1|a@LjWEl20#$!aS%4zavW>`C-5Al zQ5vxTfGn_K<25Rw66)auPW8H{anq0JWeN_!r|bo@-DO8Vwy! zv736q5V=uPj1ikZ`sYWv!T4)~7-2L|ekS*lU+F5NcrLd~zuaJ~*Yb!;_FegpJSop$F-=lk>2JzvIzkQYLflu|>7EjL zYLyh19HX?4bXH9hT__9Q!9iItkuD=QYRL-L@2KWD=PZZg`Uj&&aHzT4>aJ(QVp`?Q z<0RJu;nsfBlasocD{Qahw9(rbV$4P(Ey5@apSmjA87IZx8f0;+s#+pL{4sTNO1;q6 z+)CS}&2#*rw=ilsOyiW1Vh8Ps)|8me(d#njNGX!IHhzO+qE<&U)lCs6M$#&BinHPm zk{A*GDEv)irw=F7Hl?}k_xcWFn%z_7w+D#vR9G0=IBchf+A~oIf74@ni2L?Ou1x7N zg|ebGUuVFZ(9IYb(I~NwkrGuV;-vj^a?|JqA7?**?7i%L8Qs}8-g7)tg_N&kDn|1p z+@PjvmJ)WHbkk!ltxh8b`?dOXma;>`MJgbo5TL@co~ogQn2S?fQEwvRX}vtpClI4d zwMNxF-*IzX(DW|he^53qt&U<7I;d3~$Mbm-TwD+1{4s;kJ_jWi39*PnxQV5z=jPMs-YSh(OjB~ zO~{2@C{1<5cc_B5WZ@l(lOI8`n*O2xi?+9nvg+!(gy-7(oOACj2MTw0C@e^Dm*8#* zgg~(11PSiJgG&Mg3l=1}1Pj640tA8=Rg_oVsw?O0z52)V^!L3zdi3ai#^_%E_8$B9 z9CPip=A7TjS7H_|7gI!WdTu-s1LYknhjrXMo2PBLNu@XJzqMQZ`c9;dG_$qs|XI)Bc?tIB>?08**B~`R* zsy*1nCN&q&1H({O%)%?0D{BT=@6=b-dM$KId#b*vUZ_*Lr4A1s(y6+Pvs6FUmz-o> zPT!Ufgx|d9R6rKBqF+QynW;u`ClmVv89rd)2 zv8~g2HvJ?g=z}6e7F3lffz04czJZHk2ieA8l(d3YIcHq@iTEmJQTIA?o2V_mm0Cx0 z25sb3;-*}$OV9xr;yisSE+b29qSF*=23(Uuvl&@3AyD4l;WWWeF_~u2IV=`c<$LiM zwbXc{zO_OPrKz%OTphg?$8?Dxf}aOYIm=ZjzoSV==iKx=b>_hcK|2f;55;^Z7i#eW zw&i;9Eq_8)=#B#z)G0Mp{^}MPz1(G@#vnWx?)xr!fmb9n2q{RcE7@<%3!Z!aNc<>s zc&Bk!J{`Qsr*#LaMt{(0XP*v3<6vK1AJav=SWLTBUe3>lXm9a_dx{5>*@NVF2-rTT~wXc4ODY}&+YI%?FcSXD^*83yPl+V z@V2U`x~h(1C!G$|r9+7D&kQ{BE%aoByLM+T4aVA!?fcF!XSAB3 zo9n$4GBn>EnL{0N7O~`;YBIm$F-{aU)FX91o`WWEA)0ThW;jR{csP1+Y5hI+@fm#> zKI*~q;BW_a;Rw9k10CR|TF8rNZiAPI!9nCjG(W)uyrVAYifG;rgDOx3?EM!5;^+Jv zVU$K`s6yLmJ9?le^`cSig2Ss2hA48&rJ@S%iQj02=xcmtY!)}HJ=S$=Qk^ebY-_xK 
z?SK`l*X=7*VjKjO)r#Xy*U*hNIv`K%KOXjv&BfkrB?Q%lSf-NkiI*=@!Vkt?ac*YMZ+&?ebRd7IJ4-!e0s zJwnw8w$`QitV z@xEQYy;(tfjpMiPByY{Sm{~h8P`%b7a3T0x;1HUUDK@BkR9kMd>bTc=O454y))?$* zCO%qubw1TM@OjoL=ZLqZ`03j_hm;C$rgl=mB&UVmz@4dz_-M2=3dq0YGsA7{ zGTX@yRyTK4?vb8sVeX_x$>rikISp|F2hFxdqOpaZ8vBgpb_4oJHrL&(L1F73$h5BB zRB^z+&(|Perc_VLv_{Kf;+Q#)-vPz|yd)Bp@Pyi`vs{^ekX301rchV-379(N~jYo7x;~P z^8SV#4pK>R2u1XLam<{`GxQ=f1k@h@6eO&c?u89;zGYX}SgPrrb#qol+gim71ri zD8<{<7*ycOx+DBJtd;`0n!3u1X(9?Bns4d>c!kP(HQYR$mtilqsboAsB4?lxpGS9G z!A)L(yC{HdFz7XkqA$*IZoEV@xbYCv;YA*JX&ol#TmRr z912h-eTKF4I|`7AYp5=Qq9~0Jd&Ld$*mE>|uxI#)@ssmS$~M^?`uphBR*LzLtI7WZ zut@~~ME(2ZhaWd}NqwL1cx(B`2KK%CAvu`Z^Ub#(`X%I#eU?x@L1r}$911S<{%GfS z!kiQlEB7KGQ>=eDh1*jtq@oC!stV`E)8MzxL==8l{mH}!2OYOr)IB`76LYX!mE{jS z0Q>X?XB0+ZFdw$x>aD7z(N!i`?Og3#Bi%=g-R=UOeb!H6wHU^2?5nD+|5dQ8cR|L1 z!2Zl7SWgA5Uf$mJO8d6cQpM{#j@!QB+;x7WxpIQBLxyD4EfDYW0iei^+E=q}6SZ#b|Ud#S- zNNhwdDOEvxW%{D5Y2GK)k4n;L9wrmy7_KF+7-jVvncKyIB=Oif_T))&p|q3sid28X z0e;C>bRBw(mQ+!cw3-tV*Hs?P1v@^lipGPT{dKQlz7U zXN~xhU(#!x!Z$>J9AKB3SCqqGv68E)R@OlD!E;$2O|hM~zztA!N}`$K90f#knM>Ev z-SlD3fdVv6?ln%R8gx<)(D%_)j20i|6{AtGoZszB$vnj_+MQX~m&2c*@7Se-DZ%fY zBF;WE6x{%G9d)E3;TIgo9xE4!#>R82^$kAwP&^JCO3fo*T3>_&gSpHT;r$V3X-|@R zVjT3Ww?7jbLKbG7q#foV^;8aUf2Mz<*H$HapFYM{IKSG-{b&l@!oy85QtbfH7O#*> z;nWI-I)hs9VFtU9m)mo9I3kNGqKMY2Afgb)Iq;Bw6j``JXQ&zSaDom&Pm#>?#m{`s z{)+$B>AZ<2id@DxoTiR)EW-IXD!@={^iY(+eEu4T#bt~~4Iw?zRMG!fAV<4Z_|8|MKeq8c9J1`L!yby|(Gyd6bw zgR0XO#9$JJ0Mr8uP#ay)0k2RSJpmY08x9WPDH7=;y{4hEyQnJH7*pI{bI|yTvu@9< zH;KRYkDQGZ{6l5`14!}0i#Z5G7)AlW|H9ShzgH9v;vnBaOSFU=h4CH#X5UvWRZ082 z^SesoMcU!^#yjJ^{9rzG#d^{L4SW}a)$QVH1)mOnsXkRpC`6~Ki-D)qNWM}^P36I^ z+;WHUEf3%X_1v{Z74*j$NoFlR9lY!8GrNi_;*?w`dYO|vw^UCj$xg%HXezeoES^p| zc&r+Tjs7ATYrOM9wuC1~6-hhfo1eZT@I5x!!%PzY$nN~kd}$U4T_r~u`?LD{p9iAV zpD02e9;veFC-M*d2iLG0X3g`JOb<142Fu%jsfqS5^B&^mHalpiaOIS0W-r%W_h3BX zC9=F~rp6o7J;eo%GaAYJ;)C;?GcZp7f+N_CYKZ142qV@+{XB z9Hv3AxFL3!=ZWw=R^b32Hh!n`)Sp}H**a5C!&UB1;p89>1#nZ}7Pomi?y*7@bp+$| 
z4)u>aSI9e03oe+|+`m}&vzA&T1Iv&=?Tw*goM_?f)gx3}`xd>Ru^DAdm&hT6^M@*t zcR}RD=+&5-@Hl0V6+p|N$F;x?OM0QJ22SeoT%HeLh@`j89f+1LwLl0f zL~gXIhBVgaWF5JaLd8s@C-3GC)P&RIr=l@Bz@Tq9hF%!Oxf&wy9)6LmD^q=5=Uf#L zBB%I`Z9b$Xp^aK>FEd^^-v)nCCskSJ8jsf_oJ}IuZj+VHF;LC&MD!_VdUbGSKFHOE6d;Ve|ZY~6tJAco8HJ1U666pig_ zGGA4jbSE6-2H1xT+~wRzr8IUS6^F4I(YVXI5X~nLhd8Q$GH`PoUIFwI%|KWDg@d?+ zvKR+9uJA;F3Q`x8zGmeA|3critL_)}6 z6fe4`PD~#2aY=G(Z%ooz^R{y$vy}>XMn`5Pp3BGxO!m(Sj#1&JY0h*Vr`O^WD#)SE zkE*0NC*R63TnAUxRy9+1QtLx!Te-}i@Vo18*Go~)XlhLHcTn-_p?`@AAPI@Om_C+S zGWdzVZsrubunO4ubtPIYd+6+(+t=Ke+dGlZh<|8^IgUC~nl;~iVcrwk446tBk}MCH zqvcxTrtYRzE00`d^poGYHqiGXS7;9Fy4lsW*E;3;G<>e7ZFm>Edmzad_Uv*tOx1l^07FJedYyfML%N*TC$5bu;j{gOIVWCb-7L2@&+vt zTX{bgNLG3AL|o5&5x5rIlTjXrJG{zh$al7gSx6C60vP8VHJeQ8gz z52^)vs&1uE=@xpU+RANcfwuHkRhyb(8S((kEZL{;uz|X?7=Iv+&#{eE?x9C=ceeQ~ z-r)h>{a4}CzxA8{GUWN!^3A`TF#a`q%0LD(Xa~xok(3WSH^mZu&85%}jdUmKM5|B? zE#bjAWT&U-O26SKw$nkx@^jvV`WTOL+yrM3OMB@c+MqFK!wJ!uE}=iBqb~~M6ivfJ zT1V3mfu?j9Zn{s;(S|P23z|kB=rtB#D1N|W)SwRtp`|n)b?E@zr+wlDbwFFOjE(?u zu*?uf{+kt!lpeSH`j44Ac7^V6o$_q?kD=#Zl^w+}()ZHN7fhC)pbPtGl*s+JjiUR@LkbXt$q(>a&FSmsf0zNEh1A^ygqh`4S%O3TdJ6OgHcGwnb% z<>N2JreO78oIPLnb0XA3PI3;ZVehYf%=i%h=3-)8{39u?)~?<%uMs1sit~CvVHY29{*?VUNlm%)$_D**~+bhpYB-m3%3Cb5ZI^p{h1oiw<^v zx+eqjn%T!ZBQ}X{vIqz1Pt>FB)QaW?&x#Y&%vTs&MRi?J+UB*uetT}PpYJyRt{;mS zdVwC+W^}-1dV(l4SKIgyMsPhz#xG7e+9ASp5vPwTjRs({zj!WsJ|cym8$) z8UEb-%Y7ncqIb6UtT;yv=%ZQM=pb4d{|HM~qYTOrPjE`)GL<=6hoNiyp9xKpt|Hm^ z()|!GaFZ)ygz-@J7Y0p|@uCCGrz+;(vZUOtCR0xO)ZZ4>`E)RyFYsw)VIhC549vuR zoTDLBnnuwix`qit&~mO!19%{3VldxAFK&S>PNv7QTClZI!MdC^mMUV2d02!|2R#%! zWpBHR%E32WW5i`K%4m$%l+E!u?bJQzvL2+T$q(|WtcSL|1pDEluhEpcVv4wmMRbz7 zhCyI_+FkIXOY>*HLPT=pvqp{o*2(rLibO zYw1026N5xM*~_1*<3jq|GjQ1Ox<5v37p1H&krm}p_t6|fsgWy3UbE@HfxOI=Bers5uEqCdGvQXaZRv&g_$Izv6gX?lPXC@z+Xk#vG? z(Ji`5DU?D%48t(WpaN8sz9!R?qk*V{nYuNf5x;UY)zYVRJkQ~IFlZIe;Jyqra!?!gi{_w;{#8`JJjG)c!vr|L z1G-=cmB3AaD$;qR@;+ok85#xPE@h*4G)~+X`>7SJpiGLVd#)2!sh^teO4#{r&%7-f zv}XYB|4r-kU+V+_{Etk>X`svMrNP<3`oT#dmb<_E0_R2pw83}CgGtm4PcaS-9K8E? 
z;B$322;*gZg;zSu^nJ|tB0u5!YhOS)6mQnlJHADpG8sVYXIdw@IDtmu9xCSzRi_sp`g zR>*^}KK52+8ryucgCXiVUvh4$3c+Mw&CJ_84ck%NUal{zr6SgFypf0Le<(@3(aGu?r6XpV;!1OJ*&C--fU=WmoJUf zkPV(d=&C|%A~)w~@o?OmS4Fr>@pJL zYg{Wfa7+It&Wb90#h5K3xUk-<&*?AG9r?Mu4(jWw6_>_A z{D`BNAqr6%n$iV10xjt%wWFV?l@@wGw!=+HVl|>^J^thedKt~+NGzlEh>^u*Z=NSg zp&yTO=I}COEFagE#9mB92j@tzwf(oJCmt$O4AF0#80(mLCf1m>DJSO0iBu9DzKo%?;q=;}!u654xC7$4+HKBx1m9qgkns71F# zvbc?VYP@}pim+5mFavp1yqat8^&3tZwGc(*vDv)KWgIc5@`N?u+)k zO@FKI1%+cNPfT>&R*xdlYlUtOBK^&ohQLtP7|&f2%%OiVHLA z`j|@B=sjLx0>7j~{0^zK0P8T5CZi0QQp|K0!wYc^m*`8XsHgC^c#18^gJ<}Grr;70 z1dYK8IzpRp5MNRPZlDNFK_nHT&!I7g``|hRN@6k|q8pt;6qP|4nCONQNXHqRf=w4` z6S`6wDY)qcZ9y1jqAj9i4Be$NbV3Xjzfe7DL*+$%kx4m4A=*NrIY2rDz2^96$ z%~)xF7d+vwL{?}GQf zb)1Te<#>f%{0J3M6^ZH%-_T_^H{BwSZlbEFP;;V)5<~fGncsMGRpm7M&;#+Magbah2n$W=5!#F8BGKK#Bv-v6XFe{yzozOa z<$0eGmZhrLcIta$e?%?+3Npaw;i?A;|gUTU_U#u?(fTkvLBQ{6jP74(ebkzefZ%qYTRE7(GV! z(vx{F`WZ_xnaip__$jyIzqyTg#YMR{)l;ijv5hu#hz96I5GaFTv>Pk%4gSE33M@N9VGui7&$Tz&3MdDBzf8VQG9O9 zaQ!HD%U9N);<2&JDkeThH*rO#7#FO%1{(6;NW@W|f?fCl!^leq5zPlVlHc+}T1Jbg zAwI!v+{Gb!4lkb5Q7VFc{3EAQSuvlwP*HBn%h86K0qo||@FI%a!hjbksDT(%z&13) z&**~+h@uMo8IRG!@ali*6e{r8FYb>mVC-OhO;J7wk*N9CT>8vf~a z>+~f~ah`2-Fx%Cfka&RE-}Ajaj@$&(n2-g`C2A zmfkBOT@BEPKj1FzP!Xyt=ZIo>vn(%K%lzVZnTU;|6`i&^gT<)0dg;eg zo9JKk7^kit%S-v4*g-FOC7Mz$+G?lkR5elc)j8}~6y>>G$MDFZW>FEOS5#5V6BA`& zKBmHTIe(0AqYuHZdb6J9zo|TGy{hBHtEJih;J-1z)yeuC-^<2I1S=c&j8EOCFxOox zVwPu$<0MR|_6=LHTT2{D>-Vnf^Ho+vp%V87x{DP!m{QFA$$bfnNs0cr%LjEbFIRhX zKExoI=TmRG&XSM7K?!W*6kQ&YyHOQ&mMfzGrP4;~kKSm6bhR6+xxCzm71%82sh{)) zJ}*|u&3cG9fHT<7-Dn4YLX(V7Wi?ez?-Z5QbUoVsUK-|NJCYypb@hr*<9lNOuT%Z? 
z1p0vLI#C>ygX9gK$A5|DdJ=yjzr}S^C_}>?zgmu6m`dM@F7md#Cr*-W929NDJ3dSm>2o|Z z-AkS@1a###s+U@-`^d6tiSDdB>P0%1>q#TzoO74I&=`Y$Ed}619o8>J$LfLz$>+fQ*oXb=wc{~OL~vvPlRS`XSldh|uaW(Y=DP6|yCW_H~lrYd+4}!yEV55j|<0!mniGwtSLg2R0u4t!gz;&52|phcji>-?HH&Y;Ky`}YMDD?tP?|&WoC{NVlz^A7APxm+1}5QG z)W%)vLsR$(U8L1CP6&}qCwT^^a&CE@3Q-}Ni3*rQkIAG6cU+DQ5r;p!+xS51>!<(& zwf~QR$ARXpnr}(JN;(a+f%Xi8r{#Em480~FFO2Nn|2is z5a#Y}y)|0}zp#fnj_-1ykN-*VkxJs(slziHWp%b^Iu)F&p=UyM$oW#!irPh%{GIV; zeq7g(qONx4rTAm%*}dyBO6igEn7P1Q=((v5$tHm&dX^Dm#$dEFU)Sb3eAua{XX@Sh zM}3l?qLQA(ajFf4>(e}jrnu+09e>MU&>m%cVm7hP%J%pSd)(n>dCKWZaaXVo^GWk( ztDZm1XB#p2S{Dd5;y?9m-3DEFC$zrFRd};u>B-I#%Bxzd;hu2wH!ITh!OUqaM|1QS zy{uTVgUYI+&Hz=TXh~0!yWEGqwx!ZkM?F_f3^a1i*zK$=Vj?(PPM6kIb!pz~?Hf$7 zudA}OFtbq9$gpftc2sehVAb}o4*2c!fV6{_Z; z6;uSzD1bTGh-GS&a-89E6#R6WzE>CQVX`jv(Rliry5b2>riW;vzq1A!QTn!ABe%*L z%4=V72FYJsXPgWt**Rn65H)0Z1uWq^#x`9a73B?@i@L^Z={=mo8A{#Jk6U9LFE);v1B`5Ds=S861#GLizSrT)vWr}&0(SywX1%oM zSy%MJ)WKPCD#@EKa1TcC7pE4z4^C12+~UZ6fFY;--v`WyGkch(alVDptf;+2lycSJu#jUx!TEuSt zfMIkG^>rn7tH-)HH`0wbQw>y|c`CN(3|{37&~N0Q7>o&YOTE``j1qJdB?JumnR?(H zenAuuqn2o<2Oye<^KvBeD13(&{E)WO5rAvq55#db6yX!-jBj`!VyHF@u8c)I2h*u8 z3h+TZ!7E(lSJ;L{NX1?Z<8e5Hl@y1%ybo@iN1W&na5oI$ny5u4zTxYQTN4(AtD!_~&M(Ut$<^EcM}qtD)aESGdr zybs+dkAXk9 z(>Wv7njKvm(Ox}ueiWx=i1>uQ5%1*$Sxsc8x!Es-mJfTFvp~d%Yy(Azl`mwOx-4>8 zAt)n`8LJJqC}oVpdK3|pR9n?dRaFi3Z4R}5lV@D>jXcy$UUoYaE|%s=OJGl47-anyevL5sQqEV{_Mjj`--Zezg0{19zN{t=O1ow^L+@U**T(Ki4n%Q7WK1t zRu$_6OV-FY$g{+~BfOF*3sc=B001BWNkly7q~|Wm_fn{VO!|3sgV>ltoLKA7^wic^rSKb+jI*^|zD@C6R~51E@`haD+>v z3oc6GB!JU}<*ZMwcuLbb9d$lvrreVLc5xk_;)J<*ly z(#Ao#o$sKf-5ri$(2!sVuN2uc_vwo=GAkBu<+99<>Y32#OPnxNa1L;axNSG)-nf*R zSHIyl{;xQLzY^1k&;hSJT|=@w`_&j#)M;y8GRo)>`jcj+L?xHXNk}hRR-lqb9%?QclgV!} zz;)KFU@dh2ZakG&TybW+WcMQTwY(Wt-sLiPgwC?Qv!;1UTGw2zkfBz0cmK$*Lv)xg zYIDfEkTucKq5DE*^vuw=p`YcB3>zLEpLbPg)3CVc^{!>^Be^DddW1#h9^-x$YUe8C z$rtt{I?BB`qGV)echRssk$Q zN*~}`a5>krp08`aTK?wctEduP+~f4+mQhXiG&Q1TXS<$*?d__9^W6AamNyR53(6+m z1gG0gojfU%eQSbeF;i%;nGks;GL;Eds9tFYJ+-uYF^) 
zX6fdW`|c^vUkVD5_1P#ZOevC@rtb zx8|VG>DB;Mk4t|%`F!HTJa&cXmig!TE;`Qxdn_Sq%EI~!PT@Q922Bzd!#kMwtYBCk zPq=%z(L>ZTif4Od&9+|3JHEr-gEnq>z6y z9t7*y+jMq4NY(Rw6?o+HncL}wcosGjeNZ7^)u?wlTO}Nm%fxR9Z|xL2!(Eh$(hvHD zN^^|RiN;YYdt@DYC23pE<@xSg*q0UU}E_ zP4o9P;%9oJvZ`i<1nWCj?To;VVDmtuKxVMAKS~wVmadd>ML(7KU;P?YyJXSV6nU)V z_^e|zRkp!9wZuLQN9M<8_=`&6DyoS7+#9KA$oT;b!5qBBVJv`K??)F@p$j-cGsH*i z5_QpR^k7*tzCA@R9YD7}#1p)tve(q3dM zTliVtr_Q1$TjFcz#~Gva8|f2WIaF>DZ_KAgY5AR)Mg8SOHCp!JrS>-0w|a#WI=f2KmKL^+`TU(;VsI#j;;!gI<1iIrdVPi2{#c&@l*&qI1g{67)0|`G^A8Yz$zR>KJ3FK)WRK9hl6G)g6g;n z17S!sJ~hVDJf6=f+#a>KG+I&}l!Zss6az(UiTEnpD&BAMN$pDwHUKbC3vL)__;0w0 zK@s}z`SJhSH8_DIG(*?Wt(+J>&a2cn=1;~{S-rr_=vq0Nbk3J!VUD;gs<^GhI*ZK$ z)}he*AzM84IFvKEkcv?ov{c_XUj#22Y1SL-FR@!Tk$J-Uo6lt9w=!X5N|E?G=@D6d zLQaMJ5}E@K^>qDX`q|7(U%&J*>FMc@RV8Lj&$c5pG}Q6DkSsf7Z3?#Y6-*eGxxo7- z>@&CJey)b$93K&--MPgE>ykdlNxsfO5m=*NsobimyiR9eiOJ%zNa1E2&&5#PY3no< z*^MxBUvQvZ*AB&V`K7VOC?F#<)f?#PF;~o8{f73`)c^l1>B|D@dzqO zjvmErR3Xmov{RGZyIkGGqwM9Z+UCjNJZ`b8WFO}sRC4M#jK^`^U`5e(~ za(hIJ1AGS#XW>3F@KAeE1AX`lJirT7=C5=Zrl=x%9^Hjsr}G4OX(vCx4e@c@;o zSBPS|xV_r=g;wiBzSG$3$|F7v{lfFJb?a%TgzM_QaVqVhUYd2EYl#t*DMwPQGcgcO z@n~cp;{<%6M{xv~)5Uo$Rpo8YG%>_Hl<_Q3!b(p*nVFv-yv-)Ai?ceH>}?$rTg85H zT}(DRlEfokt9lw=@L%d1V*bJ~frPEkaQkG=vM;z@wJ+7L`gsO#zQeyBy$zrQ?(U@E{gPW6r?@)ipJ^@dJoR%ZQA0mc&%=) z>q^P4o)nIj^egwE=F|uiaS&~&8-Gg+Xde=JF7}Y5a5Usc*v(@w9Gj^rSKx36{Dv?D zdASb5J9g<-Jecmlf}quygjlMKWvEDZa13q~#W}dCDRrVa(M4Po-_a_(!zpTy2viqu zsFd&uVN4gP)BN+VFL+Vy`*K?2 z??$k*Q(4#Kq6mse5oPS~*w%V;cE}h{3C~rik6vwSm1Ji>??27gsJ(+i^ z``YAhP?DF3PU3Z7k{Y6euEWLyx!7DPlVozdE4gf1yV$|0KWAKvxa4`^judswu~q`o zc_Oa(``JB$mF_-_55zZk-y(f{dK00H(pK8nubvKVYu zQOn5l_A~D~|Cq=w-a#pkf)$NtuF|{|Zh6f!#?><<*1X7z^d{?|{8bjrxRpLCrEFIH z%)h-;0@WO^Q;2>QVX{Kxdv__%SD}mi&65($(=v??xn76}E}mwu;-C|^VIhI;+h&h z+KY^9_P!6JKUOee)l?DfT4ufuJE$E*xh~jKF~Re5VpMul=jJo>?bG+A|1SR_K5km{ z;%p6~8dtkqE_<sU^~TfDd~dD~nQ1nb?X8sXyP1)R zBXjjie)cBGGl6IE1Uu8SDm)PQjzY~O9KcN(ihzpbc=-7dzJd><#7^Yo(x`(FT+&DI z6uI=baEWix8s|hTsv@T<%*PQF_l1KnzNOFNqpU4Dpc(3OUR_w!p=!=#J<&O3v6w98 
z>SfqNh42b>sHa-0Qq)u2#ytK-GIzFn;v4URlmqHVYiH;`5s%S{GsNDsCRtxn&#ZC$ zi}AsDV#Fav#L^@_EuV@Qu~sexFqmqKPP9m_H;UOVo=DYT$RFiN-HwlQdwRx;>;|~2 zqxCU9?ZgC!BcF4|Ur9TG+5Rv-g>Va)v`clAh((u*c?2i{zxD0TS6`3A7ykF z#p33qzvO|jG2Xsry|;gQuZH~dM)78yr zG)wAuxz4awiy$`fr3#Hi3Q?jKdnu#cWe7#y*}t(_^6``qQ6I@x~kn4A_d+_Oxl zMhx*B*4M+vhHT}Z!X9{5F~ScN87&3ADXW#G^ofSS_J=zv6y#7#t_C-ASV3%GN_(4{GWE|}r}5ct_Y z*6nvqa1VhS(Wr&zSd8}inJ(qb)aCfCxaUfoHL z(HGS)5fn8<>c8RZPpU!vMI_$fZ+T5T6}OzL>PK#B*R-41rJaB(rLzgQ+-9`1b5nQ9 z;kjnDG1nGpoGU9TPsq)b;c4Hz@0;=!s$;FYbC9A&-N_vcmvs#$C=BWc$IFA zLs+Mp2k)s5>U+5o(Nu=!i-4ZvYV0cF-WrGrR<#pCKH?s2uEA3k{dD!Te-C5${pYM!l< zP`bVMoAl;shh9{FRX@Sbcqda_$?i#>ZrMUZPPixe8if60&kml=w%)h;x8F24m-P4LA=$Iopj8Q`Qdv?5{EBLv0fS%Z*|>{Kd>P9y zKriDNoX9RD(vts1#$A3_bw*vopX=ItpL5~~NgzO=3GVI=#f!U^ws?`E#k~}7(c+XA zw?cv9?(Po3LLlxrviH8O^?tbT@x0F%&-??{$KQuJ#+vvEnY4)+Hb3BXC_|wXO8t;W zgm$7kt;7uRNX)0d#CU2*{i(a7qcMF)^8?!tZ141-+2JsY2qKZJzu+7mNNu!*MlYkC|3obZUdK1a zVk3>_h(?0{m!U*F!Fy}56>lAMhkL4fyV)Z-9`DVD>K4znduzq@fAvpVxL(+8dbW9O zTlXz9n~4Ne<|er{J?}hS^B%e%`>Mp>OD&$>^Yzm7qM7~Pzt5h*M z)Lfj?j;KQF92Z6emDENDuX6PY==L%FOG@HeeQjFrgkpYK1?$FiQX4T+Wy>m7eQy(0 zNsW_!+=_OHw~nG(VYt$fw%azsL_fS>@-# zyw_9N{75hDI-DWf+lk5{_o!ohUpB=J^iyBM%PEdU>Ytnffzf!99#UdVzM@*KYL$vl zD_l5afYYOUzZ^`im2)Kiaawy%U%6hT(=oT(Y$jWKJmzn9KhJAxpj{=;&$?~?kvQD( zF}PmB?5O6YtLHq{y{;G@FXQa*5X;w9QM@C<9R%?tI|^TLgIl2>ytD|m_8k3?!%Oi3 zk!rL02wmhCwE)pb+lP2Zu<;jk(NY{kD%Me5c+E%l1ynQ_`VqsRp5YK(M5gk` zerBGGOrxw`UyqUFWK;Pc=crn;jh;zG@Pa1b0=Cj#dP;XiZ!DuQ`>N{C`+Tv|&+aGM z(?iS=uedf%Cxai;ubPIZ)I_8MTwd1XK5{Jez)5PMlv-o`q&dZ2?HgGa-RLFy(h#f` z*JunK=B`{^B#FJ0MBOo))?>OoTl!>4*;L+=K|)X+TBEI_WRZ;8+(>QFA5v9rtdBr6 zkLRz{N>xn>br4l#OK!|Z#D8KG&6jt%fcjNXKhA)SC~k|}_=FL>8f9<;iO6M(s!%B2 zP(Loi%tuiMp-4q8m&RHQ;z`&}od7JM*>nI8_&uU<1+%aU`?xr+;5IMiUvPxVP#J_m z=XLx5eYq$0Ba>F+0WasJI8Q;iL(OX)XmX@soA%L7y0wh|AA0^r39&>AF)pZQ${}z5 zFB~PXna`;F+<^n!!+lACpD3dxexlrzntf#&`>XAGfHaOs*Yn^?^%x8W} zH^X)mDDEid-IQ5JZPV9@bn9pA7p1tmHqVHZhjBwxGGEF5D#^-F`MADWN`2&cIghPy zyH08oN6UyO>6da!=bg;y>2Z3GdKUPS%)_~3+?6~To>o5c`8;3yE}F;ON6chvwY!$J 
z(due#vbS23d^^pOh1;26(4?}c1%PKtHje&VTTH{svSQT#jk)B?v_ z=Xq1Gja&5tACYXZVfp-gNTpQ`_uz)sB@q99FaJ#ON~4dj+iEX2!X|HLeFn!H~iM zNBsT^>F6rqx2I?=Im8=Psn&Y*3j-|Gf!jjugGVVMXua{Xr%CEeHcPz z4}CUl3>J?7HJWN7SB>SL@S11w5`5tl)IdBwp$uI1bUPIVtx)j_pb zb+P`I6YLt8fEwx}CUJc}ulk}6cH4uonqv99_EGHR5L${&svLjC9x+QC))r`Dz7a=x zD9)?-C4liD${CuLVwVGk!vJ~&8xEhSh(&) z$xTH&6zc{+{GYbRg$y|CO7gDtyErT|wNNg|!+5C-mj&&EW~_C=93?J@@5O#Rz;fAM zOcS3qjaq3Pob_-;T-5gF#pTw|I~=+`uyw#XLR)loYjiAMK_3W+hdW zAG=#x+hxz-zx|RNThjA;2g|pwdM3BeN_l$o!?7>np~(UF{a0D_IEbI+%=g5Z^K*81 z{LFjaMphO3Pw#STs=3Nt*;mQ>FDu2PnRk4f?N6$xvDXpmtZZr0C)qa43zAP!lt(z0I6k{d>Aln)`=0%^vDuaFJC$1|bEdrG|0HMt&8E`g8>&aE za8#CJgYRlb^zX=xJ5*M@Qk!WT#mZ1|MSF;rT4l#?j+M#n-t1QnwUp+&>zNZQjb9@Z z$vl|v(iC%xRYX3P|Eaw^$5YX)Cnx$2+A~#$%%YwzzWS+;GT!AR6zCXK-Ekr}Dr;35 zC6QguQ6i_6Y%7}hexW(yny;-Ir~04+mDk3rcp9iRR39l)>m?uaLhfz(@KTQ8HdK|e zkS*S#Ik#7BID|^7k@6*{sg=CXersezc2BDy8WhU^a%_sSE4+yKY^R*eduX~nyGnG5 z+7uXD;&Fb{nVa-wqRx#Ai9tCghhQ9~uGnz9H#rcse5qI^lkxw0L|=T&=^54*UXxX=4Jf&RoFDn`5#?~$L|a1mwmCe=~B zP@QldCjH98=^Z7~Ha^IQxeZJDgs;$)8lWh`Wr&(6pK^69mX-NimZ}E*L6g;JddEMh zL)rr}sV@`Ga~o@;J;ToAE!e1v;j(I@9>{_6wtS-MsHxUI0oqSV90gW0xh9Ho0i7+G z&J$>f=uHu*%7+k&8B~Jr&`V0Aztq=!8KHC;)scv+(5V8o<1;A3pEy4PsU2bggkURm z;wAioA0QkdsE8;IK@<}4CsMH+Td@yU5Qxt3A{DvZk^e!oI7^WP^2;Cr7=#`<$koC$_RyMcraEp6XBv2Z?SW&E}rt%aa1t+8!_5@@rNFxhv9cU z)L54Hv$a%t!^ZpVahRdm0e}1P$HwW?b8hWkey#SSVntVmZV#AL?nU&Af)5@p|MVoO zV?kfg+<=~3lsjrEu7!FNM>WR^u~M_!~{QZJ$k>#8p7SH94#daBAO|Q=9xTm`0-fhm);*GMN zH^x4ZU-Y3c@g8p|+HZ{Oz0--M!Vnj|70k}opHz?@BJuOh%qls9%1jEH!M|t=5{!G# zAJz+)RHC*kSu(i^|Ks{kYsLqRp7@QH=}YK>HU^6vFa3YyM|zt5#kXG0v8JhN zzQ5dsxPqq9W|OO0hyVZ}07*naRDC?9IU2i~V}tXwxWl{bqk3PvffA~xs2Q4@F_Owg zbV~m%^RB5Lm(TbnH9GPaqfN%>QkNn>=_4zomh2xoP}`+u^Qz>l8T;&8?-Nr}^v^ln z%>3$YKqIrIl@hikZAN_UB4d)re!P`4J0LswYQp58|02%iMmxR)JeQX#+V}%Ct;#$D zp&}oEOrdZFUfbg9@KQTIg+wDk8-ub|ANyy-%N^EN2oq)bZ!WG?a81M~|JlI>`H)rJ zYHh!lRqbwAi}7f{K~nNk*#~vGh#HSIc!^-HM{()(yp!Y*M=P_ znEkgVs4Q()tJD(p3Sr!bN9d8%P3y^lY{@zNoLnfOd@6-)YDOo-b;>WIM4p{z*Rt2q 
zLhZe%&o&OJl8B;Ze3Xl;KKP5Gv6kAZNtn+cuoIt9O#G;o=AL$xTqG0Z40%fVYmF(L zX5kp$!+SH`%CnnMkT^!u^k1}%;vk73u3mHxi)bn$C{4^jG)=}DilDM$IGSTQ_k)YS z;%m4-c^HLMe!_bZgYx_uE}p<60bIv;c=?Mu%_DgomcvVf_$TBdm4jj7A+k^tjp4+3 zltDK%6?*;dxbvyJ%=cfe$p3@Zt?0x4ZKZE=5)Jb^24)MeAu?3}OFf zp@;ON($C6Cnql@avt6Gu_9g5vs^;%eI7IHT`|&I_!us8AVr-`%{G$CLy3h-eO@nb) zxzPmCqMY{H)hoLC$8wL#xyoc~@wbK0vJpsG^gU*agJQ4tSS%AgsH8qyv=pDkVsTIV zNpGMpG=gLjUzU5UdFDrJn>Ey~Vs&*bHL|qZG|@9Hy<9Fl^?euPIyeUwt@Lbu!0Yl~ zY_D%Lx8fZBTPghsg<-ktJAJZvWsJpct}Nf89e=|ete@;pqL|8-7pbnQ#J}sk`Mh>n z3&$BT#Ia1cwby!8>4y`lp0B^<Kk9Z4|!>~{Akj9_}&f}2kgCOpS)rjVJgd&yS@oqr1RX1UC4z|;3 zWy%ES6Xz~%ysAp~v?d(F#V~+&+U?|BM+>Y$FTDn@h`im82phJg#B%42kpu9?)!hi5lomM-f3S zs1Z%Zd*t8=9iWS_X*C983TMcQxQ}j#LL?vLM!3$Id;x*{1CQnj*v~a^26MPFqUjg( zrGEI1hw%;Q$e}7c27?hOu8CXZtLtsCyrI0%>&U>fcU5QH7T0_`?d?`q<06&BB!{Wb zFiJ_O_NXnO}9+Q+?YGo^C@$VyMuRKP^W&` z<43EHe8G)nAIoKZoByu=Lszu+oW@a0BTPR6uk(;|w__??`2W`q|0hH70=cLQ9htc= zte;ivUjMrr;v!$B#b=~gp+_C*qEf^DMQ??Ssq$U%uM52jj0r9mGW7k(FS}BXkDtD+ z)tLu{62ec1R`GA?8evp?JNk3wgp=-T=1j9(%Jl5F**(mB%3rmWNBK3ER@JGqK1-6w z*0yVhjfaj7YKh+3*rUJKMxYKo$VY5+7MrucOIhb-QHy(Z;hwAZH=eSj#uJ1FTo8lR@IUd zsVq&wbe@end|KTSedsT3gV^G97W|MhFJX~uLPE{Q+Z+y~hxGAl>9f7;#7|sBCUJ@? zre@&|#~VrdI!8^%B4fDT4H1Z;1a`8;@xC?YBhNM>)kvkg>fpU9AgB0ZbEo-MnER~a zSqGGI%!+#z99F*1=Z}6(V@`X=n^S#1h-@(zOSQ^07-#(aw8A2rmm-z&co$dZq53|) z&(9q_)JMFalWHZe6$8aM?JFZl2GPIzDlSgTwAcI^(e$@B)Ay8rwgc@d;!pjIv$*Ol z-1>V;5F7RS+E?uK#ONnwztqGi-E}AXTBUyro^)<2+9-Ti&{LXW4?^I(;>p>*`5%H( z4%)jjAGn8_<*e^yH@i;m5{s=7d0X^c$HuJX!Q&$0Gj&H_KcCeguy*iXdDpyc9s~R? 
zxTYh}9h??N1vB1d^ySBy?>*;H!tpF%8gjLD`Ul*nU+563iY`QqVAPP6&m9cUnAm#$d;R5Or#){*PA!3xcC>%5nN7W^Yls9dcGlVx$8&Q)B zs2<|7aa5~pHB-T=8aHMqb|X{;@B~LNkx0UH`?#ISL$R7CsYm>YhicpD0zEX^(Qq0^ zhoRF#p351m$yn6ko=8>uI09$T7qf5`^U2TAQ2eVuup6jys;xSVGrRyg&*la67PoPf zCaAZnoOs25!lg#YEvl`yn9{^(?usfrzz(uAtxj;EHvSYRsixQ<9??wNOyqWWXd-nI za}Y#QB%=mhq%3NO=2*dA`cW*Svm#ksqF#tb1e&O;GLn1Y0q?^u3`Yg@<~}ML)6{E@ zM*M|83v#%P;f@`)i~r<^gkfn3*^Sa(=3Vjp z5b()A*;OU{bdU_F!2eJOZDnp7&r0`Cu0YP@f1WZ<6n-N z1`Q8h5u6VzXr);7|2%g5Z}?ki_~uE)cd1{76}%A~>`INu4cik`1Sds3J=Ay6v(0TL zZAx37cJ)WzT=e9B>9stTFD~+INLSY#`++QIzsc(D33p%e6;z|uGkcVJ%d;qog2Xt! zz$rYpAiu465*p`F$WTb9+!S64euXXq{t zP&4YT_E0x!BmUO!=&fZDyf>0U`?-$#AI=@-`^#O_EaiOe$_(6-KKJuz=CG^+$HY^6 zf}AVHF=HOxql&P}j}}oE>oV8p%i51(gLb^Y@3{|hZU18LZ#>SSH05^xp;b}!ayy9+ z)D+8AHaAe8aa4>VKN>Awi9+-pm*%eY5beYtR(X55{T|;@7HdRmr*c@YJ+Hkr@}>K4 zabC~u<*t?2#!ght3WvsPY4|;?eyi_|! z7sdDDnRq4y*T+2Zt$o5?uN}u6F;fc{EhtXxvkviP*Fjn;>Ny7By(mV*R=yRZ5~HS?tV zbqVMxPnu)x)pVp)OU#^FXvTy@M@QJ(PwRT?{h*8u++|vqU3Ksg+ zKxK8#dTt)HdddyjTnfT*^h2nYgR*4sbybTS<0?Pqk=P}xDz~;4`*4~qyukxKn$~a# z&E+LDLzt9Cqhx1($M?A{w?-m0E3HNuda7>8{^}lY<@-{?Mj{7LH-yqf@iPTcT^%mG zM;pvRXV^H3f2lRnKxmB%NTrdy13*(1C+G4e+E2R?O2cUxT~PVB3jV`ZW@M>&{=f}z zhwJb{+Jg!-2sZ6QcaBj3zK1Okb|6s^g)iAqAta&2#4i(rT5oc ziLu%wkw?*@h3{MYiB($-p@P((QdEAcr4hcEY4|~SI&$LE*I85@T{Yvx`;n)f4iaiH+IVi`G%{h^}Gg) z>6U1NO!}ABTl=|_s>tu{HCDRFrbXI&9^!lxFhou=gVS1)%|YMb;(L$^-Lf^w02n|Lh9I3_QuY-<3bMKgB?YqSEb)&rc|{Q78R^vBu)c4D9S zil?z@E7eQ=sSTvgXd(VUX*#N!sndMW8|W+?&`S^DrT*=;QCeA}C(Xb!TEtJ~zU-p> zm5((k!S7i#GK)pyDB1#j&Zc&FBI zfLutCMhTw1j_-#Zlpb2|L$d(#C2+f|Avnv zGTTRv)2n9IDX}TvDpj(wh&|-^T60pDp!wb^zVYJJi~ULO-7}MmXFK_hXQtg0&3)tO z4}E>$X}vRETN9j1C|mY7^Lb|ZYI@S#O?-#BFPhq6&c^-~eHV+}qv!hx4ah* zH-18S{=~JEspYT-r8z(3f69ry7W*t5lGt^Cs$pWys{Onc1Q*v=nFHLUr0IR?#CYhfOa< z8MR;a}Mlboa6Vx@Q`R?0AC%C>ne&1;^X+AL=Yqoz@s`>5tz3(@Kq z-*G$CTFw?C5UpRJiU?IjcruUi^^#3_r;Jwr@lNcf`SixS+f^v|NJ{0LsqT}3Z$b;& zA2FS-=_8Pf8{8yr(ewK+em`~X>e(m%2LIt#)A6!++mb&NDJ<$~XGN_V_bP8J`!Z*4 
zR$R^tce3xP6@Tg4(`_G)p5Gc*FR`S|h0b5H-+OgVaYW>;FqisWa}IQN%sZ&ctJ!=O zT}85;sTx=*`WZTatwelV)xvyK7m8l1!>XAKlA*FbkHmB9gmye=Sl9y3xa_qVvqf2F zb!U#T8f(~t3$QsyMysEd!5U*C$$Z-#nS$4E_IPgi>#o%Lk zb+aPXJij$T-T93CPENv5@*|I(qdxLMbq3opoflIq9@>eHK7K3gq1;^WBx}hu4&<3w z6X4NXiv@C)))1%ju8NKL4iZaLvT_@3X$U2IZ`#|eYuJwMIHh7_rcA>PHC6@FS+Snx zm_2-H_8skK5k^(nkEa<^cp;u^{ZN4pXc-}0?b7bJ;!jd;+;152ho1X(Ut(g!(8>eS zx=~?Q0&Z9y>oIHWROPv=IDX)W$fvCG>t-QBaHLp zNs?$z2OM>^-}N`n6nrm+v%ko%x%9DqZ(JAPu!reh+?ShGA8eFY<+s{MyRPTDxMkK> zoy-+z?04%^O?!H9p|>3~7doEM^pvrgsbP7Tk=>~9UBAe@c|}|y;i`JMng!#H6<(Kh zPgq|nWNj8{2|s4-!NR;e&lz1eyIMW$H=e((vNrtAsCpPG^5akSE4g`seLcI8x3m}P z6*3a<=M{-dLg%>kaD(<5XrXmtO*n`J3g}i90=Bk}YL~Htw z*YQ{AYbRoc(GWlKZ(=7((h0=iv1%!I%9Tv0k6g7#63(H$IH3-x9qLCsqx|Ry@Kbsu z6Y*R$u|_^lQFD=`=BchKmA}T<`~n?$v8IW&sG#nt?)Kk*I`Oycw5Q7yIkoJXLQ&yq z*k#u=bFImqPTBR`diI|gXa8Dp)V>ybziHgfl;@S>ViO7dl$fOmd5mZu>)RX+=KVpJb5mRV0b;kfLQm<{yFtJ7sl*7F->K|E^+6sT|w7yN8(JoUH>aK+cglGjEO?+AQV&5ubt2RPR z*B#~+^M#6cX8EzYq*ijcT7=zv6@$cf>S;gV&FoVSOc&#Xzh2HU%vhEBSL$rJTIt+I z-cZ@9ioKoNYX`7bb+eXetteck>0`uns;S&;tADr+mg1-kQZG~?YrWmUz9)~XbUDb* zr%uQikt#N7N8R)De0k49^9LrohN&uQ7fqr@+DylfdK>McF;D-jlTp*v4!!M#{(Ac8 z58+sZY&pgk9PlNwkGw?Hwe~2<1NkSUtJA1~R3=VC6z`BdQA#aSXSfmd;%TxPhjT@X zRHC|OJH;m|YxPqjWq>`NKgfVa z=JVJ|K~xOeO^9vwXY05OvTc5ucftyAbx2;M4i-4@Hrw-WSlvfEQbr@uZ7^A8ku;f48Hx>gxUMv76+;V_2!9~B?9vQA6SC&uc(`p+?5h+VEI zJ<6D^tx&~fkaLUNrIOVs?@jM}thLX`<-7s2RC6-K0$xTTJPXI+R)bY}v{fJ3z%{j* zTjDuO)dAV4E@M%E5AtL9Foq@}TwUW+NL2~yEp!0Cp@!_Qe&c4MlPHI0tf4HQ;d(rd zJE0Xm;}-v|N@9>2kA9fXAJl7P@>SK8qj?=tv4`APu6p7X(s?wGMlf$hIa($!^EFvr z4dxI&jdRqUUUD16BAL(98Z_e!4&o1Ltoo#8Vg~{+ovU##erdl{w%W-bRaLcIqgQ@m zbU>8Qp3)IjV?c8)b%LD_N=#%oUrQS!d`AJc38sl~#@F+9%v z&)vyb30=k~r<$QGrC-9pnl^f&El^f-K zx!%6!_%YzRvxu4kmjFI;JTIW%(Sx##3tCfck-k!-is*ik~ zJl+QKK9aQw`VWSmy-3^=zi3;yAot@ON})Yg7Vn_f@(sN7l8%Wx#t3Iz`knMrsD1Wiy%>3qed84fj_D9>cmEBApk#powH3TMU^p5<*IH9qI zOq7*=b_FTK5UqfI57(sAbF*pB>gCIMeLFpu-L-OR=T`Bp6Bmq`{=*DH$t=Tbb&U>LCrDz(;rmmz5LM3bwejysf%&dkzp! 
z#E-aUJk~6A4Z|>-dTQ78K&_zIXS5PtY~(5Y+MeROEBje<6ueFC21biKe~u3B^kuj# z5|RG0Y2LEnac@s$b#^Sz>hCM5-Ul}^TG-EuMMTu}wk&Zk-y-=i-}Auo_{YDqV=+DU zowZ)_F6H9k+FEr$dU&M1gx1p@VzAVb!$<>vOC@vZdn!^}srNnkU42 z1HVy4`V|TE3-^SfVq{h5*vNHJM0S^tk;?OM0!3AKwGLj@jL##To2gIf~TB(5wM$-!Km$K#D2 z_)|ob{t5k+1FdjYIcb&pSN_T)4KMd9KXXrxDbPq zoQ_ajmLs_fZ?bK>iX3gf?hC^W;tRyW|gLGd!MneLAnwE=+vBVf?8t!8Ns^)cL0Y zU4N|^<6e{1=X3Rnan!(lL43^7o!K{uSGWppEo1HRD4#!xF-_ig2SXWjQlzA&RF6u2A_R1IZ@gw0RN zNpt5-OGwT;Ba1q#Irh5F=!S9JPa2PXqpcdQ)!Ad-PtQA)?*INlMvF+JL}F@|t6f-K zD+fQ>`;ls&wVNZ5BDg$ktv?E*gnEQW$d%Xl8^BIO6o3F#RON?*r}*~JNuF)p=cSZj zPT^w|s-@$5(M7AykJMXp=8YN3UT({z!tuu^4H+)MSqxLHk91 z>Z&a;W~w$aN%b&0d0kxDzTsPLACfOPCwr}EuZ{42uwR6&|2WX@<7cWFo)*dxe2#N+ z%7vEJzju#}dg<@*T`X6nV1jwU@e1d0DeF7aPhXhQ*zGV>`i8t=Vq!*$?aaG*sa$ybHD{0vAHnDVcy&%tL!*s7G)Ebdt^?8at>U z68UT14lm~55?Z4SHAfUT}oq}b1+ z>kg0F6nyw-n}sk|}Zvp$)tmtPR_IAdV74wV{} zJaoISFDWeh<;BeHo-ZlabAHNeVAYm)Whqe{Re;RLxo5mC<0(HDLoim(5nHI2w~i_m z);<49$9k{s|D#j0n_6Fdr<`x0t1j9XL~(6t@IaY_SiDCQY!i*NjlOTmPjky4dMJ+D zf$B3FCkJ~HJ&9T+v4_9V_dH3)Qd=!RTV=1s1aa6e$^TYppgzUUH1|sK?eL|!M`^1Z zEdvVMQMS$R0?Pa6Gm?0hGHDrqEtiRN6h~=P6)y3}aZ3Bzp#oQY*zZ@oR(odws-(S#WQUh&da&VL_>t~1w26}`XWla zqPu$jOW`lVeAXe^OZ{jsuxD7)TZkMXaz&x*i%^G7qu9FzY=HT^jLu*v9ioQ30VH0EK~xmah(+>{s-w=1`NzMF^Sn99z*;vHl)an04T>2I}O6+^N&O|UHAD{A#cAvH~jDGqFySD1D)>~^;Iog^R>)q_ToO{Ic z#j53-ZEi8|JAULZ>PFamzbr?C$G>5^>&}}+$^T?d$f_oz5M=#HpRq-bwwp^SZm7kq zYjx~HxaiyJ)#cx5{r!J)eY$nz&gTbJT|ZQ4Qva}hhj)qd98Y<8hzxOr-g0M-Lnn1i zjg=3Kzw{WLeS_3%T8oB`AB{bE37KQ;g!}_T-v(Uw{G%sprkrYx_k6|Q%9D09EkJK_ z&;^=;f^-lM(8(n!A2pG6(T%?M{E4<|x9c|DpbvRfRWo_RSypQ#uG^XXMEz*C;&e_4 zoX<|#ODin8Xsy%`^@F|GNJBX!dAi`J=xDUFpIZhu79n&Z@VPci1e-n7uQ=uyOnayc z#nJ^zcU9*}Sy$cp{3^SesUJNXRVdA(q8vpn_>3doz9POA>F#1GRR7ian&a(7`Ynmf zeRht$T%~KPg}->n|MFa{Fyi$Q#s|4n^-%HfXs5MkY6=^#opD-Uho3m-D`-`BPY-=q zeBh;^50&z6hSvDp?(?eB?;`Ix`;?v@T{)~+R?XaBYb;gUkdl2cNLZnWP;T4*iIoYp)R|9T$)#RRZTY zuZeJRU*zHq>ZlHCmMW%Cq_QH0wu`}Hsy;wJfF~4(>SXa&E-f;&$FzgGTW_rqvKgh> zqjK-LKIz_o!sb4+u6kwl@a(md&Uxb>e(h19zhHUCJGEkE2-T9lLYL&%Tzv 
zKhzMFZbOdZ->neqcLZAhdg=pt4|8tgP1@%8R6M~F&)>98E%9xkg02ewLseh>y>S5W z+ZFI5VjQId4vGWLZT_`X38Z2eLMcz1jZ}FD8 zMWyu2_U6<~ENZQxU5+Mh|IFz4!=8@()EsMWm+O6h*oWk4)so*}B{x(13E(8dH;_epQ-HQZ@>&M--KyWB64n>Q*OQ2YBceg+w z#O>wT?#$eOPP)7VyWt9D*(O=~SVo$g{>jHS=eswU)Mt_%93A zN*L*$s_yw_Gz8OI@h=b4@9`mOfJ~$}+BrUzcVS8Fw&(>*a0B}l@A8WJ3*O3d(f3|D z@t)o}XH3!tpUe6npt6hE*i+!+DR`aws|HF84;#6$Mtm3Wd zw3g4gmEyA8boI(^NuRBq9DB{d`b}F$@W|2X0swr5=8#J}peYDgN{c|lE35<*XCAf= zY%moTz(YDDeuru_7n;IZoJNTvg0{=Qg`z96v^t><$tKj3?SakWt2%1#p*r{kY^oGI zQUk;;>PQ>K2`HrgP{;8RMtCFb+`N@;8L{2pJ$?3i)-ZON~7$u&2niyp;K6{K&%9@z+%G<_~mL6et z4=hc|~Xtz(c$tznUEC0zH7HxV*(jm(!&Agf!VgZiXIejvD= zQu;#*)JUuODGFHDyDz`t&a5k++Cbr)Cf{B{Sy#angos@Hc@~nqr&DJ`Ti1)9HXP=< zl-wEi?RZxLuciu7>*A@HI;d>lAb`Fq6&$R};lj6(pjJ-^6U0 z=pAAllJm`@#um|7?C~Aae(D=7c3X(0jP*}D`i-^EaA!KtIll;}r?cysInT>|EmRPU z6n*FeI|MDD*l+jxCG;B<7UgdXXyDx{hN#=-LK&y}+kASwHbzyaOtFL)XKOK5+?RRu z6#iv?>>_rAeNcz_u_E}J7-ANsc|V?~-N`6haC-Ql&>`Nsz7gUfROdVOtOu*U2K?yh ze(s6&IMrrY4C7t-eRkh4*J8tEelfV3K#y}5dD6{uu9BjY9G<>8Z-}#ALU2Y(*3i5g zqP40muXnjTKtHMX{PzY6+ZV2A&ttmvdl z-to4Zp{Wq({hNBTZ}gTrVgh&RhWUb)+ml5-QObC(_mZzPupGm_;)w2IrJ)EP2aPdL zjHX{zO=yZ4&{NG8z2thViWE~#u(FbheqbwOJ#jat*m10Zt*445(x_f9X%j%Sq*_tYpyfUi+#O3z){in}Yg$>K?MsY$nzd&;utZKqjYzA8)4~t?Gc*+Jb1{(aT z?}9uyK^vi_>IXL=9eT4JwzX+$LTBdTiPgmrI%*mn&OA~LafLPX4C$smtk-D z2{xQ3Xeae&`a5kPe}#wi9P0!&3h(JPF~f3E8_FA5BOIG-E7zs2?6J8?XuiPS!Dn-= zz7^uG{+yM-77z8YTBP~~Qu*<3yzLU9}`Knk5~v`SYSC{aEYJ>_5g3mb_K^LiS>_gVF( z6KoJ)z;|mg)|*fW_vUM4oo3zZI^&CXt%s+u36^32#)D1vQi5hdDnG6rf|qm}cVS1S zVJwSf7kM{J6tAaOWBJW^Iods42#{uJeZAhmQO{R3yN4*Ew+b4hhN-!ff)gPbm#K-+ z7RMMD)e&{YG}UnamL=0)<|EZflrcY;FU=gEOC=DO|6>Dr0Iw~(8h&OUHXUR5eXN6( zk*nLX3}vw%)ZE?Om16ClFW$(9jkvB>VK-h!orc-^3}1hgfqk6KwOP1v1b;yeD9$A;HL3vv;$G}AW zl;xqCr#G>hu$~%WGYC?G(!hZ}d;wk=q-sDxr~oS=7y3~K{wtUGNMe>s`3LEl1+E<=~sD5SrAsNQ2w(6Wnz`M8rj_}#6H7R4@~bbZ>iVz<9?}PVRc6b~_+vbMfhi^8@cMnQhr-qjW zxAj@*9MCM;VN1bZpBYvRidbreZi5glE3h?W^4)>UAXyVZjd-&aZJ$?eYFTMX)yrx( z^(cJ}U#WFunbr?lvgHr#tR-oq`AmHvcYwii;Xc2~@3D(I>bh3OO1ApeKyA5YxAxug 
zw>@4TYK^zF)KA9jXxy-Mnc@4p{}||SoiygjTEj!G>5^@iOZ(joFMHA_x2d&}vv3EcvaYt#aHUcQ4+*%>B5 zXLU=2P?XV=#b`H!QfOMne7=!wuy=7+3r&x_;#nhSNvqQkp=Ki)1`Ft*FGCcT?|lEG z{}D<98lzps!A+3rKlH&ccP-}}!(B8z-{^qznfsj%XI$VC{|f$lj9IXfW%JhfH+!Vk z(|H=JPEZpl?VBQB2#@cwce$q%o`ZK-#eaY;(J#NxqAWC9TSY_B>F?x!X+5ctxx|>L zcB{#9io5_FK@&d8z{b*n6Evo3nE%Ko%HVyN!94s9vxs##kAJXE)7+LzY^}GPr;?Fm zG&W*rlP%qUm=Y?6KG-IRna<&Is;zENB)x9#Em` zg`7sOR5KB5?Q5H>pM}Cyo=%HN=34VGTg+Z^pDLv8t5Nc_+N)Nnl4`F?wcoaE$5W|W zDaSwDJiz9oRoQqnYmT*H20o!Yk*YG)bEerN%wC!$JSqK~suJG%%f!q)Im>y=_#^K~ zf$IT@;;EyzuK1i{bISQJ+`gZ%0OBBDr-3y zuSffz<`r36$4Ogjy+=T8|4`a#cLg*ctKZ4sH-MHJp%#eJtAwlppY=&l5WLd22lj&? z%k0oV&@8(`wn4OIS5QCR>O=cumD_2hv|o88eS#&Q{#d)LRlpo}jyu_1OT1>$>TB6N zR*zr;=fW(0K-D_!*8BtjFxE*lP={@JrvSpBDevE)E;U9v094Oixm@o<}ovl@`Dw&^9@|^ zIO}OWn{5g1;{VXv&3@dTXp0Z_idXJZVS7UogPPJHRfl{(JLOb%-AQVmJ;wDSuc=XA zjC+%kygc*Umr+@koQ19wtd?&?lB`DKXs}s~uhnv;N^6sP%5ycNd-_=0;WSNbO7?wn{)Mxq> z@kg+qtx8g@+y%K$BgO~c53Xz87dOQ$QBit?R}~V$VhBuvD)5#~gPH81@3rTrYm6n( zaml|QjZ_^~swn8GXoUK4ew+!hjje;{Y6+ExUGkP1t40|6R3G?D{(wDDRh6Xu@LW`f zbgZZTWVNugHlJDe2S;nYtbSZ?;wkH$U<4b@eew3~SeecludG(bD)=9^QRDUN@{y;v zEMxoEKbcO7-O!(`MC2CNur;1Fx{--o6sJCL1Ij}&wFghjXGT+Fzx$s1h}uXgG=tw@ z^H{8SYlfN^)kP|=`al(yz*lgWu_w=(W3`EJTdpP=(`jQh&w+K?2##7XmgLdA7k%Iz z*${ZHKsFO=#R!_A%4=T&YUiwBl_Kx_=;2NBf12DaJJ1McgUnZXCkl-WdLl+fJqZ4Y z(stUin_Ibw6ZER$y9a&uICt(ye+_lath_ZMSwCu906*9RejaqX3wz)-{DHf`M(=SJ zIPeg@gK)S`B_V=lgBQL-Lr4ZIy;DiRp$t`p&Uls;gc5XG<-#3S2v^{BbyHM;Oo~=R z)iO0yT~~$Rp6Vzksb(rujG)EpA@u+az|51y%`tMIYNr-+k9JG{B99u=+-_(9eQ6im zQFX;JoB|)|0ept%st;77x$3)It!9b#@CzMMHTe|Y7A#N>M^IHcUF}yh@j4=1qSknT zia;ltsw~tVhGBWU3uWOUk77$)_-uD~7MnVQjhdadfHB$%j@WD2Fr;YJk& zzBWcF*~GX4Q+zY!5?%A1l0$4u={?0enhA{!3YaJBYg7DgT1r@*miJWO_FV4^PQL`p zCT*#IOUq(x;8)3SH1zgcU~337Z9(=}@LA^D=Ydxr=D5#W26vcJWiDT+*VBseqWU`QR?>s?+ScyY5Vga5&DdM)@36xGosyqQ00Y8@IRCLO;2cBx)n7nebEHGb)94s%Hr`1!k%4qluwNxW5$@0=` z{ZioVOrwzVv@_lslh0kKg7_f3VycM8aeOeH6}4!QcZaATn`y~ViY?Gv<0l@=_uyt$ z!P=E?5#4N^VU;gU9}t*7Y*y|GeZJ+obE-K{v^Aq?BlUIp#BCX&y24x-3PaR4wLy$h 
z*{ZsQupTRg>tt8_;Oyz`C=W%J3;AK&sTSi5h|6l?^0@DZ^a*?&v{Jqi%~hUgK=Wm3 zegy~fl}0rcMp*jjD@!@SaAC)v~&9 zyTj#fk!4P#tJ0+=sn@a|6=I|LRCyS8z({R3AICi*+pT;3X2^Ia<`q$^j1?viI6*7K z8CDqg($}04`d3GU=%X%*T-}uWeY;IlyJWBCv(rFm2|d&>*bXBI;R@ZROYjW>;Wm_k zZ$>H48($}LhkF3y7{|Jprg6gn_(r4D85qLy&>=d=vm#hd5UoWoTT}fX9{R&4XW42P zt<^-8ZZ5(duo7C~Z+x?Qjn8qEI?wX)Ra|FjtQJbps4pv`+@caRpqW~>=Kwntd?Ni3 zERGCFs_NVCH#a@b>G74e*26^4&tH0kUXynUPYQCeD%O4446PRXDPE|~Lgwzys=_|| z<~p;~9u}kDqmf!qJrdqQGju{#XpJ)<3EWf`6f}WyPz1{0SxCiuv;eB1kBx(5NQV4S zjJi+(AbKScXUPz=6_z81T1r;x3&W@hC8%yRTE(g=>L8^OSNYW}>Mv{4GRQM0(_o4t zyQ(V3^Hsb!Z>Ku>YUH(5b#XsTmgV4^?20Y%5lqJrI4_;}9|hnH)eH+w2fF|OAOJ~3 zK~#RBK57m;RZrRPIGTD2{|4I?^qT%ojWJB3t@W;QQno7oW zxK9oEB>aE}F%m){liy^Djb+_&CXV4B@jDyOm$NFoBBWy!9>nqRH_V1?$QFyusp6tf zLa0*aG76EKWku5=0x2H$$>G{;c1LCC6JP`kwmVfF*3NG$6wqG=bkd$#?m9}d+t#Ok zfzZl2+Ln*EPUyI@x~^T(nlV3Z3vb85*-YM@{RZnG2D)%ld#H`Z2(}WJu(#S=wiWN; zC)h=6Ar78!j>jRLk7au(Qme?;V1p`i%HOE7v}KEivP%bePm_r(OW+=XaplWmq1H+|)$Cbw!a+Frhm9p$n7D!-zg*4L?E z$iUUU+?=t_7w5;_n*Vyb@1D9q^F1{w3a>o~`B6LVQ}Wi_LAeEJJ?@33&ix`m*7h|} z!&EjsfIpNCwy7lfL)?RY{4ehp?Xe}5ZFQSre1>Lquu>IJtJ9i|%SEA&gSjGRzf zeKNafnm@MV*Lj%bPiusIwQo$yZai=O5HKy{z9T14&paqL zi19`>Rft-NRi>|(RSZ4T(50k18-F!wM?&Vxn?Pblb|uY4IF9j?$~Ov=%yAT zkGi(xPWSvndwH7Lfy7ptQ>Z2#^_8F)l`Ds;WwIzIEW*Oo3mghP*}vLc9w+~!C@jOj z;WJzwc1E9Mt?%2CT{?TD%2lx<-6(3to2>$lv4X}aSC$H9zqnr7UIcDoQOfk@YMJt| zFEX!$_NTq1SSK7Z3y6Hs2kOuSyhRVm#R_3Z^k_Eih1%_F%TADkoaPVnfw>b#!7_5N zr7Rfhn{A9e-l0?n>SGP9rdsR0n>QIdIqLX+Ar~8`qR@(q)JL#TSuu(#sA{wv`@v0h z7zEhVBG#2H5M}ro_7D35y5lqCGC-YywUW#K8UGW{@rCO>O9&23{|MXiJ^oq9{nYj? 
zyOHsydEK9BF~;^HT|$r&!>a|%!TGjD(2YG-6TAmJJM=rAg}xrfuc|Y&gO7YSPKS4R zi?xRU*@3K(t|qBqD2(gz8N8>xbQaR+4fO>Fts#VDdP9@Ii&vp8nPRbEkj}UB+K`M6 zSO>+Rt@4LF3Xx8Vrd>3NE<-g)fRFG5CYejsd0~^6X{Br_8{kV`h63oCEJdb`6;HQM}wCt5SlxiRI}Tby7XlDH#b3K~WYv3-dv# zDWZ|MNx$PPIKs-{I+&(@qxYhWdZf<7eVE3k!eb1;0hlALYNw2(+ticNRCk;R`;Np%^LF7Ejw(idX3j+fou$SVOM<0`{O4T!YA=hEI}K?FJmKB8h#-M zN8X0t;thClz8K2k2)F|I*-gm98n6>$@I33tj#gPy?nSLC)siZ%Y|vNCG6zT(RH6fP zg5K~u+~0RcTvG*Pj`FB-W>u9-tyEQtgfnUf%)welL)l6_^gS-hzkWn`892~IzssHuXFYC|Trq0uzRo?r)JL;)g zCKrMmns_te9Bbk2EJvs!e6PGE%X=F@9`DNMSf1JnYr|enxzo!pofomc!_HCBMT5*^ zFp*N!clw*knI6$io_4R1Tpm{|U=OWF57h$*Kjet@yQV&}`xZlQU_MFv0`J9XdLw`4 zpHle0jG2ZI??t9MA~&*{^tT!aBOzHW;umlc7NA&lhEC%zSQf7u*{Y^e@QDWEK~Fnh zmUKEkh_SB9j`)D`{w_~%vzI)bqnm?$74?$(DoY@*XxXDT!cx}LR!JMF0CiH8V1eq( z;`ATvcRnV4MbatdaC{Hzu6n^@mcY(pRoJ6GLKD%{qQO$=tbL;nkSnM10DTz@WG(q< zIAfMKy8C~iao8y=-ZIECEGN?GaDJez$|6T7FSKO?=nt5Qm5tAO3#*;Rt5vG7>!$6S ze}1i|5We*oEl(QR&QX@-j{dYo&ZQ@G36?=IJjh09>!3Y7q*$h_p199@%G1I1lv?3i zIoI~Dt&i<5?^^dl_Yn134p8;@5IzexnH_y&Jj?9a0k=YOG77pvs4bMiwcwRq*a>z* zX23nFE_Oj4wG%_>GCffrp)NhaKFnYfX&OF)yI7BZMP2WwALJErIjkmxCTf6eg^TE= zIwl=v5xL%73jXo`GtH^Gr>yg`O-^mmA?d&0?w89vohoQde4Xxp|x(bRKigdZa zJS#ez1w505ANaFw>Krv?*W_sRL3LEya6KG@!_ZvWR05o(Eb0zTsGbahYP5siVGmep zI%T-nsl7rMRAyyZIEF)WoCJ%Z1*B4OejFQM76z~`I3Ih%4HZYz>4f-`;%KuZ91CS3 zRSc7j%rvloK~HeJ8mB_2ry3*Q-~;{)UqMZl2^Oje@6|v)5zE3im`RP*9xREqXg>3! zFr~3wsV9MR+z^gvAx(p`v{F#CXT={ zw8uI}A7g1F;#jgOsro=68f;xj&GeG^4}fgKEv%kuV~#d+aarxDH8-^XSF};zD9Cr; zxFH(hc*ui3Y_@)%veYVdm+r`YG=uJ|>U0B^fImjFPu?^$TOCs+Ad>21Pb`2T<{f%N z3yta2mJNOM^<&7-S{2;Eq#w&XmTS7Nif}bS{hB=>XSKUyMp-l3`~@_=PJfLRb+6?) 
zjh9UYK?S&p?M$~C4JXYksKH0DJ#a>yQXg?XD?@AKNOhi%fFk&pCCD199dMfxDX60~}ceP>^YdHr0(jB9r7%g**(;~lmubt65=9}x#Jq^?2 zaG498m3>vzJ**@5PzyRu2WTh#3l-rwY@(L*g1+dbcnz#7@=IOT#4EfWpC$9F5NJ)q z=mGTnFf=tNFMsv67A7A+*O0*l3#OI!EO1{=x$EibS!3z0`Rfzx8P;KZzaFi}+x&Qv zx+A9hy|rDoOu?J3yD1LsOGu%ph`eaN$v)J5N__Nm!mV;IG&cVb2UI1Bk!Q`T(3^h& zf9j1ZcqA>Czbj^s(09;_losiEsg*qCX`DQ%A7?i(n=fX`Fd=)Ed6K;L0@l|sS&qa` zyq%tA4wt)f$JpM6{^ys6EpqPR1>f(+7UxR;u8yJj$bB2#jt#~ZSROE)TKXr$T4`4{ zdTU>)w(z5-C-aSYTJ=z$;i1;wGJ_9LO({~g60OZrz9EL;dmXsIGdy*uwK05;zT(^F zD~|}E!}x*MLYAd>mJ+Wh2h{2JuwGf&_uNZ3Q$Y% zr(_Jl6bghD5Cn}_CC#7rW&r^=&Bx}y-X_wbPthO3Gu4Eyu)pYms>Ehv4qoP;Sws9P za%BVR3Bgbat5QE40~@F}WI-d(7I{hb*Z1{OVh zsNd*U`kShgAAE+-a1Fdr4MHgqKGJeL3yq-!_J#IX3y(rFOyGOjb6BNqhks#YvF62^ z*8ICuP}96Vah^pcR}sENo>(J`ztqn0WKj-AVFHD~Q0k|8t2q>a980pRVn6)>14Vn< zq^4m@u)tA1i7n%&bNhP-h`}%IiIq~XCr+3E)lWf{1dJ>)Mags!Vi;w?q9&yaxonZTf| zD>?6MJ#rp6n*@9HG@6t*G5lL}7x{q~<-HN`77S3;#314C^T=6Ll}4+3R7BfH&sk^D zQ$|o7b`?6IhdOExYy}-}J*AU6@ej6w!RxZ3{nq%`Fn=SR?!ZE|NHxYzxRA}(ZR}sx zPai`oaH3x!){U2U$K> zRaNqM*k5KX-z=K0s)~j}P)GO$yl`6W0v|P@7|5U^lmR;Zqy~^j)!_sbB)d8Sp|pm6 z1qHp}14OI6M34=KAQu?Dp|emE{GlaeVj}7kg^6q?)MC4|!Elv@@?h0L{!o+ENcB{` zr&=_CdQliWgIrlsZc)MV9E_nSDxCV#PSPowNEJ~HR4|okBQPz}Y{U%}#%N5&w{l!?l zxEgCN($e`_OR4N_nXB^tVF&%Qqc5}BX5#1Sw!JFKB(8cn^))Rw>v`eekIPvlB`gdJFM)etVo`LY)j!h^E9N>%@5h1xnt9QJL0 z@$G5%h-F2>OC1RBWDC*jSQ_&2OtEUY)3f99R;1TYE#}?rY?_&xy~n%hTj}uWp)Q*q zQ8I9m-;L0EX0spBk#D>yDM8s@oNo*Bb`t%K5}q&4@Z7&Vk3Bb>p+76w-TC9b7jgs@ zxj`o^3#M!l1d};V&QEfMm z$2sU^O}str(LrTxDOlHePaPC7Vu>#V%3Ig^;?x(>%d-O)L$tLSJH^@tw^f6{>$^$E z*&A)L^yE#^YABZH&vMmAqm8`hTcnbV?&fd4lD4nzZ9f)hJv65MD(dsU9WKanpLfoJ zOO8|QJ5*IWs1)96`R8WWuel1HgCt~;{!URUTOW| zU)FN(3CZzL2{WaL!VVXwO$UT2l_jzYy1a? 
zsfub!L%_%PQ(?Nv%c~d~$u_`nS^(`u1!X6Xs*97U2zF_F*e!u2;c(B7b`O%3dNSTk~vmNtbSf`gE_&v%}o&muhCgkihj+ zUms!l8)~!1Y?T~Hfq})?8t;$5xp|#Gb#U)4y(Ff70>A)nN-6EeV`C0Esh!l-uawBE z!vS)#!s4j1$igZBUm?m6d+!y?kl@ZGRZKg0BOBZmheR zX3X`u+%uhTw5{?V6?SF|4HSE6@*^$3}_^jZM0SEm71iYkHo+aKw zhLKy%ecRJ6xqimUtSHfkcFP9t7h;(l@hN})dbQiVUuzp3)sC|0uwYH(`3jArQN>;e0w*gJf#bI<>ono7{>9!h1F(}K1$+`Lvy@U6Eh&&3>Nm-N3 zNHLtOW)*XiJPO&~aNl2+YQDanPPY8Ma-RD3Xrq*Sy`_QCC1*XGl~+IIhc3-8&UTtc zlYOV;9wVRcmYj)w*m~9>ba(MPX<_CcmVTaxw)B9ahSHA)2Eq#K631~U3d89q_+y9` z4B4t6MFFTkXg`!-r?4UdusyoL8&*x6}XKkJu2?gcA0emZ`kAanbwEEH59*#!}LBYDBTv8d^XZ=nJjc zAsns@B08X|npfFnR!r5k+_!x5vv96nE5<-~v8l1zB6Ftrk*yC<4=<6)q=&%eaz?;1R5f8El`qPR>+y;Q(%hDO8)r z(>~2-ca=1XX@2@#UPPP3BiLX$QXH2BpcmT)0}RC%vS)m$c9z{@wQv!n;4XFuLMe=G z$5ds|R0^$ewR+nQH~DUskF|B~^JJR&MY=ev{t*s!1B!d+imGa}Xi8UMuBrrOA;0Ce z=42Mhq zojpgsUeD^4o6mO)?DUE)w^}XdK5a?akXR&Q4{s5+!yV??pKiDA3{47)^v~d5v`w0( z{A6=WbM`m?;2xRx(>H)_Wp;MHv{%CtELolsQxx-V)8+@S4*HzUlN%bhLq`^{(|h`) z&1O}wHJfH;@;#b|-Dkx`2XhB2M)`9lhzoL+|31~7me~#Zi6vYe#TiRW+X*VepXgh2 z@~7SoEF0mAUX|X!z1YZ8u^_Oo$V3TCs01Kfw>E=OG#yG)5FJu`;3<}-9O^+Qu_$cf z3q3KaJq&lbtaF2!Suf%fcq`(?WXi&8YAPh~-!+@m|at0!u67}pqzMsh` znmtd;wZ!m9E4S9PvczLKC&pNp%%yjj#@#jEc0>?W^a^Eev2onE0#O91f!I4!3-s2et5x-(r~cq zt>6lbRki$P$y7Ni?;CYhb$ltlJQk#%vZdov<%V4DkoCPM{qmU=WuCsu(<|?t7;3qf zJs1lf+#0BA16K{k|!W^@&H(j;@dDkJa8 z6uP4VAW-dr5N!u;)jUp4=bGO6`JdL@-_3jY%go4Dt zz)7~o+~ZxNDvByTo16x*G#TUIEc^n;)OMCtG1ggMV3R2UkJifsiw3}Css$h#erJBn(=urn`nCefv5L)DL{0^tEi! 
zMG+a{{@PStkst9@lr2Qz#5S(WUQfbr>Blk~#v_9^?+- z6)6@HcwN8O_S0&Yudlb1yQ*uM>qqut=a{_kcO8>vr(E^;Wyj>zg%gHFM)1mziY|Ce z?w1OhC7cGLq z@EB7d0Zw5jYC&aH0tj`&JIETXdU+puZkU_RXmv=gg(9`p!%u?$t-Ix`($;-SHoR} zzOeuv2`|ZxZ>TSpgBoTFd0)LZc2j#etzXA;xS5sEPU#OQ1SJbX7oCJ1+CzR#8*TRX zW%>GBR|eWbJBrS}{bsC|U;P66v{%lQ%(8M+ZqIB#@;UYgy5YUX^r4VKZSXQkh-0-N zl={nU@B}N;1c+3AR2@=PUrM9^DgwjcZ&*v0AOPN*YMzy)nM3S0T4+u6SFk_{Ji``15)Hsp@KN-VH{liT!0`|P zukju{!*@7|`q4XBLRpezS=@>fu%diL)5UYF3I|m|S|@}Ns9Hf&s!c^{n3_Nvs2`Mp z@n#`!d$y?dSskoGKjGQg|D}S{}3=^rMlqtnSdEVS1n7n0X7;a%y zjIK7M?YA0_E6izNHHNBMwBKpw%s%qaOo(;=Q@U8a^^o;eD zH0R{bbHDZ$#-4H>ePHe3C7QSyCow-UM`l4EpPetU9}jtF#wTr5e_ES_U6K<5!h)ha zDE=`Ec$0xq8}ouJpbAl4CR6JPbY5P)LBalm(CB8|_uC;k4YT=F$|EKwa^cx+v?Yebf!&*Z@^V zd0~QBkEQwl98M!W=c>8Ra=`yCRgVFGX%Q8%s%LEcT7!4k-aEhGnIp~>Km{&_-*5+YAW)G{(H*>`4fG3&sr)J# zF1%3JkOxUTiht&Rv5^&8@-5?_3{_R+M{dTu@IMshn*5&rLN`v}-pY$Gx_|~^47O4a z@eh9HO_)RVRTZj@7krf_VJDRD+)DyDb)pXq)LRG_T2ln(@>AK$ za&Z&Y-Cta}GR)Ek9S?3AY@bI2=@;2rZv!0%3sikHJ_Q6!nw*PT&{b;<0;Q_Te%o^U=!C?DYCC@ zMFX)_)Ds`f+V1bEjEeAIr~D|WgjH3tS}xLPD3!7IaQtMODlS^>{cC)o)*(7hpY`@( z@!XCwwMqCueyy~&Up%z6i^>xS&zwo$%Pp2jUl+1HFLNEFsp7D>JD1{Ds4(! 
zZFL*fINaXVXccH6$5A!$LEXX%wUB44M{K8Sr%l)Uy!JnQ&`QFOqTH1(p%Ry~PNKMKt#)Grr>d1)oZ5;%RU1q~X`asK(VPpZ{c@X_ zgofx&OUXv3M26UpeX5#rawlqtwrGoz*oU#0iVUtv9q47Juy&q!)eY{>(SWkWll#mE5y18XTTr0iqfiuD2Oxk zS8x+ZGJ_l8lkulM0ma>Cyftv&SI5%~PF`!xL_hqDVjkD=VEDQzsHn#BTPPLF$B@8RIT}eKm@dJte&Jss z)EV^PWIZNFAgE7Ut~jXllkY<{-Jn#o2dxzF5Voi!bnt|ss{YpJ=zYK0;4i0olJ zGH09ZjWy;{ejT4wI8XU}ohG($<9-o~mDDC99eGAm_{X7(x5QCvCJ%M}!$}osrSh!h6>1Ho`qyy!`?b zv)1S6x!LKXOMtr+; zkDXnEesb>8-fE>ikNs|6OZn1lVU7|vXc@YxyVfA~h|$W0C*losG(!XK!hgwZ?UZ)h zb0Qfu0=aSSk4>k}^eY?IcG* z*%PQa6);&|W|Q{PUt%PU;_i4PS7~FM_rrelElvN%7@=2kwpPhp5F_EH8R((eRA=76 z4mDg(L^rFV*+f(~zvPk_ELvf+W#?(?q*yBIYrWMcRZ@+4cmMOeuSd&uD?YZ+63Z(5Dl&Z%u_=5+iC~7APX$v^WzQnNujRSvr z7vZ`s-8K~#ykZU_Fcop!TI>{sRVdnP^Bt#9Sj`qOXhL7aMQ5#`?P>#xh!f#sLd%3~ zE}ZLJlY2S%aL|aLne>yW=4d4nwQsbFwuW?)f6(h2CH=*$+uqH&F=|ZcvWTLXMVF}= z3S$=BNK!ZD3Mx<4t*GEq`VnIDp1BV%@R=ZFxNEm8aAUpzyVQwh|L%b+4!vrX#&gI8)Senc|wX1i5Gj^ms1 zoEAe8W$~BlFNe}KF$QH(2P^T3gQyZ(@=4^!L)xr;&<;~1KJjZhgfh5^DL9M_yi~hX z1=QjuN&sq;CKwGt>QEU_P^maAPu5WS=0G$7E%>hV*& zGJ}ofRD^PrA7{m8>Z!#%iRvTf1d5m?%zajSW4lU`?X0%ugFx3n!$6#IRYvi0&sg(; z^=F`}HC+}k>ZlxE-QiGYHR4*)KMLF^RarhjbM(x<;Ok}dHJ%1)n``N-is5;B!=Tyr z|GYc=TLQ1mSJrjeni8=NO=X-+=14U_^+Ii~&)?8RHuq9r zYk!s8>)!IdxXj1aHu)LXZ3FFIZ);y8Uzgk>zE!?2fgtOsd5J!9Bx<0rDy=>_my7LG z%62?#LgXruCdShH)Ty2Zxf856s+)XiwpR&U$U18|QJ=pNYs4G53u*KfL*)+jx9CnA z91p_FSP>};Ig^d3c& zUmZ1vSqX9r7oh^EBGYBCnk(iZ4U1GkRRc$~&SJ7G!Sl5LX?KuB6R0ltP`98{8=fz_ ztI_h5Raj-4H;s+7$ZF@A1&cq+U$B}Ua6at|{Xwbnvi8<-_SSc=4`gqG7vWa>LcQ}A ziJWfjHLe@GeZ!4O^vY_ix=|_oji1E?wU)Y~PlyU!&ziK1vm}^vyKZR=|R9C-bAL1Ho%PV)v4%!X(uNieha-DOus^(8Y zvx6q8mMTA@@JG(|5d?qUXY;#3^7?6NnC&~4EN zZ@3Gx_=#*Ow=m$RY-&l{k)$r;CU?a~x{4e4ggwaMd0bAtqcyZuPL*TPl*?iiMbm4R zC`Ox5nNIRzt&vddlX2>z^&bw91NG=8&7!x6qKC}rPs30j6Zio3QWpPLfJUj|3E<5@Ig0^a|QO6c(}K1Txe>K8#En zP1#TYKZTdpKxrXOgKN?Z>y3GkkDAeDjC>iW5?CWme~wYcTIC4@$_FC+ZH-6LAp=%8 zrJ8GO`<$&O)*aJjkcw%*1Bn3WhQRnzIRrjtWnK8 zXqA#xxqv#wf62cs!JCy$3Cs|gVxG3wsvM~9+fV6qSpU{6V}7+}nz_a@^N4q=caeY7 
z)81c0zw*nsDUUKXq$cLP%9>`T%VpO6z%6W&<-CWUtG|$>$I?|^;D`)e6;V)s zDkf0x)FJMoIiD=W8*nACO(oM3xli6w1=UK`6;X00o>2+@Bt0zc64sB8GI3G_G=}i1s4-!YkY2k$n4IOQ@S=vzTHm-`j z<{rIq@R;ByT*h53?S-}=s3H=$2>Y;4DOFYs#v$b31`Vaja|qUAA?X64yiyPO9_PYBu!{`x#fSkMjZt<3e-a>q{e#dH%>9mNhJIgUIIg&Y?A( zS$sFApYNRfY;NI&!i5InmG)d56qjv@k(-Ue?&9`iN?Cf^H^CHl3Xof5D$?aHSp**0 zT?XTV{-1p=*XKeij9#kyvO0AZbEy=E;v&l8zBXB_tIemcyqdRCUac9Hi6p%q5{zL+ zaeP5-=m;0dbQvM+;B%^^ST9Cf1$a37v`gZJc2F$P|F#98qQ1+PYW(PM#4LOFzueg& z<5Tl_N?PY#ZQ1lD#g_^D(SJKsb9A5<+B7{{`^>S37vrr?S(kI}2W9!6<*uP&dL`|l zJZcNF|_o$GF#~YMGJice<0`SvQ%7-MeO4|Wa8&wG2@*Q;vm3b2i zB8&5>AGk5caU?Q0PTk|C7>fixj9R>cOCWr&unD1Zoc< z+Ur561q=T|h|$`7gwcJWz*?y79G()16-d*(wnhlIT@P*zCEA5L;Lx6j{D54sz*!x) z<$tmw0v>mkA9phLeBO`L%--+bqL%R@_gB%vGc2WvsNrv%Rfl%d3ij|?eWtA{E@(wX z6BHA9@c{uEfno6DCO1I>HKzy|+7q;ZlOA&ieC3VOgMb;~&wy8evQa|6M&H28bh9u!JMy{xg;tcF;{^XsuQ3>(RixHfbJHp@O?!xDwlrHAd0XDLuQOXVdfsqD z`})CqoZpAcw(^+!i+D!)tnGA_ zcBurk?k zUQ#zHuiUOhJ02OuZ9Sd8JDSU-Y7)9=akfn~oGZv(s7Ax|XZ*W9B_iG&@A;T%>AUH) zOm|PhY3~GQcQer(qsEJ8{GGJ)kKvP33p-mBx}ACQ^S95ZmWT+O!ML=WeD`+9r{3=DKa1lEf% z+|p)?d#Fx+J=q+Dq3T0!H(AEa8(!76O;5L~sE0Dg-YTf1KEV3LtE`T>zuDWx-h2|u zqx0^!Kg}-S?iiP-8l+at-zsFkuW597XLU-hs}u2m_|++KZ$+@Ae8dz@%%*;3|M zw}MlHoB5gs%?o|cW%wW4M7&cUL?t}sTe2A<74Q?l&D3T9jrlxuUZZ;A6Yt{6D2@Ac z85RaoUZmkNs^Aj}F!M;ht_pB|9;Ci-X=dt$1^fbkVTt;{Z>bgS7XMt|yobQNVDMy@Op7>kqgt8a?xWi@s`Lv#No_cO%ie3RCl3iBj0AO9^fv=&Gc z!>KIZ+QWj6p$I#$ANNq5OCv^wScmB(hEpe;6?gTTd{L`m+m6Ei`0VjCOr5bVaV5Fb z*oBWMhI7K#&=|ufJduG(YL7pUSx9a1e`9W!N#3YHBX!hm^M!Nk>}u{umOty0dzbN` z#k^LwZnMk|W-a*;kMS2>@o%&aNRQ=|dzC$~*t)2uiw1OFGzm1aD#}I5t9q&%)-SS% z+GhSEFR2}YY|Agd^?qyqW?uKrGcU__nZBGV{vlp(;GhMomNn37`Y|yjEbX8B1D;oY zTQSq|^=rxx^jbSC4EkN0F1*4es)&cQOgy1c++Qn%UG$r%z!!Kw{fW~2kGbF69tf2a zPD6Y^pbFONQytJFLs5ajeAE=1R;SC&(ceI*Tsvp%| zxd797GH>8tIgd(Ez2R58-5Z=c!#kjhb=7y-%%{E8k65L}X{{7Ti$PjAHPq|b|D`F; zBO!@gNN*K7QBLqTHNKs93SpA^t*p_iVHCc1Sw#zz~(!R{Ql+iDn1pR7T z=bV5LSlZ7>RsX5y&}pd%<8WFaz9I`vNn#=vaZR>!O|{-SV_D`){#%um$B~D-swarh 
zn$TaU?pR?j3O96&L~U_boJXQ4DjKWldJ8Sscw#?fdy%n0u8}fuiMHV+=4+a$5wRYyf3jplP*T%ksA^Hw^<)A=Kxl$UrU($s&d zE)SOH)L_h$A?>4uTy0BRvRmPK=&{2Bz6`53<3j9SAwD2gb??+|gLBHp8^d5D)N#HPsPy^s?GR{+rgL zIR1hpiCPXbwr*VCIzy|zsN2E#$3L9++v@3&RN3fZCCe$sPGg63Q2{)m>l}|b&Qy!# z0r^7yrt)$HzRVp|bGe#NVUJ2eCj=YuDi-&(OPZf6|q)Ca7J8k^-d zPmC5Ga^0qKIqSA+`W|c5WRqh+GcV`Em-}47yOWV;jU_~s!E$aRJ`ZIm`~%dl*&>k zMDSEK7<*_qeapk}n&)CCj&UKp74=0%?!S83umw?xzL)8n-81aRBI^a4G+S$GUnN@U z_p#cx-<~3VQQMqb%_H^&;VV$uYo%VqKI)4y**Dz_vfuH}kZK9Z=0|-cT?!o48qy}w zNG*}O)H~POe8t|qGt$DUraw$N{h~n8gb1g9cvMu#bmyMTHMvdgg>zO}XT?$Ln*5*a z75YBAQ)*(dXn#?E{JWNU^OkwzKBDJ$9KlPzXf;FunymjMAF3CIpyRv+sT7W@UjM6Fs7x9cl%$8C0f6bPid%UCW33^gq!W zw^3E~Lt#G0mym{Y^d1-Fed`i#VKmwz2AK83L92JQoQl;T9Do-Iyya*64Xf;OSXSEJHc{V3uExzMK=$J~u zC@k7*&5@t(=XQa<>_WBHYpJ1@KgB72e54 z<{Y@?Bu@aJ)INWI6qgNi=HQ+9XxoAyN57~tn$R6>=o`L^$8cDd{ERG0M`tYN=iH7K z%8}TOG+7<{kiVU+Yew&VO=j0`+OnFnkv5Z>So<*(m$M`NZ;hq=t*XN}_w#u@>aiN!ag_dpmpQ@#rszn-O7$V4lGw7jG z)P3~eMoiR6`%dU0Hl#<`wy5zLpFDj}rV0rF03ZNKL_t)%dyNmO4kpP_7OWc7*cv2F zUaxBLYB7~Rq9H;!4krmg&!f zZahj&L@}CV%d54~b*{&m{D8kxi_`$o8)IoOcUFTj01@h~oP!YhpXf-J@d=-8`Luh& z2;FA(^Nlx@{y6B8{-3PKFU2PHBil5SE67#iERy6HWyc|_yRD4*o2N4hTgk=(nb$mO zjCVH({wKbyWwW0Q{;HMN`#VR|7VVALt#wc@RDHF9D?94ulyEf3`!VfYpaV9lDDTy#OZ^;6t-i*p@Hp?9OixBaSjX5mGMgvT_s9|RH3Lc9 zT}`HGoGZ8RT?pDk_h|*0m?MPni^)70d-yLn;XyopKsmhCH|uK=WBwRON0bOqCnSp_ z6b&bKAx9JxG3XZL5_geBMX0>k?G^53+)2M;TcjgPVPxX;;v;~_g6sc&{d+a=jVLOLw?qc#e!`4nrYiSaxK0`EBzF}TmE$iP7U387*R*1(Ii>J&^Az*pGRHucDwCQI-t z8l`y?ER>|>HGRYzK@ebQ2}3_9F&2;sQM8`Xc@hYHbMgwr0%#!rY>-PJQBU^?eT zNpV!{gPnTQYq;T8LjeBFpW&ywn1Kk9Mjug4B-3taJdJfIC_I86Pxv(w5CRJp!cYlL zF>Q6Y6*UBn4)2OU>bL%s`^Yq%^A z;48R*9Q3C~2%EHbTAuk0@@2-~E7;!lfro;NfOf<7)6TPgX zi@wf1Cs4~jReqMHa+%|#M-?^dnLD|XnIn&5hU%*nmB@_DT9kD&r+xNVPl>FqndP!e zB!#7oO>dN1BE4Bk6@Q91!?#*%uT7#*bFOmAE&?I0u=}!+`ozUl7-jNMvyj{&gXs$Z zMHBcQcFV?AVVUW??;00AGV5mGXMcd-ab5uXtu(8kVOjrL?WGHgFqeW=J-pzP`~!LT zu$D(m;-@%IEv&wX)#`-oQ5&;Ls4Em5G&CeB{Ap;me#Lgr-Uf-NgLE@UT{POOc*OE& 
zA>>>BpFGAy7qoLoMIHSWZ4oK9WW2%;+*!SY6|z!atabFwp$^kFmI663LsSMf`+o8st~@Qb1x{bBZ2N&JQ8Aepl*pc7RV zc6G~YsW;)vatX)awrYw{8i(I(oy@yNciL&p$nmRTvVb|q7v#JWm73emR=`!?{?J~= zb=vw!iDH@-r=PIT7hkCeeb%0vrOX!AGmS#Vr-e~s)W)0*nf1~J#I^{kVV2L+HKZU{ zHukFPwjow0o}uN6WNWNhGqiQ~sV}X}$>GZ)`}#|37wyAvOdG3hLsK=2pW_NTlLP+~ z&$O-d01+6fieNJ?qCQ&k-%zxSKO+={`3wr{J?y=pn^VloxS^fZ_M?axMOP_KdyfTr zd+wx`X#M12vEFDN=t&1*U<0ob^U#$&YO*RVr;A;ppw?aF&{I*8s_G5&hT6Ycl9q^n z`2sA8wDwqw__8_Fyk=CAgB@owC&`G2UMaP#!%?%0q4FEGIoh<3_DxJ!5;Dv?IkhIAd_KmPQ5;S+n&PMc`rw$VE_b6AilQjW(kwiq`E-Cv&_xtf z6VMH(&;YIR39oPwu_%i$w4-gfjs4h-hV&j?k%Cny3yUwwF8F}s{5xD|fkQCFPqY}7 zwe8wbl%^h95H#vfZ}FL$iM)u`KRX}c4WIR=(DSfO_<#obxX?5BfIZqugi~j|5?;{| ztpN&X6P@)D<~$lc43Bl;EQN~ptszY`DyIKQ(Hz7FpmQI!3E)>4%UgLncF+%~NtG!p ztZelDJk^%`H^2O^9 zpP1{cy)s-~^Pe;C@m;k;y+Syeq881Tm#wb++t<3uuhLXtk0-C!@~!r*^09ln+sX~( zB)Pr069TGKK)TVCU2 z8KL&00*0~0>ug>0HqQFC37p?^&B&Hpw25M6*u>D1hM(?GU(c_BU;Ld#6C9-<veK=cF9vy$1S+1!ntGxFE_vd;UPs-rUeAJT>-B!st9 zokdR(LDJ~NXMze)3;9)c315^Mm;A3b+HuNx(X+%=I$|l8;$v8f4_sM!;SimvA9OB( z<33=&j`ApKi^eLJ?s8SBVJl z1Bt;2LDlSoJoV{MTXQSc*oFzz*_&h?!Z2%uTuEj0rF0r@3{byfyh=9OY%Pp5G#Y?DR7}VK@h)AdkR&FsE`UB&eZW9Z$FvFTfdorh2F`RzJBf^1aJVWn^9T z3kJwIH3@ziK)Jjg9g%`2cmX&6jqhj;#o;jzWE{2EQ$E$66{z@+51%6>l|@YgD#st^QQsV_nWm&jN3MIZ+37@}-!i)GPB_ ztDUl|IdU8#c!T;)9LxFHJvvbHi<#0c^M+%wnrnrm)y+=K{?#`)Fvys#>Zog~62(Ix zpSa9ko~IsoJLN7@r*bD}l{3C~HV$rK-IBGMZ2$WQqtPnP%O zK)osrr5>U>Wl^ZZ`@*z;BR2H>^WEjSi*T zbt^13{}sP0b*Q!6|IEA_T+cVdx63;2aJY=%r(MjVgzgrvA>;L_=-e+#fiugYeZ+1x6N^U#68Px6~=9vLvNhxZNj#}oaG+l zhukM_V2~n}s}_n}+*c*}2;C80oTPu$X4Ik|RVAfx9lf|8U*=}CoPxv){hU5Q-)dGj z3kU9K2lY>S8!Asz)Bxc!e^*bqu-U_VR6Y%~_I&G2rQfu!o-}=Bv0&*>uiUg6{AU>k=BGt!l)rv6H{mqEHw+xfIfnAD>Zx&WrC5 zkCXH#Zc`s_M{7kzk;sGP-_`{b5sSrkxRD?Oe8KTYtP4I@Y42*h`&{VXoBH9XZKaya zW??@TGkFI75QA`1O_aS*7-{?-`R%2GXKL`+j?q9mgoacH@9;ZjU@cP66e%>E(okI_ zQ0s{^C$*e)s_67Wf0eu^j~WY9cKYe`?issm;UN^u| z7~3*;ZeW15-Y99A+}Vni1NnVmkX46Md<*<9%)VKlvcC8h;;cW-*UaT|?AH!Pwh0<+ ze;4@0vYWNiHe_8+uj|u%G5#aIdx7%)k^u@F_AF5iUGGBXn^!{*M>kf7WC3H2_(Kg) 
z)wKDlCZFPuXo97xDIeyZ4!8cwmQMz8BuSY|%Y@w*W9>95Ab>FXS(_^+h!1$dH#ndk zq9A_Id+?vIV=8}7TeuE$EwaD{}*LA^k%= zt(tO#s;2%?Ue%B5v5Awsl9#EGLZ_Z6jOpALCoq)v$ce6KQCLKK_KBtT1d8EIvs9!W zG&d-aQ_f1U3fuqS-s+(kPY&KsLwFu{5pU@*=3s*{#N*I+1}^1xbY|$~^ycY7c2{g& zUf{YCc}wM0^Qe^C!2e;AS)Je0Wy+;&daixczSoA3jaDO5|DGn(ef_waE;rGSMit!j zp0+=T`(Ju@qo~71RRg=y8pUl3&y`X1uXv=AE{1qvcY+!bf>jPi(GNTNB^8DXLcO@)PCG!&u2C1#5kz8g7(wPUIw zzKh=$bj;yPpPyFPeOle+Q#6jR^8{3*LbQ?QDTCLmr#uMz`MA2y@42UVK_lrY>R}Vk zi+!Svs3LFS3y%=c=2CSn8^LrLo6wzZQDbzl7K+j0i5P}rbX{ACgIFe((SIu065QPE zZ$9>a@;lX`oa>em`Xpnpco<{&e&k-kMO;_td;gC3wvG~+eZuEDZ<=jH9_JHBD>G5m z3aTPI%X^Ll(Kq2yIAI;lp2L+=)reY25p$5H03XkHt$j9Y*8j>*w{~-b?NQOo0Xg{Lp1na>cxgBGpeVw52VRST z+8tU&(KLdd(Rd2*V*Z)eBa}9Z5M+t@0a3&0_qEDuxT{ zL+(7|ym=_FLB6vlng^}M%7<=ojy7ji@g6|)MNwUu8j(Hq#;Ss%>0MlZ9ps>2nzE7s#*%} z(Lt>u&*WV7R8_z$9wb_cwHQWq#lO};`+$(vVH5~W`Og|GKDeISM+dEPzLKx?mbT8u z0lCT=f@_?r&Wg7Dku!OaID(P3j+lz2+D+AiuW7B-cXE~Okm#mcq5F&v?)-YHUPYYu zz2us7*^0$)_*djdJZAGt)WZ`jQ%UlgtsO0;cAREalX)o_Z*bRDNnBGD@E)^kVuw}Oxm;L zccFEiUF=uPTCOo6znT~6t!Ri`m0x{89(|`)25#ELD-g{lA43dvr}APZ4&X^pnV`R1 z6-2Giosv7Me0ZZdJPF>3JRJp9ei%sN0$i8p;vIgMtyPp-q4u&*U7-ndTr8x!yislB z=QzU^d9%16UZ_+|QKM-HYVbGWf8st$@@yV2zfspXAY1b-KA{$CU$q=jP>ocjI8!Sx zs){EFLl82w^_U~F&|J>8cB|IDss7l&NB>s!!F^8_3#pwooN{CK`Nnc|P-gIMvB$g@ z8|#|xafW>tT-Cg$4-Ky1`rRt-+!A?M_CUSh{_yzw2D%`^+sU^RCH2js8{+K?Y-iA2 z{6OcBD+Bx$WmF5vNW^RPE8M6C3m%@UIwBhL(G*_3PJ0l_EBFcCb31N;V3b1zWMP~t z3$KW$19&SG564e5oX1iUHA4Wc=^Uydi$0?%8mbLk8zs3KCF6kVfNAj13_L?K(GV?g z7u8SzwV~)SO5=MpMp1mvx6qODQ8u38XVoA1DV6?!N!L&o`SF|Dfl$s?iHNm1!pEH=z!~wu%~i4=T%2^O4r)cFl>`4xQ(+gc`ZJ1KHNuX z96+tQVfBI^FAPXl>6xCfggVO*t>YY*zGZCU-x9b z&6=ET=0v!ctJOTj>R~)IyZb-+elm_5Cju+Ys>Wr0rLPXDV)O8DYNfmuWLdF9%+klm zCn}FgxF^Po!^oo|xFZ(?@&Dvf^~7wVx`~ovjdo2RgX44yk2OEuQVD+LFUJ}DFFn9k z<#k@8aePS)p(qUGx>`-zL^t52Q)tH%>71vHIwAha%@qlbTlOnrvf3;rrdG^oZ{HdB zSJXq^OjD^ZFVt>nMFG`D{!A`zMVnBX$6GDAH5#I@{zz}At<*z|^SRTMp%t_BL%dPe z_}%rJt9j6dK&IK4hmKg!Gi=o!XdUf0oyF~A0uJ9xHJRJ+Y1yA{ihmGq>uT?(UkWZ687f+-m7Zms 
zm^<(5Yu6U64J^;DZS^$v;eK$%h|dZt!P1Z=ZRRnvAL;S0arQWSj&`2+>!wlC8%Eo$>4wb{q212E;Bg7HBDQh-zVz~M8x(97b}RQg zXES?Iy! zG_cACJ_LWzMu__O#rLCsa7N|ObD_5M{8^QZ=J0DpweiT$@2D1ZIp4g@6S$ZZ*n;wK z@(e}1#mYm8vW#qkvOGasFKW|4UP3FC!LNBE+NkBY%x~2Y?tr3{E{x_}Z>XxjT7+<6Fz=iBt>12W4LkoU$L4 zn*i1ZROSRB9x;0=1%AZWuyrjorWv4fNIOP z5G1=mMR0+JaZnY{zyx?qsd6|pgFdtylH?eA3{jZIF2Vzxk4tDBHpez_1=Cq)I8Uvp z6V%75Z~~I>5~V!Al<%AO=?FnxlPkc$bt z2$&d*i$TK(s0|u5*CkX$7qx|4;$}Oj=xk9a2*da#OrSfN&ZmvNTQd3kf7#WXvB|;e ztDs*BJ>k5Ty#iLkmQWZz!Z}$FeuX1Y5lm=}MR4(mJ%2nJ-M!4TvbJiQ^EJaoHv^Ec@W9@8Zuo)ufK?!imL)1Z5mPch<*kk@H znwa}|0$E|Jb%jm$i}766!6@Y$2D|8}s38}~7knCp(r$c=KVX=WkBRsRc2j2<$#P*A zEHXONdB%hvzGqkUO}6^}GRR;1mNCJH{IT<VyUW zjibG=na;u)us~Cc!5r2PeiCl6$kxcR!4~1WAmiC3D9m50ITp9sTyK~+P5z7C@UQA_ z<#&-U#+mh@6Lx?AODooyEl|>w)_ei`$2T%RR_-$Hx$V|m$1g!3`r0;!e)i2(g>3~i zQVUy7z(*M?AHgH}iVjl&1V97vlK!D(#w_YZ{@N+2%Ut*a?bHXisCnvB9EA?c4mb(7 z_&=6^XqC7xC+ZtzMnPZ8u;|M<9rSjV;qXG&v)4pqJNp`|twoe6Y`ET7pRPKTxw##3 zN;--ZG|4(1&{XNFC?PkpYiEpi%rv(epKS+!DBrFb1lDtwf95DVAn|6~Kd! zlnang4RJg;*=QUCFWEx&9qi()910bv1^f)jPz9PoCasdO0C)_0$olvO2GcaWiDw~) zhQTbF3RmeRO{QhEAI4CY*rUvrnmK}BWMMo7dO|Ej;8Cbf*Qqc(q+oguTVNU+gA3>v zHifOmA>t6sr6SUhx8*y`Vg=NU9c3-CDl~yDG@CYv^~QSno4#J_D=+%a7}tDOvpb(x zutTf}t>{gMeg3VLdTL>%hkuqj#`Ck^Ue)D3&n8(%;}~+X|M^{!4@4{FB-Qo)N?DNY znP+Z*K#iGgp{yRG?S@dQXU1ZGB^buQzdRgk;04x$g}@l*g5ls{esT{4Kq4%M3}`@Y zz=j5#fu^_tCu3JEgCk%ko5cpeS*nFSAd|Yn><=X#FG4yZn0SaCgE-uPX*dTbLK5_8dZX#% z76;w^y<_m3vNpe$HdRMgE%t#>xG(fsW%*xB)f95%~uM>7|VYRMIWnQ++k_ z*LXAZCf=sLQKsutLF)f9-o4oW;q#a1l#Lmq(yQjW3TC*PzDY})nT~09v$kd0a+5tj z=SRENde;`TE4b|I>}gq0&zGiEgag6jqExY0on)Wt`RIP?>874k;&>-xmY6H|8HW5y zk>)|FK;O&+>PNlA3ET_cynD<%a~&CCmi|?3fp=9m6ymShHMW8mW=q&Xb_`mJr>r9= zViGHj4`qbhEiYl3(g~92KUhGYENxgZva-_fLS&e+GK5ujb-w7!2FxMXF89lo)nTo$2`~9?ELg$ChY5!PX6fahvag zaotSf&G`t{5z9iN+^0;zO6a3NI0DD%3L;*F6I4KN;1g`cqintCVEzr|=!e-Ir)o(a z-J2uUi>LZOC@|`?48pho03ZNKL_t(rh)Aa$Y6In_(i+qFb5+m-7QzkLi``QuV1H$| zeJB2>T;R)`N7yrQGb#$=y%P$JQOfe%NI!LT~e231E3;<8)HCZW$zIpcg 
z0(3v$2Xn9)>lq@3i<`cx^oiyh1?pK|#R#<2QiILK_RsckWu&F3J>9ZfFQGN^{VZ$w z4NW`BER_d7o1Ie-($*cNg%wnYp6eWz-8Hz6J>BT%C*g0_(Yh;b!IxDuBC~(yRTlB3 zM2eNq&fDbv*IF6|C=>jm^XudmDRMLOa7sdS8=fMfEHf?FERUcN8?Ts92L|CL0KQ)d zf=qUkFN0Vt3zYzH2>%7j!WdWwpWq5KgLPDls>5v73^XVPh2$x@3^Y+mj3z^Nk!zJF zY%_nMB*?C^uH#WY$cu1J->c@F2<_1XEi^9$$YWk(p3e=I0S2|k!L z(tKRBnqCbu!`j=5TOD*ic#Y$kRyMdz&;)&#{YEHR2jSho(ZOw?j;&P05_qoHRX5Rs z+~VFtSg+u6!6{|AIN;6U^Q;3Q4)5ArN)a4u+pAoG)|P&(7Bo_JfDPmDv+zMY))g;d zqxrXZ1C4P9%!e2#M8zN#X8}M(HXTMn6rY8EK@hu(rQkLD3LQXU=MW*4WwW;sr?%(Y zA(PdGdf>uxunqtpP#|1p60Sg)WwmuOJW#@vyI`^a%z`)QmSd=bZA#z_8XMlXnq(yu zP3_2vvH*69;V#wb4drWBGmgvpH2$vX|r@_?}tN&TGx9mwT3A2fB;4x=!-G6NU>^UbEVpFK4OF212XjEf;`|uyg=Dr8&%zb{ zm^hr}_waA@;c6)%ff|S+xi0I*@T!&wZ?WuK)^C=f)>VFg8t=_6N)UG8<&}zT1K-H! zVHPe?V_8=|mFFmpScXv`PZ_VZM9U#dgs7w>S~4tcE$6HmvX=mLK#ISonqE^G#<|iR z@3Cm5iJT$>U^%>p*6cUzk9PJSh2jS=sS$pFRB=|W6zjo8?W9hLMmx=4+r~1Kc-}o;it-O%s*gm#FMzD8ElG?*s*zRv>X)UnNWOXqQFZ%V(Oh~F) zWT;Sc3I_%%Z&^mz9$SoGgk_WZR2>3w_!%!k3~ps>pg60E<)}Zkk`2slG@g!Y2gC~T zL{6eBvZ${XoS@ylG>n21qJzDK^#@e94fC6&EaSJ;&oabpO$F|o>|Rutq#IPb!nYUB zoKALsC7J%pn^w4(<8V${|ACG`u&LAh^8JrMvJzx{fF3@@GMQdrMMz-HU@WhzIJFzz z0$Yu=@k!sKPZv18)(VN>{iPq)XZdUq#G()HLKwf!C%}8AEAt^9YcU4jQDym{$$9V| zfIh(y>LerNbUFg#=sMh>WKoLFz$Civ%My>|DC#Op8)Kjzz7#F!Jp`Kl&57`@IhaHfba5@(UcCG52zj7lt1%UyZ|Or4YbRlFaX=YNXW#Qvwol#-Aa?sWt<@su-0 z26tc;_$tTW0o(09nc@G&8Rspqmk%$i{=v(HW`vxh2ez0(F~*33(|njvQ;uY{A;*^i z$xa&mebDEfxHae2m!J4)Z<*{z%+F|;w*oX##JvW3V3cth-oWo-6YHaxxQ(q>a@lZ9 z#BHz+e$Y?U6;hQt{4&(!p{ytXp2E+N2`!*KB;i2FgiK`zUk15Y91em4J*XrU$H{aa zG#-evAcghfp>T(-U_&8BW@rlhPd4y8LRBoz{{oX#w__znsWq$%xMigOBUTmGQ*~HK z<>)aMqRVm-*akEn6fmS;wbM2C)wla9YkOr145sgN%l9-dJ}b^zGcYG4Ip?@O!}w_Q zFs_=zSZ~Myi>t4@r@l9-d-6}2&PzRR?!4yz{^7@oNfWipn#0r4@=4vP7AX5+i0qDD zbCxd>+L`x!dB#GKXfD;7`)uAac^f=U3eMOP*;CfjKisn1+D5LoHw(CtVj~t%X`-P#y8yEFm|`+M<=Y0Ef%zMgpq{oAIDpl@EZW;=Nf7 z_fvOvMSY^rrI|2F#L55Ua?y;M&=hqMi({rXRGz~j>Le`8iX8m@U@a-X)l|`l9c7VkGaNt12Mb@ z3zBu1qQv1D+dao5Uykps_7;otX_o%Hy|`~=X&u-imI=404t0@j;1)cCp>U4Ep)V}O 
zG`^1Nz$2(5yVG7X4dU1x=*CX50_>@@w~SW&&1Byo*@F#KT528WXKEO%^HLzi9(pb+ z@EuS{e)XG54n2%V>BI7G@hnS2$5Zp2)>m%AB#)Al98eVU^@X_Dc_T%^I)b+-2N@caw zZ1I{3S!T;|a;#o=D?z+l)6#VH&DK++E=4W;C4Sr_uy$C@Ju)< zh3cp|YzEHOH<_=@mG zWhoF^K_|M%#$X&AgMX+Im7~s74}Rq@(T=+zA5Ky)%wr2!c@V56uYvPuGAPg+4r2BjLf>68^Dk1Pye(N3ph4FB)NuTixAJoxtm-MAnanjk zF83%lFf-f%c!}c4OCyzW+zm16pZox%q5ua3u;;KDGLcgy2w)%JCy0fy9%Wt+n-zh<9j@Be%B7;SKiMu1uyVpwvF6+ z!v7rGE}19_FDRULz%%H9C(wWTu9}I4vV^LGzU}PJIe3yNpMssat*0zjGU%NS)D)+66LRDMO!2dvz_s}1k+Zs8? z`dabF*5epSJE%T$SWDYpnPKWlo+l^hgY~9Rlf^)`{6~g}CbE>w5C@?$O(h2}&vN+$ zGlS+r8EX`;$tSQ5%2sP>_X&u{)8KGu{tb(%FvN6^4?gL-KAe)2#XebU>kfv zJDa5>%T3S})2xwH4-e`^{c1YbSr2(zW&I0kN_S`$9~|!7-|f;}E+ba7V7~v%n3W8b4I{SHGVRIG9H1J)kqxf^4V< zO{5>Z;$3AJgwQ%O0W3ay?rJFQy_)$7H@o7}rm%>d*`IRZv$te!JFu|o${ZS~o^))Z zA?kZa2n<(;+V&{)C-#PJ^aN``Yl;IK)C-jnC89po7!d!U`mW4o1-Hz4=6De;D|%~r z+PGg>t?Df`OmvbVXG8AY4cr_{mkh`MGP}SbyalHtZrRCGIWzUPx?=ftm}Wz${qbgTalsV!gh0p z?*lbP^o-x< zYxo?cp3;^DC~@Ln7zP)8NpOY6m=oD0C4qefCO=|19M2>hgmr0yG{tJOt1?IV&dI}7yF*=~C$Om~3d7)r>46h6z)WW2;lBC_tzsVd@jaFfhMzd3p0rdkRe0jw z#X7M15D&w#57=M|)PUZ26+{fblKPm@6 z&w!vFJ8L?{ze&N zwp-QyHMivTuq=zUeebRXc^!p1z>IJ0!h)>h(=VG%ot>Pk^Pl-j_^0~Pw85;9nM7B$ zJ~&u@#B4SSTVO4@iE2P+DpGKzpjBw?{E-D?oii*e)L8{q$1O)Mt+TS+epl8O2k|U~ zi4=1!E*29FPJgrI>KQn0ee8Sz;YMxq0wl@?lp;*Mh_@jO5IxM*kgRIfUtqsD3+dEC zn$!q#U@umqwa}Hi;xITu$#4a_$@^3V<1ijCQZGmq$7m!4Vi7!ot5^m%_&GrOjhdf zqV}?%_=t0O1}}i76o?yXhwLJppcs8+FS;&&P%%-0$^wVQ$XFJ9XM-F^AyY|Ge}WG7 z!*mZ`&<Uz_BCsgr$*6zU( zPqiP@=?-=IYJ(f}$vg%R=%Pr4LePnB;~t1Wf-Be_mcUi4%NQu^mWt$1SNivb3+hk& zPZ-0G@y^tXS5)2bg8gY7Cudu_sLjx?WuYLx&S>O2MXxAdl!InH+VtW>%;g6bpI+Vj z>&vWRUaP^qgGG_-SLt`YMwFcyQz$IcyVSnR`Q~%q{Kokk)23z~^$vggEV+7S-o09P zW&i6mhbSTSH_#j`8Qcn{=tLT22dxw^bM?w`*s*^86 zyJNP!kMUks(-u%kfksJ;lZ(VcwTC*%GL$}1C|m&>A7j~U+!j7L-d^9|9&|uU)~2#i zo)BZZdz#EALw;e;jNg3qu@cK>78)ylrpvaq@*(aOD@85ZZf>Ulc1_lXAU28{*n(Fm z-N)apBMO?)#w&EN*^b#7C8 z*nX7_TrPK(nPll`FQO)zNmK@(!Ugo@TX-Obdr#?(tGY#=k~IyY=j5eNlnE#CD#rK^4)=ZU zA5<~Ml{Q{GASTgqS(-|MjV3@6B#R;D0dV3t2m=t6%v5m7lCm0}z$k1SdYYG~>eexG 
zm)ynPu?Fe@ij+Sv56a4LdW>~_kBt3PO9=UwuZR(7aG{D9+B!A}mk)XcerI z`SLWKmLB>`t~A=?8rIzN#lL?*QSTbdF6%#{t$Bi0fg+#LeF%aZkO-NwEv$ohQJy|Q z5)OtlkgMPJw1hC~OTFMV{>3Xm6qdtM5JdCjF3{*b6@W`rm6sq48{j4wj7h97EC(0; z35Dcz5ssrN6$ijVwwiTh78;FDVYjqUb;=PB&4rK*v*{Sx@gj?mQY@qy(4IA6OJM-S zLlBOI5ipgCnT5m}&}cTSrS8}VreRIii%o<1)Dg}=u9yTSv|)o-OMC!>pgo-zLLQ>! zayv!B1EGq!peaq54uv2Y27^W?@Fr-i5B33#`LSN`O-A4_0QQR20)-q#4wRE?bq|=L zzP=2;iy)&K#F(*q2q+?4FABHCc;8zv#R1<#&_rFY2C>x8m=2jZRepp@a;>=u%3^ed@D9RQk0m7Jve*z5}VGl^)@lb@TE&R5yt<*&3;u?BPKD;qG zY|{7{pQ9$n6e|8}PD1`TJwgxk<%!#R?feE=E$w!z&GA87VA-KWq%X|h>+txfXRQ0QXOX=NTS_%Umj`wTD({H2{!i^%@Xfup;D{)$PcvG0`xqls!T3f*u)W!R6d&>wjz=R-=1Ho?eM}Wg1uEs#qpnvM&Ec?M6;>k2#9X z)`y#G&|~|{wo6+k`%@hXfoJfyavt|XFC{~I!P^D1yhRu}ys6DHCaC~S^4c9H@=)1S14a%6S&087OQ_66XiHtK)48anfN zFqECL?XqaLU79DaO2N0F2I}a%{cMBmY+lhj6Sr=}jlkG+Ii5KEf$3~At4w8lJG?C% z$F<+{91*JmCi+JiCwM#S5Z`kciqG*sB}^Gl-^^ddY>1{U@C+WxXxSXfi795PnF*6w zSNl-;x3>a3l8HuDd~7LanI-2^IQipWluPqP7kNv@V>%v&Q&N*dM1mivDa!AnKtHAD zdljQ2*xVJBsPHZMnN-~WPi2>JGWSHtBz0MS37fyw4qJ7rlH{<8E#?X?uC)SL>wfJunGcf`dwu8~%kKc%0_ZSa}%{&(Ljh%Rl5he1HIcP!Cqo zMaX6jSPU+h2DP9f9>ksSg`%hrNO~-jAVm6^KLN*ekPMYzjVubGw3Yn@A3*0VVhQs12Y<@| zXFlXwhXrInakZo~0jep(tc~HcqFMmLxm%eHU)fjQ6F$OWQH9ZQxeZ&>ZP5%K*lyU- z2|q7v_^k1I_juoE@xt71j-ksUT;|K$Vw_xP*2s(U95jKRV=64kuWalVpKpJ9vF*dn zAC&hpZ(+eI&!b!?i^Nj=Q{F^Zva688YguG-v;Wv>6h?pf)=bxaG1u$YMHXIQ@9e=b zbISY)q3TO(jv1vLHAeXQim|e*xM9rkjdQKdUXoH>vBC}ch)wXNdDj|+`91Y_Hjm#| zj+pKBQa&`>XUMFYfy>x`_PMMp3SAQhZ z^;>#|yyjc2^@oix6~fpMxf^b<(pVE8VKZpYZsBr>#d0too)Ga*Y#{3(b$Lk~ge}yK zxEV%es9s*IzR!51XIQcW1M#kTL@VTbW|?J;q`LXc-IyO?!}$-f3GJ3emh$LUCaS$* zH2dNiB*Nu?W;}kN*IJV3VU(djsxBYV3G~OSmUFT=>{KtX%B(1S!(hA%x%dF{;0Ald zO2c>l55LXIVFLAnFg!>Tz=X~e0t&y(#v2=s4mX`BCumtmdoD~sUwEg3XlmS-JBmOM!_ z47nIeHBLLM*+eje0MA zmd7vO!pm5G*0(@oIHKLuhil7a2eHYhBCD}`VjQ(s6zDCk@#Ex$WOa|Frn`iBo#w0K z)#6GGr;`^JsI1d&3#~E^IZ0wr{no?qEUNvuF**AQmnw2ghTLsS0&imI5 zxaCZN&gxBT74ex$LJjGWI#^jYCSf`>hHfmMy@JzhvbBu1N^9h-nbrSWFUN1WyR$o5 z#(*Y|;al`)(X*4LdB>P#L4T>9p^!Rcmj%SE2hgd 
z!vS#+PaWx;{9-6wH%kel7&rm&3*gF~n`)Qq72>-^j<{wZ9=E`W$a3yFEk9VjQIO!L; z0bbK|^9t0#@6-`&uu1Bm$$vy^02yHdfr);HK+7=YB53UCcD3lA% zU?qAjC?P!x?}nt1;)Q<$S5SOZeb53INA`xy!1sk>!SsI`Ru2?M``}=RSNGZ8hVD{k z>XQ}dE?SZSQzWDBP2V)V++v#DW=po0^`_*Ec5Typ8FNG*x-2qjmDNxd@K&;cxF@RV zMMVj6q?gNGqwmej&A;Vaul?#yD>&~6#iBIGEM%52Yxo-Kif3M)UQjq+dS95;^>W&K zx!zpvor-@c1B(ueNQil3{D6;mjpqCO^=QlyFT^rmjr@T0lR1Y{{(N#Za!%}(DsQlw zWl7L4YCZ2Ek5{b}x*>48ydsM!hgc=iNc}^6$jQkK&>H#9`99D-!|Tho$3;Ga)v(I? zn^KP(_H=uDWs17jvOW8jh=MrT$!z2Ms*GU0y_t?%_GbQN=n&fk@#cARsF*=dVKUy- zhtqf3V|~F6vj5PKlZDruA(`lcZ}6Zp)ik9pwn47EEuWbKt(IO;fVOZ{>S8}k%jRnb`VpeAy>h*jP*V9}mdQi@jA5Ad9g;vErrV;Tu#sVSRj zPBP1cUbfy>cVtb3a)crZLoteMcoWt`IRIP;^{^(M1dZ*(QV<5^Aqg`1Q+}AZY11QM zFx$cktaaF19;(>*1aOI~ayO;v$7L`)wCUtk}U<+j=nU?C0uO#2FE`N4ESh zJoJg<=Ym(ck=nmOy?k@oTg#tNR0J7op&iAFENKxv>6Mw9_2%n+n^_Q&-6^PTp(3Ta zd;>Mr_Vc$x<|)2Jtb`o84ht|6!l4GX#A9yN7h;SNEiLPnBXX_plJQeWS@tUkT#s9o zmb4QpgWm@t001BWNklS12jM4#T^s??| z=06mw=lc?r{g?&S%=t1_rc*jp#61*_SJZyun*PNYM76X(G8OKCPA9EgXkWW$i z+p3F8a)pe5^Ypv;4gim-BfLh$p%BGpVFkD^){0S3pRUVD$b~M@9PIKbMSup4zzr_; ziN-+?`on5iM?d5-3ZkCa3%=0|?2gTOG<1<;g@BcyQA_#<+Cd|kM;G8OJ;$Fg8s4(n zY>2stc{x0!1K3@nhlU-xLI6%T}}GzOPn*Y*5J8ltaewfSc5aO;h$j44hFaLNd#P z_Moxpa39-?UFI7oOQXb7`v_cUXqKu_N81=($2gPvki$i#I;%{PvKYioZZUEGj26ZE zmh4!%S=QhAqx2H_H{GR;ZCZJEbL}xr^ByjkrL4BJbL=aatS(W#PnIO_&Psk9@U-&N zE0Ab(GfrBo(*Q9+sfSfrSzk?^`EGhP`f|KI^3N3<$Zh_kd%?@Rh5mc}5}fU$j>Wzy zwcB$o|6Lx@UH73}MV3$3-aifKtQ^YQ7qB{Hd$DHL1^(Gl+xb-Wta?+_*Dm`S8orPn zQ3GV6m;pcYp|Y;Xu^ff*5X|s_*sp?kpsGPS7-2jo-HPSDqN;p2fE*PWvG9u)9n_ZA-i05y4e` zTjh8WB~<80edPaW5(EB0CcrzI3HjKL@^Jk~OXt+bwp)`cTc+3JtE+m0BKS6&X!LvJM-YRN@P zBlra;*(zDOTEMh>QhkFIr})j*Q|+y!Dw{>P+{K%?>f`U>`;E!Q95KpyQjN@7$*N!t zm@W>m7tS|WoCPXRunBF1L|lLlyu*sXcPeGv%?eE$XzNR$-&KnG^$MsZDq>MQEQaAQ ztVw-w7_5O*QJvZeyKlYr6+fw)*c0zlnkxi*2mvyNjt5wHg$6^P2B4P@#r&l{gZA8ypoTPSV@Zp)Nr@8E{s z1lig9Hgr9^lKGe44Er1^%z7#VoIdXlwusl3wM?r#fZy>~c9b?lC;T7n0F#F*E7@XY 
zfQ&a(a}6!<#Co4YXVax8vn6(aONeZ&UQs`Q1Fy5yV5LNw4GNTp%@DD!fWb})8QBPn%#hRcmV!|7@Um1!+Xi#43vUyV1xBAUFvKmtA&GL2VUhx zX)Fbk0ht&H-6=|5l~dq?8Q?3--(oRbMqI3+bR2;T;2nO(^Rx@Q(-AmFb?6FJg5?wi zvv4c(!;3JJCd)&Rz`6i~WH=!wg2s;HRBQ>oun*qA>FgEmVCC>OMzY;d6TeVL=!W;G zGaZ0g_=u`PG|YsPz^I-$k87!%oJEOLQZ&F&Hkj3x9jF_P$!}=1P$t%G6*lSfjo5kq z^}T`L2YCzQMx!SD!vBOXa+aQt*XXU*D`b$b!q;xaTgseQy(3QRBR<@=p7VT9JL#*Y zHS=9ZUebPw9k$nTcJufDygF&v*VcI#%@lU{;~!r?=7;1g&;3Im0^L04@;$aTzG+%n zUdFG3|ERBRo#Xu2$_+>0>fleU-)(sNB`k<-RHg=N>M(V^V$xc9%2mnxTiyrruzpKh z?mOyzQg9~og{y3Czq*fWeQ1!8=JWQGU(B`Hebbtm&a8)DU->s@9#?$Gw4f7y{~60I zuS=|?dB#1h3zekDvX5Mf9n>h}m&ku&E@Y;%;g&RR)gNhXlt$i$?oCP|_d@S}Rz*AE zk#tDp%js;8Qdk|Q`V=s3ds@Tao6zbMe#E$t;CrBn3-~=wpaWQKZI1x!_;IYt;nk? zgM5YP9?o^FGI+sSrG;f>;0Uv@XLsHM$8Qk}1FhdqV7o`)e+2blwA5K+zLssa-%@_XN`4t^sq)>w6n~6Gpn7_E z-XH!Sd0@bB)|l?lAU#5xV5W+Fbed*R04Ft$dBn8r^F=#i?-| zqZM_MzlaTbHrSwww1~yNnp#aQ*7wwy2Q$@4)=&Je==|8RDTVne#~@d*7G|W-KJiwc z3>sCS+0e{At)K$`$&z8IroJ`n`<`NP7NVxXDtxOP!Fi&pj8IzPzpSo|5Y6R4Wi?EJ zCw_gH4mHd$W02?}o{F|GQgkpo$q;>zao*S>qU3#J2D}h2jKa`}-NU2eS8El=e07$h zS<3`JfuxK_p|7!K&SPw0J#C%s`fhY`#<~*qTdHc-@JR0`W3V`DzEy%S9o@cR#uRB0 z;c$kysmU>NlJJ*%ef)oSw(y?9YP=^o%n2qyRr#mc9fDwt)B%hXa|pa4 z1Gaz@u3{<3#Qy9U#FCYkLlS1;4Y0}UFcD%Q2yB)JUQJXIZy^`r;XW2u-q3sc1zOAf zW|+AH*3n@41zO8(;wlc-{M{{hq*dC^$k`%KjG|C{%a36KW4yCWfgs)-E6VD00lw2H zxK6#qBH0i(@uipuajYkvq+vwR9}3AAjnmg7}w1>Lc}IO@30f7aKu6on&`=LGF= z6j0dx(5yPe4!sV|O|~9ZAL{oq|LyK7G9ay`EsizF@A6}-D^Y*;BiememKXl?4YL^7 z@B=<*>S~(SGT7P9vC?m?yP3Ao$kCga|C9e?U)cckxhlES3;uPr*Q?Uw+}^G~w8Zc0 z^4hq1y$t@iE9s#L^ofYNn$^t(<^$u3d8Yuh&fafYH`fBMLrd};axHMR_kzmcSKA}m8urQ{wurx`(^!W+qgiaSnk9-zm5$53#v7S0_Q2ov zo65s)4_+*wuKY23=_&-7+~Juinecf@f9F{Q8)b%BPloW@%2-+o7htRPFbsor&<~M~ zw?y-`>RG;0RF=C`l#}Hpzdmd-)=|Lwr~9m&i*fQ0Ef+(~1h12GN2C$$JLC!RI(W8P z&GjbxT|vBGwA#nwXZ(=k#a`b^xB@n_ERLn?W;p$j4Y0UuLIy5o*Vt$AR;&;o@CXrA zvCP!=82bbJTaKxH$TiD)S9IfpU~X zWBGEn6yHKcIgduMMXat7=<&GPtN+1KWf;CyG?oT;l<)kpvd~gm$+gF-Cg{5B8G(#6By@Xy@D2HFBetD^7Y-(l2Iz 
zhH{zPvjecLFU?yF!q`JNg6T?YULF5dm+-C-!*@U!WJ0VQ1vwOHCO|5jrMh6F>LL%q zaG0D9FTj+2^~&M{Y_-`~S209QXG3La*-MTPli&{BHglm7W zdt>bLdSzYxqubwjY&6Vm;p=MJysd)zgnbI1UZsEOuW>Q*pMuD|qm)`uIs0efHe*FO zRuZPN9+*T8#8ux;TTImN)DjZuAM=Ft$Pn2PC-P{yh0f4^*)6Zjk4|DaM2RTz5P!G+ zDO>T|yrf*lEAU?Ouu@3Lg)3~AD#Un143?FtJ3q`LeeKP7N59O}kKa76RJ(O!W(zjY z+Qjujyp&%pz3>`uXHVdyPPA>~r!D`X9pafED~}6th{Xbn**qL2U&^JH+f2tZcn%iG z0P8AxFB^yk@}4Z}_KAzazo4DS67jl4b}^UpC%BjY<6W-n#&1;2)7O*l@2@PkTvGN^ zJGi4RLVu+*y9z)}=o9+@-Q_VkRO~Vw>=6#e`sjv4DrSW0L+yXd=4NMWZCTftf(^t@ zu@GF4OJ(3XZI|yLMK&~@a2dvm7ND`KC?E+(V%ZrdiI!%<2Lj2w&z;m zGIELQgtMHIsVv7g)}{P3m9-r5`>KVzG}l2<%39GrPL7a6WwsfI&ESOZEwqOO(}`E1 zv)BqVq5Y4a@^9sKwsql)bw8U^ZG)?61B?+TSVLYI>ap_dA`Yf^7|h-YH-)kQCQ&Hy zcm>vqqskm(hU*G9JhSo}`xcGkDAqT;CGF-{Nd61f^<#&bCIiK>! zV^!^Mp8$oeh7OQpvndOphteK@2ZfgUE`d{g)+d9@bop+AQ)U=5;110ejomkVZWybd zGm~U1*bnpNVaQ}{SzT<(R=^D?&W1oNOyV2Z7YMU%^wZ@>V=)HH!}!)rL^mAKfD*-C zEeK*+aq5Iuu?QW9I97^U!wVb;%fUvGvIAsNTX6(OW3b4;r)-1p$Ig7dyoXJ3uh@xB zULM`BpLwu2C@`O9KyMboAHx{v1f5|9#X)!C)L!<*mux4!!aQs$?(*y0K^s_kN-?|0 zg%E^&kM;VnH|WKqfRCKL3!Kd}a{lvQsI1C8{^N4KpXYRbN6*iCLt~Y=NF&8)dCTno ze}dovu&!(i0O;Q5+wL-52oDF23FWvN!ohU{1gk|L<>RCWL03O23#fRgcz3dV9(;DQmuHA5XI;4}=9rG@~>yq`iRVwhDYo2Sm9P6FrnX28Va^`aAqMyiV==$OMoL$O3%4e8= zx0+uoPp|coCt53Tq)5bD)?6b_SudK4gPu^mpSKy?g({!veGhx^NYIsp5nlqc(~8}x z(%Thc+?JBfhMKs+XFE-$=fQuoM|`pQQ(V+X(sH;f4gPP$?FcJryEnyinYW>M*_xF% z`g`hPHm}P^NjI&AYI3<;M`QU|ktkN#XF)8~3cjj%*fEy_*HTlL(8rj`u4P&kC5UVy z)nIT(b`nS7w@jrT;E*4zf$%qP%IiA5D>r1FJOOMzTb122Zb)pB`-_$3%Y%Oe-U{mz zX17(f^_Ju#wuX$`nXWKb`oaeKu8wIQ}LYRSapZAq9*0LL`P{*5`Q>alO z^qs8xwS{j5Hre~!)ykR|kZJa`7Au*w$?Rxw@|C?Tuax4YaNt<>1>Q=D7wD_`htH#+ zl-!#BFY+GFo?zq?_4R?C7Tz1`Kdb=lCYyTKR?*BWR>{9`AZ?SKVK%LnC0LH!L~mtl zZ#z%JkP0}|HQknMe;z#ByV<(~x5}n;oC+ET&9dS>g^Eb=nLJdD`iLpwf&54Lgst%x zh043u7IB2eQ4|V_!s4=^^pV|V9=yl4v6J@8juJjy)Ppc8Xez{H&Iw>j{_N_yj{ozk zl<|k@>nNXn(*2f>i%4UMXW4^=;9W?OKV>pF=`xv+iIKcB6ebUB0EMqoZa^A&Wih)tpw^)Lc>S+)+h@LJUytZ!I*_`N}m z{;cjAXqC2p{Qj7g?0x9k<{>Mu)7QPqk{_SFZ~Em^j*m0IT5sKw7i6I?JJYn3Lq5fD 
zG?egpLyhGcbB5W;TMc|XuhLR=)t#E&)$vZ-th_HauXO(Eg-VXkUny$LgWE69{TFZC z@pSaYv9>&ky@UL4g&rw4c|9H^>jxJOFDz@?YWivL*t^f8$~@ldMgqiB5Nu)|EX=NQ z8w-+m?6c&RtXb}20S5}UW2KfRk2l2m{5I}WN;`^b zM`?1_2d%94zBeaJ1~!kZNFU`n(T-Y?$MRDrzzVpgRI>^D0`)TZKqDgx`om~_pqOd- zQC)ZfP2dh?(nH&Sd@5_>^C3Gb{d?}%pgiFN^l))m8dRNpZzUO17dNuvVV8Qp9?3UM|WCXC?Dg6LZX4@?zK$rF{BlB~po1%9{gWn9Y|O zi{UU6cS98Tu#&73-)L18iK4a@_QV}$gcz5UWLq2i zF!@=Zr8{UE@1ihxAyZ^e>Lr#5gfOfFbG?&IL1%1h;0NWm+F@zFR#e8_coPDl84ZAM z)R#WvAGnA-kU?WbWo180z&hb#X|F<(Z(oX z<~o_^V(XHXMC*eq;yv1k9|Nwi3K{tW7ubheZTVE&S0ATHq!%`q{3a(+bv6=qK^WWw z$&cW8`iIBhSTq2iR4%g@b+@#*(3=E-a6a^)O&8h~U% z&)_4O<{4uR-=#dnxAYf$6&1mtB#41;w1<-&lGvzY2;8*xXtcEw){~(<%VK9AG|K=XnM0}Rr zWG(F>gour$^ZKygxglSWrSB;5Dp<|htaxECOY^&@U9ydL%;J0aTV9fNpcQlhzRAll z8~zrL@fn}T{^Y~L#^fKDUAb_DOEnZ%QM0(={k0$~!EI)@c8xZlJ2lsNy<%!Y{L!RY z-ZFYws`uke!l0y%8NIwUGB3k)`OJu6_dE-8Qdt&tGfz`o?%u>Q{IR$owiQ`iYm~CJwln1kA5?0kaq}VW%8zVFkYDnRruHQdVM5 z49B8!F1W>3IhO?M&uS$mUG;8;o$9~3;oDkszAdB&s{ zhAOk5226IW!^WWas;n=qW&{Iarm@@`<#}mK4y+%zo}y_#9FeAR!`J~`S$?Qy8_Z4q z5QpPqYzjBfigzmAYv`rKjPvs#oRBpwcXor>=F!z(OIHVUG6G2Dofn)%77?esUhDfhAuQqjN##v;+jXQRVk$;f>iQdI5W zPO$|#{?krd8`UwsU#zRLDY#)W4WorL7V<$)c!^8c7Le$J$)GWX$3g-t&ewv5o7n@X zM@88>NVELp5a6CDPk(UgF}af<)LiDN09jn&nYdlKq#TArbO|Ow1$@QcLMhgauZMTC zy<|`ts(=Ab_$WhwQGW^sLvz`psGD`nio*@Up*O}u>=@UfuF%$a2F#1I~_&|22jnIH3 zZ;LJ}3(a6LRisPsnp#;2#zObHQjdKqXZ%_DMZ|3LKtP&pwB~2eP^PiIYAi3v>Wi(i zs%UD4i|%?do6hRtDX_yWyqG=C{lHko7q?nzTiYS{K>!QJtoB~!EtMFWKHPO6E03#; zxSRDR)t-7HWl3DOlo@f4QZv$jWma|G&uZjfK|LTg>#^DRd%SCvdrQ_%Z!2%-?E7Mr zw}$gxUPqpQ{F+=5*f*k(Jw$zI>yG zJG=ugMMY~gJ)zrDss*$k+7z{sI!bM!Icab9b)KxH`mauG zrv2@I-o9S^(mT*TxFZ{g##RfF%bL;_Dk^m=UUtR3*bO_0l1yX0wLDZVcePf>JKH#5 zE(*FAQ9!>f%kW4wN{+F*Q(@EQ?eFR4-z+Fdafq)%I4Tpj7QhL}M~N7TCTK7Rw%}@Z z-gz|laxV96>K7}=5Wxf7hSkx==i)~!iFct6G+|&C)5l{2=wR)X`k zc#}OhuSl=8k22W?{46(t6Yhz>AO@r4Q0W5;;Tao&>(s7n2OhU3V+me}cCb&}Up|r{ zB3my>+4#&mUFJZDr=q8gXSo*UsqB8kce>{|Kgs3B2Xm_S*6MEl>tos~2cCw#kP3f$ z+nE057*`Ka*ie2^U8`-u7Mmt7#9HEG^BCNr`&MJQnr`xQP=aRj 
zozMffvbU@yHDoDbwV9Wun2SYZR(qvfL}x39isP7|2YgDJOFgYLQ!iPS@U!n>`Xy1G zqFwNkYExh6!J6WJT!7Q8e$Hw+U+}#r+2AV1~1VutDLUOr@*iPc~B}n001BW zNklgHnI-aGw@J+T9m%#>3xJ9+#1O7Ipdycb#3|4LY5bS{c_z+7%37AHWpbU19 z?_dE2GnEx!HTVuzP8wJZm(fqrQ*H)EnQ#TyvshM#4W$4Gp)*v6{uDK7q6BJzff&hW z!#mo8uV{l6Y}w7@dWbR8b;Md^Px0K6d3ESQ3XFYcA>f8U7zY-%VJ9J5$y7Zc_+!Nc$=>q&5H2?>;o!CGvL+PC zjZJR`RkHUaMZi<{>afVWs#dNTvDo9*3{6>K=Yvx#PX^^(R$Sr-wEBd7xNYt^D@go`2ng&97D|*IM79 z&T`(qN*8^vwFEjEOdf+?)D-vfDat#gn6032#k$N{Ryk#^UP*oES1jpr^pEnYl4U$} z|HYQdeV$A|C8%pqF%b)`Y-N@AqLHVd=cj0H%)m%7)~cnImLVu?%To&{6-YiDEb<() z7QzWOOcsEyxK|lRzoC{|8gmeX|FAZDzG^r5ZzaF1$*1vA;t71E-y%BCzR1ccubnHi z|6|vbsmg15EvHZy_9yrw!b0hm1$Z)-Y=U+aM#u)PSb2**;i8SZ?H;X$lU zmqjb<345WBcNg*hAIzd5KAJholW8rKm#il)*O!@Z59cjdw0o5zRt%eAOO6S19&z{9 zA3`Y>Vm}_hg06(#u(eS4!bG!!XE)0u{znzfcyoX?)X&d0#FlG$jSe(ZiJ_NNk(;t9 zdE^&aUUrtfjb+vex^8W-ni)-G13AM;g+0vh>7t&JZvrJ%iEm@iV$1YSp3COpz)T}f zKLlVrGkv`Y)SpV2zJ|1liOO_b?xzm06$)WK_!yWOxGJZGPrjg8&ih8Bxm`wSy?uLt z*XZu;3h~Ap0CYf$zvC@{6(n5XZ*8G6&`dP;^ib;hZ3!wy!$lGu=Z9G|V`7;(N{rzo zf5HSOFeL1)&$K zf|DqUe$x{9(h7o_)Z6R{UN{SL=mfg>0JX5zOZ&^w%=Q2-`kFk~70gS*L~EwI4C)S- zd${pKJ7hFAzKZH%nJ6YdK$z%deTKHQ0Jp#f$|ueOh{|F*_{phK1B;TxF9;Gv3=LA` zS+4??rMU7yId4q%J6vNXXAGx3vc2hp^_1eg4PDWg4}&b#xQ=R@q~f)38nUPHT}fEHhX7 z8%1y_3xOu;AKD!IrGPK~YT(b%4s0OB5~siLhPWp_D&-u_r2~c-dmXcU8wEz?7IX(E z9c8y<9jGs&ta{dCSOa!!ggWFm+fphlfhA%B`^rMJo!pv} zf$&eP?&q{k@);)#EQg&U8tR8~Z>7bsb-qqIRZf}a?&I7UydqtXf1bWBq)F6h^N1d2jJ2j^_sY7Voey?| zhne%u_ms{HLNdR|ZAx9f236J@RZlK!&EDf*P>~{){dg99&17*%){%u_2ehR9xSbEP z{!veA%V0Ho!*|0bYo1vcSMh0R;0trM$7UNX4O*_RH5-~&_;#hfI-Q!y3i6=zhlX@R z_JYw^-x939QVFWc5SfSd!lMwOrLk{(l z(M;yO7P64N^;AqA_35?JD)dgFggl4T_k3AeH$B5_MXPhpXSJ~ggsjbzM+}qu@H6&> zX)@OAAot6X_K{Rf+Sp{}A8bQ6D3=wczeGE66m3u#K45$PFU}JaLBX9@{gx=AExi@NV($#ASE|7V9&VZu}T~ zXmYtsq{xu$*V!YL#o2XoRw>=`R*Twfu8}XzDs)7xXitDND-S8~jUR*o5CKDF20Wo! 
zS`q7o$t^G2sddn{(+1g09)cn~8P|$pG>k5?M|{4LC0@zCvKVX3^0TM#3RcrR$`$8i zh+Ir{WgNSLKj8}fW)EQ#Hl%7I436R?dPH~OH`SKD@P(FPN3`=9I0Ygp6jLbPl-Soi zq5C@5T0QLza*D7Tfz|C@><88Aj&!Ak(;oIzyX>xJt=Ce(n=_d{Iu0A%y>_*u)m(2T z-cy+H=dHowC#WX`F<-G1KC$u;28z*J=?`9V@E>6Ey7tlVOwj`eQdz51*c(uU?m108 ztmEzgNV9f%|Ap)B_*9K{Wc)9t9CRf^{EII2hjxnk$&=6nvg}oTkH8&``8i-98*FO{ zvA#0KDU{l60TLl_k56htI=At%X;PVxaQ}!@Ql_6dV`Gf)XVk2 z@DZauxz;=JL3V+!_>Jpq0v_}!YM*Lr8hR%nC9*pv+S8gENxE)|H-O0gR?J$je3s)Ky5$fWqw zki|{kKF;N~y1~7Y*M}Y`ebuVvZIf}zCr-TgzE&H1cAI@LSpP1Qcsyl7Gc{H=5&0m6 zPh{h@5VL5~Olm^|>>hor`CEUO(W!t@{=d}IxRCe2yGCzMRiFQCndQNv)lV2cGxx)rgUZ4GreHohh#d>aZQg!<$ME;X?7SUp!_>8gGi|?WiSRVT; zij=gAFOj#1w9@jWXu=!H23A{0prYa`J`{t^!#>Ldnq`%_G}b(C-Ic2;H0XYk|Hr2O zvGxPzjbG83Gr4c>Gh;Qs#7pM=5|x`#*NWjCv_EYPeeMOm3n&nD!TZ2?>E6q#&|wN= zl6ugbu*3F|_EOdz>$FIxxniyCPwT9fVmy5`0z^8?p-d`hR1odyu;?yF3Z(94O$%fd zS^{at2dt-nc3a7hJA;ql{G=B42N>=etqa-8my9IOL>MZk7(a|_=3}|a3No5X9}y`T zG-6CvP+BTM&UfJ$J;>8Tc7VRPS5B6XV6&B?FQgIX3Q-%tx-PnbpLyO|0*c>re;pfB zrZtWIpgyz%M~l*w#6QXaQAWNM;WQsR(rKDTv!tXlECbqtQx=emWE1R29vDa@1RbXn zGDqH`KuRORzi|N_maFL}Y@t8IMX`=e!8fcO-UUU@XI0onXh!$p46kT)qX*mr$03Hs zK{5UtJ}K|CtG4}GDlZFvs~>Q>`>A@9=IDRY1hAA=E|p9l!}w{OBAYm9wKexaniyua z1CWE{M)*yC%WY7cY_d0Gi7jRqC?waLJ-{hD89opLH{=)a6P zmEar=N5p5(b*dEYNG##zPa8#T22VYBRDt%lNIr!&}G9jd*769F}%7GXEPb|F)w z=D#A~uevqz&F~)-eNXA@`o)@CVa7SF3iNe$wZ%jI%njBG{eu=4GBRlxl2?@~64@qCX=6nAigd?bR!SV)A)xLWEW zRtdw0_*BlJA~ac>gBk3tdEfdIPXbuw^lI?Vef4)yZ9>?FBDIX-Y?E43UtyK-6v64r zJ7umCW{t51T64@zRNvfVMZt8q47tSRwPt(Kr-F&XoV^#u^X!)Q+X>sUP+hnW>xZ|HZ+Yt;g5Jx zxqvqUm)tLjy;57cZyFzrm$sV4&g3rjbd9~4KPLEqdsBQ&#vQTO_1Ew2iJ!x(N98Hf zGG~zcqC3mj>e;H?!#PZ#KW>Ja)WJJfwi72^vDOo#27ug6f~Hdn)u$Kor#PtPC=yr7 z7nUv^q9;tH0&+6GN6tp78?}clQ!J7nl$TZ>@xr^?yKWblKe)Sy2L&|U1hc}=+6v5dTOO>Fp_A9+)a$W0jVC%5O<}}-V$KKpe z@CABUMRAXHjkZ7mV+MSr7zzS#zR3AWul$1p$IwL4Qcs1dScl$;8^<2P4Jb&_^hM4UP6&cOsVG~Br=beeRL){K zx+7JZB==A}zL#C(Bf5a=@HzZsuXzb5N~$;n_aRX>fDvM!w~D8=_dgsBBRsui56Xk5 zlwLkheZsU1e$dB<&+y*0f6(d}_l-sBRdu93n1ZxCRw-8*V=4s6Y%C6GIFq%7WVoyp 
zg>v|e_lMi;s!|9FlaDnL0KX|5?BK6ngfwN4<8Shn<^AqpBP&+l4K{Jt6$~w@mr1yV z<{N7Iq6nYnwxgrbDOxCPv*h$f-oKZ#X9`X=5a3{!UXJvF__3R z*$0fKP1KsM;|v&rS>6JsuUwp`u>Vv?|HA$9#D;drS2t+4{{!Z!sJ|4^&YeabtFFFN zzNDA&+kY?0NizF-Bb;aWV|GdTCb!8PC}sX3hlzzoYwL#f+-N6m3YU>#fh-=eGc+*o zx}1Ejxvp%f$r6xaeZ%)`uq#3SV@>kzr71F>*AFJ)cWbCQ#r!Il!)SOV&WOVD1uKHX zSb)4Nck0uzw0vat;HzjiHm6It4WlUv=iy$u&&uFhv#v;_58jo=CTpOz8w+dg&GyzA z>mBxz6X{={JQ?|tTHph%yrBAPc+mc$Ys$ z2mj}3sMmMqH*?L!vWje=f6#0D{~1@~<4;>l^_%?&)l*(OmZm`Fk&G&?SPG%!%xz{L z^HXkbQC?hi$0;qa-G5@?8Uz*0P!Tg97TtaAJ%e#ujcojfv`HbGmABpV{H3(avW?*a+dYuSLh%g(~( zsVj3Vr+cs{Z2oKZWK-BFF@+t%>GA`r&{0X@dHHIey|xKj9=2LPX$Fd5F^}@P9BORg zK^Y(P=KkqXUqh0pbI@>Qs%2-jv&*D*&zBSYkIxj>)tuDKfAN-8+AHpbF2S5^F*p=m=|M1|5g}GQf-yo#j(- zu{U^>4dBVJ1A;8Oo8@dWt|^m}`qAl9XVQbMLjL6)cQh?q_y1o&XE>vNw~us;p^7vM zf6*n`O`gIKv$xTNZB3n%SY7L%x+~E?_@*rF^(uSqJCsR~rEOz1AW|)^Gz8#>SRACe z^5>3|N&J~@4tc1N`~s2aqCoKwCegn#gr(waIS(T!#hed&WQJ(VLg==P=M`zV=*A|9 zwUR-Y><;gsHcX?@kOilyCF)QJ{8$-03cwhf55w_0o~5ER6tZX^6s4c=N)DE{cu7`* zy@c^#gVFc`64(sr3t>1Cqp2W$q)a1RFYY{?+uFT3vq4S)jLhle$?H>Fq{}3&i27U( zG4d&^9VqTVEuVRODvsBd+XJC7yn|Yhg5m5I)W%qz3t>vO{XG0ux7pW2ifx&%6C#v3 zJ`zqS9ew_yx0>dE1P#*UA@nQVKpC}D{H@O60NRyp#?^EkKl-;41nPO$82Zo&Lt}whG#R z+K-~w!(Aa6g?oi83RxAoGVr%U=L20M>=S2;f0-ffee0Uq$i+XLO4PGf=Cnxd_`8nR z-?_%!PwNL3XcXF1W^~Shku9LG(Rg@Lv zv%>F;yNJwyM`|GD3Wu$|x1TXz*+W~Q3Oum-TX(Q0%mlaZ9{Wuz9;378Z* zQmm$VYy%rF$6NdOXUhZ?96V-G3GI;*iaipt(19qS|n1W z7$f^aXWjH%R_1!Ex^iWYpoGZcrk(CV1n;J|^K5iHuwV1J`k!?HC3W@@*GsAyXN9E%oTI#IyQhmSWokfayDd&dC9Zal^*2v-|s!< zuH?=z@#9dLx{os99Zd*~|)I&G-;yKE0!gtR1F8 z5$lm4u@O$eY#2fBp$idprTNqoGH9<%p?dTT?EF2ef}bG<8ptHRkmbiNFhz#I5p%eA zt)aOp<5%DPF2ptgHGDqEV!0&)DrqIXJIr!gS^H?ZEpzxou}W5r*16c)NRoDMBmi(~*CWxz;DVjot98 zwOXdgV>AioP$wz@3jGVkzz&09BE5hW)IyG+P<+Urvr`xk%ji$43JYKnyU7}`JFtM(0U$7Pj;WIo0L$Ne2$FeXCpW#?| zBY#3;RszSe1bhyapf^>P59Bo~zfr{c&sr|t^6L}_33h{jVyiLIFainrKv_ z{xD9w1Bp#x4E%&5=)xx0faPODa4WCJH?gOf3&SCXy;RQf?eGAu!2lkB|DxcRSsx67 zO7stn=fm*@Vv)Tetpa~m+86mGGHie 
zMzRq0OsND~Zm^u>_g7ee9>}}c*YS2D*Sf>TiSyPbm~OSAiRv|;%ND7IA{B*|<9o58 zs#0^KlxvCc09WhPJVgv2WxU;{HE{jZzW6y|h!|rqo+@B;4TMcGaBVQ$Sl>%Bh`$Md=F?s_&0VU3kX=4IBIC!4Xjm1jd8GuXHXZ_Tz= z2`FggHM60X2sOEK7c44DeW)_r(x%d5)AX-OSB+EZ*xWYOb3T*3RR^$jP!8L$&v2an zC&!D;;$JaBdmnPm%4c;G$q|=)_j=}dAL()C#O#Hc*K>zE-rL{X;zf6?g2JdR_scj@ zM6?$h4MQH3y5S5JhK2Y7_sZh*ksO3nkkX+cw5I0BaSq*=O{}Y; zFwO@9Gp+9ODSpMN*Z>#vDfj~hQ$MSVIngl8=i;j~l?4Sy8L#1oqn+OgSeYH`Xs%6> z8Da|e`V^pl!~r-)hr|p~R1B2SbjGOVEn{|5N4i&fRG;5$6xnV;#O0m*m)>XSv)p#;8!ICz3S&=oJ>Xn2lip(XfHF$$!-aKKXJHmYYO zQ5$Q#*vR_HQ2eEwW=Z0dzEH{G&-Hxd(OxRca4jxVR?s>{FECWZgyVwWI zGAHham++kK&DdoSK!`}678 zyy7bF%udS-;yMd({*&F{3U z2$wnHlzbvbz!p5s{$e5e$PBMh3ESFoSqpX@dr*DvXqUfxWk_6PO(RqP`L+ zHj-dH!K*Y=c9#jX9@bDpaX}6h6XbmegIDm4?@)ABmDXT8*u-zzBkX5Xo7Kpbobw#7 zfi!1(I{B^&*R7V}B&XBJKx2l*6%)d?2wu*&%HaA5vvz?M9mx;ps zj`WJ-Mh*TqABXkK`r@mQvLeif0IN#wc420OC2#is6TIG3r41c&uM<;qw^{f64;e*0 z(=mVGIr~Io?1^%YUYdf8VlID3wTc*%=&@N+uj!v{>u+o2SP6~fp3vp2E8S$L ztuJbGYb}(*qcVv4h~~fo718&cOgVOuSP{1su{}nDgY1pwG5RIR)fGtFQc! 
zK00fhC$F678JhEyedqI(9W+qsp)U1&G-CC-#yH3a<=8yr(3UmCTxfvjz<~BLm*&6| zcnb<_kZWKWT_k_#Pg7|W41~duiU~Meu8=vD2wNZ;!k`wcfk$u|4p3#@lug7aV~DZY z<8jv$QOc>@pX_nyLG!CFwzKe78o#yHN z+$X%68J(S(v%{wVcfc1_^>1%$BR0@{!Oa`FnVDndPM)T=rkYGu>i_^C07*naRB0Fm zP4NY;gHkku%2GOvg-E)GzSNq|gC8xC<)DHbCljDH*kmVG9|N&2HKqi}0UtOGiy#TF zz#a4lL!M-b7)_TUF9zan+CWQ4&?0sQ;wcyYmOo%Q9Dv=_UN)whaFL=ZFSUZj@;Uv3 zyJ-;1;K%S66{dcah$kqMGNCz~pux}^?9>bXr5bVr|3=HGI;B}@@|bu?@3oh*INagE z&{jW}ZBsw%SAM0d>BfhoB{ahx<9NbV+d}`P^g?-JYX}W-u$l=O?5-OfVNy+<_K1b`HTzLsV!8e zIZDf8BCj_9hp^w!1)|t?+zh|)0ZyQAtULZiB`}&Tq+HlcRp1G>hwgX^-@+Llghes7 zK$W0?z?@3ak@xbZ7Je8W8`LQ8si0l{)BI-nHn&&iy;csYKWeD+0d-jD6G9SRgum_b zW}ETd6R#h~j^Y)Sl(z!nf|__onhW&~>OCt#Uo9pWJ*}?zw|FbU#VUD+0=1h;Yo3qy z;ifVT{%0P~x`VgX;SSw3!PgdkT`MK`SVO6_wZ$@6A*-h>g&%1-)PpLN4F+3dt7&`4 z-;0wO`Sr~*M9ZM{qKSB={3K0&)53h3_yxIo;z@0m$Teo`4!B7BMQvF{CeTCq)EXvR z%OLtr#mx>hU(OZ8N;taNqIsOR2=9q=mG!h$HkM8KV>X)^w$FNVcYCEZ9}0gPIRVE) zZ;KMvL(>IQWFK=pq*6^WPi?}Iv7oJ)SZF-5Maw608UMp-VD_`e(-65xsS155+qRDl zh1dKSt#Mo0sqFf{&TG%HD++GYM>$9P&ga9eO|V<;f{QpqFHK{7D?3kQ9SUo#_MtwZ zS7EI6AmkELFg$pjZJx~$8gH*+D;~DgvB5r8`7O&aFWadfHU@)Kzxv!T23q;dhu9O# z;$$Y3UCalEK^XqRcnL;-Y)gZvu<=3333Tbd^mnv6V7@ITdnsnXC8)2bo6$UV<*K(>@YJ5?@viv;S=7YrCrYeib9FYneV2LSTWY7*sqiFVlO=ktHKgCDsBPNkaTiHge3=7~Rc;FNdhZ4}4SCoo4ircK3#tL^8 zYG(81DExQWCnZJ(x&CpSXUB6(@_)2k--~oaS#Kk&GS#JE=|h)AVbN2(p-t`{&I0;M zdEWKP6KGq-_v0&UYg?f7lsTR{xrx+>Tg-;mo-e4{YtiZRkLJ&a-6pf$Y3M_yK{0eQcKkURmK!cI20~EwA_!AGH143XS zhU0Zs5SL(WSPu=UA@!hzumKXGEfEffvbYrk$w6%(3ku2BFdS8wLWxvfjKEnCLA7uf zM$=jx$2yTo?jANJ!bY1-y|71-^#s!CQmI`8#~C+i%&{ zX(bfr5l;`JGas&n+j^uWB;QUrk`wKD1&Z!8#(I`ZX7u!S&fh&S*}kG&se(ldKE86~ z`Nub{11=SMT;jveo3DPp?~|23cVg~uWh2#uN;H?QLU)7HD2l^hutHul1EeOhu>c&Q znz>QNUK$KdXofuIKV5n#pZp*fnO(FaFjv-+&-EzdwAvQNurhM3cp$T&3T&pua*pE1 zmazz^3K2>=q~K$G3O(5&h*CBy8II4kXO8LCR8I?IFqDVKR8Y&#h)ev$)`ka_*p)k! 
zuTzuACU=S(zWZUOsLBOirA+k{%Yk=r1PZVk^n*xl%BUw_*y{zwzyy62X@Wy>rK1wV zzVM+e0M`3X%4+-Tr0hbYV0-$k;MVy1fp||l z#9!7lj1u>$L(pSB30!=m>_K_qs^NB(XM30r8-i`&KbWOgQRhW$ajx~2Pbd}e!e^J~ zsk561@V($|rU!dAS(g(==ayo<(1Dee*V-`kgu7_CqwtwurG?EezgF9G-YQ9D!Gw^2 z8rDwC6Ew!Z5N?GztUT|5^>GP)keke=)>O#Dw!vYkQ#va_TjWBT2s1=g5#lMzn}@eD zjtMtkADUv9$t`SU`5fc7>@GhAF3v8VxGQFntwa8sMZN~?RBx~*p7Tb8S21ehN#z%P zQRXtuybc4v!Fu6G=!`emLx>Oyq#X`Ze|8W*lT&_{n(R!2tReC>l!FQ^2V-$6q|p|b zL^GwEwqZwXgKpoyZPS%T?n`<(>$Rdvw|MBD>fCBRGXk{KkiQ&KxgV4od>r>D-}hIw z^Q?f(t8DZ?Aa3SPm(S#Niq~&IcSuFW++j77d5oVKrJbXZ(G2S~#;_b~0r+`KI0LAo z|Aeq(_|D$OZl_t|9(@HrTp=axpa2>JZQ(WSgLmi!g${#1B~x3dMRjmLrb7xmq_fZx zHo|?}2p#Y}v?N1pmT##8X5$e!Pp>JI-NnKf16!dX?ZRi!u!JPkf=SpAkKj9;E3Z>t8AX?^P3h-xr22~Selw%b(6fNlqPOV`oYf>B zOfRX4SY*96oL|0X?Y&{3ep?A%b?x@?t(#rzp{k=F@rRYu#{X$ZE!y z?1f0M9QNTMHkdz!SL_Ho1=FwzY-Dfo7a}Xg<5@pmlclr%!S934hn6WmKi}B=pGy@f zloGWdzc;F7(2c-qzFm|Pwk-93d=D<83&P+XH}$$Tti2LPE+gXHqsKQtRPhV*(R?7q zCowX{@TmkZjRj4+RGU@)`-MJ_tG~QqHk6gfrhEtt3{96iv~PAlm&@7G`I^?bFX+Xo zj5Sm&6-@(g2CcEin7_P(sg@XJU2sigj=l(4>8N6hK)iXZ2je3Dod z*BUyFiOLTXf*2bC%W6ZLF;c9g_j0NT6vc6x90GA_eJJl4C-zxs ztS5iSitvA7G0Vi}@ZNl3j#pOm<9xB1XiM~6t*@>#tl@8eof3g7VCR>_Ay!GTmQC3zqj>BSDC*CTD)E_vDH`Syii4SKBd7w0n zru;ZnlHarx&|n%{WX{*mYq{FWkafZW*~(Md+>@O>$nt_;GWJq3H1v(5BQz8a@g}?- z&wy+g158bDcFK+SZD;=&@J)Xr%kx09r<_L>=mxxn07o=CCpGb2L=)0%Sy8MOb!BPU z0~bI?8AltS6Mn!6{G$4d-DTxjRU_JLV%oV=Rqf~KB=}n|S%0S{IX8SCaGR`XmS9zV z@{E`~J@2ws({a%8FBX?$s2q6ckQ`_I0|wFz?+jCloj4iCqZ=3EA$%s?vcEhI+3=F4 zNTAm+QSOrS$ibTMb=Xh2jZ>f*s}BH~@^?`vzQ`Sr3&9GovG6i*hV`(r+4HOSKeHz5 z197rG9F)_h7S&Mz2SRCqV-i|r64&o{7pnjAnTZmpT z7k0sQ2*Eu(ncA3_QGtIwk?v&pk+m>&5`_Br_-+EbdQlq!x9Jldhnn~a&toMxi5sB= z4#RNBrLMFAbUcWU@gJB9^`qgL<<`oQlLCV!zCaF)%)<WT$EJ4UB z`06eNN1ZZ#4LhJW%$o_5Z7E@s*;jR@dIRqBYJ4!<$r=bdQ zybUwKh5k5*y_41Soz@Ibz3)kUmRXQ-g=O-8uqXPlN$d)~WmlOFbNPJsm384+$}2uq zsjK`Jwl$nZpD9r$Vs=dbO5dXwN3JTHT>MFBe#DF-<^8KUuJ}~($x-&JN0fNEN7OLt z@it#NruJ`re({~74`wPYd?Eu|7S74}kvTuRTzbpYOYP3oyjWzi4S*mG3Cl zoo=}026tR$D4zIT3*0AXnw`CWvAVFE<|)U# 
zKIUNM3It%f+$a)cX>r3GWGrP-YN|5YQi}dpSYV#gquE_k4<1vjy4gks8hI8K_QM() zXV#HL_*8ijI~~&Hd9P%cF>nyt!gIa5ce|d(&#=y{qc&N|R3v-siOc^( z)^+?9l9%5ZC#Z`>FBzvxkps&`IGmt#`5cB~XQtpx*p2PD4=iWRwAI|L&A=7xrHGZG z;S8eq*zh*~TdEC|1<{pkTf`$T!zPF7}i! z&2MH!NQAnq2)vh5sVB|FW3oRmev6GG7ySYfGT9_H1iRqxl!PP2difbVaE2rA9!r+IX&w|`t>tZouVy?mj_OCa3Nd(FI%R2E$pw1~ z&m>Vh=@PYJGq%w8$d3|abNqrM<=-$L&ao6c0sp|iR2;gq88{U>lY_G8B7eI&Aq5NCsvIE8$W48Can6FH7@8(SdYBN9S=S5o^ zHmGnLOVActyynj|lI2sp=xVNlD=-GT!vQ{+cLT{kXxZR`R6YvwbCT1KLPcJW-+&a` z1zS%@q$N-lL!mddq^jg6n}HP)u{*rR`;ZDvfWuDg$&>IE2-raDsXw@=CRT-0vJS-Y zTkI_igsRXGiqSJfij;?mLr)k2|4}Kp23=t<9i+}+2QT@;WlV>zcm?agYIsBc$OO!1 z8}KI7gp05My2$5p1Z;+*6v6heczg)eSp!@~y>UFg#D%aLE%*X{fwRgHwXG!x7DB#k zqUOO^*w0>?&%GUBIhBzCECJmxR9yzIJk^x@uIt}EvUf&5S0IJ^ya@RUP5B^xlA?Jx z_M7(2^2u6_H&7>9?oewgY4oJo;=A{_?`!?rck*4Wr+oH>5|A%euxmJjZ-imEfvv<9 zJf2VB!&suS48Ew-wEhuE#d^hUj++uzr1-z(>qnO;T3G&Zv9{sGqq>K-5Be!^r=x*A z#u8!Kr}?q7*aqI2SM_zCOT2pWPo>)QyX^mN;@6fj%R^cfY5r~ervvY16e}CTZ9Ttq z%Z~D{&265yz?E!16_H{NZN}zs%*+h>9tzo2^LrLJ%o=&EbJN&Q=5sj;YSTD$DvKNk z9D_WKK!GO8B_WJ- zxTrri`u?E2!cI0o`spVudmvJDQa)p{T%%QHXV77ps0>lRFx2Ndr{R0^oVt|t zQV#pvfC&%)$@D=j?stJ|dgkQR^PSF{dCD6@=}&F0xM2MEUQ|+GARdL|^t)NsnJbDjaF&CP0SB#-bTOxveUsM97;5hER*}8r4-sON z@U!`!b=)w27Tw6-@|}0XNVm{CdMC^M^h}O3s>s>$S89Ye@FFZ_MX|Y90rO!fBvK5# zWIgE`y&|GY^b7dmT$sbkvTZO;?ts1WSAC!1o zl9UedP{^~CeNd8D({{mCSM}V^?6K@_Bv_8vmiS#4v2v{ZAJmrTH2H8GZJP4%A(qOF6?qIpLxRpr^7b?O9n=eNzN z5B+Seryyv6cUfKDl^@3atbx*r?@?U5E;lTshJ`(hcE*j4EgRw=kx{B;M9HwP6?etX zi5?!^KQty7g6jMK>S*tK)OWGPq5$8|8p->zxcP@`k1HjA3j4l3r(40T$n$<{edDs5 zrX_w3%1h4bkop>5_&yC>RN(P+ffI(mao*@ARI{`>5KOj{&CvbOsZ?^c)JN+P$`QU_ zy}^>TI7=zK4}ao;0=N7V&$=o>F#b+Ms1!_tcpk)GQFE9JLzD$tHLWY3ro>wsh;*}w zdZLSb;CYWh@hB(Ch z^vPnpInZ#M31U3og|oSV%h)p36{^5v))pPE&xHdW^>P<}xy23ZNZ$ii8zQ#nQB=tcWGOT;TbGOJUe-z8%xAiDJ6pyJ_y_Hw@?7`;;0d{at->VH7so>; zI{5#1D|U?<&eRFUtu7&1P@H&%c&g-Pf;Z?YM>`d|0J`0*RiKL&G{EL6c$oR znPRp!8W`Wb+s!WKYjFqe>lwyO`})-TFS|yZ;y)Y7tbvwc`{CRz4?`32SP52Icw>2U 
zWubWi;_<9Gi?3ss^wT&RdKn9~l}eiUUAdxojB-kXGR>GEkI9E{!Q5!{G)kIwb}(}+ z`!`{uo)6oVfFgHo#l1e}0vB_a$ljXz;Cm%&0GM)^Rt@{XcE7T`8mr63$T)dWHZ~K? zSR4eEskt0U1E?Iz zZTQo#?%A}Ju8D>6p^>BY<`(upnnGX1I_!t@=(>`K&8U zpI?*z2zkPnVHeMIf5SJF{dk& zPVf%SD}KHZpzv4MP|rDU}6UdhIl*- z1L1<~D&|8B)}Z3_k&TC*v=GNbH+YA3c@JJe8U?`x+zlUS9L$3aG#9&51GtTU(<2Dr zbD#_TOQ(f{Cc$aqc!SSk10a|m!8_C%%TpwkgDA3L9t6nW$u0L7vm~R-;ss{0ZS0cq z*3vAj9PP7)1w5C#e3ymg$Zsh4X4;F7Sr<`(?$RVEEi=R(obPJ$=_*_8$<6JHaqilA z@2zR>r=OX{mVfnj1vM~--+H8;$`5_|8#@H6MMIp$M#CMJq%>ydaRob$efcY;4=byN zYdtOg;T?*xgpKiGAxop46(1hfEf^}YxRhck#Y+{v5PCDHU6IiNC+v26OW(_Ch~>7W z9jmD}?Dm-MeT?TYsZ~vj`ncxp0`^~FugpaA5gW!I`F{v%S2QuFQ6A>q zFh-hB#Cus$2oVn3cmd{lqxFj3OE6s=GB03#wvxA#f0{*&)3O@Wp^4CfuE}DwSD9{A z_+dVTm1W-msFt!u8OSQaFYpWnJwz4rZ~}j&Y!`K;E@#jfQG@DhpItKwj^H5t4gV+G z;vpEp%3=oG7U3vpq8v<#vZ6W1JS&ch!)760ul+aPqJr$N?9g=WFlOqEh0!q5`V+)Fb@-O zH>-x%F`8Y&XlO{|Xo(ojemAS=W6Ue=M|#DAHNp{?&uRW+q_wf}(Nmk;t6EJep(F)R9P0m?W`1>$D_OzwFd)}$pV=$T{eP$Fpu6s6l{>+z>3Gwr|coAAd8Xm+wP>4+_4E}+lvIBhJHW-3SSSKiici5W7z-aV=cT|gB%e|}) z>qIe_hzW8k*kLkVgG8(cql|9mJ0lg2&sy!|8M6o@k?Zx*MDH(A%^h;oDK<5gRV&)c~8T~lS9TFX| z)bE>pj=hs_RcmeSyf(q|h#kh(a***sujmf)rhD!e1i9z9MixwDY!kbwD8|oxWY{ip zbHgv&_hC_I>_)f6`Ns8M;V9=j_e`2fn;;Qiuqmto72qNoAfxFgxeC3k6`O)d#xwH@ zEt7uK&HOGmQVAGOAMh&w83xk_+K0Q@DtgR9VLSDdJ87^wjcKf`cdgN1KSr;}O$CS z8z~id4}8JD3soGGCDq+n2NeIlN(*&kP$X`}->kbtI9;Y-b+Z2ek4L-d|EChAjd3hB z22f?z3>u>o7m5CIjNB`;X)rv8aNG#BaSfH1Q$#XfXm9U~r3S&<6e|w#*7K>ST=gO% zo2(8rtC`DX8Xr;>^N>|^;49M8)!U!^r1Wx7%t=Sx4i7sh#Oh@*u= zaAEYD{C)a0OOImAp}Oh@H~0C>f91o>J0httH|_a1U(=DdDR*vID{U}LvJ|IB<|x}` zs%Tu+ei%dC7QR~?@*YqtgH`%kORQLG65hOgOt9>E&3 z2W*BoDVssOTuhtk8OCBdG=WkuiC)os_yrz|1hUa^{DfI-3N?`_WMgmS7AfJfe$#W_ z`;WVwvA$rhoEQ+LU&6iW3O_$h^X?4XZf~hpWck`@OPK5isgR0?_y@ii6y=?7MPr7? 
z*XbghspAPd^XR474RBoNk8!LP`ChA{x2p0Py-@pk56C-bY?gC zR0swSEM*N8;HUA0@`1T8+i$5v!CsK_y=so&Y?rJ(p`!bzvctP=h})}M_a<|@s4!?EgTa9V?&0PcojFN zj8D{~P~Y(Q5H;{lV7=1ci@yqAT6}KUmyp1KL5{D!pKQrK-PIIzhEfH)uu$l!hteZ6 z!RyphT#X8X^nR|Bc|%R`qFZ6+3i-|8Y7Oe8_kb7doB7e5!-rWOJu}{Z`CQZZB7ddT zFpX-_0P}`v6`k2rt(mf)c4IWl)_$X1vZRuU(_s=Wg%~^rLvgFJU5uk{u!gqLE13^x zAqK-`OS0e{(*(c@WI!M;wS}quEZ=!?WrMO`B#1$@3~R#yp3GkAwG0yP!NjKeNj#t| zbhn}b@}*INB~gwyi)BEX?kBp6EPbzW&$PRej0SuFt-`PDnz&9ypr||qAE5=UX6Kko zA7gYf6|d8@d}06xrhTD#=C)3_mn5qC@r%wt(NtC<}k8ZJeoMkKAj_fZOb;Jc^gp_Qr5|UY>`)@he9A%&`1yHTf9$ zNQ0?`9Bmf$t}>dqONl}9l=<8osZ_U(EclKkl~f}UUO_Lt2rZZ2jA8Ou3Nus5z$`f$ zcJL1n%<8awR*Vhh+sxXUtLWUq5#F6XWt25ox*&&NRv(xr=sK2>k0gUAaocPs-kGNU z!n`cn=zh*&uBtZ6j}|F!ZRco$vkg_kBwmjFi&vG$?zdEyUU@R11)E*aOBT>fSGF>Q z=V-cRG5Z6fQHXG|v7MaLd^T87WCh+!X+h`l3Hwh!=Z%(n&P%quSjGI&EiJ5 zW?3Kj9#Yb1v$DvxzOa4Xgz#gJ2K-4hyiUGK((o)0V;Fg;!8**(|?{ z`xSl94$W_clRVE|hiQ%3#P|TWaT;`iTeumMAdL-Yb>SWSOp#)O)J0pcf)xgU30r6m z6ha@_5#rfs=nl8B3N!=-QehPD$_FdKbeK(GtCe2bCZ#p|O`XLr!9)mv4bU8VfdV7> zY1T+wg{$<=+z0@D*k`_vIq?wghBeR|%is&6j{smmpTCR{TlBm1m)XL-+7cGD*kdYn z0z9}`ZR)q0RaE-g%2}RSE>m|cLOCw0(iFLr9*LUbl$=92`|-B#$WC`yt=8G8EQYwf$p?S zj}!}7Nj?t}WmEiw(UwiLrtpfpjq!;;!z1zyCGgHgZWje(Hs#5DX{YJtGMZ%G7f&$V z>_}7P5$Z+X>4}3 z4f)m_BYyC8uD$u^3KIQ)ge;X7mZ9#U4|1uQ0vULpcF+Y3XH`MPPLPTmi_jLij-BD* z-g{;XS>5X4Z6moMYToZ=hVe-@hFm;^ zzi4~?j(ImgBiT$-SUB92pXC-aUhT}z@NlRpkDvupa0c7U%Ckn;57*Nl(nIyAE|fN^ z^BlXR@w$Ijw2w`TGGFQq#AVAxqqA{F`~(lsgCkfh%;lO=m&fs0>;p6q|5)yLM&#er z)@p;T-DMy3tG!wNa%6Lo8##i9b$`EL#zb}P=W413)mz^%c15YZ=`X_%!W2}gI)J^DF@-Qnd-Yhn?n`k za~8o3xySe+%8M&nZTiPRawuc42>z_%{v*E4GBXP?0@`7`QG|=r_2EbHPrNcmcbG zBh{mP4wZnT^pE^Q6qhe(6ugA0P@45&`!NuAlaHKE$uu0B;ZypBwo|e!EmoKb=0C(~ zi8u$Zu{}1V?q+XeJuWaiIAif|IB)*sTgmb-bhG@)UFp*xmF+O*ibUB#o|00ur$Ty7 zEhq+-V;i=f8i1r8wsl1|1nn=rDWY`=Eyx=%H)whEmyi)ri-L#wuLwvF-xl(v=(|w8 zh$|qj_}^jfko54U!5RMjie&kXavZnxQ^&9r{z!~>Plrr-$NWe4xULl*D`=WiEhE+W zOYX8CJKWpy9^_tiWfVpg2$oeLKR;_ceW&}zj~71f)^e5mq5e#q)DCF#9D_uZX^P%> 
zlV@mCSU&CY%o5?sa&4;-Xq|$NkPVXVtVMX~-?mr2Yl^D1HF+Q*TZc+>7x%L#M z;$-YirLZ4ulP|$8BgIRY2X@nLnQ1M|DUcJeicC<8YM*_-+9o>gSpT$sw4G<&*j>EO zda}E?LHr@|jN!boKGOA;H#a-$CEbs34=-+X<@>akuK##4A8Sm*Z`>xw(@*p>y;bJ0 zU-f44sjO<&5bx#x-c#1$%HQr%^jPi88+w|H z{l-CKt9WnT_TDjSi#4K^&~d!{EO$Z_UywEMQ@X90vqSoHpCrDI6*C^oi*!rVS)9^B zoWj~T-8~hCT5{wu%QVYAGZSOUG*&+lB_1m zK2j42-f_kcW2$6yM4qD_v_!;;=d@a^JaoCUqUy?h;vn=|{o>4A0f z7otYssgf#j=tI-E0$)}=)jHQA22Dd;i&c|5T z&6+}IdPM(FC3r{AWC4w1H}Q>#mCRfLsWQoX12-7o3v ztQ_^WmR;@ljq64lwKKXJUu0Xc*HmOxSwxhDK6FnsldGwe{ExoFYd%HmX6+O^IoesW zf5eM`3ZZckEO=pv*YB>qxbMo)R>8Z2Hw1eEDhG6qS`pSRs8U4FP%Tgt$@gn!U!qOm zKf!7KR`hqfaTfWRoAorGW$8Oz6>`euK5^|W=;<8iZs$JjRk^aU zs3&2#F`a+rEh^^K>|5v0{7c^_|M-y`oN+$=h+H6_n#^ab@{3YY;bxEQ&t{pBW5vd0 z?=tqA8Oqwi7}rQ+jy~Mj?j2&LiXi#e%rY~SXZHP$HifU%E!8~e3jsWhr*$fzbgE5fC z2D6K@y6h&qi(GI~4Y|pZuD%zUxSz4WAj=_3G+z$?(pB?-`Ow3l2~?&zkSoiYYjB;J zvpk zu@5w-(zKoo+9?*BT`*pGYQ&3N%J5l1r}9_8S%~6y%{a3a`f&7nV ziZN2B1JIp1Q3r@{ZOzNlKJr>nTW+;1bv!Yf~Kzob~i~_cu$* zk(9?SvaUFneZ;rQW#(eZXp=gfWiHHO&u|Is!#sFmHr7MkhmB?03hx)pvTt=v;4O2y zYf)-|*-9@?E}uQ-Qco@CQMuGrxzNw@-M?bsRPBwjS?OiG7hLX!m#_s7n9ZoZctTU; zdhv^2IsO7WK>^g28SDb=qXKA)1F0K*0-)Y76TDC;LdZgk@hA%Lr6hJjE`%+tA-L!X z9h0e$0%?#5YhgEhhInWMw`m1_hqAC9*U~{aN>17ey>Ty`WHYc9E>ady848geQ!@{E zPm7xDH>kiKYyBxS1iZgM1Vdb>p&?#%l2L+epoc@%G7hD@NJK$ksdBGbDrLAT^ zR#|&$T|k^hKqBba8m2=&uEI7DrJm#e;X$lL=`@R`i$f5Ky-9Ud1Lm!9yQshj~uHbQj zK8}+Si-Ml{Z4Pod(yiAlv3xi6=0XVfIIJNv4K$Z}U%E>;zvtG^EMM3$dwyDW;iar? 
znKkow=KYyxFZ`!4r7*>P(bY)TxK_Go>$tAPo|d0pZE=f=HCW}M_vYUp_ktIWEtpY& zwrk7|J>(f$D%?H@Cv!i`8X;-%io1d5g1ZE(?d{=l;bk+MhO)78t!$({0UbZ-Gc8rD zJKguR?TTbp@yV!W`AfWz-7Kb9OBSUV^`$%NBX)(I=NL}OJWpA>6SscNZps?*CSjKr5(=ZtW z6YvFG!vYu&HDCnR!DDz%)~69L#27@aWL0TEZwxU5VF136cGFZCw!_Tdu^4#es#6F%T6TueD+z%?Gpn&ALG!_%;Ev!d}x=tVPpA_G>i z-ZYt(>qX7|`Wc#of-jY8*cyyxDcBsp@NzI179z1RY!lR@f9Ovfi|z3^c7|850NPV5 zJ%yj(5A&mUm0Zq(T^6>;Kh|d%j3^l8Q$sm!%+^;!l-5fu@$@s+x>NF+iWhNf~B{*b+-T^UCgWr|3l^YQ}LgI+LB9FZ-gFZ8CyRN8zl?NCE_DG?%M zQ#eAMa1dJtRvZfdLO!OkrkI6$s1Dmn!{~qHg`3ddJSwy08N*LLrBToe?&AggN|E$~ zaw#2xu^R5D4KNRM$j5eaEYt))nMx(0mRV0$WrK|o<~`Ps0JiZ!N>bzbdZnLr5$u7P zyf`Mo6Y&JHsiZ6ct)Vi+U^b+|1Pq6_7_RPuPKs3v6=0el9xgbO8V`xNJwAj&L0<9H13U~bdY{OVY&1I#{?>lJRUyO2Zd<1O zXPL)$v)KIHPo-EVT0`Sl5S)TJxL&5wEvCRWD366u2hyyy9j6@LiigX$t^S}`|G?;w zR{_@@W&AdWJ`9ne5I#Ok4KEe&)IZO$DR^ao#}-;7%Eql%tu=TTT(2yYef6@qPi`;@ z=%BZ_^P=as{7E@A3T9pSVDL4v-%&Mr;qfAEU*a1(m-lXZ&?B$^ulZO zo{ff);;j7F951WT33G&*AlE@vy39I5`ND?Yegch1Z<1b$8lu}whH+Sd&liU9)1J~3 zY-*jY#;9LugLo+~;aYgASg;IOjRERH6|FO=zF9|KA#RG^W@VPZqFFQTAZxFdk#j^d zQ3sO5OW4dV(+}E$iDHZ1ANm=UjNv7o>dy2{u5Sc15q}zc zXcpT+Fy0&L*{(D1ex3CC zibvU~M!-qj5ADE8Q?Wf;3#;j-lAHPZ>q5tN!{58Bu%3UTsGrR8n1b)2JO7h4Vzr@A zwJ96T^;F8-2gAffJSCmt7v2){AYLhgTfiSx-iaSHKgcgygzP8xQY}QoZj=!Yc>vyJ zY~%C78~W$)x@iwX>RML1w-pS>MYdUb1EU6Y&wrHpDhsWN&Tw}%Teq;{VKE{?iB%Jf zHm;|I_dS>OS9*c^H!sQG@*Fvvn#kVN8VY!Kva!yv8pbl0vXxDTBq&WGxJHJsdHgN? zE=S0{R216Ck9-Di$NI94@KKF5lZ-iZ(`+b;80E-8hiNzNXLT?dOnSibA&^~VCutgm z;B-i(qO=+jKVcs3!34YpnPfr_aYb(=I+7x8Vhc6d`mZHU4Z!+(e`Z+buz}ubVgjpa z2FscM zu35}|8lTAHq7e*-ZstMB^sde)ct!5bdqtgPfs6w;oT2G36bfk?PJ+`=okjzrAe;tv zFtH7Wp${)7-s_DBEI$R0G_1B|fpZ|%zQk{fyyxDPI)k3bH)1Hfp?fqE=F%ql2``~9 z9mjc87gG5{{vWRyJ}o97PL_C7bb54m`&Y{ZOMyMu_p?1d=tFQwaFpYdBg8gWQ~ca1GKHuChXW;738S3R*+aGkSN9+oYC_Smhgr;4$z;J3oo?3LNr ztgiHCNodg)Ni#1Jd-74TVctr|*eK2Y-gzfC*^(e{Qn)3{*bg2^qsMrb&dNq&uu;Z0 z*!Q*D=5x;Xnr|gDLZ6^sp||oFYhbpLqu4reLWIbtaHCRu+0_q#V`FrI@2+$H? 
z0BnZF$~XQ925FI6ytcrhA|lOQ8jp z6823oMU@r>gIKq`9^WVWnfX7`-Rfs`leNFMc0il31=5OtDedWhlqjdMdGwm5s$oza zKFU_+67Nd0p*}-bv`<(8BKUT;O`fCPg@uLt6c-NEE3t{p1HH|xg5^$BHw8S3XyGj* zs>nhJpv$nEw`KRS4{euA%w8-EM?j3M3tg$V(cWk&(`W>phK0N)cEK{rEoNtvpugNt zi}mYj^B@kt7(Zn_^F6H|H@_J*WL00gFvMRXf%-!YsKjnzWo0PLf?F_OPNqd*P_naR=4TcoRR;4^st@PT0XeWA zH-iPYQ8tB8J5gE=A|JR$?O+uO`JXvb)H8DR8pcU}q)2f}g#+}(mZbE}?_C4*ds-`EjCkOli^`Si8K&LW{&ftCAjN4%@cxVPR!xiw7{S3qKIw6G^ zzT%tsU^WyZUdlNfADg|MoavXIm43*%uyCYvyUTQ4^Zr)wx5rN}v-N(GI<-9P^A3GPsrT7uJS+2RonZVo%XvSO~p8O z%YyzsH^eR==73M|T68h<&8NyC#l@F-#U8S58v*;uKuYi-|FmE~jEG-R0dwZ-Z7uWy4@I&f5SA zsRq5kzbT!5KzG=W!e~hw;5IwRYVwVI68}kFkU!B(?8MHpIB1QJ*k|}iN${B(hy${# zh?b#H9s8Q!FoU%)E zfGbgtrPcH`0=3mR3@>UI;CJ{H>&ZjLTXT!pWInh0i7lpXndJ^~-Ia^ni@g=tV@S1a zln;%=t^>9z)?L(6-V=(r1wP z2DZXhSVZ&SAI8`iHcjzk_mzdL2-vZMenlSU&%_Lfw=A&yfCkw;Ss0JyKjdxbWA(@^ zY9ki%>G+nlVDp%tvK0%kAAf+rBJc`iVix+t&z@xGPX5R^=+2PEdb( znH4>=OsQ229FBX~LG8G`1)lujraxF|pVCGN^D@g9XU!^RswdbT=2Z0&-u<4c{<{5k zK&rV_+h%DZ{KaY`M)-(8Ig_t}PzaX8u?M}PC0GKum`1JOoMco6&f#`e2^)Y8zW9|E z$f9z$94<@CO!`Wvpf9`5dayp=j~j3oE`)Q~hDON=@C~0}7G}xSbd}0M6YPXD=smcw zGE{=W5Wt^eHPEOll%`hH5ynwvEWjS{E9T)VK2SE46`_d;lpUxW9+VRe&erGu^CpL{ z#O1EWFh@OLYbvL~JN6O!<6eH2CNZhx$uev)|CJgut1=j_;C^PO8Z2L}0+-bs>vNfc zergTr#ue&tYQSG=*%YG=w0VRk>Nx-aAOJ~3K~(uFt|`@dG7D1svZ>5h$;DT^P+5UZ z_yc9EGEMnQtHDny5!x+T=-)lGmdFWM5}gB?<|@6n(kk=Z%T+v&>}EXFqZrx@>*;@x zgI_U}oepU1=kWg?+%#lTc#$&iVq?qjpaza3j!wb712#FP2BrIz4ybAC=Uc=#l+EW& z`C%o}sOveaR>1Z2L@tt_(Bf_XYN%iLZc;Nb9NUQm?+atG#&`*~ z$z9XaQ6DTS@FgtYbH;D3qYErGvpu^l4q-F*tCjUVGJ>5KFQgxhrX(37e`7n?YaT@d z3TNnZ^m{B&pzH(>SY>#|W+@}7xH*BM>`NV`jJf=8#xw@QWort87-~vm=vSD)h!#Rk zm9qo*JIjM8x`m=qWCZdX&E9d8xOE$ zREdVtTG}f@#9H$%RI{M;fr~iJZzBJgyO!2qoK^$=PY_%HMsaij0Df;~ckeE_C@~-q z+}+)w!71+E;_k(r!jHSVYq8=`+}+)x#FMk#nR);By`VX^Y2;Hi#9m$vm$Rl2hW}uH zzKt!#o7!|%8w>IPeuAZAjJI1(O{1;qBdS;f{Q3tPRxbLbCc|8;0y%6wOJ^6LiaMrt z!7>^^jnz1~L&M;PDz6&AG`UU|#TUSFF62-r)kiPQih*JSA%ZUBSUO6>EKchjJLNA{ zHei53)HUU&wq2j7PnA!B=`OiYjI?gK 
z)pC_VL2!-M+m>NAyw)F3XG$_^;&Uj(3V@-;;bPjaoY;;kqEw3JVI~Y?K^THVc{mHC zr${)3YOCdHA=}JOvind7oe*q)`DHsm0vomq~Ec9f&uzKdmd`3b+8TO^K3r5^u!K%R7)jV-9MleJv%^Y-mJ81tK04%cBu0J<&J$WAEG;-b zxLU|u|H95hs#=*>_lf`jijKQ`#CjTt=)!qweP5Rms!U8*R$5=wx(E1#3K=Al~yOz2j2(LQf%dI zxAr?Vy2kf$$v-obU(I`3>v>mgg!P~K(XodYWL@>PEKJ?jgo@8D=pNy}EYE0Y0Ahg1@ z;*dIqm0>WAQHF|>KI{j9-jY-rQuXn$N391XkITi<#f$2fu)3^@E5OXW;ol~tHZQK` ztM?eAo{I6fmFm!O_(0_#zpAbl<64}Inb3w7sJ1>7AXdUWa?uxZhlOXV1P5uR8SDgWd)hX6OJ-}Iltv3#vRbRPH9DvJ z+kBym$sgK^T8?Sv0C62^i1~6gE>U-}ww3~RK1lnADcdO;Pj~nzx`HN+QRn%1+dZ?f z%P+IK|9GRK*iWNrx0WEUx}LkIXeR=?1zpI#L@ilGRt@8DHgAbtct3Vnk$gnk*dnNe zrMbl}sB`86Syj5^CVHSI(0%C724Objx0;DyGt4_oYZ}l?X=W|gL1!TznbSueRvV2X zqL|RhD=W&nRzX@vRpck-r~Xi#WrBPvyQxib4)uV^bOHYdjXd3R8fjnYmo*7eoGRCH zU$}fH28&4$3Jqx;9D{Grh(@cgER8ZLj=hF`@D)5#SBH2x9E$;b2aCiSy1{E;BiWKh zuyyFcepp#Bl_VZ}OR54a+T5v`A-@MSr-G>?^g8^w@}vB4%g9it&1PmhD5<)rYo_w_ z@zyYRdqO;YY^D6KIfv^H?QQKEdkCB1qj^+bvxd<$Dytf(?-WiyEU<#DS^9RGM{%kb z#mTYIg#C-nSt*tuJ3|ESr!-X=M?y7BWYGh zGhnGQEFIRHH#2HrJu@=BB3{Rj)R@ioTNivE8*5p5L)DX(@mJtx23rhnUXr(k&oF>( zfhIg&s|298F|r^6XYe+Z#1|R!Ra0JGeA>e1$k@@jT1)g5DNNV_=k`@ z&ilc$0+s&_|MTIV;Qk?l0`K~L_S>MBS?>Gp?e2*=3p~Be{jPkT`IhOtsi40*` zSduM}kC>bNr7oibw_+S3-coI4LAA|($%xl;>pk^V-m}vAETb9kkE_^UxC3`!b5$AUL2hfAx-JU1UyCB@taViciw1Ir za?6eKA2pT=({K0=XTxd?XZNutKg8aQ6uU7_pUgtVTRu#WbLGRnG|FlQ6=1N?*lzq}erCy7(jpp0 z7v*z$3UB#3zQQ&Lhf)xER6RX0vWr|}-GI{$sr_^%!BLg0M!>s-#-HCiV&S1`OZQbC ztjO|XcWkaM8t?Ucl!sTwPHL&#N$aUH<%KdC^~jrq|JepTOzh&$PBJjAQBVHja# z@gF#zAA=(N7mvl7vJ`fNA2^FG!0)Om2t0_A9b@0&6irf}MX+zaXqE9*w$B>`CB#PY zG@zP(#BCb+9n*~Kyd5WK05#weeKD19r0;yj{-kyZFsfKp1e|>LX z?8}z+6TrEACGt3)=1I?y=wnuTH~ zkxqB;m1*d${DX{Y*w^;Ofe_6)*cL(oEyK14O7P*f;{d#eF$pw%kG&;$wFX9SdSW}} zSBl2kUO2bYW`j9@YMUL;f^6_jf9H1@%J3jZPwMXYFIQe%k#jeB4xDkPrIn`1G!{PiMY;``+wR$j_hgOA@-I^~hS9 zIV5MWOV4=e+Uu#DS=ZCd?U#MW{L8$a72zvsp2{fTy(wp<4{$FN-pp@qCHJtLcJ2?> z4Jw>5D5H)x0Xl%ceVlfW@0Ro6U+Mzmvj+=0o;UkFq$S|O=u-cLJ^9j z6KWg?C_xYLo@gy{?Jv+|cz)DDD3q*=Lz(-*vNE&NNY# 
zy*HM*K4A!JMt50nrl>G2P_?id41|}~NE!zptq9mnPO*bM*M4F<+@jT?zPyI*LE2pF zjSp11Y}>rDC$VTxsPJx8w*}7dFE{=7tg2lWk`Cd1GxL`=IwU0$HB4^$FV@@vhlv zpB<_AiDhZyp#nN!K2K-KG+g&-8pdf(tu||BEYN35o%h6k&`R&gdP6i@zT@P@wAEX`iVHjn===dRw@vClCE>N2-> zSACW)IngR$<+VCwwn(3rnLBC5PiNeDDB%0#8KBwiiH` zIS8P+tRJ1AKH4RSRdux*0IC`POnKEZ-NkRhd+#z!P+bu7J~ z9OuCzcJ`VVx3@*4n{){gq9B90ReQNgYnR#S<5JuGw9e1gsy;qPb~gOO&f;aZ)>g~d zgrjZ#_6#^;+_tsE4f-7WRfy6`*oxx{t+l-)b7`6_2D;f#vswdGm)$lKSwFVo=~nz=D0$}=^y(yymh32%nGn5VC2fO}g?X41~M z@VI`T54|e)<>%)IZ`XX?^L_P~n@L4~bx8R){Z5vV#WFUyuDc3l*LHVyFL#A|9-6nZ zwtLT+DyN8VpVd2in74{~CFiX-%q*9^*?Y-k*x|;?l4suitDTO%;>NBy=iSTv=Hff` z7@NXwwJ-b}Pe@y4ZlcnD$FdveDOW_ZitrEAhjsR}$^pHAZ-aG_j(8u+0G=Rgs(7d) z-df9@lMH2bNO%xwi*A?Q0R!|uoFjY-ykGqy%wFQFZg&TIJL=ou8V!Pu)B`r?E-dJ} zuE-*J`J{r;0SK{6!QIUS9ww5$wKB9fc+a zLa>U5;aV`PX9jDkuBefIl5>`naEmGEX|J?_p&LzLLzy=b-c&rpL{IcH@v#MvtJ;I!}F)f za_)W^Hv_gn$6^38qcz9!}NJpQ1~s;`fG}s ztDg(J?N*)#yt=i(SJ9lqR`XxRXZD@BwIetgIza*W3DI-|qH&rEgbcbUJBwTLnCz=M zMK#Dj!+PVtICur~g8~=_P3Q*AKXT<+7NxDQdriw+rOn8ef>60UPE5{%9)7)d$vyFzUsb}O*IVQ;C zz2vUver@eAYgyT17*&A^>H?I2f^-6#U{`&SV}zfd{rBKSfeYXZ^u;q)D|t`urL*c! zd6aJ!xkaqAqt=HRLWoDIMa~hiTlSJZ9Lblkg?y+UClb_maLQWZvMPtaLkvCP<>)1y zP){g=8o@YvsNUdCC;=Jt6Fb5irtmM;j5Wnnn5R0Z*2?6!;THDR8fxoVQ?`+fgGca@ zvcW6rc#o&P)4ydedAl2Ct7U4FRoQH8hVnyvC~E{8z)3OOUkkRydj1#Eu*wI~LGK<^ zSq1WKY%#sXtuPCMVIWO{ht?PKPw>iR=02!`Riq2Ps(MyqxP~Fp!9uZ|^#Sa@9q#;g zEvRsClxJDqFC~Ef_wvNyFT8hl?Rdfq@E;7e^|w#5wa;(nNiG-?TqW>I@J+1-TgEPi zrrCcQ^COzsqa62)T-ryrDym0NeMhOV3IUnU0k$8S(2x0k_dPPpx;ME#WgW|YlXEUR zD!y3KkmR!I9a5(M;^L`qPtIDl5Bsy}%Akp`RA#D~>J`{wy_%}}%St%h8m`XEdZM(t zLWg8sIfBYtv*dJjU96Xz=m(b}rJ6PTl2SI|K#IXi`CMib-UE_}Y+sDK(7@IN!?V)N zj;fd6;q>ysKSJAhM_A3=<@g&>(>%g1i)++F`v=Nkc@g62Ykq}zh=5i0r+S=-bq_Hv z**uYF!^JsYC#Woo%lcXmnOj}a=3@YC1Evf)!qMA4jE|5rB+1rtv^=Sbu$k;V zZpI{RjOSQC-WrGSfsihN7q>rh6wEqptoJ*M;bM@l2Ni=DbqTj&G;KErQA5>Hgu-n; z$PuZM#h)~Q+M-7dg<8}HyMcz^ytl=4^M`M$tS6k*hJEGdWl@}tBUD2c!IxuY8~HuX zRXY1t!e)Mm-4}wm{2PnFLv5cE(Btdl+vjxtJ=U! 
z*i5-_4@+Z-R1V%_SsF)8sRBJv*JMjoIj2$PTzj72f}uCz2;|jPkWEDpXFV_**Wz$C z96!jdawc7dm$Ie2g=H}kls1H2!p$PT%moWT!x`9?IWZqL#Rh5+^u)4MTxoKOEFhbj z9n3`Wm-Qbt(-w>Myuag>y$0UN$WN28t+9dX!4W<~6{5FlJ57WOki_?}c09MYkur!aH+0~A)?D3z3Dlc!WJB08^`084DtHiE!4?`o z1oKoUI*(=KHuY2`h#I0JMRR|)7Z>9>t&@W*jh)q|@aE!m)_RV&AGnDmJPR$c3*6=7Bo z%cREynWA=^xvVwnlX*ZKS8lIn?N;N=3D$kOr(%^K{Lbx_%HD2P%)LDIe`)>o8CG?X zY}~}>(B9StF8Eg3Ep*5j0W{ICo%?BU-_X};ubOD><}*xY_2bXfI@F<;h^9Ae09^;W zET#rfs;v?)ZGw52JB`O~x6vyw4Lri%QFbTC+p)6dJWg`16E>ooX>t^F|4ILfcHsYWAmdmjW)wZn4$zlN{c$dclA=l!EDH~Hoy@zP1xvP zYzs?SORWg2XGi}b{&lhsrVO@s=OwMytQlLw4#|Zu**+FM*biUud2BHL^0_PRn$egNLAr9;rEaQgqNu23{btQV-5#SFsX25i=%p>ey-TiU zYy9_He~S7TCECh+zT#{R|I8NgG%U@oLnnFx_o&a25Tg4TPao*4V?d`CO4l&4d)XhRyJ?$`3!`B6fkz z6hk#cKjEbBbXsMoRP~3Prk%ojxE88YkO(7CWnrBht*sAPYxCfm%m9ezts#z0!B@r- z-VWj+o{hmDP(mNdZTLbyrEE&IE}Q?EZR8E#Z=PN}fE|TB4u^fP{h2RCZPMn-BAANh zF_e3FR~RWvD$_QaF)&yJEt2_s%j6NU7J~5yG(a8y#MQhxJHyYj#aIe$cmiYa8Rft; zs%)I*H`z^*Ulkx^gYhkH2B499{W9Rgn zY^JiiqA?$YQ6#;l2ULj$Vm9o?Lo`&*&?lHZ)7DZKtsw59Z`vupKzJ`FnN|C{Y*9nd_lfAP@gk)>>#RotNGq}dgp{?P0Bu((JHHJ#^(6Iq&Z3N(jR06WY@L+GJDBOvboGr1t5@e zsiUg1`bq>1s3FXzTxuz7@sapId&DML4XVflb%FkpJycJ)tInz%xQ_$y9oRKY6&IlZ zRkns(MbL+3)H|4}R?7}$vvyrT5cto48z2(pO z3)oA8)WYl~xHbGgn_(^d*~#y@b|Z6F`bae^WSsYycfIeP{c_^0gaoZ?-k|*Xe5GYK zN`sSlkT0d;S~5GzTHE|Vs^wnITi=YJsZ^R`K)`amj}`b0wN!C6fXc}zIbUp*53MKJ zxBLc&H)Q9%=YLiXDQJ81%PU{%m%?0Td-;@ow;squVTXe%O&x>Q^bek4$m+6W*oF7u z2O9ysV87MbH9M!Qu`nYkK7z}j!%@Y>H|rK%)GERm?5parEBHHf;&=H1H4`Sm1FQh^ z;JFrqDOgfm_EnV;u#^8{ebrUGhC{I=-otnp48vJTd6Iq*@rkS!%P;>BTg^Z#ojS@1 z)?HgbL^0Am zLKfxIEATeZ#k?v7+{Mx`8VBq3wPN~A<(ALIc={7g(?J*o08gkZRH5=LPz!>~^hNz8 zi^5Yp0$s#*QBqot1KPjZMC@j>g=y4YE9!8;V9%wvSvb-s(tfZ*)QG;&3fMs|xS_s=!e&s?kQ@hXvB@c+TqsYq3tcF<^1LwiQ6+zDW}y5`ImvsoqHR;{ot_V>Wm zkY{;M)0KWJG6w(vAOJ~3K~(Iy3ALee_JN3UbcdRktX?b)0z;sZrYRzIth(5w#TJ=aiktodK+&a^iv zZ?n#(El=%|6PNwQ{m5IycfeBu?Baonl1;@4d?zDmE<9Ab*kgKbEaN*_A1V&zX|&2x z3vdHQ(|GjBU9^~CcB*%obG4H3ankSsr$pS2!xl|0D@suu7EZz!LUMOQ>=)!bOf 
zU*jnM!bUkY+MKS+W6kW=0dFFE;0;mT+-HBNs!>gwoi;zle?Pajux2T_K$aj2ol5jb*ryj$Q{_uI%p)Cxzi!>lBm zC1at7%nN?Xlrf;G7NF5B-)?IXb|(||z+Q;Qyn21sfc}S9Reg1x{N*QcKn=IaklpN> zbvQHIUPW6iE`t|)2Ka*)h;A_jw_zK+q5o(DxA;YUF)e`-lvfU;?NA9DtG}rP`^)tt zr=;f9R>BdzoBgx@RF$fNR56~!GI%%D5*N}2wHr#oPi+E^WMS;R+93avmFcqj6W%~Q ztU=|hzM_QstS(R{e!*-O4L~t;4BO)q_>Pxo1zez$R7jNok4zF}R0Hi$P!LXp`SOGP zUo7j2WWieFfG@Bdx4=-FTR)}S;IfKGAJ&F4I0jGRZCu5IVK!TiF}{9QHXd``^Sq$Z zGFpD)73>4-1KgB)aQ9^XV`hRSlf<$V2o`G&tx^MOqHk>sPak^i~T~mWRK~Q zTCSXG7&d}UxBwqOPgu`p!hIP_xz$9R2czLS=0=Hy=m=b*qL4_<;3}($JK!-~q)YIP z8uMf$nN70!2gTW&8l9bY)p-@=IZbi0nROdXXi4qaGUx_-crEXQU$>cyRpZUr7%ap9 zhlr_}RU}$%GhSPjXt~-#>Bc5UZ&10$7pw_E&ac58oyot#pI2qK-R%+vm&z&pPqCgc zb;4dmOfPjdW?kW`F&iRSEl6@xSbHY~l@XUl% zkM+h^*|RpMg!{LgDJiRxt|e{Bxu1P7>$1tL&sKGLUsWVi)}lm+hf(Sylz{8%AiQGj zaS{mhP&n0rtFRdc!$C-b(PALY0*7p`B50(jpmxAI6-^Dn32(7BCg5Q<6dLoOKBY(f zwnX*H8j$grKT-K*Me0O5AuMo**4cMUHQ?2qP2>P&K$^e*C`KFY9TXT5@J(EiXUt-H zwj3&-tCQj~oqpO2X=7m`?cHID$#~MRzt@3J~t9Q`c++3RSnz;%psS?u0 zN2=E%4^)%0)ybU3u34g${ty~twvMpW>?oq;W_FvUVK{iP4=?~ohUz$)rD*AV6Ak7Y z@q+an-{3WhmHmD1%#O~OT*p;;0kWzJR(-54SPM!(6LnWLQ()B60`(3?ls-zEp?i(_ znlJ-Zu;maZXag<8Ml2SOs|i$?Zcz-bg9hRomg5PIyw(aUh$*>NEwKH<-tNQHmlmla z>MVQb^x{ag@!b5IvdfL`yUqb2zgyv=f@s8+$OE!BbJATpiFuS0C%{^20C&|5VDP81 zQvmKYKgtuToEoJ*!U7d0_sR0IIQ!{W!)}=CGk)1h>tu})3n;gLUA|q6p*yhKebw_1 zezBjlRkUtsdyS{qffeU_s56Yi58_`{L{(F@M4D=#RilLvNMluJ`i(7T4_SNGleS=a z-VO_}Hh7W(Adgxq@~B{$z_$gB)h@|%>CcQTt+&iatN1s6pQ!5F<>PPxeyBVw3R>YQ zIa#hCn@T5w$FPz{s;_VmG}YUBoHHRq>C*7yE_iDc@xQ3Da4H_99H;^v)jBAHH>_;B z0Yl-T`Ij$UJ*2UeN$nw(O0&PIoT>%!_!YORyLbjF&;S|=5tsl+X+G=5X2LyQiq+y% zVXLY|338p))Z5xDtmSt`lck^aQ^skq<)razT0`SC3{V##KTcESWKC89wo`rhhP~ma z*-@RO7pxOnFp14&b-<&}TPv+EEX9-UD$6CssY9^Ye$+U_f9Kuxue#JfXhq2nby!0c zDM3v$&agmyp$f~bVht@(HN-SX1Ed#f7(`QfH3}lABtOdvVFFErn)HE5jDY~E0=J}f&0ey*`cFA= z16+~qSaX$V9)=<;fUlI(V7b{-pjO&m7M}QdLTper=y+5zd=E>=zshf>`86Zh9-MIU zeu)DAL>~+tS2iblO5q0iALoh)-&hp$`R5m*)txK+t2r8K8T^eST_2_wh7$6QmF&Kf zy)yk(+W(SzeZS+qpK~H}yP0gB^BqYvQqnV~r#?!(np$3zvr1Y)vbapAPbyUf(_*SY 
zGF5(}FZdAuVQ0Oo#WdQ^+*koVs^_Yg+{cf254xAZ3EomaNRh@+ zUL7lF*Z6XE9=ftT#ssZ4ONLl@z+vz3g`!YK3)T-=d9*+FwQ*% z4nVZ8w;CdmE@D$E&fl`VSW-mE_u@aT0Nt`xD_677jhf(lT9m1pH)pmg9G z$}4Y)x7G{!msQ%l&;NxkMC>-JipYkr?)<8#ik%$qep$FEVk~xt-K=ly6ESEl-n4`(BI5bo0t>UDWD?rS#+v^2e+GqKrG<)}89-^kgISYqpze z6y1O=W?O5#MRbAMD~12j4XD8k)`-`JJ~)Tv!Pe+wrq`LcHvR&h%-E9{#Vg}~u!ogK z$wIXzSe5zVc}#)=WaCZPFRc!gpxStvmQ#W)$aY2F%TMT+d0t*zi{&qH7UqVh5R4S@ci?U(nowR$)eXMf^WSPR+ePc28bFyFIAR%34>OHj2$4oWJm zX0YF(3@xF<&_J2ggFjJ+;EVnW17JJ*of1@_dM~2kqdZ~-L9Cc!-ZsN!YmwraMw?kN zZ=zpdk?K}UV_obh;em4^Zo`A1 z^|^fzENgmf!J6q#`lIZw$nGK2YV6KixyYjE3}^X(F)@)r#r;PG?BxGaVSNI-g7+|v zZm9O=Soh+red1hJrIg_8wrRVP8+o>8ZOT07IqZ9HmP_fIHOCd36rLLOYlZK%+061# zW3oXoG=LRw7puYzEC31Q59c6?{Glx7!~ek;$e~e?LkHj*H0F6RH*7I2ZrcmqL@&ogLZPZ`QG-#?U(Tn zt)j`$Qx)Tzjk%t4vZnJQn*+O{1pilsdc){A5k8ZDsR(t)_A8^rufDeGyt*DDnqn_r zUlufm8u`UXJ_B1zNgLEjIE`_Xi=N;anhNLDVU?dnK(y+tHQ}c)j$&kQsarw%t6T%J z@A$si6aAWNA2V~)O&lRsdwsNC2rqd5_OunzKc0N?*dMCA(x+Cc_i`hz2TR#^eSz(l zy$PReB-nziIrt1(u@=lk-PX}|)o^2Org&pYfJ<EpLf=h50U9vS@z@22H!jMSAX&=RU zZ~p3=Fvw1|r|w;{pXlK>=#pA$c85vwwN(P%Vqf)KPK5K6En3nZ+>S=h|3Qe~ zsh~1Y)c@bWdQiaW4Yos7WQbP3v&yVyJk=8ja?bp=w1sT)6 zXZ^a7dLgTAP6N-sR;UV~AEKLDN)6~LaXJl?p(=Qx9leAnvLH2r1Z%DgftF%Cy`Wj* zf!qZzt!YxKJG4MGhV?4997(^)ST%)WD3WSJC-b!mr~AGTIfD|QE26ejvs#Gi7Wr3) z%W?$I)~2z(c%F5#!eJYC>bvNkIe}kRA=E-$g8k}mij+g-AZsQb!#MTBOqI*5?;@BE z&|^Iggd7vF*uFw9sqNHu@?(%8QI=rMyrXj#;U1ik`=b0pjag04Ip_&KYRe8nKXptM zg-CHtw6cU#F^e_x)D~ZTNtQ=Gp$qVk5ykM_uo`Y~7fi!cR)-ohKO-K;(lIj@G;C(} z!}s`8O>uqC@`De7Bf|SY4SloS4^n8W*&a@jKLv;k^-0BgBGnw2mla`+Qrr1gWmkPz zP4!9~Vh`mIaU37WMzXSg)qEtUus)9Xpi!6)o1u*s>jMK`iKEt&v};CjUf=x9`a^Y4 zhp-N-!z$oUZ*5OxXl`e?wLkej#4si%;B>7C&&AW=D|^Sn#aeX9n(7DSW6!BR`e3hm zk6UOogwt$bFifw(U*JI%AwN->Oc(YKKjZgZ{PQD5tF1lFYQrAN9{w||5K+xMfT89a zx(g4Iv%g8|67fEtsrJ!dVCRF_KGw{4U*D%6({HKivZ;IsUolbkHZPhcYsxcOAUsmf z;W(?U@4&z1V%gZr(aOt*GRa&>KW&XuC(E?33SnW|snm1URr}DeNY6CY-Zon-mA_fz zwcW5wF*S#NK}!~gf3q%_4o`Vo7>kARIxd76@W!f;_U2bIKPtLCsg}1b+`tR6RHIdTikiD!7_MFwG+uCD_gq|=0m%~6hOMMur5wwfivnuQX{*9zo 
z;|I7z*Hu%N4O!Sm4W*t`+T3PV^&OE~MlN3ldt&!G2kVISW|T58e!f!2k!ffe}J_eC|N<3=a`l#f@->5TZmO2Y5(ggwOi{HGr!woUM{>7ckl#uF zbXTazaI~aL0qy)>;75I<^Ajzi?&yYV#u0lI{BX>3+=D+d*CokNf9@5t%Nz(e$qwds z1b1_^@V{nTZPe2eH6J)xJ<-nBifrEQuJi7?*)7w9QXglqoLSa5?-cjCR9}uWbN<%` z@fnHlvfrnl$efq+-W~3a%DSEt?240rSbJ1{xrLvn$?5@=fT_9-U%?*`Mpdb{SV(`u zTX|8X!AI#+`vBB_wOt)^*UmjT zw3_#nn3i)yP4zW0Pf~(eR{7aBc}`0o{2``_sGvsr@9eo>XFkl=-8{k`9k&{il_v&T>dHQ%i(e~^;CydJl|xq>|Xu%d|eApH_KB8&UF{Agjg{^Ojez< zYWZGSFW@NUl1;S~U#e$4I-mqP)Mwd9rg9s0guZx_`~lQim8_~!w)&Gc%PT4ieu9_s z;wRHBKbaq3J@legSPf6J`?kt@Bik>(t-OWDvI+>VYb&l$jqE*TYq8O)EXzSk7Ml+-zomRyJ z@MfyKRT{R6R<2m{pd4=Ym#Er`suZg?@ zwX_QKK-+JG;yL5G{{tu?bvjHN(i+A6U9VTf1#QJ|o?M&FQejbk%^a)j8;n!-4mclI zU~cwWd#X9rI@Vr$n9?dMPxg}xZ^osxC$2H>X1*HV3;p^xH9X$`*PG<_<`AFHj2BC- z;&QLBTZ=?6^TW|J&9;w!fnZIe>QrBUhSRYtYXS2h*8DDO!6I>2c7y`5s>*^CRSsT3 z3LR!m=`MYs4X|9br`veX`-fagYdodIEz0Y8tFq{h06Io3c{O~8WgMCI9HSOzAM@l? zYK1rCc-rfjqa|po{A;tHZUZ{-7S3681A6FJ)l51>k?so4EXPz)QcblA;2PCM<%Ubt z5HooTbEjhV-Au47_YGf?uZ`at<0aO?I`jet`c`|^iTConW3S&DtEw^8HV)Uy)jl8V z14r-;H>3-W@mlC$pScGEsi3t{j(-Dt`=q|uu@#=sW_6dB#5C&V_ek^C zYEc_&sCTxWD2tiZsJDs`t>_vB=)dVU+t8qN+=j1MEUS%M@pm;?EdZxCMTV+nevA1r z_g)oE5@SUg%yDEYrK+hp)ImHKqmAp%2uDNjM(WS%U`MtB?$Q>uUDgyc)dDq99#f_Y z}{2K>C1hm6fXt0~qnA$@f%nJuWhym6~Ia40f&pKZ^nwsC!pE(}rE-}qqPoo2p zXr3%8?$TLu!FL?$Dd>%|EeLq!Z08-rzVb-DHa5qp(2a$$1L7YlFJH(YTBs^%Bh_Fv zP``l#u@PN@^Y9n6hnwiY#gK&;V7D5pR?8FCK#?2A>%&8GnJMP~JpCM#c|hiW?5h4T zXuj`*6(u6%F6EP{sG~o=;&I&4POHr{2Y-k^)NYtb6=4XgAPp6&BGlj39`h{?b@^w$ zG1diogYVN>_{hKz`U5QNx+ET|LZUis$85HlwyF$jf(|&r4VzD6@Sc8;U*;?2 z4mD7@RemT4yXY#sgALXd=}#@G0gQknES49+LA)pSVDqdNvb$*Sv;ChScmS+xTLJ(& zGy9x-Z=Bk;ZQFjeZQHi_Qk$=~&0n{v%_I#PU!1dN{#xu&GbM}p-oJ}6U5=&BQNJAP z{2ud2v7oPgSBKe_BucdGq{3JW7f3jhyut+WVhReBCy$X)1v#$b66~TLN<~Qz5Ai6i zR}r{`e6-YbQ(Z|gR_kr4tBoi8F*SoZoa?3RbF7weNvr%m^ZJwszqatl^U^OJ9I>7U ztueE3nBN2#?(St4q3`zhgh@#UVncLR86ru*)E=)wUcbuz^PN@91i!-``qCz#u8H}(&KnR?csi_CfUQy!&y)IRWSPdmo;h| 
zGt>jz4iBUbFXs}81Kg+G`um{BlFw9N-+j`&FU1gnq&hMc-_Y;kWVG(qejWPwUNeVm%pInp9xK1uW4BWR0=rmw^_t@9v zy&fPFyF-A*hSn-}sJ9%ls9hIhtK zegLzTk-7$z!TDBA z*#PN$E9HN(4{k{k%&V57?Q_MLb6T$~NIohxdOnoKG1R`+)NCAF0Nxy;Xd^ zln2R>l*!DaTB@7Ia8+7O@N~1b@k>#Tf}s`kJ_y9++?yS6o0sqg=mqPrEt=TW@C3M1 z_j|WOCG3F*Q~mWJJGXC}Y~i(Vna)Z_d8+Qmb&QGeZ(soJ(>bIWfE_~>pqtf|RuDlX zCg6PT3IB;@&xA_w)Bcx#aXtN5+UoCE8&60G?WTi}2h+$hwL3b*S2fLWQC!m7!*$9S zmb6H>rIJQr-#&Y`ec8IE$5sTD8>cMGxp{j8o zZx`#H_s+L8?^{x$FKtQ_?;&ouF zYl}qVXggVY(Ew>h{i!K8#U79zN}yMs;sg4Py(tH-R*kqPgh6I3FAL$e9q+s6MPHmZ zvmI)Q<9CKCL>%Wi;+SA~=pd${@5FKpy7Db+m>y*xmn5hq`8X>yg3ML~r$TmbiXJ3A zaT=6@$7%!?F>}CI0P~eDAtA;<>pst*MC%$4C)V}(3I2l2(pF}vt2{w&s@Bp$w>Anx zGv6k^6tf{WaygA1dV!;(Q3;DXf>k;I3c@}#2Xd>LH~{{tuSSfqKX7-b%U&AyC*cmb zuq}NCn`+=Hc$0i7IYt`)eEj`|9`UB=v#LZmxga_M}B(jE=?kaLCO3wL*3 zqeLo-&-n#+&}TfoyyxT2CWQX|{W0_~El_x)!g9=)F0Z&eK-hnWfD%2Nf2+~6)@;1W&@9GRh?*{3=|LUf??8t{wROQ z4P9V>@=-kupltj{5_JQ)hD+5te27`)1bl@?zO-P=K<~tqte*4^QvXT>jb?u}-d-hN z^uMwPp2m<*#!{|SAFvhBvz8Y3RY9dd#Ds6|_s+%;F>heDrj&&%9M94nq zXZ|8men(V%YbJ|4^2(y?4>u`;fL4cE92ma&Ih`JsGd z#&3UA$4t$0)y3X06q=cNRFYZJj^$o(&uC!Wv)emX$|INxIr%e>ga70{xmm-a) z#%||OeNdP1cIIk)N!QfzUenhqp=`js%(u<`*3)mnu6M?jq|y45BPt+5&UvT1o9Lx{ zU*|DjV`lXqo`hkTpqpVPnt@InjOV49bb%T$0*|1Vleh%`FDc`pFkABTfeXmoCq$J1#2L*Xu?PY4}3*)+( z#+YJnwT`JI8KY;}!?Z0+c@36@47i+g!&eytoB5s;^j4EFD1>hCp%a5>mr)Bo;xF>I zNAhHy%}&|u@-^aRRL8s2Fiz(rV|FYNA*#8 zK?kq^4RP#sWQ9naOzU90>#%=8M+x(jvteq>l-R%V5QrKDGGKce1%X%&mcSSIDcvAm z)f^sapK^fMjgH_}Mf%C%ivQBapQTD&Otc;L+`i%I)T7C}}x5BgVk}~6E$ShN( zDa7mcIuNG9WGM*$=%?IU3c(z#4D^Wq#lQ z$3}i;-G=&s@1YB1#yr+uYn!)<@08yXv!`#jaYya;d!u{UO)*K{aX&rFGg*h|C~rP1 zmM2Rh%yQ0Ci{TCj>NR=^m)57~BKYYO9HWZ{wF|9lzf~2@fl%A};x2+NXpFOs+c1to z7Y>5fn3ldkX>=N!s3`AIh3K?xP%1`Szc3z3fB zTm^#$;BWh`fBV#~F{~!i17kbR=ZB*mQ zAODYEU_HZuM`)Gtmo}+mQj=%GHt0-yX@uR=_tFQrMZ93!BY7hgcCAqhQm3m-cvSv! 
zZy4#H7e2#DiN$a@3SCryucfz!Gz}e;&TBo@8SSdP6+fWjc%6YS;u|amFKh*mcpG2G zU@9SJRBbpd&7dj0mut`wH&Zx{#O?>U5AF(n<+6f&6KU zFN>XrXPNH}k7_~Z@CZ(V1Cj;m*;D0*H6OO?4KmUFLzN26X{5JwY(h{U#}+}ki2MDY z=q&tIXOe?_5P!%!%u6(n*pdOUA5H!HZB zsc2r!oAIgP(YfqYo(f0c82;vIQW-v4Y4|lKNo{xntZu5ARGXJ#MJmZB z$jUwC3*45&xCnynP&o_d?cC53zS}i09DnJ%a0pU;4dotGu#fX8ETOYQ6X+~8;S4s! zFgy=UDJ{%{D%2X!LQW_rWnjK6;i+;{T0mXcr&O`5H_E1`F49uk`M))LK}Kf*vj}a| zTVy9ZVfXeC%9-N2ErWw zla9xf4+_#+h_E8OA>54{h>07lHTVe{+CzgmZBDAML%dHNop~Am#G7&zpK?*TEy71- z6c4lW^G{s@p3ol5>*%azsJ(u%W*cJ{REO*k?`bRl@G3h8_Jy196|3P)NKf&uLm@eg zWc#DDQA$F5MQi}g)B=cr`7jKRV0$@Dd2xg;u0sRoW^~8@q(r|m)(U%oMq?5e)VF8~ zzQexM03XW;*eNU*p*U2fcsj>#?JzzpRcMyc-T3UdueDVrF|TWyInb(ZZL#J%%j;G4 zZem%>A@CE&Nd&xhW~4{pZ@qEY&SKsU+)MUybJ!+1^f{O)x8yNCg7mmd60oNtYN(#W zP3a(&^d=|(m!Y+cl~Zs;SCupNRhUexQqNl@$dP7-<5l7oA$;$AEFELGEGwZc)P(c+n0xE}?2yiE$|v7GqgL=CM-SNY*J~Q;0Isyg$SD6!{DLQP zL8u8s@tfp?hcFeM;TgP6{ox?~h1F0VTNqX8423}n?#YJzK>o-6zAK)x?pS-4yGP13 zvsPe^v<>Yf7>ErK;T+61-{T&f8hFDw#YlzfFa)o`YiSNMC>R+1>_7H*eIBaASn#8g z+tZ7?gpLFa;8+9aMyDR2OWp)FVVF>R9dkg(3d80=v{p$=NSB zvgF%<3Oby7=oVH$a&XFx3j;tN)jVqVPDLWDiCM|G3cHi+m@%ijFx|Hsv zJ#6qxxuWmsBXSU)$v@CZ;-DH1P(#!M7*5}@9{6Jw_zpH+hW3zI58&%EMgNiqxWxUh z?j`B$|KK-lwdzYfylT&er?6C~z;!4f``{rZavQ9S_c0tdVSv1p>O5OM@KbDUm*q(A zCkG@Lc1l6{1mC4GL_tmLfbAeF;R4Dk&9F6QmXWjzAL|O*gyQOr5ksrtyjlXoB^~-< z4oajyR)8+3YQa59lv{X#j_R{Kgx_NVJmb8whFkGfX~sT&Z*P?lnTTz%JG7!Hc+n^% zOSmF8bnZ#M@b@X6#xiO@)Z?PK*K+eAZs_eMzx8C>fCG4j+Tsf%yU_*?LO=7p>T8t2 z`$jp{$k!15gB?~!*aD5=JR+vWJ=h*wKx0)&or4dQh-r-K@YNFPFO_AGJcOwjB>8ZY z-#4y{2V5>hxMIdQ)4BhbaKgWlaln{GH;uPu8hf#1gVt~uTo5Kn_Hmvs*Pt_`r`7f% z&tx@9?N^PUwdzYF;P3v$C(GlLY9fjEq_ zLN_=AyXCN4)&n5a&aU34+yA$jcdz4`-)e_{(o-YG9Ot;m4qQ#ION@7638^>r%9BGYb&fO3Lq0)*! 
zL!=}@W*G#4f8Zyk!R}BA5yGK5-LqX-oG#KzfBK7NBV@Pk@FAQ%`0*X5q0J1xZ7FdPCg5B zU*~4|O9TA8@<{Cu=wj`cOh%|xj2Rc$ANhvtu}kx38EY4pY4X>ejS0{T=NN&`H@M1G z+5f7%@xJyIh{sub^En7kbm%fd^@1=sEvWv@x_nz$$DSe8MjajCNjfEQUw}jPd}S z0CgmcTi`7CX^)4V@X;PCZaHn2me$zG2I&fg?VV6gMA|}WNVZP#V_m>*BYUK+E&&sD z9G0L%&=?}26WG!cDu5via3_R93}(Sugt7P>OUOLT0M+4v%+*<_8QnDfN*O50K4|t$D-=j_V3zkVAsl@;Dt@kB*D%*?oayW+x zP+r3DGJG;CP(`)hRmxhRXPBE(k0!K{`;Kz{OPDAaGw^OTR65BDI)lP_!N8)^1q1LMX>IdJDQm{qmP)kCZd9PTtV3^vjwz&U8FP7k}FdM${3^h}2 zr(oKPK{(xR7W4DZiooZFM?FqyZc(%9BpAP0L=V7i$i7LsM zWY^|}lF5#Txsr}^;37EAcOev}=_&FM(!o-=LwRuz9W!QOib_)HRFWgnEW;U7>Le|- zA=>@*Kj#wX5cPu^Qa#+`B|A*}S*@*?-b3aJdIB-d7&XcW^MC9Z?|5xCR|9FK`h(w~ zIck|I`*D;pk}Auc_|o&U;+RRz%6nQCtz&}!78vnIk(Se5AqbS zI?@xPH@=`TE|)XR`4qy{zvfOW7w?B;Z+SbAC*wN#i0xhdRetl1y+?1vC~KZ<$9Fmm zB2`z8P~BYHVXD#3|GUqhw)s`JtCPd8wv|hlF@D&&WtmFH?bHIJwN+Nm!ZRak)eN{Ms<#xkhnMQZpsx;{`j5ApTJsBJ7}~kJLLC^bqvJ4 zW`Y@o(>Q=;!)E)o9GBzPAucP8d>Q$qxP965EAHTn(^{5Tb98q;7IL>2EK8k@b#Bj(G z%tQ>|(Z_pq87aqg?d{eF-ytgr=1EuaLUq_9d0~g#z_!vv)?#;BOLwHW%Blt!+wqZ^ zfeOQD&WF3`yo`r~FcmuDQ}SU)*Y((g(cxCQY@Kp`mU(tx*D^S6W%YzfW63Bp?MYBc z9rJ89o*J_p|Ej@8bI6W+bwTr!It~_9g~pi49t6Z?VJ2_kX!uNrq!Hwi9{fu!;TFCZ zGC^$J>R^rlhf7HgEw z4b}M*`(P?Wl7+YM9k#>W7%S;iZh8UrXp2r`57QA+-?{EjH~XJ7b+Qd`I^xQBry6-LZ5?401Rje76C$(jAP||7lI;*;`Me5F!`}Wt=S#C>Cxy1^C zQdW685db@4L&)r3%@GLMgFm<$LGIvN{zY-G>#|v$u9_#*zeYTKf_TW2a>h5tehv9# zpmgvY;Qca6{OoY(#YtELe%R?@AcRUIXpC26xO|7Fa0x5oI~fgbStbvq0oeQmn#p~r z0wlJg>LN5H1O6bycei47ELC;7I-{ z-LVed!y~Td>ba`ts7NE^p0BF4Q~Kf+xg~|{cKW$4Af3E5-Q6WRG)I~@x`^6M4DnVA z>kTAJxIBO%FdVPJCCSC5)d3?9RdnQ2jcF9^rJ2x>S4kg9@Xp}H+)ortfX~u|mqS7L zuJ$_`r!OndQ|_sozAhSeBeQbgS72ZQA#o=_L*12&*>Ryp513UCf}JxfVXXVn$DLXm2i zT8Vk+s>-iR+2Lk?ZXjNW#zwS27Dyd&(tbGy`EeGfo=EgEs z-o%Chg`HvE=emM2LMD1Y_^$f^d!nB)2aC`Q)m>iMIb{G$)g3LTx*K5mUxkU72^Ps> zxDNZwPNvK82JVPIf2X5%qMfR$>peP)-pXU-8$Z_N`G5Ki`?v@0#uBtj(!wukfDbu? 
zUTUw0H|l~FqK5h%2`tNf|Gd|E9MuA^dIIfZxKPiR=e&Wg!dQADd0`y9r$|^!hv_tg z;wu$^*YKMhmlLvEFJT9>On|hKPNM8s-&}1aU;P^nQ~g@HGSg1itiaE9cOHb3)pa}s zT^#$(GE_a~mwn8hqbKovE@`&IaIE1HqqnMry8b<<^J$0@<@ z(mc+;C~b!{Aw?qW;=kL)CC82YlakOgVNU#e?GRqKa zsP=<9oXQ_zo1>R`25N@B@^6lHgZKIkqbk7*T%)j6KodtVBcFdo=Q?wP5r@6hAiRN< zskA#CXX5qVueu&gO=)Qzmvkxptsd~!dqJm#(vk;W;y8GNvCvg|Lpi+4t>7Y@!b9kQ zcJd86!D;ARue zWw?Oy`>!{0c}L3!SnkjEQobUcd82IA`xrP>U+_Kl4Y$I)n*y>09^z!f-^d7tzH0S_ z-}(-pg>gEUJdick6#KpUY;2|TuBtRd?{sgnUSma`VLP;?9^ef~l{L&{y+p~Nlps*a~$`DVx@t|~&OL{vrhE(W?*)R@<=vGodU*jKAl>6vNxFch6CAP*E zvJnUxVQ<`M_DP?c+a_uMVc5l2*yppojs+M=E7U-_AVV;W%E}}DNL{fwti^Qd8^k4- zaYTl8v#RM=@|UjIFRTwLQCG1mlc6ix&m7yG|2eC8+Sy&`I=_WFSWj-@MR*I@F&oXb zZRw{9;ZpC9q|vVSj$fYZb}pUEFN!lWsnhznG~}jSMYQa-N3p>W`>CUQ(6P{} z_IiA%w(?o}Nvq(ZJmz5eAB!$(U4mS&msj&_?ghE6T)LdpgMkz*i>Vft!S6U<<@6P_ z|LAn~UE@^nk>I_sKYj_crQBFUHmTNu)#Q=Aod@w;J}kMw1$FV1UEVv(JKN|YQ|(A) z`d{^HC0}>|0M>-Fcup6>{m@i}0NHaTucwkXi!Yn_N;#;1OJky;Pza&pWs$ z7siJir|(fieTCoJv+d;|Rc$;yRRSd&8n9e0udaTf)Bhxhdb=ms2~s=X#%7n9|Yr55G?B;kBfT9j>cF zPJM@O!Xc?7tDvdty7M=54tnc%gQohOarmI5vxND;DBuV*nyQ1w0oBM+1IFM*?B@;9 z&ADc(!S&$3)Ft+IXp+?3x-CmmPT1Wc%>K^#!Q^p}O$O;^;)bbm2Jg!l_#Zri?BK$9 zY%H0;DXXvr{=y_wxEnX&1L%ukst2aQ2>L;9A%i5q7x;=dz=sEE4Nb>vWK74JsudN* z^i-2eLMWVvD^v#WIQknODG7Q*VX*K5+?HETp|bQ?<@45eKcg2^!K?ruVHJ#%Oqc_E zOG&Q9ySS3>r`xBza+k8^ro6Hy_^#4t=ue$#41K2~h;R_*nTzNTnvOFG8ROQdVs?LC z1EG!|eo+u_);69&jB-*_xNpoi&#(jLK`ZzV^5ahG56@tk*?~5yTh8KCN>zeSa0=?k zJoo@JC7W*JjiSAh$-WJN;No25qd6)Q{Ny*Po80HD5Gnt`b=oDGkw8!Y03ZNKL_t({ zB^pQ3;M7(L6`egf*gA!W!G#IbQC(7pX$_r)NwicI)EU$ULH;W%GL z-Px}|{HwTK&XdG60g_>rdTD<%8asyAEy#yqFo@!?j&$UGJjVviBceM{Mi~mz)or@V zbMb=o#a(!Z?|KL5Uq-XoLu{rU;2#I~;}$qJ(+b}OUI>wrNtE8;o2oiWgnihV4vT<^ z*%ONl%~c0_1o6HsZ}ac*M9xV~nqK6B--i4cmxR*Hk-v zpxetEJ}SB7mXw!zkehBvE=<4&@X-pfx_Ud?tNfpOhGVa^kDaaH?Vr5?E&Y}|#~3x# z1ht1J$_D9+mEoh5k_mE)OW95Mt|;h?2}U|&EwytzreGXzt~bVFTeAR;fP8ob%WzSc z<=qt7?Pmt~nj9Ec6pGUv=p}u{$t$vy37qX}?W$!I$Kh%cU4&k?%kHP2vqLP&>02yfvLW`O!oUfqKxbjC=A+G?S(5pU3EM;eTRP`_2=fMJvuN^?$a 
z>H>0&X3Hb>l7^CH2H`LBl`}K;znbbPvMDjZKF?}^TyC*&Wn=-+h zUl!R_cpE(A-?9%Ta!aWWkDxH#fgOAlrUP&s+?E~EO1@%ixdfA8r!2&7Mm4**48d0^ zdG!T;m-H99QWozKD6O)k{7=Hj-+hcqKyOJh3S)QJ=sbtBD5oPQZB!SHCiEVD;&ZU2 zA2f$kFj%IENA1Q3XkrwMhHkJKYzc=qcv!aR8Shy2 zE&;V9AKipka*Y;XtWh0@$$5#eKH44~D4)5eT=TrL-`Vqg_29mghfH{$oF)_P+PYAE}*{E_DtSg;O#_zF3ESSs_`D${e^xd5o@lAHG-HWufX|#KUok zHiib>u%c69RRp$_YJ7%T(pLQ$C*x5g6vjbqt1$pxxAm z|L{U_ajN9}mP89|D`5gS?tY-Q-YRa(G-pMXi z$onGoH`S%1^xf=aoHm=lD5JKC&R#}O^SNJB6+lBAIgOucHy)-I5}_La=q@rub;Ygp zUOl8PDwCdKolY%E1Jd1#&+VCNR&kBQoiUfq6tfIe=Js4uSBJT}uDwbuu41=>PVz{y zTCJ1XCJ%#-v3}9LR5k9PPv9ci#TQ|exOEl3eS!0w6ZKiVVhI4fkcGO84C1?3T_(w2 z+yd8I6LGW7#+4$wu695L7J zVMg)T+4672^s>_Mrjx$_B?q?L}jlt}Z9<(LPu ztCm<>TETQ!3RCeHtd)Ma23o-*43h7ZN-bf$^pSSrhUQcQo2eQTT}Sz=Voi8!E~iSq^uorW%Y7As)Io&Z~5o$x5dWTZP2Og>+xr z)Z4sY^f>ROlx=E^y}(;Dw5wj9($e+bds+G zB>8hzS%};1P8`NJwU&GQ1SZ>o+T%ABUncp%6C7r&viI;|$86{&&*7_4(=d$Uwu7H= zc^mi$KXYER@_T+7JwvRtwotn_QjVx z3hwelm|2xs%<+ZY}qe*za|uE=Rv{ zJzU08uz{E2O#FcP)h_y>;xIQKk#;lD zMr7khbKG^3qW+$-kLw3g)>_FQVWw^K`>!?W2LJ9KEM@v<7D4ewvHNxi2I`B=n^7m;vIbIIQIE z@Jkdv!4@z|9mARU4gKH`L zLV38RGuX>@4lYSWxvCt6`?>=(z#2RPhfzAXO?T;qdh2LoUZ)l!QY|p|-)m9h!BAh! 
zFaQ;8+=lY>9E->*nt)C0I;yYwH?^G}Vteozj>Cg`iSA`x$8lC)Zw6zm{mGsgl)#nf zq)dYt*H}0L%K=njW4>%=e@vi^QX3bkA8H6@FoU=lH>6Fn6mJ;irJl#*z7?9@Zv3|| z-NG(38Aehivj~n*9nDTDUcPA0x3`c#jnK7u2@kZ6>(|_nKTA3OEZd-;9LIO~nnt8t z#FXGoc0ZkkRv6{6ehNBfn8VdtN~JkC*&eH_>Y~y^Z<1nQ$Trw(E%Ltit#Ca}91>f^ zlL2nyd^=v9G7q|g>6{U0eQ|6wH%c{nO?P>vtb*!XQEEdn3_=qY$p(4{hIDgWp(VHm z9&uY6_?r)ehsiN#_AF($!K5(nNj9&Ms$ zGtT1}x=V?4m454^@Q_%J!2wFuH@v~_)9!S#Bec9#5A&!@=6NG?auJowzhV$_L0YWb z+yTbIJlF%p;jmpEdde_857I+@$c|s>s??|9DoIbl6lx12v8h_aB&*bad{@L6n6xOS z#GjVQpCST24L8HgD=Grh(p_wiIaD|5fZf$mOu>Tm9e+BqsIe61C~178U2;`MaZ3q+ z*)UA+#K!nd1j}GkZNhHpX;tH~y1PCIC8VDGz#urohhQYsV~4!vlF%9qsV4#QK_aba z5&X>U=m>3q%IdHYi7EI^ZResKA?JBLmy;p(b569Q?BiU@c1aKJKK5 zi4hTy9&X_`aAOUO1p|*m707}noB+d|q;7*vlT|pRH|zSp&XFJ|eRq9t{zm-x;p?fJ z%RYOr^nnO_9*>qV9xs{T83#)#h?gYz2sSjpH1LmFK>qm6=w{fs+&pMJf^llC`l5D2 zdgXTP(ve1tvxoi6+~B|37h?2vf%lHl*cI+=8tGc- zTIM|LO6UCO9AZS7Bh2S!amNPQOs9<1s-+QW&Qq1m-p&n%L5b#bsz&e4%k-LZ8t1VF z-BopQ7~aHfkR5j78vKSQs37`b4;qdc;0+#;QZf*=R909lb7;9!Mi02kWsPqz92g7W zd!v^l2Q|Vxa8_RFGg1kzLuD%pUc*Yg3E$vr2~=UU3WEJ!h2(+HzO||IAXFFRCE((j zcmsa&XFQL;u?7us4W&a^$EXj>XofQ689d`O(hr*Oe7Gs^?2)h;>PTH(+5Q!%Qq z408N&je-|kkPpKPXve{>p+;kj(%`G@?(2L5@!Z~-%!BRK_#t_F+)uy1RKV|CFqkWX zhO0;1&#v!$soMGW*vI(;4Dk)*{w^0jgK8=pg=2TRATOn$ypd8+kr(p_US%DjyuJeo z&D8@}&(OBsLYxZ6^*6J&>y(Fd1w@H=F~1 z>63Wf5WO{JY(OSQqIa>?1@HJ5=U(~~Ez_rUdOg=zOe1Km?1Z#fm#WY+m`vlWPpQkP zjNj_OWM2i^qn64Jngo^kD;MNmMyO6>|05|fmd6{v;UydD5}v1GdY6=v@wyn)$DY)e z$M8YfZs)Vg@-4@^z>Y>&-|eI(0gW8%J!h>D7!p`k8roN^|0pL^l!NAHHOdq`jEz+% zX@PIBhqq5kNNRECDwoe00_Dsc4u{pp3ib6d&pPhlPAf_um41%F#%M<^wS!Em&Hnsa zH`J*2$_wZWRq&UXco~~WBVcSyrQr$h5Vj+H!}*Y7X2@e2&W_~ij&gL|x5Yk&`TYL1 z#aiP1iVpGP`m_lF)5s#220M%%xPtbWca#$=nc+rmHIaT}HmpU5X+4%uT`?z`Mm9A- z)$wF@&qyky+q%;vhWYu0o=qPQUuC}dnHw<)BU~TNMDv$*i1%@37%ZD437u3!=J5_# z1}iZM&cOt=0@K4h_{QNpRu>eXOu@4>9@pYwEJ67sn_L5bypA7Wmrmm9yd1wkZE7cv zX%W_x33SkyNr|W=Qr%VKB!jN6b1*|m{mnW>_hqNgqds#HZzU??x}=6U&iJSCJCJ-g zYDPjs@bD0OG_d*H2!Nk7Q`LoB*dG@|yfN1C5i&V$IaAE9K{tas6zr4`@Gg#vB`uAq 
z!puvdixl9^P=k}!>Av!bNzv|xW2%-22*5FBb4uhaDRD&)s#W;+Q8HfiQXIwRm zS!$+Vkoz{h^?TzkB%R$w-EZ0F2ytGu9+-p7n$~NxpHbe|3hSwNoQwr%09J?eun&LH zL7FOau?^j$1k})4`BO={4>lcuFxZ02q$@Oot+D}YsK2mK^5J{f1|jeSX5vO^Bpv|xoRRUz$vsAN8)LPWz%;lo>2kr(8bhd4!2#p3)i8pMkeDs zg&S|^n7-l*l0>QGN#G!_mLND8LMeHM4}JB$y;axFQ6=KMx>#ltZgZAvN~l) z`~x$;fBv8+?!)ezu9==FJ4tT>fS&kAb;V#I0e>X-*Y_?27{rFY6&>8x%W>5Pq(UENem#pF;u7WzqLeFW#q zWPSt-<&+MV$~?rb2(96aFE2Rwspo%w`!f7U=e2a)rLRqV{`Lg*2z>a4`$GofnQ8`% z^(iY@@|fR4h+F)2!gWVZXXA)jfhB@kg=PM<`TA(=$70U0^p0}0HMujLK%pK z*APY*@HMoAJs69Z)lgiEe{l?b$F9aWBM1V_4z8N!TjzKGHSQqeqU=F^8?U)JLLm$}-+rb|aU=uEZX4n_HLt*TLt?@4o zh2=5}GGbYbRF6;$C?$C%QFn#B9K^fTN1Ehph?&%LPfsg{R*t2{er#uH`zD{ky%Nny z;4)sSmyjf7q!3t^$MYYAImQLng!`UXscmG8Zm;`8N!%k-QeN3H_98k!`?a6F7hIUm zY=;?OpRYMr;Pdc;7eW#1GTgx-RLJ;%m9Q3EkS^Fb@qGMvsO-KP5Td?1!W`+alU(BO zRwmz{l#KDS|87ZqEThrwKfDZI{(Ryvv^-FfH zP5kOT1V;jANGdjmEpQavvJ*4HSTy*N&cy9$HJqW&c0jT_xj;arl&IhL^%K98P@iux zHO371$VkEVe8`+*_arA4g^K8qf$$ST;H=D(>by(F!cLV61Nel#j-w$vx6|otTP8bx z{_KP8A|`|h42{c!V_a7e6exkLRQTN9n?9kZ9iGEfja< zQv_U#+?Tbf^zxO#kIvlAiB|8FSHAIfS*@fm_SYHI3hYDYcowddU-l_J1;MxjuHboN z8RdmM)^O~hnyMd4^g#Zt3tL-K{ZmgQEOf@Ds~>Qj%Y6Uh*I1P{LO7@6sq33JpB3xb zV)u|ge8|{_N3fWy1iqqv(pMgGan22e;fVeV4J1Z#N@wUt8(rMJd9b0j>|_w`Gx z>1~@@*YbSI@a&Z%V}|!RM4ff+3QjcI8WHqDR{QorFl>V2P)(-k&bk*Qp~3&?{Twal z&63mwW3dj5!ay}2A}~riOM2amGDBBH^Q`JbZQ&Z_qdN9PN$J#)cbKLX_>D8w8b07TL?5_Rc(la zZI~T0&^+o3kFBL#1PYmtToa)bC72yEzf7s{xeGyPUHE<#Baz-i99LEAxncYI=`SAv9x8FfdiKVJ+NSwYTrS$^nPM6^~c<2a}m)epnA@C0@fi}1j7el<% zl0z^6pHLwP<6&@8vJlk8Z`cbrOAaUsVVGGJg*$KrJ{djW2xLw9?c1-1*|Fls;W86z zV=`>SbvOe&>bCkwue>vpCaCsiKO+qCnSstO&{pMv=@6^d&|w(P1&HZ5d^LLT0bFl1 zl%X`oI0=ieEipt&9-K(WDS`IOMm&sZs1ZKIs_+0FaBilQYYCU-XzJwTeR`bdNlI={ zy`(`&ZxaS+?RsN3Gv9@lhR4vC3d0Sl0U;2Jf9Zd+41ZB|{nRch!_^UrhDH1hYUmXl z4I{B2R>V`#5=+r;nxsOFmPRg}8OlJsz6-I&zYuG!u`K7UlrBG_F&Xx9oK9n0Hv^?R zBF=#g^Z{Pr0vN9+vV-F+froHbf5vAt7dn~~jJn2r*al@swVqOq!R z!<>q}%>_oBG*A<8j#Pn9kdvqJefgpL^A@?!&G@mz;Cw1h|H5Kygr#JyZF)c357dn` zbK*-_t^M{m75@5ufn-j52g?4) 
z`z;@Y7$K^;cMSAW{(hSr*G!E~)nM|0%W>R{G9IY^R2xSdvo4LJ<~S81)gKrG^YOCL z*Z3-XaGyE@wV;*S3}az3l+=OVj!8+LfyTbDgPcn$C+#w3shsvS{N`{tozfOFLoC08 z8}dibfQqm$wXSX_Wn?p~qbv3&CTwV?GyXH%Iog>W&B}C{Iv4|CD(*IN>Qc}TGs|qr zAr(@Ff6n}}GV$MEA6uA(RBr&-M4NfNtd__8g`=cBM>|V6DjVR4a$Hse9VaLkzBGOk zQ?&I4hQKG#axcX>ON$J1`@1i@BlsA_YXvHZhIcglDfJj{za`b z!x*XWO@nE5xzr8R2O@QAN3YO@AxFL4zxaK=C=(2iqcH?fVe~=-l~$u5Grq^o5C@U4 z67s2F)gB^^zRp*Hd)>oa9m9{Mw2BQ-KU5ydNW+OJooZq?o$@@>S!j znh$N%JCz>dRSmV8_8S3|n_kfGp$iShXubl&@F9LCKXhY0Dvp(?81;gmxEmKiRa}PU@EiM}om7xix+XE4 z7VGgWyDKh|*Mu%9V=3&(Lwsd$w#=hpdOLrR8XN}$C6w;rApE9cXr}6ltJN!20)DFb z>I8&fC-Z&ClaxWS}zYXrnBJ;{wW!8St32kzk&SxnL}6^+YvMuU$2;9X`Zm+?`ru2DLiS zrR!mP=QMnaeYwAGFVpygT!EdEiNdiKg{$L~(UHR0`JM`uOtJ$EZlhm$BKRF#;pP;M z^(ev7jSd(s)dj3()KG#qgBWVCOhmzJi~PF zDNj-F(V##&_Px82)N9OGK-tgLq6K}=9Xw>sJ@001BWNkl3F6ZXME{ve^~)^3@kN;{UD+Nq2a#vt>pF~uxoc7#KaSMK0vXsG&=pZemA z!=v=w{Dt%Jv=MB7{hm9vo5r}cFN?a{8dY(a>xFYC`HXgs+;rQ_X9lVP4nOC3V=_H6 z>tQFuLuVw9zrXnzcbV_akuXe`!%})=@@L)Mnw)Zgds$_84eznKs1`6tKg9Mp&}x8B zv8f#oIVFd0J)%CGTnIm@t|<#ScR(19aOX37(T1c7f5Vd+!F|{Ub>WZ7jT<3}9vJ|U zv`rNOn;hyaB)JCoU4R$#Kci{(?8aMPg8vCjV<{>}n^Y!sSuMdrG#ZORQ^TXCz*v=? 
z#zHMMQx%qrMi1jF&Q||3ZlKr5>-a|3otOQ}IXjqz-1(eG9o5{KU0<=6tD><@l=(|t z;o@p3wGm5=QN3Z4@zXdfe=rSvl^R$X*Fj6H40SOdZl`M4k9I>O-lKVV2>Thg)m%8F zYN%lHGwvHzApoOc0=U#a7%20x6?}q|IGtl88V1R2yOPAPPj>^Cq;he(40SjNvq=MJ zNmHe)IPszgw}TgQjVD7dX{+F6rin#|!VImf>6il#I9_;Sb)2N9%akJNp%L|hD7ypN z_|1NVP05z!jyA?uct!EFK76Zv=4Wug7BfF?k3Znt9XuJd742;Tg>66X1}V}Snu>=A zThV4Y&LgZZwjXzPS8~p!1oO3FlTWJSGQ0|;isCj/Xw&#P-x6ISCYOy;HPx_Jh6 zN(E`b(bg_~9DBk|IwrTEFm}S>G{k5t*&tcEU?{he3No8}a94YPJ=qiB_pkZcs%v|= zHEod{R3l)Lo**40v(=L;aef&OesEIGi@(f+*YH7B;VtyZbF(D)dAFzT^WPJ3-%2*O z`aji^;0Ujj=iHD1rs@9N-C*eqGmX`#X{NN1rtnVoLJ~d}k5uCC-qpGbJlEN|l$3Gh z#_UP$?K$Fh<@K%B`BY15J|E_~5-07Xl(U!-L-+8!oZ`z=2ado=s?5`@zOEZshI`U2 zSI2;?+8>Y6W$y(D!Wq8$T!52&KNPwT5M9qspf6>tjn|M-D!<8DBJ9wq6W7c}xoYrK^OVxT;irP9U{T@e_&>4ta} z>)NAJT6&6@8Qk5Rsm8ymf>9J3Nf&qqU(kVrXtI%{7Rxv;gTr7a+=CUq@AhJCQFe97 zSSf`dEuN6c94={;s50t1e}YaBjT_;V1FL`a$`K=*VS2?IVX%W=3jB=RBK}Kuqw31=a00S{lC+bB!TA#CD@L%xBaU;7LB3m4t z$WRaDqun`UUtE#A%{3I>`40X*>-Qsl4zBK9VVU65+pIWvDH)}O+{S^_nr`4^IOBK3 zS=7vD9H-8>7&fWFcn%h-ZcrM&sozjuM_^}I2q%fDKP13#>?aS*(~h-z39poe5@LPO z>EVo&mVZ?QEmB=DJ4UJlPz{P<8oUlWq=KYpKRF^|4Yq2y?x-87{;(2H%N$=#84~b6 z`?D^F+wdTCK(8*xdGu!a0Hfg?J)ob`gsxL=D5*N(Cfgq~}muTACy5DXE<(yDO*PaVy4% zcm38~@H2han{cS);0|&{&%mLwO+Fij{K0I{P&bo7947yR04za!@F@tEf(m8_tEcy& z?B-_XCOb|_(h>hQ{GU!7;mu3@LRr4adez+gC9m?cjzF2e1i*eBRS7mB`tJB3lC!* ze(C$~Zv*egKil57RWFP~YNjgT${LW{SIb_*A&xIbR@KQYs8--=&LCU)w4GHaTO*_| z53`#>1nlD(stMNONv_Fi9duE*<$%3Rosd|4-FPQCJ?k{WO%K_lDHZ3zNmqNn-f2#z zeH&3QF*Rn~+rgY(<#$_<8TN2~@T!H_7##3e=0h_YgT-hY_+YZxKe$z}FY8GE5$;}* ztrNREJ(@DoD(J13TH4#e$_vHdfn=5+)mh47oXl zmrD_iG5|tk0epZC`WnY@2!EAE@(~UCu&9iH1^80g_N4R(@e;=uN#Sh^mmZ z#5C%Nu7{{T@(|36{pA`zLI#yeIpj5Nr?gPf*y|{yyW&~p&)?`0-P6tC z96Z&ZaXy^HRa70f(Kk8-LtqZH;af6W&U1I2f&I8X57UM8K3s>hp#}ToBd>-v(nFr} zc4=e3CM$)w%n8RGKKrVXelD|e#*a`9cp23=|TA^w+w`%m=EvbMOaCpv|W0`d~9y5@Wdx)RtahM zCD-@dRdLRtQsl>6^~Z7CU*M~3mqhx)%Q?GIR7Sv9DI*_o9WMj}-q;I0UsG$~WlvpC zPWNZ^gVy6U^zt)^fV#2}B4M^pf*aHy7aK0aqb3{ER1Itd3#bWZGlr;8d`h*kG+)NW 
zGC{7vVi*Bqh;fsOW7ZEPt5w2E@5{}lWJAb6XADJ*jJh<@DCcadvN$Tc_L;?R+U^cIi+pvX;O9PnBt#me73kM`O zc)-Q?WhHOt6VONI!gwqM8=xfxK|JPCEhQEw86HW5tJFr)(OYu$|@L; zK4Xz!@QEq+IjwYuCs3R7q6q`w6<(lvW>XA?>t=q6TtA^^@~8yA)N(j?{^KTbfBRoxI zYZ)rb<$zRxcjSVpxJPbF4EMlBxJu^P5qg6?0hiH4$1*ciyW`73O$vwezA}2Y-#+lu z*K~d;E;-~NtcD5TF+Qp4)DZol0DqulSSc4ENWZf48WVk#YQfEbS!wdAfohRjicMv_ zT~Rb#lop&%G2AJaWYp5a9UU`+0-BEL0=u!)OqY_ze+9Vx#(Xqc%5I< zRoC+o`%|T zhOfa5I4ZAYfINpnSOxyYx)7kJ*+cELs)=Jl&~98L>d!p2#2|MuomU@|U$9qF`8!nP zo7~5DNzc>oV4E0n%IDGpeKYJY`Y6Y88ZIvPxf%z-ILXZmS_=mQnCiu2W9vuG7#SE^Gg1@3PN& zpZXs8RwTZ+f_+0$PN!D1yx!V&BvrukQc-=Q`f$kajlYC^2wk48PD(DxX z6Et;#pHgfjy- zab__u*tcjiUe}GV5!yNpoP$&H82rGH^pYN)ag=za0!Q*}nZu*`wEl-L$!4jjBPA*@7YCT&W8I3i_bwPfM`(i~nubI2=eVGERoN6-OFLp#U-m(d^d;B5n*z*orwub~Ibdp5t4+lXX7bfOU{qpyiZ<_OXU?GU`-t(k5HmItQ*XwWhcDg(4Q8Ns=64F_{a6yC zu%B+pZD71nlj5lkSJ1!M0k@bq0Ry$m*BLrfYa;|oON{h|i!u%RYBwL(f7MtBr~c$e z4h;1D=EmF~+v-AcPGyJTn9F)-|1>_im#ejy7hB0>Z)NBt%IL~#xseW%(Rwmi#x_+4 zKj9)=MXO*cW;6C+MYE;r4ehfd5{iXfO5FN!Z}R-K?eZ@4GM~bMD#@&7PK9>d2^;E{ zzAe7aIDsxfahZ-SD|MjJ*;LVk`Ato4T{LSx=R&u!PUSe!6QQDXOD3Kp|1~18cW! 
z>*PPzX+T6sNJ)Fg&$(tp$EfW2y)R+7?=9qH)>o;CG?S~+S!zI8+r_i+hA&EYw9i`| z;~M?g7BDA+8U6(F%3Y~#zA{dzjP9(`RzK5T{a$*ppA43fb}s!2m%w%CLH|?1Sd9v( zQ_viWYg4M|X4sHQ5#nTg1)&%T?`4g=pdQ#XWuUi^^`ENjXdQahFGZ#%gqgLRIbG?v zjdwp@!lJs3FV1KK9<1j}t3TU$rL~-r(=-)7Ymfd0k7PHkhg#TI6dqIQI7R-`l~fMO z;A$%6^-){^E*fU-mD+sVogv3Yjnedc< zdbkE1;RRHsDB2C5$D>7^}b9^CZ}^`Lrw&H?PuDBu4UZKbRnu z;Us`ehMDpcX3Gqii#f3=Zj?OoubhQQnGY_xPPrgn+9E;{^}*}%1aV2xyWMhuK};WL zj^u!~e8E#jcfv77Bpie%#wR?5@6`i7FFl;aRURp5OfX^{$5b$yxxBu2pEJ+^dF+9d zbxFfgj{m%4j)(~RyE^JXv1$;T;*S61*H}MGeeF$WzlM#5EfEy%ABF!>Yk04tQ)Wtj z7_Ez`lQ;@Ldz0-Wc2iFWtETU`PAB)U8_smBaQ=`YMwrVM@n-O4r)$2;c3E)g!{Rg_ z8nYbloP8YO&I*(Qjj)m=+QWIZ5wA_(8f@wt67L_eFW`0pB@eW!Iv1wa;ab5lb|*a% za#AlN)?BU%5Mws=FI2ROU_QOhn}ITNd2ePKZ#Ddp)BQl5=akxFj z%t_8JC9H7UA7^;$`Kn_lcmuQ51gWBW;Agv?9DpSF7jwaA9b?KR3dUQ(`91$hjx_r?qtEAkYJ8$@xJiY=Pnn0Wc!Nfx0VS|cro(mat#VL$uB}#J zJ`t;L_}xrvldJk|4lgG?tl8djsU5B6-uK>JNq19%aFCU%M}pHHz-gf*mzFr*VBgUi zM@R`M1jA$mSEI3dV@hlLlUn2W9;nsyKYRSj8&xIGpHzC5@dSKLNk}cNy3#S&%7>&Y zywIB^Ms~;*7%icg0T##}|Htm5j{gNzOqVY$LlxM|>#4m=hlu3<-a>XkZ#wI_uEwNy z$`7BzyTCq6!|bxw5XW!w!_O%Tz2oe7s-${TxFk|W43r155vxHFC;=|s!$G{-IG{4A z+iHqqu$u13r#~m8a5((ujJ7VOruS^s=j5?{Q%;M;$9Rq2&#U1H)UeLtPRYbUbPsHp zh3Dak{zpv$+q&(lhEthUqV%I|*MN#sEXQ_p=8awq4X{d6zro=kylcnpftAs7h%(LeYL9?&Fp1qQ0eYKS_3>(%5uzKrK0 z-lTo>WzzZ9##;MkYADUbtN331s}dvLF(ugG4Agjltjs#Cc3$^_1CPlN_C zPR2VsO zkq<`71n31t)jf5D4iSWyQ_N1_qWo%>43%g!@hYvsOL&S(<0cuXR$?PbZv?A2na&QL z31R5Pa(ab+ubWvLeGPoq03;7*r1~_HUitHMUwx zdN?KpXaNnut9*ge+VkP3)ZqF$fm5I@{Dec8Mt6rrU_cU)c@{SEYTq{35l_Qk?eINR zgb{L(CejRAt(^FhYs+()CX2YVyyo9_T{{e#$Y5>_0TRlSC=%BY*L zN8bvV=6*>FlJdfIYA0*>y1H$&(r&2$wzW)WmJL{uiV?#NxK0_3uT+9!Ad`&3Vp5i? 
z$r&^7N8*>|ejoH-PeqtbrRb4kq}k1p-|^o4*0I3=GabGsAI^j+R2Z`2I#n1JLkKj( z`dF5t=s2}DE2$xlC^}(<`lcmT5og#_8R{F9QatFM#Qn-@wsRhLyn-02LrWlp+R+NG z1q0A47Hp$y#$+{DIXyX3b9j|FLcf(UoKZfi zP#7x(RVFHKPla7jifXAcbe>=1FX)K>U_P1zZDhS%m!mQsf6*gas$#f-bdf~+r|-Od z#qOxG1Wq?%!S>E{XYeoLIE_7Vv-%)y;Sk<6+R!oTZj6^j#&BFNA1RxBPYM72>K8(KKOQU=inTHjeI-o zvv|&Pz?y)WxiLI}(GWtvv8qgvf_w|!;9cA&eV`?DmF;S}`PVVh_0;{%)iwEH!oB2n z+DjGSqVC1np&-2A`*2oDaun90ag>5nXgD>-09C}8@BHJx#P5G;>*$y9HJt_IzNepK zy)i_+HtMOt6s6W+HdT{mVm4zI&BfhDtSp5=bdNvKKI4)d3t{x#Hq6Ookks9oNciNP+;ppZ|povIHkUtXzgxkQ2s2K0Ha=FqxR1ksnPX z2s00xCd8=D^hhe`N|HrFxjdq@!CbH$YtdM$11=b-O2AHNi5h+ZxIOPS($NxJ zjM=z>Bys{bN! zg;VSf_5~fG1GzHRv$sGW*y#!56_Ue#qVBs~)*jAdp4P9py`4iw%Qz;_N*$)l@f;}w zZMgxwfd|+U)1e=af(`VA*GUn`XS=AEyO7>a)tzzvsR>y=*6`aQCHb`agU!sR-u;-F zMuA6#sxwlV7s4{=zzw7bmuKR+I0c5m6o`d1xD95=URbXGdOKQ@x>~8wD|VRn`qsE_ zQtkK|9Hh#q{gMST-~!C0VpN=rz;^0@Ji+e94g0yC5A*Z~P7D7@f2jf!B-Qp<8`RHV zwPFgm*VAFVEPtp6b(X5;PDdl!Z^WB-VS(9H{Sl{eRz^dFqZoCU8svbk`ZG*}oYri4 zpoZgB=!PBPq-UC^$*-y~Hq!~Ct#$41Skuo~#NFirIgGuQlV5RdXp7%419+$(wdVoY zTUC+M6l7jXc$Da$Qo)xi;afsuxJ}!QzRtm_gK<%nz)ColwyDw(Pm9%PjMw$(JiLX; z#yoZ0G0z?5@S|F)hbjT{a2Je#PPiB@s&VQE^(Q}7Le+r{*ql~NVF-W_Um0(8JHS^P z1O3|@Lma=&v1#&!bO>th>fz`I%N+@n$I9Y=+cDMN=U1QFTbUdS)n+JYWTwmb5bi)% z=n6d$DXpFk<8ik(VT77xHPFXlXX;k(OkK&l!u}~M^b4r#NO14>zZjm&Z&%vV8Aszw z)xs9`O~F>Z>4JF9@yFB8=Pb4qZI;L&MLIrjbe@v1PkhL)5YB4w!T zg~!qYnsF9hAopa5yoN^b75hR#J}mPr001BWNkl4tIEn zZ=uw{%+_N*jOlbT+@Vsso=k!^T%5nlXr3&?Av61A2*lcjcs2Z|i_2=LDW|zJ6@weH z25UnSG{Cp~fg3{!xgiDBQkf-AqlyG`dG%a|^DO)Z17sG})9<;9RN)j|jl;pkALIt+ z5k7@(be{a6pDIALF|!d#6EPj# zqg7OnAl9(uBjnPDJXvvy@q*GqyxGx+$Lx3$u3$&rp$Ez`Sf;l_3?g2Fcs|YhrG?ai zIJ|?^VUTXm>-o5SjIT)!nt^UKgJV+o_hcg{y~BPoLWMe8z)oYiIf#8~uQ?sl;3f#s zwfKsR$6B(R9RWj(u5=y`S$R_Xn9Fot@4kQ^@W5BxvBv6>@EtteDw~}mVqONlfyDDse5dI=P;DvRbS8!TyQR|Q0%ePuTl)k)A%HUr;NJ4QH z*JI*wxa@E1_*zE8gnzzyaj-;7s+x(F)JMHj4OY*woTI-pMuwW3%)9&=HQa?BIt}NS zADmI1OLoZukrJ&YsnHOE&GdOY!0w~x#OIT`L0!ydR?^qCFwyv_Cac%*m%bZG=!V5A 
zJ+8oJI2~7F8Yznt;f~f^U$=}8_;Vrtjn9#^E@7}a)##_<%o|h(E1BJmyyP(5#tSve zIn$NojDkobfPy3u&*NXM;VW*JF4zd)!Va2?+h7o8f8gr#*}RW_aJrQ}^eShoz{!b^l}M)Rrj;Z%QRDOKkCCt2gtXmX8NoCG_3z zT`0dJzjwJg-91niDu3r>e2Y&V<<&j-X}mEmaRr=j)YUgRO6?X))~Yt>r37WAw6ka7 zFue&g8}l6O=%%W`Nz5l_;Q{TS;;@$fRjCl`c;r|QF6Tm5PL(6zL9qDydrl-C_g?ny z)_>)Pbk^k{9&&ODWW|S45dzeGV*vc5M^s-HVQ2a#qN*5eGiJ2DCR~Pgo?7wo@Do~5 zFDQk%)DBrA3Ie#7Y>~3oMf<8%gA0qL>q{Nz2a(VK3sDnji~Cd_`9T}Zs?bIS8~xx3 ztx$*YIBruFAR8>kUR)0XU>Tbd2lKSatDzfjlPZ{*yGbaXf+U2$lwQ#n%xg}^x$>V8fiK~)vZM=)qy$OT|Ko7k!yS0By+H5Q zfD7i(V%0?zr`pC++E1sAH?)vaXoC`ZMPN`q zcmqki8U{%bn1~DDE8bO3&M2?dZOOygxi^IIV95`q)CCm>E*OEmFxeX5F>xR6g#hlx z3GA~yIvwnh3RqG(RTemk#Quf2f~&l%wjccSIq#cK z(d#Sh%b$D+p4u%v?_Ej!(fipy3vWm%9`Fgq+E>*$Fr*B4=rFd!3DjTJz+@_{ra@Es zK*4+-1EmAF$PE?kOI(0I;6)-zt5F)l8~ZHYbku>L@PghOPf*)7g=2N9t#gTmzq|l> zK!(57tu11Cih9rM1=5FSLQ5J656ykpSygvDv-arDR5`hoccOL8@jao``{u5HF|)4c zH_=($H3qhtnT>)JrRv}>xI@e3h)jT%uoZqn66BLV0Q?AcV4&QQQ0j!O?5BF29sBnd z)=E>*?xLf;>HJdQ`Cn58;yCqK+UtzE71o!@P*5sB0j!A0_)T?}$ub{X@iVE4rN9Oo zhZ*gtALQ{n{%69c6aIOu>E2&PRhTYE)OGWrBf;uTw_vEwtj4SHbPZS0985>$z=oU> zs-rQpQ3Gp94LC&!FcJbOmtp9u`j;mLwgjH_4A2jCX1`t9{jIy|>6~dM+Zm(_w18Pq zMry!So+dl=f40lss>+YsarisO}9)?wUBjxx;lrtugnSfvSgzj&OB^ zYN!J^()UHK@E2couq6w%a1^G87-VY7Z_F|G(Ij||cXc+$A41N7X~D{tC* z3n#`rYvXa(%!$>-_J-$!C~IxNrmKv&c*DC@Ea?2^bD)Fo#d!yxa@@x;2EBJ z*z@9Bz2wdK>rbW>pQ}-77Q1W^+kn4(Eqde1USYGaXlHzY# z!M-ei-`WQtKKY#8Nu%7LYxoNi)nGLlKqVL_08|s#1o-LKI`CVN8MHslm00xM@8@Sy zf4H)Q41%Tb3C@DeC-@iRU8nq%Kx7m&3mI9Bm(EGfK1NY< zx#NSHuZB3*t5NET`O+9}lyDGTz;ec7BdTs>kMEvzf|bJ~INSwNX?J zR87p|MgiLIT%xw)AV+Rx(NtqBg<~&un$F`@I-)k;*%DAsbBOOR^PjKvOJ-yC5fi#!XZRd*fSTdM9gXF+P(D z=*3cMJ66Mc21Hm#t;bLoBj_%kQ+bFal=D-6h{bcpU)*IBqFYc3-*XwU;Qs`{1z;7| z5&+Pdxp((jJb@%YaCf)j6sNcrhvKd+@Nq3giWG`Nad&rjcXtcK<;m{7^UrC6#{j6C zjDW>3hsXP?s0?mNOCg-QVP^o*4Hn{K3K6;G|F|xeQUR+c=A?lXV5TMF6r7;vAYmg` zrR9z`bc6%q4c@Y9=#%QdKqKbEd zGwoK-FqJ#&^Uzb=@+GQX|=8E(b8J+g?_uPR8QBk>z z!_C0ZjBt3s`KT z+QWG~iBXN9sj)%MqP?&L#YpM#~zIwFi 
zYux2Ou@;@DO}v;Rxd{Abzrkk|L#vIeqLL^p#v89>Y5YaTi0nF5j+Se9nYmotWXl{R z%km#Elu~gWOrrC!3J#)QZ|Ax&k}{zhOyL615ju(~R2zueLq9Bt*VzxhLSHz>HSL#r zDOb=_xHvx)t?8sLqh6^lbRWau0;Z{dc#3tEuhR@j6&t|KHN`1>M8Q-L?}@GU8E6Y% z;cvEZIv-^>mVr||8$$6aFX1R~!d>i)GoczZA`i91E;Ix_!7yxvxo`sd@dULaOr5YNFV;1u`4YY;3VC)Hvehi$n5_Jq~E5}I;1dp_h9S7i*u zQ4cXXd=C##t17E-sNFEIJW!2m!X`K(O7S8rq32q&gSteO^kwrG@x=p@Q<~7pt`X=A zuE5vKgS1b5@Lj-F_Hb*mBN9I9vg%3Z1K)mpphjS%dBAM0kV;@IZLt!J?&0~Ydc2;i zn2mUX{u5sFG8Kh=XboMYp1cBj(mhV*T{0b$cmZ{QAUG=W>hk&z)OMy_(~N?GUY|r! ziVp;)@=Nz_6>I0Ug zqV+INj}JU?^@t8p51=Q{;6CVw>D)}EsY`U!YNIX-54_|*?0HrkPsLDrCU&ube_|JY zr()DI{kt{VdZ!*hH!&c%AqA(7<{_emqrH6*)1|3b=?SVPJfMycEyvK$)SL?8AviCu z<5>1k4d|uzVP$;{yzr5$@_YR;&^2v?S`c(Cv>1LDSLE|RcRhw1;3axam*JzTWR13e z=wu~uuukW5FdMEwVQ2|X?9UD1K9=Dm3W#Z(1Saob%YJUR{^Mf$j(^yt+5@E?)%cobtXJ5 z>lhfwRnQAnp(d=b)A^mU;5GiwAM9`9E%)}hZE;%q14ss>J2 z^+Jci1EZ<)0~E=Up6eC3g6Cx!0IB%Fc@ctq+Pg`tv7e?MgT^vdJkefHzNnXwDjbjv zsswd$xuCq1E^%wFDwQ@OLxc@cxr_PXDqF!OL zvy=xhyRpms2)AjyV8|+ai+tR=3xsV6Wq{V*}oGZ1>C&9UW(!Y55E3%x`Vv7B$Y>3qnMk zQHBRVFDk3Y!)0utWB3rq;a<^C>@fxyV?|vVFcM^4kzgD%s)(z`az}*G-~3@_hkuRv zMmzXPbd^`hM|TCun<7$rsD}7XHN+6gFW7izJTf!!3yl+>@HAzWlVwXeK-QtI)JM!B zp!ZZ658!gz039HV1%8BhxPbp~TWG15^IQG~%XK;Z2&V9T_=LH*E-ZqP`jP(ZKW#4z zT(ZuoKkb3)zUs(F_?^zmzrsKr2iNEVl!b-J?1Y!13+=?(kPjPUQ+$HMVU}@RoEOjK zOL0KVH!d2N%`y^->*d@W-eM}|gdRA7+;9=9^B#!8+Nj_NjKvRPpNROh;85)#*0Y!jUs6q?6Pm$h2fHSkKcgr@X~t(_Nvo% zB}ljWc_(GQx0hICJd(dViwhr*!fgK9_G-^8W1r}%N@8K?kUm~6q$ZHt6co>)I81(( zIl!lz>-o&;l)A0%sb?ye?!YPBLB;cEh0a209men#AtH0=m zd%cME`IJjH4^;K#)}tv=lZsb1Rq$Cxux4tfhgk)XdlW^(DwAR3Z&yEO>wV1gqYFT*|mVg5%UWM4Gd}xxdCEUEaNM{^)zwVS&CwY%;X7RPHw?T_7xC?^rD;>YM5T`mWctQ) zKlmA%IqGNZN*v^FoNZmHS(u-88Sfn{#7-Tay3jZNdninf91&R%`nku1SA(B1r)&cc z`2#qiGUUK!@X$V{YC#$K!l(@o&GoK@;G~xlKzgkoSK&wK;w;KYNN~UIhG}nbBHK$&9nQ zQX;R3ppaA8ie^y*3>KB}BEM2s^&SA6NbA5yZ^R#9n`2#jafJJLaA?R0Z?UvJ$=y@N zCNHwjyAQ`kv%oS~91h|l-V8yo1Uqqaxleq9*J82}EAKk0Iwyt?HdmXg`EF{-_;&t+ z@}hW3{}2feW@Uk$afGU)-Vl)~kHB#Js9N#|IhTHi;>K}@i4A0BvmH6iP_s9dHRl-) 
zQBYLH!&sHO(KL~OJ!BkCqk2M%aNJBwxeMNexBLRi@<`Dik5LD_3uCbikA-hK0k-Pi zoZBw0g4HQ*j3w-bVj^FoBiN4Li2-~a2j~m37nT+d`y=@6Ir^+#po{TA{Xcz0_tEWj zc`nIsxCU=i0luyuscq3{73LXcjkH$WL`O&%TwIlzfN-4A>q zWe8=82l!gl5nl-&!r$Bte9#*toTQFi6P%PvQ{e`b2Okt6O_!m!{7sYsAODZ`Lv=ew zrEo91eBixqY(KKDsztgS|HtdFty~8;F^3ordvzuj#Wq|=uYf_c12e!C`0Q^B>DGWi zPOjxY?aM+ttf_X8JaULaugQ5h zAFIP9v7OFSQQCxSpf4ze}A`=K|yHM=CDR$Ns4M!+6yUI#WmJOqLix(PlRL0H=vM zdJiw5k+|K?upI8Psn_4mb=>5kdcC6#Ev2dYvAk)l58Q_YSY`VWIa%F;Uvy|-jZU;y z+EcXKV%=NM(y6>3@?$I>QysZfYHQUqtOxhF)}VsE&U9H!mK${(r*Tu*12fdT7Hn<`%OM?8;1$222{c6SM}s1CfREx#-7&RdV!)X1Ka_%=C89KqFoLNWbdWLf zr07Hu)Rumsj-n3SGrEg`VvFd5_i#Nur)-!ZzR2I{9oC@X)Dv1zRmzH&{D*$D2|Txh zunI2=KEypO@knqLmp|cWUWEr=Wyx%-|M@&2(PRCc7@xXU&$oWVL70cOsaDX2 z-rK!Ash$Vk?U_jl)zd${=y~!ZY|3c$Xo-KTh|U;}_hJX9kCTWamy#cKG2g5c~?h8_ygMAa}qWc#QREyjkBdF>_7gQgPEWEqEzU zcdZXw0zbg1O#s&@vn1qqw+sT;$>0JZc?)E*mc*OxSq_0!VHaOjO2t}*tnae5af;Tf zyRNQbzOc7O0r;Cjc(U3oTGM-2B|aDx zcV{ovTiiC*K@`YDD2Rn8fB^=)! zos8K=oa>aK9EFWCBGfoVL}NuHO@c?*N0dXq*h`1;s(3;-#06R@E>J7HOwF(vZGwh8 zgj;D-&DZrIoul~!H&-{+8qR0$1jbfj&bhaI4K9E-L+81m3{y2R(W0X@Q7SdwO|0+3y8vj$R53dXj| zC3@Lcz5jDvboBGgv?^m9Uc;|A5U+}b^jswQ^XQAZt3-6+eO+B=)m!;H@5kjbr|2On z={mZjJ&f}~F>@8h>OGm0-FIbitFZTw-ORb)Dx;pM4c18YNTom-7zZtk39>j0C4#$j z41eQeaFYI_l8#Q|KkVwT096U!aN$co_33mlhCI*nU*ezgUdU_6LHFoas4Ry{&=Yli zjkt$u!wndt>%&bPj0Tj~w=f*ro2%thqoP02+Gt;ZA2izYA)mSdh5vOHJ4C zm@b~b0+VnZj!-XgFxEC6$WF!|oMhzC@zB!R#ZEYb^TZebVf&Rmm?~g6WYf*$6gVhz z$#F6uo_a5&jKxZWuG}2yvEAxr#cCVo2v9n_utEO3g6j-4{p{)7HS`w&f zCBhnxRMpsx`EV2bP%kw@VPn2zu;qA1CAcUSh)$Tz{$K|g%T+zAhAQG1;JoSaeJ=L( zacX?lkp&Epco=>_dD#Ga!c((`vjvWFCcC#6T%pP&MC6vvpL`cm+QTe!eR2=)lt4~; zZN$oYfz#a7EE1LyIx&1(&{8<@wPqny&w z#MKd|r0;xo&vGKuM9PDwVWF6eV{n}Q8Ovigc~y|StlsfVmBpSc$K#wpPaIApVjNG* zc&cCyFaumy`iu;&N=qpho3z%bF8Wb!xfiEk61Bns6eiz_=k!k8!4LG9s_+w-4S*-n zMN_CDMqn1qtvl;ofr4{4IQE~ zYBl7?`A~}8`V-IB-SjNoPBr1v?9{{HE1cy}2*!K72fgrvU+}+tpX_B2 zcndG#bErT&=ol@hB6y7Q8(Tylu}z+oz2r#(NteZ4NXFXO10;mOWGF!`;Rk%jgD?!D 
zz{krRZ^UtUX^fC}u|E{%x~hkiUMp!81nE@0!YGX{EJ6`bo0{o-FqF=#b$U2orVBb? zU*)&3k&D3>y^rtXI=%yG7z@)NSxzuMiPFMj6cFh&-uNi*Qa3S^4&hx&6}i+W{RuN* zBYW8)=c#bJ1ufw3d!K1L0DyG( zsxs6HZfalQ-%U*wxB&&^D4G(8faUm&ljI~AjE9^ziQu8Ehrd8}%cHyM^D3Nw)g$x~ zD8rrfC+^OppbQ?8PT`@x?k+Mn45Uhq??eFbaaPO%hD z!9lqIe0)X}!&h+E>>?Y8Ni47(KX;a+&)x%$5rNZ*WmIwXr*kFsvhE}Wx!uMke^GNK zZ4pI9cNhvss3tuliHH;O#BJv)cSYB2&&km9{(8afaz$W6Bi3_5RnbQvhB6Up z02QDb;tW>C5_BAz+DB{~q;bj78eWP?Vjy&e`C+L(rr>Bp($hhUo>bN0KIU>a(yzA&H&Im6h%I0DkI88Dq2JdqZ z39~adyvn#S*}vJ77PVYgfDbSi4KYUy5wqwYoQk*b8tj7!aslR*Ph1TrSQe329eIuQB9|kV(Zxt{)-vju zvCaj?LUW;Wl+oMVY%Y^0jVuPrx$=_uU2GAH&FsP<_ZoSK#7uFP67esDNjLQvta*bm;~&sZA&pxvS&braDtjBbdt1l6b+DF~zcq8p@(H}o2O z`ix3|+FVV&#&7V8+POS@e*gd=07*naRBjKmlYO_W0gJH&&(bd}huuiev>vLfTCdCbNGi50{twZ$h z@Q}T{1Rg+n_yNiM7CYcXD#@k5hNh~G&c@|AKNiPe-VW2v+VD4iGS+YoZVCseTl%|C zmpn<E)d-Y+5h#G-kU#!i}o*kK+Do)H=FfkC9C+I}xX2)*qqk`!^q^S&NkonkuEIh&U zA$@dov4~&NFJ@gDHY|BY*6Sf(lAeTJb=62};w(*jQuhTv^W&S}!VYq+1eZC34udT& zic#3n;V~D8=5$~Fuq=C}zx&5qb~#o1YXKhRXl_@rM~D9%Iui%QEQzZPxwEC_OVXpG zx97Z0si8GvesFn@C-Mkeo+eQMxy_})UEqyOciJIS{6&&;IC|nUe|TUGMQ{^X?tUG- z9G08sTm|$rqlEKM@Ok=#KC!!b9*6n7cl8kJX_wZAFpGKw)5Qa|0zXlGY-G>1Lb0^- z7x!U!3Y}p(NGeH(gU$wOryj@Aj#8c`Ug7Q^?05baAOCn(O3$c%xx+y6W8D*~h?!g$ zY>3hiXt=XWP*{nVs#$8@n44mxe~{yW(bCaR-BY#oo#h)nr#Ojfh+m;L z{{>Nc5zNND_zi2Li}ukP>Lp6@?=(>?;5H(c*_J2D81p`yl%0$+Ryh{*-p-9h#0z~} z%rRY5l_tqmRF1~eP+60*iUabMxM-}FU^J5N#0}9zc#LoIuG}jBkUzv8IbOb$JLxpt z7K}ZxqllDWsF`RYW2mj@B_*_>&w{Z(ZG+0(lUHJ8Jx1SwUHTIog=B85H&H8HRyR;Z ztdq8lV|5kPMjx=(>-RiBDHy8jD}yag)BQOT(zz>7hO_z=4}|rw5?b;^9pDGBkDKsx z8bI-)E9Zk7{2#{RR-7!bZSH0j{(<-6zR?Xx*H;Z72$G=?B&&*6BF?q`%NUFqx`CAi zn?e()X{2E?{OjD2z96Nh`5bFo%Vjr_j>Wt!t=F=wULf8W!FsXX#sz(!?ocM#b7Fo#PWQOU8aMoig|3Rc(~4EI2nM8==tiF*s61TLk&SAtr5Hk zT2rP~-BBRys67dCaGZTryX_G=FQ@1^+=4gp1vae*5F)&{pcMpn=hKF1me-YpGO1qB)a9ia( zA8Q{1=U zzpOQRWZ1#XjUheUPv~e^s%L<|RF)CJozrI;J+Xa8tPvoOza4$WpY4^DpLXFVF^HDq z3NZzh{EB^X2d#9J(>s$_drRQ4uuOY9ZK1rDKjXJ=miKU4jZb;xnUJusIYASmU*wE4 zB17^R3XWvL-obNk}cDU+TH9 
zSH($RW@;{9cv#cewG1tVbvHozm+^#ip_x7MZk zi~h&1qgL7bc$8gDIqW-DZ~qwX)z7MGkM|$7-db-|SDvWL=%4u} z$MQ_L2~Btz?1!293f#pC_yFdiANN22XA6Zd=_!q+fS4xAiF&fGY$7~{WgIoWNoSb< z*QD{jYUZ)1>F|)c!WalsXf1$rP*zbI3W>U^n9AL3hx}QSl|(&McW&vc<3DA+3CYluhN0AhFgoj?M(YhLR!MiY$M$4nHPn+&r<_5EjzcP1G&(!WfS*rrnRdaMIbmyCJ zSk=&r?1R`H>d<|dOrPnv-{PO3oF1*O)2#3m$A0ry-;InpKAV5C*#_8$^YAV(@5Z{E z#N&8}{-(Mq#S`HIHaA?j3QExdTEuI`UB1p=1dAF_-~X4tG0&pk@sqj2?xFMgev)G` z!#`e5FrEeN(q5g4Q+%scM>WT&fY0E8qbN4TWi&tx6%NQ^e}vqaNB_=!^l&Xzc8-I> zyhnbK74fX6ncX+xxIH*?etbJ`r1wC^-|lgN^YK5qN9fR$KV*<*xZo*opYMe}i ze!irPZtO9?%IiQuuA!DEeSE8~IWmnp3AA&?EyoDg$hHhg%jMOdpB>l>-#9*%&wTROShPk;;^2^=b)ghOr>eCJ&{O8q0=?fRWVDrxA6~8vM-^=$9MX3SbD+b zIG6vzaPCI~}+S7<0-;|E*-ox+fS6=a;*$2rM0F64%09;-_T0~rR<|>n_bWP zuI6wqu8-4kk=_7dP+3ogqj;Q4LJahWJun0Q#h;)!Pv$=GmCv&m_V84G1EgB;abC>r zAqLv&Mm&-0>Ux}^|B>b{yHomXKfB^0Be^LKqy+e=I_bPn8t!s8Q5|loPUZ&AXIb)z zzQmEbfPEfvsGYhCzt>fHg=z%LIFwFVSJfk(jn`5q(T>;h6>iAC`WO0&sWR>y`i|212N{qqpl5kB9w`3!73YEDBdB3A*bJ%F+i^H^`3T^+Ya&rQ`}C(yw*jr<~wQ1-&-kO18=8}1e@Fa`=a zTA1&#gv^1#Rzs?7+y0eyygHUKHsf~6@U%r4(RL9l*5ApR`ueJNt1-I2Z;tGZQ+%!U zO1pE|1J`A8bA~>m`^NN!-Km!f?hjcQ{zZ(Is{%V>{LW2gQoiDGaS_Rp)1s?Kb@0FB zv0QxP=3_Cpp9kit&(K&Y7>9kK7gmrnMO41oIS2g8OV=bKIE`jY31(WenSbMMa`FY0rjOCf@ z?M*?;L$^YG!|%L^Nlms4C${(-@&o=laU;jU`o zUjsHKsI^cSi}U(i|Ap@iNiBZE+ww^b|9j5m*yi)7!F1s8Y zMRPn!6{#$)!!;Pde{i0tNa^w``7j@j7v+e=B6ta5?1cgF5KCic7>+*Z22Wu%$3qlY zuu*T(m$(>o<<}@Ja?^j=Syj50R zvzWK4_qr>ewsXT`xN0BZxm;Pl)nV{}AHW_Q!sDSD6yxkLO}~IdJ_=j;GH&3!&=DlN zMSpk=$=n9+@fO`!=TwjND#)t(sXLmar7hkX;6xX)CqM!3i+=qQ2E1cHZ_y*3UUdEun z0$&SfE=WHAuC%`B)wB>Q@)>RnK2=+Hbx!mClHzw= zRe|(>zaLRk)dK4;ca4y9@)#aQDK7CAy~6wn4?w*6r@N46PaqP1GGbIG6@v9BC(MIh zw9O8&(m5YSfb<93UF`f&TExjBnA;xZ+X-ItaL`9~s9?H){fx$D*^v65>pVD;Qa{`u zcNLsgw?GUy#sByrk zJi)kJuvK*3=&n>5k~xR{nx{jQ8pSVQsOz*bMONm^_8;ubbizvf#1qZiu2;eDGIJ*W zokcU`&Q7YpeCNk z%sa1iARb%5IXZ(AXabf4f>>ON!#PQmqD1-wPQg}8pm0QF7|pBn7*yDid{`U)r5GTf1xI6Ts0IO?z#}EnQv*TYuXJ?O~R*o2s2w z7d_1`t`1T~>w=1;S9Sy5fM3-o{eX*c2$UD|;5J^?%%j++PwLz77r(?h5Cn~(JOpuF 
zJ(@>xZM+W|*h^jKSal2>I^6b&C_9J-iMsYid%2ZQ-IlJKj)yM4y-b#+qQ3XuKIsD- z0|Kuyz8P2egPNs7bYHt^Y6bsouBF<+3(SRE!NF(fC;SH6Fb)pDSlx|Jz%rVF!^H~w zckoeTofAq!6l}9MrI)7+ea&u5%i#!^&w> z|M}qnxk}Gg53~pwl6wl@kd^QlE`e&wi&p{%ESA;W#o?Hdf)A;_9w!ddFWel)DmMJk znS03Y{@|1yuBq5AFbAup{QTl)tghqz86mT$Ye)S-yW&MjdLA_^X^e`?d69glyCzwaw;yffYzdH5>d<@L0X%Ft*G!ar~& zU*x9T)YwJKX*u`Mjo9Sz@B^94@JD9u1g+r<)G{{79R6`IIeav=6!lyK$uv@poRkxX zi1sj8AS=w~VtTi+M^+cR^?A`rT%fLa6VhNF?vhz}6Z}CnAlcr@ zJHXZ*_0jM}N$p=_M)&l;Q@3R1_S6a6L04oyxmP}grts2{-_B#TF($x0{W;^epx2=u z{}*+W>q1uE0k=dXNbDH6qVCvjbRId?aT&HkZ)}1Yfdbw;;-oXk{oFpN+QM1@G0b^i zHq2;{PzCnu2&<$W>>L{WH^g{fB(?$<4CMf(&~|aL^j|nSvtFKXtJAlC{q9&%wgLIp$; zMF_7vgKLcej<+%Q%`*YZpCx6;_d4Q*VEUVyVT067^_NfYt<3rcrgA<`(c_RUgnm^Xf+l< z1D3#in9lq8GZT;C-x=^YZie05h}Y?tIzr!9=dd)dha$kdNUzcD^$YzskJ0Vb7`~>* zsLs|F^-y)N*4gFNIsT$@s95`=UDFz-&e|XCy}VHkSDW!)?c+^2La)Nk7|!MS4=$ne zVl7PICfpdlb5HQ`4X(%^xhD66tq?{tAqOY%1O8d{SBEsAn_{7p-A^6XM<~^vXMeQ2 z;jgN^2rrj?NMSS3+Pv!N=56RZp8iok@~`n}>v(3hL>;)0()jC6;Y@z{ewr~Lv)Jbq zvc8=q^-r_5dDPq{gG3nw8jr`|W5zXI(b+KYyPhT=!+3@MBQ(^CO!h$!&$2MLuM$iW zZ?x8{I6LHpgF2n^+ZC;6j&uIwshMKH*RMByx~TtpN*}lZ)6`xl%kAv8#%@_jju$U* zrk?C=pijw(I*f`s_Ur8#{gQg&5Z~?0f5pv=&>v2*C1uE`(K zjd3)~?xkLEPLV|}HU(~wznDt_MWlHdwoz00rwA2>EJ?|9gzg&^M1(QM5h074KABCN zr7K}RXEu?iWwG*!InL#460<9!WN?4C*Lg2!yZfq^R^=P_ z>e%Nesm{&C&+dG7Mw@sByHQU(F4hZM^c2TMQ|c`IbP5a5e!4EU({;Rt5AYP_q}_NL zbKw=dNS8^9QCJ)2(0zY~7#Wi7W4L$`>$1km`DVb8Aj#+^d*ec=kJb6JKJVKWnB!e( zH`8tv$2o8s{AN863G%JohuX>M_6xbrP}V@aApYU+YCjhMU!YNd4EAnJiHlsC%r6?G z+(|E*F*dLvJWp&RnMb}grs8CsuJ^fO^j`bEyP5l?@y+g%^=s~znJ00vqdMe*+vri! 
zYOY#hGA*I`sw%X=pKuhvutVUf^AD%gKJ~Zw1X;x-vnK>`ZFm~``qiL|$J3(2qjT-k zEukt`hxbZl#6nAmw~7RHGq)ShTVIhc-;6+)el3foE)8UJ-1^Zcb!Wi)YTb{#sogU7 z>padJMZV-7obO!7zm6B;7w;T(!ycNMXqVM-zOQ->j`cR-GGr$O!!+?g?Qx!Qy^F2@ zgREtsV`T;U->g0LWQd^76-uy_zIi2DXxOS!j$1uhql5+_(NY2<3(Si9-iP4 z5XKfC<6P<;57raaJ&xAd_!qrb|A00yA6nTH?M(Z$`e2u~ciKT}p}p4LWCvLvydvPv z?BFZq|JAqNI_mA`O|m!neEyMYa9~cLwC<}sI?8Ghm}J+`J=J%PR{zta4;v?3lm2?=V!*mJr z{3(7ET|d+7s-B(c)4$Eb+uk1u!I;a=m2p(n);l;vybxn)2sNhL^xWu7$1}BYPv!`U zfRk`hPSq84WB7!jkRPUD9twuPbraQECqh<#aqlkipRaWKOy>_=0_Ui2W=!HfXQC>V zdCQ^n^|Uz0OEo0ph|xCeyq$#WWe4Y?KwXSJHYy2u@5GtQZIOl@~gKtH!@VSjuKd8s9J!7J2(V#S!?t7aAXtWtE&%%B6gr$;>x z-ykc?-Nse-KDTME$83~pmvoMGmUqWTAITPy{j^$QmI`kBU(pkVE`s$1Ho+q?NPL## zU@<3v13F?Vw1)l=%emkzJc56r4H~eX8$lJ=0CQm#mcSl3A2#wiKEUBL7yp4o-zcgd zoQD<~!R{<(b@5XE3_rLjwB<_d(0;u_-Lya04LBN-t%2O1U*LQRVk&nQ%|$zv8!y4@ zz!;-jv&;TV>ga00eEzNqgUO@ClR{hVjgVkpMoSN0Os8_MHqI&BtmbaR~XXHkhX z5q{tnaT@YN5KS}}rY-wc4|-E?yO8z4y)?8C& z8GIhQ+Y7*ndGr=}PY6}a3^*obZ7Xy5pWBTy&L{p!o9R845g{7;R<*QQJ|p@jKb5=Pz>+EemzN#)?4jEl!zzf8ubgd(p9+~{0mB4;KsN} zRK$LIhYGU}*i|7E`ireLz+!(DaZL0#9;t%-ozJSfJd7*z9nQryDG%1i1oc%n<}>yK z`>oYndDL4a0>SnYYh@rk@JrynYGG}(K3jz?4(zmYSxJE^>Y2UHTBmE+jnpCgjFr`n z0m%+r$NgXdZi3^m2^;8TdLh){GSCxuVHMZ{=^Viac@+1BzVH?%@EHKij}3V?NL~RC zxi)@Yo0(&=dYDQ|kR9rwLt##6g8{KF+V zFPG*gDlEuKoc~~9#6a4{wLKM`ePm4J_$&t<1+wf4SCQYsw}%8}>mF7oxS!mVfWBnc z)5_yqkpKW707*naRMh|c8GIAQayM87H}D`;6#sJ+cD{SO<8$7$`u;@gdwfl(DeF5Y z`rEoF;+1lTe9KzQd(=@SWE@JzFmVnTk1d>nSi|3KvByl>ZaK9q3%eC9sWt*242(BdI&r-CR;4a7U@ z&IDhKz!aRLPP7Aaab9obZ(M|&)D4~ZCa;HwR?`FoqZ~fsA8tY~ja?Z($lAf-=q(@G zcgU_{2wfLFd5=1*=X<7`Wvv$0YjdaPAQJUwB-u{Ljygl8%E_J{Sc@3zEXq(@JwVmc zso2K#Fa)pkS6zjcs%O>_-=5z7pZaA_^*&ZLRH=wMq4#Mk6~rzvmsja{vC*=7vYm_N zZ>p1U`2Fnq*UZ90X94jeW=Q4{n5pWS4&6sBzy~Ub_3~#CfKAxX>JGgCZKe){@xohx0Sn4Y(ignl>{I78fc`Nm__xM~!bme5nQGHDf zuzkv^=N^|kCT)ZJxqEl|S*w{n=f@*?H%D|b{kV`Cnc2zgsxRkwRHXJ*l|&CC;d^KC z+9%Zg%1Y6_bUjCgGdaAKwJrzV>EH5wuzkPcx{%M`GmQ7?jeJG~h{^m#pI2YRPFn?I zuX=;~^o+}hyvF6AAAR2Y6!iD=w0^bP%VHtAiIrk14HNw+n^wzO*evSv2_*1rF%|>J 
zq9fQsrKuiO1CgTAw1awJAQ#q(XK;Pp565|*cBx`kkTkg#yQq>VB2TLgvOiv#eJO$3 z;R4R{M5C7)Wcx);wl}2jVxVr%uXPok#F7KCp8tl$4bdI@c@JJI(sk4{tC(tK4OVYe zasA!o51A_O4AJ@fY17}Trimvezx`O?boE?mGB;>5g`I{=z8lH(yc3)WvRdIqc zkf6_CA;)=J=%0K)Jg(&UoWGLyVv$UH4cr&6MP3}zD{%yLDSx8MuS7K?F5OyLQ1MCKBSePRLPxv9f6KBH@ybx{p`OIBzV zdo*N{gN+gT41Yrm|EXr{9Oa|Cm{qL~YLT*WmqM>fFD%?|&x{MUXGLB{ex2oMhL7S$ zDl96X9(BiAOu+^^M3X56C2Fm!MAip&q6Ztcu>Byx(;Tb=cU27`kEgA}Ql?&Wn7q zmbFxz71@|i;q;dNhts&g4(kI$pN4K$1LYgsb5Q&88utqKha-0YZ{-pyl~ zJ+pn)4mH)iMhD=!HIbvSKG$@daKz~zJW30a9$kts>rwy}_Z0B`5U^AK zq+*E^ej@tx~$dOSo%3HjP+Z|@R1POLQBo437{Gj7_(i~Djr4H3!W zu6RX06fC;nDSeSWNr_)69L*>g-S9g+ln1rwFz)DCNE0)t7_H!Q*d!)!0K1s=S^Y^3 zM6_;<$$GDNsQ1u%j#5R;+Ip-0)4s|G;)2pE@2fbQ11FFoCyN~G4gE?7=>cZZUt%ri zqCMAELsetl-@0zyv!EmNMKxBPP;=E%YlgRr=V@*w_voCLR$228^MKjN`;(`)8fTR= ze^>4FIYoM_zGU6xtE#x}tG4Res-l&qcIaQNSGo;9*6Z{FWO9A&#ddDNncM_Z&=nVG zINd>6ZiXhj1S7Eq)#w*01vg3|9H58#9^zF!YY4vrlTsUnUmsg_M;ko^gMCN2C#(Lx zbFFSP)Mi6v8EESv_ZoNXdu$_ZpPX|%c@$wy`f|@<Gtt|M zWyLm(s1jHsYO=3gEzCPSBp_f%nW8^rm%0%5u_%`qZtV0834A7_Z8k@wTyE@lw4;Z{ zbEC1e%dzs5EGcUmwZsvlw6WKyX6sm2DXv%w=8t?(bVVGW z(fQ5a-3jUix6{5{m;3TTk%2*2zb~G#+!HK>amvwEwZ{&kN#W$PB@wa;ueQ*W& z6_qoQ<-cF&m}PSJdC$4KBZyz|KpHQ!bn+?P15Pv+t>q>p+hXnaV8}}L-vVZSeEjfk zcHyvM1-in8T>KBoVx!Ivw~nwjyXwgnNGj_#wtQ`1lr>-8)X1EL z-nZ@>da3=7QqJfmp>y~n>ZvVeCa0?1?qt0Pt393cQ6!qr5hM?K+KLv&GNZoZqqA1X z2`xOH+?uJ+-;B-7b#4!NCTpT4JrOTyKMd^S#nyK5R=mUL9-Z&2z8qr0}o1^F$Qo7P|KzXBn1LT~*muq9XOlYGs`=i<*zk_TIJLc4nlx z+&j_xNA45PbZ<@fWA|#aquc43V*Zoc!W-f3;l6GTF(12|dfS;N-21(I%#UukUwiiC z6md8649U4}W?D(!uU2i#hcEGxcBUlUYfiH>Be9Nqop$ z=U#4Zx|9BD-`n|)jk1g=6ZJNh7{0Px9!rmtT`oALp+2 zoWxPFR-O^_4Pf=I-Zq*pmDMp@8O?hHN@~1d4w*~Hmf$hwJ-DO!rgeB zXpFVA1;fB-C|ZgnQB9mcU#;Xlv5|@wX%h0UPv4q<6#FUo7gsCiO2Hk z-obcd^S9YTUEwu-ZMOH0zk@e@zr>rJktZKNhriLN{`FSlj z;by3hyfhoXa4s+BS7IpbpyGUk=kR9KP@;WyL1Vzz};{e19e%BL>%`A@Pr?u0%EY2V&LR%dLi@9P6 z?&A}x>Yeb%4m_dqVt{8MFT|9$Mw*d+|6TR3)xUSaEcCwlG(I>e! 
zaC!3N%U2PiV@;VeHR!hjUum+=k4ADRb*3e~tXn}ilPHsbY+U&DljNMO7Ci}G9R+rYoD z?+H24RmFdutDfg4dD>_3z77}u_wENX7cJ>u$`-zKiYlTr)#o@|tNrInX1<4Xj`eQTh;+v zm#;^%ws~%S%9Fj??Q}1o{f>w-f%%$6-PO%?4kmkhTZ4FLZWF6M-Sza=Wx0teh_Y}Q zzdL97ML7G?VYxrTtv7iqe)0LV^7$xFAKS0~yTt-=)u_aO>$Y4MRpdEZuQyVD+XpHw zYKkhVhf&gQ^KWW9;tUIA^V8cLIwEV@MsP0orrL#0rtf2Mq!3d0@MHDWA;X^I48;!Xq z&q4y0SvAyI%ZW)`QkO8Z^f=K@UKNjIKQTuXBCwy1$2qlH@3hu%S1y7-c#^Kfo3Ki) zSFx(1mEswqzUb;!CeKhU)B_!&Z>br2qAsK)+j$NgXoFC`!!5Wr-?u*VM2^&#*u`ae zC;!e}^#`4ed6oh$yW$2Q4pRpn&vE>_s4kA9KNcYi)##xph+**1 zC7LD%iO+Og%%S3VY8WmGmIvUVrx=ROG?mi03m4?lx-7ytUQQGr#CI&=1XLFdL<12( zTREA}^H?#Ls!%&>N3E$UeWDk&`Qy`eYQLszMt$is(uJU0O77;d2lwjzdta0JhA z63az3^x|IJn#*tycci0KmhP$zh}T{$Kv_)qHZX8+g;Fo3=5*2tVv2jf_jziGY0oZB z4|EUJ87Aotjlzue^5Ylz+_I@}kCU2w z?;Lz6X1`!D*;Y&M(WUe};~n+m3SyYpMlZdW%$n*SdpR^Xa9S@Gv}7dgDmk%InBBwWQCIB}LwL5VRmiVk2O z*5fx0L?D>2u}dF899p0~me2z#&U^WHxTz@pK(jLqXNBlAJ)ef!E^-3j^1kH=vtjN! z`c0oVJ936@rt0%o9>71+MbVkcsh?3v4AEOSSI@&WIK^@ujP3M}3aETsiDS%H)*4@X zPU^S)z5(9L-=3!?;G^xG9O)z%oNZ)=JZ}s|QyuJol{V>NvIlqLEB1}X1vw#iw7tDg zaBhOAlsPq@SJj!R`qU1Q z&YCXk`>#)0Uj9urX3Aattm3nPUKovR zd5RVDN!R6!?zu%XJJNgchbYIU-o}@(8WpjCn^HGal$Ys<*rfmBP~O40NW@Sbz#TbM z`)V(LfD@(Zf7}aQ*+d&8qptWYh8c(XN3=sn>MAOdo63_9_1CYk1V_;X3COSSp_5(< zKa}FG`hxzZ(r7M7kJCr=Aoa7{D7T4Wsxi0J^)V9b`6OQJ&U_nt^+yi1)|h>*E!IG5 zl3Bv4Wu{nP*>3)0ozXVyv31glC1bTf0JU(#hcu}f^(GP<#R+jhe4r2XL3|L? 
z#a8hzH5LPC0!^TWRGf-ab$L)8l%~il^2&4ckRH;nVuV4UR1W0(6lQCSBs8RjbVmn!VuQDTXne7tcZcJe zudrWo^^BO#iQWix!nU6diQjaHe#_&~9YM6;3R5rC5BBr8qlVdhXrlaPzbd-Qk9IAl z%lk$9hIaJZQ~uxB*pOyXI|C2;UB@n3C?5Gt@F9BxHv}AZj z-aX;19`t*?D|^H9xGzJ!!>Ba*ivi+a+AnX5AE~@JK*7|AQs@nBptIDB+ERT?qiHma zic=mch#-BNe@7TUfFW9l2vL;wP#^IRl|~ROQJA_TM%(QR1O9S5eHP{akMfKB#t1w& z*ST+a%9-tQ+nAB+g*vNmi!QLZiR?n3v=I48>BHE}&#hR_xSMv z?)OMBQj9?XW3sQlx0&)bocz@U<$MQwA8s>UhXdrF`kMlsz?y*Hvz0ktU{?|6=F zqN{&E=d9;Oy8p$`l|5^$1f5ss)sJLfKBuebcI@Dpa-Q!wzmsaC40Xh*LgpwHj&!w9 z<+K0&ZvL&R*#m;6Z(RHN;D=4dAmh62FBvT(Wt{D)ZHeu@>}sqxrWxnOA|p+7k?mwVxlD$L z_Hf|=cG4{|MC22Dgch}FyLgY%=qtWc0ADd+{D^v@gy>5J#Z%c&d=f`&H-(mwwmafi zam3ipO=zK5OT|cu#hi(*NJA`t%}sP(?=36VO0rV)JayK(qpDktEvRNz zEz8o~t@);(-e|H_OW)>8s+ta0f9a{Zq19Wz)F-UN)@aKxqm@S=Q15x3KCF-Gy?lc2 zQwDXyZO-NGn1jB&nOopFT)Y}Lki@(6d3poz5u_stm(htgBL-^_PowDzwL}bcLmKX2 zDbgs4{AoPxqT&>c-eM^RVT!ni;(`{U5p^dI9mi-nQwUL)decSPBF@oZ@s}JfLS%V) zU!Iib;USMGDE5c|5kVKJ2-)Z=U8O=8M5mF!2`I~}=pj9%i=;@=3)(?Du!Anr7Me@F zQ6KrZAvffj=!R|>O+#rY4W-f;gb|b>>WF)Kk1^gaGwHCSWclXFoj*7DIFR?c3ucYx zT~?5J4E^{F>Y;+IqD`TKvyJ^6>dE0U6EQeO!Dxa>$U+kTEe^mkT#{YK-tW2o(1>z( z$S9O^F4$8b&THrPD8?VvN^2YWV~HMu;rbv)>!GT%e#{MZ5sdOX$YngUjD@!S^p9=7 zXigb%Jws0TB-MIdY<$?hnB)54lKHP5pdXnv=?#vViPVflu{pI4YWp#c~PmpsB;5Ct5DKM3>Q(eGbzX z{tvJGehSR}TFL6b?bRM0u1eDmeZWc=Z}A!~SG+WhbaBaNO5k;UTGv;*;nHRF5o2%m z@iYrA^HgrG^~9ABWWgXCS3)ecQX}zz7IDqEsZ+IAt`WHzGtF6)%l6V*%}!@wPdnWTb5~B~oL#UaS;eW2W3|j4;Azuy`p#cpU2C zH9f=xv=nbq47KPDGc55O7sLnQWrhpWd6}L>PkECb#nX71I>*~oKDAZfQ;YOV-BI7O zmTF;@^^D~M)(Uf{eyfL?t8@uH&l;)T=pt5Gl}}$&BXoXkS1ZluR#WSWyT6re#aO%a zD7}pT;bGpIYRpe6jd3|!n0b^M# z?eu$!r8p|jgD8w%AeOsP8-0p?Mo)AQRgGf^)W3)iNauz$MTBHG|D5GNHdWrgl=IXT z77`&|7`<(K*lW2gL(J9Ztw1c%5h9=bCi0x)bUXv zv;7X$J5^$9<#`p?=^?%6KXHnC@tA5&xGF@)eBS%x!pd+&cfF@gQv37*~V z&nS!w)Cwo%Em_F-(wB0tcV%A>pA}mZYsEA4!VTF~Vkup3+&AQ(tQ(<^ zDoD0#x!iu=_f_7!uA3p*K`lb5&pti&Z^cSEZQ=T-|b;T~xN!-_k0{{A@+*+Llu-F%MZ6EU4OAsK(ac zI)n#tAb-O(E{t@{(O=O5={%cP;R@#B8){)Y9w43P@HC_&oj$>Z)%*>|(2HImmdes` 
z+~6(L4N2^WG8ib{Py}70XCi`T&}7`g3Z8>C+Yv;Av5)G}1~lS^A|GPJ1pKL=BZ--rS^P)cDW90gtEjqc zh|Tl^okbGerT)}kyyXJ=5!O>Ho{tnE&{W*Y4fMGg-|2Q{)-K;ISt~NKKleok-NmYg z8)ASo;X*PkL^6-%QTT-mAc`8{0hLA&mWvs9z*1`jQb^+_!S&xAJwJ~9XdnOK z{Th%zDiu%g8ZOn$Q!Qt|IER6pDt2-%M`E9~TDLZn%#+p{pSwIt*?c18WBu`m26=-0 zkJo8k;#}CBu%MtuzFU2Y`{wvo_Ic>j-RW|D@^9<+yWbF3vcpfdF@kKL!ZY%;igN4j zbXxSd;<;b;WBZSlzUNrQxGQ(mdCA6`*bm8wLsd9&7cnS8ELLn3yG8q`?alcDlCbrzRIIq{V)iU=x>#^P&o!ih|%Iy*toht(N`bCOIvV*fLRi=DC;}K<*vSrXiYlq)X zqb2RZoS?~32a|WHDzc>hQy);}XobFwS)4$nMT9fmzgfs0j|UZuTChZU+hN@2I8oj< z(TdG&NY~6)neEjjze+))@q`**D`+Ul()t5&KV7ePFY z1#nFC$1hwKF6_iID#(Z6q!9?>ZFq;H{0%`cD2x)2jtevz7K+gu7!*$n@PJCwBa{+r zs0$v5%i;jKixpxp6*qnq5u&^;SXMRW8$t4f94)vncg61<%5C^ZT}Kz*-~vhbNrMM|lsQ;YcpP(fnBV)f07dwL(|Yw{#8{zy+4*#gn)aCYiT&9qr{@ z4&;8!ybhBPqkXXiv3f1mQ+1p|HEK+MiN;o}u^`~z`>VKSd{D}j)NLt)skApDw-WuL zw(vM)slvJkt*58-l#ihroy95IFZ%Ex@|R^$3Y*0_0FmMm4Dp9-@A&a^!>3OjC(~Od zhi6?6-jXkmI;(T2yZFq}xQ`U~Vb2@GprJ;Ry*x`qsmW%L{?6yM=0M+$aGLGP#YA5T zs9ndm*!%Dek*NXm{0=y~*(ch)_TL<-wo8uIK3nWLVJ#vK1h(-hZ2xXs6OiBkxPNU| z&G6;Xw@&4|67_Q9wclR+_5B`>VVd|)d=_tw)p$U2$wT3^ROA-P&N+HQd&bR$VPocMLJGb=`g*Z-L%BGAic7iup5@WM()Xg^95^v z|7@k`t+~V8M#g?`bt~4IW=+RyJkeX}6m8+*;;VSZ_t6h2+?+DRY#}g#Gvy2QR{;`2lzg&=O~)@g!{C9!P^kSE#O2c%BFL!My?Bi+j3J$**u&ehRC|eXH`^N z;4)4-8_A#FT)q8g*5FcrhH_O{JWH)WtUkhBD{Z#b&vIfiOIz`;Yt8%) zzsxgP>EA=rise+vb$GsDrjM<3R&>M2o#HLEqO-h>Dzn49u8XikZQxm0owLsJsOtI! 
zn#ol@9@}we9-kFzN^aHk#vlFOc)hK;rz~lv+9w6H5+(SFj*!uG37u%OxFufd3EYpa z>U-j|w=%!$C?8_zC$<1(NqkhAGkf*q8xP=Z)GYD(j2m>H@u8n(kby-EaLHK$%g>6L0@@?!3ZE8-U25vAz`SK?Hf%7gd@O~nE{ z)W2~*Jm6?LS=^7d;4Cd5e=29NAN#Q%^}0*!PE72~E1TFQPgxyd2&NK9WRva#+*Q~~nP0OK{4+jWy0|gBwogVtN19~2=eN^# zU6oY}jjr0wd%OuaWp)w+DapQ7m9=)N!DcyYf~q2m>RN16e-ksxh^pPY=-rT?LWTq_ z@$WA_$(pjN(blowwx1WEyIk&{H=wA?hfbrH1?Mr>L%)xK7bDI_OxkrjLga76OoXAD>J@TQM$ixJmfM1Z#uS8E8D{k>4 zUdx$sK8+ND+n_(+;6=-^QmOr#Dul_k=D& z)6h~aS1WigOOE2bya>;+oR^5IVh^n_V#Lo_jwL)3;apWuH}+D3>}GcGT+Nw^KUIG{ zD>&U<>BCfWwYPdMsePf@5z|E$&tXf3vLA1=ZRDytOLpdVCWk(EHnY`n9}2w@SLf9d z^D>>|2dcJS1*iI|ZN6umm3+r}mQt|ojK|<7JP^;dmuI1%(a~O7G|A2Ua#3_O*Jc${ zkDNCP%t12Mw?@Dcn>ZRB*g)-|Jhl+qyozCtOl|ze?`{`e&bV2=1uRFQ`);Fr;Inpl-4H;m=|LBEONn9et;Eh_25a;4ZUCs?KMM69$%agy-rwf1w)lLfEX zF6J!bXt&$DTrK4;`YCqnZRkOT&<2lD2Jf)WoTqlHjp`V#Ad|Za9rw8n-GdJvpgk;Y#|)Gd^LZw& z^9x?g+f`NDWbwN(O`qpHij=TE@fZkH!dLXfd|JuVwbl)3mu}1?8t^3~@LS%FuDpZK z@jyI5J`Tp;+yph1*5mX;9l>RAiBhnf25<@&Kp4;BI=q>?V*?)H1^_4D$9kmkdai=Y z$f8#8#}%p!11C8VE|f+LfM#MSI&fv`Pa~+Oc!ch_OS9qTk2D#{FzGa{=2SX`>Us;X7y?5W%VOLH>Zh?$nQ5{*_hr;=lDo2BPt)5Dg(GmsNWCw4f?KDUX5I=EG z@f+gRRFNNsm4+mMA7g=kc=D2KE5#IZc2=_P9=tT>7k!ZTpq}?%E6lR2Q@WvEWUXS8 zYT>wUtp-^2)MV=plGPh!3#(~+PpccHm6#h+BaFk6LUQ;=swisETwAa`LJiOpaYeSb z+2o(%tLUN6**x~O#%sTSf=2{Q-B#vg%LiFW-BQnFpT$r*D~C~Sx+$B|Q}I;B8D+&c zSwOas^JS)(DqhO+Vv(pL=gDm{RNgn17_W^4dBx~plroyi=SFw?Fk`#TCN7Ie(NUVB zkgOrk&nMlXU$LQq_%K56jxsKd=j#=)QbjuN^Tj#`O z>@(6F6Q#p&IsQYq*du?|1NpM`1ki0%hLOa#brt_DVdJB#zUVJPf(xonY9YNrkjlfq zQxYIr%iFfjj>YbHtAdq++Z=}!3X`7o`_50SW&fju9o89fLmd0HFIi+LrC5ESY zXPC3Cq|9sX{bo@$l;?Ab`AVPQ$bf~mkyP4Bqr1j+N2;y9j0<>Y9MS`_u73`BHOQN3 zMyrZyw%@m~WL*eLxsv{Y+EE^TiSp0_e8f$3K$g9-ePu|TfBk^`hUG2&;k5Ou?yOq# z?|L1|@;mfJ3QnQ|3~QgR!!7hVM9~s!sqV`iRd4-W*U*{=;;^|uFV^d<7*xV|^8!0) zmJWbReBk@Mn64rVHvo)=lUEbL&38mYv6Y^Rc~qU(%K_Am&Ke0~hA3oArdBl1(No@+ zo$TA>aM{FuKtzb&?Y(7)7;7XGi7nEXt73wjgisz>>>+mA{58yJ$LJ%E- zKT>c3-{9iqc!k1r5lOf&dmt7mG>l>qMq_CW8c{Gh(GfX}N{f*qFIA^bG=a8o6`sUR 
znCU(3Qh9hZA2IkoN9)=885(geHKSi}fv4&!x;a(EZMw?u;NZpTi)w6z;GEbbB4F@K zu1CMoA5?>;(KMQ-2kL=(AkN_&TG4Gx#1&McqPU1;WT7LwI38KlAJ;IR%26T>$lWd` z`ggn@<6V#^?bWt{7l!%1Hb4$)riMB)RI5Rd2%X^6@u;rjtVIk zu+?$WmM(vAgt}sV79MDRaX=D%bY}gnQe_(vW86gm9Y;646(#wwyo}4Rb94TT>l}f1 zyp!U@Yl`CQT!0tgHNtoqVvvZ<`iQ>82ha;;nXv%zNT4PdNTqZQ-9w#0JRKMDe2}w{ zpo1NM2cF5fX7dk=$>{L)?l$HG(>d6JoD; zCWQS5jHjW(Qg6*c>VWmcvnI1^=IWdu@RRTHuzRk)9}2#>oQ13|S$km%4$8Mf?51|& zg099xQC99#;pRb`A=~L95g&ce_)JLp+rL9>-FGL=3`B89-CsAPUNnd%$<_9$j`Nv6 zSzap$v#6)+h|+4cdJ3m;+-Vu3z68A;=RK_Rs&dwJ-`WA~@qiX;CtOrj@2A!B5v_?@ zBk$vQjmP$%QVN+REwVTNZudSC$DZ_0Zk2Z9`#iHOcZ(icpml-2ZC6De+Ml_>b550Z z7w350$2yMh)RfPn8FV!VKpw{SAez*TP*{U2AwDsf6k(@zd})DGTb-)au_lr#J0Hm5%5Oz+VY9l0qY zxg%ZCZ;*m9W=B0tKQ&LGE`Q>sR2Fmafs4^XF3b*AT%N*ci>`uvbc;G;8OEaq9wI}& zp_xd56Sep`ORmJeSbz@P6y8a^Z3Yt2&Q=`qvz>)oC^Sm zqhi#bQcxJz;lu)N2sgjO9sEhYw18@(8cy;BzJUI;@NE}|t!UhH-_Khz59;647`2za zs;A~QTFQ&{S)9fKe52VSfYRBbLDYaYW2E?(EAT8@k3-x})IpHGB$psX6rt~a4d0i( zxLbQ{(;el6@Tni%fqhjQb%Fy`py{xGuzoYgsQOfp_BedVhcmegPt?b`3Kvxy{07Ma z)U|5Y=xUNthGOM1&kx=R)x#F#xMZ6}arB=5bhL47Hs1KZ@PFv@#{Re8eP@HctIuA1 z9{YYu<_OHO^|Q6IE$5Rw4%P6HQV?W#u!2sY7*Eytxhf@6hTg$Zd`yS38$pm+w4oCAv-pC$q$lcKvZFvr->YW_Q_xZA@CMt`8*pBrahqE|Ep6tcGIeEsUnx0q< zZy)Q0w>|xjZ}DDzRR5<7dxie0s_F`KSmby^=?lHIy3$=5>9d*Ez$OGnVv9XOX3AU1 zgTBm8`;SYyV)IAX-gx1i9avTO&-j#nML!e&F9==$YCiiofPa6_^FOC@N_Q1CLvyPM zv5G~^bq%@Sms!K+vbk*9YG__DY}1%)8#BC_ks{{yvMJZeB~dOBAr@WF>7360|NP$1 zXN&4?Ds%JWDtd$%{V%@6r_9-ENK@x#Cqw|Z=3M^??Pk16=M)!L6{cD^Pmo4KFj;P< zzpz^mmSdz9nLHXJQOV(IizqjJ=QfeU8%DhP)2;T=!5e+-@3m|ZS_z1^weZW?dzZf%|rM;cPl?xewG(H zS36p%>%P|4{-D)xN2C1S2r(*&OosL+-a;0I=&xwG7^rPB4{_b^ori>4AALPmDSzoh z&&4+a zrPC!Wz;%SG0{+7A#w}ig8={I%Q6oM>C@zzMxfn+`Xb71^saPoD4=gZZWgpJezegs2!U5bGUg(N^dcj|^QzgTTPSODAc*0qDonjG+?RXpa z@B#d30FF{DSHYha!5xA43nf%b1};(`QHSRuQ}~PK@(;N~{tCNDLmoC_lt>kKu!Gvt zOCCgH1$iKga%74aPRIEfp5Ymdr2+8>b~bA-H@Ui+3x!qNAi9|PyZqO@K*Y=KqL1&Iz*T-jTt<7Oxw#!4 zc>e0c)TedUsn{T9XxHRY8Y?%++ms_CMXv0tRcOidTwW6IivO8@HeHlqnwP0SuF;}R 
zSF|0Pre$efTB!LKxkeUg(Q=L!r^RW}rc!O4shhS%yQZzvE@lmp`~gmTE4J} zC9)mdO

EYCvwD5idH|4z0On4C6#2h<1wB6vVk4&x`pira?2lqKmwcBgF!Q%HOFc zrooOYR8Ga(6dG1vRQKq`Mz!26$Ty0k#dj{A9&>9pl{GaUwSHyW)G*&N%fFnii;1F` z6U0zlr5~vm#%L?C&H1?bVvpM9Yo7k@+N0%-I$`YQ!D=~=Mgn<@?sBoTxNULYgwsYD zW>GS}L#9YmQ;qr5qA}MR19#_2TQ5guGR=6k+WxKR|8V~E)PMImywFjM4PDn}K--D#>okzj)Wztfifv2u z5~G*nj9QJOj#{IHmn$E1rGY-_9)=8ay{qkV?P<@eA6@OK((Ar^`EBD-QE0irV@y`} zB=a%Rg}QS%#i|ziK)%C^F%aI^p@#~ysiTk0VsQ(h0sYPwOYpF)zc%E-VP{jlyW z&Kbv4e;H@w!cWGr3wC^oQuIw($co(@3VZ;y9L476$NTbzF@@1GmFzxRL>P8iQ{UOi4HeoyswlmLU)& z2t+IfQW9>XEBsI)hGH&;kOK+gfT$BaWq`JUa<$p=h zvhDdXjkn!*1^fiN-0oVaUa^^3mWomIk+V>B({HJ9BHX2rcRD9J&f+un;RbEJy1|pp zCFF^rudQ+)f@fYUJaSyxIrclq5$(~b`5@0DqFA)x1ufohdCt$LdA9Ft&jknWow45! zdG@Sf3MWc;ZLl_tw{swC^f3;@or@3$D?Y?*%%q8Qo$@%IQ*aYIP=Poka6B_EQ%f-k z2Vv#yRDca=Krj#Gk(j_myq+)chdfOU=QteXc%`XsDxG~*duk;zMX0UJ>=L-=v9vu> z(MB7+r@oAB9Kyc3U5|IZqh3LBJ>n@Dqo|5L!4>4Icw;WsAxvD7g{BYWP$S>Iqw%t1 ztUiSYi?^D^di+G^ZDp^d+eO>`%0msV&Ispi)!w(lv&=k(LNP*{g=1VQ%>27^v&AAU zJ4pD`60-ISH|Qa{!d#h&enjuO6RoN)=$CYTYlXE6W(1$9Y8K~(qku{K>T z?`FA6oAsB~Zt8@4Vx@act-b8uM(ayky!y&*qEAx9kyiWsysZU}yrvnA51g+03ybl|Nbi+-Zjsv{Qg7GA@L_!BI{GIAGBF&`dCmG;w9Jj7P4WedC@;oxIv%Vw_NjVjN$$ua7CUd^jjYone!aA&ofJ~oc4 z9G<5}@(*GdZ$YwrEDp*HK1j*f$?23VEb2>MPG6|^@WcqBaPo_yS|J z6e^?c>XvL2%?xkD5Pyq7dK-0Ae9m{dH9epn;wm5I5?+GM@IpK#@CMGu89c;Ieuyes zMF}WE0q)@qng>5L&?$uDpqxQ<;-!2@^W;vEF5flzYkfpMO36VEgrPN^Aa68KIH0!S zz9w#ncSRY-QwJJ>vmC&!cr+)nCpYKjY(Xn*#Sh3AKG3;0TzE1Pc{s=MW8BAe8bnWN zH##GdenlT_#smtZcIeGVaRa9*SNPH>q#~J?KH4k(`?`9ty5V=(2{HN-dq1S=Ynu*X z5wmLIXt9+X&?rD&=C#U=&tVna62ZIym54sbxplV!~>8dt5uM(UgS|MT8djZC2SEN zwo3PS;>^?cs3y0amHF9Yqq>Lxa6Q7Qfaddb+@t}hqJCJ*JNX1hP#_$1ojTzeof9u81HbYi z{*B*(Kl1SbuHjvLWBg|17&k;;j>jDJj!H7-W41bn>0BtfnA=oM(N4C=ej#kf>#H3j zjA(tD8l+!w9*{?o%#oZ&ndqt_=tn9w&hZJhU<&cPEna&61>D6J4A_94e<;lp#$$;JM^xR{bQm&5d~g00001V?cQr5x38s`NMwkoA(?_Ctp1X0M8;dIRF3v literal 0 HcmV?d00001 diff --git a/apps/images/matrix_7000.mat b/apps/images/matrix_7000.mat new file mode 100644 index 0000000000000000000000000000000000000000..067c80bac375f367c5b6ecca8d0ccaa0dfbffbf7 GIT 
binary patch literal 248 zcmeZu4DoSvQZUssQ1EpO(M`+DN!3v(D9TSMO-@ZwNUBuuNX*GhNmZZ)U|{t01VdFlx}aJxO-cVwfU sT}zzIA+;xm_nnw=+WyPmkbO^!P9HpWXY1a@j5hl}%L_SZtz2*r0Jah}ng9R* literal 0 HcmV?d00001 From ebeccac0253586571f2bd6bcc157da6116438308 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 9 Sep 2022 01:49:29 -0400 Subject: [PATCH 27/63] add prefix to build targets --- .../anderson2021/CMakeLists.txt | 40 ++++++++++++------- .../anderson2021/DefaultCostModel.cpp | 8 ++-- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt index f6db09312582..7a82d3688431 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -19,13 +19,15 @@ add_custom_command(OUTPUT ${WF_CPP} # cost_model, train_cost_model add_executable(anderson2021-cost_model.generator cost_model_generator.cpp) -target_link_libraries(cost_model.generator PRIVATE Halide::Halide Halide::Generator) +target_link_libraries(anderson2021-cost_model.generator PRIVATE Halide::Halide Halide::Generator) -add_halide_library(anderson2021-cost_model FROM cost_model.generator +add_halide_library(anderson2021_cost_model FROM anderson2021-cost_model.generator + GENERATOR cost_model TARGETS cmake) -add_halide_library(anderson2021-train_cost_model FROM cost_model.generator +add_halide_library(anderson2021_train_cost_model FROM anderson2021-cost_model.generator + GENERATOR train_cost_model TARGETS cmake - USE_RUNTIME cost_model.runtime) + USE_RUNTIME anderson2021_cost_model.runtime) ## retrain_cost_model add_executable(anderson2021-retrain_cost_model @@ -33,7 +35,8 @@ add_executable(anderson2021-retrain_cost_model ${COMMON_DIR}/Weights.cpp retrain_cost_model.cpp ${WF_CPP}) -target_link_libraries(retrain_cost_model PRIVATE ASLog cost_model train_cost_model Halide::Halide Halide::Plugin) +target_link_libraries(anderson2021-retrain_cost_model PRIVATE ASLog 
anderson2021_cost_model + anderson2021_train_cost_model Halide::Halide Halide::Plugin) ### ## Main autoscheduler library @@ -52,7 +55,8 @@ add_autoscheduler(NAME Anderson2021 ${COMMON_DIR}/Weights.cpp ${WF_CPP}) -target_link_libraries(Halide_Anderson2021 PRIVATE ASLog ParamParser cost_model train_cost_model) +target_link_libraries(Halide_Anderson2021 PRIVATE ASLog ParamParser + anderson2021_cost_model anderson2021_train_cost_model) ## # Tests and demos @@ -62,15 +66,16 @@ target_link_libraries(Halide_Anderson2021 PRIVATE ASLog ParamParser cost_model t # ================================================================= add_executable(anderson2021-demo.generator ${COMMON_DIR}/demo_generator.cpp) -target_link_libraries(demo.generator PRIVATE Halide::Generator) +target_link_libraries(anderson2021-demo.generator PRIVATE Halide::Halide Halide::Generator) add_halide_library(anderson2021-demo FROM demo.generator + GENERATOR demo TARGETS cmake AUTOSCHEDULER Halide::Anderson2021 REGISTRATION DEMO_REGISTRATION_FILE) add_executable(anderson2021-demo_apps_autoscheduler ${DEMO_REGISTRATION_FILE}) -target_link_libraries(demo_apps_autoscheduler PRIVATE demo Halide::RunGenMain) +target_link_libraries(anderson2021-demo_apps_autoscheduler PRIVATE demo Halide::RunGenMain) add_test(NAME demo_apps_autoscheduler COMMAND demo_apps_autoscheduler --benchmarks=all --benchmark_min_time=1 --estimate_all) @@ -83,15 +88,18 @@ set_tests_properties(demo_apps_autoscheduler ## ================================================================= add_executable(anderson2021-included_schedule_file.generator ${COMMON_DIR}/included_schedule_file_generator.cpp) -target_link_libraries(included_schedule_file.generator PRIVATE Halide::Generator) +target_link_libraries(anderson2021-included_schedule_file.generator PRIVATE + Halide::Halide Halide::Generator) -add_halide_library(anderson2021-included_schedule_file FROM included_schedule_file.generator +add_halide_library(anderson2021-included_schedule_file FROM + 
anderson2021-included_schedule_file.generator + GENERATOR included_schedule_file TARGETS cmake AUTOSCHEDULER Halide::Anderson2021 REGISTRATION included_schedule_reg) add_executable(anderson2021-demo_included_schedule_file ${included_schedule_reg}) -target_link_libraries(demo_included_schedule_file PRIVATE included_schedule_file Halide::RunGenMain) +target_link_libraries(anderson2021-demo_included_schedule_file PRIVATE included_schedule_file Halide::RunGenMain) add_test(NAME demo_included_schedule_file COMMAND demo_included_schedule_file --benchmarks=all --benchmark_min_time=1 --estimate_all) @@ -108,17 +116,18 @@ set_tests_properties(demo_included_schedule_file add_executable(anderson2021-featurization_to_sample ${COMMON_DIR}/featurization_to_sample.cpp) add_executable(anderson2021-get_host_target ${COMMON_DIR}/get_host_target.cpp) -target_link_libraries(get_host_target PRIVATE Halide::Halide) +target_link_libraries(anderson2021-get_host_target PRIVATE Halide::Halide) add_executable(anderson2021-weightsdir_to_weightsfile ${COMMON_DIR}/weightsdir_to_weightsfile.cpp ${COMMON_DIR}/Weights.cpp) -target_link_libraries(weightsdir_to_weightsfile PRIVATE Halide::Runtime) +target_link_libraries(anderson2021-weightsdir_to_weightsfile PRIVATE Halide::Runtime) # ================================================================= # Smaller tests if (BUILD_SHARED_LIBS) add_executable(anderson2021-test_apps_autoscheduler test.cpp) - target_link_libraries(test_apps_autoscheduler PRIVATE Halide::Halide Halide::Tools ${CMAKE_DL_LIBS}) + target_link_libraries(anderson2021-test_apps_autoscheduler PRIVATE + Halide::Halide Halide::Tools ${CMAKE_DL_LIBS}) add_test(NAME test_apps_autoscheduler COMMAND test_apps_autoscheduler $) @@ -185,7 +194,8 @@ set_tests_properties(test_storage_strides LABELS Anderson2021 ENVIRONMENT "HL_TARGET=${Halide_TARGET}") -add_executable(anderson2021-test_thread_info test/thread_info.cpp LoopNest.cpp) +add_executable(anderson2021-test_thread_info 
test/thread_info.cpp LoopNest.cpp + FunctionDAG.cpp GPULoopInfo.cpp Tiling.cpp) target_link_libraries(anderson2021-test_thread_info PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) add_test(NAME test_thread_info COMMAND anderson2021-test_thread_info) diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.cpp b/src/autoschedulers/anderson2021/DefaultCostModel.cpp index b453c4789055..84462d4afc12 100644 --- a/src/autoschedulers/anderson2021/DefaultCostModel.cpp +++ b/src/autoschedulers/anderson2021/DefaultCostModel.cpp @@ -14,8 +14,8 @@ #include "DefaultCostModel.h" #include "HalideBuffer.h" #include "NetworkSize.h" -#include "cost_model.h" -#include "train_cost_model.h" +#include "anderson2021_cost_model.h" +#include "anderson2021_train_cost_model.h" // This is an embedded version of `baseline.weights`. // The embedding is done using binary2cpp. @@ -215,7 +215,7 @@ float DefaultCostModel::backprop(const Runtime::Buffer &true_runtim } } - int result = train_cost_model(num_stages, + int result = anderson2021_train_cost_model(num_stages, cursor, num_cores, batch_id, @@ -286,7 +286,7 @@ void DefaultCostModel::evaluate_costs() { auto loss = Runtime::Buffer::make_scalar(); - int result = cost_model(num_stages, + int result = anderson2021_cost_model(num_stages, cursor, num_cores, batch_id++, From e6fd31ab20066dc94ed9cd346524a8a540a137de Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 9 Sep 2022 01:55:14 -0400 Subject: [PATCH 28/63] steven's patch --- .../anderson2021/AutoSchedule.cpp | 349 +++++++++++------- .../anderson2021/AutoSchedule.h | 80 +--- .../anderson2021/CMakeLists.txt | 179 +++++---- src/autoschedulers/anderson2021/CostModel.h | 73 +++- .../anderson2021/DefaultCostModel.cpp | 10 +- .../anderson2021/DefaultCostModel.h | 2 +- src/autoschedulers/anderson2021/LoopNest.cpp | 126 +++---- src/autoschedulers/anderson2021/LoopNest.h | 22 +- .../anderson2021/SearchSpace.cpp | 54 ++- src/autoschedulers/anderson2021/SearchSpace.h | 6 +- 
src/autoschedulers/anderson2021/State.cpp | 68 ++-- src/autoschedulers/anderson2021/State.h | 18 +- .../anderson2021/check_weights.cpp | 10 +- .../anderson2021/test/bounds.cpp | 31 +- .../anderson2021/test/state.cpp | 3 +- .../anderson2021/test/storage_strides.cpp | 39 +- 16 files changed, 552 insertions(+), 518 deletions(-) diff --git a/src/autoschedulers/anderson2021/AutoSchedule.cpp b/src/autoschedulers/anderson2021/AutoSchedule.cpp index 640d21823d64..601b04f795f5 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.cpp +++ b/src/autoschedulers/anderson2021/AutoSchedule.cpp @@ -20,23 +20,28 @@ Environment variables used (directly or indirectly): - HL_BEAM_SIZE - Beam size to use in the beam search. Defaults to 32. Use 1 to get a greedy search instead. - - HL_CYOS - "Choose-your-own-schedule". If set to 1, lets you navigate the search tree by hand in the terminal. Whee! This is for debugging the autoscheduler. - - HL_FEATURE_FILE -> output - *** DEPRECATED *** use the 'featurization' output from Generator instead - Write out a training featurization for the selected schedule into this file. - Needs to be converted to a sample file with the runtime using featurization_to_sample before it can be used to train. + HL_DEBUG_AUTOSCHEDULE + If set, is used for the debug log level for auto-schedule generation (overriding the + value of HL_DEBUG_CODEGEN, if any). HL_PERMIT_FAILED_UNROLL Set to 1 to tell Halide not to freak out if we try to unroll a loop that doesn't have a constant extent. Should generally not be necessary, but sometimes the autoscheduler's model for what will and will not turn into a constant during lowering is inaccurate, because Halide isn't perfect at constant-folding. - HL_SCHEDULE_FILE - *** DEPRECATED *** use the 'schedule' output from Generator instead - Write out a human-and-machine readable block of scheduling source code for the selected schedule into this file. 
+#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + + Most of the settings in this Autoscheduler are controlled by the values specified via + an `autoscheduler.fieldname` GeneratorParam, as listed in the Anderson2021Params struct; + this is the preferred way to set these. + + For now, however, you can (instead) control these settings via env vars; + doing so requires that you compile all of Halide with HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + defined. (Note that this ability is deprecated, and likely to be removed in Halide 16.) + + That said, here are the (legacy) env vars you can still use when HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + is defined: + + HL_BEAM_SIZE + Beam size to use in the beam search. Defaults to 32. Use 1 to get a greedy search instead. HL_RANDOM_DROPOUT percent chance of accepting each state in the beam. Normalized by the number of decisions made, so 5 would be there's a 5 percent chance of never rejecting any states. @@ -51,10 +56,6 @@ HL_NO_SUBTILING If set to 1, limits the search space to that of Mullapudi et al. - HL_DEBUG_AUTOSCHEDULE - If set, is used for the debug log level for auto-schedule generation (overriding the - value of HL_DEBUG_CODEGEN, if any). - HL_SEARCH_SPACE_OPTIONS Allow/disallow search space options to be considered by the autoscheduler. Expects a string of four 0/1 values that allow/disallow the following options: compute root, inline, compute at the block level, compute at the thread level e.g. 1000 would allow compute root only @@ -63,10 +64,23 @@ If set, only a random subset of the generated tilings for each stage will be accepted into the beam HL_FREEZE_INLINE_COMPUTE_ROOT - If set, run a pre-pass where only compute_root and inline scheduling options are considered. The cheapest stages (according to the cost model) have these decisions 'frozen' for the remaining autoscheduling passes + If set, run a pre-pass where only compute_root and inline scheduling options are considered. 
+ The cheapest stages (according to the cost model) have these decisions 'frozen' for the remaining autoscheduling passes. + +#endif - TODO: expose these settings by adding some means to pass args to - generator plugins instead of environment vars. +#ifdef HALIDE_AUTOSCHEDULER_ALLOW_CYOS + + HL_CYOS + "Choose-your-own-schedule". + + If set to 1, lets you navigate the search tree by hand in the terminal. + Whee! This is for debugging the autoscheduler. Since it is generally only + for use by developers/maintainers of this autoscheduler, it defaults + to being omitted entirely unless you build Halide with HALIDE_AUTOSCHEDULER_ALLOW_CYOS defined. + Even then, you must *also* set the env var to 1 to make use of it. + +#endif */ #include #include @@ -103,64 +117,136 @@ namespace Halide { namespace Internal { namespace Autoscheduler { -struct Anderson2021Params { - /* Maximum level of parallelism available i.e. number of SMs on target GPU */ - int parallelism = 80; -}; - using std::string; using std::vector; -// Get the HL_RANDOM_DROPOUT environment variable. Purpose of this is described above. 
-double get_dropout_threshold() { - string random_dropout_str = get_env_variable("HL_RANDOM_DROPOUT"); - if (!random_dropout_str.empty()) { - return atof(random_dropout_str.c_str()); - } else { - return 100; +struct ProgressBar { + void set(double progress) { + if (!draw_progress_bar) { + return; + } + auto &os = aslog(ProgressBarLogLevel).get_ostream(); + counter++; + const int bits = 11; + if (counter & ((1 << bits) - 1)) { + return; + } + const int pos = (int)(progress * 78); + os << "["; + for (int j = 0; j < 78; j++) { + if (j < pos) { + os << "."; + } else if (j - 1 < pos) { + os << "/-\\|"[(counter >> bits) % 4]; + } else { + os << " "; + } + } + os << "]"; + for (int j = 0; j < 80; j++) { + os << "\b"; + } } + + void clear() { + if (counter) { + auto &os = aslog(ProgressBarLogLevel).get_ostream(); + for (int j = 0; j < 80; j++) { + os << " "; + } + for (int j = 0; j < 80; j++) { + os << "\b"; + } + } + } + +private: + uint32_t counter = 0; + static constexpr int ProgressBarLogLevel = 1; + const bool draw_progress_bar = isatty(2) && aslog::aslog_level() >= ProgressBarLogLevel; +}; + +// TODO: this is scary as heck, can we be sure all these references don't go stale? 
+struct AutoSchedule { + const FunctionDAG &dag; + Anderson2021Params params; + const Target ⌖ + const std::vector &outputs; + std::mt19937 &rng; + CostModel *cost_model; + Statistics &stats; + SearchSpace &search_space; + const LoopNestParser *partial_schedule; + + AutoSchedule(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target, + const std::vector &outputs, + std::mt19937 &rng, + CostModel *cost_model, + Statistics &stats, + SearchSpace &search_space, + const LoopNestParser *partial_schedule); + + bool use_partial_schedule() const { + return partial_schedule; + } + + IntrusivePtr optimal_schedule_pass(int beam_size, + int pass_idx, + int num_passes, + ProgressBar &tick, + std::unordered_set &permitted_hashes); + + // Performance coarse-to-fine beam search and return the best state found. + IntrusivePtr optimal_schedule(int beam_size); +}; + +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API +template +T get_scalar_env_var(const char *nm, T def = T()) { + auto str = get_env_variable(nm); + if (str.empty()) { + return def; + } + std::istringstream iss(str); + T t; + iss >> t; + user_assert(!iss.fail() && iss.get() == EOF) << "Unable to parse: " << str; + return t; } +#endif // Decide whether or not to drop a beam search state. Used for // randomly exploring the search tree for autotuning and to generate // training data. -bool random_dropout(std::mt19937 &rng, size_t num_decisions) { - static double random_dropout_threshold = std::max(0.0, get_dropout_threshold()); - if (random_dropout_threshold >= 100) { +bool random_dropout(const Anderson2021Params ¶ms, std::mt19937 &rng, size_t num_decisions) { + if (params.random_dropout >= 100) { return false; } // The random dropout threshold is the chance that we operate // entirely greedily and never discard anything. 
- double t = random_dropout_threshold; + double t = params.random_dropout; t /= 100; t = std::pow(t, 1.0f / num_decisions); t *= 100; - double r = rng() % 100; - bool drop_it = r >= t; + uint32_t r = rng(); + bool drop_it = (r % 100) >= t; return drop_it; } -// Get the HL_SEARCH_SPACE_OPTIONS environment variable. Described above -std::string get_search_space_options() { - std::string options = get_env_variable("HL_SEARCH_SPACE_OPTIONS"); - if (options.empty()) { - return "1111"; - } - return options; -} - // Configure a cost model to process a specific pipeline. void configure_pipeline_features(const FunctionDAG &dag, - int hardware_parallelism, + const Anderson2021Params ¶ms, CostModel *cost_model) { cost_model->reset(); - cost_model->set_pipeline_features(dag, hardware_parallelism); + cost_model->set_pipeline_features(dag, params); } AutoSchedule::AutoSchedule(const FunctionDAG &dag, - int hardware_parallelism, + const Anderson2021Params ¶ms, const Target &target, const std::vector &outputs, std::mt19937 &rng, @@ -168,8 +254,8 @@ AutoSchedule::AutoSchedule(const FunctionDAG &dag, Statistics &stats, SearchSpace &search_space, const LoopNestParser *partial_schedule) - : dag{dag}, hardware_parallelism{hardware_parallelism}, target{target}, outputs{outputs}, rng{rng}, cost_model{cost_model}, stats{stats}, search_space{search_space}, partial_schedule{partial_schedule} { - configure_pipeline_features(dag, hardware_parallelism, cost_model); + : dag{dag}, params{params}, target{target}, outputs{outputs}, rng{rng}, cost_model{cost_model}, stats{stats}, search_space{search_space}, partial_schedule{partial_schedule} { + configure_pipeline_features(dag, params, cost_model); } // A single pass of coarse-to-fine beam search. 
@@ -211,15 +297,17 @@ IntrusivePtr AutoSchedule::optimal_schedule_pass(int beam_size, q.emplace(std::move(s)); }; + std::unique_ptr target_loop_nest; + +#ifdef HALIDE_AUTOSCHEDULER_ALLOW_CYOS string cyos_str = get_env_variable("HL_CYOS"); string cyos_from_file_str = get_env_variable("HL_CYOS_FROM_FILE"); bool cyos_from_file = !cyos_from_file_str.empty(); bool cyos_is_enabled = cyos_from_file || cyos_str == "1"; - - std::unique_ptr target_loop_nest; if (cyos_from_file) { target_loop_nest = LoopNestParser::from_file(cyos_from_file_str); } +#endif // This loop is beam search over the sequence of decisions to make. for (int i = 0;; i++) { @@ -291,7 +379,7 @@ IntrusivePtr AutoSchedule::optimal_schedule_pass(int beam_size, } // Random dropout - if (pending.size() > 1 && random_dropout(rng, dag.nodes.size() * 2)) { + if (pending.size() > 1 && random_dropout(params, rng, dag.nodes.size() * 2)) { continue; } @@ -380,6 +468,7 @@ IntrusivePtr AutoSchedule::optimal_schedule_pass(int beam_size, q.resort(); } +#ifdef HALIDE_AUTOSCHEDULER_ALLOW_CYOS if (cyos_is_enabled) { int selection = -1; bool found = false; @@ -434,6 +523,7 @@ IntrusivePtr AutoSchedule::optimal_schedule_pass(int beam_size, q.clear(); q.emplace(std::move(selected)); } +#endif // HALIDE_AUTOSCHEDULER_ALLOW_CYOS } } @@ -446,6 +536,7 @@ IntrusivePtr AutoSchedule::optimal_schedule(int beam_size) { // If the beam size is one, it's pointless doing multiple passes. int num_passes = (beam_size == 1) ? 1 : 5; +#ifdef HALIDE_AUTOSCHEDULER_ALLOW_CYOS string cyos_str = get_env_variable("HL_CYOS"); string cyos_from_file_str = get_env_variable("HL_CYOS_FROM_FILE"); if (!cyos_from_file_str.empty()) { @@ -456,14 +547,14 @@ IntrusivePtr AutoSchedule::optimal_schedule(int beam_size) { // ask them to do more than one pass. 
num_passes = 1; } +#endif // HALIDE_AUTOSCHEDULER_ALLOW_CYOS - string num_passes_str = get_env_variable("HL_NUM_PASSES"); - if (!num_passes_str.empty()) { + if (params.num_passes != 0) { // The user has requested a non-standard number of passes. - num_passes = std::atoi(num_passes_str.c_str()); + num_passes = params.num_passes; } - bool use_pre_pass = get_env_variable("HL_FREEZE_INLINE_COMPUTE_ROOT") == "1"; + bool use_pre_pass = params.freeze_inline_compute_root != 0; int pass_idx = 0; if (use_pre_pass && num_passes > 1) { @@ -504,41 +595,38 @@ IntrusivePtr AutoSchedule::optimal_schedule(int beam_size) { // The main entrypoint to generate a schedule for a pipeline. void generate_schedule(const std::vector &outputs, const Target &target, - int hardware_parallelism, + const Anderson2021Params ¶ms, AutoSchedulerResults *auto_scheduler_results) { internal_assert(target.has_gpu_feature()) << "Specified target (" << target.to_string() << ") does not support GPU"; Timer timer; - aslog(0) << "generate_schedule for target=" << target.to_string() << "\n"; - aslog(0) << "hardware_parallelism = " << hardware_parallelism << "\n"; + aslog(1) << "generate_schedule for target=" << target.to_string() << "\n"; + aslog(1) << "Anderson2021Params.parallelism:" << params.parallelism << "\n"; + aslog(1) << "Anderson2021Params.beam_size:" << params.beam_size << "\n"; + aslog(1) << "Anderson2021Params.random_dropout:" << params.random_dropout << "\n"; + aslog(1) << "Anderson2021Params.random_dropout_seed:" << params.random_dropout_seed << "\n"; + aslog(1) << "Anderson2021Params.weights_path:" << params.weights_path << "\n"; + aslog(1) << "Anderson2021Params.disable_subtiling:" << params.disable_subtiling << "\n"; + aslog(1) << "Anderson2021Params.randomize_tilings:" << params.randomize_tilings << "\n"; + aslog(1) << "Anderson2021Params.search_space_options:" << params.search_space_options << "\n"; + aslog(1) << "Anderson2021Params.freeze_inline_compute_root:" << 
params.freeze_inline_compute_root << "\n"; + aslog(1) << "Anderson2021Params.partial_schedule_path:" << params.partial_schedule_path << "\n"; + aslog(1) << "Anderson2021Params.num_passes:" << params.num_passes << "\n"; + aslog(1) << "Anderson2021Params.stack_factor:" << params.stack_factor << "\n"; + aslog(1) << "Anderson2021Params.shared_memory_limit_kb:" << params.shared_memory_limit_kb << "\n"; + aslog(1) << "Anderson2021Params.shared_memory_sm_limit_kb:" << params.shared_memory_sm_limit_kb << "\n"; + aslog(1) << "Anderson2021Params.active_block_limit:" << params.active_block_limit << "\n"; + aslog(1) << "Anderson2021Params.active_warp_limit:" << params.active_warp_limit << "\n"; // Start a timer HALIDE_TIC; - // Get the seed for random dropout - string seed_str = get_env_variable("HL_SEED"); - // Or use the time, if not set. - int seed = (int)time(nullptr); - if (!seed_str.empty()) { - seed = atoi(seed_str.c_str()); - } - - aslog(1) << "Dropout seed = " << seed << "\n"; - - // Get the beam size - string beam_size_str = get_env_variable("HL_BEAM_SIZE"); - // Defaults to 32 - size_t beam_size = 32; - if (!beam_size_str.empty()) { - beam_size = atoi(beam_size_str.c_str()); - } - - string weights_in_path = get_env_variable("HL_WEIGHTS_DIR"); - string weights_out_path; // deliberately empty - string randomize_weights_str = get_env_variable("HL_RANDOMIZE_WEIGHTS"); bool randomize_weights = randomize_weights_str == "1"; + string weights_in_path = params.weights_path; + string weights_out_path; // deliberately empty + // Analyse the Halide algorithm and construct our abstract representation of it FunctionDAG dag(outputs, target); if (aslog::aslog_level() > 0) { @@ -555,23 +643,22 @@ void generate_schedule(const std::vector &outputs, IntrusivePtr optimal; - string partial_schedule_filename = get_env_variable("PARTIAL_SCHEDULE"); std::unique_ptr partial_schedule; - if (!partial_schedule_filename.empty()) { - aslog(0) << "Loading partial schedule from " << 
partial_schedule_filename << "\n"; - partial_schedule = LoopNestParser::from_file(partial_schedule_filename); + if (!params.partial_schedule_path.empty()) { + aslog(0) << "Loading partial schedule from " << params.partial_schedule_path << "\n"; + partial_schedule = LoopNestParser::from_file(params.partial_schedule_path); aslog(0) << "Partial schedule:\n"; partial_schedule->dump(); aslog(0) << "\n"; } - std::mt19937 rng{(uint32_t)seed}; - SearchSpace search_space{dag, hardware_parallelism, target, get_search_space_options(), rng, cost_model.get(), stats, partial_schedule.get()}; + std::mt19937 rng{(uint32_t)params.random_dropout_seed}; + SearchSpace search_space{dag, params, target, rng, cost_model.get(), stats, partial_schedule.get()}; - AutoSchedule autoschedule{dag, hardware_parallelism, target, outputs, rng, cost_model.get(), stats, search_space, partial_schedule.get()}; + AutoSchedule autoschedule{dag, params, target, outputs, rng, cost_model.get(), stats, search_space, partial_schedule.get()}; // Run beam search - optimal = autoschedule.optimal_schedule(beam_size); + optimal = autoschedule.optimal_schedule(params.beam_size); HALIDE_TOC; @@ -579,10 +666,10 @@ void generate_schedule(const std::vector &outputs, aslog(1) << "** Optimal schedule:\n"; // Just to get the debugging prints to fire - optimal->calculate_cost(dag, hardware_parallelism, target, cost_model.get(), stats, aslog::aslog_level() > 0); + optimal->calculate_cost(dag, params, target, cost_model.get(), stats, aslog::aslog_level() > 0); // Apply the schedules to the pipeline - optimal->apply_schedule(dag, hardware_parallelism, target); + optimal->apply_schedule(dag, params, target); // Print out the schedule if (aslog::aslog_level() > 0) { @@ -592,29 +679,6 @@ void generate_schedule(const std::vector &outputs, optimal->print_compute_locations(); } - string schedule_file = get_env_variable("HL_SCHEDULE_FILE"); - if (!schedule_file.empty()) { - user_warning << "HL_SCHEDULE_FILE is deprecated; use the 
schedule output from Generator instead\n"; - aslog(1) << "Writing schedule to " << schedule_file << "...\n"; - std::ofstream f(schedule_file); - f << "// --- BEGIN machine-generated schedule\n" - << optimal->schedule_source - << "// --- END machine-generated schedule\n"; - f.close(); - internal_assert(!f.fail()) << "Failed to write " << schedule_file; - } - - // Save the featurization, so that we can use this schedule as - // training data (once we've benchmarked it). - string feature_file = get_env_variable("HL_FEATURE_FILE"); - if (!feature_file.empty()) { - user_warning << "HL_FEATURE_FILE is deprecated; use the featurization output from Generator instead\n"; - std::ofstream binfile(feature_file, std::ios::binary | std::ios_base::trunc); - optimal->save_featurization(dag, hardware_parallelism, target, binfile); - binfile.close(); - internal_assert(!binfile.fail()) << "Failed to write " << feature_file; - } - if (auto_scheduler_results) { #ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto_scheduler_results->scheduler_name = "Anderson2021"; @@ -622,7 +686,7 @@ void generate_schedule(const std::vector &outputs, auto_scheduler_results->schedule_source = optimal->schedule_source; { std::ostringstream out; - optimal->save_featurization(dag, hardware_parallelism, target, out); + optimal->save_featurization(dag, params, target, out); auto_scheduler_results->featurization.resize(out.str().size()); memcpy(auto_scheduler_results->featurization.data(), out.str().data(), out.str().size()); } @@ -662,7 +726,22 @@ struct Anderson2021 { } Anderson2021Params params; params.parallelism = params_in.parallelism; - Autoscheduler::generate_schedule(outputs, target, params.parallelism, results); + params.beam_size = get_scalar_env_var("HL_BEAM_SIZE", 32); + params.random_dropout = get_scalar_env_var("HL_RANDOM_DROPOUT", 100); + params.random_dropout_seed = get_scalar_env_var("HL_SEED", (int)time(nullptr)); + params.weights_path = get_scalar_env_var("HL_WEIGHTS_DIR"); + 
params.disable_subtiling = get_scalar_env_var("HL_NO_SUBTILING", 0); + params.randomize_tilings = get_scalar_env_var("HL_RANDOMIZE_TILINGS", 0); + params.search_space_options = get_scalar_env_var("HL_DISABLE_MEMOIZED_FEATURES", "1111"); + params.freeze_inline_compute_root = get_scalar_env_var("HL_AUTOSCHEDULE_MEMORY_LIMIT", 0); + params.partial_schedule_path = get_scalar_env_var("PARTIAL_SCHEDULE", ""); + params.num_passes = get_scalar_env_var("HL_NUM_PASSES", 0); + params.stack_factor = get_scalar_env_var("HL_STACK_FACTOR", 0.95f); + params.shared_memory_limit_kb = get_scalar_env_var("HL_SHARED_MEMORY_LIMIT", 48); + params.shared_memory_sm_limit_kb = get_scalar_env_var("HL_SHARED_MEMORY_SM_LIMIT", 96); + params.active_block_limit = get_scalar_env_var("HL_ACTIVE_BLOCK_LIMIT", 32); + params.active_warp_limit = get_scalar_env_var("HL_ACTIVE_WARP_LIMIT", 64); + Autoscheduler::generate_schedule(outputs, target, params, results); } #else void operator()(const Pipeline &p, const Target &target, const AutoschedulerParams ¶ms_in, AutoSchedulerResults *results) { @@ -676,9 +755,24 @@ struct Anderson2021 { { ParamParser parser(params_in.extra); parser.parse("parallelism", ¶ms.parallelism); + parser.parse("beam_size", ¶ms.beam_size); + parser.parse("random_dropout", ¶ms.random_dropout); + parser.parse("random_dropout_seed", ¶ms.random_dropout_seed); + parser.parse("weights_path", ¶ms.weights_path); + parser.parse("disable_subtiling", ¶ms.disable_subtiling); + parser.parse("randomize_tilings", ¶ms.randomize_tilings); + parser.parse("search_space_options", ¶ms.search_space_options); + parser.parse("freeze_inline_compute_root", ¶ms.freeze_inline_compute_root); + parser.parse("partial_schedule_path", ¶ms.partial_schedule_path); + parser.parse("num_passes", ¶ms.num_passes); + parser.parse("stack_factor", ¶ms.stack_factor); + parser.parse("shared_memory_limit_kb", ¶ms.shared_memory_limit_kb); + parser.parse("shared_memory_sm_limit_kb", ¶ms.shared_memory_sm_limit_kb); + 
parser.parse("active_block_limit", ¶ms.active_block_limit); + parser.parse("active_warp_limit", ¶ms.active_warp_limit); parser.finish(); } - Autoscheduler::generate_schedule(outputs, target, params.parallelism, results); + Autoscheduler::generate_schedule(outputs, target, params, results); results->autoscheduler_params = params_in; } #endif @@ -689,7 +783,7 @@ REGISTER_AUTOSCHEDULER(Anderson2021) // An alternative entrypoint for other uses void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, - int hardware_parallelism, + const Anderson2021Params ¶ms, const Target &target, CostModel *cost_model, int beam_size, @@ -698,26 +792,25 @@ void find_and_apply_schedule(FunctionDAG &dag, Statistics stats; std::mt19937 rng{(uint32_t)12345}; - string partial_schedule_filename = get_env_variable("PARTIAL_SCHEDULE"); std::unique_ptr partial_schedule; - if (!partial_schedule_filename.empty()) { - aslog(0) << "Loading partial schedule from " << partial_schedule_filename << "\n"; - partial_schedule = LoopNestParser::from_file(partial_schedule_filename); + if (!params.partial_schedule_path.empty()) { + aslog(0) << "Loading partial schedule from " << params.partial_schedule_path << "\n"; + partial_schedule = LoopNestParser::from_file(params.partial_schedule_path); aslog(0) << "Partial schedule:\n"; partial_schedule->dump(); aslog(0) << "\n"; } - SearchSpace search_space{dag, hardware_parallelism, target, get_env_variable("HL_SEARCH_SPACE_OPTIONS"), rng, cost_model, stats, partial_schedule.get()}; - AutoSchedule autoschedule{dag, hardware_parallelism, target, outputs, rng, cost_model, stats, search_space, partial_schedule.get()}; + SearchSpace search_space{dag, params, target, rng, cost_model, stats, partial_schedule.get()}; + AutoSchedule autoschedule{dag, params, target, outputs, rng, cost_model, stats, search_space, partial_schedule.get()}; IntrusivePtr optimal = autoschedule.optimal_schedule(beam_size); // Apply the schedules - optimal->apply_schedule(dag, 
hardware_parallelism, target); + optimal->apply_schedule(dag, params, target); if (schedule_features) { - optimal->compute_featurization(dag, hardware_parallelism, target, schedule_features, stats); + optimal->compute_featurization(dag, params, target, schedule_features, stats); } } diff --git a/src/autoschedulers/anderson2021/AutoSchedule.h b/src/autoschedulers/anderson2021/AutoSchedule.h index be0fd865ccb9..29819de7534a 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.h +++ b/src/autoschedulers/anderson2021/AutoSchedule.h @@ -16,87 +16,9 @@ namespace Halide { namespace Internal { namespace Autoscheduler { -struct ProgressBar { - void set(double progress) { - if (!draw_progress_bar) { - return; - } - counter++; - const int bits = 11; - if (counter & ((1 << bits) - 1)) { - return; - } - const int pos = (int)(progress * 78); - aslog(0) << '['; - for (int j = 0; j < 78; j++) { - if (j < pos) { - aslog(0) << '.'; - } else if (j - 1 < pos) { - aslog(0) << "/-\\|"[(counter >> bits) % 4]; - } else { - aslog(0) << ' '; - } - } - aslog(0) << ']'; - for (int j = 0; j < 80; j++) { - aslog(0) << '\b'; - } - } - - void clear() { - if (counter) { - for (int j = 0; j < 80; j++) { - aslog(0) << ' '; - } - for (int j = 0; j < 80; j++) { - aslog(0) << '\b'; - } - } - } - -private: - uint32_t counter = 0; - const bool draw_progress_bar = isatty(2); -}; - typedef PerfectHashMap StageMapOfScheduleFeatures; -struct AutoSchedule { - const FunctionDAG &dag; - int hardware_parallelism; - const Target ⌖ - const std::vector &outputs; - std::mt19937 &rng; - CostModel *cost_model; - Statistics &stats; - SearchSpace &search_space; - const LoopNestParser *partial_schedule; - - AutoSchedule(const FunctionDAG &dag, - int hardware_parallelism, - const Target &target, - const std::vector &outputs, - std::mt19937 &rng, - CostModel *cost_model, - Statistics &stats, - SearchSpace &search_space, - const LoopNestParser *partial_schedule); - - bool use_partial_schedule() const { - return 
partial_schedule; - } - - IntrusivePtr optimal_schedule_pass(int beam_size, - int pass_idx, - int num_passes, - ProgressBar &tick, - std::unordered_set &permitted_hashes); - - // Performance coarse-to-fine beam search and return the best state found. - IntrusivePtr optimal_schedule(int beam_size); -}; - -void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, int hardware_parallelism, const Target &target, CostModel *cost_model, int beam_size, StageMapOfScheduleFeatures *schedule_features); +void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, const Anderson2021Params ¶ms, const Target &target, CostModel *cost_model, int beam_size, StageMapOfScheduleFeatures *schedule_features); } // namespace Autoscheduler } // namespace Internal diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt index 7a82d3688431..53b80e4c43fa 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -2,7 +2,10 @@ # Resources for the autoscheduler library ## -add_compile_definitions(HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API) +# TODO(someone): `TARGETS cmake` cannot possibly be right for most of +# the cases here, as it is just the arch-bits-os of the current `Halide_TARGET`, +# so no GPU features will ever be present. I've commented out usages +# that seem obviously wrong to me. 
set(COMMON_DIR "${Halide_SOURCE_DIR}/src/autoschedulers/common") include_directories("${Halide_BINARY_DIR}/include") @@ -18,24 +21,26 @@ add_custom_command(OUTPUT ${WF_CPP} VERBATIM) # cost_model, train_cost_model -add_executable(anderson2021-cost_model.generator cost_model_generator.cpp) -target_link_libraries(anderson2021-cost_model.generator PRIVATE Halide::Halide Halide::Generator) +add_executable(anderson2021_cost_model.generator cost_model_generator.cpp) +target_link_libraries(anderson2021_cost_model.generator PRIVATE Halide::Halide Halide::Generator) -add_halide_library(anderson2021_cost_model FROM anderson2021-cost_model.generator - GENERATOR cost_model +add_halide_library(anderson2021_cost_model FROM anderson2021_cost_model.generator + GENERATOR cost_model + FUNCTION_NAME cost_model TARGETS cmake) -add_halide_library(anderson2021_train_cost_model FROM anderson2021-cost_model.generator - GENERATOR train_cost_model +add_halide_library(anderson2021_train_cost_model FROM anderson2021_cost_model.generator + GENERATOR train_cost_model + FUNCTION_NAME train_cost_model TARGETS cmake USE_RUNTIME anderson2021_cost_model.runtime) ## retrain_cost_model -add_executable(anderson2021-retrain_cost_model +add_executable(anderson2021_retrain_cost_model DefaultCostModel.cpp ${COMMON_DIR}/Weights.cpp retrain_cost_model.cpp ${WF_CPP}) -target_link_libraries(anderson2021-retrain_cost_model PRIVATE ASLog anderson2021_cost_model +target_link_libraries(anderson2021_retrain_cost_model PRIVATE ASLog anderson2021_cost_model anderson2021_train_cost_model Halide::Halide Halide::Plugin) ### @@ -65,150 +70,142 @@ target_link_libraries(Halide_Anderson2021 PRIVATE ASLog ParamParser # ================================================================= -add_executable(anderson2021-demo.generator ${COMMON_DIR}/demo_generator.cpp) -target_link_libraries(anderson2021-demo.generator PRIVATE Halide::Halide Halide::Generator) +add_executable(anderson2021_demo.generator 
${COMMON_DIR}/demo_generator.cpp) +target_link_libraries(anderson2021_demo.generator PRIVATE Halide::Halide Halide::Generator) -add_halide_library(anderson2021-demo FROM demo.generator +add_halide_library(anderson2021_demo FROM demo.generator GENERATOR demo - TARGETS cmake + FUNCTION_NAME demo + # TODO(someone) + # TARGETS cmake AUTOSCHEDULER Halide::Anderson2021 REGISTRATION DEMO_REGISTRATION_FILE) -add_executable(anderson2021-demo_apps_autoscheduler ${DEMO_REGISTRATION_FILE}) -target_link_libraries(anderson2021-demo_apps_autoscheduler PRIVATE demo Halide::RunGenMain) - -add_test(NAME demo_apps_autoscheduler - COMMAND demo_apps_autoscheduler --benchmarks=all --benchmark_min_time=1 --estimate_all) +add_executable(anderson2021_demo_apps_autoscheduler ${DEMO_REGISTRATION_FILE}) +target_link_libraries(anderson2021_demo_apps_autoscheduler PRIVATE demo Halide::RunGenMain) -set_tests_properties(demo_apps_autoscheduler +add_test(NAME anderson2021_demo_apps_autoscheduler + COMMAND anderson2021_demo_apps_autoscheduler --benchmarks=all --benchmark_min_time=1 --estimate_all) +set_tests_properties(anderson2021_demo_apps_autoscheduler PROPERTIES - LABELS Anderson2021 - ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + LABELS Anderson2021) ## ================================================================= -add_executable(anderson2021-included_schedule_file.generator ${COMMON_DIR}/included_schedule_file_generator.cpp) -target_link_libraries(anderson2021-included_schedule_file.generator PRIVATE +add_executable(anderson2021_included_schedule_file.generator ${COMMON_DIR}/included_schedule_file_generator.cpp) +target_link_libraries(anderson2021_included_schedule_file.generator PRIVATE Halide::Halide Halide::Generator) -add_halide_library(anderson2021-included_schedule_file FROM - anderson2021-included_schedule_file.generator +add_halide_library(anderson2021_included_schedule_file FROM + anderson2021_included_schedule_file.generator GENERATOR included_schedule_file - TARGETS cmake + 
FUNCTION_NAME included_schedule_file + # TODO(someone) + # TARGETS cmake AUTOSCHEDULER Halide::Anderson2021 REGISTRATION included_schedule_reg) -add_executable(anderson2021-demo_included_schedule_file ${included_schedule_reg}) -target_link_libraries(anderson2021-demo_included_schedule_file PRIVATE included_schedule_file Halide::RunGenMain) - -add_test(NAME demo_included_schedule_file - COMMAND demo_included_schedule_file --benchmarks=all --benchmark_min_time=1 --estimate_all) +add_executable(anderson2021_demo_included_schedule_file ${included_schedule_reg}) +target_link_libraries(anderson2021_demo_included_schedule_file PRIVATE included_schedule_file Halide::RunGenMain) -set_tests_properties(demo_included_schedule_file +add_test(NAME anderson2021_demo_included_schedule_file + COMMAND anderson2021_demo_included_schedule_file --benchmarks=all --benchmark_min_time=1 --estimate_all) +set_tests_properties(anderson2021_demo_included_schedule_file PROPERTIES - LABELS Anderson2021 - ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + LABELS Anderson2021) ## ==================================================== ## Auto-tuning support utilities. ## TODO(#4053): implement auto-tuning support in CMake? 
-add_executable(anderson2021-featurization_to_sample ${COMMON_DIR}/featurization_to_sample.cpp) +add_executable(anderson2021_featurization_to_sample ${COMMON_DIR}/featurization_to_sample.cpp) -add_executable(anderson2021-get_host_target ${COMMON_DIR}/get_host_target.cpp) -target_link_libraries(anderson2021-get_host_target PRIVATE Halide::Halide) +add_executable(anderson2021_get_host_target ${COMMON_DIR}/get_host_target.cpp) +target_link_libraries(anderson2021_get_host_target PRIVATE Halide::Halide) -add_executable(anderson2021-weightsdir_to_weightsfile ${COMMON_DIR}/weightsdir_to_weightsfile.cpp ${COMMON_DIR}/Weights.cpp) -target_link_libraries(anderson2021-weightsdir_to_weightsfile PRIVATE Halide::Runtime) +add_executable(anderson2021_weightsdir_to_weightsfile ${COMMON_DIR}/weightsdir_to_weightsfile.cpp ${COMMON_DIR}/Weights.cpp) +target_link_libraries(anderson2021_weightsdir_to_weightsfile PRIVATE Halide::Runtime) # ================================================================= # Smaller tests if (BUILD_SHARED_LIBS) - add_executable(anderson2021-test_apps_autoscheduler test.cpp) - target_link_libraries(anderson2021-test_apps_autoscheduler PRIVATE + add_executable(anderson2021_test_apps_autoscheduler test.cpp) + target_link_libraries(anderson2021_test_apps_autoscheduler PRIVATE Halide::Halide Halide::Tools ${CMAKE_DL_LIBS}) - add_test(NAME test_apps_autoscheduler - COMMAND test_apps_autoscheduler $) + add_test(NAME anderson2021_test_apps_autoscheduler + COMMAND anderson2021_test_apps_autoscheduler $) - set_tests_properties(test_apps_autoscheduler PROPERTIES + set_tests_properties(anderson2021_test_apps_autoscheduler PROPERTIES LABELS "Anderson2021;multithreaded" ENVIRONMENT "LD_LIBRARY_PATH=$:$ENV{LD_LIBRARY_PATH};HL_TARGET=${Halide_TARGET}") endif () ## -add_executable(anderson2021-test_perfect_hash_map ${COMMON_DIR}/test_perfect_hash_map.cpp) +add_executable(anderson2021_test_perfect_hash_map ${COMMON_DIR}/test_perfect_hash_map.cpp) -add_test(NAME 
test_perfect_hash_map COMMAND test_perfect_hash_map) -set_tests_properties(test_perfect_hash_map +add_test(NAME anderson2021_test_perfect_hash_map COMMAND test_perfect_hash_map) +set_tests_properties(anderson2021_test_perfect_hash_map PROPERTIES - LABELS Anderson2021 - ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + LABELS Anderson2021) ## -add_executable(anderson2021-test_function_dag ${COMMON_DIR}/test_function_dag.cpp FunctionDAG.cpp) -target_link_libraries(anderson2021-test_function_dag PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) +add_executable(anderson2021_test_function_dag ${COMMON_DIR}/test_function_dag.cpp FunctionDAG.cpp) +target_link_libraries(anderson2021_test_function_dag PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) -add_test(NAME test_function_dag COMMAND test_function_dag) -set_tests_properties(test_function_dag +add_test(NAME anderson2021_test_function_dag COMMAND anderson2021_test_function_dag) +set_tests_properties(anderson2021_test_function_dag PROPERTIES - LABELS Anderson2021 - ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + LABELS Anderson2021) -add_executable(anderson2021-test_bounds test/bounds.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp Tiling.cpp) -target_link_libraries(anderson2021-test_bounds PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) +add_executable(anderson2021_test_bounds test/bounds.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp Tiling.cpp) +target_link_libraries(anderson2021_test_bounds PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) -add_test(NAME anderson2021-test_bounds COMMAND test_bounds) -set_tests_properties(anderson2021-test_bounds +add_test(NAME anderson2021_test_bounds COMMAND anderson2021_test_bounds) +set_tests_properties(anderson2021_test_bounds PROPERTIES - LABELS Anderson2021 - ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + LABELS Anderson2021) -add_executable(anderson2021-test_parser test/parser.cpp) -target_link_libraries(anderson2021-test_parser PRIVATE ASLog 
Halide::Halide Halide::Tools Halide::Plugin) +add_executable(anderson2021_test_parser test/parser.cpp) +target_link_libraries(anderson2021_test_parser PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) -add_test(NAME test_parser COMMAND anderson2021-test_parser) -set_tests_properties(test_parser +add_test(NAME anderson2021_test_parser COMMAND anderson2021_test_parser) +set_tests_properties(anderson2021_test_parser PROPERTIES - LABELS Anderson2021 - ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + LABELS Anderson2021) -add_executable(anderson2021-test_state test/state.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp State.cpp Tiling.cpp) -target_link_libraries(anderson2021-test_state PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) +add_executable(anderson2021_test_state test/state.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp State.cpp Tiling.cpp) +target_link_libraries(anderson2021_test_state PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) -add_test(NAME test_state COMMAND anderson2021-test_state) -set_tests_properties(test_state +add_test(NAME anderson2021_test_state COMMAND anderson2021_test_state) +set_tests_properties(anderson2021_test_state PROPERTIES - LABELS Anderson2021 - ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + LABELS Anderson2021) -add_executable(anderson2021-test_storage_strides test/storage_strides.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp State.cpp Tiling.cpp) -target_link_libraries(anderson2021-test_storage_strides PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) +add_executable(anderson2021_test_storage_strides test/storage_strides.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp State.cpp Tiling.cpp) +target_link_libraries(anderson2021_test_storage_strides PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) -add_test(NAME test_storage_strides COMMAND anderson2021-test_storage_strides) -set_tests_properties(test_storage_strides +add_test(NAME anderson2021_test_storage_strides COMMAND 
anderson2021_test_storage_strides) +set_tests_properties(anderson2021_test_storage_strides PROPERTIES - LABELS Anderson2021 - ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + LABELS Anderson2021) -add_executable(anderson2021-test_thread_info test/thread_info.cpp LoopNest.cpp +add_executable(anderson2021_test_thread_info test/thread_info.cpp LoopNest.cpp FunctionDAG.cpp GPULoopInfo.cpp Tiling.cpp) -target_link_libraries(anderson2021-test_thread_info PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) +target_link_libraries(anderson2021_test_thread_info PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) -add_test(NAME test_thread_info COMMAND anderson2021-test_thread_info) -set_tests_properties(test_thread_info +add_test(NAME anderson2021_test_thread_info COMMAND anderson2021_test_thread_info) +set_tests_properties(anderson2021_test_thread_info PROPERTIES - LABELS Anderson2021 - ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + LABELS Anderson2021) -add_executable(anderson2021-test_tiling test/tiling.cpp Tiling.cpp) -target_link_libraries(anderson2021-test_tiling PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) +add_executable(anderson2021_test_tiling test/tiling.cpp Tiling.cpp) +target_link_libraries(anderson2021_test_tiling PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) -add_test(NAME test_tiling COMMAND anderson2021-test_tiling) -set_tests_properties(test_tiling +add_test(NAME anderson2021_test_tiling COMMAND anderson2021_test_tiling) +set_tests_properties(anderson2021_test_tiling PROPERTIES - LABELS Anderson2021 - ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + LABELS Anderson2021) diff --git a/src/autoschedulers/anderson2021/CostModel.h b/src/autoschedulers/anderson2021/CostModel.h index db85072d9171..486fd45d31ec 100644 --- a/src/autoschedulers/anderson2021/CostModel.h +++ b/src/autoschedulers/anderson2021/CostModel.h @@ -12,7 +12,78 @@ namespace Halide { namespace Internal { namespace Autoscheduler { + typedef PerfectHashMap 
StageMapOfScheduleFeatures; + +struct Anderson2021Params { + /** Maximum level of parallelism available. Must be > 0 (asserted by DefaultCostModel::set_pipeline_features). */ + int parallelism = 16; + + /** Beam size to use in the beam search. Defaults to 32. Use 1 to get a greedy search instead. + * Formerly HL_BEAM_SIZE */ + int beam_size = 32; + + /** Percent chance of accepting each state in the beam. + * Normalized by the number of decisions made, so 5 would mean there's a 5 percent chance of never rejecting any states. + * Formerly HL_RANDOM_DROPOUT */ + int random_dropout = 100; + + /** Random seed used by the random dropout. If 0, use time(). + * Formerly HL_SEED */ + int random_dropout_seed = 0; + + /** When training or scheduling, read weights from this directory or file. + * (If path ends in `.weights` it is written as a single file, otherwise a directory of files.) + * Formerly HL_WEIGHTS_DIR */ + std::string weights_path; + + /** If set to nonzero value: limits the search space to that of Mullapudi et al. + * Formerly HL_NO_SUBTILING */ + int disable_subtiling = 0; + + /** If set to nonzero value, only a random subset of the generated tilings for each stage will be accepted into the beam. + * Formerly HL_RANDOMIZE_TILINGS */ + int randomize_tilings = 0; + + /** Expects a string of four 0/1 values that allow/disallow the following options: + * compute root, inline, compute at the block level, compute at the thread level + * e.g. 1000 would allow compute root only + * Formerly HL_SEARCH_SPACE_OPTIONS */ + std::string search_space_options = "1111"; + + /** If set to nonzero value, run a pre-pass where only compute_root and inline scheduling options are considered. + * Formerly HL_FREEZE_INLINE_COMPUTE_ROOT */ + int freeze_inline_compute_root = 0; + + /** If nonempty, load the initial (partial) schedule from the given file. + * Formerly PARTIAL_SCHEDULE */ + std::string partial_schedule_path; + + /** User-requested specific number of passes. Ignored if 0.
+ * Formerly HL_NUM_PASSES */ + int num_passes = 0; + + /** TODO: document me + * Formerly HL_STACK_FACTOR */ + double stack_factor = 0.95f; + + /** Shared memory limit per block for the target GPU, in KB (get_shared_memory_limit() converts it to bytes). + * Formerly HL_SHARED_MEMORY_LIMIT */ + int shared_memory_limit_kb = 48; + + /** Shared memory limit per SM for the target GPU, in KB (get_shared_memory_sm_limit() converts it to bytes). + * Formerly HL_SHARED_MEMORY_SM_LIMIT */ + int shared_memory_sm_limit_kb = 96; + + /** Maximum number of active blocks for the target GPU. + * Formerly HL_ACTIVE_BLOCK_LIMIT */ + int active_block_limit = 32; + + /** Maximum number of active warps for the target GPU. + * Formerly HL_ACTIVE_WARP_LIMIT */ + int active_warp_limit = 64; +}; + } // namespace Autoscheduler } // namespace Internal @@ -22,7 +93,7 @@ class CostModel { // Configure the cost model for the algorithm to be scheduled. virtual void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, - int hardware_parallelism) = 0; + const Internal::Autoscheduler::Anderson2021Params ¶ms) = 0; // Enqueue a schedule to be evaluated. Will annotate the value located at cost_ptr when the evaluation takes place. // Note that the dag argument should correspond to the dag specified previously when calling set_pipeline_features.
diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.cpp b/src/autoschedulers/anderson2021/DefaultCostModel.cpp index 84462d4afc12..18c240418a4d 100644 --- a/src/autoschedulers/anderson2021/DefaultCostModel.cpp +++ b/src/autoschedulers/anderson2021/DefaultCostModel.cpp @@ -44,7 +44,7 @@ bool ends_with(const std::string &str, const std::string &suffix) { } // namespace void DefaultCostModel::set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, - int hardware_parallelism) { + const Internal::Autoscheduler::Anderson2021Params ¶ms) { const int pipeline_feat_size = head1_w * head1_h; // We ignore the first seven pipeline features in the cost @@ -74,8 +74,8 @@ void DefaultCostModel::set_pipeline_features(const Internal::Autoscheduler::Func } internal_assert(stage == num_stages); pipeline_feat_queue = pipeline_features; - internal_assert(hardware_parallelism > 0); - num_cores = hardware_parallelism; + internal_assert(params.parallelism > 0); + num_cores = params.parallelism; } void DefaultCostModel::set_pipeline_features(const Runtime::Buffer &pipeline_feats, int n) { @@ -215,7 +215,7 @@ float DefaultCostModel::backprop(const Runtime::Buffer &true_runtim } } - int result = anderson2021_train_cost_model(num_stages, + int result = train_cost_model(num_stages, cursor, num_cores, batch_id, @@ -286,7 +286,7 @@ void DefaultCostModel::evaluate_costs() { auto loss = Runtime::Buffer::make_scalar(); - int result = anderson2021_cost_model(num_stages, + int result = cost_model(num_stages, cursor, num_cores, batch_id++, diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.h b/src/autoschedulers/anderson2021/DefaultCostModel.h index d5ff40fc5ccf..a5a03423a3ca 100644 --- a/src/autoschedulers/anderson2021/DefaultCostModel.h +++ b/src/autoschedulers/anderson2021/DefaultCostModel.h @@ -43,7 +43,7 @@ class DefaultCostModel : public CostModel { // Configure the cost model for the algorithm to be scheduled. 
void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, - int hardware_parallelism) override; + const Internal::Autoscheduler::Anderson2021Params ¶ms) override; void set_pipeline_features(const Runtime::Buffer &, int n); // Enqueue a schedule to be evaluated. The second version of this method returns a buffer of diff --git a/src/autoschedulers/anderson2021/LoopNest.cpp b/src/autoschedulers/anderson2021/LoopNest.cpp index d486c34ba0d6..c2da65e37822 100644 --- a/src/autoschedulers/anderson2021/LoopNest.cpp +++ b/src/autoschedulers/anderson2021/LoopNest.cpp @@ -15,62 +15,27 @@ namespace Autoscheduler { // entirely unroll the thing const int kUnrollLimitGPU = 16; -// Get the HL_NO_SUBTILING environment variable. Purpose described above. -bool get_may_subtile() { - string no_subtiling_str = get_env_variable("HL_NO_SUBTILING"); - if (no_subtiling_str == "1") { - return false; - } else { - return true; - } -} - -bool may_subtile() { - static bool b = get_may_subtile(); - return b; +bool may_subtile(const Anderson2021Params ¶ms) { + return params.disable_subtiling == 0; } // Shared memory limit per block for the target GPU -int64_t get_shared_memory_limit() { - // HL_SHARED_MEMORY_LIMIT is in KB - std::string limit = get_env_variable("HL_SHARED_MEMORY_LIMIT"); - if (limit.empty()) { - return 48 * 1024; - } - return atoi(limit.c_str()) * 1024; // Convert to bytes -} - -// Shared memory limit per SM for the target GPU -int64_t get_shared_memory_sm_limit_helper() { - // HL_SHARED_MEMORY_SM_LIMIT is in KB - std::string limit = get_env_variable("HL_SHARED_MEMORY_SM_LIMIT"); - if (limit.empty()) { - return 96 * 1024; - } - return atoi(limit.c_str()) * 1024; // Convert to bytes +int64_t get_shared_memory_limit(const Anderson2021Params ¶ms) { + return (int64_t)params.shared_memory_limit_kb * 1024; // Convert to bytes } -int64_t get_shared_memory_sm_limit() { - static int64_t limit = get_shared_memory_sm_limit_helper(); - return limit; +int64_t 
get_shared_memory_sm_limit(const Anderson2021Params ¶ms) { + return (int64_t)params.shared_memory_sm_limit_kb * 1024; // Convert to bytes } // Maximum number of active blocks for the target GPU -int64_t get_active_block_hardware_limit() { - std::string limit = get_env_variable("HL_ACTIVE_BLOCK_LIMIT"); - if (limit.empty()) { - return 32; - } - return atoi(limit.c_str()); +int64_t get_active_block_hardware_limit(const Anderson2021Params ¶ms) { + return params.active_block_limit; } // Maximum number of active warps for the target GPU -int64_t get_active_warp_hardware_limit() { - std::string limit = get_env_variable("HL_ACTIVE_WARP_LIMIT"); - if (limit.empty()) { - return 64; - } - return atoi(limit.c_str()); +int64_t get_active_warp_hardware_limit(const Anderson2021Params ¶ms) { + return params.active_warp_limit; } int get_unroll_limit(const Target &target) { @@ -182,6 +147,7 @@ void LoopNest::generate_vec_dim_serial_tilings(vector &serial_sizes) const // the newly inserted loop nests of f into a threads loop outside a serial loop. // V is the vectorized dimension of f. Adds loopnests created from each tiling option in result. 
bool LoopNest::add_gpu_thread_tilings(const FunctionDAG::Node *f, + const Anderson2021Params ¶ms, const Target &target, int v, vector> &result, @@ -198,7 +164,7 @@ bool LoopNest::add_gpu_thread_tilings(const FunctionDAG::Node *f, new_parent->copy_from(*(this)); for (auto &c : new_parent->children) { if (c->node == f) { - c = c->parallelize_in_tiles(t, new_parent, target, false, false); + c = c->parallelize_in_tiles(t, new_parent, params, target, false, false); } } result.emplace_back(new_parent); @@ -1227,19 +1193,19 @@ void LoopNest::compute_warp_features(ScheduleFeatures &features, const GPULoopIn } // Assume that when a block is active, all its warps are active -void LoopNest::compute_warp_and_block_occupancy(int hardware_parallelism, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const { +void LoopNest::compute_warp_and_block_occupancy(const Anderson2021Params ¶ms, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const { // Only compute these features for stage's that actually have a block // loop if (node != gpu_loop_info.current_block_loop->node) { return; } - auto active_block_hardware_limit = get_active_block_hardware_limit(); - auto active_warp_hardware_limit = get_active_warp_hardware_limit(); + auto active_block_hardware_limit = get_active_block_hardware_limit(params); + auto active_warp_hardware_limit = get_active_warp_hardware_limit(params); int64_t num_warps_per_block = gpu_loop_info.thread_info->num_warps_per_block; - int64_t num_blocks = std::ceil(gpu_loop_info.num_blocks / (double)hardware_parallelism); + int64_t num_blocks = std::ceil(gpu_loop_info.num_blocks / (double)params.parallelism); auto max_theoretical_active_blocks = std::min(active_block_hardware_limit, num_blocks); auto max_active_warps = std::min(active_warp_hardware_limit, max_theoretical_active_blocks * num_warps_per_block); @@ -1250,14 +1216,14 @@ void LoopNest::compute_warp_and_block_occupancy(int hardware_parallelism, Schedu feat.max_block_occupancy = 
(double)max_active_blocks / (double)active_block_hardware_limit; } -void LoopNest::compute_shared_mem_occupancy(const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const { +void LoopNest::compute_shared_mem_occupancy(const Anderson2021Params ¶ms, const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const { if (!is_gpu_block(target)) { return; } - static auto shared_mem_limit = get_shared_memory_limit(); - static auto shared_mem_sm_limit = get_shared_memory_sm_limit(); - static auto active_block_hardware_limit = get_active_block_hardware_limit(); + auto shared_mem_limit = get_shared_memory_limit(params); + auto shared_mem_sm_limit = get_shared_memory_sm_limit(params); + auto active_block_hardware_limit = get_active_block_hardware_limit(params); feat.shared_mem_occupancy = (double)total_shared_mem_alloc_size / (double)shared_mem_limit; internal_assert(feat.shared_mem_occupancy <= 1) << "Invalid shared mem occupancy: " << feat.shared_mem_occupancy; @@ -1297,7 +1263,7 @@ std::pair LoopNest::find_innermost_and_paren return {child, parent}; } -int64_t LoopNest::points_accessed_per_thread(const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose) const { +int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params ¶ms, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose) const { std::unique_ptr innermost_parent_clone = 
std::make_unique(); innermost_parent_clone->copy_from(*parent); @@ -1361,7 +1327,7 @@ int64_t LoopNest::points_accessed_per_thread(const Target &target, const GPULoop } } - IntrusivePtr innermost_parent = innermost_parent_clone->parallelize_in_tiles(tiling, grandparent, target, true, false, false, rvars_to_move_inward); + IntrusivePtr innermost_parent = innermost_parent_clone->parallelize_in_tiles(tiling, grandparent, params, target, true, false, false, rvars_to_move_inward); const auto &bounds = innermost_parent->get_bounds_along_edge_chain(producer, edge_chain); int64_t num_points = 1; @@ -1651,7 +1617,7 @@ std::pair LoopNest::compute_alloc_size_of_node_here(const Functio // Do a recursive walk over the loop nest computing features to feed the cost model. void LoopNest::compute_features(const FunctionDAG &dag, - int hardware_parallelism, + const Anderson2021Params ¶ms, const Target &target, const StageMap &sites, int64_t instances, @@ -1686,8 +1652,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, size_t i = size[idx]; loop_instances *= i; if (stage->loop[idx].pure && !in_impure) { - if (hardware_parallelism > 1 && - (parallel || (parent->is_root() && parallel_tasks < hardware_parallelism))) { + if (params.parallelism > 1 && + (parallel || (parent->is_root() && parallel_tasks < params.parallelism))) { // Either we've picked our parallel tiling, or // it's not yet determined. Assume we'll not split // any loops and just stop after we hit the @@ -1696,9 +1662,9 @@ void LoopNest::compute_features(const FunctionDAG &dag, // If we haven't picked out parallel tiling yet, // assume that we'll target 8*cores when we do, // which is a common rule of thumb. 
- if (!parallel && parallel_tasks > hardware_parallelism * 8) { + if (!parallel && parallel_tasks > params.parallelism * 8) { // We would split this loop - parallel_tasks = hardware_parallelism * 8; + parallel_tasks = params.parallelism * 8; } } } else if (i != 1) { @@ -1799,7 +1765,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, ++stats.num_memoization_misses; } - c->compute_features(dag, hardware_parallelism, target, sites, subinstances, parallelism, this, parent, root, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, stats, verbose); + c->compute_features(dag, params, target, sites, subinstances, parallelism, this, parent, root, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, stats, verbose); if (use_memoized_features) { c->features[hash_of_producers].make_large(dag.nodes[0].stages[0].max_id); @@ -1955,7 +1921,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, } else { // How this loop will be parallelized is not yet // determined. Use optimistic values for the features. 
- bytes_at_task = (feat.bytes_at_realization + hardware_parallelism - 1) / hardware_parallelism; + bytes_at_task = (feat.bytes_at_realization + params.parallelism - 1) / params.parallelism; innermost_bytes_at_task = std::min(bytes_at_task, feat.innermost_bytes_at_realization); } @@ -2051,7 +2017,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, // Recurse inwards for (const auto &c : children) { - c->compute_features(dag, hardware_parallelism, target, sites, subinstances, subparallelism, this, parent, root, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, stats, verbose); + c->compute_features(dag, params, target, sites, subinstances, subparallelism, this, parent, root, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, stats, verbose); } for (const auto *node : store_at) { auto &feat = features->get(&(node->stages[0])); @@ -2288,7 +2254,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, aslog(2) << "BEGIN MEM ACCESS shared_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::shared, verbose); + int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::shared, verbose); compute_mem_load_features( jac.first, @@ -2319,7 +2285,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, aslog(2) << "BEGIN MEM ACCESS global_mem_load. 
consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::global, verbose); + int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::global, verbose); compute_mem_load_features( jac.first, @@ -2359,7 +2325,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, aslog(2) << "BEGIN MEM ACCESS local_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::local, verbose); + int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::local, verbose); compute_mem_load_features( jac.first, @@ -2664,12 +2630,12 @@ void LoopNest::compute_features(const FunctionDAG &dag, } } - compute_shared_mem_occupancy(target, total_shared_mem_alloc_sizes.get(stage), feat); + compute_shared_mem_occupancy(params, target, total_shared_mem_alloc_sizes.get(stage), feat); if (innermost && !is_scalar()) { compute_warp_features(feat, gpu_loop_info); - compute_warp_and_block_occupancy(hardware_parallelism, feat, gpu_loop_info); + compute_warp_and_block_occupancy(params, feat, gpu_loop_info); } } @@ -3033,10 +2999,11 @@ bool LoopNest::compute_here(const FunctionDAG::Node *f, bool tileable, int v, bool in_threads_loop, + const Anderson2021Params ¶ms, const Target 
&target) { const auto &bounds = get_bounds(f); - if (!may_subtile()) { + if (!may_subtile(params)) { // If we are restricting ourselves to the Mullapudi et al // scheduling space, then once something is computed here // we may not subtile this loop. @@ -3051,7 +3018,7 @@ bool LoopNest::compute_here(const FunctionDAG::Node *f, node->stage = &f->stages[s]; node->innermost = true; node->vectorized_loop_index = -1; - node->tileable = tileable && (is_root() || may_subtile()); + node->tileable = tileable && (is_root() || may_subtile(params)); // always set gpu_label as thread if legal. // if !in_threads_loop we are computing either at root level or inside a serial loop @@ -3156,6 +3123,7 @@ bool LoopNest::compute_here(const FunctionDAG::Node *f, // Parallelize this loop according to the given tiling. IntrusivePtr LoopNest::parallelize_in_tiles(const vector &tiling, const LoopNest *parent, + const Anderson2021Params ¶ms, const Target &target, bool inner_tiling, bool adjust_tiling, @@ -3166,7 +3134,7 @@ IntrusivePtr LoopNest::parallelize_in_tiles(const vectornode = outer->node = node; inner->stage = outer->stage = stage; - inner->tileable = outer->tileable = tileable && may_subtile(); + inner->tileable = outer->tileable = tileable && may_subtile(params); inner->vector_dim = outer->vector_dim = vector_dim; inner->vectorized_loop_index = outer->vectorized_loop_index = vectorized_loop_index; @@ -3199,7 +3167,7 @@ IntrusivePtr LoopNest::parallelize_in_tiles(const vectorparallel = true; } - outer->tileable = may_subtile(); + outer->tileable = may_subtile(params); // First make an inner loop representing a 1x1x1... 
tile inner->size.resize(size.size(), 1); @@ -3341,7 +3309,7 @@ bool LoopNest::region_computed_shrinks(const FunctionDAG::Node *f, const LoopNes // loop marked gpu_threads, in which case f's loops cannot be gpu_threads vector> LoopNest::compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, - int hardware_parallelism, + const Anderson2021Params ¶ms, const Target &target, const SearchSpaceOptions &search_space_options, int v, @@ -3407,7 +3375,7 @@ vector> LoopNest::compute_in_tiles(const FunctionDA std::unique_ptr r{new LoopNest}; r->copy_from(*this); - r->compute_here(f, true, v, in_threads_loop, target); + r->compute_here(f, true, v, in_threads_loop, params, target); if (!in_realization) { r->store_at.insert(f); } else { @@ -3416,7 +3384,7 @@ vector> LoopNest::compute_in_tiles(const FunctionDA // if GPU and creating a threads loop INSIDE a block loop, create child for each thread tiling if (!is_root() && !in_threads_loop && target.has_gpu_feature()) { - bool made_child = r->add_gpu_thread_tilings(f, target, v, result, union_counts); + bool made_child = r->add_gpu_thread_tilings(f, params, target, v, result, union_counts); if (!made_child) { // no good thread tilings, just keep r with the untiled loop inserted as serial result.emplace_back(r.release()); } @@ -3432,7 +3400,7 @@ vector> LoopNest::compute_in_tiles(const FunctionDA return result; } - if (child >= 0 && !called_by_multiple_children && !in_realization && (may_subtile() || is_root())) { + if (child >= 0 && !called_by_multiple_children && !in_realization && (may_subtile(params) || is_root())) { // Push the Func further inwards in the loop nest const auto &c = children[child]; @@ -3442,14 +3410,14 @@ vector> LoopNest::compute_in_tiles(const FunctionDA } for (int store_here = 0; store_here < 1; store_here++) { - if (is_root() && num_ones == (int)c->size.size() && hardware_parallelism > 1) { + if (is_root() && num_ones == (int)c->size.size() && params.parallelism > 1) { // Don't fuse into serial 
loops, or we could never parallelize this Func. continue; } in_threads_loop |= (children[child]->gpu_label == thread); // we must pass down union thread count constraints computed at block level when computing further in - auto opts = children[child]->compute_in_tiles(f, this, hardware_parallelism, target, search_space_options, v, store_here, in_threads_loop, false, union_counts); + auto opts = children[child]->compute_in_tiles(f, this, params, target, search_space_options, v, store_here, in_threads_loop, false, union_counts); for (IntrusivePtr &n : opts) { // (Only valid if one child calls f) Push the // computation into the child. Possibly leaving diff --git a/src/autoschedulers/anderson2021/LoopNest.h b/src/autoschedulers/anderson2021/LoopNest.h index 9d668f51bfa5..034838de9a8e 100644 --- a/src/autoschedulers/anderson2021/LoopNest.h +++ b/src/autoschedulers/anderson2021/LoopNest.h @@ -7,6 +7,7 @@ #define LOOP_NEST_H #include "ASLog.h" +#include "CostModel.h" #include "FunctionDAG.h" #include "GPULoopInfo.h" #include "GPUMemInfo.h" @@ -42,13 +43,13 @@ enum class GPUMemoryType { global, registers, inlined }; -bool may_subtile(); +bool may_subtile(const Anderson2021Params ¶ms); -int64_t get_shared_memory_limit(); +int64_t get_shared_memory_limit(const Anderson2021Params ¶ms); -int64_t get_active_block_hardware_limit(); +int64_t get_active_block_hardware_limit(const Anderson2021Params ¶ms); -int64_t get_active_warp_hardware_limit(); +int64_t get_active_warp_hardware_limit(const Anderson2021Params ¶ms); constexpr int64_t get_register_mem_alloc_limit() { return 128; @@ -172,6 +173,7 @@ struct LoopNest { // the newly inserted loop nests of f into a threads loop outside a serial loop. // V is the vectorized dimension of f. Adds loopnests created from each tiling option in result. 
bool add_gpu_thread_tilings(const FunctionDAG::Node *f, + const Anderson2021Params ¶ms, const Target &target, int v, vector> &result, @@ -318,13 +320,13 @@ struct LoopNest { void compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const; // Assume that when a block is active, all its warps are active - void compute_warp_and_block_occupancy(int parallelism, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const; + void compute_warp_and_block_occupancy(const Anderson2021Params ¶ms, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const; - void compute_shared_mem_occupancy(const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const; + void compute_shared_mem_occupancy(const Anderson2021Params ¶ms, const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const; std::pair find_innermost_and_parent() const; - int64_t points_accessed_per_thread(const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const; + int64_t points_accessed_per_thread(const Anderson2021Params ¶ms, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const; int64_t compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const; @@ -347,7 +349,7 @@ struct LoopNest { // Do a recursive walk over the loop nest computing 
features to feed the cost model. void compute_features(const FunctionDAG &dag, - int hardware_parallelism, + const Anderson2021Params ¶ms, const Target &target, const StageMap &sites, int64_t instances, @@ -426,11 +428,13 @@ struct LoopNest { bool tileable, int v, bool in_threads_loop, + const Anderson2021Params ¶ms, const Target &target); // Parallelize this loop according to the given tiling. IntrusivePtr parallelize_in_tiles(const vector &tiling, const LoopNest *parent, + const Anderson2021Params ¶ms, const Target &target, bool inner_tiling, bool adjust_tiling, @@ -451,7 +455,7 @@ struct LoopNest { // loop marked gpu_threads, in which case f's loops cannot be gpu_threads vector> compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, - int hardware_parallelism, + const Anderson2021Params ¶ms, const Target &target, const SearchSpaceOptions &search_space_options, int v, diff --git a/src/autoschedulers/anderson2021/SearchSpace.cpp b/src/autoschedulers/anderson2021/SearchSpace.cpp index ad4ac831528a..c616d918ede1 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.cpp +++ b/src/autoschedulers/anderson2021/SearchSpace.cpp @@ -6,20 +6,14 @@ namespace Halide { namespace Internal { namespace Autoscheduler { -bool use_randomized_tilings() { - static std::string randomization_str = get_env_variable("HL_RANDOMIZE_TILINGS"); - return randomization_str == "1"; -} - SearchSpace::SearchSpace(const FunctionDAG &dag, - int hardware_parallelism, + const Internal::Autoscheduler::Anderson2021Params ¶ms, const Target &target, - const std::string &search_space_options, std::mt19937 &rng, CostModel *cost_model, Statistics &stats, const LoopNestParser *partial_schedule) - : dag{dag}, hardware_parallelism{hardware_parallelism}, target{target}, search_space_options{search_space_options}, rng{rng}, cost_model{cost_model}, stats{stats}, randomize_tilings{use_randomized_tilings()}, partial_schedule{partial_schedule} { + : dag{dag}, params{params}, target{target}, 
search_space_options{params.search_space_options}, rng{rng}, cost_model{cost_model}, stats{stats}, partial_schedule{partial_schedule} { memoized_compute_root_blocks.make_large(dag.nodes.size()); } @@ -92,7 +86,7 @@ bool SearchSpace::add_states_from_memoized_blocks(const IntrusivePtr &sta new_root->children[block_index++] = new_block; } - if (child->calculate_cost(dag, hardware_parallelism, target, cost_model, stats)) { + if (child->calculate_cost(dag, params, target, cost_model, stats)) { num_children++; accept_child(std::move(child)); ++stats.num_block_memoization_hits; @@ -134,18 +128,18 @@ vector SearchSpace::filter_parallel_tile_option max_total = std::max(max_total, total); // If a stage does not have enough parallelism regardless of the - // tiling (i.e. its size is < hardware_parallelism * 2 before + // tiling (i.e. its size is < params.parallelism * 2 before // splitting), then the only tiling worth considering is the // one that retains the full extent in this dimension // (outer_tiling == size). 
In that case, skip over updating // min_total, otherwise it will be filtered out below - if (max_available >= hardware_parallelism * 2 || total != max_available) { + if (max_available >= params.parallelism * 2 || total != max_available) { if (min_total != 0) { min_total = std::min(min_total, total); } else { min_total = total; } - const double tasks_per_core = ((double)total) / hardware_parallelism; + const double tasks_per_core = ((double)total) / params.parallelism; o.idle_core_wastage = std::max(o.idle_core_wastage, std::ceil(tasks_per_core) / tasks_per_core); @@ -158,8 +152,8 @@ vector SearchSpace::filter_parallel_tile_option // Filter out the less useful options bool ok = - (min_total >= hardware_parallelism * 2 && - (max_total <= hardware_parallelism * 16 || target.has_gpu_feature())); + (min_total >= params.parallelism * 2 && + (max_total <= params.parallelism * 16 || target.has_gpu_feature())); if (!ok) { insufficient_parallelism.emplace_back(std::move(o)); @@ -169,7 +163,7 @@ vector SearchSpace::filter_parallel_tile_option options.emplace_back(std::move(o)); } - int64_t parallelism_limit = hardware_parallelism; + int64_t parallelism_limit = params.parallelism; while (options.empty()) { for (auto &o : insufficient_parallelism) { if (o.min_parallelism >= parallelism_limit) { @@ -219,7 +213,7 @@ void SearchSpace::process_pending_states(std::unordered_mapcalculate_cost(dag, hardware_parallelism, target, cost_model, stats)) { + if (entry.second[i]->calculate_cost(dag, params, target, cost_model, stats)) { num_children++; accept_child(std::move(entry.second[i])); accepted++; @@ -234,7 +228,7 @@ void SearchSpace::process_pending_states(std::unordered_mapcalculate_cost(dag, hardware_parallelism, target, cost_model, stats)) { + if (state->calculate_cost(dag, params, target, cost_model, stats)) { num_children++; accept_child(std::move(state)); stats.num_tilings_accepted++; @@ -259,7 +253,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, int 
next_node = state->num_decisions_made / 2; int phase = state->num_decisions_made % 2; - if (!may_subtile()) { + if (!may_subtile(params)) { // When emulating the older search space, we do all // parallelizing last, so that it is independent of the // tiling decisions. @@ -397,7 +391,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, std::unordered_map secondary_options; for (int vector_dim : vector_dims) { Timer timer; - auto tile_options = root->compute_in_tiles(node, nullptr, hardware_parallelism, target, search_space_options, vector_dim, false, false, is_pre_pass); + auto tile_options = root->compute_in_tiles(node, nullptr, params, target, search_space_options, vector_dim, false, false, is_pre_pass); stats.compute_in_tiles_time += timer.elapsed(); timer.restart(); @@ -405,7 +399,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, stats.filter_thread_tiles_time += timer.elapsed(); for (const auto &o : options) { - if (!randomize_tilings && num_children >= 1 && o.max_idle_lane_wastage > 0.5) { + if (!params.randomize_tilings && num_children >= 1 && o.max_idle_lane_wastage > 0.5) { Filter(o.loop_nest.get()) << "Excess idle lane wastage\n" << "max_idle_lane_wastage = " << o.max_idle_lane_wastage << "\n"; break; @@ -413,7 +407,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, ++stats.num_tilings_generated; - if (!randomize_tilings) { + if (!params.randomize_tilings) { if (add_child(state, o.loop_nest, accept_child)) { num_children++; } @@ -434,7 +428,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, } } - if (randomize_tilings) { + if (params.randomize_tilings) { process_pending_states(primary_options, secondary_options, num_children, accept_child, node); } } else { @@ -443,7 +437,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, bool should_parallelize = false; IntrusivePtr pure_stage; - if (hardware_parallelism > 1) { + if (params.parallelism > 1) { for (const auto &c : 
root->children) { if (c->node == node && node->dimensions > 0) { if (c->stage->index == 0) { @@ -484,7 +478,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, // step 1) parallelize all loop nests for this node into (parallel, serial) with given serial tiles for (auto &c : parallel_root.children) { if (c->node == node) { - c = c->parallelize_in_tiles(parallel_t, ¶llel_root, target, false, true); + c = c->parallelize_in_tiles(parallel_t, ¶llel_root, params, target, false, true); } } @@ -506,7 +500,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, for (auto &c : new_root->children) { if (c->node == node) { vector tiling((int)(c->size.size()), 1); - c = c->parallelize_in_tiles(tiling, new_root, target, false, true); + c = c->parallelize_in_tiles(tiling, new_root, params, target, false, true); } } if (add_child(state, new_root, accept_child)) { @@ -523,7 +517,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, double prev_idle_core_wastage = 0; for (const auto &o : options) { - if (!randomize_tilings && num_children >= 1 && o.idle_core_wastage > 1.2 && o.idle_core_wastage != prev_idle_core_wastage) { + if (!params.randomize_tilings && num_children >= 1 && o.idle_core_wastage > 1.2 && o.idle_core_wastage != prev_idle_core_wastage) { // We have considered several options, and the // remaining ones leave lots of cores idle. 
break; @@ -537,11 +531,11 @@ void SearchSpace::generate_children(const IntrusivePtr &state, for (auto &c : new_root->children) { if (c->node == node) { - c = c->parallelize_in_tiles(o.inner_tiling, new_root, target, true, false); + c = c->parallelize_in_tiles(o.inner_tiling, new_root, params, target, true, false); } } - if (!randomize_tilings) { + if (!params.randomize_tilings) { if (add_child(state, new_root, accept_child)) { num_children++; memoize_blocks(node, new_root); @@ -563,7 +557,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, } } - if (randomize_tilings) { + if (params.randomize_tilings) { process_pending_states(primary_options, secondary_options, num_children, accept_child, node); } } @@ -656,7 +650,7 @@ bool SearchSpace::add_child(const IntrusivePtr &state, auto child = state->make_child(); child->root = new_root; child->num_decisions_made++; - if (child->calculate_cost(dag, hardware_parallelism, target, cost_model, stats)) { + if (child->calculate_cost(dag, params, target, cost_model, stats)) { accept_child(std::move(child)); return true; } diff --git a/src/autoschedulers/anderson2021/SearchSpace.h b/src/autoschedulers/anderson2021/SearchSpace.h index 8688d0a90598..1e80c0e1760f 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.h +++ b/src/autoschedulers/anderson2021/SearchSpace.h @@ -23,13 +23,12 @@ namespace Autoscheduler { struct SearchSpace { using StateVector = std::vector>; const FunctionDAG &dag; - int hardware_parallelism; + const Anderson2021Params ¶ms; const Target ⌖ SearchSpaceOptions search_space_options; std::mt19937 &rng; CostModel *cost_model; Statistics &stats; - bool randomize_tilings; const LoopNestParser *partial_schedule; NodeMap inlined_nodes; @@ -37,9 +36,8 @@ struct SearchSpace { NodeMap>>> memoized_compute_root_blocks; SearchSpace(const FunctionDAG &dag, - int hardware_parallelism, + const Anderson2021Params ¶ms, const Target &target, - const std::string &search_space_options, std::mt19937 &rng, 
CostModel *cost_model, Statistics &stats, diff --git a/src/autoschedulers/anderson2021/State.cpp b/src/autoschedulers/anderson2021/State.cpp index 217b3956acfb..9cab832e7773 100644 --- a/src/autoschedulers/anderson2021/State.cpp +++ b/src/autoschedulers/anderson2021/State.cpp @@ -7,18 +7,8 @@ namespace Halide { namespace Internal { namespace Autoscheduler { -double get_stack_memory_adjustment_factor() { - string stack_factor_str = get_env_variable("HL_STACK_FACTOR"); - if (stack_factor_str.empty()) { - return 0.95; - } - - return std::atof(stack_factor_str.c_str()); -} - -int64_t get_stack_memory_limit() { - static double stack_factor = get_stack_memory_adjustment_factor(); - return stack_factor * 103232; +int64_t get_stack_memory_limit(const Anderson2021Params ¶ms) { + return params.stack_factor * 103232; } uint64_t State::structural_hash(int depth) const { @@ -37,7 +27,7 @@ void State::compute_loop_nest_parents(map> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap &total_shared_mem_alloc_sizes) const { +const LoopNest *State::deepest_valid_compute_location(const Anderson2021Params ¶ms, const map> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap &total_shared_mem_alloc_sizes) const { std::vector ancestors; // Innermost loop nests are never considered as compute locations @@ -74,7 +64,7 @@ const LoopNest *State::deepest_valid_compute_location(const mapstage); - if (total > get_shared_memory_limit()) { + if (total > get_shared_memory_limit(params)) { continue; } } @@ -104,7 +94,7 @@ const LoopNest *State::deepest_valid_compute_location(const mapgpu_label == block) { total_shared_mem_alloc_sizes.get(candidate->stage) += new_shared_mem_alloc_size; - internal_assert(total_shared_mem_alloc_sizes.get(candidate->stage) <= get_shared_memory_limit()); + internal_assert(total_shared_mem_alloc_sizes.get(candidate->stage) <= get_shared_memory_limit(params)); } 
internal_assert(new_register_alloc_size <= get_register_mem_alloc_limit()); @@ -235,13 +225,13 @@ void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest *loop_nest vector tiling(c->node->dimensions, 1); // Split into parallelized and serial - c = c->parallelize_in_tiles(tiling, loop_nest, target, true, false); + c = c->parallelize_in_tiles(tiling, loop_nest, params, target, true, false); if (vectorized_loop_index >= 0) { tiling[vectorized_loop_index] = inner_extent; } // Split parallelized into blocks and threads - c = c->parallelize_in_tiles(tiling, loop_nest, target, true, false); + c = c->parallelize_in_tiles(tiling, loop_nest, params, target, true, false); } else { // An update stage may have more or fewer dimensions than // the pure stage, but the tiling requires its dimensions to @@ -259,7 +249,7 @@ void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest *loop_nest // For update stages, split into parallelized and serial // (parallelize_in_tiles will move any RVars inwards and // make them serial) - c = c->parallelize_in_tiles(tiling, loop_nest, target, false, true); + c = c->parallelize_in_tiles(tiling, loop_nest, params, target, false, true); // If vectorized_loop_index < 0, then this update stage // likely does not loop over the vectorized loop of the @@ -272,7 +262,7 @@ void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest *loop_nest // Now that the RVars have been moved inwards, we can // split the outer loop into blocks and threads - c = c->parallelize_in_tiles(thread_tiling, loop_nest, target, true, false); + c = c->parallelize_in_tiles(thread_tiling, loop_nest, params, target, true, false); } } } @@ -304,7 +294,7 @@ void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest *loop_nest) // Mark as 'thread' so this loop is split into threads and // serial c->gpu_label = thread; - c = c->parallelize_in_tiles(tiling, loop_nest, target, false, true); + c = c->parallelize_in_tiles(tiling, loop_nest, 
params, target, false, true); } return; } @@ -344,17 +334,17 @@ void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest *loop_nest) // Mark as 'thread' so this loop is split into threads and // serial c->gpu_label = thread; - c = c->parallelize_in_tiles(tiling, loop_nest, target, false, true); + c = c->parallelize_in_tiles(tiling, loop_nest, params, target, false, true); } } } -IntrusivePtr State::get_root_for_features(int hardware_parallelism, const Target &target) const { +IntrusivePtr State::get_root_for_features(const Anderson2021Params ¶ms, const Target &target) const { if (!has_compute_root_loops_without_blocks() && !has_loop_nest_without_thread_loops()) { return root; } - FeatureLoopNestMutator mutator{hardware_parallelism, target}; + FeatureLoopNestMutator mutator{params, target}; // We copy the loop nest in 2 cases: // - If the current loop nest has compute root loops without blocks (it is @@ -399,8 +389,8 @@ void State::set_gpu_store_site(const map *features, Statistics &stats, bool verbose) const { - auto feature_root = get_root_for_features(hardware_parallelism, target); +bool State::compute_featurization(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, StageMap *features, Statistics &stats, bool verbose) const { + auto feature_root = get_root_for_features(params, target); StageMap sites; sites.make_large(dag.nodes[0].stages[0].max_id); @@ -487,7 +477,7 @@ bool State::compute_featurization(const FunctionDAG &dag, int hardware_paralleli // If 'loop' would never be considered as a compute location (i.e. 
by // LoopNest::compute_in_tiles()), walk up the loop nest until we reach a // location that would be considered - loop = deepest_valid_compute_location(parent, n, loop, feature_root.get(), total_shared_mem_alloc_sizes); + loop = deepest_valid_compute_location(params, parent, n, loop, feature_root.get(), total_shared_mem_alloc_sizes); int64_t num_realizations = total_loop_extents_of_ancestors(parent, loop); for (const auto &stage : n.stages) { @@ -506,7 +496,7 @@ bool State::compute_featurization(const FunctionDAG &dag, int hardware_paralleli } Timer timer; - feature_root->compute_features(dag, hardware_parallelism, target, sites, 1, 1, nullptr, nullptr, *feature_root, nullptr, nullptr, nullptr, features, {feature_root.get()}, true, total_shared_mem_alloc_sizes, stats, verbose); + feature_root->compute_features(dag, params, target, sites, 1, 1, nullptr, nullptr, *feature_root, nullptr, nullptr, nullptr, features, {feature_root.get()}, true, total_shared_mem_alloc_sizes, stats, verbose); stats.featurization_time += timer.elapsed(); ++stats.num_featurizations; @@ -522,10 +512,10 @@ bool State::compute_featurization(const FunctionDAG &dag, int hardware_paralleli return true; } -void State::save_featurization(const FunctionDAG &dag, int hardware_parallelism, const Target &target, std::ostream &out) const { +void State::save_featurization(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, std::ostream &out) const { StageMap features; Statistics stats; - compute_featurization(dag, hardware_parallelism, target, &features, stats); + compute_featurization(dag, params, target, &features, stats); for (const auto &n : dag.nodes) { if (n.is_input) { @@ -627,12 +617,12 @@ int64_t State::get_shared_mem_alloc_size(const LoopNest *block, const LoopNest * return result; } -bool State::exceeds_shared_memory_limit(const Target &target) const { +bool State::exceeds_shared_memory_limit(const Anderson2021Params ¶ms, const Target &target) const { if 
(!target.has_gpu_feature()) { return false; } - static int64_t limit = get_shared_memory_limit(); + static int64_t limit = get_shared_memory_limit(params); if (limit == 0) { return false; @@ -649,13 +639,13 @@ bool State::exceeds_shared_memory_limit(const Target &target) const { return false; } -bool State::exceeds_local_memory_limit(const Target &target) const { +bool State::exceeds_local_memory_limit(const Anderson2021Params ¶ms, const Target &target) const { if (!target.has_gpu_feature()) { return false; } for (const auto &c : root->children) { - if (c->get_total_constant_local_mem_alloc_size() > get_stack_memory_limit()) { + if (c->get_total_constant_local_mem_alloc_size() > get_stack_memory_limit(params)) { return true; } @@ -667,19 +657,19 @@ bool State::exceeds_local_memory_limit(const Target &target) const { return false; } -bool State::calculate_cost(const FunctionDAG &dag, int hardware_parallelism, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose) { +bool State::calculate_cost(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose) { Timer timer; if (!root->has_valid_thread_extents()) { Filter(root.get()) << "Invalid thread extents\n"; return false; } - if (exceeds_shared_memory_limit(target)) { + if (exceeds_shared_memory_limit(params, target)) { Filter(root.get()) << "Exceeds shared memory limit\n"; return false; } - if (exceeds_local_memory_limit(target)) { + if (exceeds_local_memory_limit(params, target)) { Filter(root.get()) << "Exceeds local memory limit\n"; return false; } @@ -693,7 +683,7 @@ bool State::calculate_cost(const FunctionDAG &dag, int hardware_parallelism, con StageMap features; - if (!compute_featurization(dag, hardware_parallelism, target, &features, stats, verbose)) { + if (!compute_featurization(dag, params, target, &features, stats, verbose)) { Filter(root.get()) << "Contains a local allocation that likely cannot be promoted to 
registers\n"; return false; } @@ -981,13 +971,13 @@ bool State::can_fuse_gpu(const vector ¶llel_extents) const { // Apply the schedule represented by this state to a Halide // Pipeline. Also generate source code for the schedule for the // user to copy-paste to freeze this schedule as permanent artifact. -void State::apply_schedule(const FunctionDAG &dag, int hardware_parallelism, const Target &target) { +void State::apply_schedule(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target) { StageMap> state_map; std::vector ancestors; NodeMap all_inlined; root->collect_all_inlined(all_inlined); - root->apply(LoopLevel::root(), state_map, hardware_parallelism, 0, nullptr, nullptr, target, ancestors, all_inlined); + root->apply(LoopLevel::root(), state_map, params.parallelism, 0, nullptr, nullptr, target, ancestors, all_inlined); std::ostringstream src; std::unordered_set new_serial_vars; diff --git a/src/autoschedulers/anderson2021/State.h b/src/autoschedulers/anderson2021/State.h index f3574507aa3d..846c895a4c53 100644 --- a/src/autoschedulers/anderson2021/State.h +++ b/src/autoschedulers/anderson2021/State.h @@ -106,7 +106,7 @@ struct State { bool has_compute_root_loops_without_blocks() const; struct FeatureLoopNestMutator { - int hardware_parallelism; + const Anderson2021Params ¶ms; const Target ⌖ void operator()(LoopNest *new_loop_nest) const; @@ -122,13 +122,13 @@ struct State { void add_outer_thread_loops(LoopNest *loop_nest) const; }; - IntrusivePtr get_root_for_features(int hardware_parallelism, const Target &target) const; + IntrusivePtr get_root_for_features(const Anderson2021Params ¶ms, const Target &target) const; void set_gpu_store_site(const map> &parent, const LoopNest *loop, LoopNest::Sites &site) const; - bool compute_featurization(const FunctionDAG &dag, int hardware_parallelism, const Target &target, StageMap *features, Statistics &stats, bool verbose = false) const; + bool compute_featurization(const FunctionDAG &dag, const 
Anderson2021Params ¶ms, const Target &target, StageMap *features, Statistics &stats, bool verbose = false) const; - void save_featurization(const FunctionDAG &dag, int hardware_parallelism, const Target &target, std::ostream &out) const; + void save_featurization(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, std::ostream &out) const; bool contains_store_at(const set &outermost_store_at, const IntrusivePtr &parent) const; @@ -143,11 +143,11 @@ struct State { int64_t get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const; - bool exceeds_shared_memory_limit(const Target &target) const; + bool exceeds_shared_memory_limit(const Anderson2021Params ¶ms, const Target &target) const; - bool exceeds_local_memory_limit(const Target &target) const; + bool exceeds_local_memory_limit(const Anderson2021Params ¶ms, const Target &target) const; - bool calculate_cost(const FunctionDAG &dag, int hardware_parallelism, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose = false); + bool calculate_cost(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose = false); // Make a child copy of this state. The loop nest is const (we // make mutated copies of it, rather than mutating it), so we can @@ -170,13 +170,13 @@ struct State { // Apply the schedule represented by this state to a Halide // Pipeline. Also generate source code for the schedule for the // user to copy-paste to freeze this schedule as permanent artifact. 
- void apply_schedule(const FunctionDAG &dag, int hardware_parallelism, const Target &target); + void apply_schedule(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target); bool should_always_consider_inline(const FunctionDAG::Node *node) const; void add_to_always_consider_inline_options(const FunctionDAG::Node *node); void update_always_consider_inline_options(const FunctionDAG::Node *node); - const LoopNest *deepest_valid_compute_location(const map> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap &total_shared_mem_alloc_sizes) const; + const LoopNest *deepest_valid_compute_location(const Anderson2021Params ¶ms, const map> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap &total_shared_mem_alloc_sizes) const; int64_t total_loop_extents_of_ancestors(const map> &parent, const LoopNest *loop) const; }; diff --git a/src/autoschedulers/anderson2021/check_weights.cpp b/src/autoschedulers/anderson2021/check_weights.cpp index a9ac923336b4..08697e053baa 100644 --- a/src/autoschedulers/anderson2021/check_weights.cpp +++ b/src/autoschedulers/anderson2021/check_weights.cpp @@ -8,12 +8,6 @@ using namespace Halide; -std::string getenv_safe(const char *key) { - const char *value = getenv(key); - if (!value) value = ""; - return value; -} - int check_weights(const std::string &filename, const std::vector &shape) { Runtime::Buffer buf(shape); @@ -36,13 +30,13 @@ int check_weights(const std::string &filename, const std::vector &shape) { int main(int argc, char **argv) { using std::string; - string weights_dir = getenv_safe("HL_WEIGHTS_DIR"); + string weights_dir = argc > 1 ? argv[1] : ""; if (weights_dir.empty()) { std::cout << "No weights_dir specified. 
Exiting.\n"; return 0; } - std::cout << "Checking weights...\n"; + std::cout << "Checking weights from " << weights_dir << " ...\n"; int num_nans = check_weights(weights_dir + "/head1_conv1_weight.data", {head1_channels, head1_w, head1_h}); num_nans = check_weights(weights_dir + "/head1_conv1_bias.data", {head1_channels}); diff --git a/src/autoschedulers/anderson2021/test/bounds.cpp b/src/autoschedulers/anderson2021/test/bounds.cpp index a6ff1361e1a8..4814b36d7248 100644 --- a/src/autoschedulers/anderson2021/test/bounds.cpp +++ b/src/autoschedulers/anderson2021/test/bounds.cpp @@ -8,6 +8,7 @@ using namespace Halide::Internal::Autoscheduler; void test_bounds() { Target target("host-cuda"); + Anderson2021Params params; Var x("x"), y("y"); { @@ -33,16 +34,16 @@ void test_bounds() { std::unique_ptr root = std::make_unique(); // Compute h at root - root->compute_here(node_h, true, 0, false, target); + root->compute_here(node_h, true, 0, false, params, target); // Tile h std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); const auto &thread = root->children[0]->children[0]; const auto &thread_bounds_g = thread->get_bounds(node_g); @@ -73,16 +74,16 @@ void test_bounds() { std::unique_ptr root = std::make_unique(); // Compute h at root - root->compute_here(node_out, true, 0, false, target); + root->compute_here(node_out, true, 0, false, params, target); // Tile h std::vector tiling; tiling.push_back(2); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, 
true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); const auto &thread = root->children[0]->children[0]; const auto &thread_bounds_g = thread->get_bounds(node_g); @@ -121,16 +122,16 @@ void test_bounds() { std::unique_ptr root = std::make_unique(); // Compute out at root - root->compute_here(node_out, true, 0, false, target); + root->compute_here(node_out, true, 0, false, params, target); // Tile out std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); @@ -179,16 +180,16 @@ void test_bounds() { std::unique_ptr root = std::make_unique(); // Compute out at root - root->compute_here(node_out, true, 0, false, target); + root->compute_here(node_out, true, 0, false, params, target); // Tile out std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = 
root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); @@ -279,16 +280,16 @@ void test_bounds() { std::unique_ptr root = std::make_unique(); // Compute out at root - root->compute_here(node_out, true, 0, false, target); + root->compute_here(node_out, true, 0, false, params, target); // Tile out std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); diff --git a/src/autoschedulers/anderson2021/test/state.cpp b/src/autoschedulers/anderson2021/test/state.cpp index b283243cc55a..f2c1508acb73 100644 --- a/src/autoschedulers/anderson2021/test/state.cpp +++ b/src/autoschedulers/anderson2021/test/state.cpp @@ -8,6 +8,7 @@ using namespace Halide::Internal::Autoscheduler; void test_state() { Target target("host-cuda"); + Anderson2021Params params; // Test update_always_consider_inline_options Var x("x"), y("y"); @@ -34,7 +35,7 @@ void test_state() { std::unique_ptr root = std::make_unique(); // Compute h at root - root->compute_here(node_h, true, 0, false, target); + root->compute_here(node_h, true, 0, false, params, target); std::unique_ptr state = std::make_unique(); state->root = root.release(); diff --git a/src/autoschedulers/anderson2021/test/storage_strides.cpp b/src/autoschedulers/anderson2021/test/storage_strides.cpp index b1b4ed4c83db..58c782cd9e3d 100644 --- a/src/autoschedulers/anderson2021/test/storage_strides.cpp +++ 
b/src/autoschedulers/anderson2021/test/storage_strides.cpp @@ -7,6 +7,7 @@ using namespace Halide::Internal::Autoscheduler; void test_bounds() { Target target("host-cuda"); + Anderson2021Params params; bool verbose = false; int bytes_per_point = 4; @@ -34,16 +35,16 @@ void test_bounds() { std::unique_ptr root = std::make_unique(); // Compute h at root - root->compute_here(node_h, true, 0, false, target); + root->compute_here(node_h, true, 0, false, params, target); // Tile h std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); const auto &thread = root->children[0]->children[0]; const auto &thread_bounds_g = thread->get_bounds(node_g); @@ -86,16 +87,16 @@ void test_bounds() { std::unique_ptr root = std::make_unique(); // Compute out at root - root->compute_here(node_out, true, 0, false, target); + root->compute_here(node_out, true, 0, false, params, target); // Tile out std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); @@ -142,16 +143,16 @@ void test_bounds() { 
std::unique_ptr root = std::make_unique(); // Compute out at root - root->compute_here(node_out, true, 0, false, target); + root->compute_here(node_out, true, 0, false, params, target); // Tile out std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); @@ -202,16 +203,16 @@ void test_bounds() { std::unique_ptr root = std::make_unique(); // Compute out at root - root->compute_here(node_out, true, 0, false, target); + root->compute_here(node_out, true, 0, false, params, target); // Tile out std::vector tiling; tiling.push_back(1); // Serial loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); tiling.back() = 32; // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); @@ -263,19 +264,19 @@ void test_bounds() { std::unique_ptr root = std::make_unique(); // Compute out at root - root->compute_here(node_out, true, 0, false, target); + root->compute_here(node_out, true, 0, false, params, target); // Tile out std::vector tiling; tiling.push_back(1); tiling.push_back(1); // Serial loop - root->children[0] = 
root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); tiling.clear(); tiling.push_back(1); tiling.push_back(32); // Thread loop - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); @@ -333,23 +334,23 @@ void test_bounds() { std::unique_ptr root = std::make_unique(); // Compute out at root - root->compute_here(node_out, true, 0, false, target); + root->compute_here(node_out, true, 0, false, params, target); // Tile out std::vector tiling; tiling.push_back(1); tiling.push_back(1); // Serial loop - auto thread_loop = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + auto thread_loop = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); std::unique_ptr thread_loop_copy{new LoopNest}; thread_loop_copy->copy_from(*thread_loop); - thread_loop_copy->compute_here(node_f, true, 0, false, target); + thread_loop_copy->compute_here(node_f, true, 0, false, params, target); tiling.clear(); tiling.push_back(32); tiling.push_back(1); // Thread loop root->children[0] = thread_loop_copy.release(); - root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), target, true, false); + root->children[0] = root->children[0]->parallelize_in_tiles(tiling, root.get(), params, target, true, false); std::unique_ptr root_copy{new LoopNest}; root_copy->copy_from(*root); From 31fad63592070a04203ea61c5378cc14b7195f24 Mon Sep 17 00:00:00 2001 From: aekul Date: Sun, 11 Sep 2022 17:54:33 -0400 Subject: [PATCH 29/63] fix cmake error --- src/autoschedulers/anderson2021/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt index 53b80e4c43fa..2cd9278f899d 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -73,7 +73,7 @@ target_link_libraries(Halide_Anderson2021 PRIVATE ASLog ParamParser add_executable(anderson2021_demo.generator ${COMMON_DIR}/demo_generator.cpp) target_link_libraries(anderson2021_demo.generator PRIVATE Halide::Halide Halide::Generator) -add_halide_library(anderson2021_demo FROM demo.generator +add_halide_library(anderson2021_demo FROM anderson2021_demo.generator GENERATOR demo FUNCTION_NAME demo # TODO(someone) From c4e93d8edac830b9dfcd5f3aa69d032ca65d6598 Mon Sep 17 00:00:00 2001 From: aekul Date: Sun, 11 Sep 2022 22:39:50 -0400 Subject: [PATCH 30/63] fix Weights.cpp path --- src/autoschedulers/adams2019/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/autoschedulers/adams2019/CMakeLists.txt b/src/autoschedulers/adams2019/CMakeLists.txt index 03746f1a6f14..ff5fc6309a32 100644 --- a/src/autoschedulers/adams2019/CMakeLists.txt +++ b/src/autoschedulers/adams2019/CMakeLists.txt @@ -126,7 +126,7 @@ add_executable(adams2019_featurization_to_sample ${COMMON_DIR}/featurization_to_ add_executable(adams2019_get_host_target ${COMMON_DIR}/get_host_target.cpp) target_link_libraries(adams2019_get_host_target PRIVATE Halide::Halide) -add_executable(adams2019_weightsdir_to_weightsfile ${COMMON_DIR}/weightsdir_to_weightsfile.cpp Weights.cpp) +add_executable(adams2019_weightsdir_to_weightsfile ${COMMON_DIR}/weightsdir_to_weightsfile.cpp ${COMMON_DIR}/Weights.cpp) target_link_libraries(adams2019_weightsdir_to_weightsfile PRIVATE Halide::Runtime) # ================================================================= From 82da090b6422658fe4eef78cb01302a8b5e645d0 Mon Sep 17 00:00:00 2001 From: aekul Date: Sat, 5 Nov 2022 14:06:03 -0700 Subject: [PATCH 31/63] Remove usage of include_directories 
--- src/autoschedulers/adams2019/CMakeLists.txt | 6 +++- .../anderson2021/CMakeLists.txt | 35 ++++++++++--------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/autoschedulers/adams2019/CMakeLists.txt b/src/autoschedulers/adams2019/CMakeLists.txt index ff5fc6309a32..ab9ffe6a501f 100644 --- a/src/autoschedulers/adams2019/CMakeLists.txt +++ b/src/autoschedulers/adams2019/CMakeLists.txt @@ -3,7 +3,6 @@ ## set(COMMON_DIR "${Halide_SOURCE_DIR}/src/autoschedulers/common") -include_directories("${Halide_SOURCE_DIR}/src/autoschedulers/adams2019") function(add_adams2019_test NAME) set(options "") @@ -59,6 +58,7 @@ add_executable(adams2019_retrain_cost_model ${COMMON_DIR}/Weights.cpp retrain_cost_model.cpp $) +target_include_directories(adams2019_retrain_cost_model PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/adams2019") target_link_libraries(adams2019_retrain_cost_model PRIVATE ASLog adams2019_cost_model adams2019_train_cost_model Halide::Halide Halide::Plugin) ## @@ -76,6 +76,7 @@ add_autoscheduler(NAME Adams2019 ${COMMON_DIR}/Weights.cpp $) +target_include_directories(Halide_Adams2019 PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/adams2019") target_link_libraries(Halide_Adams2019 PRIVATE ASLog ParamParser adams2019_cost_model adams2019_train_cost_model) ## @@ -103,6 +104,7 @@ add_adams2019_test(adams2019_demo_apps_autoscheduler # ================================================================= add_executable(adams2019_included_schedule_file.generator ${COMMON_DIR}/included_schedule_file_generator.cpp) +target_include_directories(adams2019_included_schedule_file.generator PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/adams2019") target_link_libraries(adams2019_included_schedule_file.generator PRIVATE Halide::Generator) add_halide_library(adams2019_included_schedule_file FROM adams2019_included_schedule_file.generator @@ -127,6 +129,7 @@ add_executable(adams2019_get_host_target ${COMMON_DIR}/get_host_target.cpp) 
target_link_libraries(adams2019_get_host_target PRIVATE Halide::Halide) add_executable(adams2019_weightsdir_to_weightsfile ${COMMON_DIR}/weightsdir_to_weightsfile.cpp ${COMMON_DIR}/Weights.cpp) +target_include_directories(adams2019_weightsdir_to_weightsfile PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/adams2019") target_link_libraries(adams2019_weightsdir_to_weightsfile PRIVATE Halide::Runtime) # ================================================================= @@ -151,6 +154,7 @@ add_adams2019_test(adams2019_test_perfect_hash_map) ## add_executable(adams2019_test_function_dag ${COMMON_DIR}/test_function_dag.cpp FunctionDAG.cpp) +target_include_directories(adams2019_test_function_dag PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/adams2019") target_link_libraries(adams2019_test_function_dag PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) add_adams2019_test(adams2019_test_function_dag) diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt index 2cd9278f899d..6b1cc81f8d01 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -2,15 +2,7 @@ # Resources for the autoscheduler library ## -# TODO(someone): `TARGETS cmake` cannot possibly be right for most of -# the cases here, as it is just the arch-bits-os of the current `Halide_TARGET`, -# so no GPU features will ever be present. I've commented out usages -# that seem obviously wrong to me. 
- set(COMMON_DIR "${Halide_SOURCE_DIR}/src/autoschedulers/common") -include_directories("${Halide_BINARY_DIR}/include") -include_directories(${COMMON_DIR}) -include_directories("${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021") # weights set(WF_CPP baseline.cpp) @@ -40,6 +32,7 @@ add_executable(anderson2021_retrain_cost_model ${COMMON_DIR}/Weights.cpp retrain_cost_model.cpp ${WF_CPP}) +target_include_directories(anderson2021_retrain_cost_model PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021") target_link_libraries(anderson2021_retrain_cost_model PRIVATE ASLog anderson2021_cost_model anderson2021_train_cost_model Halide::Halide Halide::Plugin) @@ -60,6 +53,7 @@ add_autoscheduler(NAME Anderson2021 ${COMMON_DIR}/Weights.cpp ${WF_CPP}) +target_include_directories(Halide_Anderson2021 PUBLIC "${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021") target_link_libraries(Halide_Anderson2021 PRIVATE ASLog ParamParser anderson2021_cost_model anderson2021_train_cost_model) @@ -76,13 +70,13 @@ target_link_libraries(anderson2021_demo.generator PRIVATE Halide::Halide Halide: add_halide_library(anderson2021_demo FROM anderson2021_demo.generator GENERATOR demo FUNCTION_NAME demo - # TODO(someone) - # TARGETS cmake - AUTOSCHEDULER Halide::Anderson2021 - REGISTRATION DEMO_REGISTRATION_FILE) + TARGETS cmake + FEATURES cuda cuda_capability_70 + AUTOSCHEDULER Halide::Anderson2021 + REGISTRATION DEMO_REGISTRATION_FILE) add_executable(anderson2021_demo_apps_autoscheduler ${DEMO_REGISTRATION_FILE}) -target_link_libraries(anderson2021_demo_apps_autoscheduler PRIVATE demo Halide::RunGenMain) +target_link_libraries(anderson2021_demo_apps_autoscheduler PRIVATE anderson2021_demo Halide::RunGenMain) add_test(NAME anderson2021_demo_apps_autoscheduler COMMAND anderson2021_demo_apps_autoscheduler --benchmarks=all --benchmark_min_time=1 --estimate_all) @@ -93,6 +87,7 @@ set_tests_properties(anderson2021_demo_apps_autoscheduler ## 
================================================================= add_executable(anderson2021_included_schedule_file.generator ${COMMON_DIR}/included_schedule_file_generator.cpp) +target_include_directories(anderson2021_included_schedule_file.generator PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021") target_link_libraries(anderson2021_included_schedule_file.generator PRIVATE Halide::Halide Halide::Generator) @@ -100,13 +95,13 @@ add_halide_library(anderson2021_included_schedule_file FROM anderson2021_included_schedule_file.generator GENERATOR included_schedule_file FUNCTION_NAME included_schedule_file - # TODO(someone) - # TARGETS cmake + TARGETS cmake + FEATURES cuda cuda_capability_70 AUTOSCHEDULER Halide::Anderson2021 REGISTRATION included_schedule_reg) add_executable(anderson2021_demo_included_schedule_file ${included_schedule_reg}) -target_link_libraries(anderson2021_demo_included_schedule_file PRIVATE included_schedule_file Halide::RunGenMain) +target_link_libraries(anderson2021_demo_included_schedule_file PRIVATE anderson2021_included_schedule_file Halide::RunGenMain) add_test(NAME anderson2021_demo_included_schedule_file COMMAND anderson2021_demo_included_schedule_file --benchmarks=all --benchmark_min_time=1 --estimate_all) @@ -124,6 +119,7 @@ add_executable(anderson2021_get_host_target ${COMMON_DIR}/get_host_target.cpp) target_link_libraries(anderson2021_get_host_target PRIVATE Halide::Halide) add_executable(anderson2021_weightsdir_to_weightsfile ${COMMON_DIR}/weightsdir_to_weightsfile.cpp ${COMMON_DIR}/Weights.cpp) +target_include_directories(anderson2021_weightsdir_to_weightsfile PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021" ${COMMON_DIR}) target_link_libraries(anderson2021_weightsdir_to_weightsfile PRIVATE Halide::Runtime) # ================================================================= @@ -154,6 +150,7 @@ set_tests_properties(anderson2021_test_perfect_hash_map ## add_executable(anderson2021_test_function_dag 
${COMMON_DIR}/test_function_dag.cpp FunctionDAG.cpp) +target_include_directories(anderson2021_test_function_dag PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021") target_link_libraries(anderson2021_test_function_dag PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) add_test(NAME anderson2021_test_function_dag COMMAND anderson2021_test_function_dag) @@ -162,6 +159,7 @@ set_tests_properties(anderson2021_test_function_dag LABELS Anderson2021) add_executable(anderson2021_test_bounds test/bounds.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp Tiling.cpp) +target_include_directories(anderson2021_test_bounds PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021") target_link_libraries(anderson2021_test_bounds PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) add_test(NAME anderson2021_test_bounds COMMAND anderson2021_test_bounds) @@ -170,6 +168,7 @@ set_tests_properties(anderson2021_test_bounds LABELS Anderson2021) add_executable(anderson2021_test_parser test/parser.cpp) +target_include_directories(anderson2021_test_parser PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021") target_link_libraries(anderson2021_test_parser PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) add_test(NAME anderson2021_test_parser COMMAND anderson2021_test_parser) @@ -178,6 +177,7 @@ set_tests_properties(anderson2021_test_parser LABELS Anderson2021) add_executable(anderson2021_test_state test/state.cpp FunctionDAG.cpp LoopNest.cpp GPULoopInfo.cpp State.cpp Tiling.cpp) +target_include_directories(anderson2021_test_state PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021") target_link_libraries(anderson2021_test_state PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) add_test(NAME anderson2021_test_state COMMAND anderson2021_test_state) @@ -186,6 +186,7 @@ set_tests_properties(anderson2021_test_state LABELS Anderson2021) add_executable(anderson2021_test_storage_strides test/storage_strides.cpp FunctionDAG.cpp 
LoopNest.cpp GPULoopInfo.cpp State.cpp Tiling.cpp) +target_include_directories(anderson2021_test_storage_strides PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021") target_link_libraries(anderson2021_test_storage_strides PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) add_test(NAME anderson2021_test_storage_strides COMMAND anderson2021_test_storage_strides) @@ -195,6 +196,7 @@ set_tests_properties(anderson2021_test_storage_strides add_executable(anderson2021_test_thread_info test/thread_info.cpp LoopNest.cpp FunctionDAG.cpp GPULoopInfo.cpp Tiling.cpp) +target_include_directories(anderson2021_test_thread_info PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021") target_link_libraries(anderson2021_test_thread_info PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) add_test(NAME anderson2021_test_thread_info COMMAND anderson2021_test_thread_info) @@ -203,6 +205,7 @@ set_tests_properties(anderson2021_test_thread_info LABELS Anderson2021) add_executable(anderson2021_test_tiling test/tiling.cpp Tiling.cpp) +target_include_directories(anderson2021_test_tiling PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021") target_link_libraries(anderson2021_test_tiling PRIVATE ASLog Halide::Halide Halide::Tools Halide::Plugin) add_test(NAME anderson2021_test_tiling COMMAND anderson2021_test_tiling) From 99627a50ff8d982b4d83103e52cdc5f39b8ed65a Mon Sep 17 00:00:00 2001 From: aekul Date: Sun, 4 Dec 2022 23:04:41 -0800 Subject: [PATCH 32/63] Move tests to test directory --- .../included_schedule_file.schedule.h | 60 ------------- test/autoschedulers/CMakeLists.txt | 1 + .../anderson2021/CMakeLists.txt | 85 +++++++++++++++++++ .../anderson2021/demo_generator.cpp | 51 +++++++++++ .../included_schedule_file_generator.cpp | 54 ++++++++++++ .../autoschedulers/anderson2021/test.cpp | 0 6 files changed, 191 insertions(+), 60 deletions(-) delete mode 100644 src/autoschedulers/anderson2021/included_schedule_file.schedule.h create mode 100644 
test/autoschedulers/anderson2021/CMakeLists.txt create mode 100644 test/autoschedulers/anderson2021/demo_generator.cpp create mode 100644 test/autoschedulers/anderson2021/included_schedule_file_generator.cpp rename {src => test}/autoschedulers/anderson2021/test.cpp (100%) diff --git a/src/autoschedulers/anderson2021/included_schedule_file.schedule.h b/src/autoschedulers/anderson2021/included_schedule_file.schedule.h deleted file mode 100644 index c84aab4cbc0c..000000000000 --- a/src/autoschedulers/anderson2021/included_schedule_file.schedule.h +++ /dev/null @@ -1,60 +0,0 @@ - -#ifndef included_schedule_file_SCHEDULE_H -#define included_schedule_file_SCHEDULE_H - -// MACHINE GENERATED -- DO NOT EDIT -// This schedule was automatically generated by apps/autoscheduler/AutoSchedule -// for target=x86-64-osx-avx-avx2-f16c-fma-sse41 -// with machine_params=16,16777216,40 - -#include "Halide.h" - -inline void apply_schedule_included_schedule_file( - ::Halide::Pipeline pipeline, - ::Halide::Target target) { - using ::Halide::Func; - using ::Halide::MemoryType; - using ::Halide::RVar; - using ::Halide::TailStrategy; - using ::Halide::Var; - - Func relu = pipeline.get_func(4); - Func conv = pipeline.get_func(3); - Var c(relu.get_schedule().dims()[0].var); - Var ci("ci"); - Var n(relu.get_schedule().dims()[3].var); - Var x(relu.get_schedule().dims()[1].var); - Var xi("xi"); - Var y(relu.get_schedule().dims()[2].var); - Var yi("yi"); - RVar r4_x(conv.update(0).get_schedule().dims()[0].var); - RVar r4_y(conv.update(0).get_schedule().dims()[1].var); - RVar r4_z(conv.update(0).get_schedule().dims()[2].var); - relu - .split(x, x, xi, 2, TailStrategy::ShiftInwards) - .split(c, c, ci, 8, TailStrategy::ShiftInwards) - .split(y, y, yi, 4, TailStrategy::ShiftInwards) - .unroll(xi) - .unroll(yi) - .vectorize(ci) - .compute_root() - .reorder(ci, xi, yi, c, y, x, n) - .fuse(x, n, x) - .parallel(x); - conv.update(0) - .split(c, c, ci, 8, TailStrategy::GuardWithIf) - .unroll(x) - .unroll(y) 
- .vectorize(ci) - .reorder(ci, c, x, y, n, r4_x, r4_y, r4_z); - conv - .store_in(MemoryType::Stack) - .split(c, c, ci, 8, TailStrategy::ShiftInwards) - .unroll(x) - .unroll(y) - .vectorize(ci) - .compute_at(relu, c) - .reorder(ci, c, x, y, n); -} - -#endif // included_schedule_file_SCHEDULE_H diff --git a/test/autoschedulers/CMakeLists.txt b/test/autoschedulers/CMakeLists.txt index 302ea6489378..12a90bd21622 100644 --- a/test/autoschedulers/CMakeLists.txt +++ b/test/autoschedulers/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(adams2019) +add_subdirectory(anderson2021) add_subdirectory(li2018) add_subdirectory(mullapudi2016) diff --git a/test/autoschedulers/anderson2021/CMakeLists.txt b/test/autoschedulers/anderson2021/CMakeLists.txt new file mode 100644 index 000000000000..f87b40a63caf --- /dev/null +++ b/test/autoschedulers/anderson2021/CMakeLists.txt @@ -0,0 +1,85 @@ +if (NOT TARGET Halide::Anderson2021) + message(STATUS "Disabling anderson2021 tests for static Halide") + return() +endif () + +## +# Build rules for the Anderson2021 autoscheduler library +## + +function(add_anderson2021_test NAME) + set(options "") + set(oneValueArgs ENVIRONMENT) + set(multiValueArgs COMMAND LABELS) + cmake_parse_arguments(ARGS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if (NOT ARGS_COMMAND) + set(ARGS_COMMAND ${NAME}) + endif() + + if (NOT ARGS_LABELS) + set(ARGS_LABELS "") + endif() + list(APPEND ARGS_LABELS anderson2021) + list(APPEND ARGS_LABELS autoschedulers) + + add_test(NAME ${NAME} + COMMAND ${ARGS_COMMAND}) + set_tests_properties(${NAME} + PROPERTIES + LABELS "${ARGS_LABELS}" + ENVIRONMENT "${ENVIRONMENT}") +endfunction() + + +## +# Tests and demos +## + +# ================================================================= + +add_halide_generator(anderson2021_demo.generator + SOURCES demo_generator.cpp) + +add_halide_library(anderson2021_demo FROM anderson2021_demo.generator + GENERATOR demo + TARGETS cmake + FEATURES cuda cuda_capability_70 + 
AUTOSCHEDULER Halide::Anderson2021 + REGISTRATION DEMO_REGISTRATION_FILE) + +add_executable(anderson2021_demo_apps_autoscheduler ${DEMO_REGISTRATION_FILE}) +target_link_libraries(anderson2021_demo_apps_autoscheduler PRIVATE anderson2021_demo Halide::RunGenMain) + +add_anderson2021_test(anderson2021_demo_apps_autoscheduler + COMMAND anderson2021_demo_apps_autoscheduler --benchmarks=all --benchmark_min_time=1 --estimate_all) + +# ================================================================= + +add_halide_generator(anderson2021_included_schedule_file.generator + SOURCES included_schedule_file_generator.cpp) + +add_halide_library(anderson2021_included_schedule_file FROM anderson2021_included_schedule_file.generator + GENERATOR included_schedule_file + TARGETS cmake + FEATURES cuda cuda_capability_70 + AUTOSCHEDULER Halide::Anderson2021 + REGISTRATION anderson2021_included_schedule_reg) + +add_executable(anderson2021_demo_included_schedule_file ${anderson2021_included_schedule_reg}) +target_link_libraries(anderson2021_demo_included_schedule_file PRIVATE anderson2021_included_schedule_file Halide::RunGenMain) + +add_anderson2021_test(anderson2021_demo_included_schedule_file + COMMAND anderson2021_demo_included_schedule_file --benchmarks=all --benchmark_min_time=1 --estimate_all) + +# ================================================================= +# Smaller tests + +add_executable(anderson2021_test_apps_autoscheduler test.cpp) +target_link_libraries(anderson2021_test_apps_autoscheduler PRIVATE Halide::Halide Halide::Tools ${CMAKE_DL_LIBS}) + +add_anderson2021_test(anderson2021_test_apps_autoscheduler + COMMAND anderson2021_test_apps_autoscheduler $ $/baseline.weights + LABELS multithreaded + ENVIRONMENT "LD_LIBRARY_PATH=$:$ENV{LD_LIBRARY_PATH}") + diff --git a/test/autoschedulers/anderson2021/demo_generator.cpp b/test/autoschedulers/anderson2021/demo_generator.cpp new file mode 100644 index 000000000000..8c31d68e5e1a --- /dev/null +++ 
b/test/autoschedulers/anderson2021/demo_generator.cpp @@ -0,0 +1,51 @@ +#include "Halide.h" + +namespace { + +using namespace Halide; + +class ConvRelu : public Halide::Generator { +public: + Input> input{"input", 4}; + Input> filter{"filter", 4}; + Input> bias{"bias", 1}; + Output> relu{"relu", 4}; + + void generate() { + const int N = 5, CI = 120, CO = 24, W = 100, H = 80; + + Var x("x"), y("y"), c("c"), n("n"); + + Func conv("conv"); + RDom r(0, CI, 0, 3, 0, 3); + conv(c, x, y, n) = bias(c); + conv(c, x, y, n) += filter(c, r.y, r.z, r.x) * input(r.x, x + r.y, y + r.z, n); + relu(c, x, y, n) = max(0, conv(c, x, y, n)); + + relu.bound(c, 0, CO) + .bound(x, 0, W) + .bound(y, 0, H) + .bound(n, 0, N); + + relu.dim(0).set_bounds(0, CO).set_stride(1); + relu.dim(1).set_bounds(0, W).set_stride(CO); + relu.dim(2).set_bounds(0, H).set_stride(CO * W); + relu.dim(3).set_bounds(0, N).set_stride(CO * H * W); + + input.dim(0).set_bounds(0, CI).set_stride(1); + input.dim(1).set_bounds(0, W + 2).set_stride(CI); + input.dim(2).set_bounds(0, H + 2).set_stride(CI * (W + 2)); + input.dim(3).set_bounds(0, N).set_stride(CI * (W + 2) * (H + 2)); + + filter.dim(0).set_bounds(0, CO).set_stride(1); + filter.dim(1).set_bounds(0, 3).set_stride(CO); + filter.dim(2).set_bounds(0, 3).set_stride(CO * 3); + filter.dim(3).set_bounds(0, CI).set_stride(CO * 3 * 3); + + bias.dim(0).set_bounds(0, CO).set_stride(1); + } +}; + +} // namespace + +HALIDE_REGISTER_GENERATOR(ConvRelu, demo) diff --git a/test/autoschedulers/anderson2021/included_schedule_file_generator.cpp b/test/autoschedulers/anderson2021/included_schedule_file_generator.cpp new file mode 100644 index 000000000000..7103ec6c80e8 --- /dev/null +++ b/test/autoschedulers/anderson2021/included_schedule_file_generator.cpp @@ -0,0 +1,54 @@ +#include "Halide.h" + +#if defined(GENERATING_SCHEDULE) +// nothing +#else +#include "included_schedule_file.schedule.h" +#endif + +namespace { + +// Trivial Generator for testing (and demonstrating) use of 
.schedule.h +// files produced by the autoschedulers; this is very similar to +// demo_generator.cpp, but packaged separately to avoid confusion for +// newcomers. +struct IncludedScheduleFile : public Halide::Generator { + Input> input{"input", 4}; + Input> filter{"filter", 4}; + Input> bias{"bias", 1}; + Output> relu{"relu", 4}; + + void generate() { + const int N = 5, CI = 120, CO = 24, W = 100, H = 80; + + Var x("x"), y("y"), c("c"), n("n"); + + // Algorithm + Func conv("conv"); + RDom r(0, CI, 0, 3, 0, 3); + conv(c, x, y, n) = bias(c); + conv(c, x, y, n) += filter(c, r.y, r.z, r.x) * input(r.x, x + r.y, y + r.z, n); + relu(c, x, y, n) = max(0, conv(c, x, y, n)); + + // Estimates (for autoscheduler and/or RunGen) + input.set_estimates({{0, CI}, {0, W + 2}, {0, H + 2}, {0, N}}); + filter.set_estimates({{0, CO}, {0, 3}, {0, 3}, {0, CI}}); + bias.set_estimates({{0, CO}}); + relu.set_estimates({{0, CO}, {0, W}, {0, H}, {0, N}}); + + // Schedule + if (using_autoscheduler()) { + // nothing + } else { +#if defined(GENERATING_SCHEDULE) + abort(); +#else + apply_schedule_included_schedule_file(get_pipeline(), get_target()); +#endif + } + } +}; + +} // namespace + +HALIDE_REGISTER_GENERATOR(IncludedScheduleFile, included_schedule_file) diff --git a/src/autoschedulers/anderson2021/test.cpp b/test/autoschedulers/anderson2021/test.cpp similarity index 100% rename from src/autoschedulers/anderson2021/test.cpp rename to test/autoschedulers/anderson2021/test.cpp From 4b6f020fc4038d69a01096d4076baf8889c86840 Mon Sep 17 00:00:00 2001 From: aekul Date: Wed, 28 Dec 2022 12:20:48 -0800 Subject: [PATCH 33/63] clang-tidy/format --- apps/hannk/halide/elementwise_generator.cpp | 2 +- src/Generator.cpp | 4 ++-- src/Generator.h | 12 +++++++++--- src/runtime/d3d12compute.cpp | 2 +- src/runtime/hexagon_remote/c11_stubs.cpp | 2 +- src/runtime/mini_d3d12.h | 4 ++-- test/correctness/lerp.cpp | 12 +++++++++--- test/generator/metadata_tester_generator.cpp | 16 ++++++++-------- 8 files changed, 
33 insertions(+), 21 deletions(-) diff --git a/apps/hannk/halide/elementwise_generator.cpp b/apps/hannk/halide/elementwise_generator.cpp index 1fd761f43b1b..dc7c65fa37f4 100644 --- a/apps/hannk/halide/elementwise_generator.cpp +++ b/apps/hannk/halide/elementwise_generator.cpp @@ -122,7 +122,7 @@ class Elementwise : public Generator { GeneratorParam output3_type_{"output3_type", Int(0)}; // An array of inputs. - Input[]> inputs_ { "inputs" }; + Input[]> inputs_{"inputs"}; // The program to run. See elementwise_program.h for a description of // this buffer. Input> program_{"program"}; diff --git a/src/Generator.cpp b/src/Generator.cpp index db7253049d65..2902e8bd5d0f 100644 --- a/src/Generator.cpp +++ b/src/Generator.cpp @@ -2147,11 +2147,11 @@ void generator_test() { public: Input expr_array_input{"expr_array_input"}; Input func_array_input{"input_func_array"}; - Input[]> buffer_array_input { "buffer_array_input" }; + Input[]> buffer_array_input{"buffer_array_input"}; Input expr_array_output{"expr_array_output"}; Output func_array_output{"func_array_output"}; - Output[]> buffer_array_output { "buffer_array_output" }; + Output[]> buffer_array_output{"buffer_array_output"}; void generate() { } diff --git a/src/Generator.h b/src/Generator.h index 2981d6f949b6..9d356da660da 100644 --- a/src/Generator.h +++ b/src/Generator.h @@ -387,7 +387,9 @@ template struct select_type : std::conditional::type> {}; template -struct select_type { using type = typename std::conditional::type; }; +struct select_type { + using type = typename std::conditional::type; +}; class GeneratorParamInfo; @@ -2155,7 +2157,9 @@ class GeneratorInput_Arithmetic : public GeneratorInput_Scalar { }; template -struct type_sink { typedef void type; }; +struct type_sink { + typedef void type; +}; template struct has_static_halide_type_method : std::false_type {}; @@ -3770,7 +3774,9 @@ class Generator : public Internal::GeneratorBase { // std::is_member_function_pointer will fail if there is no member of 
that name, // so we use a little SFINAE to detect if there are method-shaped members. template - struct type_sink { typedef void type; }; + struct type_sink { + typedef void type; + }; template struct has_configure_method : std::false_type {}; diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index 0e041395c62f..fc0102a5a66e 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -553,7 +553,7 @@ struct d3d12_buffer { d3d12_buffer *staging; size_t offset; size_t size; - } * xfer; + } *xfer; bool mallocd; void *host_mirror; diff --git a/src/runtime/hexagon_remote/c11_stubs.cpp b/src/runtime/hexagon_remote/c11_stubs.cpp index 22078f62787e..a6d045bdb589 100644 --- a/src/runtime/hexagon_remote/c11_stubs.cpp +++ b/src/runtime/hexagon_remote/c11_stubs.cpp @@ -6,7 +6,7 @@ extern "C" { #include "HAP_farf.h" -//#define FARF_LOW 1 // Enable debug output +// #define FARF_LOW 1 // Enable debug output void __attribute__((weak)) __cxa_finalize() { FARF(LOW, "Finalizing\n"); diff --git a/src/runtime/mini_d3d12.h b/src/runtime/mini_d3d12.h index e829b997bbc3..3fe30d1dddd1 100644 --- a/src/runtime/mini_d3d12.h +++ b/src/runtime/mini_d3d12.h @@ -546,7 +546,7 @@ typedef RPC_BINDING_HANDLE handle_t; #define RPC_MGR_EPV void /* rpcdcep.h */ -//#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP | WINAPI_PARTITION_SYSTEM) +// #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP | WINAPI_PARTITION_SYSTEM) typedef struct _RPC_VERSION { unsigned short MajorVersion; @@ -572,7 +572,7 @@ typedef struct _RPC_MESSAGE { unsigned long RpcFlags; } RPC_MESSAGE, __RPC_FAR *PRPC_MESSAGE; -//#endif /* WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP | WINAPI_PARTITION_SYSTEM) */ +// #endif /* WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP | WINAPI_PARTITION_SYSTEM) */ /* rpcndr.h */ #ifndef DECLSPEC_NOVTABLE diff --git a/test/correctness/lerp.cpp b/test/correctness/lerp.cpp index 2e1130b412a2..8e2e41ed5c2d 100644 --- a/test/correctness/lerp.cpp +++ 
b/test/correctness/lerp.cpp @@ -38,11 +38,17 @@ bool convert_to_value(double interpolated) { // Prevent iostream from printing 8-bit numbers as character constants. template -struct promote_if_char { typedef t promoted; }; +struct promote_if_char { + typedef t promoted; +}; template<> -struct promote_if_char { typedef int32_t promoted; }; +struct promote_if_char { + typedef int32_t promoted; +}; template<> -struct promote_if_char { typedef int32_t promoted; }; +struct promote_if_char { + typedef int32_t promoted; +}; template bool relatively_equal(value_t a, value_t b) { diff --git a/test/generator/metadata_tester_generator.cpp b/test/generator/metadata_tester_generator.cpp index 655ea10e7782..3f200d4989dc 100644 --- a/test/generator/metadata_tester_generator.cpp +++ b/test/generator/metadata_tester_generator.cpp @@ -37,17 +37,17 @@ class MetadataTester : public Halide::Generator { Input array2_i16{"array2_i16", 16}; Input array_i32{"array_i32", 32, -32, 127}; // must be overridden to size=2 Input array2_i32{"array2_i32", 32, -32, 127}; - Input array_h { "array_h", nullptr }; // must be overridden to size=2 + Input array_h{"array_h", nullptr}; // must be overridden to size=2 Input[2]> buffer_array_input1 { "buffer_array_input1" }; Input[2]> buffer_array_input2 { "buffer_array_input2" }; // buffer_array_input2.dim must be set Input[2]> buffer_array_input3 { "buffer_array_input3" }; // buffer_array_input2.type must be set Input[2]> buffer_array_input4 { "buffer_array_input4" }; // dim and type must be set // .size must be specified for all of these - Input[]> buffer_array_input5 { "buffer_array_input5" }; - Input[]> buffer_array_input6 { "buffer_array_input6" }; // buffer_array_input2.dim must be set - Input[]> buffer_array_input7 { "buffer_array_input7" }; // buffer_array_input2.type must be set - Input[]> buffer_array_input8 { "buffer_array_input8" }; // dim and type must be set + Input[]> buffer_array_input5{"buffer_array_input5"}; + Input[]> 
buffer_array_input6{"buffer_array_input6"}; // buffer_array_input2.dim must be set + Input[]> buffer_array_input7{"buffer_array_input7"}; // buffer_array_input2.type must be set + Input[]> buffer_array_input8{"buffer_array_input8"}; // dim and type must be set Input> buffer_f16_typed{"buffer_f16_typed"}; Input> buffer_f16_untyped{"buffer_f16_untyped"}; @@ -70,9 +70,9 @@ class MetadataTester : public Halide::Generator { Output[2]> array_outputs6 { "array_outputs6" }; // dimensions and type will be inferred by usage // .size must be specified for all of these - Output[]> array_outputs7 { "array_outputs7" }; - Output[]> array_outputs8 { "array_outputs8" }; - Output[]> array_outputs9 { "array_outputs9" }; + Output[]> array_outputs7{"array_outputs7"}; + Output[]> array_outputs8{"array_outputs8"}; + Output[]> array_outputs9{"array_outputs9"}; // Output untyped_scalar_output{"untyped_scalar_output"}; // untyped_scalar_output.type must be set From d29fa84183016b9214ed0f0a46625baf7bf4c753 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 30 Dec 2022 23:43:38 -0500 Subject: [PATCH 34/63] Update scripts to use CMake build directory structure --- .../anderson2021/AutoSchedule.cpp | 6 +- src/autoschedulers/anderson2021/CostModel.h | 2 +- src/autoschedulers/anderson2021/Makefile | 28 ++-- .../anderson2021/autotune_loop.sh | 103 ++++++++------- .../anderson2021/generate_data.sh | 117 ++++++++++------- .../anderson2021/scripts/predict_all.sh | 26 ++-- .../anderson2021/scripts/utils.sh | 124 ++++++++++-------- 7 files changed, 218 insertions(+), 188 deletions(-) diff --git a/src/autoschedulers/anderson2021/AutoSchedule.cpp b/src/autoschedulers/anderson2021/AutoSchedule.cpp index 601b04f795f5..f562ce5923ad 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.cpp +++ b/src/autoschedulers/anderson2021/AutoSchedule.cpp @@ -728,12 +728,12 @@ struct Anderson2021 { params.parallelism = params_in.parallelism; params.beam_size = get_scalar_env_var("HL_BEAM_SIZE", 32); 
params.random_dropout = get_scalar_env_var("HL_RANDOM_DROPOUT", 100); - params.random_dropout_seed = get_scalar_env_var("HL_SEED", (int)time(nullptr)); + params.random_dropout_seed = get_scalar_env_var("HL_SEED", (int)time(nullptr)); params.weights_path = get_scalar_env_var("HL_WEIGHTS_DIR"); params.disable_subtiling = get_scalar_env_var("HL_NO_SUBTILING", 0); params.randomize_tilings = get_scalar_env_var("HL_RANDOMIZE_TILINGS", 0); - params.search_space_options = get_scalar_env_var("HL_DISABLE_MEMOIZED_FEATURES", "1111"); - params.freeze_inline_compute_root = get_scalar_env_var("HL_AUTOSCHEDULE_MEMORY_LIMIT", 0); + params.search_space_options = get_scalar_env_var("HL_SEARCH_SPACE_OPTIONS", "1111"); + params.freeze_inline_compute_root = get_scalar_env_var("HL_FREEZE_INLINE_COMPUTE_ROOT", 0); params.partial_schedule_path = get_scalar_env_var("PARTIAL_SCHEDULE", ""); params.num_passes = get_scalar_env_var("HL_NUM_PASSES", 0); params.stack_factor = get_scalar_env_var("HL_STACK_FACTOR", 0.95f); diff --git a/src/autoschedulers/anderson2021/CostModel.h b/src/autoschedulers/anderson2021/CostModel.h index 486fd45d31ec..f8602c3a261d 100644 --- a/src/autoschedulers/anderson2021/CostModel.h +++ b/src/autoschedulers/anderson2021/CostModel.h @@ -30,7 +30,7 @@ struct Anderson2021Params { /** Random seed used by the random dropout. If 0, use time(). * Formerly HL_SEED */ - int random_dropout_seed = 0; + int64_t random_dropout_seed = 0; /** When training or schedule, read weights from this directory or file. * (If path ends in `.weights` it is written as a single file, otherwise a directory of files.) 
diff --git a/src/autoschedulers/anderson2021/Makefile b/src/autoschedulers/anderson2021/Makefile index 536cd088116c..ed829946e138 100644 --- a/src/autoschedulers/anderson2021/Makefile +++ b/src/autoschedulers/anderson2021/Makefile @@ -4,7 +4,7 @@ HALIDE_SRC_ROOT = $(realpath $(SRC)/../../../) COMMON_DIR ?= $(realpath $(SRC)/../common/) HALIDE_DISTRIB_PATH ?= $(HALIDE_SRC_ROOT)/distrib -HL_TARGET ?= host-cuda +HL_TARGET ?= host $(info Looking for Halide distro at $(HALIDE_DISTRIB_PATH). If this is incorrect, set the make variable HALIDE_DISTRIB_PATH) @@ -41,8 +41,8 @@ $(BIN)/baseline_weights.o: $(BIN)/baseline_weights.cpp $(CXX) -c $< -o $@ AUTOSCHED_COST_MODEL_LIBS=\ -$(BIN)/cost_model/cost_model.a \ -$(BIN)/cost_model/train_cost_model.a \ +$(BIN)/cost_model/anderson2021_cost_model.a \ +$(BIN)/cost_model/anderson2021_train_cost_model.a \ $(BIN)/cost_model.generator: $(SRC)/cost_model_generator.cpp \ $(SRC)/cost_model_schedule.h \ @@ -55,9 +55,9 @@ $(BIN)/auto_schedule_runtime.a: $(BIN)/cost_model.generator @mkdir -p $(@D) $^ -r auto_schedule_runtime -o $(BIN) target=$(HL_TARGET) -$(BIN)/cost_model/%.a: $(BIN)/cost_model.generator +$(BIN)/cost_model/anderson2021_%.a: $(BIN)/cost_model.generator @mkdir -p $(@D) - $^ -g $* -o $(BIN)/cost_model -f $* target=$(HL_TARGET)-no_runtime enable_debug_output=$(ENABLE_DEBUG_OUTPUT) -e stmt,static_library,h,assembly + $^ -g $* -o $(BIN)/cost_model -f $* -n anderson2021_$* target=$(HL_TARGET)-no_runtime enable_debug_output=$(ENABLE_DEBUG_OUTPUT) -e stmt,static_library,h,assembly # It's important to use dynamic lookups for undefined symbols here: all of libHalide # is expected to be present (in the loading binary), so we explicitly make the symbols @@ -94,7 +94,7 @@ $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT): $(SRC)/AutoSchedule.cpp \ $(GENERATOR_DEPS) \ $(BIN)/auto_schedule_runtime.a @mkdir -p $(@D) - $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) -I 
$(BIN)/cost_model $(filter-out %.h $(LIBHALIDE_LDFLAGS),$^) -o $@ $(HALIDE_SYSTEM_LIBS) + $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) -I $(BIN)/cost_model $(filter-out %.h $(LIBHALIDE_LDFLAGS),$^) -o $@ $(HALIDE_SYSTEM_LIBS) -I . $(BIN)/retrain_cost_model: $(SRC)/retrain_cost_model.cpp \ $(COMMON_DIR)/ASLog.cpp \ @@ -110,17 +110,17 @@ $(BIN)/retrain_cost_model: $(SRC)/retrain_cost_model.cpp \ @mkdir -p $(@D) $(CXX) $(CXXFLAGS) -frtti -Wall -I ../support -I $(BIN)/cost_model $(OPTIMIZE) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(USE_OPEN_MP) -$(BIN)/featurization_to_sample: $(COMMON_DIR)/featurization_to_sample.cpp +$(BIN)/anderson2021_featurization_to_sample: $(COMMON_DIR)/featurization_to_sample.cpp @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $< $(OPTIMIZE) -o $@ -$(BIN)/get_host_target: $(COMMON_DIR)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE_DISTRIB_PATH)/include/Halide.h +$(BIN)/anderson2021_get_host_target: $(COMMON_DIR)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE_DISTRIB_PATH)/include/Halide.h @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) $(LIBHALIDE_LDFLAGS) $(OPTIMIZE) -o $@ -$(BIN)/weightsdir_to_weightsfile: $(COMMON_DIR)/weightsdir_to_weightsfile.cpp $(COMMON_DIR)/Weights.cpp +$(BIN)/anderson2021_weightsdir_to_weightsfile: $(COMMON_DIR)/weightsdir_to_weightsfile.cpp $(COMMON_DIR)/Weights.cpp @mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@ + $(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@ -I . # A sample generator to autoschedule. 
Note that if it statically links # to libHalide, then it must be build with $(USE_EXPORT_DYNAMIC), or the @@ -145,7 +145,7 @@ demo: $(BIN)/$(HL_TARGET)/demo.rungen $(BIN)/libautoschedule_anderson2021.$(SHAR # demonstrates an autotuning loop # (using $(BIN) and $(SRC) here seems overkill, but makes copy-n-paste elsewhere easier) -autotune: $(GENERATOR_BIN)/demo.generator $(BIN)/featurization_to_sample $(BIN)/get_host_target $(BIN)/retrain_cost_model $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) $(SRC)/autotune_loop.sh +autotune: $(GENERATOR_BIN)/demo.generator $(BIN)/anderson2021_featurization_to_sample $(BIN)/anderson2021_get_host_target $(BIN)/anderson2021_retrain_cost_model $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) $(SRC)/autotune_loop.sh SAMPLES_DIR=test_autotuned_samples \ bash $(SRC)/autotune_loop.sh \ $(GENERATOR_BIN)/demo.generator \ @@ -194,9 +194,9 @@ build: $(BIN)/$(HL_TARGET)/test \ $(BIN)/test_function_dag \ $(BIN)/$(HL_TARGET)/included_schedule_file.rungen \ $(GENERATOR_BIN)/demo.generator \ - $(BIN)/featurization_to_sample \ - $(BIN)/get_host_target \ - $(BIN)/retrain_cost_model \ + $(BIN)/anderson2021_featurization_to_sample \ + $(BIN)/anderson2021_get_host_target \ + $(BIN)/anderson2021_retrain_cost_model \ $(BIN)/libautoschedule_anderson2021.$(SHARED_EXT) test: test_bounds test_tiling test_storage_strides test_parser test_state test_thread_info run_test test_perfect_hash_map test_function_dag demo included_schedule_file autotune diff --git a/src/autoschedulers/anderson2021/autotune_loop.sh b/src/autoschedulers/anderson2021/autotune_loop.sh index fc627030cfea..98737981d861 100644 --- a/src/autoschedulers/anderson2021/autotune_loop.sh +++ b/src/autoschedulers/anderson2021/autotune_loop.sh @@ -1,35 +1,36 @@ #!/bin/bash -# Build the generator to autotune. This script will be autotuning the -# autoscheduler's cost model training pipeline, which is large enough -# to be interesting. 
-if [ $# -lt 6 -o $# -gt 7 ]; then - echo "Usage: $0 /path/to/some.generator generatorname halide_target weights_file autoschedule_bin_dir train_only [generator_args_sets]" +# Autotune the given generator +if [ $# -lt 7 -o $# -gt 8 ]; then + echo "Usage: $0 /path/to/some.generator generatorname halide_target weights_file halide_build_dir parallelism train_only [generator_args_sets]" exit fi set -eu -source $(dirname $0)/scripts/utils.sh -find_halide HALIDE_ROOT - -#trap "exit" INT TERM -#trap "kill 0" EXIT +AUTOSCHEDULER_SRC_DIR=$(dirname $0) +SCRIPTS_DIR="${AUTOSCHEDULER_SRC_DIR}/scripts" +source ${SCRIPTS_DIR}/utils.sh GENERATOR=${1} PIPELINE=${2} HL_TARGET=${3} START_WEIGHTS_FILE=${4} -AUTOSCHED_BIN=${5} -TRAIN_ONLY=${6} +HALIDE_BUILD_DIR=${5} +PARALLELISM=${6} +TRAIN_ONLY=${7} + +get_halide_src_dir ${AUTOSCHEDULER_SRC_DIR} HALIDE_SRC_DIR +get_autoscheduler_build_dir ${HALIDE_BUILD_DIR} AUTOSCHEDULER_BUILD_DIR +get_tools_build_dir ${HALIDE_BUILD_DIR} TOOLS_BUILD_DIR LEARNING_RATE=${LEARNING_RATE:-0.001} # Read the generator-arg sets into an array. Each set is delimited # by space; multiple values within each set are are delimited with ; # e.g. 
"set1arg1=1;set1arg2=foo set2=bar set3arg1=3.14;set4arg2=42" -if [ $# -ge 7 ]; then - IFS=' ' read -r -a GENERATOR_ARGS_SETS_ARRAY <<< "${7}" +if [ $# -ge 8 ]; then + IFS=' ' read -r -a GENERATOR_ARGS_SETS_ARRAY <<< "${8}" else declare -a GENERATOR_ARGS_SETS_ARRAY= fi @@ -47,10 +48,6 @@ if [ -z ${CXX+x} ]; then exit fi -if [ -z ${HL_TARGET} ]; then - get_host_target ${HALIDE_ROOT} HL_TARGET - HL_TARGET=${HL_TARGET}-cuda-cuda_capability_70 -fi echo Training target is: ${HL_TARGET} if [ -z ${GENERATOR} ]; then @@ -84,18 +81,12 @@ fi #fi #done -if [ $(uname -s) = "Darwin" ]; then - LOCAL_CORES=`sysctl -n hw.ncpu` -else - LOCAL_CORES=`nproc` -fi -LOCAL_CORES=80 -echo Local number of cores detected as ${LOCAL_CORES} +get_num_cpu_cores NUM_CPU_CORES +echo "Number of CPU cores detected as ${NUM_CPU_CORES}" # A batch of this many samples is built in parallel, and then # benchmarked serially. BATCH_SIZE=80 -NUM_CORES=80 EPOCHS=200 NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) @@ -177,21 +168,14 @@ make_featurization() { beam=1 fi + # TODO: make these arguments to this file local -r shared_memory_limit=48 local -r shared_memory_sm_limit=96 + local -r active_block_limit=32 + local -r active_warp_limit=64 GPU=$((RANDOM % NUM_GPUS)) - CMD="CUDA_VISIBLE_DEVICES=${GPU} \ - HL_SEARCH_SPACE_OPTIONS=${SEARCH_SPACE_OPTIONS} - HL_SEED=${RANDOM_DROPOUT_SEED} \ - HL_WEIGHTS_DIR=${WEIGHTS} \ - HL_RANDOMIZE_TILINGS=${RANDOMIZE_TILINGS} \ - HL_FREEZE_INLINE_COMPUTE_ROOT=${USE_FREEZE} \ - HL_RANDOM_DROPOUT=${dropout} \ - HL_BEAM_SIZE=${beam} \ - HL_SHARED_MEMORY_LIMIT=${shared_memory_limit} \ - HL_SHARED_MEMORY_SM_LIMIT=${shared_memory_sm_limit} \ - HL_DEBUG_AUTOSCHEDULE=1 \ + CMD="HL_DEBUG_AUTOSCHEDULE=1 \ HL_DEBUG_CODEGEN=1 \ /bin/time -f 'Compile time (s): %e' ${TIMEOUT_CMD} -k ${COMPILATION_TIMEOUT} ${COMPILATION_TIMEOUT} \ ${GENERATOR} \ @@ -201,9 +185,20 @@ make_featurization() { -e stmt,assembly,static_library,c_header,registration,schedule,featurization \ 
target=${HL_TARGET} \ ${EXTRA_GENERATOR_ARGS} \ - -p ${AUTOSCHED_BIN}/libautoschedule_anderson2021.so \ + -p ${AUTOSCHEDULER_BUILD_DIR}/libautoschedule_anderson2021.so \ autoscheduler=Anderson2021 \ - autoscheduler.parallelism=${NUM_CORES} + autoscheduler.parallelism=${PARALLELISM} \ + autoscheduler.beam_size=${beam} \ + autoscheduler.random_dropout=${dropout} \ + autoscheduler.random_dropout_seed=${RANDOM_DROPOUT_SEED} \ + autoscheduler.weights_path=${WEIGHTS} \ + autoscheduler.randomize_tilings=${RANDOMIZE_TILINGS} \ + autoscheduler.search_space_options=${SEARCH_SPACE_OPTIONS} \ + autoscheduler.freeze_inline_compute_root=${USE_FREEZE} \ + autoscheduler.shared_memory_limit_kb=${shared_memory_limit} \ + autoscheduler.shared_memory_sm_limit_kb=${shared_memory_sm_limit} \ + autoscheduler.active_block_limit=${active_block_limit} \ + autoscheduler.active_warp_limit=${active_warp_limit} \ 2> ${D}/compile_err.txt > ${D}/compile_log.txt" FAILED=0 @@ -227,7 +222,7 @@ make_featurization() { -O3 -I ../../include \ ${LIBPNG_CFLAGS} \ - ${AUTOSCHED_BIN}/host-cuda/RunGenMain.o \ + ${TOOLS_BUILD_DIR}/RunGenMain.o \ ${D}/*.registration.cpp \ ${D}/*.a \ -o ${D}/bench \ @@ -254,7 +249,7 @@ make_featurization() { rm ${D}/compile_log.txt } -IMAGES_DIR="${HALIDE_ROOT}/apps/images" +IMAGES_DIR="${HALIDE_SRC_DIR}/apps/images" # Benchmark one of the random samples benchmark_sample() { @@ -270,7 +265,7 @@ benchmark_sample() { return fi - CMD="CUDA_VISIBLE_DEVICES=${GPU_INDEX} HL_NUM_THREADS=${NUM_CORES} \ + CMD="CUDA_VISIBLE_DEVICES=${GPU_INDEX} HL_NUM_THREADS=${PARALLELISM} \ ${TIMEOUT_CMD} -k ${BENCHMARKING_TIMEOUT} ${BENCHMARKING_TIMEOUT} \ ${D}/bench" @@ -304,7 +299,7 @@ benchmark_sample() { S=$2 FNAME=$6 - ${AUTOSCHED_BIN}/featurization_to_sample ${D}/${FNAME}.featurization $R $P $S ${D}/${FNAME}.sample || echo "featurization_to_sample failed for ${D} (probably because benchmarking failed)" + ${AUTOSCHEDULER_BUILD_DIR}/anderson2021_featurization_to_sample ${D}/${FNAME}.featurization 
$R $P $S ${D}/${FNAME}.sample || echo "featurization_to_sample failed for ${D} (probably because benchmarking failed)" rm ${D}/${FNAME}.featurization rm ${D}/bench @@ -423,20 +418,26 @@ benchmark_loop() { rm -rf ${BENCHMARK_QUEUE_DIR} } -MAX_AUTOSCHEDULE_JOBS=${LOCAL_CORES} +MAX_AUTOSCHEDULE_JOBS=${NUM_CPU_CORES} BENCHMARK_QUEUE_ENABLED=0 if [[ $USE_BENCHMARK_QUEUE == 1 ]] && [[ $TRAIN_ONLY != 1 ]]; then - echo "Benchmark queue = ON" - # This includes 1 job for the benchmark loop - MAX_AUTOSCHEDULE_JOBS=$((LOCAL_CORES-NUM_GPUS)) - BENCHMARK_QUEUE_ENABLED=1 + # Include 1 job for the benchmark loop + MAX_AUTOSCHEDULE_JOBS=$((NUM_CPU_CORES-NUM_GPUS-1)) + if [[ MAX_AUTOSCHEDULE_JOBS -le 0 ]]; then + MAX_AUTOSCHEDULE_JOBS=${NUM_CPU_CORES} + echo "Not enough cores available to use the benchmark queue" + echo "Benchmark queue = OFF" + else + BENCHMARK_QUEUE_ENABLED=1 + echo "Benchmark queue = ON" + fi else echo "Benchmark queue = OFF" fi -echo "Max. autoschedule jobs = ${MAX_AUTOSCHEDULE_JOBS}" +echo "Max. concurrent autoschedule jobs = ${MAX_AUTOSCHEDULE_JOBS}" SECONDS=0 @@ -540,7 +541,7 @@ if [[ $TRAIN_ONLY != 1 ]]; then fi CUR_SECONDS="$SECONDS" - retrain_cost_model ${HALIDE_ROOT} ${SAMPLES} ${WEIGHTS} ${NUM_CORES} ${EPOCHS} ${PIPELINE} ${LEARNING_RATE} + retrain_cost_model ${HALIDE_BUILD_DIR} ${SAMPLES} ${WEIGHTS} ${PARALLELISM} ${EPOCHS} ${PIPELINE} ${LEARNING_RATE} TRAIN_TIME=$((SECONDS-CUR_SECONDS)) echo "Train time for batch with ID = ${BATCH_ID}: ${TRAIN_TIME}" fi @@ -563,7 +564,7 @@ fi echo Retraining model... CUR_SECONDS="$SECONDS" -retrain_cost_model ${HALIDE_ROOT} ${SAMPLES} ${WEIGHTS} ${NUM_CORES} ${EPOCHS} ${PIPELINE} ${LEARNING_RATE} +retrain_cost_model ${HALIDE_SRC_DIR} ${SAMPLES} ${WEIGHTS} ${PARALLELISM} ${EPOCHS} ${PIPELINE} ${LEARNING_RATE} TRAIN_TIME=$((SECONDS-CUR_SECONDS)) echo "Num batches = ${NUM_BATCHES}. 
Train time: ${TRAIN_TIME}" diff --git a/src/autoschedulers/anderson2021/generate_data.sh b/src/autoschedulers/anderson2021/generate_data.sh index 8ff3d9fbd402..c407f39c660c 100644 --- a/src/autoschedulers/anderson2021/generate_data.sh +++ b/src/autoschedulers/anderson2021/generate_data.sh @@ -8,7 +8,13 @@ # by providing statistics, support for resuming previous batches, autotuning # across multiple apps, etc. # +# It assumes that the autoscheduler itself and any apps to be autoscheduled have +# already been built and the resulting files are stored in halide_build_dir. +# Using CMake is recommended because this script assumes that the given +# halide_build_dir has the same structure that the CMake build will produce +# # Arguments: +# halide_build_dir - path where Halide is built # max_iterations - the number of batches to generate. The cost model is # retrained after each # resume - resume using the previously generated samples or start a new run? @@ -16,60 +22,70 @@ # existing samples # predict_only - don't generate new data, just predict the costs of the existing # samples +# parallelism - the number of streaming multiprocessors in the target GPU # app - the individual application (in Halide/apps/) to generate data for. If # not provided, it will generate a data for all the apps in the list below -if [[ $# -ne 4 && $# -ne 5 ]]; then - echo "Usage: $0 max_iterations resume train_only predict_only app" +if [[ $# -ne 6 && $# -ne 7 ]]; then + echo "Usage: $0 halide_build_dir max_iterations resume train_only predict_only parallelism app" exit fi -set -e +set -eu -MAX_ITERATIONS=${1} -RESUME=${2} -TRAIN_ONLY=${3} -PREDICT_ONLY=${4} -APP=${5} +HALIDE_BUILD_DIR=${1} +MAX_ITERATIONS=${2} +RESUME=${3} +TRAIN_ONLY=${4} +PREDICT_ONLY=${5} +PARALLELISM=${6} -if [[ $PREDICT_ONLY == 1 && $TRAIN_ONLY == 1 ]]; then - echo "At most one of train_only and predict_only can be set to 1." 
- exit +if [ -z ${7+x} ]; then + APPS="bgu bilateral_grid local_laplacian nl_means lens_blur camera_pipe stencil_chain harris hist max_filter unsharp interpolate conv_layer cuda_mat_mul iir_blur depthwise_separable_conv" +else + APPS=${7} fi -if [[ $PREDICT_ONLY == 1 ]]; then - echo "Predict only mode: ON" +if [ -z ${CXX+x} ]; then + echo The CXX environment variable must be set. Exiting... + exit fi -SCRIPTS_DIR="$(dirname $0)/scripts" -source ${SCRIPTS_DIR}/utils.sh - -BEST_SCHEDULES_DIR=$(dirname $0)/best - -find_halide HALIDE_ROOT - -build_autoscheduler_tools ${HALIDE_ROOT} -get_absolute_autoscheduler_bin_dir ${HALIDE_ROOT} AUTOSCHED_BIN -get_autoscheduler_dir ${HALIDE_ROOT} AUTOSCHED_SRC - export CXX="ccache ${CXX}" -export HL_MACHINE_PARAMS=80,24000000,160 +AUTOSCHEDULER_SRC_DIR=$(dirname $0) +SCRIPTS_DIR="${AUTOSCHEDULER_SRC_DIR}/scripts" +source ${SCRIPTS_DIR}/utils.sh +make_dir_path_absolute $(dirname $0) AUTOSCHEDULER_SRC_DIR -export HL_PERMIT_FAILED_UNROLL=1 +get_halide_src_dir ${AUTOSCHEDULER_SRC_DIR} HALIDE_SRC_DIR +get_autoscheduler_build_dir ${HALIDE_BUILD_DIR} AUTOSCHEDULER_BUILD_DIR -export AUTOSCHED_BIN=${AUTOSCHED_BIN} -echo "AUTOSCHED_BIN set to ${AUTOSCHED_BIN}" +echo "HALIDE_SRC_DIR = ${HALIDE_SRC_DIR}" +echo "HALIDE_BUILD_DIR = ${HALIDE_BUILD_DIR}" +echo "AUTOSCHEDULER_SRC_DIR = ${AUTOSCHEDULER_SRC_DIR}" +echo "AUTOSCHEDULER_BUILD_DIR = ${AUTOSCHEDULER_BUILD_DIR}" echo -if [ ! -v HL_TARGET ]; then - get_host_target ${HALIDE_ROOT} HL_TARGET +BEST_SCHEDULES_DIR=${AUTOSCHEDULER_SRC_DIR}/best +export HL_PERMIT_FAILED_UNROLL=1 + +if [ -z ${HL_TARGET+x} ]; then + get_host_target ${AUTOSCHEDULER_BUILD_DIR} HL_TARGET HL_TARGET=${HL_TARGET}-cuda-cuda_capability_70 fi -export HL_TARGET=${HL_TARGET} - echo "HL_TARGET set to ${HL_TARGET}" +echo + +if [[ $PREDICT_ONLY == 1 && $TRAIN_ONLY == 1 ]]; then + echo "At most one of train_only and predict_only can be set to 1." 
+ exit +fi + +if [[ $PREDICT_ONLY == 1 ]]; then + echo "Predict only mode: ON" +fi DEFAULT_SAMPLES_DIR_NAME="${SAMPLES_DIR:-autotuned_samples}" @@ -117,12 +133,6 @@ function ctrl_c() { trap ctrl_c INT -if [ -z $APP ]; then - APPS="bgu bilateral_grid local_laplacian nl_means lens_blur camera_pipe stencil_chain harris hist max_filter unsharp interpolate conv_layer cuda_mat_mul iir_blur depthwise_separable_conv" -else - APPS=${APP} -fi - NUM_APPS=0 for app in $APPS; do NUM_APPS=$((NUM_APPS + 1)) @@ -132,7 +142,11 @@ echo "Autotuning on $APPS for $MAX_ITERATIONS iteration(s)" for app in $APPS; do SECONDS=0 - APP_DIR="${HALIDE_ROOT}/apps/${app}" + APP_DIR="${HALIDE_SRC_DIR}/apps/${app}" + if [ ! -d $APP_DIR ]; then + echo "App ${APP_DIR} not found. Skipping..." + continue + fi unset -v LATEST_SAMPLES_DIR for f in "$APP_DIR/${DEFAULT_SAMPLES_DIR_NAME}"*; do @@ -172,32 +186,36 @@ for app in $APPS; do mkdir -p ${SAMPLES_DIR} touch ${OUTPUT_FILE} + GENERATOR_BUILD_DIR=${HALIDE_BUILD_DIR}/apps/${app} + if [[ ${app} = "cuda_mat_mul" ]]; then app="mat_mul" fi - GENERATOR=bin/host/${app}.generator - make -C ${APP_DIR} ${GENERATOR} + GENERATOR=${GENERATOR_BUILD_DIR}/${app}.generator + if [ ! -f $GENERATOR ]; then + echo "Generator ${GENERATOR} not found. Skipping..." 
+ continue + fi + echo if [[ $PREDICT_ONLY != 1 ]]; then NUM_BATCHES=${MAX_ITERATIONS} \ TRAIN_ONLY=${TRAIN_ONLY} \ SAMPLES_DIR=${SAMPLES_DIR} \ - HARDWARE_PARALLELISM=80 \ - SAMPLES_DIR=${SAMPLES_DIR} \ HL_DEBUG_CODEGEN=0 \ - HL_SHARED_MEMORY_LIMIT=48 \ - bash ${AUTOSCHED_SRC}/autotune_loop.sh \ - ${APP_DIR}/${GENERATOR} \ + bash ${AUTOSCHEDULER_SRC_DIR}/autotune_loop.sh \ + ${GENERATOR} \ ${app} \ ${HL_TARGET} \ - ${AUTOSCHED_SRC}/baseline.weights \ - ${AUTOSCHED_BIN} \ + ${AUTOSCHEDULER_SRC_DIR}/baseline.weights \ + ${HALIDE_BUILD_DIR} \ + ${PARALLELISM} \ ${TRAIN_ONLY} | tee -a ${OUTPUT_FILE} fi WEIGHTS_FILE="${SAMPLES_DIR}/updated.weights" - predict_all ${HALIDE_ROOT} ${SAMPLES_DIR} ${WEIGHTS_FILE} ${PREDICTIONS_WITH_FILENAMES_FILE} 1 ${LIMIT:-0} + predict_all ${HALIDE_SRC_DIR} ${HALIDE_BUILD_DIR} ${SAMPLES_DIR} ${WEIGHTS_FILE} ${PREDICTIONS_WITH_FILENAMES_FILE} 1 ${LIMIT:-0} ${PARALLELISM} awk -F", " '{printf("%f, %f\n", $2, $3);}' ${PREDICTIONS_WITH_FILENAMES_FILE} > ${PREDICTIONS_FILE} echo "Computing average statistics..." 
@@ -208,4 +226,5 @@ for app in $APPS; do save_best_schedule_result ${BEST_SCHEDULES_DIR} ${SAMPLES_DIR} done +echo print_best_schedule_times $(dirname $0)/best diff --git a/src/autoschedulers/anderson2021/scripts/predict_all.sh b/src/autoschedulers/anderson2021/scripts/predict_all.sh index 90f0abc81117..2d1183f778d4 100644 --- a/src/autoschedulers/anderson2021/scripts/predict_all.sh +++ b/src/autoschedulers/anderson2021/scripts/predict_all.sh @@ -1,31 +1,29 @@ #!/bin/bash -if [ $# -ne 5 ]; then - echo "Usage: $0 samples_dir weights_file predictions_file include_filenames limit" +if [ $# -ne 7 ]; then + echo "Usage: $0 halide_build_dir samples_dir weights_file predictions_file include_filenames limit parallelism" exit fi -source $(dirname $0)/utils.sh +SCRIPTS_DIR=$(dirname $0) +source ${SCRIPTS_DIR}/utils.sh -find_halide HALIDE_ROOT - -make_dir_path_absolute ${1} SAMPLES_DIR -make_file_path_absolute ${2} WEIGHTS_FILE -make_file_path_absolute ${3} PREDICTIONS_FILE -INCLUDE_FILENAMES=${4} -LIMIT=${5} +HALIDE_BUILD_DIR=${1} +make_dir_path_absolute ${2} SAMPLES_DIR +make_file_path_absolute ${3} WEIGHTS_FILE +make_file_path_absolute ${4} PREDICTIONS_FILE +INCLUDE_FILENAMES=${5} +LIMIT=${6} +PARALLELISM=${7} echo echo "Samples directory: ${SAMPLES_DIR}" echo "Weights file: ${WEIGHTS_FILE}" echo "Saving predictions to: ${PREDICTIONS_FILE}" -build_retrain_cost_model ${HALIDE_ROOT} - -NUM_CORES=80 NUM_EPOCHS=1 -retrain_cost_model ${HALIDE_ROOT} ${SAMPLES_DIR} ${WEIGHTS_FILE} ${NUM_CORES} ${NUM_EPOCHS} 0 0.001 ${PREDICTIONS_FILE} 0 0 ${LIMIT} +retrain_cost_model ${HALIDE_BUILD_DIR} ${SAMPLES_DIR} ${WEIGHTS_FILE} ${PARALLELISM} ${NUM_EPOCHS} 0 0.001 ${PREDICTIONS_FILE} 0 0 ${LIMIT} if [[ $INCLUDE_FILENAMES == 1 ]]; then exit diff --git a/src/autoschedulers/anderson2021/scripts/utils.sh b/src/autoschedulers/anderson2021/scripts/utils.sh index 5ef88822fed0..203904c5dc16 100644 --- a/src/autoschedulers/anderson2021/scripts/utils.sh +++ 
b/src/autoschedulers/anderson2021/scripts/utils.sh @@ -1,24 +1,24 @@ #!/bin/bash -function find_halide() { - local -n halide_root_ref=$1 - local -r silent="${2:-0}" - local dir=$(pwd) - - for i in {1..5}; do - if [[ -f ${dir}/distrib/include/Halide.h ]]; then - halide_root_ref=$(cd ${dir}; pwd) - if [[ $silent -ne 1 ]]; then - echo "Using Halide in ${halide_root_ref}" - fi - return 0 - fi - dir=${dir}/.. - done - - echo "Unable to find Halide. Try re-running $(basename $0) from somewhere in the Halide tree." - exit -} +#function find_halide() { + #local -n halide_root_ref=$1 + #local -r silent="${2:-0}" + #local dir=$(pwd) + + #for i in {1..5}; do + #if [[ -f ${dir}/build/include/Halide.h ]]; then + #halide_root_ref=$(cd ${dir}; pwd) + #if [[ $silent -ne 1 ]]; then + #echo "Using Halide in ${halide_root_ref}" + #fi + #return 0 + #fi + #dir=${dir}/.. + #done + + #echo "Unable to find Halide. Try re-running $(basename $0) from somewhere in the Halide tree." + #exit +#} function make_dir_path_absolute() { local -r path=$1 @@ -32,17 +32,28 @@ function make_file_path_absolute() { converted_path_ref=$(cd $(dirname ${path}); pwd)/$(basename ${path}) } -function get_autoscheduler_dir() { - local -r halide_root=$1 +function get_halide_src_dir() { + local -r autoscheduler_src_dir=$1 + local -n halide_src_dir_ref=$2 + make_dir_path_absolute ${autoscheduler_src_dir}/../../../ halide_src_dir_ref +} + +function get_autoscheduler_src_dir() { + local -r halide_src_dir=$1 local -n autoscheduler_dir_ref=$2 - autoscheduler_dir_ref=${halide_root}/src/autoschedulers/anderson2021 + autoscheduler_dir_ref=${halide_src_dir}/src/autoschedulers/anderson2021 } -function get_absolute_autoscheduler_bin_dir() { - local -r halide_root=$1 - local -n autoscheduler_bin_dir_ref=$2 - get_autoscheduler_dir $halide_root autoscheduler_dir - autoscheduler_bin_dir_ref=${autoscheduler_dir}/bin +function get_autoscheduler_build_dir() { + local -r halide_build_dir=$1 + local -n autoscheduler_build_dir_ref=$2 
+ autoscheduler_build_dir_ref=${halide_build_dir}/src/autoschedulers/anderson2021 +} + +function get_tools_build_dir() { + local -r halide_build_dir=$1 + local -n tools_build_dir_ref=$2 + tools_build_dir_ref=${halide_build_dir}/tools } function get_autoscheduler_bin_dir() { @@ -56,15 +67,15 @@ function get_autoscheduler_make_bin_dir() { } function get_autoscheduler_scripts_dir() { - local -r halide_root=$1 + local -r halide_src_dir=$1 local -n autoscheduler_scripts_dir_ref=$2 - get_autoscheduler_dir $halide_root autoscheduler_dir - autoscheduler_scripts_dir_ref=${autoscheduler_dir}/scripts + get_autoscheduler_src_dir $halide_src_dir autoscheduler_src_dir + autoscheduler_scripts_dir_ref=${autoscheduler_src_dir}/scripts } function build_rungenmain() { local -r halide_root=$1 - get_autoscheduler_dir $halide_root autoscheduler_dir + get_autoscheduler_src_dir $halide_root autoscheduler_dir get_autoscheduler_bin_dir autoscheduler_bin_dir echo @@ -75,7 +86,7 @@ function build_rungenmain() { function build_featurization_to_sample() { local -r halide_root=$1 - get_autoscheduler_dir $halide_root autoscheduler_dir + get_autoscheduler_src_dir $halide_root autoscheduler_dir get_autoscheduler_bin_dir autoscheduler_bin_dir echo @@ -86,7 +97,7 @@ function build_featurization_to_sample() { function build_libauto_schedule() { local -r halide_root=$1 - get_autoscheduler_dir $halide_root autoscheduler_dir + get_autoscheduler_src_dir $halide_root autoscheduler_dir get_autoscheduler_bin_dir autoscheduler_bin_dir echo @@ -97,7 +108,7 @@ function build_libauto_schedule() { function build_retrain_cost_model() { local -r halide_root=$1 - get_autoscheduler_dir $halide_root autoscheduler_dir + get_autoscheduler_src_dir $halide_root autoscheduler_dir get_autoscheduler_bin_dir autoscheduler_bin_dir echo @@ -108,7 +119,7 @@ function build_retrain_cost_model() { function build_get_host_target() { local -r halide_root=$1 - get_autoscheduler_dir $halide_root autoscheduler_dir + 
get_autoscheduler_src_dir $halide_root autoscheduler_dir get_autoscheduler_bin_dir autoscheduler_bin_dir echo @@ -118,20 +129,18 @@ function build_get_host_target() { } function get_host_target() { - local -r halide_root=$1 + local -r autoscheduler_build_dir=$1 local -n host_target_ref=$2 - get_absolute_autoscheduler_bin_dir $halide_root autoscheduler_bin_dir - echo "Calling get_host_target()..." - host_target_ref=$(${AUTOSCHED_BIN}/get_host_target) + host_target_ref=$(${autoscheduler_build_dir}/anderson2021_get_host_target) echo "host_target = ${host_target_ref}" echo } function build_autoscheduler_tools() { local -r halide_root=$1 - get_autoscheduler_dir $halide_root autoscheduler_dir + get_autoscheduler_src_dir $halide_root autoscheduler_dir echo echo "Building autoscheduler tools..." @@ -144,7 +153,7 @@ function build_autoscheduler_tools() { } function retrain_cost_model() { - local -r halide_root=$1 + local -r halide_build_dir=$1 local -r samples_dir=$2 local -r weights=$3 local -r num_cores=$4 @@ -156,12 +165,12 @@ function retrain_cost_model() { local -r partition_schedules=${10-0} local -r limit=${11-0} - get_absolute_autoscheduler_bin_dir ${halide_root} autosched_bin + get_autoscheduler_build_dir ${halide_build_dir} autoscheduler_build_dir echo "Using learning rate: ${learning_rate}" find ${samples_dir} -name "*.sample" | \ - HL_NUM_THREADS=8 ${autosched_bin}/retrain_cost_model \ + HL_NUM_THREADS=8 ${autoscheduler_build_dir}/anderson2021_retrain_cost_model \ --epochs=${num_epochs} \ --rates=${learning_rate} \ --num_cores=${num_cores} \ @@ -190,15 +199,17 @@ function get_timeout_cmd() { } function predict_all() { - local -r halide_root=$1 - local -r samples_dir=$2 - local -r weights_dir=$3 - local -r predictions_file=$4 - local -r include_filenames=$5 - local -r limit=$6 - - get_autoscheduler_scripts_dir ${halide_root} scripts_dir - bash ${scripts_dir}/predict_all.sh ${samples_dir} ${weights_dir} ${predictions_file} ${include_filenames} ${limit} + local -r 
halide_src_dir=$1 + local -r halide_build_dir=$2 + local -r samples_dir=$3 + local -r weights_dir=$4 + local -r predictions_file=$5 + local -r include_filenames=$6 + local -r limit=$7 + local -r parallelism=$8 + + get_autoscheduler_scripts_dir ${halide_src_dir} scripts_dir + bash ${scripts_dir}/predict_all.sh ${halide_build_dir} ${samples_dir} ${weights_dir} ${predictions_file} ${include_filenames} ${limit} ${parallelism} } function average_compile_time_beam_search() { @@ -337,6 +348,7 @@ function print_best_schedule_times() { local -r apps="bgu bilateral_grid local_laplacian nl_means lens_blur camera_pipe stencil_chain harris hist max_filter unsharp interpolate conv_layer cuda_mat_mul iir_blur depthwise_separable_conv" + echo "Best found schedule times:" for app in $apps; do local file=$dir/$app.txt if [ ! -f $file ]; then @@ -349,13 +361,13 @@ function print_best_schedule_times() { done } -function get_num_local_cores() { - local -n num_local_cores_ref=$1 +function get_num_cpu_cores() { + local -n num_cpu_cores_ref=$1 if [ $(uname -s) = "Darwin" ]; then - num_local_cores_ref=$(sysctl -n hw.ncpu) + num_cpu_cores_ref=$(sysctl -n hw.ncpu) else - num_local_cores_ref=$(nproc) + num_cpu_cores_ref=$(nproc) fi } From f7fad3153f720077619d1dac989bdf24b9e474fe Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 30 Dec 2022 23:59:36 -0500 Subject: [PATCH 35/63] Tidy up scripts/utils.sh --- .../anderson2021/scripts/utils.sh | 105 +----------------- 1 file changed, 3 insertions(+), 102 deletions(-) diff --git a/src/autoschedulers/anderson2021/scripts/utils.sh b/src/autoschedulers/anderson2021/scripts/utils.sh index 203904c5dc16..c46d98875c11 100644 --- a/src/autoschedulers/anderson2021/scripts/utils.sh +++ b/src/autoschedulers/anderson2021/scripts/utils.sh @@ -1,25 +1,5 @@ #!/bin/bash -#function find_halide() { - #local -n halide_root_ref=$1 - #local -r silent="${2:-0}" - #local dir=$(pwd) - - #for i in {1..5}; do - #if [[ -f ${dir}/build/include/Halide.h ]]; then - 
#halide_root_ref=$(cd ${dir}; pwd) - #if [[ $silent -ne 1 ]]; then - #echo "Using Halide in ${halide_root_ref}" - #fi - #return 0 - #fi - #dir=${dir}/.. - #done - - #echo "Unable to find Halide. Try re-running $(basename $0) from somewhere in the Halide tree." - #exit -#} - function make_dir_path_absolute() { local -r path=$1 local -n absolute_path_ref=$2 @@ -56,16 +36,6 @@ function get_tools_build_dir() { tools_build_dir_ref=${halide_build_dir}/tools } -function get_autoscheduler_bin_dir() { - local -n autoscheduler_bin_dir_ref=$1 - autoscheduler_bin_dir_ref=bin -} - -function get_autoscheduler_make_bin_dir() { - local -n autoscheduler_bin_dir_ref=$1 - autoscheduler_bin_dir_ref=../autoscheduler/bin -} - function get_autoscheduler_scripts_dir() { local -r halide_src_dir=$1 local -n autoscheduler_scripts_dir_ref=$2 @@ -73,61 +43,6 @@ function get_autoscheduler_scripts_dir() { autoscheduler_scripts_dir_ref=${autoscheduler_src_dir}/scripts } -function build_rungenmain() { - local -r halide_root=$1 - get_autoscheduler_src_dir $halide_root autoscheduler_dir - get_autoscheduler_bin_dir autoscheduler_bin_dir - - echo - echo "Building RunGenMain..." - make -C ${autoscheduler_dir} ${autoscheduler_bin_dir}/host-cuda/RunGenMain.o - echo -} - -function build_featurization_to_sample() { - local -r halide_root=$1 - get_autoscheduler_src_dir $halide_root autoscheduler_dir - get_autoscheduler_bin_dir autoscheduler_bin_dir - - echo - echo "Building featurization_to_sample..." - make -C ${autoscheduler_dir} ${autoscheduler_bin_dir}/featurization_to_sample - echo -} - -function build_libauto_schedule() { - local -r halide_root=$1 - get_autoscheduler_src_dir $halide_root autoscheduler_dir - get_autoscheduler_bin_dir autoscheduler_bin_dir - - echo - echo "Building libauto_schedule..." 
- make -C ${autoscheduler_dir} ${autoscheduler_bin_dir}/libautoschedule_anderson2021.so - echo -} - -function build_retrain_cost_model() { - local -r halide_root=$1 - get_autoscheduler_src_dir $halide_root autoscheduler_dir - get_autoscheduler_bin_dir autoscheduler_bin_dir - - echo - echo "Building retrain_cost_model..." - make -C ${autoscheduler_dir} ${autoscheduler_bin_dir}/retrain_cost_model - echo -} - -function build_get_host_target() { - local -r halide_root=$1 - get_autoscheduler_src_dir $halide_root autoscheduler_dir - get_autoscheduler_bin_dir autoscheduler_bin_dir - - echo - echo "Building get_host_target..." - make -C ${autoscheduler_dir} ${autoscheduler_bin_dir}/get_host_target - echo -} - function get_host_target() { local -r autoscheduler_build_dir=$1 local -n host_target_ref=$2 @@ -138,20 +53,6 @@ function get_host_target() { echo } -function build_autoscheduler_tools() { - local -r halide_root=$1 - get_autoscheduler_src_dir $halide_root autoscheduler_dir - - echo - echo "Building autoscheduler tools..." 
- build_featurization_to_sample $halide_root - build_retrain_cost_model $halide_root - build_libauto_schedule $halide_root - build_get_host_target $halide_root - build_rungenmain $halide_root - echo -} - function retrain_cost_model() { local -r halide_build_dir=$1 local -r samples_dir=$2 @@ -225,12 +126,12 @@ function average_compile_time_greedy() { } function reset_weights() { - local -r halide_root=$1 + local -r halide_build_dir=$1 local -r weights=$2 - get_absolute_autoscheduler_bin_dir ${halide_root} autosched_bin + get_autoscheduler_build_dir ${halide_build_dir} autoscheduler_build_dir - ${autosched_bin}/retrain_cost_model \ + ${autoscheduler_build_dir}/anderson2021_retrain_cost_model \ --initial_weights=${weights} \ --weights_out=${weights} \ --randomize_weights=1 \ From ae08b76b1bdc9f3d190301f33adcac6be980a630 Mon Sep 17 00:00:00 2001 From: aekul Date: Sat, 31 Dec 2022 00:05:54 -0500 Subject: [PATCH 36/63] Check if RunGenMain.o exists --- src/autoschedulers/anderson2021/autotune_loop.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/autoschedulers/anderson2021/autotune_loop.sh b/src/autoschedulers/anderson2021/autotune_loop.sh index 98737981d861..8dd2d2f6b003 100644 --- a/src/autoschedulers/anderson2021/autotune_loop.sh +++ b/src/autoschedulers/anderson2021/autotune_loop.sh @@ -48,6 +48,12 @@ if [ -z ${CXX+x} ]; then exit fi +RUNGENMAIN="${TOOLS_BUILD_DIR}/RunGenMain.o" +if [ ! -f $RUNGENMAIN ]; then + echo "RunGenMain.o not found. Exiting..." 
+ exit +fi + echo Training target is: ${HL_TARGET} if [ -z ${GENERATOR} ]; then @@ -222,7 +228,7 @@ make_featurization() { -O3 -I ../../include \ ${LIBPNG_CFLAGS} \ - ${TOOLS_BUILD_DIR}/RunGenMain.o \ + ${RUNGENMAIN} \ ${D}/*.registration.cpp \ ${D}/*.a \ -o ${D}/bench \ From b44d05c45abdb7d74abc0a6c064bef5c2715a568 Mon Sep 17 00:00:00 2001 From: aekul Date: Sat, 31 Dec 2022 00:44:56 -0500 Subject: [PATCH 37/63] clang-format --- apps/hannk/halide/elementwise_generator.cpp | 2 +- src/Generator.cpp | 4 ++-- src/runtime/d3d12compute.cpp | 2 +- test/generator/metadata_tester_generator.cpp | 16 ++++++++-------- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/apps/hannk/halide/elementwise_generator.cpp b/apps/hannk/halide/elementwise_generator.cpp index dc7c65fa37f4..1fd761f43b1b 100644 --- a/apps/hannk/halide/elementwise_generator.cpp +++ b/apps/hannk/halide/elementwise_generator.cpp @@ -122,7 +122,7 @@ class Elementwise : public Generator { GeneratorParam output3_type_{"output3_type", Int(0)}; // An array of inputs. - Input[]> inputs_{"inputs"}; + Input[]> inputs_ { "inputs" }; // The program to run. See elementwise_program.h for a description of // this buffer. 
Input> program_{"program"}; diff --git a/src/Generator.cpp b/src/Generator.cpp index 2902e8bd5d0f..db7253049d65 100644 --- a/src/Generator.cpp +++ b/src/Generator.cpp @@ -2147,11 +2147,11 @@ void generator_test() { public: Input expr_array_input{"expr_array_input"}; Input func_array_input{"input_func_array"}; - Input[]> buffer_array_input{"buffer_array_input"}; + Input[]> buffer_array_input { "buffer_array_input" }; Input expr_array_output{"expr_array_output"}; Output func_array_output{"func_array_output"}; - Output[]> buffer_array_output{"buffer_array_output"}; + Output[]> buffer_array_output { "buffer_array_output" }; void generate() { } diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index fc0102a5a66e..0e041395c62f 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -553,7 +553,7 @@ struct d3d12_buffer { d3d12_buffer *staging; size_t offset; size_t size; - } *xfer; + } * xfer; bool mallocd; void *host_mirror; diff --git a/test/generator/metadata_tester_generator.cpp b/test/generator/metadata_tester_generator.cpp index 3f200d4989dc..655ea10e7782 100644 --- a/test/generator/metadata_tester_generator.cpp +++ b/test/generator/metadata_tester_generator.cpp @@ -37,17 +37,17 @@ class MetadataTester : public Halide::Generator { Input array2_i16{"array2_i16", 16}; Input array_i32{"array_i32", 32, -32, 127}; // must be overridden to size=2 Input array2_i32{"array2_i32", 32, -32, 127}; - Input array_h{"array_h", nullptr}; // must be overridden to size=2 + Input array_h { "array_h", nullptr }; // must be overridden to size=2 Input[2]> buffer_array_input1 { "buffer_array_input1" }; Input[2]> buffer_array_input2 { "buffer_array_input2" }; // buffer_array_input2.dim must be set Input[2]> buffer_array_input3 { "buffer_array_input3" }; // buffer_array_input2.type must be set Input[2]> buffer_array_input4 { "buffer_array_input4" }; // dim and type must be set // .size must be specified for all of these - Input[]> 
buffer_array_input5{"buffer_array_input5"}; - Input[]> buffer_array_input6{"buffer_array_input6"}; // buffer_array_input2.dim must be set - Input[]> buffer_array_input7{"buffer_array_input7"}; // buffer_array_input2.type must be set - Input[]> buffer_array_input8{"buffer_array_input8"}; // dim and type must be set + Input[]> buffer_array_input5 { "buffer_array_input5" }; + Input[]> buffer_array_input6 { "buffer_array_input6" }; // buffer_array_input2.dim must be set + Input[]> buffer_array_input7 { "buffer_array_input7" }; // buffer_array_input2.type must be set + Input[]> buffer_array_input8 { "buffer_array_input8" }; // dim and type must be set Input> buffer_f16_typed{"buffer_f16_typed"}; Input> buffer_f16_untyped{"buffer_f16_untyped"}; @@ -70,9 +70,9 @@ class MetadataTester : public Halide::Generator { Output[2]> array_outputs6 { "array_outputs6" }; // dimensions and type will be inferred by usage // .size must be specified for all of these - Output[]> array_outputs7{"array_outputs7"}; - Output[]> array_outputs8{"array_outputs8"}; - Output[]> array_outputs9{"array_outputs9"}; + Output[]> array_outputs7 { "array_outputs7" }; + Output[]> array_outputs8 { "array_outputs8" }; + Output[]> array_outputs9 { "array_outputs9" }; // Output untyped_scalar_output{"untyped_scalar_output"}; // untyped_scalar_output.type must be set From 844755ca01c72b07ca95ce4c57bc4f16b5da73f3 Mon Sep 17 00:00:00 2001 From: aekul Date: Mon, 2 Jan 2023 16:59:59 -0500 Subject: [PATCH 38/63] Add included_schedule_file.schedule.h --- .../included_schedule_file.schedule.h | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 test/autoschedulers/anderson2021/included_schedule_file.schedule.h diff --git a/test/autoschedulers/anderson2021/included_schedule_file.schedule.h b/test/autoschedulers/anderson2021/included_schedule_file.schedule.h new file mode 100644 index 000000000000..53795fd52ccd --- /dev/null +++ 
b/test/autoschedulers/anderson2021/included_schedule_file.schedule.h @@ -0,0 +1,102 @@ +#ifndef included_schedule_file_SCHEDULE_H +#define included_schedule_file_SCHEDULE_H + +// MACHINE GENERATED -- DO NOT EDIT +// This schedule was automatically generated by Anderson2021 +// with autoscheduler_params=autoscheduler=Anderson2021 autoscheduler.active_block_limit=32 autoscheduler.active_warp_limit=64 autoscheduler.beam_size=32 autoscheduler.freeze_inline_compute_root=1 autoscheduler.parallelism=80 autoscheduler.random_dropout=100 autoscheduler.random_dropout_seed=32587330000 autoscheduler.randomize_tilings=1 autoscheduler.search_space_options=1111 autoscheduler.shared_memory_limit_kb=48 autoscheduler.shared_memory_sm_limit_kb=96 + +#include "Halide.h" + +inline void apply_schedule_included_schedule_file( + ::Halide::Pipeline pipeline, + ::Halide::Target target) { + using ::Halide::Func; + using ::Halide::MemoryType; + using ::Halide::RVar; + using ::Halide::TailStrategy; + using ::Halide::Var; + + Func relu = pipeline.get_func(4); + Func conv = pipeline.get_func(3); + Var c(relu.get_schedule().dims()[0].var); + Var ci("ci"); + Var n(relu.get_schedule().dims()[3].var); + Var x(relu.get_schedule().dims()[1].var); + Var xi("xi"); + Var xii("xii"); + Var y(relu.get_schedule().dims()[2].var); + Var yi("yi"); + Var yii("yii"); + RVar r13_x(conv.update(0).get_schedule().dims()[0].var); + RVar r13_y(conv.update(0).get_schedule().dims()[1].var); + RVar r13_z(conv.update(0).get_schedule().dims()[2].var); + Var yi_serial_outer("yi_serial_outer"); + Var xi_serial_outer("xi_serial_outer"); + Var ci_serial_outer("ci_serial_outer"); + relu + .split(c, c, ci, 24, TailStrategy::ShiftInwards) + .split(x, x, xi, 16, TailStrategy::ShiftInwards) + .split(y, y, yi, 4, TailStrategy::ShiftInwards) + .split(xi, xi, xii, 4, TailStrategy::ShiftInwards) + .split(yi, yi, yii, 2, TailStrategy::ShiftInwards) + .unroll(xii) + .unroll(yii) + .compute_root() + .reorder(xii, yii, ci, xi, yi, c, x, y, 
n) + .gpu_blocks(c) + .gpu_blocks(x) + .fuse(y, n, y) + .gpu_blocks(y) + .split(ci, ci_serial_outer, ci, 24, TailStrategy::GuardWithIf) + .gpu_threads(ci) + .split(xi, xi_serial_outer, xi, 4, TailStrategy::GuardWithIf) + .gpu_threads(xi) + .split(yi, yi_serial_outer, yi, 2, TailStrategy::GuardWithIf) + .gpu_threads(yi); + conv.update(0) + .split(c, c, ci, 24, TailStrategy::GuardWithIf) + .split(x, x, xi, 4, TailStrategy::GuardWithIf) + .split(y, y, yi, 16, TailStrategy::GuardWithIf) + .split(yi, yi, yii, 2, TailStrategy::GuardWithIf) + .unroll(yii) + .reorder(yii, r13_x, r13_y, r13_z, ci, xi, yi, c, x, y, n) + .gpu_blocks(c) + .gpu_blocks(x) + .fuse(y, n, y) + .gpu_blocks(y) + .split(ci, ci_serial_outer, ci, 24, TailStrategy::GuardWithIf) + .gpu_threads(ci) + .split(xi, xi_serial_outer, xi, 4, TailStrategy::GuardWithIf) + .gpu_threads(xi) + .split(yi, yi_serial_outer, yi, 8, TailStrategy::GuardWithIf) + .gpu_threads(yi); + conv + .split(c, c, ci, 24, TailStrategy::ShiftInwards) + .split(x, x, xi, 4, TailStrategy::ShiftInwards) + .split(y, y, yi, 16, TailStrategy::ShiftInwards) + .split(yi, yi, yii, 2, TailStrategy::ShiftInwards) + .unroll(yii) + .compute_root() + .reorder(yii, ci, xi, yi, c, x, y, n) + .gpu_blocks(c) + .gpu_blocks(x) + .fuse(y, n, y) + .gpu_blocks(y) + .split(ci, ci_serial_outer, ci, 24, TailStrategy::GuardWithIf) + .gpu_threads(ci) + .split(xi, xi_serial_outer, xi, 4, TailStrategy::GuardWithIf) + .gpu_threads(xi) + .split(yi, yi_serial_outer, yi, 8, TailStrategy::GuardWithIf) + .gpu_threads(yi); + conv.in(relu).store_in(MemoryType::Register).compute_at(relu, ci) + .bound_extent(c, 1) + .unroll(c) + .bound_extent(x, 4) + .unroll(x) + .bound_extent(y, 2) + .unroll(y) + .bound_extent(n, 1) + .unroll(n); +} + +#endif // included_schedule_file_SCHEDULE_H From b8e6ed08487dd7977c1e0282613c9c022a9b2824 Mon Sep 17 00:00:00 2001 From: aekul Date: Mon, 2 Jan 2023 22:16:01 -0500 Subject: [PATCH 39/63] clang-format --- 
.../anderson2021/included_schedule_file.schedule.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/test/autoschedulers/anderson2021/included_schedule_file.schedule.h b/test/autoschedulers/anderson2021/included_schedule_file.schedule.h index 53795fd52ccd..201c2a834686 100644 --- a/test/autoschedulers/anderson2021/included_schedule_file.schedule.h +++ b/test/autoschedulers/anderson2021/included_schedule_file.schedule.h @@ -88,15 +88,7 @@ inline void apply_schedule_included_schedule_file( .gpu_threads(xi) .split(yi, yi_serial_outer, yi, 8, TailStrategy::GuardWithIf) .gpu_threads(yi); - conv.in(relu).store_in(MemoryType::Register).compute_at(relu, ci) - .bound_extent(c, 1) - .unroll(c) - .bound_extent(x, 4) - .unroll(x) - .bound_extent(y, 2) - .unroll(y) - .bound_extent(n, 1) - .unroll(n); + conv.in(relu).store_in(MemoryType::Register).compute_at(relu, ci).bound_extent(c, 1).unroll(c).bound_extent(x, 4).unroll(x).bound_extent(y, 2).unroll(y).bound_extent(n, 1).unroll(n); } #endif // included_schedule_file_SCHEDULE_H From f2928d35b58cfba1d52e7e8fa63f3a8b7f2fbf76 Mon Sep 17 00:00:00 2001 From: aekul Date: Thu, 5 Jan 2023 01:47:22 -0500 Subject: [PATCH 40/63] Script usability improvements --- .../anderson2021/autotune_loop.sh | 15 +++++++-- .../anderson2021/generate_data.sh | 31 ++++++++++++++----- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/src/autoschedulers/anderson2021/autotune_loop.sh b/src/autoschedulers/anderson2021/autotune_loop.sh index 8dd2d2f6b003..cff09fd23704 100644 --- a/src/autoschedulers/anderson2021/autotune_loop.sh +++ b/src/autoschedulers/anderson2021/autotune_loop.sh @@ -8,6 +8,11 @@ fi set -eu +if [ -z ${BASH_VERSION+x} ]; then + echo "${0} should be run as a bash script" + exit +fi + AUTOSCHEDULER_SRC_DIR=$(dirname $0) SCRIPTS_DIR="${AUTOSCHEDULER_SRC_DIR}/scripts" source ${SCRIPTS_DIR}/utils.sh @@ -48,9 +53,9 @@ if [ -z ${CXX+x} ]; then exit fi -RUNGENMAIN="${TOOLS_BUILD_DIR}/RunGenMain.o" 
+RUNGENMAIN="${TOOLS_BUILD_DIR}/RunGenMain.cpp.o" if [ ! -f $RUNGENMAIN ]; then - echo "RunGenMain.o not found. Exiting..." + echo "${RUNGENMAIN} not found. Exiting..." exit fi @@ -94,6 +99,12 @@ echo "Number of CPU cores detected as ${NUM_CPU_CORES}" # benchmarked serially. BATCH_SIZE=80 EPOCHS=200 + +if ! command -v nvidia-smi > /dev/null; then + echo "nvidia-smi is required for autotuning" + exit +fi + NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) RANDOMIZE_TILINGS="${RANDOMIZE_TILINGS:-1}" diff --git a/src/autoschedulers/anderson2021/generate_data.sh b/src/autoschedulers/anderson2021/generate_data.sh index c407f39c660c..264367c972f1 100644 --- a/src/autoschedulers/anderson2021/generate_data.sh +++ b/src/autoschedulers/anderson2021/generate_data.sh @@ -14,16 +14,16 @@ # halide_build_dir has the same structure that the CMake build will produce # # Arguments: -# halide_build_dir - path where Halide is built -# max_iterations - the number of batches to generate. The cost model is +# halide_build_dir [path] - path where Halide is built +# max_iterations [int] - the number of batches to generate. The cost model is # retrained after each -# resume - resume using the previously generated samples or start a new run? -# train_only - don't generate new data, just retrain the cost model with +# resume [0|1] - resume using the previously generated samples or start a new run? +# train_only [0|1] - don't generate new data, just retrain the cost model with # existing samples -# predict_only - don't generate new data, just predict the costs of the existing +# predict_only [0|1] - don't generate new data, just predict the costs of the existing # samples -# parallelism - the number of streaming multiprocessors in the target GPU -# app - the individual application (in Halide/apps/) to generate data for. 
If +# parallelism [int] - the number of streaming multiprocessors in the target GPU +# app [string; optional] - the individual application (in Halide/apps/) to generate data for. If # not provided, it will generate a data for all the apps in the list below if [[ $# -ne 6 && $# -ne 7 ]]; then @@ -33,6 +33,11 @@ fi set -eu +if [ -z ${BASH_VERSION+x} ]; then + echo "${0} should be run as a bash script" + exit +fi + HALIDE_BUILD_DIR=${1} MAX_ITERATIONS=${2} RESUME=${3} @@ -51,7 +56,10 @@ if [ -z ${CXX+x} ]; then exit fi -export CXX="ccache ${CXX}" +if command -v ccache > /dev/null; then + echo "ccache detected and will be used" + export CXX="ccache ${CXX}" +fi AUTOSCHEDULER_SRC_DIR=$(dirname $0) SCRIPTS_DIR="${AUTOSCHEDULER_SRC_DIR}/scripts" @@ -183,6 +191,13 @@ for app in $APPS; do PREDICTIONS_WITH_FILENAMES_FILE="${SAMPLES_DIR}/predictions_with_filenames" BEST_TIMES_FILE="${SAMPLES_DIR}/best_times" + if [[ $TRAIN_ONLY == 1 ]]; then + if [[ ! -d ${SAMPLES_DIR} || -z "$(ls -A ${SAMPLES_DIR})" ]]; then + echo "No samples found in ${SAMPLES_DIR}. Skipping..." 
+ continue + fi + fi + mkdir -p ${SAMPLES_DIR} touch ${OUTPUT_FILE} From ff2daff9bb5512cc5f27bc95f850121e0bb6f3e1 Mon Sep 17 00:00:00 2001 From: aekul Date: Thu, 12 Jan 2023 01:54:11 -0500 Subject: [PATCH 41/63] Add include path to Makefile --- src/autoschedulers/adams2019/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index 774cf9a62e80..564e8535086c 100644 --- a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -85,7 +85,7 @@ $(BIN)/libautoschedule_adams2019.$(PLUGIN_EXT): \ $(BIN)/auto_schedule_runtime.a \ | $(LIB_HALIDE) @mkdir -p $(@D) - $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) -I $(BIN)/cost_model $(filter-out %.h $(LIBHALIDE_LDFLAGS),$^) -o $@ $(HALIDE_SYSTEM_LIBS) $(HALIDE_RPATH_FOR_LIB) + $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) -I $(BIN)/cost_model $(filter-out %.h $(LIBHALIDE_LDFLAGS),$^) -o $@ $(HALIDE_SYSTEM_LIBS) $(HALIDE_RPATH_FOR_LIB) -I . $(BIN)/retrain_cost_model: $(SRC)/retrain_cost_model.cpp \ $(COMMON_DIR)/ASLog.cpp \ @@ -110,7 +110,7 @@ $(BIN)/get_host_target: $(COMMON_DIR)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) $(LIBHALIDE_LDFLAGS) $(OPTIMIZE) -o $@ $(HALIDE_RPATH_FOR_BIN) $(BIN)/weightsdir_to_weightsfile: $(COMMON_DIR)/weightsdir_to_weightsfile.cpp $(COMMON_DIR)/Weights.cpp @mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@ + $(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@ -I . 
.PHONY: clean From 9741bb32fec524ad1b8024ce70353d5c5912769f Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 13 Jan 2023 02:04:24 -0500 Subject: [PATCH 42/63] Fix include path in Makefile --- src/autoschedulers/adams2019/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index 564e8535086c..2ea040527963 100644 --- a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -85,7 +85,7 @@ $(BIN)/libautoschedule_adams2019.$(PLUGIN_EXT): \ $(BIN)/auto_schedule_runtime.a \ | $(LIB_HALIDE) @mkdir -p $(@D) - $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) -I $(BIN)/cost_model $(filter-out %.h $(LIBHALIDE_LDFLAGS),$^) -o $@ $(HALIDE_SYSTEM_LIBS) $(HALIDE_RPATH_FOR_LIB) -I . + $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) -I $(BIN)/cost_model $(filter-out %.h $(LIBHALIDE_LDFLAGS),$^) -o $@ $(HALIDE_SYSTEM_LIBS) $(HALIDE_RPATH_FOR_LIB) -I $(SRC) $(BIN)/retrain_cost_model: $(SRC)/retrain_cost_model.cpp \ $(COMMON_DIR)/ASLog.cpp \ @@ -110,7 +110,7 @@ $(BIN)/get_host_target: $(COMMON_DIR)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) $(LIBHALIDE_LDFLAGS) $(OPTIMIZE) -o $@ $(HALIDE_RPATH_FOR_BIN) $(BIN)/weightsdir_to_weightsfile: $(COMMON_DIR)/weightsdir_to_weightsfile.cpp $(COMMON_DIR)/Weights.cpp @mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@ -I . 
+ $(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@ -I $(SRC) .PHONY: clean From b273cb057fa8f5061dd577281b9585a6c1776331 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 13 Jan 2023 22:22:41 -0500 Subject: [PATCH 43/63] Fix include path in Makefile --- src/autoschedulers/adams2019/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index 2ea040527963..8fdbeb08e34d 100644 --- a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -99,7 +99,7 @@ $(BIN)/retrain_cost_model: $(SRC)/retrain_cost_model.cpp \ $(AUTOSCHED_WEIGHT_OBJECTS) \ $(BIN)/auto_schedule_runtime.a @mkdir -p $(@D) - $(CXX) $(CXXFLAGS) -frtti -Wall -I ../support -I $(BIN)/cost_model $(OPTIMIZE) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(USE_OPEN_MP) $(HALIDE_RPATH_FOR_BIN) + $(CXX) $(CXXFLAGS) -frtti -Wall -I ../support -I $(BIN)/cost_model $(OPTIMIZE) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(USE_OPEN_MP) $(HALIDE_RPATH_FOR_BIN) -I $(SRC) $(BIN)/featurization_to_sample: $(COMMON_DIR)/featurization_to_sample.cpp @mkdir -p $(@D) From d6a365669cc6a8ae3cbebc2e1eae5f9551e99108 Mon Sep 17 00:00:00 2001 From: aekul Date: Sun, 29 Jan 2023 01:48:38 -0500 Subject: [PATCH 44/63] Tidy up --- src/autoschedulers/anderson2021/generate_data.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/autoschedulers/anderson2021/generate_data.sh b/src/autoschedulers/anderson2021/generate_data.sh index 264367c972f1..9bdf15b83434 100644 --- a/src/autoschedulers/anderson2021/generate_data.sh +++ b/src/autoschedulers/anderson2021/generate_data.sh @@ -3,6 +3,7 @@ # This script will generate a batch of data using the autoscheduler, retraining # the cost model after each batch. It can be used for generating training data or # for autotuning on an individual app. +# # It is a wrapper around autotune_loop.sh, which handles compiling, benchmarking, # and retraining the cost model. 
This file makes the process more user friendly # by providing statistics, support for resuming previous batches, autotuning From d72060e6809569b592264662736b74d51be09279 Mon Sep 17 00:00:00 2001 From: aekul Date: Sun, 29 Jan 2023 22:25:22 -0500 Subject: [PATCH 45/63] clang-tidy --- src/autoschedulers/anderson2021/AutoSchedule.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/autoschedulers/anderson2021/AutoSchedule.cpp b/src/autoschedulers/anderson2021/AutoSchedule.cpp index f562ce5923ad..686b8227c855 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.cpp +++ b/src/autoschedulers/anderson2021/AutoSchedule.cpp @@ -111,6 +111,8 @@ #ifdef _WIN32 #include #define _isatty isatty; +#else +#include #endif namespace Halide { From 9ec5b03b2f02fba063039d82fb8dcbe956808c05 Mon Sep 17 00:00:00 2001 From: aekul Date: Mon, 6 Feb 2023 01:55:20 -0500 Subject: [PATCH 46/63] include directory --- src/autoschedulers/anderson2021/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/autoschedulers/anderson2021/CMakeLists.txt b/src/autoschedulers/anderson2021/CMakeLists.txt index dcfd91f9b833..b78142348f83 100644 --- a/src/autoschedulers/anderson2021/CMakeLists.txt +++ b/src/autoschedulers/anderson2021/CMakeLists.txt @@ -59,7 +59,7 @@ add_autoscheduler( $ ) -target_include_directories(Halide_Anderson2021 PUBLIC "${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021") +target_include_directories(Halide_Anderson2021 PRIVATE "${Halide_SOURCE_DIR}/src/autoschedulers/anderson2021") target_link_libraries(Halide_Anderson2021 PRIVATE ASLog ParamParser anderson2021_cost_model anderson2021_train_cost_model) From c7bd839302b321b1339c97f7de0d5b7c0c9f20c9 Mon Sep 17 00:00:00 2001 From: aekul Date: Thu, 9 Feb 2023 01:50:30 -0500 Subject: [PATCH 47/63] Non-constant bounds fix --- src/autoschedulers/anderson2021/FunctionDAG.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/autoschedulers/anderson2021/FunctionDAG.cpp 
b/src/autoschedulers/anderson2021/FunctionDAG.cpp index b2e4f0211ee7..55037ff849c9 100644 --- a/src/autoschedulers/anderson2021/FunctionDAG.cpp +++ b/src/autoschedulers/anderson2021/FunctionDAG.cpp @@ -865,8 +865,11 @@ FunctionDAG::FunctionDAG(const vector &outputs, const Target &target) // Get the bounds estimate map estimates; for (const auto &b : consumer.schedule().estimates()) { - int64_t i_min = *as_const_int(b.min); - int64_t i_extent = *as_const_int(b.extent); + const int64_t *i_min = as_const_int(b.min); + const int64_t *i_extent = as_const_int(b.extent); + user_assert(i_min && i_extent) + << "Min/extent of estimate or bound is not constant in \"" << consumer.name() + << "\", var:" << b.var << ", min:" << b.min << ", extent:" << b.extent; if ((false)) { // Intentional dead code. Extra parens to pacify clang-tidy. // Some methods we compare to compile for @@ -882,9 +885,9 @@ FunctionDAG::FunctionDAG(const vector &outputs, const Target &target) // like unroll across color channels, so // it affects the scheduling space. 
Func(node.func).bound(b.var, b.min, b.extent); - estimates[b.var] = Span(i_min, i_min + i_extent - 1, true); + estimates[b.var] = Span(*i_min, *i_min + *i_extent - 1, true); } else { - estimates[b.var] = Span(i_min, i_min + i_extent - 1, false); + estimates[b.var] = Span(*i_min, *i_min + *i_extent - 1, false); } } for (const auto &b : consumer.schedule().bounds()) { From 64148a9280a815609504f5b20a15a8d02adaf16d Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 17 Mar 2023 00:31:19 -0400 Subject: [PATCH 48/63] Fix long line --- src/autoschedulers/anderson2021/AutoSchedule.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/autoschedulers/anderson2021/AutoSchedule.h b/src/autoschedulers/anderson2021/AutoSchedule.h index 29819de7534a..9f7136913e97 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.h +++ b/src/autoschedulers/anderson2021/AutoSchedule.h @@ -18,7 +18,13 @@ namespace Autoscheduler { typedef PerfectHashMap StageMapOfScheduleFeatures; -void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, const Anderson2021Params ¶ms, const Target &target, CostModel *cost_model, int beam_size, StageMapOfScheduleFeatures *schedule_features); +void find_and_apply_schedule(FunctionDAG &dag, + const std::vector &outputs, + const Anderson2021Params ¶ms, + const Target &target, + CostModel *cost_model, + int beam_size, + StageMapOfScheduleFeatures *schedule_features); } // namespace Autoscheduler } // namespace Internal From 86c40156964b3bce3790864bb7c91f31965ad484 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 17 Mar 2023 00:31:43 -0400 Subject: [PATCH 49/63] Add braces around if statements --- .../anderson2021/DefaultCostModel.cpp | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.cpp b/src/autoschedulers/anderson2021/DefaultCostModel.cpp index 18c240418a4d..074b3e2ff2b4 100644 --- a/src/autoschedulers/anderson2021/DefaultCostModel.cpp +++ 
b/src/autoschedulers/anderson2021/DefaultCostModel.cpp @@ -36,7 +36,9 @@ bool ends_with(const std::string &str, const std::string &suffix) { if (str.size() < suffix.size()) return false; size_t off = str.size() - suffix.size(); for (size_t i = 0; i < suffix.size(); i++) { - if (str[off + i] != suffix[i]) return false; + if (str[off + i] != suffix[i]) { + return false; + } } return true; } @@ -54,12 +56,16 @@ void DefaultCostModel::set_pipeline_features(const Internal::Autoscheduler::Func "Incorrect size for pipeline features"); int num_stages = 0; for (const auto &n : dag.nodes) { - if (!n.is_input) num_stages += (int)n.stages.size(); + if (!n.is_input) { + num_stages += (int)n.stages.size(); + } } Runtime::Buffer pipeline_features(head1_w, head1_h, num_stages); int stage = 0; for (const auto &n : dag.nodes) { - if (n.is_input) continue; + if (n.is_input) { + continue; + } for (auto it = n.stages.rbegin(); it != n.stages.rend(); it++) { const auto &s = *it; const int *pipeline_feats = (const int *)(&(s.features)) + 7; @@ -108,7 +114,9 @@ void DefaultCostModel::enqueue(const Internal::Autoscheduler::FunctionDAG &dag, for (const auto &n : dag.nodes) { // Inputs are computed outside of the pipeline and don't count. - if (n.is_input) continue; + if (n.is_input) { + continue; + } // The remaining stages are not yet // scheduled. Optimistically assume their internal costs @@ -118,7 +126,9 @@ void DefaultCostModel::enqueue(const Internal::Autoscheduler::FunctionDAG &dag, // cost for loading from these unscheduled stages is // already baked into the scheduled stages that consume // them. - if (stage >= num_stages) break; + if (stage >= num_stages) { + break; + } // Load up the schedule features for all stages of this Func. 
for (auto it = n.stages.rbegin(); it != n.stages.rend(); it++) { @@ -257,7 +267,9 @@ float DefaultCostModel::backprop(const Runtime::Buffer &true_runtim } internal_assert(true_runtimes(i) > 0); } - if (any_nans) abort(); + if (any_nans) { + abort(); + } // Update weights locally auto update_weight = [](const Runtime::Buffer &src, Runtime::Buffer &dst) { @@ -276,7 +288,9 @@ float DefaultCostModel::backprop(const Runtime::Buffer &true_runtim } void DefaultCostModel::evaluate_costs() { - if (cursor == 0 || !schedule_feat_queue.data()) return; + if (cursor == 0 || !schedule_feat_queue.data()) { + return; + } internal_assert(pipeline_feat_queue.data()); internal_assert(schedule_feat_queue.data()); From 08c778e4f28282d3819010701d25c590b1cd6baa Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 17 Mar 2023 00:37:08 -0400 Subject: [PATCH 50/63] aslog(0) -> aslog(1) --- .../anderson2021/AutoSchedule.cpp | 46 +++++++++---------- .../anderson2021/DefaultCostModel.cpp | 14 +++--- .../anderson2021/Featurization.h | 4 +- .../anderson2021/FunctionDAG.cpp | 24 +++++----- src/autoschedulers/anderson2021/LoopNest.cpp | 4 +- .../anderson2021/LoopNestParser.h | 22 ++++----- .../anderson2021/SearchSpace.cpp | 14 +++--- .../anderson2021/SearchSpaceOptions.h | 12 ++--- src/autoschedulers/anderson2021/State.cpp | 18 ++++---- src/autoschedulers/anderson2021/Statistics.h | 6 +-- 10 files changed, 82 insertions(+), 82 deletions(-) diff --git a/src/autoschedulers/anderson2021/AutoSchedule.cpp b/src/autoschedulers/anderson2021/AutoSchedule.cpp index 686b8227c855..4b6b95cd309e 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.cpp +++ b/src/autoschedulers/anderson2021/AutoSchedule.cpp @@ -279,7 +279,7 @@ IntrusivePtr AutoSchedule::optimal_schedule_pass(int beam_size, std::function &&)> enqueue_new_children = [&](IntrusivePtr &&s) { - // aslog(0) << "\n** Generated child: "; + // aslog(1) << "\n** Generated child: "; // s->dump(); // s->calculate_cost(dag, params, nullptr, true); @@ -332,7 
+332,7 @@ IntrusivePtr AutoSchedule::optimal_schedule_pass(int beam_size, } if ((int)pending.size() > beam_size * 10000) { - aslog(0) << "Warning: Huge number of states generated (" << pending.size() << ").\n"; + aslog(1) << "Warning: Huge number of states generated (" << pending.size() << ").\n"; } expanded = 0; @@ -450,14 +450,14 @@ IntrusivePtr AutoSchedule::optimal_schedule_pass(int beam_size, continue; } - aslog(0) << "Options:\n"; + aslog(1) << "Options:\n"; for (int i = (int)q.size() - 1; i >= 0; i--) { auto state = q[i]; LoopNestParser option = LoopNestParser::from_string(state->root->to_string()); - aslog(0) << "Option " << i << ":\n"; + aslog(1) << "Option " << i << ":\n"; option.dump(); } - aslog(0) << "\nTarget partial schedule:\n"; + aslog(1) << "\nTarget partial schedule:\n"; partial_schedule->dump(); internal_assert(false) << "Partial schedule not found"; } @@ -482,7 +482,7 @@ IntrusivePtr AutoSchedule::optimal_schedule_pass(int beam_size, if (target_loop_nest->contains_sub_loop_nest(option)) { found = true; selection = choice_label; - aslog(0) << "\nFound matching option\n"; + aslog(1) << "\nFound matching option\n"; break; } } @@ -492,30 +492,30 @@ IntrusivePtr AutoSchedule::optimal_schedule_pass(int beam_size, // The user has set HL_CYOS, and wants to navigate the // search space manually. Discard everything in the queue // except for the user-chosen option. 
- aslog(0) << "\n--------------------\n"; - aslog(0) << "Select a schedule:\n"; + aslog(1) << "\n--------------------\n"; + aslog(1) << "Select a schedule:\n"; for (int choice_label = (int)q.size() - 1; choice_label >= 0; choice_label--) { auto state = q[choice_label]; - aslog(0) << "\n[" << choice_label << "]:\n"; + aslog(1) << "\n[" << choice_label << "]:\n"; state->dump(); } int next_node = q[0]->num_decisions_made / 2; if (next_node < (int)dag.nodes.size()) { const FunctionDAG::Node *node = &dag.nodes[next_node]; - aslog(0) << "\nNext node to be scheduled: " << node->func.name() << "\n"; + aslog(1) << "\nNext node to be scheduled: " << node->func.name() << "\n"; } } cost_model->evaluate_costs(); if (cyos_from_file && !found) { - aslog(0) << "\nTarget loop nest was not found.\n"; + aslog(1) << "\nTarget loop nest was not found.\n"; } if (!cyos_from_file || !found) { // Select next partial schedule to expand. while (selection < 0 || selection >= (int)q.size()) { - aslog(0) << "\nEnter selection: "; + aslog(1) << "\nEnter selection: "; std::cin >> selection; } } @@ -572,9 +572,9 @@ IntrusivePtr AutoSchedule::optimal_schedule(int beam_size) { tick.clear(); if (aslog::aslog_level() == 0) { - aslog(0) << "Pass " << pass_idx + 1 << " of " << num_passes << ", cost: " << pass->cost << "\n"; + aslog(1) << "Pass " << pass_idx + 1 << " of " << num_passes << ", cost: " << pass->cost << "\n"; } else { - aslog(0) << "Pass " << pass_idx + 1 << " result: "; + aslog(1) << "Pass " << pass_idx + 1 << " result: "; pass->dump(); } @@ -589,7 +589,7 @@ IntrusivePtr AutoSchedule::optimal_schedule(int beam_size) { } } - aslog(0) << "Best cost: " << best->cost << "\n"; + aslog(1) << "Best cost: " << best->cost << "\n"; return best; } @@ -647,11 +647,11 @@ void generate_schedule(const std::vector &outputs, std::unique_ptr partial_schedule; if (!params.partial_schedule_path.empty()) { - aslog(0) << "Loading partial schedule from " << params.partial_schedule_path << "\n"; + aslog(1) << 
"Loading partial schedule from " << params.partial_schedule_path << "\n"; partial_schedule = LoopNestParser::from_file(params.partial_schedule_path); - aslog(0) << "Partial schedule:\n"; + aslog(1) << "Partial schedule:\n"; partial_schedule->dump(); - aslog(0) << "\n"; + aslog(1) << "\n"; } std::mt19937 rng{(uint32_t)params.random_dropout_seed}; @@ -675,9 +675,9 @@ void generate_schedule(const std::vector &outputs, // Print out the schedule if (aslog::aslog_level() > 0) { - aslog(0) << "BEGIN Final generated loop nest and schedule:\n"; + aslog(1) << "BEGIN Final generated loop nest and schedule:\n"; optimal->dump(); - aslog(0) << "END Final generated loop nest and schedule\n"; + aslog(1) << "END Final generated loop nest and schedule\n"; optimal->print_compute_locations(); } @@ -796,11 +796,11 @@ void find_and_apply_schedule(FunctionDAG &dag, std::unique_ptr partial_schedule; if (!params.partial_schedule_path.empty()) { - aslog(0) << "Loading partial schedule from " << params.partial_schedule_path << "\n"; + aslog(1) << "Loading partial schedule from " << params.partial_schedule_path << "\n"; partial_schedule = LoopNestParser::from_file(params.partial_schedule_path); - aslog(0) << "Partial schedule:\n"; + aslog(1) << "Partial schedule:\n"; partial_schedule->dump(); - aslog(0) << "\n"; + aslog(1) << "\n"; } SearchSpace search_space{dag, params, target, rng, cost_model, stats, partial_schedule.get()}; diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.cpp b/src/autoschedulers/anderson2021/DefaultCostModel.cpp index 074b3e2ff2b4..1d2fde5b925e 100644 --- a/src/autoschedulers/anderson2021/DefaultCostModel.cpp +++ b/src/autoschedulers/anderson2021/DefaultCostModel.cpp @@ -252,18 +252,18 @@ float DefaultCostModel::backprop(const Runtime::Buffer &true_runtim *(cost_ptrs(i)) = dst(i); if (std::isnan(dst(i))) { any_nans = true; - aslog(0) << "Prediction " << i << " is NaN. 
True runtime is " << true_runtimes(i) << "\n"; - aslog(0) << "Checking pipeline features for NaNs...\n"; + aslog(1) << "Prediction " << i << " is NaN. True runtime is " << true_runtimes(i) << "\n"; + aslog(1) << "Checking pipeline features for NaNs...\n"; pipeline_feat_queue.for_each_value([&](float f) { if (std::isnan(f)) abort(); }); - aslog(0) << "None found\n"; - aslog(0) << "Checking schedule features for NaNs...\n"; + aslog(1) << "None found\n"; + aslog(1) << "Checking schedule features for NaNs...\n"; schedule_feat_queue.for_each_value([&](float f) { if (std::isnan(f)) abort(); }); - aslog(0) << "None found\n"; - aslog(0) << "Checking network weights for NaNs...\n"; + aslog(1) << "None found\n"; + aslog(1) << "Checking network weights for NaNs...\n"; weights.for_each_buffer([&](const Runtime::Buffer &buf) { buf.for_each_value([&](float f) { if (std::isnan(f)) abort(); }); }); - aslog(0) << "None found\n"; + aslog(1) << "None found\n"; } internal_assert(true_runtimes(i) > 0); } diff --git a/src/autoschedulers/anderson2021/Featurization.h b/src/autoschedulers/anderson2021/Featurization.h index 66cb476bfc88..65d44dc34635 100644 --- a/src/autoschedulers/anderson2021/Featurization.h +++ b/src/autoschedulers/anderson2021/Featurization.h @@ -158,7 +158,7 @@ struct PipelineFeatures { } } void dump() const { - auto os = aslog(0); + auto os = aslog(1); dump(os); } }; @@ -422,7 +422,7 @@ struct ScheduleFeatures { } void dump() const { - auto os = aslog(0); + auto os = aslog(1); dump(os); } diff --git a/src/autoschedulers/anderson2021/FunctionDAG.cpp b/src/autoschedulers/anderson2021/FunctionDAG.cpp index 55037ff849c9..6c3308d7bddc 100644 --- a/src/autoschedulers/anderson2021/FunctionDAG.cpp +++ b/src/autoschedulers/anderson2021/FunctionDAG.cpp @@ -310,38 +310,38 @@ class Featurizer : public IRVisitor { void LoadJacobian::dump(const char *prefix) const { if (count() > 1) { - aslog(0) << prefix << count() << " x\n"; + aslog(1) << prefix << count() << " x\n"; } for 
(size_t i = 0; i < producer_storage_dims(); i++) { - aslog(0) << prefix << " ["; + aslog(1) << prefix << " ["; for (size_t j = 0; j < consumer_loop_dims(); j++) { const auto &c = (*this)(i, j); if (!c.exists()) { - aslog(0) << " _ "; + aslog(1) << " _ "; } else if (c.denominator == 1) { - aslog(0) << " " << c.numerator << " "; + aslog(1) << " " << c.numerator << " "; } else { - aslog(0) << c.numerator << "/" << c.denominator << " "; + aslog(1) << c.numerator << "/" << c.denominator << " "; } } - aslog(0) << "]\n"; + aslog(1) << "]\n"; } - aslog(0) << "\n"; + aslog(1) << "\n"; } void BoundContents::validate() const { for (int i = 0; i < layout->total_size; i++) { auto p = data()[i]; if (p.max() < p.min()) { - aslog(0) << "Bad bounds object:\n"; + aslog(1) << "Bad bounds object:\n"; for (int j = 0; j < layout->total_size; j++) { if (i == j) { - aslog(0) << "=> "; + aslog(1) << "=> "; } else { - aslog(0) << " "; + aslog(1) << " "; } - aslog(0) << j << ": " << data()[j].min() << ", " << data()[j].max() << "\n"; + aslog(1) << j << ": " << data()[j].min() << ", " << data()[j].max() << "\n"; } internal_error << "Aborting"; } @@ -1092,7 +1092,7 @@ void FunctionDAG::dump_internal(OS &os) const { } void FunctionDAG::dump() const { - auto os = aslog(0); + auto os = aslog(1); dump_internal(os); } diff --git a/src/autoschedulers/anderson2021/LoopNest.cpp b/src/autoschedulers/anderson2021/LoopNest.cpp index c2da65e37822..1bcfa5fceccb 100644 --- a/src/autoschedulers/anderson2021/LoopNest.cpp +++ b/src/autoschedulers/anderson2021/LoopNest.cpp @@ -2270,7 +2270,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, aslog(2) << "num_blocks = " << gpu_loop_info.num_blocks << "\n"; aslog(2) << "END MEM ACCESS shared_mem_load. 
consumer: " << node->func.name() << "; producer: " << e->producer->func.name(); if (!jac.first.all_coeffs_exist()) { - aslog(0) << " (not all coeffs exist)"; + aslog(1) << " (not all coeffs exist)"; } aslog(2) << "\n\n"; } @@ -2777,7 +2777,7 @@ const Bound &LoopNest::get_bounds(const FunctionDAG::Node *f) const { } void LoopNest::dump() const { - auto stream = aslog(0); + auto stream = aslog(1); dump(stream, "", nullptr); } diff --git a/src/autoschedulers/anderson2021/LoopNestParser.h b/src/autoschedulers/anderson2021/LoopNestParser.h index 4f3e5c1a1278..278ce2d631f3 100644 --- a/src/autoschedulers/anderson2021/LoopNestParser.h +++ b/src/autoschedulers/anderson2021/LoopNestParser.h @@ -106,31 +106,31 @@ class LoopNestParser { } void dump() const { - aslog(0) << "All stages:\n"; + aslog(1) << "All stages:\n"; for (const auto &s : all_stages) { - aslog(0) << s << "\n"; + aslog(1) << s << "\n"; } - aslog(0) << "\ncompute_root stages:\n"; + aslog(1) << "\ncompute_root stages:\n"; for (const auto &s : compute_root_stages) { - aslog(0) << s.first << " with vector_dim = " << s.second << "\n"; + aslog(1) << s.first << " with vector_dim = " << s.second << "\n"; } - aslog(0) << "\nPartially scheduled stages:\n"; + aslog(1) << "\nPartially scheduled stages:\n"; for (const auto &s : partially_scheduled) { - aslog(0) << s << " with vector_dim = " << compute_root_stages.at(s) << "\n"; + aslog(1) << s << " with vector_dim = " << compute_root_stages.at(s) << "\n"; } - aslog(0) << "\nInlined stages:\n"; + aslog(1) << "\nInlined stages:\n"; for (const auto &s : inlined) { - aslog(0) << s << "\n"; + aslog(1) << s << "\n"; } - aslog(0) << "\nFull loop nest:\n"; + aslog(1) << "\nFull loop nest:\n"; for (const auto &s : loop_nest) { - aslog(0) << s << "\n"; + aslog(1) << s << "\n"; } - aslog(0) << "\n"; + aslog(1) << "\n"; } bool is_in_partial_schedule(const FunctionDAG::Node *node) const { diff --git a/src/autoschedulers/anderson2021/SearchSpace.cpp 
b/src/autoschedulers/anderson2021/SearchSpace.cpp index c616d918ede1..c3b591302cb3 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.cpp +++ b/src/autoschedulers/anderson2021/SearchSpace.cpp @@ -277,7 +277,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, // We don't need to schedule nodes that represent inputs, // and there are no other decisions to be made about them // at this time. - // aslog(0) << "Skipping over scheduling input node: " << node->func.name() << "\n"; + // aslog(1) << "Skipping over scheduling input node: " << node->func.name() << "\n"; auto child = state->make_child(); child->num_decisions_made++; accept_child(std::move(child)); @@ -285,14 +285,14 @@ void SearchSpace::generate_children(const IntrusivePtr &state, } if (!node->outgoing_edges.empty() && !root->calls(node)) { - aslog(0) << "In state:\n"; + aslog(1) << "In state:\n"; state->dump(); - aslog(0) << node->func.name() << " is consumed by:\n"; + aslog(1) << node->func.name() << " is consumed by:\n"; for (const auto *e : node->outgoing_edges) { - aslog(0) << e->consumer->name << "\n"; - aslog(0) << "Which in turn consumes:\n"; + aslog(1) << e->consumer->name << "\n"; + aslog(1) << "Which in turn consumes:\n"; for (const auto *e2 : e->consumer->incoming_edges) { - aslog(0) << " " << e2->producer->func.name() << "\n"; + aslog(1) << " " << e2->producer->func.name() << "\n"; } } internal_error << "Pipeline so far doesn't use next Func: " << node->func.name() << '\n'; @@ -563,7 +563,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, } if (num_children == 0) { - aslog(0) << "Warning: Found no legal way to schedule " + aslog(1) << "Warning: Found no legal way to schedule " << node->func.name() << " in the following State:\n"; state->dump(); // All our children died. 
Maybe other states have had diff --git a/src/autoschedulers/anderson2021/SearchSpaceOptions.h b/src/autoschedulers/anderson2021/SearchSpaceOptions.h index ccb862085f1b..0d4c239b60b4 100644 --- a/src/autoschedulers/anderson2021/SearchSpaceOptions.h +++ b/src/autoschedulers/anderson2021/SearchSpaceOptions.h @@ -19,12 +19,12 @@ struct SearchSpaceOptions { SearchSpaceOptions(const std::string &bit_str) : options{bit_str} { - aslog(0) << "Search space options:\n"; - aslog(0) << "Input string: " << bit_str << "\n"; - aslog(0) << "Compute root: " << compute_root() << "\n"; - aslog(0) << "Compute inline: " << compute_inline() << "\n"; - aslog(0) << "Compute at block: " << compute_at_block() << "\n"; - aslog(0) << "Compute at thread: " << compute_at_thread() << "\n"; + aslog(1) << "Search space options:\n"; + aslog(1) << "Input string: " << bit_str << "\n"; + aslog(1) << "Compute root: " << compute_root() << "\n"; + aslog(1) << "Compute inline: " << compute_inline() << "\n"; + aslog(1) << "Compute at block: " << compute_at_block() << "\n"; + aslog(1) << "Compute at thread: " << compute_at_thread() << "\n"; } bool compute_root() const { diff --git a/src/autoschedulers/anderson2021/State.cpp b/src/autoschedulers/anderson2021/State.cpp index 9cab832e7773..0584b8a075ce 100644 --- a/src/autoschedulers/anderson2021/State.cpp +++ b/src/autoschedulers/anderson2021/State.cpp @@ -696,7 +696,7 @@ bool State::calculate_cost(const FunctionDAG &dag, const Anderson2021Params &par const auto &feat = it.value(); std::string name = stage.node->func.name(); sanitize_names(name); - aslog(0) << "Schedule features for " << name << "_s" << stage.index << "\n"; + aslog(1) << "Schedule features for " << name << "_s" << stage.index << "\n"; feat.dump(); } } @@ -751,26 +751,26 @@ IntrusivePtr State::make_child() const { } void State::dump() const { - aslog(0) << "State with cost " << cost << ":\n"; + aslog(1) << "State with cost " << cost << ":\n"; root->dump(); - aslog(0) << schedule_source; + 
aslog(1) << schedule_source; } void State::print_compute_locations() const { StageMap> descendants; root->get_stages_computed_in_each_compute_root_loop(descendants); - aslog(0) << "BEGIN compute locations\n"; + aslog(1) << "BEGIN compute locations\n"; for (const auto &d : descendants) { - aslog(0) << d.first->sanitized_name << " -> "; + aslog(1) << d.first->sanitized_name << " -> "; for (const auto &descendant : d.second) { - aslog(0) << descendant.first->sanitized_name << " "; + aslog(1) << descendant.first->sanitized_name << " "; } - aslog(0) << "\n"; + aslog(1) << "\n"; } - aslog(0) << "END compute locations\n"; + aslog(1) << "END compute locations\n"; } void State::fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector &parallel_vars, const vector &parallel_extents, const vector &constant_extents) const { @@ -906,7 +906,7 @@ bool State::mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, for (size_t i = 0; i < edge_chain.size() - 1; ++i) { s = edge_chain.at(i)->producer->func.name() + ".clone_in(" + s + ")"; } - aslog(0) << "Chain with length > 1: " << producer_node->func.name() << ".in(" << s << ")\n"; + aslog(1) << "Chain with length > 1: " << producer_node->func.name() << ".in(" << s << ")\n"; continue; } diff --git a/src/autoschedulers/anderson2021/Statistics.h b/src/autoschedulers/anderson2021/Statistics.h index e0ccdd1bbb57..2d0080e69a2f 100644 --- a/src/autoschedulers/anderson2021/Statistics.h +++ b/src/autoschedulers/anderson2021/Statistics.h @@ -20,7 +20,7 @@ struct ScopedStatistic { } ~ScopedStatistic() { - aslog(0) << msg << " = " << value << "\n"; + aslog(1) << msg << " = " << value << "\n"; } }; @@ -30,13 +30,13 @@ struct ScopedTimer { ScopedTimer(const std::string &msg) : start{Clock::now()}, msg{msg} { - aslog(0) << "Start: " << msg << "\n"; + aslog(1) << "Start: " << msg << "\n"; } ~ScopedTimer() { auto duration = Clock::now() - start; auto ms = std::chrono::duration_cast(duration).count(); - aslog(0) << 
"Duration (ms): " << msg << " = " << ms << "\n"; + aslog(1) << "Duration (ms): " << msg << " = " << ms << "\n"; } }; From 8468f59a547e13198ffafb75af81c50f8f5493c4 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 17 Mar 2023 00:40:27 -0400 Subject: [PATCH 51/63] abort -> internal_assert --- src/autoschedulers/anderson2021/DefaultCostModel.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.cpp b/src/autoschedulers/anderson2021/DefaultCostModel.cpp index 1d2fde5b925e..3eede5993d98 100644 --- a/src/autoschedulers/anderson2021/DefaultCostModel.cpp +++ b/src/autoschedulers/anderson2021/DefaultCostModel.cpp @@ -254,22 +254,20 @@ float DefaultCostModel::backprop(const Runtime::Buffer &true_runtim any_nans = true; aslog(1) << "Prediction " << i << " is NaN. True runtime is " << true_runtimes(i) << "\n"; aslog(1) << "Checking pipeline features for NaNs...\n"; - pipeline_feat_queue.for_each_value([&](float f) { if (std::isnan(f)) abort(); }); + pipeline_feat_queue.for_each_value([&](float f) { internal_assert(!std::isnan(f)); }); aslog(1) << "None found\n"; aslog(1) << "Checking schedule features for NaNs...\n"; - schedule_feat_queue.for_each_value([&](float f) { if (std::isnan(f)) abort(); }); + schedule_feat_queue.for_each_value([&](float f) { internal_assert(!std::isnan(f)); }); aslog(1) << "None found\n"; aslog(1) << "Checking network weights for NaNs...\n"; weights.for_each_buffer([&](const Runtime::Buffer &buf) { - buf.for_each_value([&](float f) { if (std::isnan(f)) abort(); }); + buf.for_each_value([&](float f) { internal_assert(!std::isnan(f)); }); }); aslog(1) << "None found\n"; } internal_assert(true_runtimes(i) > 0); } - if (any_nans) { - abort(); - } + internal_assert(!any_nans); // Update weights locally auto update_weight = [](const Runtime::Buffer &src, Runtime::Buffer &dst) { From 940f64477cf0618ffc43fc647587965a89ef5246 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 17 Mar 2023 
00:42:17 -0400 Subject: [PATCH 52/63] Remove default destructor --- src/autoschedulers/anderson2021/DefaultCostModel.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.h b/src/autoschedulers/anderson2021/DefaultCostModel.h index a5a03423a3ca..87d17245d567 100644 --- a/src/autoschedulers/anderson2021/DefaultCostModel.h +++ b/src/autoschedulers/anderson2021/DefaultCostModel.h @@ -39,7 +39,6 @@ class DefaultCostModel : public CostModel { stats{stats} { load_weights(); } - ~DefaultCostModel() override = default; // Configure the cost model for the algorithm to be scheduled. void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, From c86069643057cf231d2ea62f475d27ba70fb9a05 Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 17 Mar 2023 00:44:11 -0400 Subject: [PATCH 53/63] Fix long line --- src/autoschedulers/anderson2021/Featurization.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/autoschedulers/anderson2021/Featurization.h b/src/autoschedulers/anderson2021/Featurization.h index 65d44dc34635..60e2718367c5 100644 --- a/src/autoschedulers/anderson2021/Featurization.h +++ b/src/autoschedulers/anderson2021/Featurization.h @@ -427,7 +427,14 @@ struct ScheduleFeatures { } bool equal(const ScheduleFeatures &other) const { - return num_realizations == other.num_realizations && num_productions == other.num_productions && points_computed_per_realization == other.points_computed_per_realization && points_computed_per_production == other.points_computed_per_production && points_computed_per_thread == other.points_computed_per_thread && points_computed_total == other.points_computed_total && points_computed_minimum == other.points_computed_minimum && innermost_loop_extent == other.innermost_loop_extent && innermost_pure_loop_extent == other.innermost_pure_loop_extent && unrolled_loop_extent == other.unrolled_loop_extent && inner_parallelism == other.inner_parallelism && 
outer_parallelism == other.outer_parallelism && bytes_at_realization == other.bytes_at_realization && bytes_at_production == other.bytes_at_production && bytes_at_root == other.bytes_at_root && innermost_bytes_at_realization == other.innermost_bytes_at_realization && innermost_bytes_at_production == other.innermost_bytes_at_production && innermost_bytes_at_root == other.innermost_bytes_at_root && inlined_calls == other.inlined_calls && unique_global_bytes_read_per_realization == other.unique_global_bytes_read_per_realization && unique_shared_bytes_read_per_realization == other.unique_shared_bytes_read_per_realization && unique_register_bytes_read_per_realization == other.unique_register_bytes_read_per_realization && unique_global_lines_read_per_realization == other.unique_global_lines_read_per_realization && unique_shared_lines_read_per_realization == other.unique_shared_lines_read_per_realization && unique_register_lines_read_per_realization == other.unique_register_lines_read_per_realization && unique_global_bytes_read_per_thread == other.unique_global_bytes_read_per_thread && unique_shared_bytes_read_per_thread == other.unique_shared_bytes_read_per_thread && unique_register_bytes_read_per_thread == other.unique_register_bytes_read_per_thread && unique_global_lines_read_per_thread == other.unique_global_lines_read_per_thread && unique_shared_lines_read_per_thread == other.unique_shared_lines_read_per_thread && unique_register_lines_read_per_thread == other.unique_register_lines_read_per_thread && global_allocation_bytes_read_per_realization == other.global_allocation_bytes_read_per_realization && shared_allocation_bytes_read_per_realization == other.shared_allocation_bytes_read_per_realization && register_allocation_bytes_read_per_realization == other.register_allocation_bytes_read_per_realization && working_set == other.working_set && num_scalars == other.num_scalars && global_bytes_at_task == other.global_bytes_at_task && shared_bytes_at_task == 
other.shared_bytes_at_task && register_bytes_at_task == other.register_bytes_at_task && global_innermost_bytes_at_task == other.global_innermost_bytes_at_task && shared_innermost_bytes_at_task == other.shared_innermost_bytes_at_task && register_innermost_bytes_at_task == other.register_innermost_bytes_at_task && unique_bytes_read_per_point == other.unique_bytes_read_per_point && unique_lines_read_per_point == other.unique_lines_read_per_point && unique_bytes_read_per_task == other.unique_bytes_read_per_task && unique_lines_read_per_task == other.unique_lines_read_per_task && working_set_at_task == other.working_set_at_task && working_set_at_production == other.working_set_at_production && working_set_at_realization == other.working_set_at_realization && working_set_at_root == other.working_set_at_root && num_blocks == other.num_blocks && num_warps_per_block == other.num_warps_per_block && block_occupancy == other.block_occupancy && warp_lane_utilization == other.warp_lane_utilization && num_active_warps_per_block == other.num_active_warps_per_block && warp_lane_utilization_at_block_y == other.warp_lane_utilization_at_block_y && warp_lane_utilization_at_block_z == other.warp_lane_utilization_at_block_z && idle_lane_wastage == other.idle_lane_wastage && num_shared_mem_loads_per_block == other.num_shared_mem_loads_per_block && num_global_mem_loads_per_block == other.num_global_mem_loads_per_block && num_shared_mem_stores_per_block == other.num_shared_mem_stores_per_block && num_global_mem_stores_per_block == other.num_global_mem_stores_per_block && shared_mem_store_efficiency == other.shared_mem_store_efficiency && shared_mem_load_efficiency == other.shared_mem_load_efficiency && global_mem_store_efficiency == other.global_mem_store_efficiency && global_mem_load_efficiency == other.global_mem_load_efficiency && working_set_at_thread == other.working_set_at_thread && shared_mem_occupancy == other.shared_mem_occupancy && shared_mem_block_limit_factor == 
other.shared_mem_block_limit_factor && max_warp_occupancy == other.max_warp_occupancy && max_block_occupancy == other.max_block_occupancy && num_threads_per_block == other.num_threads_per_block && expr_branching == other.expr_branching; + const size_t n_features = ScheduleFeatures::num_features(); + for (size_t i = 0; i < n_features; i++) { + if ((*this)[i] != other[i]) { + return false; + + } + } + return true; } }; From 1df7dc27ffc2e6d8777d2aa12ba580d1fbd1288e Mon Sep 17 00:00:00 2001 From: aekul Date: Sat, 18 Mar 2023 00:49:44 -0400 Subject: [PATCH 54/63] Fix long lines --- src/autoschedulers/anderson2021/LoopNest.cpp | 47 +++++++++++++++++--- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/src/autoschedulers/anderson2021/LoopNest.cpp b/src/autoschedulers/anderson2021/LoopNest.cpp index 1bcfa5fceccb..dcd01aa643e6 100644 --- a/src/autoschedulers/anderson2021/LoopNest.cpp +++ b/src/autoschedulers/anderson2021/LoopNest.cpp @@ -1091,12 +1091,36 @@ void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_i mem_info.add(min_info); } -template void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const; - -template void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const; +template void LoopNest::compute_mem_load_features(const LoadJacobian &jac, + int producer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &producer_store_bounds, + bool producer_has_been_scheduled, + const ThreadInfo &thread_info, + MemInfoType &mem_info, + double 
points_accessed_per_thread, + bool verbose) const; + +template void LoopNest::compute_mem_load_features(const LoadJacobian &jac, + int producer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &producer_store_bounds, + bool producer_has_been_scheduled, + const ThreadInfo &thread_info, + MemInfoType &mem_info, + double points_accessed_per_thread, + bool verbose) const; template<> -void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const { +void LoopNest::compute_mem_load_features(const LoadJacobian &jac, + int producer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &producer_store_bounds, + bool producer_has_been_scheduled, + const ThreadInfo &thread_info, + MemInfoType &mem_info, + double points_accessed_per_thread, + bool verbose) const { compute_num_mem_accesses_per_block(jac, node, producer_store_bounds, thread_info, producer_innermost_dim, points_accessed_per_thread, mem_info, verbose); } @@ -1263,7 +1287,20 @@ std::pair LoopNest::find_innermost_and_paren return {child, parent}; } -int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params &params, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose) const { +int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params &params, + const Target &target, + const GPULoopInfo &gpu_loop_info, + const std::vector &edge_chain, + const LoadJacobian &jac, + const LoopNest *parent, + const LoopNest *grandparent, + int64_t n, + const ScheduleFeatures 
&feat, + const LoadJacobian &serial_jac, + bool producer_has_been_scheduled, + int producer_innermost_dim, + const GPUMemoryType &mem_type, + bool verbose) const { std::unique_ptr innermost_parent_clone = std::make_unique(); innermost_parent_clone->copy_from(*parent); From cf15af23678635bd0d193e6b5f6f62fd052c8fb1 Mon Sep 17 00:00:00 2001 From: aekul Date: Sat, 18 Mar 2023 01:05:01 -0400 Subject: [PATCH 55/63] Reorder parameters --- src/autoschedulers/anderson2021/LoopNest.cpp | 10 +++++----- src/autoschedulers/anderson2021/LoopNest.h | 6 +++--- src/autoschedulers/anderson2021/State.cpp | 2 +- src/autoschedulers/anderson2021/autotune_loop.sh | 1 - 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/autoschedulers/anderson2021/LoopNest.cpp b/src/autoschedulers/anderson2021/LoopNest.cpp index dcd01aa643e6..95ecf4259f17 100644 --- a/src/autoschedulers/anderson2021/LoopNest.cpp +++ b/src/autoschedulers/anderson2021/LoopNest.cpp @@ -1662,13 +1662,13 @@ void LoopNest::compute_features(const FunctionDAG &dag, const LoopNest *parent, const LoopNest *grandparent, const LoopNest &root, + GPULoopInfo gpu_loop_info, + bool use_memoized_features, + const StageMap &total_shared_mem_alloc_sizes, int64_t *working_set, int64_t *working_set_local_constant, int64_t *working_set_local_dynamic, StageMap *features, - GPULoopInfo gpu_loop_info, - bool use_memoized_features, - const StageMap &total_shared_mem_alloc_sizes, Statistics &stats, bool verbose) const { @@ -1802,7 +1802,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, ++stats.num_memoization_misses; } - c->compute_features(dag, params, target, sites, subinstances, parallelism, this, parent, root, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, stats, verbose); + c->compute_features(dag, params, target, sites, subinstances, parallelism, this, parent, root, gpu_loop_info, 
use_memoized_features, total_shared_mem_alloc_sizes, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, stats, verbose); if (use_memoized_features) { c->features[hash_of_producers].make_large(dag.nodes[0].stages[0].max_id); @@ -2054,7 +2054,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, // Recurse inwards for (const auto &c : children) { - c->compute_features(dag, params, target, sites, subinstances, subparallelism, this, parent, root, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, stats, verbose); + c->compute_features(dag, params, target, sites, subinstances, subparallelism, this, parent, root, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, stats, verbose); } for (const auto *node : store_at) { auto &feat = features->get(&(node->stages[0])); diff --git a/src/autoschedulers/anderson2021/LoopNest.h b/src/autoschedulers/anderson2021/LoopNest.h index 034838de9a8e..9e60e4f091e1 100644 --- a/src/autoschedulers/anderson2021/LoopNest.h +++ b/src/autoschedulers/anderson2021/LoopNest.h @@ -357,13 +357,13 @@ struct LoopNest { const LoopNest *parent, const LoopNest *grandparent, const LoopNest &root, + GPULoopInfo gpu_loop_info, + bool use_memoized_features, + const StageMap &total_shared_mem_alloc_sizes, int64_t *working_set, int64_t *working_set_local_constant, int64_t *working_set_local_dynamic, StageMap *features, - GPULoopInfo gpu_loop_info, - bool use_memoized_features, - const StageMap &total_shared_mem_alloc_sizes, Statistics &stats, bool verbose = false) const; diff --git a/src/autoschedulers/anderson2021/State.cpp b/src/autoschedulers/anderson2021/State.cpp index 0584b8a075ce..4b0ada17f283 100644 --- a/src/autoschedulers/anderson2021/State.cpp +++ 
b/src/autoschedulers/anderson2021/State.cpp @@ -496,7 +496,7 @@ bool State::compute_featurization(const FunctionDAG &dag, const Anderson2021Para } Timer timer; - feature_root->compute_features(dag, params, target, sites, 1, 1, nullptr, nullptr, *feature_root, nullptr, nullptr, nullptr, features, {feature_root.get()}, true, total_shared_mem_alloc_sizes, stats, verbose); + feature_root->compute_features(dag, params, target, sites, 1, 1, nullptr, nullptr, *feature_root, {feature_root.get()}, true, total_shared_mem_alloc_sizes, nullptr, nullptr, nullptr, features, stats, verbose); stats.featurization_time += timer.elapsed(); ++stats.num_featurizations; diff --git a/src/autoschedulers/anderson2021/autotune_loop.sh b/src/autoschedulers/anderson2021/autotune_loop.sh index cff09fd23704..3ca5061b5cf1 100644 --- a/src/autoschedulers/anderson2021/autotune_loop.sh +++ b/src/autoschedulers/anderson2021/autotune_loop.sh @@ -193,7 +193,6 @@ make_featurization() { GPU=$((RANDOM % NUM_GPUS)) CMD="HL_DEBUG_AUTOSCHEDULE=1 \ - HL_DEBUG_CODEGEN=1 \ /bin/time -f 'Compile time (s): %e' ${TIMEOUT_CMD} -k ${COMPILATION_TIMEOUT} ${COMPILATION_TIMEOUT} \ ${GENERATOR} \ -g ${PIPELINE} \ From 79c471350d2fac12b47ccca714ac9304ebe34735 Mon Sep 17 00:00:00 2001 From: aekul Date: Sat, 18 Mar 2023 01:06:12 -0400 Subject: [PATCH 56/63] Fix long line --- src/autoschedulers/anderson2021/LoopNest.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/autoschedulers/anderson2021/LoopNest.cpp b/src/autoschedulers/anderson2021/LoopNest.cpp index 95ecf4259f17..60fbeefdaa3d 100644 --- a/src/autoschedulers/anderson2021/LoopNest.cpp +++ b/src/autoschedulers/anderson2021/LoopNest.cpp @@ -2054,7 +2054,24 @@ void LoopNest::compute_features(const FunctionDAG &dag, // Recurse inwards for (const auto &c : children) { - c->compute_features(dag, params, target, sites, subinstances, subparallelism, this, parent, root, gpu_loop_info, use_memoized_features, 
total_shared_mem_alloc_sizes, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, stats, verbose); + c->compute_features(dag, + params, + target, + sites, + subinstances, + subparallelism, + this, + parent, + root, + gpu_loop_info, + use_memoized_features, + total_shared_mem_alloc_sizes, + &working_set_here, + &working_set_here_local_constant, + &working_set_here_local_dynamic, + features, + stats, + verbose); } for (const auto *node : store_at) { auto &feat = features->get(&(node->stages[0])); From 50ba6e023d69bd819758aa2fab070c50fb0198cb Mon Sep 17 00:00:00 2001 From: aekul Date: Sat, 18 Mar 2023 01:07:40 -0400 Subject: [PATCH 57/63] Remove empty clause --- src/autoschedulers/anderson2021/LoopNest.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/autoschedulers/anderson2021/LoopNest.cpp b/src/autoschedulers/anderson2021/LoopNest.cpp index 60fbeefdaa3d..8ab0f42818b7 100644 --- a/src/autoschedulers/anderson2021/LoopNest.cpp +++ b/src/autoschedulers/anderson2021/LoopNest.cpp @@ -1909,8 +1909,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, internal_assert(!stage->node->is_input); ScheduleFeatures &feat = features->get_or_create(stage); - if (innermost) { - } else { + if (!innermost) { // We want these features just outside the innermost loop, // so just set them at every level and let them get // progressively overwritten as we descend the loop nest From 6924d1aac5183da9e8f78f5488c4c8465e617b9d Mon Sep 17 00:00:00 2001 From: aekul Date: Sat, 18 Mar 2023 01:08:14 -0400 Subject: [PATCH 58/63] Remove blank line --- src/autoschedulers/anderson2021/LoopNest.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/autoschedulers/anderson2021/LoopNest.cpp b/src/autoschedulers/anderson2021/LoopNest.cpp index 8ab0f42818b7..215ded27820f 100644 --- a/src/autoschedulers/anderson2021/LoopNest.cpp +++ b/src/autoschedulers/anderson2021/LoopNest.cpp @@ -2687,7 +2687,6 @@ void 
LoopNest::compute_features(const FunctionDAG &dag, if (innermost && !is_scalar()) { compute_warp_features(feat, gpu_loop_info); - compute_warp_and_block_occupancy(params, feat, gpu_loop_info); } } From 5aad4195899ea392a505d9b26ae41dd245e7f02f Mon Sep 17 00:00:00 2001 From: aekul Date: Sun, 26 Mar 2023 15:20:36 -0400 Subject: [PATCH 59/63] Uppercase enum, std::vector, constexpr --- src/autoschedulers/anderson2021/LoopNest.cpp | 155 ++++++++++-------- src/autoschedulers/anderson2021/LoopNest.h | 40 ++--- src/autoschedulers/anderson2021/State.cpp | 36 ++-- src/autoschedulers/anderson2021/ThreadInfo.h | 2 +- src/autoschedulers/anderson2021/Tiling.cpp | 49 +++--- src/autoschedulers/anderson2021/Tiling.h | 33 ++-- .../anderson2021/test/tiling.cpp | 46 +++--- 7 files changed, 193 insertions(+), 168 deletions(-) diff --git a/src/autoschedulers/anderson2021/LoopNest.cpp b/src/autoschedulers/anderson2021/LoopNest.cpp index 215ded27820f..0190a40f9d04 100644 --- a/src/autoschedulers/anderson2021/LoopNest.cpp +++ b/src/autoschedulers/anderson2021/LoopNest.cpp @@ -11,6 +11,25 @@ namespace Halide { namespace Internal { namespace Autoscheduler { +std::string stringify(GPU_parallelism label) { + if (label == GPU_parallelism::Block) { + return "block"; + } + if (label == GPU_parallelism::Thread) { + return "thread"; + } + if (label == GPU_parallelism::Serial) { + return "serial"; + } + if (label == GPU_parallelism::Simd) { + return "simd"; + } + if (label == GPU_parallelism::Parallelized) { + return "parallelized"; + } + return "None"; +} + // How small should an innermost loop cluster be before you just // entirely unroll the thing const int kUnrollLimitGPU = 16; @@ -82,7 +101,7 @@ vector LoopNest::get_union_thread_counts(const FunctionDAG::Node *f) co // find the loop nests we just created and get max gpu_thread extents of other children for (const auto &c : children) { if (c->node != f) { - if (c->gpu_label == thread) { + if (c->gpu_label == GPU_parallelism::Thread) { vector 
lowered_size; lowered_dims(c->size, c->vectorized_loop_index, lowered_size); for (int dim = 0; dim < (int)(lowered_size.size()); dim++) { @@ -173,7 +192,7 @@ bool LoopNest::add_gpu_thread_tilings(const FunctionDAG::Node *f, if (!made_child) { // if we can't tile into gpu threads the inserted node, make it serial for (auto &c : children) { if (c->node == f) { - c->gpu_label = serial; + c->gpu_label = GPU_parallelism::Serial; } } } @@ -275,19 +294,19 @@ void LoopNest::structural_hash(uint64_t &h, int depth) const { GPUMemoryType LoopNest::get_gpu_memory_type(bool in_block, bool in_thread, bool is_inlined) const { if (is_inlined) { - return GPUMemoryType::inlined; + return GPUMemoryType::Inlined; } if (in_thread) { internal_assert(in_block); - return GPUMemoryType::local; + return GPUMemoryType::Local; } if (in_block) { - return GPUMemoryType::shared; + return GPUMemoryType::Shared; } - return GPUMemoryType::global; + return GPUMemoryType::Global; } std::vector LoopNest::unrolled_loops(const Target &target, const LoopNest *parent, const LoopNest *grandparent) const { @@ -342,7 +361,7 @@ void LoopNest::get_allocs_that_can_be_promoted_to_registers(const Target &target for (const auto *alloc_node : store_at) { const auto &store_site = sites.get(&alloc_node->stages[0]); - if (store_site.gpu_store_memory_type != GPUMemoryType::local) { + if (store_site.gpu_store_memory_type != GPUMemoryType::Local) { continue; } @@ -357,7 +376,7 @@ void LoopNest::get_allocs_that_can_be_promoted_to_registers(const Target &target auto unrolled = unrolled_loops(target, parent, grandparent); for (const auto *e : stage->incoming_edges) { - if (sites.get(&e->producer->stages[0]).gpu_store_memory_type != GPUMemoryType::local) { + if (sites.get(&e->producer->stages[0]).gpu_store_memory_type != GPUMemoryType::Local) { continue; } @@ -405,7 +424,7 @@ void LoopNest::get_sites(const Target &target, sites.get_or_create(&s).is_constant_allocation = alloc.second; const LoopNest *store_site = 
sites.get_or_create(&s).store; - if (store_site->gpu_label == block && s.index == 0) { + if (store_site->gpu_label == GPU_parallelism::Block && s.index == 0) { total_shared_mem_alloc_sizes.get_or_create(store_site->stage) += alloc.first; } } @@ -420,7 +439,7 @@ void LoopNest::get_sites(const Target &target, // Accumulate all the innermost loop nests into which this func is // inlined s.inlined_innermosts.push_back(this); - s.gpu_store_memory_type = GPUMemoryType::inlined; + s.gpu_store_memory_type = GPUMemoryType::Inlined; s.task = task; } if (innermost) { @@ -439,8 +458,8 @@ bool LoopNest::promote_allocs_to_registers(const Target &target, StageMap } for (const auto &stage : node.first->stages) { - internal_assert(sites.get(&stage).gpu_store_memory_type == GPUMemoryType::local); - sites.get(&stage).gpu_store_memory_type = GPUMemoryType::registers; + internal_assert(sites.get(&stage).gpu_store_memory_type == GPUMemoryType::Local); + sites.get(&stage).gpu_store_memory_type = GPUMemoryType::Registers; } } @@ -455,7 +474,7 @@ bool LoopNest::exceeds_serial_extents_limit(const Target &target, const LoopNest } } - if (gpu_label == serial && stage->index == 0) { + if (gpu_label == GPU_parallelism::Serial && stage->index == 0) { int64_t serial_loop_extents = 1; for (const auto &i : stage->loop) { if (!i.pure) { @@ -475,7 +494,7 @@ bool LoopNest::exceeds_serial_extents_limit(const Target &target, const LoopNest } for (const auto &c : children) { - if (c->exceeds_serial_extents_limit(target, this, in_threads_loop || c->gpu_label == thread)) { + if (c->exceeds_serial_extents_limit(target, this, in_threads_loop || c->gpu_label == GPU_parallelism::Thread)) { return true; } } @@ -496,7 +515,7 @@ bool LoopNest::node_has_dynamic_region_computed(const FunctionDAG::Node *f) cons } bool LoopNest::has_dynamic_allocation_inside_thread(bool in_thread_loop) const { - in_thread_loop = in_thread_loop || (gpu_label == thread); + in_thread_loop = in_thread_loop || (gpu_label == 
GPU_parallelism::Thread); if (in_thread_loop) { for (const auto &f : store_at) { @@ -610,7 +629,7 @@ bool LoopNest::can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, c } bool LoopNest::can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType &mem_type) const { - if (loop_index < 0 || mem_type != GPUMemoryType::shared) { + if (loop_index < 0 || mem_type != GPUMemoryType::Shared) { return false; } @@ -620,7 +639,7 @@ bool LoopNest::can_vectorize_store_access(const LoadJacobian &jac, const Functio int LoopNest::vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType &mem_type, bool verbose) const { int vector_size = 1; - if (mem_type != GPUMemoryType::shared) { + if (mem_type != GPUMemoryType::Shared) { return vector_size; } @@ -825,7 +844,7 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ } const ThreadInfo &thread_info = *gpu_loop_info.thread_info; - bool is_shared_mem = consumer_site.gpu_store_memory_type == GPUMemoryType::shared; + bool is_shared_mem = consumer_site.gpu_store_memory_type == GPUMemoryType::Shared; size_t actual_vector_dim = get_actual_vector_dim(consumer_store_bounds); @@ -856,9 +875,9 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ std::string consumer_name = node->func.name(); sanitize_names(consumer_name); std::string mem_type = "global"; - if (consumer_site.gpu_store_memory_type == GPUMemoryType::shared) { + if (consumer_site.gpu_store_memory_type == GPUMemoryType::Shared) { mem_type = "shared"; - } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::local) { + } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::Local) { mem_type = "local"; } aslog(2) << "BEGIN MEM ACCESS " << mem_type << "_mem_" << type; @@ -888,7 
+907,7 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ internal_assert(in_range_zero_one(feat.shared_mem_store_efficiency)) << "Invalid shared mem store efficiency: " << feat.shared_mem_store_efficiency << " for " << node->func.name(); - } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::global) { + } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::Global) { if (verbose) { aslog(2) << "vector_size = " << vector_size << "\n"; } @@ -910,7 +929,7 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ internal_assert(in_range_zero_one(feat.global_mem_store_efficiency)) << "Invalid global mem store efficiency: " << feat.global_mem_store_efficiency << " for " << node->func.name(); - } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::local) { + } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::Local) { auto local_mem_info = compute_mem_store_info( jac, consumer_innermost_dim, @@ -934,9 +953,9 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ std::string consumer_name = node->func.name(); sanitize_names(consumer_name); std::string mem_type = "global"; - if (consumer_site.gpu_store_memory_type == GPUMemoryType::shared) { + if (consumer_site.gpu_store_memory_type == GPUMemoryType::Shared) { mem_type = "shared"; - } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::local) { + } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::Local) { mem_type = "local"; } aslog(2) << "END MEM ACCESS " << mem_type << "_mem_" << type << ". 
consumer: " << consumer_name << "_s" << stage->index << "; producer: " << consumer_name; @@ -1126,17 +1145,17 @@ void LoopNest::compute_mem_load_features(const LoadJacobian &jac, // Assumes block, serial, thread or block, thread nesting const LoopNest *LoopNest::get_enclosing_block(const LoopNest *parent, const LoopNest *grandparent) const { - internal_assert(gpu_label == thread); + internal_assert(gpu_label == GPU_parallelism::Thread); - if (parent->gpu_label == block && grandparent->is_root()) { + if (parent->gpu_label == GPU_parallelism::Block && grandparent->is_root()) { return parent; } - if (parent->gpu_label == serial && grandparent->gpu_label == block) { + if (parent->gpu_label == GPU_parallelism::Serial && grandparent->gpu_label == GPU_parallelism::Block) { return grandparent; } - internal_error << "Invalid nesting: " << parent->gpu_label << ", " << grandparent->gpu_label << "\n"; + internal_error << "Invalid nesting: " << stringify(parent->gpu_label) << ", " << stringify(grandparent->gpu_label) << "\n"; return nullptr; } @@ -1171,7 +1190,7 @@ std::pair LoopNest::get_block_and_serial_extents(const LoopNes } bool LoopNest::all_paths_to_leaves_have_thread_loop() const { - if (gpu_label == thread) { + if (gpu_label == GPU_parallelism::Thread) { return true; } @@ -1189,7 +1208,7 @@ bool LoopNest::all_paths_to_leaves_have_thread_loop() const { } bool LoopNest::has_thread_loop_descendant() const { - if (gpu_label == thread) { + if (gpu_label == GPU_parallelism::Thread) { return true; } @@ -1400,7 +1419,7 @@ int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params ¶ms, if (points_accessed_by_loop_extents <= points_accessed_by_region_required) { points_accessed = points_accessed_by_loop_extents; - if (mem_type == GPUMemoryType::shared) { + if (mem_type == GPUMemoryType::Shared) { int vector_size = parent->vectorized_load_access_size( serial_jac, producer, @@ -2285,8 +2304,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, 
site.produce->vector_dim); // Shared, global, or local memory? - bool is_global_mem = site.gpu_store_memory_type == GPUMemoryType::global; - bool is_shared_mem = site.gpu_store_memory_type == GPUMemoryType::shared; + bool is_global_mem = site.gpu_store_memory_type == GPUMemoryType::Global; + bool is_shared_mem = site.gpu_store_memory_type == GPUMemoryType::Shared; // Grab the jacobians that describe the memory dependence for (size_t i = 0; i < thread_jacobians.size(); ++i) { @@ -2307,7 +2326,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, aslog(2) << "BEGIN MEM ACCESS shared_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::shared, verbose); + int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::Shared, verbose); compute_mem_load_features( jac.first, @@ -2338,7 +2357,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, aslog(2) << "BEGIN MEM ACCESS global_mem_load. 
consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::global, verbose); + int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::Global, verbose); compute_mem_load_features( jac.first, @@ -2362,7 +2381,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, } } - if (site.gpu_store_memory_type == GPUMemoryType::local) { + if (site.gpu_store_memory_type == GPUMemoryType::Local) { internal_assert(false) << "Loop nest contains local_mem_load"; for (const auto &jac : jacobians) { if (jac.second != e->producer) { @@ -2378,7 +2397,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, aslog(2) << "BEGIN MEM ACCESS local_mem_load. 
consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::local, verbose); + int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::Local, verbose); compute_mem_load_features( jac.first, @@ -2548,7 +2567,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, register_bytes_loaded_per_thread += thread_footprint; register_lines_loaded_per_thread += thread_line_footprint; } else { - internal_assert(producer_store_site->gpu_label == GPU_parallelism::serial); + internal_assert(producer_store_site->gpu_label == GPU_parallelism::Serial); register_bytes_loaded_per_thread += store_footprint; register_lines_loaded_per_thread += store_line_footprint; } @@ -2880,17 +2899,17 @@ void LoopNest::dump(T &stream, string prefix, const LoopNest *parent) const { if (innermost) { stream << " *"; } - if (gpu_label == block) { + if (gpu_label == GPU_parallelism::Block) { stream << " gpu_block\n"; - } else if (gpu_label == serial) { + } else if (gpu_label == GPU_parallelism::Serial) { stream << " gpu_serial\n"; - } else if (gpu_label == none) { + } else if (gpu_label == GPU_parallelism::None) { stream << " gpu_none\n"; - } else if (gpu_label == simd) { + } else if (gpu_label == GPU_parallelism::Simd) { stream << " gpu_simd\n"; - } else if (gpu_label == thread) { + } else if (gpu_label == GPU_parallelism::Thread) { stream << " gpu_thread\n"; - } else if (gpu_label == parallelized) { + } else if (gpu_label == GPU_parallelism::Parallelized) { stream << " gpu_parallelized\n"; } else if (parallel) { stream << " p\n"; @@ -3079,11 +3098,11 @@ bool LoopNest::compute_here(const 
FunctionDAG::Node *f, // if computing at serial loop set gpu_label to thread. if (target.has_gpu_feature()) { if (is_root()) { - node->gpu_label = none; + node->gpu_label = GPU_parallelism::None; } else if (!in_threads_loop) { - node->gpu_label = thread; + node->gpu_label = GPU_parallelism::Thread; } else { - node->gpu_label = serial; + node->gpu_label = GPU_parallelism::Serial; } } // Set up a bound for the inside of the @@ -3148,7 +3167,7 @@ bool LoopNest::compute_here(const FunctionDAG::Node *f, one_vector->vector_dim = v; one_vector->size.resize(loop_dim, 1); one_vector->innermost = true; - one_vector->gpu_label = simd; + one_vector->gpu_label = GPU_parallelism::Simd; auto *b = node->get_bounds(f)->make_copy(); // Set the region computed inside this node to be the first vector lane if (node->vectorized_loop_index >= 0) { @@ -3191,24 +3210,24 @@ IntrusivePtr LoopNest::parallelize_in_tiles(const vectorvectorized_loop_index = outer->vectorized_loop_index = vectorized_loop_index; if (target.has_gpu_feature()) { - if (gpu_label == none) { - inner->gpu_label = serial; - outer->gpu_label = parallelized; + if (gpu_label == GPU_parallelism::None) { + inner->gpu_label = GPU_parallelism::Serial; + outer->gpu_label = GPU_parallelism::Parallelized; outer->parallel = true; - } else if (gpu_label == parallelized) { - inner->gpu_label = thread; // compute root funcs always allowed to use GPU threads - outer->gpu_label = block; + } else if (gpu_label == GPU_parallelism::Parallelized) { + inner->gpu_label = GPU_parallelism::Thread; // compute root funcs always allowed to use GPU threads + outer->gpu_label = GPU_parallelism::Block; outer->parallel = true; - } else if (gpu_label == thread) { - inner->gpu_label = serial; - outer->gpu_label = thread; + } else if (gpu_label == GPU_parallelism::Thread) { + inner->gpu_label = GPU_parallelism::Serial; + outer->gpu_label = GPU_parallelism::Thread; outer->parallel = false; - } else if (gpu_label == serial) { - inner->gpu_label = serial; - 
outer->gpu_label = serial; + } else if (gpu_label == GPU_parallelism::Serial) { + inner->gpu_label = GPU_parallelism::Serial; + outer->gpu_label = GPU_parallelism::Serial; outer->parallel = false; } else { - internal_error << "invalid gpu label " << gpu_label << " for parallelized loop\n"; + internal_error << "invalid gpu label " << stringify(gpu_label) << " for parallelized loop\n"; } } @@ -3291,7 +3310,7 @@ IntrusivePtr LoopNest::parallelize_in_tiles(const vector> LoopNest::compute_in_tiles(const FunctionDA } } - if (gpu_label == block) { + if (gpu_label == GPU_parallelism::Block) { // once we enter a gpu block loop compute union thread counts to pass down union_counts = get_union_thread_counts(f); } @@ -3467,7 +3486,7 @@ vector> LoopNest::compute_in_tiles(const FunctionDA continue; } - in_threads_loop |= (children[child]->gpu_label == thread); + in_threads_loop |= (children[child]->gpu_label == GPU_parallelism::Thread); // we must pass down union thread count constraints computed at block level when computing further in auto opts = children[child]->compute_in_tiles(f, this, params, target, search_space_options, v, store_here, in_threads_loop, false, union_counts); for (IntrusivePtr &n : opts) { @@ -3653,7 +3672,7 @@ void LoopNest::apply(LoopLevel here, fv.extent = p.extent(); fv.constant_extent = p.constant_extent(); fv.outermost = true; - fv.parallel = l.pure && target.has_gpu_feature() ? gpu_label == block : parallel; + fv.parallel = l.pure && target.has_gpu_feature() ? 
gpu_label == GPU_parallelism::Block : parallel; fv.exists = true; fv.pure = l.pure; fv.index = i; @@ -3718,7 +3737,7 @@ void LoopNest::apply(LoopLevel here, // In case the threads loop is innermost for (size_t i = 0; i < symbolic_loop.size(); i++) { StageScheduleState::FuncVar &v = state.vars[i]; - v.gpu_threads = gpu_label == thread && symbolic_loop[i].pure; + v.gpu_threads = gpu_label == GPU_parallelism::Thread && symbolic_loop[i].pure; } if (vectorized_loop_index >= 0) { @@ -3763,7 +3782,7 @@ void LoopNest::apply(LoopLevel here, StageScheduleState::FuncVar v; StageScheduleState::FuncVar &parent = state.vars[i]; - parent.gpu_threads = gpu_label == thread && symbolic_loop[i].pure; + parent.gpu_threads = gpu_label == GPU_parallelism::Thread && symbolic_loop[i].pure; int64_t factor = product_of_descendants(parent.index); @@ -3906,7 +3925,7 @@ void LoopNest::apply(LoopLevel here, } } - if (gpu_label == thread && state.all_innermost_unrolled && num_serial_loops() <= 1) { + if (gpu_label == GPU_parallelism::Thread && state.all_innermost_unrolled && num_serial_loops() <= 1) { update_producers_to_be_staged(state, all_inlined); } diff --git a/src/autoschedulers/anderson2021/LoopNest.h b/src/autoschedulers/anderson2021/LoopNest.h index 9e60e4f091e1..391c29cd36d0 100644 --- a/src/autoschedulers/anderson2021/LoopNest.h +++ b/src/autoschedulers/anderson2021/LoopNest.h @@ -29,19 +29,21 @@ using NodeMap = PerfectHashMap; template using StageMap = PerfectHashMap; -enum GPU_parallelism { block, - thread, - serial, - simd, - parallelized, - none }; +enum class GPU_parallelism { Block, + Thread, + Serial, + Simd, + Parallelized, + None }; + +std::string stringify(GPU_parallelism label); // inlined => func is inlined so has no memory store location -enum class GPUMemoryType { global, - shared, - local, - registers, - inlined }; +enum class GPUMemoryType { Global, + Shared, + Local, + Registers, + Inlined }; bool may_subtile(const Anderson2021Params ¶ms); @@ -120,7 +122,7 @@ struct 
LoopNest { int vectorized_loop_index = -1; // Apply gpu threads to this loop nest - mutable GPU_parallelism gpu_label = none; + mutable GPU_parallelism gpu_label = GPU_parallelism::None; struct FeatureIntermediates { double inlined_calls; @@ -138,15 +140,15 @@ struct LoopNest { mutable std::map> features; bool is_gpu_serial(const Target &target) const { - return target.has_gpu_feature() && gpu_label == serial; + return target.has_gpu_feature() && gpu_label == GPU_parallelism::Serial; } bool is_gpu_thread(const Target &target) const { - return target.has_gpu_feature() && gpu_label == thread; + return target.has_gpu_feature() && gpu_label == GPU_parallelism::Thread; } bool is_gpu_block(const Target &target) const { - return target.has_gpu_feature() && gpu_label == block; + return target.has_gpu_feature() && gpu_label == GPU_parallelism::Block; } bool is_scalar() const { @@ -219,16 +221,16 @@ struct LoopNest { uint64_t hash_of_producers_stored_at_root; bool is_stored_in_global_mem() const { - return gpu_store_memory_type == GPUMemoryType::global; + return gpu_store_memory_type == GPUMemoryType::Global; } bool is_stored_in_shared_mem() const { - return gpu_store_memory_type == GPUMemoryType::shared; + return gpu_store_memory_type == GPUMemoryType::Shared; } bool is_stored_in_local_mem() const { - return gpu_store_memory_type == GPUMemoryType::local; + return gpu_store_memory_type == GPUMemoryType::Local; } bool is_stored_in_registers() const { - return gpu_store_memory_type == GPUMemoryType::registers; + return gpu_store_memory_type == GPUMemoryType::Registers; } }; diff --git a/src/autoschedulers/anderson2021/State.cpp b/src/autoschedulers/anderson2021/State.cpp index 4b0ada17f283..454a6ab3e850 100644 --- a/src/autoschedulers/anderson2021/State.cpp +++ b/src/autoschedulers/anderson2021/State.cpp @@ -57,7 +57,7 @@ const LoopNest *State::deepest_valid_compute_location(const Anderson2021Params & continue; } - if ((*it)->gpu_label == block) { + if ((*it)->gpu_label == 
GPU_parallelism::Block) { new_shared_mem_alloc_size = node.bytes_per_point; for (int i = 0; i < node.dimensions; ++i) { new_shared_mem_alloc_size *= (*it)->get_bounds(&node)->region_computed(i).extent(); @@ -69,7 +69,7 @@ const LoopNest *State::deepest_valid_compute_location(const Anderson2021Params & } } - if ((*it)->gpu_label == thread || (*it)->gpu_label == serial) { + if ((*it)->gpu_label == GPU_parallelism::Thread || (*it)->gpu_label == GPU_parallelism::Serial) { int64_t total = node.bytes_per_point; for (int i = 0; i < node.dimensions; ++i) { total *= (*it)->get_bounds(&node)->region_computed(i).extent(); @@ -92,7 +92,7 @@ const LoopNest *State::deepest_valid_compute_location(const Anderson2021Params & candidate = *it; } - if (candidate->gpu_label == block) { + if (candidate->gpu_label == GPU_parallelism::Block) { total_shared_mem_alloc_sizes.get(candidate->stage) += new_shared_mem_alloc_size; internal_assert(total_shared_mem_alloc_sizes.get(candidate->stage) <= get_shared_memory_limit(params)); } @@ -167,7 +167,7 @@ const LoopNest *State::deepest_common_ancestor(const mapchildren) { - if (c->gpu_label != block) { + if (c->gpu_label != GPU_parallelism::Block) { continue; } @@ -183,7 +183,7 @@ bool State::has_loop_nest_without_thread_loops() const { bool State::has_compute_root_loops_without_blocks() const { for (const auto &c : root->children) { - if (c->gpu_label == none) { + if (c->gpu_label == GPU_parallelism::None) { return true; } } @@ -207,7 +207,7 @@ void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest *loop_nest for (auto it = loop_nest->children.rbegin(); it != loop_nest->children.rend(); ++it) { auto &c = *it; - if (c->gpu_label != none) { + if (c->gpu_label != GPU_parallelism::None) { continue; } @@ -274,7 +274,7 @@ void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest *loop_nest) return; } - if (loop_nest->gpu_label == block) { + if (loop_nest->gpu_label == GPU_parallelism::Block) { // Example: // block // serial (a) 
@@ -286,20 +286,20 @@ void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest *loop_nest) continue; } - internal_assert(c->gpu_label == serial); + internal_assert(c->gpu_label == GPU_parallelism::Serial); // We want outer thread loops with extents 1 vector tiling(c->node->dimensions, 1); // Mark as 'thread' so this loop is split into threads and // serial - c->gpu_label = thread; + c->gpu_label = GPU_parallelism::Thread; c = c->parallelize_in_tiles(tiling, loop_nest, params, target, false, true); } return; } - if (loop_nest->gpu_label == serial) { + if (loop_nest->gpu_label == GPU_parallelism::Serial) { bool has_child_with_thread_descendant = false; for (const auto &c : loop_nest->children) { @@ -333,7 +333,7 @@ void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest *loop_nest) // Mark as 'thread' so this loop is split into threads and // serial - c->gpu_label = thread; + c->gpu_label = GPU_parallelism::Thread; c = c->parallelize_in_tiles(tiling, loop_nest, params, target, false, true); } } @@ -364,21 +364,21 @@ void State::set_gpu_store_site(const mapgpu_label == thread) { - site.gpu_store_memory_type = GPUMemoryType::registers; + if (candidate_block->gpu_label == GPU_parallelism::Thread) { + site.gpu_store_memory_type = GPUMemoryType::Registers; type_has_been_set = true; break; } if (candidate_block->is_root()) { - site.gpu_store_memory_type = GPUMemoryType::global; + site.gpu_store_memory_type = GPUMemoryType::Global; type_has_been_set = true; break; } - if (candidate_block->gpu_label == block) { + if (candidate_block->gpu_label == GPU_parallelism::Block) { site.store = candidate_block; - site.gpu_store_memory_type = GPUMemoryType::shared; + site.gpu_store_memory_type = GPUMemoryType::Shared; type_has_been_set = true; break; } @@ -413,7 +413,7 @@ bool State::compute_featurization(const FunctionDAG &dag, const Anderson2021Para if (s.compute == nullptr) { s.compute = feature_root.get(); s.store = feature_root.get(); - 
s.gpu_store_memory_type = GPUMemoryType::global; + s.gpu_store_memory_type = GPUMemoryType::Global; } } } @@ -592,7 +592,7 @@ bool State::exceeds_serial_extents_limit(const Target &target) const { int64_t State::get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const { int64_t result = 0; - if (loop->gpu_label == thread) { + if (loop->gpu_label == GPU_parallelism::Thread) { return result; } diff --git a/src/autoschedulers/anderson2021/ThreadInfo.h b/src/autoschedulers/anderson2021/ThreadInfo.h index 803c51003667..3482c2e1002d 100644 --- a/src/autoschedulers/anderson2021/ThreadInfo.h +++ b/src/autoschedulers/anderson2021/ThreadInfo.h @@ -17,7 +17,7 @@ namespace Halide { namespace Internal { namespace Autoscheduler { -#define MAX_THREADS_PER_BLOCK 1024 +static constexpr int MAX_THREADS_PER_BLOCK = 1024; struct LoopNest; diff --git a/src/autoschedulers/anderson2021/Tiling.cpp b/src/autoschedulers/anderson2021/Tiling.cpp index e3dae9ba72e9..780151e6b9ec 100644 --- a/src/autoschedulers/anderson2021/Tiling.cpp +++ b/src/autoschedulers/anderson2021/Tiling.cpp @@ -24,17 +24,17 @@ bool equal_to_existing_size(const std::vector &s, const std::vector> generate_serial_tilings(const vector &s, int d, - int last_d, - int vectorized_index, - const vector &vec_dim_serial_sizes, - bool filter_small_outer_extents, - bool allow_inner_ones) { - vector> result; +std::vector> generate_serial_tilings(const std::vector &s, int d, + int last_d, + int vectorized_index, + const std::vector &vec_dim_serial_sizes, + bool filter_small_outer_extents, + bool allow_inner_ones) { + std::vector> result; if (d == -1) { result.emplace_back(); } else { - vector> v; + std::vector> v; v = generate_serial_tilings(s, d - 1, last_d, vectorized_index, vec_dim_serial_sizes, filter_small_outer_extents, allow_inner_ones); for (auto t : v) { t.push_back(0); @@ -90,14 +90,14 @@ vector> generate_serial_tilings(const vector &s, int d, // producer-consumer fusion, or tiling for parallelism. 
// inner_sizes is optional vector of fixed sizes to choose from for inner loop. // used for GPU schedules when we split a 'none' loop into a parallel loop and a serial loop -vector> generate_tilings(const vector &s, int d, int factor, - bool allow_splits, - const vector &inner_sizes) { - vector> result; +std::vector> generate_tilings(const std::vector &s, int d, int factor, + bool allow_splits, + const std::vector &inner_sizes) { + std::vector> result; if (d == -1) { result.emplace_back(); } else { - vector> v; + std::vector> v; v = generate_tilings(s, d - 1, factor, allow_splits); // If we're already generated too many tiling configurations // for the inner loops, search the outer loops with coarser @@ -199,7 +199,7 @@ vector> generate_tilings(const vector &s, int d, int fa // Moves vectorized dimension first and also removes dimensions with size 1 // to reflect actual thread dimensions when loop nests are lowered -void lowered_dims(const vector &size, int vector_loop_i, vector &lowered_size) { +void lowered_dims(const std::vector &size, int vector_loop_i, std::vector &lowered_size) { if (vector_loop_i >= 0 && size[vector_loop_i] > 1) { lowered_size.push_back(size[vector_loop_i]); } @@ -218,11 +218,14 @@ void lowered_dims(const vector &size, int vector_loop_i, vector> generate_gpu_tilings(const vector> &stage_sizes, - const vector> &pure_dims, - const vector &max_s, - int d, const vector &vectorized_indices, bool serial_inner, bool is_compute_root_stage) { - vector> result; +std::vector> generate_gpu_tilings(const std::vector> &stage_sizes, + const std::vector> &pure_dims, + const std::vector &max_s, + int d, + const std::vector &vectorized_indices, + bool serial_inner, + bool is_compute_root_stage) { + std::vector> result; if (d == -1) { result.emplace_back(); } else { @@ -234,7 +237,7 @@ vector> generate_gpu_tilings(const vector> &stag innermost_warp_extent = 1; } - vector> v; + std::vector> v; v = generate_gpu_tilings(stage_sizes, pure_dims, max_s, d - 1, 
vectorized_indices, serial_inner, is_compute_root_stage); for (auto t : v) { @@ -246,13 +249,13 @@ vector> generate_gpu_tilings(const vector> &stag // have more than three dimensions with ext > 1, or result in large serial loops std::function is_valid_tiling = [&]() { if (d == ((int)(stage_sizes[0].size()) - 1)) { - vector lowered_size, thread_t; + std::vector lowered_size, thread_t; thread_t = t; lowered_dims(thread_t, vectorized_indices[0], lowered_size); // see how tiling will be applied to other stages of this func and update max_s accordingly - vector new_max_s = max_s; + std::vector new_max_s = max_s; for (size_t stage = 0; stage < pure_dims.size(); stage++) { - vector stage_thread_t, stage_lowered_size; + std::vector stage_thread_t, stage_lowered_size; for (int i : pure_dims[stage]) { if (i >= 0) { stage_thread_t.push_back(thread_t[i]); diff --git a/src/autoschedulers/anderson2021/Tiling.h b/src/autoschedulers/anderson2021/Tiling.h index 3753cffc59bf..fb82672b2e06 100644 --- a/src/autoschedulers/anderson2021/Tiling.h +++ b/src/autoschedulers/anderson2021/Tiling.h @@ -4,8 +4,6 @@ #include #include -using std::vector; - namespace Halide { namespace Internal { namespace Autoscheduler { @@ -14,12 +12,12 @@ bool all_ones(const std::vector &nums); bool equal_to_existing_size(const std::vector &s, const std::vector &nums); -vector> generate_serial_tilings(const vector &s, int d, - int last_d, - int vectorized_index, - const vector &vec_dim_serial_sizes, - bool filter_small_outer_extents = false, - bool allow_inner_ones = false); +std::vector> generate_serial_tilings(const std::vector &s, int d, + int last_d, + int vectorized_index, + const std::vector &vec_dim_serial_sizes, + bool filter_small_outer_extents = false, + bool allow_inner_ones = false); // Given a multi-dimensional box of dimensionality d, generate a list // of candidate tile sizes for it, logarithmically spacing the sizes @@ -29,13 +27,13 @@ vector> generate_serial_tilings(const vector &s, int d, // 
producer-consumer fusion, or tiling for parallelism. // inner_sizes is optional vector of fixed sizes to choose from for inner loop. // used for GPU schedules when we split a 'none' loop into a parallel loop and a serial loop -vector> generate_tilings(const vector &s, int d, int factor, - bool allow_splits, - const vector &inner_sizes = vector()); +std::vector> generate_tilings(const std::vector &s, int d, int factor, + bool allow_splits, + const std::vector &inner_sizes = std::vector()); /** moves vectorized dimension first and also removes dimensions with size 1 to reflect actual thread dimensions when loop nests are lowered **/ -void lowered_dims(const vector &size, int vector_loop_i, vector &lowered_size); +void lowered_dims(const std::vector &size, int vector_loop_i, std::vector &lowered_size); // creates tilings for gpu threads loops. // Innermost thread loop is always the vectorized dim and its extent is a multiple of 32. @@ -45,10 +43,13 @@ void lowered_dims(const vector &size, int vector_loop_i, vector> generate_gpu_tilings(const vector> &stage_sizes, - const vector> &pure_dims, - const vector &max_s, - int d, const vector &vectorized_indices, bool serial_inner, bool is_compute_root_stage); +std::vector> generate_gpu_tilings(const std::vector> &stage_sizes, + const std::vector> &pure_dims, + const std::vector &max_s, + int d, + const std::vector &vectorized_indices, + bool serial_inner, + bool is_compute_root_stage); } // namespace Autoscheduler } // namespace Internal diff --git a/src/autoschedulers/anderson2021/test/tiling.cpp b/src/autoschedulers/anderson2021/test/tiling.cpp index 6e0827c9ede8..748f565d9e8e 100644 --- a/src/autoschedulers/anderson2021/test/tiling.cpp +++ b/src/autoschedulers/anderson2021/test/tiling.cpp @@ -7,7 +7,7 @@ using namespace Halide; using namespace Halide::Internal; using namespace Halide::Internal::Autoscheduler; -using tilings_t = vector>; +using tilings_t = std::vector>; std::string to_string(const tilings_t &tilings) { 
std::ostringstream s; @@ -42,13 +42,13 @@ void Halide::Internal::Autoscheduler::expect_eq(int line, const tilings_t &expec void test_serial_tilings() { { // Don't split small, odd extents - vector s; + std::vector s; s.push_back(3); - vector> expected; + std::vector> expected; expected.push_back({3}); - vector> actual = generate_serial_tilings(s, 0, 0, 0, {}, false, true); + std::vector> actual = generate_serial_tilings(s, 0, 0, 0, {}, false, true); EXPECT_EQ(expected, actual); @@ -69,42 +69,42 @@ void test_serial_tilings() { } { - vector s; + std::vector s; s.push_back(8); - vector> expected; + std::vector> expected; expected.push_back({8}); expected.push_back({4}); expected.push_back({2}); - vector> actual = generate_serial_tilings(s, 0, 0, 0, {}, false, true); + std::vector> actual = generate_serial_tilings(s, 0, 0, 0, {}, false, true); EXPECT_EQ(expected, actual); } { - vector s; + std::vector s; s.push_back(8); - vector> expected; + std::vector> expected; // If 'filter_small_outer_extents' is true, don't split small extents - vector> actual = generate_serial_tilings(s, 0, 0, 0, {}, true, true); + std::vector> actual = generate_serial_tilings(s, 0, 0, 0, {}, true, true); EXPECT_EQ(expected, actual); } { - vector s; + std::vector s; s.push_back(8); - vector> expected; + std::vector> expected; expected.push_back({8}); expected.push_back({4}); expected.push_back({2}); // If 'filter_small_outer_extents' is true but we're not considering the // vectorized_loop_index, do split - vector> actual = generate_serial_tilings(s, 0, 0, 1, {}, true, true); + std::vector> actual = generate_serial_tilings(s, 0, 0, 1, {}, true, true); EXPECT_EQ(expected, actual); } @@ -112,23 +112,23 @@ void test_serial_tilings() { // Test that generate_gpu_tilings does not exit when it encounters a tiling // option with too many threads { - vector> stage_sizes; + std::vector> stage_sizes; stage_sizes.push_back({16, 16, 32}); - vector> pure_dims; + std::vector> pure_dims; pure_dims.push_back({0, 
1, 2}); - vector max_s; + std::vector max_s; max_s.push_back(16); max_s.push_back(16); max_s.push_back(2); - vector vectorized_indices; + std::vector vectorized_indices; vectorized_indices.push_back(0); bool serial_inner = true; - vector> expected; + std::vector> expected; expected.push_back({16, 1, 2}); expected.push_back({16, 1, 4}); expected.push_back({16, 1, 8}); @@ -148,21 +148,21 @@ void test_serial_tilings() { } { - vector> stage_sizes; + std::vector> stage_sizes; stage_sizes.push_back({128}); - vector> pure_dims; + std::vector> pure_dims; pure_dims.push_back({0}); - vector max_s; + std::vector max_s; max_s.push_back(1); - vector vectorized_indices; + std::vector vectorized_indices; vectorized_indices.push_back(0); bool serial_inner = false; - vector> expected; + std::vector> expected; expected.push_back({16}); expected.push_back({32}); expected.push_back({64}); From 5fab2952e91ffb471d798fe9304bd69ab70da1fa Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 31 Mar 2023 00:47:20 -0400 Subject: [PATCH 60/63] Remove HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API code --- .../anderson2021/AutoSchedule.cpp | 87 ------------------- 1 file changed, 87 deletions(-) diff --git a/src/autoschedulers/anderson2021/AutoSchedule.cpp b/src/autoschedulers/anderson2021/AutoSchedule.cpp index 4b6b95cd309e..8ff7e3560799 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.cpp +++ b/src/autoschedulers/anderson2021/AutoSchedule.cpp @@ -27,48 +27,6 @@ HL_PERMIT_FAILED_UNROLL Set to 1 to tell Halide not to freak out if we try to unroll a loop that doesn't have a constant extent. Should generally not be necessary, but sometimes the autoscheduler's model for what will and will not turn into a constant during lowering is inaccurate, because Halide isn't perfect at constant-folding. 
-#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - - Most of the settings in this Autoscheduler are controlled by the values specified via - an `autoscheduler.fieldname` GeneratorParam, as listed in the Anderson2021Params struct; - this is the preferred way to set these. - - For now, however, you can (instead) control these settings via env vars; - doing so requires that you compile all of Halide with HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - defined. (Note that this ability is deprecated, and likely to be removed in Halide 16.) - - That said, here are the (legacy) env vars you can still use when HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - is defined: - - HL_BEAM_SIZE - Beam size to use in the beam search. Defaults to 32. Use 1 to get a greedy search instead. - - HL_RANDOM_DROPOUT - percent chance of accepting each state in the beam. Normalized by the number of decisions made, so 5 would be there's a 5 percent chance of never rejecting any states. - - HL_SEED - Random seed used by the random dropout. - - HL_WEIGHTS_DIR - When training or schedule, read weights from this directory or file - (if path ends in `.weights` it is written as a single file, otherwise a directory of files) - - HL_NO_SUBTILING - If set to 1, limits the search space to that of Mullapudi et al. - - HL_SEARCH_SPACE_OPTIONS - Allow/disallow search space options to be considered by the autoscheduler. - Expects a string of four 0/1 values that allow/disallow the following options: compute root, inline, compute at the block level, compute at the thread level e.g. 1000 would allow compute root only - - HL_RANDOMIZE_TILINGS - If set, only a random subset of the generated tilings for each stage will be accepted into the beam - - HL_FREEZE_INLINE_COMPUTE_ROOT - If set, run a pre-pass where only compute_root and inline scheduling options are considered. - The cheapest stages (according to the cost model) have these decisions 'frozen' for the remaining autoscheduling passes. 
- -#endif - #ifdef HALIDE_AUTOSCHEDULER_ALLOW_CYOS HL_CYOS @@ -204,21 +162,6 @@ struct AutoSchedule { IntrusivePtr optimal_schedule(int beam_size); }; -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API -template -T get_scalar_env_var(const char *nm, T def = T()) { - auto str = get_env_variable(nm); - if (str.empty()) { - return def; - } - std::istringstream iss(str); - T t; - iss >> t; - user_assert(!iss.fail() && iss.get() == EOF) << "Unable to parse: " << str; - return t; -} -#endif - // Decide whether or not to drop a beam search state. Used for // randomly exploring the search tree for autotuning and to generate // training data. @@ -682,9 +625,6 @@ void generate_schedule(const std::vector &outputs, } if (auto_scheduler_results) { -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - auto_scheduler_results->scheduler_name = "Anderson2021"; -#endif auto_scheduler_results->schedule_source = optimal->schedule_source; { std::ostringstream out; @@ -720,32 +660,6 @@ void generate_schedule(const std::vector &outputs, } struct Anderson2021 { -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms_in, AutoSchedulerResults *results) { - std::vector outputs; - for (const Func &f : p.outputs()) { - outputs.push_back(f.function()); - } - Anderson2021Params params; - params.parallelism = params_in.parallelism; - params.beam_size = get_scalar_env_var("HL_BEAM_SIZE", 32); - params.random_dropout = get_scalar_env_var("HL_RANDOM_DROPOUT", 100); - params.random_dropout_seed = get_scalar_env_var("HL_SEED", (int)time(nullptr)); - params.weights_path = get_scalar_env_var("HL_WEIGHTS_DIR"); - params.disable_subtiling = get_scalar_env_var("HL_NO_SUBTILING", 0); - params.randomize_tilings = get_scalar_env_var("HL_RANDOMIZE_TILINGS", 0); - params.search_space_options = get_scalar_env_var("HL_SEARCH_SPACE_OPTIONS", "1111"); - params.freeze_inline_compute_root = get_scalar_env_var("HL_FREEZE_INLINE_COMPUTE_ROOT", 0); - 
params.partial_schedule_path = get_scalar_env_var("PARTIAL_SCHEDULE", ""); - params.num_passes = get_scalar_env_var("HL_NUM_PASSES", 0); - params.stack_factor = get_scalar_env_var("HL_STACK_FACTOR", 0.95f); - params.shared_memory_limit_kb = get_scalar_env_var("HL_SHARED_MEMORY_LIMIT", 48); - params.shared_memory_sm_limit_kb = get_scalar_env_var("HL_SHARED_MEMORY_SM_LIMIT", 96); - params.active_block_limit = get_scalar_env_var("HL_ACTIVE_BLOCK_LIMIT", 32); - params.active_warp_limit = get_scalar_env_var("HL_ACTIVE_WARP_LIMIT", 64); - Autoscheduler::generate_schedule(outputs, target, params, results); - } -#else void operator()(const Pipeline &p, const Target &target, const AutoschedulerParams ¶ms_in, AutoSchedulerResults *results) { internal_assert(params_in.name == "Anderson2021"); @@ -777,7 +691,6 @@ struct Anderson2021 { Autoscheduler::generate_schedule(outputs, target, params, results); results->autoscheduler_params = params_in; } -#endif }; REGISTER_AUTOSCHEDULER(Anderson2021) From 766a0107fb4a025511c5339fcbab174a64f9707c Mon Sep 17 00:00:00 2001 From: aekul Date: Fri, 31 Mar 2023 01:02:35 -0400 Subject: [PATCH 61/63] Remove HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API code --- test/autoschedulers/anderson2021/test.cpp | 96 ----------------------- 1 file changed, 96 deletions(-) diff --git a/test/autoschedulers/anderson2021/test.cpp b/test/autoschedulers/anderson2021/test.cpp index 7235d50f5ef8..980d1ce24c9f 100644 --- a/test/autoschedulers/anderson2021/test.cpp +++ b/test/autoschedulers/anderson2021/test.cpp @@ -8,11 +8,7 @@ int main(int argc, char **argv) { load_plugin("autoschedule_anderson2021"); constexpr int hardware_parallelism = 80; -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - MachineParams params(hardware_parallelism, 1, 1); -#else AutoschedulerParams params = {"Anderson2021", {{"parallelism", std::to_string(hardware_parallelism)}}}; -#endif // Use a fixed target for the analysis to get consistent results from this test. 
Target target("x86-64-linux-sse41-avx-avx2-cuda"); @@ -28,11 +24,7 @@ int main(int argc, char **argv) { h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(h).auto_schedule(target, params); -#else Pipeline(h).apply_autoscheduler(target, params); -#endif } if (true) { @@ -52,11 +44,7 @@ int main(int argc, char **argv) { h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(h).auto_schedule(target, params); -#else Pipeline(h).apply_autoscheduler(target, params); -#endif } if (true) { @@ -69,11 +57,7 @@ int main(int argc, char **argv) { h.set_estimate(x, 0, 2048).set_estimate(y, 0, 2048); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(h).auto_schedule(target, params); -#else Pipeline(h).apply_autoscheduler(target, params); -#endif } // Smaller footprint stencil -> smaller tiles @@ -86,11 +70,7 @@ int main(int argc, char **argv) { h.set_estimate(x, 0, 2048).set_estimate(y, 0, 2048); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(h).auto_schedule(target, params); -#else Pipeline(h).apply_autoscheduler(target, params); -#endif } // A stencil chain @@ -109,11 +89,7 @@ int main(int argc, char **argv) { } f[N - 1].set_estimate(x, 0, 2048).set_estimate(y, 0, 2048); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(f[N - 1]).auto_schedule(target, params); -#else Pipeline(f[N - 1]).apply_autoscheduler(target, params); -#endif } // An outer product @@ -124,11 +100,7 @@ int main(int argc, char **argv) { f.set_estimate(x, 0, 2048).set_estimate(y, 0, 2048); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(f).auto_schedule(target, params); -#else Pipeline(f).apply_autoscheduler(target, params); -#endif } // A separable downsample that models the start of local_laplacian @@ -147,11 +119,7 @@ int main(int argc, char **argv) { downx(x, y, k) = downy(2 * x - 1, y, k) + downy(2 * x, y, k) + downy(2 * x + 1, y, k) + downy(2 * x + 2, y, k); 
downx.set_estimate(x, 1, 1022).set_estimate(y, 1, 1022).set_estimate(k, 0, 256); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(downx).auto_schedule(target, params); -#else Pipeline(downx).apply_autoscheduler(target, params); -#endif } // A Func with multiple stages, some of which include additional loops @@ -170,11 +138,7 @@ int main(int argc, char **argv) { g.set_estimate(x, 1, 1022).set_estimate(y, 1, 1022); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(g).auto_schedule(target, params); -#else Pipeline(g).apply_autoscheduler(target, params); -#endif } if (true) { @@ -198,11 +162,7 @@ int main(int argc, char **argv) { after[4].set_estimate(x, 0, 1024).set_estimate(y, 0, 1024); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(after[4]).auto_schedule(target, params); -#else Pipeline(after[4]).apply_autoscheduler(target, params); -#endif } if (true) { @@ -221,11 +181,7 @@ int main(int argc, char **argv) { out.set_estimate(j, 0, 1024).set_estimate(i, 0, 1024); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(out).auto_schedule(target, params); -#else Pipeline(out).apply_autoscheduler(target, params); -#endif } if (true) { @@ -255,11 +211,7 @@ int main(int argc, char **argv) { p3[N - 1].set_estimate(x, 0, 1024).set_estimate(y, 0, 1024); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(p3[N - 1]).auto_schedule(target, params); -#else Pipeline(p3[N - 1]).apply_autoscheduler(target, params); -#endif } if (true) { @@ -279,11 +231,7 @@ int main(int argc, char **argv) { out.set_estimate(x, 0, 10); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(out).auto_schedule(target, params); -#else Pipeline(out).apply_autoscheduler(target, params); -#endif } if (true) { @@ -298,11 +246,7 @@ int main(int argc, char **argv) { h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(h).auto_schedule(target, params); -#else Pipeline(h).apply_autoscheduler(target, params); -#endif } if 
(true) { @@ -320,11 +264,7 @@ int main(int argc, char **argv) { a.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); b.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline({a, b}).auto_schedule(target, params); -#else Pipeline({a, b}).apply_autoscheduler(target, params); -#endif } if (true) { @@ -335,11 +275,7 @@ int main(int argc, char **argv) { g(x, y) = f(x, y); g.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(g).auto_schedule(target, params); -#else Pipeline(g).apply_autoscheduler(target, params); -#endif } if (true) { @@ -349,11 +285,7 @@ int main(int argc, char **argv) { f(x, y) = im(x, y) * 7; f.set_estimate(x, 0, 3).set_estimate(y, 0, 5); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(f).auto_schedule(target, params); -#else Pipeline(f).apply_autoscheduler(target, params); -#endif } if (true) { @@ -370,11 +302,7 @@ int main(int argc, char **argv) { .set_estimate(t, 0, 3) .set_estimate(u, 0, 2) .set_estimate(v, 0, 6); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(f).auto_schedule(target, params); -#else Pipeline(f).apply_autoscheduler(target, params); -#endif } if (true) { @@ -393,11 +321,7 @@ int main(int argc, char **argv) { out1.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); out2.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline({out1, out2}).auto_schedule(target, params); -#else Pipeline({out1, out2}).apply_autoscheduler(target, params); -#endif } if (true) { @@ -422,11 +346,7 @@ int main(int argc, char **argv) { g(x, y) = f[N - 1](x, y) + f[0](clamp(cast(sin(x) * 10000), 0, 100000), clamp(cast(sin(x * y) * 10000), 0, 100000)); g.set_estimate(x, 0, 2048).set_estimate(y, 0, 2048); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(g).auto_schedule(target, params); -#else Pipeline(g).apply_autoscheduler(target, params); -#endif } if (true) { @@ -441,11 
+361,7 @@ int main(int argc, char **argv) { g(x, y) = f(x, y); g.set_estimate(x, 0, 10).set_estimate(y, 0, 2048); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(g).auto_schedule(target, params); -#else Pipeline(g).apply_autoscheduler(target, params); -#endif } if (true) { @@ -477,11 +393,7 @@ int main(int argc, char **argv) { out(x, y) = up[0](x, y); out.set_estimate(x, 0, 2048).set_estimate(y, 0, 2048); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(out).auto_schedule(target, params); -#else Pipeline(out).apply_autoscheduler(target, params); -#endif } if (true) { @@ -499,11 +411,7 @@ int main(int argc, char **argv) { casted(x, y) = scan(x, y); casted.set_estimate(x, 0, 2000).set_estimate(y, 0, 2000); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(casted).auto_schedule(target, params); -#else Pipeline(casted).apply_autoscheduler(target, params); -#endif } if (true) { @@ -519,11 +427,7 @@ int main(int argc, char **argv) { f.set_estimate(x, 0, 2000).set_estimate(y, 0, 2000); output.set_estimate(i, 0, 256); -#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API - Pipeline(output).auto_schedule(target, params); -#else Pipeline(output).apply_autoscheduler(target, params); -#endif } return 0; From 25da22a9cb738a6aa3cfb63f25910ab328975bbd Mon Sep 17 00:00:00 2001 From: aekul Date: Sun, 2 Apr 2023 02:06:16 -0400 Subject: [PATCH 62/63] Tidy up test input, move inner_extent --- src/autoschedulers/anderson2021/State.cpp | 12 ++++++++---- test/autoschedulers/anderson2021/test.cpp | 9 ++++++--- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/autoschedulers/anderson2021/State.cpp b/src/autoschedulers/anderson2021/State.cpp index 454a6ab3e850..a1958013e922 100644 --- a/src/autoschedulers/anderson2021/State.cpp +++ b/src/autoschedulers/anderson2021/State.cpp @@ -217,10 +217,6 @@ void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest *loop_nest continue; } - // Make the vectorized dimension of the inner loop 32 (or as - // 
close as possible) - int64_t inner_extent = std::min(c->size[vectorized_loop_index], (int64_t)32); - if (c->stage->index == 0) { vector tiling(c->node->dimensions, 1); @@ -228,6 +224,10 @@ void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest *loop_nest c = c->parallelize_in_tiles(tiling, loop_nest, params, target, true, false); if (vectorized_loop_index >= 0) { + // Make the vectorized dimension of the inner loop 32 (or as + // close as possible) + int64_t inner_extent = std::min(c->size[vectorized_loop_index], (int64_t)32); + tiling[vectorized_loop_index] = inner_extent; } // Split parallelized into blocks and threads @@ -257,6 +257,10 @@ void State::FeatureLoopNestMutator::split_compute_root_loops(LoopNest *loop_nest // outer_vec_extent and instead only have a single thread vector thread_tiling(c->node->dimensions, 1); if (vectorized_loop_index >= 0) { + // Make the vectorized dimension of the inner loop 32 (or as + // close as possible) + int64_t inner_extent = std::min(c->size[vectorized_loop_index], (int64_t)32); + thread_tiling[c->stage->loop[vectorized_loop_index].pure_dim] = inner_extent; } diff --git a/test/autoschedulers/anderson2021/test.cpp b/test/autoschedulers/anderson2021/test.cpp index 980d1ce24c9f..b866769a12bd 100644 --- a/test/autoschedulers/anderson2021/test.cpp +++ b/test/autoschedulers/anderson2021/test.cpp @@ -3,9 +3,12 @@ using namespace Halide; int main(int argc, char **argv) { - // Loads libautoschedule_anderson2021.so - // which is presumed to be in current library search path - load_plugin("autoschedule_anderson2021"); + if (argc != 2 || !strlen(argv[1])) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + load_plugin(argv[1]); constexpr int hardware_parallelism = 80; AutoschedulerParams params = {"Anderson2021", {{"parallelism", std::to_string(hardware_parallelism)}}}; From 1a842373397cbc58c742ce8e41243643da25441a Mon Sep 17 00:00:00 2001 From: aekul Date: Mon, 3 Apr 2023 02:09:02 -0400 Subject: [PATCH 
63/63] clang-format --- src/autoschedulers/anderson2021/Featurization.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/autoschedulers/anderson2021/Featurization.h b/src/autoschedulers/anderson2021/Featurization.h index 60e2718367c5..f1c0b3dbed71 100644 --- a/src/autoschedulers/anderson2021/Featurization.h +++ b/src/autoschedulers/anderson2021/Featurization.h @@ -431,7 +431,6 @@ struct ScheduleFeatures { for (size_t i = 0; i < n_features; i++) { if ((*this)[i] != other[i]) { return false; - } } return true;

LGdGOme$6aKHnfPHzp;?{w(c2-*|BiqksGJ;R5A|y{mMHR_Zzn@P;>Cr)UM-pPJ$u!zh=*!Vrt%hn_MRDei zayb6%JoG$OrnAK+6Cs6ssJ>rK|#`%jGb<+*KmD*7=FY@UgGAHGCc*(<1&B88t=|2zlFJ5!rnAP1!Z47m3QtFe*e&!Vv;S2H3SSR``DT2`DH_JroShJ~y^dwV zoAKP6c=(o`0n2PyXxb#urH0kIi$XZ3Y?${O_rULov2K6uwvg zhdmPvTb>1v>QQFrl>jy& z`y$KDsb$pUkFnz>h1e=<%e3zhVAGWW#`^P8#(4HmlwY}xRbOxoyx*UPV>SoySKm#j zIW!Nt7q_xDSu=4{OgO8{S&jcnufo;kqQKp*gilK4sKe7NTz6yX+N$-JahN zq#@3>)qcUnCi9Wq_6Z9vL{}_!sjWP)_&sy%!AZ9C@O+$RRK_zBWRt!%O4A8v7eU7w zE|h$*Ktq>#bV;%o4iPEhcQyjnv`2vZflchBXVdY}Uf9A+{^G%uOdDj}rd)$SDPMFw5W%^k<^xL( zZ2-+}OR+Irm65qOh5htJ2omqUX7$9QIFBP^m^4E1jLkwpeiFgzNGRL?c|MeB-a!ve zJg5A2IM(egLY-aLaNhn(c3fd2s5JONy~`lD9Ps9?EFD=;!k5br`$&U_I)5b9LO{r zyM}*%`a=G~Xw347$1X<-JNj>-Y4?BNy+eUrIV%pWN37x9unD~Qyq#w%H6ODbt=LPq zCXkA=#*CN3PS6djM}w(1@Y#gTFuc1T+`RWfv4IlKVIl$A3hIS=PGCwDO^{AAY%P2mTnq zC|;?>$1M}#;8auCD6Yqf?X8A04HJk`NGiIAOHeg+3taYN5|KQx3Ma1YUMMI$DyAhTydjo1;f1vBx zc~s)$G?y(d(zw#F2A?*cXsihFt7l&hV zR$%cm3VY5~q2Zhu5WDq|wN6*&5EW$ec3p+lF@+de?k>prya0tM5ul*<0iGV}#uK0K zgHM?fm9&fl<@f4Hbl%}(k4J2&LLr7dONFmv9w@OumOd*8q{_ynpfsAzcrNl}8c$E7 z>i!lWUw@aq;3tO9G%A?ht;og++T!?^YiLt5guk0=pk<;EZvWGcRevaRMB+PG-Wy_W zJlO?R-CEFZ;Q^bLxZ~?}id369fyJte&~TUqU>pmbSbQ&;UZWe7jod>t)uf~a1*O-}Qvalq364_IyN`D7Ck^>d? 
zG+I~-o)O@0S+tc#E^1+-vNiDh0SQ>H<-Fa8(w^`B#G= z`|T>o%Toqp@t^oDB_3t|xdLOI%AEcFlxMI0o_EpuI7&$D#>0ViaQZ|JsASEBBWokk zGP*xwJR!BEXP{mI2<>Q0^!hH@Xg%}loiXZM`Uj!T8SLt1m%vus2XlmnVeCUX?|0QUbY2t)#i9-DujDW6 z`~D3yPLt(rxKslNwxtX7g-`6Q4Wl^b-(|2o-Hd^I@7UBGEK08RWB%?HrrW|w;rmi` z{HOhpT^`fNTGiJ=e3-8Oh3OwLb*B}a@{uMny_yW`^$pFBd!ozb&zzyhH+knmC*sxU zudqTl7`*@d0u$RA_;SH=cKi4i?B8C1BMw=N0$Wnae5QCf;WAEWj)D2w*O+4Oc>FsY ziIWdxfLBf@E|Q&2$2K-HG0j`CNLQK~inj5BT6U0zfB?olUj%!fg)p(x|G+C*3#QF- z8g5vhhug~pIQg*()it8v#~T}_(tR#kv?aiHdo6^rIP?;~g|iyFanNiAPxtF2Odhxj zsk2-#rR67Br}cvQ`!wA1(vavo_+a7IAjYL?9yDYuB4UjbiQ5EMnzrf~Mtkj~?$r!v z=^SC=CEmd6z4tLuI|DLjr9t($2EI-{itcek;G24!as$4wey<7zGoA_1(YM-(pI`$2Q2(2I-Sx@$LvoKld zW`inL6=>Kj0al-5L07=hC749m)1kzAue`^xwm(T#?p($7Tm0F|^UCxCD$puVe@Ka$ z53ypeV1LtAEOgd^N1|D1qENv&4Yr};r3Wa!L>n~{=EK`Pl6XXKI^0;CfzCgAa7K7N zDA$L>Xu-?!s5RSmY{? zVf@88Ow56Bn3thRMlKye(b{YbL0S5;=o372olSQog`vngGxBw?gN?bJ2QM#+v7aAq zMw?{|fVB~U+TXf#d(RVCbgcnjyK2$2k?pL>hR<*-`Z902R#;`Kju5r#MELgiKD2*Y zkD(t!m@|T8{6pmmTwqnii`J?^ZI4rI$A2I2_qJ%L4!gu29=eEsm>m31dl9uA>cE{( z#=vl9H8W|RBoRH%h1KzIKr^R;9duD9JEPLUU2tzLM>sHKCeVbM=flm2i;Uji8K^IH z7avx2fOFME+Oq30M?!v>owF_pZXB1U(%mo0U9PQQSMM=Hhpq1PyyOhh*Hi-PuLb?O z?lSY}c_I@bHj8}QyA|mg%gSlsg|QmJ%)87(6(8;Syr@kp!2F3hTvHVIo~#Wce_#)! 
z+}MY0Dhjx9Sa2TXD_~h|9Ima_;iT4tGGW(!L3YVn=n4A>AL@MILh1%o(ko_~icjIz zDZNa+Alo*iD@4e00&-p-I3FZ}kbCYJoHs3i*XuLc$r4kbKKCSa_k4!%BO>^4-wpWB zxsUZd&;!XkrnBq!9E5lwmTAZd=SX}^g%fi2m5~C--T!Md{B{xSe1HyC$u*$s@HkvJ z-~bKAybxah| z1(VZSFtg_;=DZSUNAIFgL${52Hi5vE+#>w#;b)-AT}s&Kee8cPT5u#rz|qa}FyYEi zR>$~1Skx4YMVoq=%QA)RtAtPNKt?+J9kPMXdwe0WHy5fZMQEn~06e+*msy+WhZpjc z=~$u-Y;n5J#(VH#v3@*bsy+eL7kZ(k^;Bj7dg9;7txS8#b)5g=KJbVT^56Bc3nLnt z#f&L5?7M@Pt|X#CzAl7i|3QuKGN_QC1$~cmn8n9B!Bb)u(bznhB-Ko$cjg$Ur$3tvE=m;Z+sq%+Ap?ni8moD3cJf3QDV@<48j4A~kINL72y*_HdAV0Fh@ z{3_;yhE|F6{x(9_e^ZAuA1m>;`WGBNkqbx8>?H3;70B4UFR*>fE^7Au6KXy*CJ*cF ziFN#LV*9FJfYVE80`ET@tCq*kf3>*(`%Jobn-mPaKMR~0Gq9ppuFCwV_+Xyb|-vPM;br{SMLhFyaP;;8&qumfcUBqhHw9wQ{obVF%vw=_UJ1(;loWd4%EUoTXb;weH9qx zh?2L94beP14<$Cr;K8hLG}fL;Pu~28O-6#9r1u9|_moh0p0b*o6F8MGRl0|N?ppzo z{42^mE4G`_l=8gUThNUwFj z0=3WYp(Rfr&QG*uOUopvlwB56dUgQwZ%ic}hY}cxQ=gb|6IHtN`UCvs@CS?E4}kCG z7wnpy7twu@Fz3O%X{5mTEj}9w#qnN$RwsqvlY2wO{WY2KOv`I=?;?=Uz`mepWeD zeg4CgJgs7OXhyP5NoROn$}_P4x-^}$D;!LomasnuUt!kHt?+&PETrf43;dLLW|goX zu6+FpOs?j@RqZ>tsVNfmcy_?&?lQQLH=X_z-AZh?$&-HrN9Zk2TkPqOCB3Or$^38A z==!p4Plgmvs5#fQ@EfA{+eb%YSC1(=xqw;jgThJs-8;rh)$%_J72TEA9|BTLJ9PoOFb?; zeH6v_YJh!50nS`%4pA~GVBI^9j9inTQww@nC&wlD;m;_etlnm1z>4O}TlO%rDm8v#fAeXp4+%RXS<_fm)$%_kNo zJZaj%FYK#b%{QA-OJ~_M5JSN%@bqjvn~zhCv}pZfLh1xrNa=Lg z>HQc##b-c{TriAoi-2O=P`o5wi`VZLvK79yfYs6PUw;*=$jxLDBN2tXr-PWsI2=qS zbjnaWdt&ZQ+_PMk$QmpmKT7VwzMsV~wO_Cc4WiWBaT0B>J4@)6{qSeTF`VLdi;n#H z&F&7?MWY)ZVM+Z##&q&!ES@S(Q`<(_%Q5jp;*b>mlQV#?eby6xU=V5VUqdT%{@{C= zYK+JWhCOEwg4CA{bb)CpE%Kl=#OWzneDoIxJgHySj33wFHb?XzOiZ~TFkg6xLpvhBq6!7{36-pOPd1b}|vDr!)tO>Lha zWkWs=F^$8w@PD4%|GYNz3V?mm)2N-_Py7&d5)_u2fZ4!&dURqVRl~<$Epk4behSk;L5$9Pv*Wo<-S2Y6W$Iybm}mBZ(bajydQ%n z<+Di1oE6Yulm$XD#n`Rc$TJR7ChckVI4#KtM@NpM*yaMm~ zSJU54>A2@v3|^{|CGD@jK&WpCJN4H__<5)j@~o9GYr`EhnOBV^+#k$fhz;lruLFfW zf<1D>VNOxTba;HXk|D!NtkBXRPU{vANO@fXOoKSwTd)om6iG9#vqebEn^hQ<6bDj~ zaj4Sk&5PT60CQH}C*iq2!P|K?V#5n2IqDj_ILQUiga<(Sj5LESvwy(d?oG@F|1>!6 
z|CT*{Kbl!NZvdP#zc6FUR`51K8GpaK&qy5ER;jsWfIWG!hFKA&kL|lVAUQc3y+`}8 zv&#To_1@tlpK!LNXEta1wHT__XpP#|HOh1hTnYYb@MgJMw?xW{_pji?pqaxw??a2`H;dW&6? z$`RnqQFODs4HNXLP~PwoELZ)%3eLn4%d8FKlqI3YR#HMLS}59i&V3$=N>WoPDN>i=tw9&;5uNjLIZySxTZLiON#>-tT`n%YA;=?>anZ_T(Cv_qKu2 zDsjh0XFS}Uwj7cqWcl6YBWSctEV}6}W0~<;%pzNs?z-*+!vBQ7Io z^6v;yo_+yb8UDr%JXWV~1|KkApXBl975n->OcYhm`I zlcDa47i@jA1?1FqL9J>PsHLqT0Y^pfOwttVl0M@uzyIL4T!bj<^9Hn1u0)H?`t*^F zBqmqbVA1WT=qo%U-=YmL!elIeZt67%D_0WFMOkXBMfEPN z#OsJZ8L`}vUKqa5_SwE;v(~&OKNB*TS^N;qSP%-|MkaD`(b_oE*qvD~w1MY0QaQJ8 zI@EQYH|Uz_VPRH0o?W_>H7XcFc12Zzyk0Qu&Z>ciJY)WwLmhu&**dE(@*K$RHoARA0*eK9b>W*~a zc^N-0CK{^WtOYR56s%GUabl_pM8)kZ%q8V~_>xudtTYzqhK!`1herqvzO#f43;r^F zcfpEeBxV#=X3C+~EdybDoeXWNPbXX0HeNY=G;jAd1B*9^-MNpd!kKfU1l8xN!uWwj zy!LWGm@fMSY7P;w^4t#v6Xejs*b$~m6+zaupYZFv4_@+LM>x9^yO-+D=!SOdyX5%WB(yl@OD*?&r~4WWs8*yD`P(CbwzDF9^YRg! z6BErsWrHzZ+?&&Jb!55iiRiCt&wc&v4F?MHL^loRaot09bbaeeuq|?fbDq&82>xQw zt^++fa&&BB1ZXV@fqm9WOtxMVHkx0B4x^E{MbU~#>79YFS>IWnQ!CwBRmo=#iF0~K z75uS;r}KBUL4si(L{cdl8hnaf4H^(tnYO}PJ6)XKwM`VZbQo-73ZO&Grs$6s{F{3= z-0c`;7+$a!Bi!uasAnXTml(tysn79|TMrD+%)_qD!6<*Y8_FlOGVIHOi`CEBRm(0a z^XwNLXne_AMGRtY^KPQ*Wi0Hj?uFJgTgX$3K!^7|e3QvZQYk|0DM_Srto|@-cZs0y zH5?fnR-6MT} z?_uD|7?3KNhDHgcxI3^2Sa(0jE^49gO5CW!=|ADY>pL{R_Z=BklE$yQYRYE?+M>-- zUlOxM6E;aMM2V;I>{W)5ncmnM^5U00d)+h!S&Ia{kX%DtuO`4Q4^yz|J4y3bN3!yH z<4~dc1_?(iA?ZgrXXP9NrakXD=TDb;|43iD^gKXNu_RpDvxACWPKQTxrK!o*zt}R1 zH^ghB5BWn$oZ}{6B2;Ub^sKuAeZ_BKQ;s4n{Q`J-+X8&tAAmt|TDUP)njbjq$q%QO z!J5gzpzb|QIQ{)9c1#oM7pej8@+5~!O!#3~g> z3G1!SP#LLTyulACdRyERG5BFkc6})==q2hX?OTQC!|!2#c{X}SwLwwnPuyAL1O0wy zfM0nSYA0(!Q)(nwCf9(RLO!G%^%ZAJ3Giykb@-6|6D<#lv%cUV*rzW8C!0TEaLFQQ z z9be4BeVz_8Ww$}=66H5#E)(?}9b)fwQ_)nxjQO>H<<~0Cq`%jeu-?oxx@Fo#rpZNc zHPepaOIsN%GAsh;V=F-9ehnsl)D+J(DyZFmpS^EthRkQ}L?c4HdM78Nl+p)Q=XV{q zN54R)tnUEH5vbR%i2mlCM0H^k1pE#Fd-pnw9OZ!~dlq4yZz?;#>I1%ZH^D0jW}qw4 z534PQ*@|ho_|8HZDsl|KcfAi^w7!j4edz!fjRDv?q>0{fD}{>q2$8L55{u5~=)_T8 z;4dE!!NNg^U2OysH;&TZDWNbsu^4CQ+mf98%b?06@y4DYw2D^erN2%FVV428*qPzu 
zcij+@nQjuD69yA260wV4$Ily_gGOH@@YA9(U@CTUO|7d7Hcyih-C2E?$zun^i*iNd z)+Ujk6VK4clN|UOv0D&0dNG^hY7dFlyScG(ubE%*%R;DE;3tePXGxm^Og&U*!L-Ul zbnF#(?o4A04E$%0RaL_@EXa~SnOx5w2qkRM<++ffm`bztqrlVS6zFD33fEe0l56J6 z$cwTQU=Y|y&q+@a5?tm8kL!%kaEcm^o^qXkOmaBy$WWBo8$}rLtj=T-<|I`UzcjSO40GFW`k4| zp))6DvVG_4xnuM9!dG$J)b#x}c8X%*Rb(tWRTzSa_bsxiX9KzaKpWr3=fXIJBwALM z%fHKp-WD8$h3ZeiX;2+4uO{KZi$DmN_XK~-D$y65dFp+CKwkNwk%ymUCDmlysZ;rl^Dge?I8!qc zerB8@-k$j|_P8uKhP?GpxrO>~LF4Z&&}=MObn(brl7CwYRvpq7OxniM`qU8eR8<0w|4!gf zI@FQH#df4Gp#?rh7odAVgJ{u>K9R{kyD%-XfKQHK@JQ%|%7I-;FRB_kBgUg&5EcN-rwxy{MU zxSR$dX#cNmYaj!VY`B)V3d7#9h|$$&GN#JdT|u&#=Qh zPZ;&@I>By}1!|gVm|4$?!*%k&{?8o#&jKEZbK99EBL&|h9dJXu&vf^`Kv7a5+7&pl z-r@Da;2-8Vc-D^hYM742(^b*QzY={8)`78J9i-p70G1CeD2a51lsY>yJH(5`JiCr^ rRT99prbB7xL+~DDfg=?>V8FYA*NM|&wZ|g3Zn+fnUQvprZlB?Q@p@p+ literal 0 HcmV?d00001 diff --git a/src/autoschedulers/anderson2021/weights/camera_pipe.weights b/src/autoschedulers/anderson2021/weights/camera_pipe.weights new file mode 100644 index 0000000000000000000000000000000000000000..e134e1ef1b95a65af72c23ed6cf93441ea8744a9 GIT binary patch literal 20400 zcmXV%c~nl{_s5${Dn*lsP|B2~LOpk%r-Tqf5-L;rWGEp+gfx)md6Fchc~a8w+b^p0{t$WwG>%I4Rz29ee zAs1d^r&<~F!|E%ZjH%`9HuIt-M#*%!*Gu{^FAge_lZl1rYcS1q!?0K3Xf1mHUi1W` zY(pL+Ahnm%A(h0jJM;n+u6dB6ZwjYv}Xa z^QE|{Iyd;;db-5TQwcfMBhkce>3ISErANy6 zL@^%A6*`IK=Nz0`XO8b9j7i6mo7ASf6&viGNrA8oze!e)_>_my^0pFs@AGLaHQb9^ z_akzDe&JN6*TAH)QLIyQ28-B4wtDVjd|7jq&e(E>IKOw~T0dApJ5t`kiMT~HdeI2J zJRv}L$nM|{JI&+Lo#7;8-ZmoI9nSX{o537@xsxW|MaJWj7t_|{2a~uz@VV0fdhSlg zyc2FvbE6miUA_VLtO8$gs0;T*Hj^tK^Ki#^R~~gOgH<~u_{)tp&`qyA=+d=e+_7CZ zu)CQ{8qS$V7So}P7G4Wvk9u)wMxN!Q~z?8Iq6{Ee4)=bvWL(KU+ zipLgzVojg<;!3S?;Qo8fxo3JAW8P-M>Y-zF3t58~PI^tOYdAF-QK!P5PDEB-n)}jt zCsA1!PoHY_&>QPwxa|tLynGo0>iqxpQN|elc)laIwydf7vA5a%9Pr5E8gqy251+5{JI_Qe?gL8h6ZAY)bo%R^= zU85d^BL0y55f=eIS`C{Cq|F*#FRDhi}=mlpknq z{(#9>?Z$HsI*_Fp32O4T?9m0!Xik112QY{{+`gSsIcfU3eI4|Djwk)*dBpKs1le_H z0e|guIc|@71$47lY>1kTeP!{+vFP@L}zwL>dNo4q>m 
zyk1RTbp>*l|M|x(`YFOHd^i~W1dP^ANQq&%cn3JMXpp=O^eLBaf2+d$e?zEp=7`H zIBq;)K`vXSlJj%Y;lZXV6sWn%NN6;pz`pB-E5VjcundLGBYV*LYcF(d+JViVp0Ujf zouIM(J>E^qhE|#7cw`_KubQU9DYYjUZ|sd)JD)OU<99(@y&O7Tvz!kb%Ft};Sxh53 z5cMMpLNZ4|r^yqg;tE;QEPpHQHqR>X6 z31o+S*gG5fRN#*(zo=D(509$2vKCK3EY1lJ|JCQR3G--kE}zl6QI3+II4G|m3jB(z zoCe)7o?k{AT6W!KE!T&$l{zoMCASsLf*+IXv>K-PInbf6xiB^08ZEX6CUGw+@#g4! z64rQ>yJpQIDxodWG8$=}MGrMP&`F8^;OMe!sG7G6g3F_+he;Pv?-a(gd=pfdzJ%{s zzkxeL<_MMhagoZN|AreK7IL+d6=9*W1d+Jbhz}IaX}EDI(^{DhFQ1)&8=v~1)mDfK z%sXZ@pMFBWh6WJ$afZ`2c{9W)>|oP2RxoMLN8p{)JUZ+218Uga0ArDo zx7~h8X8n51T%MzYGq=d{e>-jCnraKuk|~$z`kP*K+8R^tl@>2>tEq!mvtKhaHE**= zLVkc`bra`gwgRb6ZG|`K%i+l8d@wLe1gFVAaj+>Fl#{|iZI&D)&a8wlb&*i|E14?h zOH=E$3S^)noE^M<1$Owh-4q&mOEZfDnGd z-G)v$mUx5)Pm?7(6|aNUBo&Gqb5LhA9YU6Vktm$xpuisz>b}Tq83xDJj%HOPaj`=<(4+QgLe@eX-hwy1ntIB~1nJ-##TU zFtO%EzOTT)HyFYS-2mAmF^uk275?Hb69hf`u|DDgsfY(!d(dGaj&KP82@|4AJPB97wZ7NwE%_-2Z)Rk?r^g&09 zJ>Wa%5vrAo!l&VOw3*?6OEe2npevu<=Zv!mf`n5u zSo$tNclCYHFv%2ZH(Ihzdq%Oh{WZ#dEx~U^d@Q{X3}clZP#Y!)*5312$#2=17&9H} z?{ZOnM;j|&^A%!#-GNrgESS1?Kc4?$gw<(P(6@un^d!H8q+@qs^|y9(X|Mo+2opn@ z!XIRvkudjavpB#1r#KYy)!BV3&!W=RAsY2xIvLOj0`7^8Oye1KG!@(jMa?`=H_bwU z4pp{tt`!#FtidR=7CgQ+3};>Jh5UmtXfo7`TTF^z?j}Qc(tLpHKjRAjv4AJ{@$zQE z*=xvO|1lRn+x@1wjYo-x&3~+x(LdILsb};IHbb9VKH7<12lwzH(6V~Y+=%t2zbbZ8 zM=c;8Qz^~%nvP1os~K`ng6n!lfQUX`LBGE?p!U=L!o!lguyC3_&FlOW`;?#>lyrps>D+G5cJCd(-b=Z0IXI746TK_?{-wvxjhNqc1M?n1%^SJvcws zfbtG-Y3xudbgaDywu(Azi(3%qhDJ8?`}$Sv`g|I6TxZe43vS{o$@SP_c@4gA41~L{ z!Xa%~352rwnARB2hV-6fJMMeo)es4K=UNF|GEt@#tzy_L6w7&(tVo@*2k_q2E_l2o z7e_?5)5k91j8;tpJgYnk_qazeu3|HBQ$EAUJ^Bq~Rw2v+1@?WpAmvxz#r1LB80vHl zf5`z6{+NjY!+ciltU0?jQv(e@6oOxTJlO0w4S8Z&%zKAg{Kne`Ldw^0lI9Yc{!bUa zznwz`lzyYMPz}sl^bp5ZzdynPti6I7 z$CjeXJZd2H6k9cB(!5;;WQi&t58dU^u!B>`+fiZsXf8lqUzOnV>odt| zZ9O=5N*v9c{{mNioD<^0<4G9QqHaMJn($-c*v5aLv`UV;^p;>%@n?)W>`4Bt8^bE& zeD?GU3p$XN28Ani=^^bYp!8w~Op}hGIydUdw&-inxUXTvaANOZ(HBmIlXl>^?i=3x z*JbR+@$abiG!Lg;mn6NC+02ixYx%>zCvohV3fHellYd}Z6kNH$rPkA`$^74SB<5BS 
z-Sck~=}}T9ef!+N!*v1IspcJB<)Fl>*S@1($@`g8D?LeD%_=%;-XprnE*)b}%h0qq zH(KE81~QTUwBhD!Xp0SlEtNa@d_@iJVjl&ldl*gLwXP>SBIK!e$rNh5eGPwl*cuvF z=}AWgg1CxHGD(nAC@Gf9Lle2CbBD zp+`3dln?TlP1nAn5x)p4Q+1%(t{Fw5Qeg3$m)Ns$51QGhLZjF>7+CuU9!rM70L@MNafL? zmOI4lmm5xY65#qr1G81=EMKtHo*Fu+lg#K`+_Auy^kwFt==dUX_pw}~#f z<4HD)DbP26%ZXNO8!EIbV@$3W&dLPzvTeuO4JC}!h6Na8nFAJoCua1)(}4Oqe7k3w z-0k&hT<+L&S~#$RyK7wlzu-$Cx%=lDZkg50Q5%&Y;mjWD-X%g-Rc^-p38(2#+of=F z@FHkxeFB&AB>MiqQz~SdNrY-Q;BC!9GHPB0>|8V2k#HX3OBEn3CX!w0tI67Ij(|?- zvlIO<86^wr@cK_}e)SX)Zs#clZri+O>O5}-*JyVa`Tc(lRB>SpTGYRNh4Rt<77YNVvQfCemnOE=7RCaOD&FzRy$QBa!D$1_Q2V#x``8V$Np*HGu&L8c*ZIkX%q#k(1;z;T?yx7j(BtE;EWmE1I!f9Crf?&9yk{Jx~? zq&~S6QmeXQpNa`tZnujbKUf22T(+U`+p9EYK@+|*-;CUYVZ>BtVy)x8QIYr)M9WDH z%Xq#-dG=<`vLRi1o&G|PEz&%0>PB|(ng#4{YQe5O^sByKkRAfAUW&67udd&S!pO@^WeDaoz^pXG**tVO6Lt2x48*Z6M(Qf?-S! z*2hn*^SdOB2s;jH7xI~=cMDPCQ7Jg|4WiYYUWhZug!7Tkn1N*=|2B{MA$#t{w(RdH4y`RhMwzvo9b|i!B%C8`hk;eus z8DTruiLvSd9MFyV2XMs`v@fx!7WxGvc80S7uhJmxK^wzA8_(3P(8a{udXSP!g0A6Z zs6Kr$#8>pAPE`@uxBG%x$q?sDO$AE4_`@VjdIkd9+DZ!-%s|)D6g24QM2Wy^=*V<~ zp8adUJyHhG$J_-|?RaQ5(nHzw2BtqL1CP9NMiCc0qGF2oe~ zjevLPbqH&C4+7p{hC=a|Frx4lMoImFI*dlEO);o-@CR&uq=^F8YRZyrH9&bJ8SP`; zVQoS505us*WnWAStbtx2j3X&dsGD;X)BniQnBD6m|`TgD2mF>AQV#P)MKCc6X-kh0*AQ|VDY-i zz@H<|%Kv6SRsfjYDPJ(}QWul=E`t$h4rc~`-oky(;yioZNC;VDj6%a-VYzl6>Q1l& zQ;k9NN$A4^Q)1k9(4%}Z7iBfCg(B{Y#_#PGrGM2f(_n6C?iH}0-oJ4FmdW)AR ztq*z{?d)Hbjo7ibk8yNefc8T7*|n<<+^EVKkK>Gy(1!|k#UHP68^Diu5013@n!0SAsnf%k?%cv#Vg z8G#otWp6l2{F{z#j=xzG%O?0c5CisU5);qw#SX`-O!Moh?C$7qu-h&j7L!o)Yt2CQ zdN=6yc#G1FuhFk#1U*)r<*1kjz?f+uG^iuoAx|;H`!Dmj(i>M!uvcZxKTJcl0yuOl zqRx+4p7+a#h&NbF%Z>qY!*Yza9zv7Dp3t{bl66_#i7{)t8Ao$}OkQRTZ7u8JrC0@? 
z9lnVcU9%aFNJ;RZf6Kz<{^97eW!_xdyY} zJ;L+TWbpm>JLp^RmI-_0iXZF?!L)k`&ipbH;%4u}^Hovk5>o)`|6Zb5#d=oj?F07Q zk4vEU0@2hwi1iIP3o3@`uzOD$+{?O*YX1Ad?${~hc^!krpa^lYq*!~?D*Zvt2L;RV$b|+Ebwgy}K)&gvcD$w`9a_HEy6P8zHP+O7d z{8z71F!b1NcYm{`j1&6$dY_u$>UDbg^ktUAgtoPu#+6skKK7fGfQnon6 z7X-&r=@0)0)Od0*hPWr&J*G?j0)$%3nA6v6i_ zbRvprs{CRiLWkc?rRg`4=(3l!ph*ldiL9olvMKI6U_{cReR<08mjx1|CX9RVX53pJ- z6M4%-q4?(#`2Hpc;<`^GpXG&s=L~G?n+a-KOR;lO4Q{$F8n^>TTNTa z!|HHf@UTcQ+SeY#;rs~48vZSfz z60}OjRu)aj(Fl#giRu zSdGOKJl^4%NQj19}_&uxoHOZ!tCT47BVtrVakD-&sQ^C$TEjKh&Rd7JKd zd;qHt2SH0<8C#zvLU+76hAT2>g2^>y`b%mVnIYs4#UiW7rY$$9-MW5ED2ru}H2Hu@ z`)Z8p^+U;B1F(2?I;ic<;G}-3#4_55&GNbsG%#Jhd6U5f2dN7vrwcP`qXJfNq^s0Qami z*oU7Q`EM&?x$3*Zxk(z+_yRLGa(^Ga#UA=y>_KT9C z=yW;cnT`WzPYkPgYZ$8K6mT)`4?1p>1u3IjnATy3yOMjbZTcwgetREh*2LqLrxaz+ z-(otK4>9|st}=t;Wo(_#58xlI;aLBOgUuuk?C+R>m2)dxdQgJe6@1XS>dy3Z%D|cj z-B@V03Ah&`IUOs$a|T5=f~SKY#O=KbaWoShji%trCwp1`%QHOXqg9ajc@yZ%Wun@> zagNl(SXk(CA5W@f!O@TyOp+GB2#*}_kXGjC$S-HsN_hdF<5TAKpaVBOd4e4mgE+Hh zjX~Czk6^cX2wj#wXKWtv;mi4#c=Y-x_Py@{GCdg+A2DpD_&W5k_v01MSp{P+>Y(_V zAse==4DC9du(MqdO6NatJbCHs--e2QlpzCq3s z9@ty|g_wq3c)jTY`CvJ!28mm5F?Nk_ zz+!qQuXw#aHk&?WhHaK&*VQttjmw0blU~qup$hpo3mC4UH7D9D6Z2Z{F*cUDi0fW} zcw!y;em&0WWsk7ePiEuyd94^R{{(p4`Nk?cO=zf4E_w`Fax5NB@T!8}(6ibTd)^g; zW5;7gPptzx{cA>`;@CPyp= zdyTJS<%let`*nvI7fi*@l3}>3lnr(v1~~Rp1CJN=qS=`SX1-|zF8uxl3e~(=-ltI* zpJR#!&XJflx17aYrLeE#JX5<)ldUWˈcJa@a4xn1-SdA;+Y@->2mC7;(d7zq&x zb;y4j1d3TPIF`8r?`c}%;O`i)v6z_q{u@B+&2i8?D$mMxZ(_|_vM1g<1p`taApMEL<1zJ@( z&4Jh9>ajNHi$!=URY)Fwui=sp*6}*Ky?<8QW&_}e6 z%L1uiOF*mfDY!W0^IYd@AU9W>cgz#_#QavRDc3kl{nVVQ<>V% zF%(!iQaY@Xh&q-d405g_U-?tn^A~w|B;zdxlvra-hcMevxEj_*7~k4d7{=&)PQV0o>K3`AMcif>+)0%!9k3_^p2OiP#ntFML}Az8XGNNh%JR`&=GEe zD=eqt^Xxcqu4_jPBU99bJ0LC2p}E&pIroKxz$@}RgpCBi%?n0&d9^PyUi1o#uQ))n z#~)}O@`TiW7Wg~Wnb+$ zaE7)Kbxmo(MDuIl$8Ta9o=l_LPDi5H>_mLJEDH9m&B2(Z`AkODY*w)PEHt@`WBImk zAhN!b6J&WECvNGxa3F|1mvD!1=BNNn~fxh|8^~NZk^A}wUQzazS&{dA2FhMeGGRx|AS3> zt!ONs#|f11MBl9WqmfjWm>z@+g8iJ-N5@XB0h}wxm1!GtkdlX;(5d_=ARUlKb9ee`y 
zd7QN$0Pm)9>aPXk;H^6l^fnVG32~UnWr_Gn~8;AOQYIq?%{WZ?l6MgNT0xem4f?1xkoS9P^%Eo)Of*7X)7usmj2mKm2xTA^r?0Jl; zUXx?n-0kq(k``7A9%8pwIU6tKfx3qbar|2vHtsgePjA#pp*)fL6hbCCEo*)Kisxj6RdYdJ` z3AC|hQDpfNJgC}=A%U}@en1=aymv6~EcdWuYdo3l14+!B*Q-F)ND|5v4zqr* z_p>`IzcFUd-Y_ncjCr!|z>GHs;ih*JnTid3==(W>pK7mw{)ck*qDCLqr&fSfrXQ@= zECfa65L`U%D8BgWgbx$y*yB#-@FGwTV<+jO&&3j`lJ!K}(+LnJcm*>K#G#$45ZU6H zic`jSVyH$F=9!&9=6uC!06_^@cPkpUch{U z%W5~VPj7l*(VJ1wVRrFUxq?`2Hp(FW>>!rho8AJjJ0VG6l__@b6-=0QlW}C z?0*~coGCLuF^VT{TFLBKaFGrCv7F;@w2G+=JdScdzvAO@G59{s3bywLp>o!9ERBi7 z1_u#x!aW-Hw1$JNg$OgZK>YQlEmB$R zkp6ex=T77aOa*nD6OehkiZz{T460#kvEDM1)4Hyg-DsPQk2k$R zQ-dJzTI9nnNW2cG9gPWjqsB!1yvu0wYIt9g5{--|vJx9}3i0KO68PBr66bSUIhL|X z*j;bJo>cXO&r5T0N@54JxqbxhXfntDuOK{ca7NW9&74agLP73^F1BahVUwmlV?M9m z%!p)sK(|OgUUQZ>{nWb#()2~}@=_)I_;4E2{jLGcJPyOxZ&A#Osi*O{(oEnU55QAnf50-lh(J#;yE~JDqZ-n#Ec4Z(4|1oDH%-k5pVG6z;@Wm&G+aZ1IKF>j523*c- zLa}MFIE$5oa=TuTJ08ZEw5T#Q1GzwCzoBf>6V}qhAMbokfL+>c;Ap1|i|!`lJHb>Y zbCv+H2vUWu+fq<^bShpCm0)e{pTf7=FwQ|UC-9e9O?Pfs01K!u^EkQ_?=(+_Aghm% zzh;>Ay!{j;n+3?)-CnHu2Tgbpsl$W~E0Xi~C$gkc-x;E$kAbC!p*_rkeIIrRio{$X zecBXiJHgq#wARw){xVeDGa3cj3Yf?9WSGEybJ*EchMCSCtk6dv<}C+t`#uxslKacd z82QBhmM~&OUiD*ClO(OT^@n|azc^4)3Vk^R`=!y3@=xt zS34Zp6!3*rdH>jcD|yy;&>DmC${;*m4VIoz1S`!RnCe=A35$8WP4ZT(h)*M4bDKo; z-7`49j%Tp!QX9BaFp9bNC)2xgGr>Qt6{I~@a(*1T1{E6vG2neMoUb#XvKBWW(0&RK z-H)gxYs^R-Uyds}CzE*fd~`eIUA9{B06pOm1mV|KF)MHSLzA2l4!p4k5#}rF=f$!z zYIiv2e}3QvHyj1gs4IB2+kyVuTm@3|PC$g-Acl8P=58NCS=KN!oOvB~{`P=NsTW|w z`f7IhvuN-eA7DSsyTJOa$U*N@kRavkoTI<@A3{NUqb?hMW<5Ug2N0YC#_%+ zbqXyDz47`W3mqjfG)4LY`{~Rj?DkuNtG@>$=?tZC9$FQxM9&?;>`iVb@BcMGJx&SCSq zx1;9$->`W7H2Bvsj_mw&FnL`7GDfe^;7I|_D4L8NM;Un7nat*XFT+m?>L7GGmzmNf zN2N#Q*dK~n?8;TvxY)In{ju^AwA#I7<7+SA%h)0qFnR^#?M(2TT81Z+XOfle4mdYq zHv4xV8s9owpnGH{yC!KCRo`-ul0`wZ_oWavpIJo4^ELp#N|9LKi(%Uyx*})&RScC{ z!CN=H3tV&?P>coArXenObH>om#yF;t z&hhwi5eAo?LUCZhEb;^veU5~~#=B6$&5kLvEx;Rv7AS7n%CQ@ZL$Q@STs`6p`bG6j z#U){SpvV%B^xVf*hm)}IP*UmUpYh;OBSLCsuOpXVhvFdrJ=z-RH%;N!jm-}ZL1&xx^J#M0w(JEH%zchHcE*p;itY!q0 
zZb6Ji06wxZpcfKWu-9M}nN#D2$%+gbUz>uR-k-s4!!OPUTX9?#cad%JsABHM7U9;K zgW%(v#kg$q!Dq{T;mFUUxW42DuKt^hi=KC2KXZ|}P^`=fuRjU?FH-RNa4S4)$^p*2 zMD$+EA)l6~;%@I4X2T(X!}sOL`I9$5&5B1aoZW^S@__!n*}=F)%P}$o;b76!j`juj zU{ECmE(@)~oYdpE@7@6Hd*ng?EEPoa*->~F)`NKQBb@Ekz#hXuU`tyWdnYaU_brx9 zs8u5pYyP26$qiO_dNw^dF;|@)e+8jIw=tFMBn8_qv!A_$h@-PLo+}JS<0f%}?*x1?R8AuJWUBGqed-D7kXZ z#}1sk`4Vrz+{?K5=})xGHG-{oYMFWaC2)cJ9eBGq4VSuC;dvWL_Q`yAPCpS^;CNus1&y%qfqSn{?6O=Vw+zGKpyAJNMz?D59AR^0nxKlJAo^DNH3 zVM1K~;h+9@u)r)ILf{*FTDg|lE1rdyJ6EB1TI7V^Hk%RHbr~<{a!}`B4f}mcF{oC& z;SIb!!kitQ%`Uw(k7!>OrIj&v*~RNl(+f|OVdwQ5=+zU;h{#SQ5h}X)->SK&n^eMb zbyj0VAz&ZaS_&y~Y#dc02AAjtH*~SA{w>1*Q)56*9 z(eIeO3en7r6d}mj?T21@yWxz}LXwxA2(9-_z&~Gu&AGWArahR6jV-2-+RlZTl@ZKN z(FV>)O)}Kotb>1b1H7N!DVXXq5440%uGuM`4Z`22zZTyD~ z3>gM<|7FB4VIfqc?!dztU+`i6S8!VS0}B&EvAWcgD0Lmg??)1V*FZ4tuOO|M=;sZS zGC?Jrq0%e9Fki@htj(E0Cw(elH@1F+9-lA}@Mz$T#4FL7u3hj>wI4Qk`6D$u3|yZ; z)>mj5&o5@io5-wH^@#rQfb(+xa%m=K*Q)|wH7v`9<<}^)TE=WZBwnM;v6HxtjjWvCR%nf~g zVk)>Bo9seZ{}o|)`@Aso=41-9JzRxZy>U5cKlB7yCl{`L~YDw$4Z`mqU5-OH6tp2(|Gl&kgGb>zKOX1PE2E zhf}wd;p2bXIqN3{AphYq5Zjaw4F(5r=->+YH@FqN72iWx-!rV*`xx94KeE%-=fRoa zU+nB4Plz>G#?%hDutO@hV4dLP62J2|U`g*LxG1!i<115%2bT%bZ#i9XveyFiYrR<8 zEqta<{1~oDdxO2NB;oqI0Ny~kKc*@bGOM|z_+C>R+eR-j!lMH4y2BT5)eC~|!L4xd z_$R!7L>dKl_~HGraK_^95%^JCh!UJ3PHNsP@;Ta*m3Y~Vv!Ww0az`$Xg`8y0$9jW$ zOFka-w}o%PS~zq0adzJ@3z0tsp``aBl=Mdc*LVu$#J>cQl#B3tC>%qIrqOjS3_KSz z#jguvz-Xx^lX_^vS2z=lVzyHmdGA>mDA2-CkF9Lcrf;xt%^Nmu+g(=Gd?9E*)t=bT zKENw(C($X9(p+#_(%a zE!%v_fd0+-jLNNhNbjr3R8-BMVI{6Ja#z&Bt3!ndc~sGjR@czz;1e{yxRQPq^~9Di zPsCli_;hYMEKL)jM3j#%Undj&Qx^*%6mvxP}rljyF-EbQ@_M%*l|3Ardq3=Nxc z<~2q7ULzCxR+X3RifYHFXLazy>^vsp(*@9!RL0=0d>jxKr%SBO=uBxp=IqovJmHn0 zFvH&#MYk=%i=Tgk^@UXWr|uZ1@$_`EV7DQBes~MTG&jHk?^F0|ULa2T@DyJ4*F(Qc zGw+$`5w^2B38L1hvNLyjG4frB7~egSCB0YA4v8e=xpn=ptiT&bJZ>|gvau}r_ZoLu ze#e7yl*zfY4JKzC1l5c7Y<<*4tn(S-qzCL|AMBb70wdRf|Lw{|epG^3h6UsMhE}|> zA{4n%YjKO=ay!==XO$j9*o~=fm&+)kGq6`LX)y4GEC9p+sH?&_= z!T)lNajRSw9(6o{lGA6=1dSl&E1o!kK+nh{5gg*uxTrrpD%#!3GQP0VlL;zGI1!C^=A6} 
zR$`&paS*D@g7=~EDZ)%eNxu*;?4C^bkU9S!!UmYx5|oHCk^0T-80p9LA{)BG9lb@ajU|7uu;y;o2x zc{22=zJ&eljkqDz8Q(k%Wggp&u}-Te{74NYA_&vTrlAB_{8AWgbCmI8sV`g-mEhYJ z%hCJC6EJMU9FV=Vo3!_Euyp-XgjiQxF`-)p<>{=r!7|ivTut2lh3OgXX{_6$p0d;K z>R7iZf>nI@4jcB?;F_dX${FR-c zvBL^S>rK$^p5x+FOmviE zyQ2{cPKki&vsOsks!aS-vN0>aliikf7q^d(F`MUJhXes0Ivy9qRg%-lt0Q8tZnOvF zKP)CEj;tbOp%8?=Ow z{HN5+To!BXX4CH3!ziQ?O;ys9NnrbR8rfMyc1w3com(bN-&jR%zly=+QU=!z&Y}zV z9YBXmVN@$nh}2qs#t)Om@z3lPOvBj(=I&uLo_h3D;&{@C9ITxTO~!k$D`;DpenBx@ z`!#JsTLYMXNyW^R`*Ap9u{kOpF{fJ`+^AhK5AQ}rP|v%kuym>?o{GCk65NHj+9GkR z%d=p5a?VE_-truxkH^#6bA7mR(roN7eoL$;Z3OS}C{&ZFWER_0gMRKoC_j0Mj3iZjhalhpYXc zgZ8y5xU{MV+6pdUzSv{DT;I+Pcs&Nm%6Ql$oR1Sn;~3Sr)u$*}UNe<$zVwkvdU~0Z)bD^72X|nr@F5a^{{h)`b~Tlad5ur(+i?Hi zaJ=<37`|8}Q>R^J^ykwE+7kPm?Ajg$Z|_&(n+e|YUQv|XPO1bGGaWpdwGkX^N?>=Y z2@3qyCDnC&CYhbgt$Z&8URyuW|3D7)hKgpH|l*j z!$zVW>*2bNr}1JDn@QDaV8RO2-#XFT&0BTx~O z2P0G3;IeEByI(CFk??tfm}+xg|7FE)hlMke@^c_(xFLJKpkV+t)Exe5+p4fuPCDeR$>VCkkW z6MW$xyKC16&{gn&7wOONwnh>J&hO{dXm7(2Zwj&!EShKP;(`i2UeBI>ur``Z{^#kb z{eP`%eG21Wv+${3A>1CT!}>l7w7wl886RfT;VRCnoSXPDM}r-j@TA*|g~`G0>Ezh$ ztxzGI3H|1^xT<3Vo4ao+xw~)=`W`aCg5?MBQf?)F8hZjdpAsQGUjj{hM7am@h3Lz- z!5G>bgIBIjAr+E8;N3_nEB{OhF8G$f>xyV2EA;}rro+dTi?zY%w>Mm^oCB9{>Z7CA z3T#?04)%|ZGyO}pvPb(Z7;kb2j(<1gKQ}Ov6X)JejZDmyM@~cL^cdX! 
zI&j`i$%+~4{?9o zHCA}eD>PNfV&yVCnW?wWgGf&yT(f%uD}uX7&WwvVW4AYM{2k4vZz3z$$jk{x2quB*I*<@pom`4e;D|0+21cdWBFjuTn3l(j`ANuq@) zS)OyBkD81nBGi-+Wuh8s8dCa>Fb}e)w4*3%X|$N)Irj&dD56xPL|TXx$&!@5&-?}F zr*mER^?twJuT!n!toT=JtVq9)<35MBOks+&U$mrLp00~L1m>GhAZKw4<$fDw zaPKMs~c5gA+#Lb|Rg1PuQr&nbCB8B<&6@u`o4Yr%BqFZqaI<0I(bqh`2?&%6R zZaaxDPi|n%`;#!%j%OiyuSj5NHSXgz;k|?>xVvc!q@ypObMs3wWShv{6X|u z#vU!AKhl7Wo$$UsfxJ7A2Ol^EzH~+=)=j#Bn}j?JJO2YJBOKu)u0i61Kk&C?WPgw4 z!9cDM^dEXa$@)Gk+hqVr^CZZn+Uu}J$_mXtyhh1FfAINpg5X?pEIulz$2e;pT9WjH zsYKggnP(ZsuDnAe3e?bKy*^iJtVGWnzase_*|>AVZB%ayW z66pv2D@tK%)pWkpWG#eVSPu)Ma%rwq8gG`?jBSZ=bnux4SJ}UV3;ZHOdtTMS?eOPi zo4z~4s%TkkwioabGVvtUI|BwkMPj_}8P=%Lg?=^5%6i)6`Ho&W>~659`3X@p|8o-; z7TW<=WNOKi+-QDk0HRF(ba)@qK*p2a({Vp%qMe$G8g;|uPRSJh+j%G6euF9-mDm7J zbV5+U@r~fyhXHI^UJb#0(?yxR4b)#0$heH4 z?-C`1=i=LC7qNWzQkb{1joq8jjA_LNd{U%8h#$mH=FS*qp+uEM41HImK-~CSwxi6J zc2xD!aEF)N{^~IPd;Urim$@B$OFzTe8Ma`Q-HEIA4T6U8U&O<>1`oP7(VaE&*5}iu zxt6fQI2_}^_lNv|>ic(T!9TaLNV$@R?r@~bCto0nSEh>WMAw}Zs8~_5n_jLC& zSvYk13fc^wfkw5mGN-H3{AT_g==|k=+WIEUL2 z8O)z`>=6Gw)qY^QX(uoSzRu&CECO$8#xAcB+M5 zfi3H7Q-Zb|7VKPf1uSaOC0G3op=S3PXiYc|>$Rp)57UdHjh^?Jq)iOQ`TqpbUsbr} z&ttjBMbdOiwLChuM1qO%40O{B^t@DtmzL^*xm-McK9U8M2}Wgs`Cs9d&p8}g`jxy` zFXpC~dva$J>d5`33jED#2%JpDiQMok;;Pw-b|Evs(qk5X%;+k5KIlQ^AR(A4D)R9Q zLztbK9IAW#1a8l_(W2Hq?yu2!Xn3^+kM@nhJ1)^G)mIC@9DIaH$v1H1-W3cv`WEV= zzTv0JP>AnV7#ybzL$zM*T-88C?v?_Eq~ zVDIZ*)T&W<;p-{Kq@ZIOHFGw_oYigz{kfP+<#Ue^kC zMU%?=vsJ0L6JWBb8j;%WLS|W=#1}rru<*cfk{tGe+K72e#r?@(vfvos>UD$prc|Xaf!ssiaASc&|3fYG$>Nb+I#^!bE)A?IM%sO5*ly#VuVf!hxtpn>2vRL-KS^Z?k&!6)Kr%9rAMI8UkESv zl_2hSA(iQp0>9=UP)%LHJ(AVoiK7Isz3)DI?IXv3Dh}dy#ngxnOg@b92F2vKlq7$& z-3rf4QfE5X40xZbmqdQC^88+#Xb9SP4RexoVW2fgls`C`I%vIT?z2zRu0G^KWxPOp zMKFHI2;t|2gu`TIX;%3A7N%r1kM3+z!>l_-Wf2{_pi_E2p7wHtU5QexB7U58SsukX z*_T18fx^RsGW4R_IJ+AVD)R0Z!T34^0c*nAdol3u!p&IVHAh%IK- zVJmPyj9Ip$o(j7Az^T8+u)x6p-YU7VSW zez~HhFWs?A3M#8Bnc46{nrHKf8(|+|&d3q=^7EJ=LhOv_c>ah!6>4yM44&h$(Jj<< z-@nvi&I6p@RDfGAMdA~Kaq!MK1omrAa%HL()-lTGQ26&GnmKz6L)Ipv%IVF}D?OKv 
z59N^iY69?^TSPpKy@^w~G`(9ok(en?N9~>GcpKpn>^RUQP}%Ik&reH(l67im;g5b<*7dtQG|#0Iy!T@8 zwHA`5a0pW5qu_K=EO)Gv$7x;}(4}$_*O$E)Tyarht~S-BpNs-wM?wmk6}y3S`Am@B zrp0Bq>EcuD0IOD8_|o?`gojL|&oc4_<=ds{uj+^4nf5C%t$aZH#DAx2a3VRYX2e@B z8OC{yo4FTR-lQZd2E!Ac!qtK_5~_Kd=ssCR-Rxdt9i0eQ7c|4_`a?Lc;RmkzYC>i; z)WZuEReE++KDv&Iohqx_IO`uV%v&d)NpwG8(z|+yN6ckWiiIDob9+MvK8YE3={UjS z&~%hYCe$EIg1&E;px1V{!{QUi_;pIU^x{KTbonuv`Yf(1ZOA(b1CA45<$EE7%{YQOChC^muCR+a2d;ir7r_b*e?Kx1v9r}8cR^EFk$Vx6kPhkty zsGq>o^7D8|7>0&E*I9vw3cPW73wCLbX{CoOrsGsXBmE`n-)DPcGmp6 zEEgkprr}fWn?RUg&O5Nhm=iukrx+*TvZX;R{Gc2^CO?xVw*Le9>c7IYe>CasOPLU~ zGK%>>=n#=tXHe3TiXA#9;F)O!L>hNuT1OzvzfjM)67dd}iMtP*S9mDy4sy?W;s05g Bfv5lg literal 0 HcmV?d00001 diff --git a/src/autoschedulers/anderson2021/weights/conv_layer.weights b/src/autoschedulers/anderson2021/weights/conv_layer.weights new file mode 100644 index 0000000000000000000000000000000000000000..940526ea12b583fd1fd2b8160f137f00d004dbe5 GIT binary patch literal 20400 zcmXV1d033^_s&{qS812Z5+MnxdCoJEL?NU?WJ!`Fd$vTSMZ2O>32j=mNb9`k8A&A} z%ZHF8A#IX2gx`Fx>-WdG-uYwZy5^mEpL6c}KKDJ(<(TUzF)^|K)!6^PN{FsAL?te& zzVx^35kH5t1U-P2uOrA4gFH+TBA$Bng?`^3Kr1i!KyScp5OeDx+oY>`*89bvt z^N5(F6R%nFI&bT)XE2N*)Z@Z(wk#}*&T4pt>XY_@kIFe3^dXE@I?V=yQ=wdiCXH6AJVZ>q1ZQ z$Iv@#8B^FdLbw-lSwv6Zl!ur4`gV8#jI=UgS~IyV55(~$Uh`;x|&!7z^h4WI7o#dDi(V%~w{ zP<^Efw;Ax+1&dQ`K;o+QzQJ3lhLgF=?rn$NCHiRNn^KFs#r zQxz&Z7Bjo6OZooZI#jQFH?4FCW^KHwEJ|V-3;CVT^h;c6Rk1V9h>C#h{=0bV(-6Ar zw{qrB0?~NJF!2Aq;uV;m!Ps|Mka~3uxb2vMkDf`Azq;LMQL~FV>+6=Wl<311k=jdz%b*NyICCk3R4xa%*Gee zzavgeD#nqzW~~224Q72_&E79cfWF~$82{-YEtoM%7=NajUG?zczZhJ_Ryu1`-?M9ysf#nlZzoCP5 zG0E5HKra^q$X1F0Erl)ImO~?%;@vUC=h9@HK^}19bb&GD>$u=)4D97?qw{`-QZdUn ztZc~-V0VqE*V}ITaOwsIP3PF3?5o)J!;dpiqA)YG63)HshTyLsfp^Ia94FrA>Yvzv z+KX+h=U)}*961RI?poy7jy$@~t%wD#I)fe8%&B4HdQ7qOM+ukT*gkd>_GT5q-=@ph z#$`fG({J2zFP+BCcVg1V`nUjv@qDi(ld!UIGXL^WLI;gt*6F;RMm!0kE}SMo`L z^?ENRZF-eo)iMO0r{?gjJ3}ECAF*2N%Q$guGyA*aI=oe|p`aF@C+JIu)f=rBI3zuXY39AP;qr;ajXkWDf8(N=m4Kw$_ zi>42lc0C8)NY6*N-dsFyo(_k!9%8~0Kb&Fzh*%D%!5rNRXnw`payX1 
z+a-v|8Ullt=TI`fkTcJ|fQ3U1pymF8D6c(@*(R5vVeCi9n?8)4)?c}ZYxNL&Mh8^w z2cbggHijMh46#4LN!?t6@OUvgP7Xnnb;US3_dVCQWf-jE{lQjC8V>rN1?kL7=&1M- zWc&TO6NXXzsX7JBuGK>r^s<59Q6b5{4%uv5;9RD2*hScoD1)rsih zcp0`XhzA9h4Px_(Or@8cLvvXr;Ttm{E!*Hr*FV|7uGO4m%BKV2xKl7KvT(<>>*9nz zYbWrD${;=%tI2{yf5G+-gx|LFDqY$;g;qb5Wtr{%EKb@1@4rrlzhe8qAXEp2!ZT4! z$K7-qYsKLDdJy||l-D?6HN+}z;4-Z4k&LH(vTOpVidjl}8KM(Xh(`)30Xmy=V%ZJT;4(rZ9#|NKXOhvHx+W9wma8AQgJAZ^WLt zb|n8}F7_-;$I<;MnCm7b^x7rH_X{Ya58SUX!{Cjqk!;7tv`qST-g~V5eVBD9-G*SV zY#28y8V|Kwzyo%L%Z^v18MohZNoEmX`96i%%+QC|e8kj+g}8F}H}G045AV5U77uBgiRZg1rFM?n0G}iIr&`;ZmciHy))y4MdwfOORZ)LZ~pg{X05L0AF0?R zoMyXN_}!%s9L*z0#FIt5TQ}lq-n2w$-s3^GT;7dEs&l}2#B>PVaEdJ*70DD1U4#Lp zYEC8Rr{LsbO`e2CKZKfGH$8M_JD-dmDNLN7Bkb_dpnffK{N(SNOt2+}&l~%i{kQi8 zeVFu<`E9nLN!4xC?4G^whSmh;kd#P^IB~w;?N#i~KXums(UJemMqar2sHxCS5YJXG zbfwDHJNXxHVa^%sc!c>~^z&jxv??Res|DOP3Nhn@{W(wXucuDhqf(yvYES#JYk z(Pk!J+Qo&Zv_A@uMKtpNChG7@u}xUIw^%r7LlI6^InM7j2m}5BEArx~E}D;m6 ze)}K`nf*uM-NBt4*WCBVAZwNR#(2*Xx- zFw~Tfr*`c|m(^u3Qbr8bKUG8FD`grLw~@}Pxz7x~j2HUXbWs`Ad`Q_U#&o(aVd;qq zOftC11>II>-wVdFjIY{o{fZ`@>b}R*-R;V*F7{&2(iTxIsY~?jwcBuikq$l6AI6$( z1EAX~6w>~?2pI#DU}TOEGhQTc5nTtl<~zRVm=J&=lH;hZ}6H0pn;#jorL}95V zM`zD8M%Q~~IHhqT+x>Jp?lxaRBX{0}enppZ_x5r0^t@brWgtVBPL^kmmKWF*5k|@x zoJR8>kEGv5&!GoP7vQLmSr{@Pqml!R>t<7w?JptN%djm+YqqwgOsEbA!_1x^Po|678_G(`0)@9|IR zd6b-y!~KXlj!Ckq+=!PCN$IXGc>2DJwT6icHGgDLIpNmml ztpx^nTOu7dpYkK;pw}UBwDkB3eBEJQgr`6tv9JaWZ)c;KFb>?U{z2~WMwZtZq8WqyZ!(EShP{ijol@z=4S_zBaj z5#wZz&POv!(JN9&Zr#&mm2)?;j#0Jn{F;{V=!#P`nKu%@-|(lFesNSHZYMMaAEJlv zG_rW}R`%Vnll825Mw1nufTfoYt=R5M>x}H_mW3Tmsoa-6k1S`qgVO10#~^s3Aw%T{ z-$Q5dS?&l^ zht#&WXs3J?eL8rcyi-6{1$;qMVG&lQ8$iR>1{{4U74+Xe$4;xwXz82|FUEd_UW-3a zJ3b0(o8DsV)^M)PBp$3!2%&N1cO0%ntT!qnd4(^C*rTel7sx@`-`l9}`x-jWOQYl2 zm(VJy~xw`Z@M zGU-`sYyO`jA;Mv)TOxiIBm9~*Te$ddAV0%E7LC&p1luF=@q~O_DK;OcY2<=sXcZ~3 z|3qT-(@|;#Pk1@Xf_mu9qLv;v!Tz>_&~rMUKbRWAewv-9muuIsJMRu6=C$g zBQS9cpT+-ZpbK3>z`RqL{?i!A4>-sdPBRMzr}Lpqte_qBpD76cv*2O-fp|J6`vPb4 
zJOsX8Uq{UsY=&DuQo-VF5}Od3#B8P|Vv6%$(qRxydpdI{CSgG_*+_6V1OA9A8+6J!&RwqiqD`XBUzFGm>y57$zGKz z3Zr|6iPzT@TDVx2n!fD7OIkDOBgH#lw`l~`7t3Semu|6Ew?*vuqX}Hnsk=15>;tXj zJ2Nv|A;`Mq;*O!qV0bA24ZL=e`n>t@dRG~yWxjzE<94BrOb(s-asvINpMWvXU(xK| z<-+|2$KY(!AP(HB5q!-UE8JIYgc@&y=$8=}x!`$=A*Ia&=C*m_6`ON((TZ#)M#`CX z*L+r&rUwa1Yw4O7XQ+npbr!^aqR$#B0YBY}>kF`f?Jq@I(&ootVkChMp+khMm%)Si z6?Dp{C)|O$8MHrTh;Aw*!Yj*)O%4B~vT5Wurp%XuPLFlKe=W+8qKYf3zThb{2{^cXGCmmYM6D0nWRpz+cx^}lS>rDt zmYK(e%pT<0mXGChLwH~q`w!skIhfH>3{J&JpYd$XQ=zn+P1|9RkOQx1%;8FFGP@0%8&h%=hp z22Tk;*zDAcb2fZOhb70ra8?4i&%eh-&oqNK>;7V2$8VVYu?~GAA~^T4sqjKI11qki zf@|&?%zG&d-53TD0qsOW;yGp>h=+vTe>v?HHE0vpE2#M$3fUiAK|(r{tBH+6gDv`; zR?tXN(lZEtkwFku{{h7OqD)36cw=Ic(NETFnPWy@4h z9!x>!xKm(}bP<}0{y70XJnwdr_db(|HC!ZpKd$074{?FBVGKkpTY@78zQFuB zJ!t4KjOJ7O(EnyL_DE@i1gJn;&lI>bI|e0}#iOH}40K0@g0yWe2z|6UOYtFiGbI`8 z_g)nwODzDSsZHEpRV!?E>LIR&_0W0b9nNC$9^CeF2R5wDCXI$1$R=t6??wSuX}-gC zm9^lbv4qP$Bf6)OnUKxpq4DY_(8@l75yf68KO-2$`YxNchE}846lWgH>4S{>HN;{4 zNYE0#<{}bqLSK~;@{bwuqSZ#@ugFsLUeE~ z;O@O1%nUt=sZJMB;-3mOy8htT$9SsdA@IvQ6zX*mZqP><;rEx+R{Ei_NPAVD{)g08 zDS}If5*mDq6Zk#9i+F{@jGR~yH>tqO4*h7h=N$CdkLNs>wqdMA2XVE&fGP8qK;!Eb z@Ob-5#}oH2p=a!E(EaxuE$^-1 zX1puryuNvZ(KAGI>o6`b*3~F824qM$1p}^N2^g#*Yb0Wd6Re|Qgjakcg%<8HTEC^ z#q7*Of2PL1;n%EmFzXz{?0%-RZ@K}R{KP7#)=q)qb+`CSx3ma$p4cgTG^zsT zJyjFd4gP_QcoU)WTygwRABpo8C$aOBrQrL?VJ@I!BfGNi5Y92Npz->%O?Qgsl&bQ0 zq28!KCKlgG_ejLB3nm;an)-y<6@8!@wL%RiP5zEvec_&clFUSMJ&TI5WEToUsYl&w;>S0k zb(buKuZyCETRa6QG#?|7`Mr?ec=9CFgawd==RTuwDGy4nh2ajX1Y9S(5`9u)${W2* zp{}wRtF*FEFmDW${Fn{hZ^IzI;|L1*`*_`JO>v=ZCX5Y#2M)19s9j-4VQBQ=z9iiqonq5vUrsgQT4b)M>Y1m`5`5yVA*t3GL8eOR;6a4;b%}iH$w#pfzJI zwvFo~BMp9H4&K6w&HJd?C^4Z_{ZG6VAWkj5^J%E_XtsoR5k}wi#hDiE z*e%*yO`I#hs*8c(W0PdMttpnr`*smOR!m{7QM*}i_9xg=d=vc66yv_Tp=5Mbtzd8C z4~&)0gTZGrh~l;se4IDHLh?r8jkC9j_vUQa&&K1wJth3;oG9Vk)Acm+?FV)siV3eQ zNajZ-yD<6eE$p+&J$9f#M#wia;y0Kqgg;p$SiNu%S8xH;O>GsO(iF}5w=QJojLK{7Szjg zkN*#BUZ3Oo>X%}PXl4a!`JenLM{_7l47&$1A+e~8oaA)*hqyI?)`^Ea{k~>Hby3d`Z)NOx4a*V0Xd&=ajF}P 
zw99}69rt0%vQ_L-S1^v>*bDkgZ-SOnCNKSSC6==n*q~qtVIthY2YVLqCmE{t>ERp) zNveJ&5)AEl?9B#Owj@)I=B-a=FF$spVVnz167h#kkJhu*l8P{ejRf`3ZS2^ZOOSXh z8#iRnVsGqIu_b8{F$-P_J!>Y=t?!O9u|o!&jZ!i?-ss~jH~b?*YZ(!1{<=8L{0*Gj zl81%I&DoQg6yN={z`Q$7aOJWgu?arK)rB z=`$YkwIqk*;+cKRPxg6!2;Vy-hF{>cUAVR_m(DV=6{>!E!E6sn@DqZqp|ez^W6O@m z+KJBiIHL}=Ivb(WOUU^c+mrPZmNBz&d9Ht*9DWbmM54x_C1xeGZn9;lyH>PxAqskC&dUpra zs}u0-BZjgku9CL-{bZZudD1sr&ei#U1L59kp2N3zSWV->`Gy(TdAtGd;!>P*PY4F* zPm#_xX;@a=frXZYV?wHi;|0BJ&(-ouUH zFy}oN@_ZLGiCu-$>>oCl{vvO>+QBin9C_U{!1A^PWZ0U*I*VYebje1Em%oT~>2@S- zkML;ySGYA>0L~77A-28?UafkB4F}|~c0vzo_xHgMiQg!2x&uaCsW|oMcMLo_4Qz+B zAbIUovh~GVuu*9fl&n~Q4d#!?fa6?jKVObD@mX-|pf9waypO`f+k|i8z`N|5g?VoZ zh@)*T;__!8o?M54U-olGIfGo#!5r+?d4ti@4uH>%ublEeQ4bxNi#~m}Je#{Ku=)0H z=v;aZJKvXpYjZ6z(rU)$Kc0}eGF_xYekPq+L1;m*LZ0R+FyDR|Vl{HWRUwFLR{YCz z?tKi&DNU#rn1OQPwHTlJ4tQd(xdQKKxV8BdN+{%#UnfQQEx!|9C`|$*$GsT4CW{jy6KA?Q9u9{Y-J!q77di0yENF!g3g1_uzok&hO-(O}YNf!<@aaG19ZPI*dT$eXzk zvDb;*8XJpUOM@z;;3xWA|iEsxfdY39*5v->j? zYWZ@4)*%?4VvY+vVlYFyg2Rnvu&wz7saZaqt1K$S7lvl&b*+qCE4qt9rrToQ?^tlO5#hewz38wy9vX%eIN6R>oaO5r(f3Zp zkn~!7c4iuuWTxZ$lt1VgwFHu4%Ry=067>F;1+K@NN%QhE|Tt$QT z60mmUM|6nK2FahZVaAI`;JGhfa9DdP@^i&`hfmdVysBK-ChX;&{hENzyPksVa0g1K zhC;~Z1~8gv22DfNz>EHa&DnfRJ8&Kj7vync-#!4bWL1Isz)EP-9pGGtH^aeww^8i; zB%Z_5bW+py3&o5F%LY`F(ZF_)An!a1m0Qc7KFdS5%y$@4>VUD$qqzFQrC>wHtnect{KbsY|H zPg^^=CY>N0DSD5xs8)z7?U6FJ-L1hH&GVt=${U{NPHkqSqg4Dp_X_@qu7PW*>$wX$ zXVGNr6!5N1<${gm@wGhwd5Q3_@@n?-OvC&m%>muY9x*Al|a?K zL11$8IdVrO=tUBTbw{n)Dz~xBK*XI-dyWScrF(ez*EyWs{ud|D2!Mfw(HNKCDTr8+ z#+A*#2JbZHVtRNsC?;*?%(qWQj;i9o{tk@MHAUO*Yv@#S6EuIE7U8$!oQiV*JZW8v zN5hQCMZ5hZV);yBG@;l;qOpJ*(-sA5N-W9c8L4otk7A)!Hhh#fB~dBef?RI~S+Z-f z>8TN9ta@>M=V#Rdh|xb*;MVH1xRLM5ocYy@>%mc!MXFCkH_4~SXLEpjsW8d)276|Yq( z(xr>iaGm*hIy-k9>W^(iZKq~9-e$uBo5oVl+OyC(T?$hbpW-NG71X@<6vv9Spvu%x zUePBWCpYOBQI+=)lz;pR$~|N8G>au`lznmACnru(CmbI)|Kx6Y*`sFj7^WYhi#ux+ zpwCDMTRLaMzO&ak?atpYZB;B!qaYhE3|)cEh85T&Ihl-|D8k6c26=B)ci_8lV_0n+ zfHl+ZV|J-FM76)dnftvVuJ{$!CG+vma1%z_r2&7^Gk7w?2n(`8;OLjz+@QTWUE4R2 
zTIAitzKwHmWTqbcG3&;li(+)&nj$=C*+CZYC&JZpqi6$PAc#K0N7eN?5IpX*KyU3P zeDz~K%KN>6Wf#8VynU6}cXK8$ZJQSv&M4(3!4q{By8UqZ4#(xV_pmDSJmJR+cbkU;YewPm zkspXF|0Xy^sxT|RKTs)N2i^Nc`-RV0aQJsJoRV%JMORHY^=Z#=PRRr|=Dz}*E_@>B zt**hGu}$#ty(Bw5P99uV3bAiSG#Oug4Ij+t!Cy}va{<-|h?Cr6D8HD59?O*}J$(zq zF8ttBy!uH0iDR(1Cg65Vb`rNT&5ceWB%KUX_M5z?J5sG416G+ zKQeJ@P(L25=R?Dx!y+CdPBA?VGDYWia7deN`+5gHEKb5R`@h3@>sWMDn9WW!PodNH zUBJcXa!K{fRx}wBptI>ny8hB#bRV2a&&ZeY{?Kq-JnauRx7mZZhCU)X&4KU>exZkJ zF9>x1d`wS_FQe_7tO{4h>2Rw3}abP$2+6m$P7zL&+>j;;x z-Qj-tJj9~RE@%_YV1HRWlZM3IM~f1GiqvpU8{&4bt~Tb%jF3MJQ-!oP$dT;6{f z0>30+c;`iYG5i=@$#raaYLAzlUZc~N12CvzMO@WafN8B3u8BGge$!85ymLBk*qy_5 z=k9_Ksb<`sdz>iT?IVXJSK^7~QILG2L8On45RtVlP^qH?W!GZydA$^VjQ#}n`+o7R z_Xlu6>3yc3;%mvFcw_A7ZxlVh$)<7t+PEm+0$8A63TqFRVBYI0?x^{6ux?5M&pU3I zwD|#i$P&Q5?WeJL))Ks&KFp=uT!O*i2VY-bE_XNoglRrfEdN0!LPifhYqfJHg-(hfaCXaSbx8mtnyny)RNY7C+FPft&kZkvrF|QE;3mhj-Jc@>l+Qy zh7;iYS_f2^p~I|}S76GsLlEB{2W?FkA-J><)j35RDL)tXeagqx>?dz|`WPB?K8mz% zvEZEYTj1Y%f6SBFhjyd_v-y)jb=fAa>d-fGQer;&HNy`o74O2k(rf7PQU>}vmEnu2 zHhCe!#t!y6WM*_IzUvo*h#AMwWfO1$g?M5S$Z&Dm6I1W&dLS9T8U^m>IpsS~u*CDD zK)HPsdaj(v{D~WaVwOLZ}D z-E7FpyNud)aVWK*6x6o(qSHQS2+te~3Mvm#dSf;WOf`Vh?k$`!K#V(=rHga#K8G_$ zbhvHqUZ56LfhWb|Fk}2#I49cs<)poV9-A5>w`o4KPJDumD-XfvGzpH}&W1H@gD~%P zJ-N+)gvwHKJiA|a!NX!H=~_YX{A@q`yy_aRFIxfiH^yO~b`YoHsEC?A>tUcWkThH7 zp`c*`NY>?pn)YdUsec62D^uX~gl%Y&Fq@kv*TgBtYEr${>)ifl(J=AuA@1(nHLzi` z9CuA?3jMG&5k6N>WA6;zP<53*M~#wj<(AW^_zi)mn-B?Hkd|>o)jGhj6pp?8VY!Di8lV1yqahwN#c+nhh~>w3H0uy<74g$Zp%K4WP}kXw9%wW{m3mk z*^G9!EoAZdSP;AH!b|sE$FvmFp&{M{KAQP)_65bDZSn_aF9?OVS)C;F&=j`$_IcEE zn+Gcj!qM6(0Ht0lP`>SdoOnbzbUXBNU!6yB?FTEMYPtc2j@rp8e-y&~j?K8B*_Vsn zTnyi>fS$T^2z8g;$CN$}_MVYpvn<4!!Kq+;8t__Rsq!9c%1Uv@iDRTta~0UM6fuuD zS@!YdUr=#Tq$Y|{+|ZyF3tAD6A-s!tV24Qm&B-tKmw1OBE{TGaPkMBRj5U6YJc64( zUWM>dIeOusA7|31M-RoNki*@TXi`#59_R&u)fio-I87I&%tpb8RwE`DBgvZO$?Bwj@j{Ns5G~(AOnBo8I;$_Rll^|6(o)0wbGDiIFPOj{KZ)ly z*>uCl>9b(|lqPa`|8#o9upV{C%Cq;!bHT5C4U3mFfC;y*LB(VhF#IK&RklfBvj{Oy 
zIgoR6TPf^W@s(3s`K;`Htq+cxxE|y?FOn~OKa{;#Mlue_z|{$n&`P#iE$grg_JrT8Jv zplLlG$QQ#co2SEvz**#Z=rA|Qvjc7jk72Hu$csMal<16h5vg^z(eA7)E?Mw~*B=oM zt3-Z^<+Y_GWBWu{dBqSrJfsByqVwV77zbNj#JKrQKH%N`1@cR#P~Vkx&@kSJt9ZB* z)a`mvJz@kTeu#m#h!^q4}}FMRK$jSl;ZFjwCSVw^AG-;ZYKwpN0T(t8NUHl87; zR$au_NfS`-aV`AaB~BM8e#WBjdhE^UDp>pdDeqL|9bCM^hg+Y0mQ%EjA*)YZ5i||U zPz(83NDj$?bMKzu195MX@NF%w?kT~~8`1^U771MKNs$iW`HnM9nhyhCB%szO5l_vx zf?k&lP#oJP*!MgYwq8F5wcC&3nxJ7+cUgcjg_CJ*A;;M|$&srk598I7d+3+B`lNTh z9UfKN1|?f=6Y-%Vc&pzVvX3|**az^+zOM(FF-mAydIi%59dT`aGbSFEhCL5GQ7!5s zeC|rb$_w2v?^QF%ejVg;{~A;0Ez;bZ53}ID%M;>$C$#F{^;U1PQStG;^bMU{1A+fD#ia+i9G#TA}z%E z9$ayEz?RZ!sPIx8obHdNQ&&hZ=z0e;C$7VptMBs8#5ZC4-1%(rZaq9>I*~~R|K{qC zpBH%-M4HF@UQp=h;{J*^LC=&6IL7Qe+32)YgrCmf1|!j~W#%y9woBk+w?2|RD+84> z!clxvD;KiJ9|mrE!@lgP;O|<(4K>VSo2VRjM@);3`lNw5zav4g@hG=6-31ODHv+qw zL%7M)7wgtOCNkk~alu@9qPiDx?DTlhvd)K<&$Qv>k{VJcyg{BT?uCa(?}D7;L(`U) zGnn~%J4)Uf;%0|ffz<5-9B(jbD;?eSUXWd;Od4v#dDy=p$a`WARvX3Ha+meQR2W2bqRxTbu6*!Jc!q9zw)j=zdTXC;A;;#oY2e_>z1L1UpItl*5I*IdSIhnfNomN5YVNCw-XoPuEo8t7-^qf5nEB>W&+ZQ)-vI44)77_ftuMw8(qA zXCCQ`j)6Vf9J#IGzXjT7j16#pb{3Rd5xqCT+!>RF*GiMpuE0F)25MTNM^?+MfS$e=V>4}Y(-A2n~P^6 zyYZWMGstu~z|eDF?r_Cac-6NT5BU4zxYGON$@N5D-X?b((W?SxIt9qgBhaYH3Jkw4 z5@i+)v8`B(7~DPwW`{l?nHM8Cq^5}CLH^vV{k|}GD+E(|%y|vR6=>D3SpsqQ25c|d zgIY8fPpCd6xe6k^Z&D^6@tq^mQtEN@rxCnOZX$g7g+lk}O0xND2&}z7l^qLDfmJ={ z$U(X3N|VRKtjH}WmRtfqr~Kv>DkYK_P8BaamnWJwGjaDo1KHc4j}j>j@Vs^s zRw=}BFJ?>v?Gck9$PCft=sp~Hq6B6u6Q8fNJo$B<>g*agdW)>OH=WDE~y2eyi zT&v2Kx>dsF*RG^nry7_2i6MTo&d}f&)!da;C7esFA=$ZS63a804&jAzaNwN+Z7LL_ z%9o~N!o)l>UOfs%%$!P&*($SNJxjT`>{eWswNaEUI|nr%by?Ti9`G;tM+8y~$*xvh zxbqF`+#l?IT6M z`z;4%_ou7;R@;9hU2QUTH-C*=0)^;{R~x zRDaJQ4rB80X8jwq9x;~fyJUg;O*BbjVhYh6z7HpjIBc)m&1z3rqmoQ8E>GTtlXgDl zoK{7{RE;E@p#On9ja>}#t0%Bqk%l0rdkX8%0l!Ogjh+Z@1KKatVY>6xr8ZXOiB6~d6V}O3#;q(LF7|5+p#&;|k82 z(+|cw6XBWBPvV)f7Orl1if1p3h5j;0Cb`EF_Qx(_>y$D`CHNB-yec% zLzp4{UF7AsB5*f5i^mu12zD@eI+NFdQM0pg}xYsovqgBH}=G}ZcRo`cx%2S% znjJXKNrc-Zj&Tmp?~%!zE`GXJfI+Vq>IGlL33txI8E0RNtrdc<#VDL06yqwaHn3F* 
zF2rz>CFWbsg~a&>@Wr_cqWhLA&t3YSbJCfF%5Nt!<4sZUvvxi)hce7@wFO<3b)da! zHx2hzgioTra&7)(*x30OV?|yEi#@4OVA=>W3-wUGAppu^enRPU0T@+pXErrPC|#vN zrPeKm4vBu;Rk9xJFWAwMu!dl`K!2XKCF8>h7=9v=<=BJp|sPFz3(x0q2FpB;bWk@EgX3_=Q6yb?}6f3Rw zr<3H)v)^j{+^Ka(*rne*&UW1i5sthlijv-DI#UC zMmz)wv&}ep>}hIqy98p*l9^<08@>2JQP2jV+7<2Yl1NUFM;bNi%(M(B1ud%hLdA%54`qo0M>=>Gq28{^o+HIUkv_AFnF7-;1u)r^DQwf#_oS)lB&hG`#m4x4lf_=kAnST1sE8Zj z^=vChRQv*uoF@x1o-d_6?i`mr`x|{aZVU@kzrz|D;kdTW=O0g zn^(@^VrJ^GJx`Z{r(F-$i?SjQo-Sm^?JwYd&$AGAOb+Yxr=rSD9l?s1^D!?`k87?m zqVXHU#Phbq*)C?@N=j-~57gY*Su zxMJ@R9*g8rgRoN|=cUR1x0Czdt}R0`1;irMn9K3UsM+~ka3sZpwEWOxQA-kuFif8H ziuzSxaU^+a{TV0Zmf?bpEg}ztXz%@ND_)dIgO@usaMipOETTFaZkVsc)`VoyF0c|p zJ3FBSi{XDeJ+1$*Xrl^{$3T%!=NxPo`Ssgo*F)&*J22GJ&6^n&PW}a+#Jih{c*};O z0hf%TBUenKho8)Xipon6((o6@JvfNn!BVs&BNE#p52EhqODNXx0Qc`TrAlSjp}FG* z=k)ahU0^E9o_dF3?S>qDZajj1cm4r8-xhGa4}IX1QxqH)R*;O3IXG&gFaAvphT@`P zScLMRbu5B&a4trk+CTE`_-pcF^H#2vcb+TdUxYPBCKLak5zsyV4EP**N30#>sM0!f zZsFY%pmXdhDbnx~EV*|OS72|ptr^J)RODfDt!3#NkdCPE|tlR~G zmwfSsPCb?gm!rqfQyk$i%xSu*l1oo#(&Ouk!Cd73Ry|rmujt%CU_j?5mbr0DcnXJ8$H_#P*lIpkZOe{a*!l{!iuBh5zDhp z)^*+A<=}rAv?tbr6Sbg|?<<2~NdsAPZiwEqYlgYiqj+=8RI%4Qd3^k8EX_4N3VK_M zu|KpMwLVpoZQb|LzR=FNX1^?PtX5*{{_{flr*e?c>;rp;X5{Ctyp5C7=7MR7AzZXR zP8Pn8hlwv+NmS4(tkSZF2R1UW-|?L&Ie`Ni#RSlO(+b}T%fLIdmgdW5rNe7taE3~f1o3MxmrU`s|f*$|nEd-Jl{nefqcsBZ@f zw_l@0`%A^)0bk+QdtY|x@Cg`r0mPtD(wR9f#m^2V?DGjlSh@KDOc%tX$%u7m7D8NaEKIpGU zu0I!smU*Cdx&d3^e-osm{{Z16b4b>G!g^;K!To|&qB@f-2yokt)_nu0f**+2z7Qrp z6oRf%h3H$fkUll4V#g%8kgK|~czM|?a==py?UNL^!`dV1!9U`O-$zLXBvp^LRxK>K z=nmSuwdL2&F~ovhrbKbF9-04JkyHA75;Mis?2@0Dq{+V|J%0_-?(&)RU0DRF7X-kI zRZGbgZ3~dQykAsVE=|2P#?XXCui5D@SI}wEPDVGc2I+yf;PyutoSN1mI$SV=`}OQO zhaY;ugAIzOCmf*)uf}17ni<4zP=E)*t9(YB2$l-hLxmuKu3s3!4{i;|!Dm4frf%mp zrwh3KE(+AbEC-g&Nhc9Iriq5toKRb5CEGN18E#Y8fcb3{n^h*V)i&e!v%x1=#($MK zV(cJuF1-m#-~XVxYZ^FL|1L0zzd>&Ax8ZNr<>Bn43bM|+lbKvEqz+Cx&}n-Xd)u!P z-R<)H*LO4cnF_OzYse-!%M}YW>vKVki^MB#e}i7R6!FS`MYc!i@Koj*j**IHEH7R3 
zQg%Gr^$nwOss`LJ3IMtOaClk5UV|rdfrM!xAK3fe`75j^i5zLec3dLQ>Qs66>yxx z1+rwsNHJg1Mv|J`z~X5qs2RT`tH+-JVRIg2c+`@-N;BGX)|DDB^P!6_0ZkgGNqxLS z;rWv!bbgf0)|JJQ1J&>F$mK$&_pnZ!)s}%5IYr1?JV=^l&Ol_Ly2#Simdli>5xdA9 z0+Zmc?BJVlk*kv zT`J?=!VNozh_p@SpnS-Ax;C>DqRTTNz^o3Q=Km6VUo?lJ^>4sQHkqeliLqMO019)R zV4sEzty#I2TPM(hl$1pDpRC3z7oUXHd=@L3b{1_rR7mobtHx^+U*hvUM{$yU90-0- zLzOM3sAWkNGYs^m;r7FHuX-V;Z!?v5|1J&54@>y@*S>*^z5^`IlfjN>)3~imq8}gV zeuh;OPtb_t$|n1brqHP_x47E8S=@BDD6UEKJ(v&lv$GWj?Q7d*-l^D-{xXGo1333web|0;^WPfe#_v^;$bjT zK23_O?~sb>L^5^CdAd(4ko_p9IJc*d$RB;gCgp9wqsq5H?t#QM3a8V}6JG+aavJ>k zK{&C;hYOOcgJUmL>3RuJKXqX&=~z-GUVSVH?>4INg*I9mUc{t;Vwf%e5RWAng72*%w%9BjRUJZU$P7gj?umHH^MLe zH5e0ZUSAV< z$%aZ?jWuBZQq=McA?RjmP}gzFqnibj-=aT_Q0GN13VKS1Pha9&;dL^v;-sZ=yx^# zd(sSC@z#Z^9FWMHd*|6o%}I!1gW^27BypvtAJ(lGV1Zr%xcFItsedZ)%he%e*(3~< z%Vkont&mdjfvBAg1#|6095vx1yWyXPqphmZI`tcvC!WReKP0o<@e}EddJE5nPQXu( z20Sl!5RKfHp~vwgcE;u-HhUPM=LHks9e#kn#W1td%)$=yiLgdnf!pabihm+&%~^l# z21|ol>a+b8WWC#u%2EwDbNvx~q#8!2ObQ^IBOZ(P=p4egL)XayD|HFvTt}rA+=KeS z6kIg#9D7-AgcE+GFs;ABxl*So;Mu4HtJ67Ly!a~!bu5kNo7Yl7jS!D4DFe?ieLO0V z!YtphpcmC7igw)zcV)HmJ3o8ii)JC+ceRmuJqm(JeXHp1f>OBiq6RCWpEVop1-s?( z#Ow4_EYA?YoYiM(u;*>DTedzab20)A=Q_w*$g^20F7%!KHhi&4hhBQE!`&GDlGes^e8D zZ7#)phS)?c55@mUb}jEd#{}K~@->WPU|s%1KL3{)Zs1h;IZpY&`~ol~A%mRPxDL9VMIJ{3{Ll8maC)GiwZVf9dE;6=I35Ujc z*Kl`B70KQvEq=$Yh@WR=(p8I9C=|!SH4S%~Ic^yazBClgd3z2@7d!-=Efr`dF(lUy zU%{}Z3=&{0fXKf-Lae(gx^^vv0q0^G7I&XBs0hZ%%F0makqbLpHsG%7HSjR(KF!-; z$Hn;B&=dM4sOxEneOz59xSEj>A$!q;jlI_hFso+rkd#xBk2aWcI0r;`%dzS!mao| zT0}Lpeqi;zNE}%Ble90)fhu<)_{rG97sFVhTzUmYW_r>sPH||hFNe~P9?`}HZ&>Id zBe8Dc6TBBU54s#IvAotDE)^tVgnkUBe)9%AByk<)(&CKx08HLuM0a1Y1-qyT61%m9 z9!UqT)MpBn9f9JckV&*oJ{TU9hU2y=V~pEepD}?pBa^TF1BzM%t~N=7D0Psv&G%*- zT(^_aOVzoF-zXieWrfxW5@!(knng_?MRT8y;Jhv=Vx9Y4IPEryo5L8d5S;XIaPrU**F_foU1^qDzhctO7gFP@_ni!Zso0dm(Vz&Db_1DgWUZSqP2^U zL6mK|xVd8w=ykO-;;;pu+tjjCqgCOD1CL7~8-(e*ADe~$1kE%v_}9*mk2^eub!N`Q z%~c;ECU!g>@43sk*ijCK%rv;l^oQ)Xf(sF5^ux^$fbFk~NrOTlCeSXtek+=9-4w_b 
z2MxiEbKAMf=|k+(yaKkQK?P?k`jKY81>o0pk);X0fu^uTbk6G{`E0s`ewY`9YuxoP zU%HdNpY@ZQa&0TB*hjHFyS8Fe$q)E5cmeqKzouS)Ed?&GjrE80z>b#c$2s@?>8N%` zb|8KxNl&|ncP?7+6T07$EB)^H^pRuAa z`s+_sn&Yn|3Uq3L6~ZwX{I|p?TMGDA?FVS>@RELVo<|Q)yNtPwd)WUohyP~*IcrtH z{-6~vb9^aSZW=*T#%S^RYh`%*2PWWdCgf{t)G#@=ohnViw~l*-URvyg<)aQC}!Nkt(q>@UKCLxtHPl}Z4o_Al0 z29+WtNv5Pp5}Ez(_j!JQywCpU+~?kN_C4!e?^=ctgqWDv|4RD*S6R`jFDe;P z?bk|XQgRpB8jH1bev}WDDveV$jtV z1lQp)c4?H8D&-~Y>6sADLM8_^&Ze-#`yH64(+W_2e~oH8+ySfWhcW6&G&;*~hDW{U zP`IhY428s<2uKnDhr zZxRW+0!UtD#i=|E0NZ_6iL_h^gkTG?o!U$OQg2jRUcpV0ngglLHQ?OX&UJfpz&~)2 zY?N)_zHMtjOnHV!)-J*?GWnR=cMr|@8CWnk2gbS-K=ZK}ln!r0naS@k{GA>+2sVN6 zO)Jk&RSY=&J2>iUJnk_a!KNOu2HoN|>Y{SXZ0ntyoQnD~`c5F=jGwLI;-aeHfc+~> zD^wP)h^wUCKJxsh>h}B#+R1dP?hU;5tqb=5od(f)m)XkKx$K9s16@&^O2(Pm(!6D+ z7}Mbb2`LYl(=%7r{OTYxwf{mjBV}N1c^9W*C&!+OkKtEK^O(L^J^h=rnjg}-MW`rO zNb|*lh0&%nXjASEbD2d&tVkiA)g9T$D&Jh8*C#~4jcf^wTa=2{2iKsJK${hw?8oGU zyL42!I9**_0Jl~Cfx#SSIBA>5VxF#H9`~|osh1Ut3!6eEUZ^u}`zv%?t!QHnlHD|=elK75*+qQ3?g+h*uf;B}3u8Mj zUZWce_4%5GeE!osOV+dM6^33m5e~o5z+28IF*zNH|M5W5_B0Ge@;_nIoqTP2QU+ugHdpcVA;tai`dP%;$gl&ww@e zZe+@xuEOIJ`{~2MDo8u#Ob;ktVV)~|+4al~)cWHHuxs_9{jP3g=;L;l`}GdyzyFTA z4c>894@1yI|2OdeJ>%W5I)U*obKv~6<7|=m1N>&!ORqlr$x<60;`Gi2y7QGGU+K3$ z{r9$&#r6-d=FFRXmz&i%OK&YJZag65op?(vdQ#||j;FM+ZVbvSyF$mVm&5TrmAGX5 z0B-*@%rf2nfI)#9$Vb`}zVJLZ%6Sf^7~jFRe?#1aP$%^EI!_jEFJ;SO4Y`Ht*J*$3 zFzYhi#hxs=Mz`Cm@S|%6>7Nr{Sw>edo8ytkSICxN$LC*VD!2TE{XawK>FWt}`jaSn z&o2-Di%+D=KH7{IpvNBE+k>ie`k3Z%KU!HRjrm$`yL-*uRw)8$d^Ar6@I*;NA#*FH@J+} zuUjZ=sGQDERRi#HS-?7@9cg8~ChhcyU|nB=IHL&^W}K~rfT#Ts_VF$7;w-^c@gCRs z&;bs}pJR_-gwVRX$z;*eFK9Rz1p4DIGsp3ZS^SX{I=5{#rrMrD*=@hDTY5bX<`l!< z_6yj>WkYQHFKjFrrW>Tc;U)K%_*J}%N*+kX5A8o8rDcfuR863Hl?JqHzZ)abdhG9x zTG%^tCG)&Bt!Vq}C3#2D! 
zPoz;}QkX~he!fbM9krM@lePuiWUXJdS?Q*;^mX+bYI9eZ{K~6z z=YQJFvU)xJFC>s2(aeI9W%p35DwxRXwxHO?P%{(Qz$H4KhvmaA==`A%x|glNmUj=i zmKl4Xx&1Y!r{qEF*m<~f@Hz%tWxxTA2bf?Hg!)eP#P)YO80y@Gj%OV2&C+tT9e)Tj zsS(6|ii60U?_l&a0Oc+faaOr!u;_aWXzXq#lUDwRxn>ujMfxoiO#6+!_8+;(tBnwU zLK{?^hT-mnn;3DZ6XL&|BXzS0LW3!~P7O!1RV65S{S`N~;WyY{It7jzV_|>bNf?_Q zhpx&`LH_G0Znyj#Hk`ae7=6K$4LpqCi_eIpYgR7cf5<)0FUXG;wpI=4f(DYai`8&DtW?(2ICo&o4x{D^5e|2`crA+ z;uL(TI-mY(pTlolc8K?I#$MWd?>?I?xdRW>d$a3iRy0&U2saxzu`kv`FlTNQOA|lK zgsLiZTAczN6QwHjI~I?{#EhTcUc}BwJu@${-c9w?>X_{e8#c~aPMEYlj7ERl#}2%9 zp(|zF=%NYw{3&%twAk?u#2JU;Bh51MOs;~tj@ZV`4K{FvuJUYJyeS>u{{=i2Uji#r zGtyUFmG0nv*R#W(29!OH9 z&|uVEG+7vXPC(=S1+pYTHjM1&(JCE%e&UTobcx&#LiwkeeDr>H&BGICM)V21j<$jL z_+09dCdI_Gcbn_6cNo^#2x6and2OSYL%i}DF4L}pWHt?h{nXJ+e(N#eC^C!x--lEn z|BYZ#a~^YB29)^A?wZn^q$6ln_?^l4)KQnjsZ8xyCJdaL3C5C(_!qy1QmYSh1(&#y zY(eq?RFF^sL+@Oyd2d6n2-0b1whruEUI+_plVQ*3FF5ox6(*%bgT@pENLH(a&bnB* z+8fXAcUcL&Tuk^^^P=&fxh1*lSPgw1y8P4?GpX9!ne3tR4fgnEOs>Nm}p zdcD@;-!DzUsZ^Y0tl7d`*Qv7~Z%1?ae0ALV$C0j3kDyr|Ss*)B6Fj7!aL4MefS(`@ z225^YU_IOB{dehysIdy};yCEb%Gv#)6;vkg9bS z?6-Xa34Dha#-2irqvP1Q#T`8B3^DVGO{(no)+8<~bdW1~XofP)Wh{A03Eu-0nN*@S zYpy%Rcb3@7mR51>+%G3iKmH2Ia;k*Ihh|gH01-C53x^mHp2+*J(b_l^DLQ}Zc_zNz5oI?NB{ASii`e|YK zQu?E=g!X4t2w(G7@XwCW70P{26wbUMNj2XY@ii2q`MVPzvNIzSgig^X_=WQWgm)hG zvp4n1?AX>KK3NvUkJdehhFh+IR!<`w$M~Q=T zkTy5!V;&~QPlkqcKI*J#Uwu&^js?#=CFABc0}WKl2_8tPzsl)W?vrI~^8% zY)9`#2M~*~G>cYB6Hd(O7xk)kK0imFC7kl0g9ZqqZaVT zRcsRe`>4cMZj$CtRlD^*&Q7Ju17PPI*^ zk(zHYS8fI7J^W54t8O4_W96XNM+H4{e{%kQ!{)N_+329vA@Dv&ux0OJbe8ji-lYJm zp4UQ|>J|8HHyysW7viz4+i~0SGSJ&Qi~aiJ3MDI=smwEX5WROo&LV(q7?h>z>I&4! zelFWKVkIsVy{9MZzH`Cf-r<#UMYeX7A(U;Ng#D6}K`z3Y`6rKNe_S4cNsu{xuyPdm z9_|LMqeobW<7w!(I}7Pgq9OC!M3Bf6VrFv!7umO;>$nw&`>PyS-J07li|(V3(gr{) z?-`4E63zInI#7OHojhGTjujSIvi7YUUh|{Weez#O-r&yN5z5iqqeNKOKAV21dV@QS zXE5#1nQ*~nK4{*JgQ=s%=#pp?Zp7Oh4F4wNG!EHw);YRp_NEBJ5)#05%|R%T&LxX? 
z8?c|T4lu1Tg$nP*aVF&ztaR5!c4xE_WV`$pg!;~Ahh)q!e%C*ubCrjK>rL_WWfcew z;mE^BqnW|=JJ32L&whk#WQVqkQC$~%y8O^icu7Q8Jh@2+;9x@FyPUQ>Q z(h=1zzrsJI!6>Jn#~HiXv615famY4~-nX8Mx85iV2|L6_FKXZ|6XEkh6FKJc<1_Z` zTMM^Jdr8TOlX&&hUK9j9hvE4q;CDaQcQVYIJPPW(tuS($ora4)*qhD5^LN9&(qi7OpQOJaj2C>J-x@qf6eAvG}VRo zUQVM2cy*{65yNB?6wz~sKDzx_51RdVaM03-d(rR{mYT0Zy2(q}Xc#2|FK&U3F`k*_kB0?6 zFyP^R&~LGU+}y8_*7X9{PD(=m9v)2EA|T67e?W6#F;-?6L5o`pO2(yu!HdV(Yv+Qt z9vRRq{SgMO|3K}iD5!0Jf$?tVxGu9xV1GmiZA(An?@Gi*<1$iE)J(+c@0T?r2W5Y6 zqFUf{=nWo=t|y;DhhYy-N%IA?yn08>#{a>0du!lEQwvo4UV!Rj_ONQ%V%#{P2kItH z2G?cpL7=879CmTy$L~MR|9v1|s5Q$Ow!eNLJa)E}HRzn@XBx?)Nd|&o(|K$dU5HD? z=AoYYb+A2qpWJqOPqH?z#dDvx<93fDwDQ|%^qo8rbGlCo8)718ZeTWXnvg`(D-%R| zh8^|Tp2AF~TxRbxC!x}r9qe$^1Zs0kn?&mB^TS?eGQ~7);n1&kC_1Of5`(O1j8QhH zziO1w=5P$Gsr^e2R9~g~el>LZ7Du6nuoCXykb%=l-*LLy3kW-~4#N7MK-8?$wD{c( zhBoJL^HrcG`<-b*^kr5gCN6AP>52}M+|cN18r}K4m6tU00*k*Mh*NR^12?o|_0m!z zw{$v2IOc=HUlB(iItWkp%@Uqhn8a6dG^IbeVu8aj3NDGZE&0%-rHK@FpN|l40=>13e ztRwLVB$O#bW_&DX5;Bc*T^<8nV-AV>Un-6&szXN^17Xd`4|GAc4Ar_K%Z^vQrn_)~ zP$R05+ZgBr2HiO1BeXo> ziHj|!fv~NKcD+ibFX!)KmX1P@-*z20f4=~8<4&WI-xktXFb|$@EyMKeR&d`gCmj1s zoR6=L(YrB)tnd6Mx?$vGVc(S3kpAHa>^oG6a~zGR)&ymo8K_L14k=T+WhYo{RWK`i zRt4j`6X~?nDyAS%6plH(9*geYhUgLd>Dtmk7|2g$q3j3xuNWiXXV`H=ryXF^)92Xj z(g0@0vgmyFJ9#%%Mrf2Q$Jg=A1BI9t+(u;iMQ6tfJ3<}cQJ@0TBfl_p-WcfhS_S-v z69v5^)S>!j1m~UQ3W=K$%;NL0Az>26ze>TFsJ)$@HmGW=Q}aRDVhs^k_nk5 zZA5q|fmAP?i^4(yBl(hw!Ntt5gw|Dmaj54POnqC2 z{*jT~Zs{~=R>{P>m(#%W`U)&~Di8e_0g