From 6ffcd4b2825087408cd12881c5265a532f1df27d Mon Sep 17 00:00:00 2001
From: Hanno Becker
Date: Tue, 1 Oct 2024 19:36:39 +0100
Subject: [PATCH] Add 'alternate' strategy for naive interleaving

When `split_heuristic_preprocess_naive_interleaving` is enabled, SLOTHY
preprocesses the input by naively reordering instructions according to
their depths in the data flow graph (DFG).

This commit introduces another naive interleaving strategy, "alternate",
which makes SLOTHY alternate evenly between instructions tagged with
`interleaving_class=0` and `interleaving_class=1`. This is useful when
two sequential blocks of code are to be interleaved as evenly as
possible, as is common in scalar/Neon hybrids.
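
For illustration, the new strategy could be enabled roughly as follows
(the target modules, input file and labels below are placeholders, not
part of this change):

    import slothy
    # Pick whatever Arch/Target modules the kernel is actually written for;
    # an AArch64 scalar/Neon hybrid is assumed here.
    import slothy.targets.aarch64.aarch64_neon as Arch
    import slothy.targets.aarch64.cortex_a55 as Target

    sl = slothy.Slothy(Arch, Target)
    sl.load_source_from_file("hybrid_kernel.s")            # placeholder input

    # Run the split heuristic with naive-interleaving preprocessing and
    # request the new "alternate" strategy introduced by this commit.
    sl.config.split_heuristic = True
    sl.config.split_heuristic_factor = 2
    sl.config.split_heuristic_preprocess_naive_interleaving = True
    sl.config.split_heuristic_preprocess_naive_interleaving_strategy = "alternate"

    sl.optimize(start="kernel_start", end="kernel_end")    # placeholder labels
    sl.write_source_to_file("hybrid_kernel_opt.s")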
---
 slothy/core/config.py     | 16 ++++++++
 slothy/core/heuristics.py | 85 +++++++++++++++++++--------------------
 2 files changed, 58 insertions(+), 43 deletions(-)

diff --git a/slothy/core/config.py b/slothy/core/config.py
index b7d98abb..ad3f387e 100644
--- a/slothy/core/config.py
+++ b/slothy/core/config.py
@@ -619,6 +619,18 @@ def split_heuristic_repeat(self):
                 "Shouldn't read config.split_heuristic_repeat otherwise.")
         return self._split_heuristic_repeat
 
+    @property
+    def split_heuristic_preprocess_naive_interleaving_strategy(self):
+        """Strategy for naive interleaving preprocessing step
+
+        Supported values are:
+        - "depth": Always pick the instruction with the lowest possible
+          depth in the DFG first.
+        - "alternate": Try to evenly alternate between instructions tagged
+          with "interleaving_class=0/1".
+        """
+        return self._split_heuristic_preprocess_naive_interleaving_strategy
+
     def copy(self):
         """Make a deep copy of the configuration"""
         # Temporarily unset references to Arch and Target for deepcopy
@@ -1108,6 +1120,7 @@ def __init__(self, Arch, Target):
         self._split_heuristic_repeat = 1
         self._split_heuristic_preprocess_naive_interleaving = False
         self._split_heuristic_preprocess_naive_interleaving_by_latency = False
+        self._split_heuristic_preprocess_naive_interleaving_strategy = "depth"
         self._split_heuristic_estimate_performance = True
 
         self._compiler_binary = "gcc"
@@ -1303,6 +1316,9 @@ def split_heuristic_preprocess_naive_interleaving(self, val):
     @split_heuristic_preprocess_naive_interleaving_by_latency.setter
     def split_heuristic_preprocess_naive_interleaving_by_latency(self, val):
         self._split_heuristic_preprocess_naive_interleaving_by_latency = val
+    @split_heuristic_preprocess_naive_interleaving_strategy.setter
+    def split_heuristic_preprocess_naive_interleaving_strategy(self, val):
+        self._split_heuristic_preprocess_naive_interleaving_strategy = val
     @split_heuristic_estimate_performance.setter
     def split_heuristic_estimate_performance(self, val):
         self._split_heuristic_estimate_performance = val
diff --git a/slothy/core/heuristics.py b/slothy/core/heuristics.py
index c291d9c9..900b78e4 100644
--- a/slothy/core/heuristics.py
+++ b/slothy/core/heuristics.py
@@ -417,9 +417,7 @@ def _naive_reordering(body, logger, conf, use_latency_depth=False):
         dfg = DFG(body, logger.getChild("dfg"),
                   DFGConfig(conf.copy()), parsing_cb=True)
         insts = [dfg.nodes[i] for i in range(l)]
-        if use_latency_depth is False:
-            depths = [dfg.nodes_by_id[i].depth for i in range(l) ]
-        else:
+        if use_latency_depth is True:
             # Calculate latency-depth of instruction nodes
             nodes_by_depth = dfg.nodes.copy()
             nodes_by_depth.sort(key=lambda t: t.depth)
@@ -434,7 +432,16 @@ def get_latency(tp,t):
                 t.latency_depth = max(map(lambda tp, t=t:
                                           tp.src.latency_depth + get_latency(tp, t),
                                           srcs), default=0)
-            depths = [dfg.nodes_by_id[i].latency_depth for i in range(l) ]
+
+        def get_depth(t):
+            if use_latency_depth is False:
+                pre_depth = t.depth
+            else:
+                pre_depth = t.latency_depth
+            scale = float(t.inst.source_line.tags.get("naive_interleaving_scale", 1.0))
+            return int(pre_depth * scale)
+
+        depths = [get_depth(dfg.nodes_by_id[i]) for i in range(l) ]
 
         inputs = dfg.inputs.copy()
         outputs = conf.outputs.copy()
@@ -449,6 +456,17 @@ def get_outputs(inst):
         joint_prev_inputs = {}
         joint_prev_outputs = {}
 
+        strategy = conf.split_heuristic_preprocess_naive_interleaving_strategy
+
+        def get_interleaving_class(j):
+            return int(insts[j].inst.source_line.tags.get("interleaving_class", 0))
+
+        if strategy == "alternate":
+            # Compute target ratio between code classes
+            sz_0 = max(len(list(filter(lambda j: get_interleaving_class(j) == 0, range(l)))), 1)
+            sz_1 = max(len(list(filter(lambda j: get_interleaving_class(j) == 1, range(l)))), 1)
+            target_ratio = sz_0 / sz_1
+
         for i in range(l):
             cur_joint_prev_inputs = set()
             cur_joint_prev_outputs = set()
@@ -477,50 +495,31 @@ def could_come_next(j):
             def pick_candidate(candidate_idxs):
-                strategy = "minimal_depth"
-
-                if strategy == "minimal_depth":
+                if strategy == "depth":
                     candidate_depths = list(map(lambda j: depths[j], candidate_idxs))
                     logger.debug("Candidate %s: %s", depth_str, candidate_depths)
                     choice_idx = candidate_idxs[candidate_depths.index(min(candidate_depths))]
                 else:
-                    assert strategy == "alternate_functional_units"
-                    def flatten_units(units):
-                        res = []
-                        for u in units:
-                            if isinstance(u,list):
-                                res += u
-                            else:
-                                res.append(u)
-                        return res
-                    def units_disjoint(a,b):
-                        if a is None or b is None:
-                            return True
-                        a = flatten_units(a)
-                        b = flatten_units(b)
-                        return len([x for x in a if x in b]) == 0
-                    def units_different(a,b):
-                        return a != b
-
-                    disjoint_unit_idxs = [ i for i in candidate_idxs
-                                           if units_disjoint(conf.target.get_units(insts[i].inst), last_unit) ]
-                    other_unit_idxs = [ i for i in candidate_idxs
-                                        if units_different(conf.target.get_units(insts[i].inst), last_unit) ]
-
-                    if len(disjoint_unit_idxs) > 0:
-                        choice_idx = random.choice(disjoint_unit_idxs)
-                        last_unit = conf.target.get_units(insts[choice_idx].inst)
-                    elif len(other_unit_idxs) > 0:
-                        choice_idx = random.choice(other_unit_idxs)
-                        last_unit = conf.target.get_units(insts[choice_idx].inst)
+                    assert strategy == "alternate"
+
+                    sz_0 = max(len(list(filter(lambda j: get_interleaving_class(j) == 0, range(i)))), 1)
+                    sz_1 = max(len(list(filter(lambda j: get_interleaving_class(j) == 1, range(i)))), 1)
+
+                    candidates_0 = filter(lambda j: get_interleaving_class(j) == 0, candidate_idxs)
+                    candidates_1 = filter(lambda j: get_interleaving_class(j) == 1, candidate_idxs)
+
+                    current_ratio = sz_0 / sz_1
+
+                    c0 = next(candidates_0, None)
+                    c1 = next(candidates_1, None)
+
+                    if current_ratio > target_ratio and c1 is not None:
+                        choice_idx = c1
+                    elif c0 is not None:
+                        choice_idx = c0
                     else:
-                        candidate_depths = list(map(lambda j: depths[j], candidate_idxs))
-                        logger.debug(f"Candidate {depth_str}s: {candidate_depths}")
-                        min_depth = min(candidate_depths)
-                        refined_candidates = [ candidate_idxs[i]
-                                               for i,d in enumerate(candidate_depths) if d == min_depth ]
-                        choice_idx = random.choice(refined_candidates)
+                        choice_idx = candidate_idxs[0]
 
                 return choice_idx
@@ -786,7 +785,7 @@ def not_empty(x):
         res.output_renamings = { s:s for s in outputs }
         res.valid = True
         res.selfcheck(log.getChild("split_heuristic_full"))
-        
+
         # Estimate performance of final code
         if conf.split_heuristic_estimate_performance:
             conf2 = conf.copy()
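
As an illustration of the "alternate" branch of `pick_candidate` above: the
heuristic keeps choosing class-0 instructions until the ratio of class-0 to
class-1 instructions exceeds the target ratio of the whole block, and then
prefers a class-1 candidate if one is available. The following standalone
sketch models that decision rule in isolation; it ignores dependency
constraints, tracks the classes of the instructions picked so far, and uses
a made-up `classes` list in place of the `interleaving_class` tags:

    # Toy model of the ratio-based alternation; classes[i] stands in for the
    # interleaving_class tag of instruction i in original program order.
    classes = [0, 0, 0, 0, 1, 1]
    target_ratio = classes.count(0) / max(classes.count(1), 1)

    schedule, remaining = [], list(range(len(classes)))
    while remaining:
        # Class counts among the instructions picked so far (clamped to >= 1).
        sz_0 = max(sum(1 for j in schedule if classes[j] == 0), 1)
        sz_1 = max(sum(1 for j in schedule if classes[j] == 1), 1)
        c0 = next((j for j in remaining if classes[j] == 0), None)
        c1 = next((j for j in remaining if classes[j] == 1), None)
        if sz_0 / sz_1 > target_ratio and c1 is not None:
            pick = c1      # class 0 is ahead of the target ratio; take class 1
        elif c0 is not None:
            pick = c0
        else:
            pick = c1
        schedule.append(pick)
        remaining.remove(pick)

    print(schedule)  # [0, 1, 2, 4, 5, 3] for the input above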