From 6ffcd4b2825087408cd12881c5265a532f1df27d Mon Sep 17 00:00:00 2001
From: Hanno Becker
Date: Tue, 1 Oct 2024 19:36:39 +0100
Subject: [PATCH] Add 'alternate' strategy for naive interleaving

When `split_heuristic_preprocess_naive_interleaving` is enabled, SLOTHY
preprocesses the input by naively reordering instructions according to
their depths in the data flow graph (DFG).

This commit introduces another naive interleaving strategy, "alternate",
which makes SLOTHY alternate evenly between instructions tagged with
`interleaving_class=0` and `interleaving_class=1`. This is useful when
two sequential blocks of code are to be interleaved as evenly as
possible, as is common in scalar/Neon hybrids.
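
For illustration, the new strategy could be enabled roughly as follows
(the target modules, input file and labels below are placeholders, not
part of this change):

    import slothy
    # Pick whatever Arch/Target modules the kernel is actually written for;
    # an AArch64 scalar/Neon hybrid is assumed here.
    import slothy.targets.aarch64.aarch64_neon as Arch
    import slothy.targets.aarch64.cortex_a55 as Target

    sl = slothy.Slothy(Arch, Target)
    sl.load_source_from_file("hybrid_kernel.s")            # placeholder input

    # Run the split heuristic with naive-interleaving preprocessing and
    # request the new "alternate" strategy introduced by this commit.
    sl.config.split_heuristic = True
    sl.config.split_heuristic_factor = 2
    sl.config.split_heuristic_preprocess_naive_interleaving = True
    sl.config.split_heuristic_preprocess_naive_interleaving_strategy = "alternate"

    sl.optimize(start="kernel_start", end="kernel_end")    # placeholder labels
    sl.write_source_to_file("hybrid_kernel_opt.s")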
---
 slothy/core/config.py     | 16 ++++++++
 slothy/core/heuristics.py | 85 +++++++++++++++++++--------------------
 2 files changed, 58 insertions(+), 43 deletions(-)

diff --git a/slothy/core/config.py b/slothy/core/config.py
index b7d98abb..ad3f387e 100644
--- a/slothy/core/config.py
+++ b/slothy/core/config.py
@@ -619,6 +619,18 @@ def split_heuristic_repeat(self):
                 "Shouldn't read config.split_heuristic_repeat otherwise.")
         return self._split_heuristic_repeat
 
+    @property
+    def split_heuristic_preprocess_naive_interleaving_strategy(self):
+        """Strategy for naive interleaving preprocessing step
+
+        Supported values are:
+        - "depth": Always pick the instruction with the lowest possible
+          depth in the DFG first.
+        - "alternate": Try to evenly alternate between instructions tagged
+          with "interleaving_class=0/1".
+        """
+        return self._split_heuristic_preprocess_naive_interleaving_strategy
+
     def copy(self):
         """Make a deep copy of the configuration"""
         # Temporarily unset references to Arch and Target for deepcopy
@@ -1108,6 +1120,7 @@ def __init__(self, Arch, Target):
         self._split_heuristic_repeat = 1
         self._split_heuristic_preprocess_naive_interleaving = False
         self._split_heuristic_preprocess_naive_interleaving_by_latency = False
+        self._split_heuristic_preprocess_naive_interleaving_strategy = "depth"
         self._split_heuristic_estimate_performance = True
 
         self._compiler_binary = "gcc"
@@ -1303,6 +1316,9 @@ def split_heuristic_preprocess_naive_interleaving(self, val):
     @split_heuristic_preprocess_naive_interleaving_by_latency.setter
     def split_heuristic_preprocess_naive_interleaving_by_latency(self, val):
         self._split_heuristic_preprocess_naive_interleaving_by_latency = val
+    @split_heuristic_preprocess_naive_interleaving_strategy.setter
+    def split_heuristic_preprocess_naive_interleaving_strategy(self, val):
+        self._split_heuristic_preprocess_naive_interleaving_strategy = val
     @split_heuristic_estimate_performance.setter
     def split_heuristic_estimate_performance(self, val):
         self._split_heuristic_estimate_performance = val
diff --git a/slothy/core/heuristics.py b/slothy/core/heuristics.py
index c291d9c9..900b78e4 100644
--- a/slothy/core/heuristics.py
+++ b/slothy/core/heuristics.py
@@ -417,9 +417,7 @@ def _naive_reordering(body, logger, conf, use_latency_depth=False):
         dfg = DFG(body, logger.getChild("dfg"),
                   DFGConfig(conf.copy()), parsing_cb=True)
         insts = [dfg.nodes[i] for i in range(l)]
-        if use_latency_depth is False:
-            depths = [dfg.nodes_by_id[i].depth for i in range(l) ]
-        else:
+        if use_latency_depth is True:
             # Calculate latency-depth of instruction nodes
             nodes_by_depth = dfg.nodes.copy()
             nodes_by_depth.sort(key=lambda t: t.depth)
@@ -434,7 +432,16 @@ def get_latency(tp,t):
                 t.latency_depth = max(map(lambda tp, t=t:
                                           tp.src.latency_depth + get_latency(tp, t),
                                           srcs), default=0)
-            depths = [dfg.nodes_by_id[i].latency_depth for i in range(l) ]
+
+        def get_depth(t):
+            if use_latency_depth is False:
+                pre_depth = t.depth
+            else:
+                pre_depth = t.latency_depth
+            scale = float(t.inst.source_line.tags.get("naive_interleaving_scale", 1.0))
+            return int(pre_depth * scale)
+
+        depths = [get_depth(dfg.nodes_by_id[i]) for i in range(l) ]
 
         inputs = dfg.inputs.copy()
         outputs = conf.outputs.copy()
@@ -449,6 +456,17 @@ def get_outputs(inst):
         joint_prev_inputs = {}
         joint_prev_outputs = {}
 
+        strategy = conf.split_heuristic_preprocess_naive_interleaving_strategy
+
+        def get_interleaving_class(j):
+            return int(insts[j].inst.source_line.tags.get("interleaving_class", 0))
+
+        if strategy == "alternate":
+            # Compute target ratio between code classes
+            sz_0 = max(len(list(filter(lambda j: get_interleaving_class(j) == 0, range(l)))), 1)
+            sz_1 = max(len(list(filter(lambda j: get_interleaving_class(j) == 1, range(l)))), 1)
+            target_ratio = sz_0 / sz_1
+
         for i in range(l):
             cur_joint_prev_inputs = set()
             cur_joint_prev_outputs = set()
@@ -477,50 +495,31 @@ def could_come_next(j):
             def pick_candidate(candidate_idxs):
-                strategy = "minimal_depth"
-
-                if strategy == "minimal_depth":
+                if strategy == "depth":
                     candidate_depths = list(map(lambda j: depths[j], candidate_idxs))
                     logger.debug("Candidate %s: %s", depth_str, candidate_depths)
                     choice_idx = candidate_idxs[candidate_depths.index(min(candidate_depths))]
                 else:
-                    assert strategy == "alternate_functional_units"
-                    def flatten_units(units):
-                        res = []
-                        for u in units:
-                            if isinstance(u,list):
-                                res += u
-                            else:
-                                res.append(u)
-                        return res
-                    def units_disjoint(a,b):
-                        if a is None or b is None:
-                            return True
-                        a = flatten_units(a)
-                        b = flatten_units(b)
-                        return len([x for x in a if x in b]) == 0
-                    def units_different(a,b):
-                        return a != b
-
-                    disjoint_unit_idxs = [ i for i in candidate_idxs
-                                           if units_disjoint(conf.target.get_units(insts[i].inst), last_unit) ]
-                    other_unit_idxs = [ i for i in candidate_idxs
-                                        if units_different(conf.target.get_units(insts[i].inst), last_unit) ]
-
-                    if len(disjoint_unit_idxs) > 0:
-                        choice_idx = random.choice(disjoint_unit_idxs)
-                        last_unit = conf.target.get_units(insts[choice_idx].inst)
-                    elif len(other_unit_idxs) > 0:
-                        choice_idx = random.choice(other_unit_idxs)
-                        last_unit = conf.target.get_units(insts[choice_idx].inst)
+                    assert strategy == "alternate"
+
+                    sz_0 = max(len(list(filter(lambda j: get_interleaving_class(j) == 0, range(i)))), 1)
+                    sz_1 = max(len(list(filter(lambda j: get_interleaving_class(j) == 1, range(i)))), 1)
+
+                    candidates_0 = filter(lambda j: get_interleaving_class(j) == 0, candidate_idxs)
+                    candidates_1 = filter(lambda j: get_interleaving_class(j) == 1, candidate_idxs)
+
+                    current_ratio = sz_0 / sz_1
+
+                    c0 = next(candidates_0, None)
+                    c1 = next(candidates_1, None)
+
+                    if current_ratio > target_ratio and c1 is not None:
+                        choice_idx = c1
+                    elif c0 is not None:
+                        choice_idx = c0
                     else:
-                        candidate_depths = list(map(lambda j: depths[j], candidate_idxs))
-                        logger.debug(f"Candidate {depth_str}s: {candidate_depths}")
-                        min_depth = min(candidate_depths)
-                        refined_candidates = [ candidate_idxs[i]
-                                               for i,d in enumerate(candidate_depths) if d == min_depth ]
-                        choice_idx = random.choice(refined_candidates)
+                        choice_idx = candidate_idxs[0]
 
                 return choice_idx
@@ -786,7 +785,7 @@ def not_empty(x):
         res.output_renamings = { s:s for s in outputs }
         res.valid = True
         res.selfcheck(log.getChild("split_heuristic_full"))
-        
+
         # Estimate performance of final code
         if conf.split_heuristic_estimate_performance:
             conf2 = conf.copy()
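
As an illustration of the "alternate" branch of `pick_candidate` above: the
heuristic keeps choosing class-0 instructions until the ratio of class-0 to
class-1 instructions exceeds the target ratio of the whole block, and then
prefers a class-1 candidate if one is available. The following standalone
sketch models that decision rule in isolation; it ignores dependency
constraints, tracks the classes of the instructions picked so far, and uses
a made-up `classes` list in place of the `interleaving_class` tags:

    # Toy model of the ratio-based alternation; classes[i] stands in for the
    # interleaving_class tag of instruction i in original program order.
    classes = [0, 0, 0, 0, 1, 1]
    target_ratio = classes.count(0) / max(classes.count(1), 1)

    schedule, remaining = [], list(range(len(classes)))
    while remaining:
        # Class counts among the instructions picked so far (clamped to >= 1).
        sz_0 = max(sum(1 for j in schedule if classes[j] == 0), 1)
        sz_1 = max(sum(1 for j in schedule if classes[j] == 1), 1)
        c0 = next((j for j in remaining if classes[j] == 0), None)
        c1 = next((j for j in remaining if classes[j] == 1), None)
        if sz_0 / sz_1 > target_ratio and c1 is not None:
            pick = c1      # class 0 is ahead of the target ratio; take class 1
        elif c0 is not None:
            pick = c0
        else:
            pick = c1
        schedule.append(pick)
        remaining.remove(pick)

    print(schedule)  # [0, 1, 2, 4, 5, 3] for the input above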