@@ -87,10 +87,134 @@ STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
8787STATISTIC (StackSlotMerged, " Number of stack slot merged." );
8888STATISTIC (EscapedAllocas, " Number of allocas that escaped the lifetime region" );
8989
90+ // ===----------------------------------------------------------------------===//
91+ // StackColoring Pass
92+ // ===----------------------------------------------------------------------===//
93+ //
94+ // Stack Coloring reduces stack usage by merging stack slots when they
95+ // can't be used together. For example, consider the following C program:
96+ //
97+ // void bar(char *, int);
98+ // void foo(bool var) {
99+ // A: {
100+ // char z[4096];
101+ // bar(z, 0);
102+ // }
103+ //
104+ // char *p;
105+ // char x[4096];
106+ // char y[4096];
107+ // if (var) {
108+ // p = x;
109+ // } else {
110+ // bar(y, 1);
111+ // p = y + 1024;
112+ // }
113+ // B:
114+ // bar(p, 2);
115+ // }
116+ //
117+ // Naively-compiled, this program would use 12k of stack space. However, the
118+ // stack slot corresponding to `z` is always destroyed before either of the
119+ // stack slots for `x` or `y` are used, and then `x` is only used if `var`
120+ // is true, while `y` is only used if `var` is false. So at no time are any
121+ // two of the stack slots used together, and therefore we can merge them,
122+ // compiling the function using only a single 4k alloca:
123+ //
124+ // void foo(bool var) { // equivalent
125+ // char x[4096];
126+ // char *p;
127+ // bar(x, 0);
128+ // if (var) {
129+ // p = x;
130+ // } else {
131+ // bar(x, 1);
132+ // p = x + 1024;
133+ // }
134+ // bar(p, 2);
135+ // }
136+ //
137+ // This is an important optimization if we want stack space to be under
138+ // control in large functions, both open-coded ones and ones created by
139+ // inlining.
90140//
91141// Implementation Notes:
92142// ---------------------
93143//
144+ // An important part of the above reasoning is that `z` can't be accessed
145+ // while the latter 2 calls to `bar` are running. This is justified because
146+ // `z`'s lifetime is over after we exit from block `A:`, so any further
147+ // accesses to it would be UB. The way we represent this information
148+ // in LLVM is by having frontends delimit blocks with `lifetime.start`
149+ // and `lifetime.end` intrinsics.
150+ //
151+ // The effect of these intrinsics seems to be as follows (maybe I should
152+ // specify this in the reference?):
153+ //
154+ // L1) at start, each stack-slot is marked as *out-of-scope*, unless no
155+ // lifetime intrinsic refers to that stack slot, in which case
156+ // it is marked as *in-scope*.
157+ // L2) on a `lifetime.start`, a stack slot is marked as *in-scope* and
158+ // the stack slot is overwritten with `undef`.
159+ // L3) on a `lifetime.end`, a stack slot is marked as *out-of-scope*.
160+ // L4) on function exit, all stack slots are marked as *out-of-scope*.
161+ // L5) `lifetime.end` is a no-op when called on a slot that is already
162+ // *out-of-scope*.
163+ // L6) memory accesses to *out-of-scope* stack slots are UB.
164+ // L7) when a stack-slot is marked as *out-of-scope*, all pointers to it
165+ // are invalidated, unless the slot is "degenerate". This is used to
166+ // justify not marking slots as in-use until the pointer to them is
167+ // used, but feels a bit hacky in the presence of things like LICM. See
168+ // the "Degenerate Slots" section for more details.
169+ //
170+ // Now, let's ground stack coloring on these rules. We'll define a slot
171+ // as *in-use* at a (dynamic) point in execution if it either can be
172+ // written to at that point, or if it has a live and non-undef content
173+ // at that point.
174+ //
175+ // Obviously, slots that are never *in-use* together can be merged, and
176+ // in our example `foo`, the slots for `x`, `y` and `z` are never
177+ // in-use together (of course, sometimes slots that *are* in-use together
178+ // might still be mergeable, but we don't care about that here).
179+ //
180+ // In this implementation, we successively merge pairs of slots that are
181+ // not *in-use* together. We could be smarter - for example, we could merge
182+ // a single large slot with 2 small slots, or we could construct the
183+ // interference graph and run a "smart" graph coloring algorithm, but with
184+ // that aside, how do we find out whether a pair of slots might be *in-use*
185+ // together?
186+ //
187+ // From our rules, we see that *out-of-scope* slots are never *in-use*,
188+ // and from (L7) we see that "non-degenerate" slots remain non-*in-use*
189+ // until their address is taken. Therefore, we can approximate slot activity
190+ // using dataflow.
191+ //
192+ // A subtle point: naively, we might try to figure out which pairs of
193+ // stack-slots interfere by propagating `S in-use` through the CFG for every
194+ // stack-slot `S`, and having `S` and `T` interfere if there is a CFG point at
195+ // which they are both *in-use*.
196+ //
197+ // That is sound, but overly conservative in some cases: in our (artificial)
198+ // example `foo`, either `x` or `y` might be in use at the label `B:`, but
199+ // as `x` is only in use if we came in from the `var` edge and `y` only
200+ // if we came from the `!var` edge, they still can't be in use together.
201+ // See PR32488 for an important real-life case.
202+ //
203+ // If we wanted to find all points of interference precisely, we could
204+ // propagate `S in-use` and `S&T in-use` predicates through the CFG. That
205+ // would be precise, but requires propagating `O(n^2)` dataflow facts.
206+ //
207+ // However, we aren't interested in the *set* of points of interference
208+ // between 2 stack slots, only *whether* there *is* such a point. So we
209+ // can rely on a little trick: for `S` and `T` to be in-use together,
210+ // one of them needs to become in-use while the other is in-use (or
211+ // they might both become in use simultaneously). We can check this
212+ // by also keeping track of the points at which a stack slot might *start*
213+ // being in-use.
214+ //
215+ // Exact first use:
216+ // ----------------
217+ //
94218// Consider the following motivating example:
95219//
96220// int foo() {
@@ -159,6 +283,9 @@ STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
159283// lifetime, we can additionally overlap b1 and b5, giving us a 3*1024
160284// byte stack (better).
161285//
286+ // Degenerate Slots:
287+ // -----------------
288+ //
162289// Relying entirely on first-use of stack slots is problematic,
163290// however, due to the fact that optimizations can sometimes migrate
164291// uses of a variable outside of its lifetime start/end region. Here
@@ -238,10 +365,6 @@ STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
238365// for "b" then it will appear that 'b' has a degenerate lifetime.
239366//
240367
241- // ===----------------------------------------------------------------------===//
242- // StackColoring Pass
243- // ===----------------------------------------------------------------------===//
244-
245368namespace {
246369// / StackColoring - A machine pass for merging disjoint stack allocations,
247370// / marked by the LIFETIME_START and LIFETIME_END pseudo instructions.
@@ -272,8 +395,11 @@ class StackColoring : public MachineFunctionPass {
272395 // / Maps basic blocks to a serial number.
273396 SmallVector<const MachineBasicBlock*, 8 > BasicBlockNumbering;
274397
275- // / Maps liveness intervals for each slot.
398+ // / Maps slots to their use interval. Outside of this interval, slots'
399+ // / values are either dead or `undef` and they will not be written to.
276400 SmallVector<std::unique_ptr<LiveInterval>, 16 > Intervals;
401+ // / Maps slots to the points where they can become in-use.
402+ SmallVector<SmallVector<SlotIndex, 4 >, 16 > LiveStarts;
277403 // / VNInfo is used for the construction of LiveIntervals.
278404 VNInfo::Allocator VNInfoAllocator;
279405 // / SlotIndex analysis object.
@@ -676,15 +802,22 @@ void StackColoring::calculateLocalLiveness()
676802
677803void StackColoring::calculateLiveIntervals (unsigned NumSlots) {
678804 SmallVector<SlotIndex, 16 > Starts;
679- SmallVector<SlotIndex , 16 > Finishes ;
805+ SmallVector<bool , 16 > DefinitelyInUse ;
680806
681807 // For each block, find which slots are active within this block
682808 // and update the live intervals.
683809 for (const MachineBasicBlock &MBB : *MF) {
684810 Starts.clear ();
685811 Starts.resize (NumSlots);
686- Finishes.clear ();
687- Finishes.resize (NumSlots);
812+ DefinitelyInUse.clear ();
813+ DefinitelyInUse.resize (NumSlots);
814+
815+ // Start the interval of the slots that we previously found to be 'in-use'.
816+ BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
817+ for (int pos = MBBLiveness.LiveIn .find_first (); pos != -1 ;
818+ pos = MBBLiveness.LiveIn .find_next (pos)) {
819+ Starts[pos] = Indexes->getMBBStartIdx (&MBB);
820+ }
688821
689822 // Create the interval for the basic blocks containing lifetime begin/end.
690823 for (const MachineInstr &MI : MBB) {
@@ -696,68 +829,35 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
696829 SlotIndex ThisIndex = Indexes->getInstructionIndex (MI);
697830 for (auto Slot : slots) {
698831 if (IsStart) {
699- if (!Starts[Slot].isValid () || Starts[Slot] > ThisIndex)
832+ // If a slot is already definitely in use, we don't have to emit
833+ // a new start marker because there is already a pre-existing
834+ // one.
835+ if (!DefinitelyInUse[Slot]) {
836+ LiveStarts[Slot].push_back (ThisIndex);
837+ DefinitelyInUse[Slot] = true ;
838+ }
839+ if (!Starts[Slot].isValid ())
700840 Starts[Slot] = ThisIndex;
701841 } else {
702- if (!Finishes[Slot].isValid () || Finishes[Slot] < ThisIndex)
703- Finishes[Slot] = ThisIndex;
842+ if (Starts[Slot].isValid ()) {
843+ VNInfo *VNI = Intervals[Slot]->getValNumInfo (0 );
844+ Intervals[Slot]->addSegment (
845+ LiveInterval::Segment (Starts[Slot], ThisIndex, VNI));
846+ Starts[Slot] = SlotIndex (); // Invalidate the start index
847+ DefinitelyInUse[Slot] = false ;
848+ }
704849 }
705850 }
706851 }
707852
708- // Create the interval of the blocks that we previously found to be 'alive'.
709- BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
710- for (int pos = MBBLiveness.LiveIn .find_first (); pos != -1 ;
711- pos = MBBLiveness.LiveIn .find_next (pos)) {
712- Starts[pos] = Indexes->getMBBStartIdx (&MBB);
713- }
714- for (int pos = MBBLiveness.LiveOut .find_first (); pos != -1 ;
715- pos = MBBLiveness.LiveOut .find_next (pos)) {
716- Finishes[pos] = Indexes->getMBBEndIdx (&MBB);
717- }
718-
853+ // Finish up started segments
719854 for (unsigned i = 0 ; i < NumSlots; ++i) {
720- //
721- // When LifetimeStartOnFirstUse is turned on, data flow analysis
722- // is forward (from starts to ends), not bidirectional. A
723- // consequence of this is that we can wind up in situations
724- // where Starts[i] is invalid but Finishes[i] is valid and vice
725- // versa. Example:
726- //
727- // LIFETIME_START x
728- // if (...) {
729- // <use of x>
730- // throw ...;
731- // }
732- // LIFETIME_END x
733- // return 2;
734- //
735- //
736- // Here the slot for "x" will not be live into the block
737- // containing the "return 2" (since lifetimes start with first
738- // use, not at the dominating LIFETIME_START marker).
739- //
740- if (Starts[i].isValid () && !Finishes[i].isValid ()) {
741- Finishes[i] = Indexes->getMBBEndIdx (&MBB);
742- }
743855 if (!Starts[i].isValid ())
744856 continue ;
745857
746- assert (Starts[i] && Finishes[i] && " Invalid interval" );
747- VNInfo *ValNum = Intervals[i]->getValNumInfo (0 );
748- SlotIndex S = Starts[i];
749- SlotIndex F = Finishes[i];
750- if (S < F) {
751- // We have a single consecutive region.
752- Intervals[i]->addSegment (LiveInterval::Segment (S, F, ValNum));
753- } else {
754- // We have two non-consecutive regions. This happens when
755- // LIFETIME_START appears after the LIFETIME_END marker.
756- SlotIndex NewStart = Indexes->getMBBStartIdx (&MBB);
757- SlotIndex NewFin = Indexes->getMBBEndIdx (&MBB);
758- Intervals[i]->addSegment (LiveInterval::Segment (NewStart, F, ValNum));
759- Intervals[i]->addSegment (LiveInterval::Segment (S, NewFin, ValNum));
760- }
858+ SlotIndex EndIdx = Indexes->getMBBEndIdx (&MBB);
859+ VNInfo *VNI = Intervals[i]->getValNumInfo (0 );
860+ Intervals[i]->addSegment (LiveInterval::Segment (Starts[i], EndIdx, VNI));
761861 }
762862 }
763863}
@@ -987,6 +1087,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
9871087 BasicBlockNumbering.clear ();
9881088 Markers.clear ();
9891089 Intervals.clear ();
1090+ LiveStarts.clear ();
9901091 VNInfoAllocator.Reset ();
9911092
9921093 unsigned NumSlots = MFI->getObjectIndexEnd ();
@@ -998,6 +1099,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
9981099 SmallVector<int , 8 > SortedSlots;
9991100 SortedSlots.reserve (NumSlots);
10001101 Intervals.reserve (NumSlots);
1102+ LiveStarts.resize (NumSlots);
10011103
10021104 unsigned NumMarkers = collectMarkers (NumSlots);
10031105
@@ -1069,6 +1171,9 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
10691171 return MFI->getObjectSize (LHS) > MFI->getObjectSize (RHS);
10701172 });
10711173
1174+ for (auto &s : LiveStarts)
1175+ std::sort (s.begin (), s.end ());
1176+
10721177 bool Changed = true ;
10731178 while (Changed) {
10741179 Changed = false ;
@@ -1084,12 +1189,22 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
10841189 int SecondSlot = SortedSlots[J];
10851190 LiveInterval *First = &*Intervals[FirstSlot];
10861191 LiveInterval *Second = &*Intervals[SecondSlot];
1192+ auto &FirstS = LiveStarts[FirstSlot];
1193+ auto &SecondS = LiveStarts[SecondSlot];
10871194 assert (!First->empty () && !Second->empty () && " Found an empty range" );
10881195
1089- // Merge disjoint slots.
1090- if (!First->overlaps (*Second)) {
1196+ // Merge disjoint slots. This is a little bit tricky - see the
1197+ // Implementation Notes section for an explanation.
1198+ if (!First->isLiveAtIndexes (SecondS) &&
1199+ !Second->isLiveAtIndexes (FirstS)) {
10911200 Changed = true ;
10921201 First->MergeSegmentsInAsValue (*Second, First->getValNumInfo (0 ));
1202+
1203+ int OldSize = FirstS.size ();
1204+ FirstS.append (SecondS.begin (), SecondS.end ());
1205+ auto Mid = FirstS.begin () + OldSize;
1206+ std::inplace_merge (FirstS.begin (), Mid, FirstS.end ());
1207+
10931208 SlotRemap[SecondSlot] = FirstSlot;
10941209 SortedSlots[J] = -1 ;
10951210 DEBUG (dbgs ()<<" Merging #" <<FirstSlot<<" and slots #" <<
0 commit comments