diff --git a/.empty b/.empty deleted file mode 100644 index e69de29b..00000000 diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 00000000..8081fdc5 --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,64 @@ +# Derived from regalloc.rs' GitHub CI config file. + +name: Rust + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + # Lint code with rustfmt, report an error if it needs to be run. + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Install rustfmt + run: rustup component add rustfmt + - name: Run rustfmt and check there's no difference + run: cargo fmt --all -- --check + + # Make sure the code compiles and that all the tests pass. + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Build + run: cargo build + - name: Run tests + run: cargo test --all --verbose + + # Lint dependency graph for security advisories, duplicate versions, and + # incompatible licences. + cargo_deny: + name: Cargo deny + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: true + - run: | + set -e + curl -L https://github.com/EmbarkStudios/cargo-deny/releases/download/0.8.5/cargo-deny-0.8.5-x86_64-unknown-linux-musl.tar.gz | tar xzf - + mv cargo-deny-*-x86_64-unknown-linux-musl/cargo-deny cargo-deny + echo `pwd` >> $GITHUB_PATH + - run: cargo deny check + + # Builds the fuzz targets. + fuzz: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Install nightly + run: rustup toolchain install nightly + - name: Install cargo-fuzz + run: cargo +nightly install cargo-fuzz + - name: Build ssagen fuzzing target + run: cargo +nightly fuzz build ssagen + - name: Build moves fuzzing target + run: cargo +nightly fuzz build moves + - name: Build ion fuzzing target + run: cargo +nightly fuzz build ion + - name: Build and smoke-test ion_checker fuzzing target + run: cargo +nightly fuzz run ion_checker ./fuzz/smoketest/ion_checker.bin diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..aadc1161 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +Cargo.lock +target/ +.*.swp +*~ diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..802881be --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "regalloc2" +version = "0.0.1" +authors = ["Chris Fallin ", "Mozilla SpiderMonkey Developers"] +edition = "2018" +license = "Apache-2.0 WITH LLVM-exception AND MPL-2.0" +description = "Backtracking register allocator inspired from IonMonkey" +repository = "https://github.com/bytecodealliance/regalloc2" + +[dependencies] +log = { version = "0.4.8", default-features = false } +smallvec = "1.6.1" +fxhash = "0.2.1" + +# The below are only needed for fuzzing. +# Keep this in sync with libfuzzer_sys's crate version: +arbitrary = { version = "^0.4.6", optional = true } + +[profile.release] +debug = true + +[features] +default = [] +fuzzing = ["arbitrary"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..f9d81955 --- /dev/null +++ b/LICENSE @@ -0,0 +1,220 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ + +--- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. + diff --git a/README.md b/README.md new file mode 100644 index 00000000..a160ed3f --- /dev/null +++ b/README.md @@ -0,0 +1,33 @@ +## regalloc2: another register allocator + +This is a register allocator that started life as, and is about 50% +still, a port of IonMonkey's backtracking register allocator to +Rust. In many regards, it has been generalized, optimized, and +improved since the initial port, and now supports both SSA and non-SSA +use-cases. + +In addition, it contains substantial amounts of testing infrastructure +(fuzzing harnesses and checkers) that does not exist in the original +IonMonkey allocator. + +See the [design overview](doc/DESIGN.md) for (much!) more detail on +how the allocator works. + +## License + +Unless otherwise specified, code in this crate is licensed under the Apache 2.0 +License with LLVM Exception. This license text can be found in the file +`LICENSE`. + +Files in the `src/ion/` directory are directly ported from original C++ code in +IonMonkey, a part of the Firefox codebase. Parts of `src/lib.rs` are also +definitions that are directly translated from this original code. As a result, +these files are derivative works and are covered by the Mozilla Public License +(MPL) 2.0, as described in license headers in those files. Please see the +notices in relevant files for links to the original IonMonkey source files from +which they have been translated/derived. The MPL text can be found in +`src/ion/LICENSE`. + +Parts of the code are derived from regalloc.rs: in particular, +`src/checker.rs` and `src/domtree.rs`. This crate has the same license +as regalloc.rs, so the license on these files does not differ. diff --git a/doc/DESIGN.md b/doc/DESIGN.md new file mode 100644 index 00000000..ccb28006 --- /dev/null +++ b/doc/DESIGN.md @@ -0,0 +1,1650 @@ +# regalloc2 Design Overview + +This document describes the basic architecture of the regalloc2 +register allocator. It describes the externally-visible interface +(input CFG, instructions, operands, with their invariants; meaning of +various parts of the output); core data structures; and the allocation +pipeline, or series of algorithms that compute an allocation. It ends +with a description of future work and expectations, as well as an +appendix that notes design influences and similarities to the +IonMonkey backtracking allocator. + +# API, Input IR and Invariants + +The toplevel API to regalloc2 consists of a single entry point `run()` +that takes a register environment, which specifies all physical +registers, and the input program. 
The function returns either an error +or an `Output` struct that provides allocations for each operand and a +vector of additional instructions (moves, loads, stores) to insert. + +## Register Environment + +The allocator takes a `MachineEnv` which specifies, for each of the +two register classes `Int` and `Float`, a vector of `PReg`s by index. A +`PReg` is nothing more than the class and index within the class; the +allocator does not need to know anything more. + +The `MachineEnv` provides a vector of preferred and non-preferred +physical registers per class. Any register not in either vector will +not be allocated. Usually, registers that do not need to be saved in +the prologue if used (i.e., caller-save registers) are given in the +"preferred" vector. The environment also provides exactly one scratch +register per class. This register must not be in the preferred or +non-preferred vectors, and is used whenever a set of moves that need +to occur logically in parallel have a cycle (for a simple example, +consider a swap `r0, r1 := r1, r0`). + +With some more work, we could potentially remove the need for the +scratch register by requiring support for an additional edit type from +the client ("swap"), but we have not pursued this. + +## CFG and Instructions + +The allocator operates on an input program that is in a standard CFG +representation: the function body is a sequence of basic blocks, and +each block has a sequence of instructions and zero or more +successors. The allocator also requires the client to provide +predecessors for each block, and these must be consistent with the +successors. + +Instructions are opaque to the allocator except for a few important +bits: (1) `is_ret` (is a return instruction); (2) `is_branch` (is a +branch instruction); (3) `is_call` (is a call instruction, for +heuristic purposes only), (4) `is_move` (is a move between registers), +and (5) a vector of Operands, covered below. Every block must end in a +return or branch. + +Both instructions and blocks are named by indices in contiguous index +spaces. A block's instructions must be a contiguous range of +instruction indices, and block i's first instruction must come +immediately after block i-1's last instruction. + +The CFG must have *no critical edges*. A critical edge is an edge from +block A to block B such that A has more than one successor *and* B has +more than one predecessor. For this definition, the entry block has an +implicit predecessor, and any block that ends in a return has an +implicit successor. + +Note that there are *no* requirements related to the ordering of +blocks, and there is no requirement that the control flow be +reducible. Some *heuristics* used by the allocator will perform better +if the code is reducible and ordered in reverse postorder (RPO), +however: in particular, (1) this interacts better with the +contiguous-range-of-instruction-indices live range representation that +we use, and (2) the "approximate loop depth" metric will actually be +exact if both these conditions are met. + +## Operands and VRegs + +Every instruction operates on values by way of `Operand`s. An operand +consists of the following fields: + +- VReg, or virtual register. *Every* operand mentions a virtual + register, even if it is constrained to a single physical register in + practice. This is because we track liveranges uniformly by vreg. + +- Policy, or "constraint". Every reference to a vreg can apply some + constraint to the vreg at that point in the program. 
Valid policies are: + + - Any location; + - Any register of the vreg's class; + - Any stack slot; + - A particular fixed physical register; or + - For a def (output), a *reuse* of an input register. + +- The "kind" of reference to this vreg: Def, Use, Mod. A def + (definition) writes to the vreg, and disregards any possible earlier + value. A mod (modify) reads the current value then writes a new + one. A use simply reads the vreg's value. + +- The position: before or after the instruction. + - Note that to have a def (output) register available in a way that + does not conflict with inputs, the def should be placed at the + "before" position. Similarly, to have a use (input) register + available in a way that does not conflict with outputs, the use + should be placed at the "after" position. + +This operand-specification design allows for SSA and non-SSA code (see +section below for details). + +VRegs, or virtual registers, are specified by an index and a register +class (Float or Int). The classes are not given separately; they are +encoded on every mention of the vreg. (In a sense, the class is an +extra index bit, or part of the register name.) The input function +trait does require the client to provide the exact vreg count, +however. + +Implementation note: both vregs and operands are bit-packed into +u32s. This is essential for memory-efficiency. As a result of the +operand bit-packing in particular (including the policy constraints!), +the allocator supports up to 2^20 (1M) vregs per function, and 2^5 +(32) physical registers per class. Later we will also see a limit of +2^20 (1M) instructions per function. These limits are considered +sufficient for the anticipated use-cases (e.g., compiling Wasm, which +also has function-size implementation limits); for larger functions, +it is likely better to use a simpler register allocator in any case. + +## Reuses and Two-Address ISAs + +Some instruction sets primarily have instructions that name only two +registers for a binary operator, rather than three: both registers are +inputs, and the result is placed in one of the registers, clobbering +its original value. The most well-known modern example is x86. It is +thus imperative that we support this pattern well in the register +allocator. + +This instruction-set design is somewhat at odds with an SSA +representation, where a value cannot be redefined. Even in non-SSA +code, it is awkward to overwrite a vreg that may need to be used again +later. + +Thus, the allocator supports a useful fiction of sorts: the +instruction can be described as if it has three register mentions -- +two inputs and a separate output -- and neither input will be +clobbered. The output, however, is special: its register-placement +policy is "reuse input i" (where i == 0 or 1). The allocator +guarantees that the register assignment for that input and the output +will be the same, so the instruction can use that register as its +"modifies" operand. If the input is needed again later, the allocator +will take care of the necessary copying. + +We will see below how the allocator makes this work by doing some +preprocessing so that the core allocation algorithms do not need to +worry about this constraint. + +Note that some non-SSA clients, such as Cranelift using the +regalloc.rs-to-regalloc2 compatibility shim, will instead generate +their own copies (copying to the output vreg first) and then use "mod" +operand kinds, which allow the output vreg to be both read and +written. 
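+
+To make the reused-input idiom concrete, the sketch below describes a
+two-address `v2 = add v0, v1` with three operands, where the def is
+constrained to reuse input 0's allocation. The type and constructor
+names here (`Policy`, `Kind`, `two_address_add`) are assumptions made
+for this illustration, not regalloc2's exact API surface.
+
+```rust
+// Illustrative only: a simplified operand model for the example.
+#[derive(Clone, Copy)]
+struct VReg(u32);
+
+#[derive(Clone, Copy)]
+enum Policy {
+    Reg,          // any register of the vreg's class
+    Reuse(usize), // reuse the allocation of input operand `i`
+}
+
+#[derive(Clone, Copy)]
+enum Kind {
+    Use,
+    Def,
+}
+
+#[derive(Clone, Copy)]
+struct Operand {
+    vreg: VReg,
+    policy: Policy,
+    kind: Kind,
+}
+
+/// Describe `v2 = add v0, v1` for a two-address ISA (e.g. x86 `add`):
+/// three operands are reported, but the def reuses input 0's register,
+/// so the emitted instruction only ever names two registers.
+fn two_address_add(v0: VReg, v1: VReg, v2: VReg) -> Vec<Operand> {
+    vec![
+        Operand { vreg: v0, policy: Policy::Reg, kind: Kind::Use },
+        Operand { vreg: v1, policy: Policy::Reg, kind: Kind::Use },
+        Operand { vreg: v2, policy: Policy::Reuse(0), kind: Kind::Def },
+    ]
+}
+```
+
+If `v0` is still live after this instruction, the allocator itself
+inserts whatever copy is needed to preserve it; the client never has
+to reason about the clobber.
+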
regalloc2 works hard to make this mod-operand approach as efficient as the
+reused-input scheme by treating moves specially (see below).
+
+## SSA
+
+regalloc2 was originally designed to take an SSA IR as input, where
+the usual definitions apply: every vreg is defined exactly once, and
+every vreg use is dominated by its one def. (Using blockparams means
+that we do not need additional conditions for phi-nodes.)
+
+The allocator then evolved to support non-SSA inputs as well. As a
+result, the input is maximally flexible right now: the allocator does
+not check for, enforce, or take advantage of the single-def
+rule. However, blockparams are still available.
+
+In the future, we hope to change this, however, once compilation of
+non-SSA inputs is no longer needed. Specifically, if we can migrate
+Cranelift to the native regalloc2 API rather than the regalloc.rs
+compatibility shim, we will be able to remove "mod" operand kinds,
+assume (and verify) single defs, and take advantage of this when
+reasoning about various algorithms in the allocator.
+
+## Block Parameters
+
+Every block can have *block parameters*, and a branch to a block with
+block parameters must provide values for those parameters via
+operands. When a branch has more than one successor, it provides
+separate operands for each possible successor. These block parameters
+are equivalent to phi-nodes; we chose this representation because they
+are in many ways a more consistent representation of SSA.
+
+To see why we believe block parameters are a slightly nicer design
+choice than use of phi nodes, consider: phis are special
+pseudoinstructions that must come first in a block, are all defined in
+parallel, and whose uses occur on the edge of a particular
+predecessor. All of these facts complicate any analysis that scans
+instructions and reasons about uses and defs. It is much closer to the
+truth to actually put those uses *in* the predecessor, on the branch,
+and put all the defs at the top of the block as a separate kind of
+def. The tradeoff is that a vreg's def now has two possibilities --
+ordinary instruction def or blockparam def -- but this is fairly
+reasonable to handle.
+
+## Non-SSA
+
+As mentioned, regalloc2 supports non-SSA inputs as well. No special
+flag is needed to place the allocator in this mode or disable SSA
+verification. However, we hope to eventually remove this functionality
+when it is no longer needed.
+
+## Program Moves
+
+As an especially useful feature for non-SSA IR, regalloc2 supports
+special handling of "move" instructions: it will try to merge the
+input and output allocations to elide the move altogether.
+
+It turns out that moves are used frequently in the non-SSA input that
+we observe from Cranelift via the regalloc.rs compatibility shim. They
+are used in three different ways:
+
+- Moves to or from physical registers, used to implement ABI details
+  or place values in particular registers required by certain
+  instructions.
+- Moves between vregs on program edges, as lowered from phi/blockparam
+  dataflow in the higher-level SSA IR (CLIF).
+- Moves just prior to two-address-form instructions that modify an
+  input to form an output: the input is moved to the output vreg to
+  avoid clobbering the input.
+
+Note that, strictly speaking, special handling of program moves is
+redundant because each of these kinds of uses has an equivalent in the
+"native" regalloc2 API:
+
+- Moves to/from physical registers can become operand constraints,
+  either on a particular instruction that requires/produces the values
+  in certain registers (e.g., a call or ret with args/results in regs,
+  or a special instruction with fixed register args), or on a ghost
+  instruction at the top of the function that defs vregs for all in-reg
+  args.
+
+- Moves between vregs as a lowering of blockparams/phi nodes can be
+  replaced with use of regalloc2's native blockparam support.
+
+- Moves prior to two-address-form instructions can be replaced with
+  the reused-input mechanism.
+
+Thus, eventually, special handling of program moves should be
+removed. However, it is very important for performance at the moment.
+
+## Output
+
+The allocator produces two main data structures as output: an array of
+`Allocation`s and a sequence of edits. Some other data, such as
+stackmap slot info, is also provided.
+
+### Allocations
+
+The allocator provides an array of `Allocation` values, one per
+`Operand`. Each `Allocation` has a kind and an index. The kind may
+indicate that this is a physical register or a stack slot, and the
+index gives the respective register or slot. All allocations will
+conform to the constraints given, and will faithfully preserve the
+dataflow of the input program.
+
+### Inserted Moves
+
+In order to implement the necessary movement of data between
+allocations, the allocator needs to insert moves at various program
+points.
+
+The vector of inserted moves contains tuples that name a program point
+and an "edit". The edit is either a move, from one `Allocation` to
+another, or else a kind of metadata used by the checker to know which
+VReg is live in a given allocation at any particular time. The latter
+sort of edit can be ignored by a backend that is just interested in
+generating machine code.
+
+Note that the allocator will never generate a move from one stackslot
+directly to another, by design. Instead, if it needs to do so, it will
+make use of the scratch register. (Sometimes such a move occurs when
+the scratch register is already holding a value, e.g. to resolve a
+cycle of moves; in this case, it will allocate another spillslot and
+spill the original scratch value around the move.)
+
+Thus, the single "edit" type can become either a register-to-register
+move, a load from a stackslot into a register, or a store from a
+register into a stackslot.
+
+# Data Structures
+
+We now review the data structures that regalloc2 uses to track its
+state.
+
+## Program-Derived Alloc-Invariant Data
+
+There are a number of data structures that are computed in a
+deterministic way from the input program and then subsequently used
+only as read-only data during the core allocation procedure.
+
+### Livein/Liveout Bitsets
+
+The livein and liveout bitsets (`liveins` and `liveouts` on the `Env`)
+are allocated one per basic block and record, per block, which vregs
+are live entering and leaving that block. They are computed using a
+standard backward iterative dataflow analysis and are exact; they do
+not over-approximate (this turns out to be important for performance,
+and is also necessary for correctness in the case of stackmaps).
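+
+The backward dataflow itself is the textbook fixpoint
+`live_in = uses ∪ (live_out \ defs)`. The sketch below shows the shape
+of that computation; the per-block `uses`/`defs`/`succs` inputs and the
+use of plain `HashSet`s are assumptions made for this example, whereas
+the allocator's real implementation uses the chunked bitvec
+representation mentioned below. Either way the result is exact, not an
+over-approximation.
+
+```rust
+use std::collections::HashSet;
+
+/// Textbook backward liveness fixpoint. `uses[b]` holds vregs read in
+/// block `b` before any def in that block; `defs[b]` holds vregs
+/// written in `b`; `succs[b]` lists successor block indices.
+fn compute_liveness(
+    num_blocks: usize,
+    uses: &[HashSet<u32>],
+    defs: &[HashSet<u32>],
+    succs: &[Vec<usize>],
+) -> (Vec<HashSet<u32>>, Vec<HashSet<u32>>) {
+    let mut live_in = vec![HashSet::new(); num_blocks];
+    let mut live_out = vec![HashSet::new(); num_blocks];
+    let mut changed = true;
+    while changed {
+        changed = false;
+        // Visiting blocks in reverse order converges quickly.
+        for b in (0..num_blocks).rev() {
+            let mut out: HashSet<u32> = HashSet::new();
+            for &s in &succs[b] {
+                out.extend(live_in[s].iter().copied());
+            }
+            // live_in[b] = uses[b] union (live_out[b] minus defs[b])
+            let mut inn: HashSet<u32> = uses[b].clone();
+            inn.extend(out.difference(&defs[b]).copied());
+            if inn != live_in[b] || out != live_out[b] {
+                live_in[b] = inn;
+                live_out[b] = out;
+                changed = true;
+            }
+        }
+    }
+    (live_in, live_out)
+}
+```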
+ +### Blockparam Vectors: Source-Side and Dest-Side + +The initialization stage scans the input program and produces two +vectors that represent blockparam flows from branches to destination +blocks: `blockparam_ins` and `blockparam_outs`. + +These two vectors are the first instance we will see of a recurring +pattern: the vectors contain tuples that are carefully ordered in a +way such that their sort-order is meaningful. "Build a vector lazily +then sort" is a common idiom: it batches the O(n log n) cost into one +operation that the stdlib has aggressively optimized, it provides +dense storage, and it allows for a scan in a certain order that often +lines up with a scan over the program. + +In this particular case, we will build vectors of (vreg, block) points +that are meaningful either at the start or end of a block, so that +later, when we scan over a particular vreg's allocations in block +order, we can generate another vector of allocations. One side (the +"outs") also contains enough information that it can line up with the +other side (the "ins") in a later sort. + +To make this work, `blockparam_ins` contains a vector of (to-vreg, +to-block, from-block) tuples, and has an entry for every blockparam of +every block. Note that we can compute this without actually observing +from-blocks; we only need to iterate over `block_preds` at any given +block. + +Then, `blockparam_outs` contains a vector of (from-vreg, from-block, +to-block, to-vreg), and has an entry for every parameter on every +branch that ends a block. There is exactly one "out" tuple for every +"in" tuple. As mentioned above, we will later scan over both to +generate moves. + +### Program-Move Vectors: Source-Side and Dest-Side + +Similar to blockparams, we handle moves specially. In fact, we ingest +all moves in the input program into a set of vectors -- "move sources" +and "move dests", analogous to the "ins" and "outs" blockparam vectors +described above -- and then completely ignore the moves in the program +thereafter. The semantics of the API are such that all program moves +will be recreated with regalloc-inserted edits, and should not still +be emitted after regalloc. This may seem inefficient, but in fact it +allows for better code because it integrates program-moves with the +move resolution that handles other forms of vreg movement. We +previously took the simpler approach of handling program-moves as +opaque instructions with a source and dest, and we found that there +were many redundant move-chains (A->B, B->C) that are eliminated when +everything is handled centrally. + +We also construct a `prog_move_merges` vector of live-range index pairs +to attempt to merge when we reach that stage of allocation. + +## Core Allocation State: Ranges, Uses, Bundles, VRegs, PRegs + +We now come to the core data structures: live-ranges, bundles, virtual +registers and their state, and physical registers and their state. + +First we must define a `ProgPoint` precisely: a `ProgPoint` is an +instruction index and a `Before` or `After` suffix. We pack the +before/after suffix into the LSB of a `u32`, so a `ProgPoint` can be +incremented and compared as a simple integer. + +A live-range is a contiguous range of program points (half-open, +i.e. including `from` and excluding `to`) for which a particular vreg +is live with a value. + +A live-range contains a vector of uses. 
Each use contains four parts:
+the Operand word (directly copied, so there is no need to dereference
+it); the ProgPoint at which the use occurs; the operand slot on that
+instruction, if any, that the operand comes from; and the use's
+"weight". (It's possible to have "ghost uses" that do not derive from
+any slot on the instruction.) These four parts are packed into three
+`u32`s: the slot can fit in 8 bits, and the weight in 16.
+
+The live-range carries its program-point range, uses, vreg index,
+bundle index (see below), and some metadata: spill weight and
+flags. The spill weight is the sum of weights of each use. The flag
+set currently carries only one flag: whether the live-range starts at
+a Def-kind operand. (This is equivalent to whether the range consumes
+a value at its start or not.)
+
+Uses are owned only by live-ranges and have no separate identity, but
+live-ranges live in a toplevel array and are known by `LiveRangeIndex`
+values throughout the allocator. New live-ranges can be created
+(e.g. during splitting); old ones are not cleaned up, but rather, all
+state is bulk-freed at the end.
+
+Live-ranges are aggregated into "bundles". A bundle is a collection of
+ranges that do not overlap. Each bundle carries: a vector (inline
+SmallVec) of (range, live-range index) tuples, an allocation (starts
+as "none"), a "spillset" (more below), and some metadata, including a
+spill weight (sum of ranges' weights), a priority (sum of ranges'
+lengths), and three property flags: "minimal", "contains fixed
+constraints", "contains stack constraints".
+
+VRegs also contain their vectors of live-ranges, in the same form as a
+bundle does (inline SmallVec that has inline (from, to) range bounds
+and range indices).
+
+There are two important overlap invariants: (i) no liveranges within a
+bundle overlap, and (ii) no liveranges within a vreg overlap. These
+are extremely important and we rely on them implicitly in many places.
+
+The live-range vectors in bundles and vregs, and use-vectors in ranges,
+have various sorting invariants as well. These invariants differ
+according to the phase of the allocator's computation. First, during
+live-range construction, live-ranges are placed into vregs in reverse
+order (because the computation is a reverse scan) and uses into ranges
+in reverse order; these are sorted into forward order at the end of
+live-range computation. When bundles are first constructed, their
+range vectors are sorted, and they remain so for the rest of allocation,
+as interference testing requires. However, as ranges are created
+and split, sortedness of vreg ranges is *not* maintained; they are
+sorted once more, in bulk, when allocation is done and we start to
+resolve moves.
+
+Finally, we have physical registers. The main data associated with
+each is the allocation map. This map is a standard BTree, indexed by
+ranges (`from` and `to` ProgPoints) and yielding a LiveRange for each
+location range. The ranges have a custom comparison operator defined
+that compares equal for any overlap.
+
+This comparison operator allows us to determine whether a range is
+free, i.e. has no overlap with a particular range, in one probe -- the
+btree will not contain a match. However, it makes iteration over *all*
+overlapping ranges somewhat tricky to get right. Notably, Rust's
+BTreeMap does not guarantee that the lookup result will be the *first*
+equal key, if multiple keys are equal to the probe key.
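+
+To make the overlap-as-equality trick concrete, here is a small
+self-contained sketch. The key type, its field names, and the `main`
+driver are illustrative stand-ins rather than the crate's internal
+definitions, but the comparison logic is the one described above; note
+that it is deliberately not a lawful total order, which is the source
+of the iteration subtlety discussed next.
+
+```rust
+use std::cmp::Ordering;
+use std::collections::BTreeMap;
+
+/// Key for a preg's allocation map: a half-open [from, to) range of
+/// program points (an instruction index with the before/after bit in
+/// the low bit, so a plain u32).
+#[derive(Clone, Copy)]
+struct RangeKey {
+    from: u32,
+    to: u32,
+}
+
+// "Equal" deliberately means "overlaps". This relies on the map never
+// holding two overlapping keys -- exactly the invariant the
+// allocation maps maintain.
+impl PartialEq for RangeKey {
+    fn eq(&self, other: &Self) -> bool {
+        self.from < other.to && other.from < self.to
+    }
+}
+impl Eq for RangeKey {}
+
+impl Ord for RangeKey {
+    fn cmp(&self, other: &Self) -> Ordering {
+        if self.to <= other.from {
+            Ordering::Less
+        } else if self.from >= other.to {
+            Ordering::Greater
+        } else {
+            Ordering::Equal // any overlap at all
+        }
+    }
+}
+impl PartialOrd for RangeKey {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+fn main() {
+    let mut map: BTreeMap<RangeKey, &str> = BTreeMap::new();
+    map.insert(RangeKey { from: 0, to: 10 }, "lr0");
+    map.insert(RangeKey { from: 20, to: 30 }, "lr1");
+    // A single probe answers "is this span free?".
+    assert!(map.contains_key(&RangeKey { from: 4, to: 6 })); // overlaps lr0
+    assert!(!map.contains_key(&RangeKey { from: 12, to: 15 })); // in the gap
+}
+```
+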
Thus, when we +want to enumerate all overlapping ranges, we probe with a range that +consists of the single program point *before* the start of the actual +query range, using the API that returns an iterator over a range in +the BTree, and then iterate through the resulting iterator to gather +all overlapping ranges (which will be contiguous). + +## Spill Bundles + +It is worth describing "spill bundles" separately. Every spillset (see +below; a group of bundles that originated from one bundle) optionally +points to a single bundle that we designate the "spill bundle" for +that spillset. Contrary to the name, this bundle is not +unconditionally spilled. Rather, one can see it as a sort of fallback: +it is where liveranges go when we give up on processing them via the +normal backtracking loop, and will only process them once more in the +"second-chance" stage. + +This fallback behavior implies that the spill bundle must always be +able to accept a spillslot allocation, i.e., it cannot require a +register. This invariant is what allows spill bundles to be processed +in a different way, after backtracking has completed. + +The spill bundle acquires liveranges in two ways. First, as we split +bundles, we will trim the split pieces in certain ways so that some +liveranges are immediately placed in the spill bundle. Intuitively, +the "empty" regions that just carry a value, but do not satisfy any +operands, should be in the spill bundle: it is better to have a single +consistent location for the value than to move it between lots of +different split pieces without using it, as moves carry a cost. + +Second, the spill bundle acquires the liveranges of a bundle that has +no requirement to be in a register when that bundle is processed, but +only if the spill bundle already exists. In other words, we won't +create a second-chance spill bundle just for a liverange with an "Any" +use; but if it was already forced into existence by splitting and +trimming, then we might as well use it. + +Note that unlike other bundles, a spill bundle's liverange vector +remains unsorted until we do the second-chance allocation. This allows +quick appends of more liveranges. + +## Allocation Queue + +The allocation queue is simply a priority queue (built with a binary +max-heap) of (prio, bundle-index) tuples. + +## Spillsets and Spillslots + +Every bundle contains a reference to a spillset. Spillsets are used to +assign spillslots near the end of allocation, but before then, they +are also a convenient place to store information that is common among +*all bundles* that share the spillset. In particular, spillsets are +initially assigned 1-to-1 to bundles after all bundle-merging is +complete; so spillsets represent in some sense the "original bundles", +and as splitting commences, the smaller bundle-pieces continue to +refer to their original spillsets. + +We stash some useful information on the spillset because of this: a +register hint, used to create some "stickiness" between pieces of an +original bundle that are assigned separately after splitting; the +spill bundle; the common register class of all vregs in this bundle; +the vregs whose liveranges are contained in this bundle; and then some +information actually used if this is spilled to the stack (`required` +indicates actual stack use; `size` is the spillslot count; `slot` is +the actual stack slot). + +Spill *sets* are later allocated to spill *slots*. 
Multiple spillsets
+can be assigned to one spillslot; the only constraint is that
+spillsets assigned to a spillslot must not overlap. When we look up
+the allocation for a bundle, if the bundle is not given a specific
+allocation (its `alloc` field is `Allocation::none()`), this means it
+is spilled, and we traverse to the spillset and then to its spillslot.
+
+## Other: Fixups, Stats, Debug Annotations
+
+There are a few fixup vectors that we will cover in more detail
+later. Of particular note is the "multi-fixed-reg fixup vector": this
+handles instructions that constrain the same input vreg to multiple,
+different, fixed registers for different operands at the same program
+point. The only way to satisfy such a set of constraints is to
+decouple all but one of the inputs (make them no longer refer to the
+vreg) and then later insert copies from the first fixed use of the
+vreg to the other fixed regs.
+
+The `Env` also carries a statistics structure with counters that are
+incremented, which can be useful for evaluating the effects of
+changes; and a "debug annotations" hashmap from program point to
+arbitrary strings that is filled out with various useful diagnostic
+information if enabled, so that an annotated view of the program with
+its liveranges, bundle assignments, inserted moves, merge and split
+decisions, etc. can be examined.
+
+# Allocation Pipeline
+
+We now describe the pipeline that computes register allocations.
+
+## Live-range Construction
+
+The first step in performing allocation is to analyze the input
+program to understand its dataflow: that is, the ranges during which
+virtual registers must be assigned to physical registers. Computing
+these ranges is what allows us to do better than a trivial "every vreg
+lives in a different location, always" allocation.
+
+We compute precise liveness first using an iterative dataflow
+algorithm with BitVecs. (See below for our sparse chunked BitVec
+description.) This produces the `liveins` and `liveouts` vectors of
+BitVecs per block.
+
+We then perform a single pass over blocks in reverse order, and scan
+instructions in each block in reverse order. Why reverse order? We
+must see instructions within a block in reverse to properly compute
+liveness (a value is live backward from a use to a def). Because we
+want to keep liveranges in-order as we build them, to enable
+coalescing, we visit blocks in reverse order as well, so overall this
+is simply a scan over the whole instruction index space in reverse
+order.
+
+For each block, we perform a scan with the following state:
+
+- A liveness bitvec, initialized at the start from `liveouts`.
+- A vector of live-range indices, with one entry per vreg, initially
+  "invalid" (this vector is allocated once and reused at each block).
+- In-progress vector of live-range indices per vreg in the vreg state,
+  in *reverse* order (we will reverse it when we're done).
+
+A vreg is live at the current point in the scan if its bit is set in
+the bitvec; its entry in the vreg-to-liverange vec may be stale, but
+if the bit is not set, we ignore it.
+
+We initially create a liverange for all vregs that are live out of the
+block, spanning the whole block. We will trim this below if it is
+locally def'd and does not pass through the block.
+
+For each instruction, we process its effects on the scan state:
+
+- For all clobbers (which logically happen at the end of the
+  instruction), add a single-program-point liverange to each clobbered
+  preg.
+ +- If not a move: + - for each program point [after, before], for each operand at + this point(\*): + - if a def or mod: + - if not currently live, this is a dead def; create an empty + LR. + - if a def: + - set the start of the LR for this vreg to this point. + - set as dead. + - if a use: + - create LR if not live, with start at beginning of block. + +- Else, if a move: + - simple case (no pinned vregs): + - add to `prog_move` data structures, and update LRs as above. + - effective point for the use is *after* the move, and for the mod + is *before* the *next* instruction. Why not more conventional + use-before, def-after? Because this allows the move to happen in + parallel with other moves that the move-resolution inserts + (between split fragments of a vreg); these moves always happen + at the gaps between instructions. We place it after, not before, + because before may land at a block-start and interfere with edge + moves, while after is always a "normal" gap (a move cannot end a + block). + - otherwise: see below (pinned vregs). + + +(\*) an instruction operand's effective point is adjusted in a few +cases. If the instruction is a branch, its uses (which are +blockparams) are extended to the "after" point. If there is a reused +input, all *other* inputs are extended to "after": this ensures proper +interference (as we explain more below). + +We then treat blockparams as defs at the end of the scan (beginning of +the block), and create the "ins" tuples. (The uses for the other side +of the edge are already handled as normal uses on a branch +instruction.) + +### Optimization: Pinned VRegs and Moves + +In order to efficiently handle the translation from the regalloc.rs +API, which uses named RealRegs that are distinct from VirtualRegs +rather than operand constraints, we need to implement a few +optimizations. The translation layer translates RealRegs as particular +vregs at the regalloc2 layer, because we need to track their liveness +properly. Handling these as "normal" vregs, with massive bundles of +many liveranges throughout the function, turns out to be a very +inefficient solution. So we mark them as "pinned" with a hook in the +RA2 API. Semantically, this means they are always assigned to a +particular preg whenever mentioned in an operand (but *NOT* between +those points; it is possible for a pinned vreg to move all about +registers and stackslots as long as it eventually makes it back to its +home preg in time for its next use). + +This has a few implications during liverange construction. First, when +we see an operand that mentions a pinned vreg, we translate this to an +operand constraint that names a fixed preg. Later, when we build +bundles, we will not create a bundle for the pinned vreg; instead we +will transfer its liveranges directly as unmoveable reservations in +pregs' allocation maps. Finally, we need to handle moves specially. + +With the caveat that "this is a massive hack and I am very very +sorry", here is how it works. A move between two pinned vregs is easy: +we add that to the inserted-moves vector right away because we know the +Allocation on both sides. A move from a pinned vreg to a normal vreg +is the first interesting case. In this case, we (i) create a ghost def +with a fixed-register policy on the normal vreg, doing the other +liverange-maintenance bits as above, and (ii) adjust the liveranges on +the pinned vreg (so the preg) in a particular way. 
If the preg is live
+flowing downward, then this move implies a copy, because the normal
+vreg and the pinned vreg are both used in the future and cannot
+overlap. But we cannot keep the preg continuously live, because at
+exactly one program point, the normal vreg is pinned to it. So we cut
+the downward-flowing liverange just *after* the normal vreg's
+fixed-reg ghost def. Then, whether it is live downward or not, we
+create an upward-flowing liverange on the pinned vreg that ends just
+*before* the ghost def.
+
+The move-from-normal-to-pinned case is similar. First, we create a
+ghost use on the normal vreg that pins its value at this program point
+to the fixed preg. Then, if the preg is live flowing downward, we trim
+its downward liverange to start just after the fixed use.
+
+There are also some tricky metadata-maintenance records that we emit
+so that the checker can keep this all straight.
+
+The outcome of this hack, together with the operand-constraint
+translation on normal uses/defs/mods on pinned vregs, is that we are
+essentially translating regalloc.rs's means of referring to real
+registers to regalloc2's preferred abstractions by doing a bit of
+reverse-engineering. It is not perfect, but it works. Still, we hope
+to rip it all out once we get rid of the need for the compatibility
+shim.
+
+### Handling Reused Inputs
+
+Reused inputs are also handled a bit specially. We have already
+described how we essentially translate the idiom so that the output's
+allocation is used for input and output, and there is a move just
+before the instruction that copies the actual input (which will not be
+clobbered) to the output. Together with an attempt to merge the
+bundles for the two, to elide the move if possible, this works
+perfectly well as long as we ignore all of the other inputs.
+
+But we can't do that: we have to ensure that other inputs' allocations
+are correct too. Note that using the output's allocation as the input
+is actually potentially incorrect if the output is at the After point
+and the input is at the Before: the output might share a register with
+one of the *other* (normal, non-reused) inputs if that input's vreg
+were dead afterward. This would mean that we clobber the other input.
+
+So, to get the interference right, we *extend* all other (non-reused)
+inputs of an instruction with a reused input to the After point. This
+ensures that the other inputs are *not* clobbered by the slightly
+premature use of the output register.
+
+The source has a link to a comment in IonMonkey that implies that it
+uses a similar solution to this problem, though it's not entirely
+clear.
+
+(This odd dance, like many of the others above and below, is "written
+in fuzzbug failures", so to speak. It's not entirely obvious until one
+sees the corner case where it's necessary!)
+
+## Bundle Merging
+
+Once we have built the liverange vectors for every vreg, we can reverse
+these vectors (recall, they were built in strict reverse order) and
+initially assign one bundle per (non-pinned) vreg. We then try to
+merge bundles together as long as we find pairs of bundles that do not
+overlap and that (heuristically) make sense to merge.
+
+Note that this is the only point in the allocation pipeline where
+bundles get larger.
We initially merge as large as we dare (but not +too large, because then we'll just cause lots of conflicts and +splitting later), and then try out assignments, backtrack via +eviction, and split continuously to chip away at the problem until we +have a working set of allocation assignments. + +We attempt to merge three kinds of bundle pairs: reused-input to +corresponding output; across program moves; and across blockparam +assignments. + +To merge two bundles, we traverse over both their sorted liverange +vectors at once, checking for overlaps. Note that we can do this without +pointer-chasing to the liverange data; the (from, to) range is in the +liverange vector itself. + +We also check whether the merged bundle would have conflicting +requirements (see below for more on requirements). We do a coarse +check first, checking 1-bit flags that indicate whether either bundle +has any fixed-reg constraints or stack-only constraints. If so, we +need to do a detailed check by actually computing merged requirements +on both sides, merging, and checking for Conflict (the lattice bottom +value). If no conflict, we merge. + +A performance note: merging is extremely performance-sensitive, and it +turns out that a mergesort-like merge of the liverange vectors is too +expensive, partly because it requires allocating a separate result +vector (in-place merge in mergesort is infamously complex). Instead, +we simply append one vector onto the end of the other and invoke +Rust's builtin sort. We could special-case "one bundle is completely +before the other", but we currently don't do that (performance idea!). + +Once all bundles are merged as far as they will go, we compute cached +bundle properties (priorities and weights) and enqueue them on the +priority queue for allocation. + +## Recurring: Bundle Property Computation + +The core allocation loop is a recurring iteration of the following: we +take the highest-priority bundle from the allocation queue; we compute +its requirements; we try to find it a register according to those +requirements; if no fit, we either evict some other bundle(s) from +their allocations and try again, or we split the bundle and put the +parts back on the queue. We record all the information we need to make +the evict-or-split decision (and where to split) *during* the physical +register allocation-map scans, so we don't need to go back again to +compute that. + +Termination is nontrivial to see, because of eviction. How do we +guarantee we don't get into an infinite loop where two bundles fight +over a register forever? In fact, this can easily happen if there is a +bug; we fixed many fuzzbugs like this, and we have a check for +"infinite loop" based on an upper bound on iterations. But if the +allocator is correct, it should never happen. + +Termination is guaranteed because (i) bundles always get smaller, (ii) +eviction only occurs when a bundle is *strictly* higher weight (not +higher-or-equal), and (iii) once a bundle gets down to its "minimal" +size, it has an extremely high weight that is guaranteed to evict any +non-minimal bundle. A minimal bundle is one that covers only one +instruction. As long as the input program does not have impossible +constraints that require more than one vreg to exist in one preg, an +allocation problem of all minimal bundles will always have a solution. + +## Bundle Processing + +Let's now talk about what happens when we take a bundle off the +allocation queue. 
The three basic outcomes are: allocate; split and
+requeue; or evict and try again immediately (and eventually allocate
+or split/requeue).
+
+### Properties: Weight, Priority, and Requirements
+
+To process a bundle, we have to compute a few properties. In fact we
+will have already computed a few of these beforehand, but we describe
+them all here.
+
+- Priority: a bundle's priority determines the order in which it is
+  considered for allocation. RA2 defines it as the sum of the lengths
+  (in instruction index space) of each liverange. This causes the
+  allocator to consider larger bundles first, when the allocation maps
+  are generally more free; they can always be evicted and split later.
+
+- Weight: a bundle's weight indicates how important (in terms of
+  runtime) its uses/register mentions are. In an approximate sense,
+  inner loop bodies create higher-weight uses. Fixed register
+  constraints add some weight, and defs add some weight. Finally,
+  weight is divided by priority, so a very large bundle that happens
+  to have a few important uses does not uniformly exert its weight
+  across its entire range. This has the effect of causing bundles to
+  be more important (more likely to evict others) the more they are
+  split.
+
+- Requirement: a bundle's requirement is a value in a lattice that we
+  have defined, where top is "Unknown" and bottom is
+  "Conflict". Between these two, we have: any register (of a class);
+  any stackslot (of a class); a particular register. "Any register"
+  can degrade to "a particular register", but any other pair of
+  different requirements meets to Conflict. Requirements are derived
+  from the operand constraints for all uses in all liveranges in a
+  bundle, and then merged with the lattice meet-function.
+
+The lattice is as follows (diagram simplified to remove multiple
+classes and multiple fixed registers which parameterize nodes; any two
+differently-parameterized values are unordered with respect to each
+other):
+
+```plain
+
+        ___Unknown_____
+       |       |       |
+       |       |       |
+       |   ____Any(rc) |
+       |  /    |       |
+    Stack(rc)      FixedReg(reg)
+         \          /
+          Conflict
+```
+
+Once we have the Requirement for a bundle, we can decide what to do.
+
+### No-Register-Required Cases
+
+If the requirement indicates that no register is needed (`Unknown` or
+`Any`, i.e. a register or stack slot would be OK), *and* if the spill
+bundle already exists for this bundle's spillset, then we move all the
+liveranges over to the spill bundle, as described above.
+
+If the requirement indicates that the stack is needed explicitly
+(e.g., for a safepoint), we set our spillset as "required" (this will
+cause it to allocate a spillslot) and return; because the bundle has
+no other allocation set, it will look to the spillset's spillslot by
+default.
+
+If the requirement indicates a conflict, we immediately split and
+requeue the split pieces. This split is performed at the point at
+which the conflict is first introduced, i.e. just before the first use
+whose requirement, when merged into the requirement for all prior uses
+combined, goes to `Conflict`. In this way, we always guarantee forward
+progress. Note also that a bundle can reach this stage with a
+conflicting requirement only if the original liverange had conflicting
+uses (e.g., a liverange from a def in a register to a use on stack, or
+a liverange between two different fixed-reg-constrained operands); our
+bundle merging logic explicitly avoids merging two bundles if it would
+create a conflict.
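+
+The lattice and its meet operation are small enough to sketch
+directly. The enum and `merge` function below follow the diagram above
+(a single register-class parameter, fixed registers distinguished only
+by equality); the names are illustrative rather than the crate's
+literal internal types.
+
+```rust
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+enum RegClass {
+    Int,
+    Float,
+}
+
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+struct PReg {
+    class: RegClass,
+    index: u8,
+}
+
+/// Requirement lattice: Unknown on top, Conflict on the bottom.
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+enum Requirement {
+    Unknown,
+    Any(RegClass),
+    Stack(RegClass),
+    FixedReg(PReg),
+    Conflict,
+}
+
+/// Meet ("merge") of two requirements, per the diagram: Any(rc) may
+/// degrade to Stack(rc) or to a fixed register of the same class; any
+/// other pair of distinct values meets at Conflict.
+fn merge(a: Requirement, b: Requirement) -> Requirement {
+    use Requirement::*;
+    match (a, b) {
+        (Unknown, x) | (x, Unknown) => x,
+        (Conflict, _) | (_, Conflict) => Conflict,
+        (Any(c1), Any(c2)) if c1 == c2 => Any(c1),
+        (Any(c), Stack(c2)) | (Stack(c2), Any(c)) if c == c2 => Stack(c2),
+        (Any(c), FixedReg(p)) | (FixedReg(p), Any(c)) if c == p.class => FixedReg(p),
+        (Stack(c1), Stack(c2)) if c1 == c2 => Stack(c1),
+        (FixedReg(p1), FixedReg(p2)) if p1 == p2 => FixedReg(p1),
+        // Mismatched classes, stack vs. fixed reg, or two different
+        // fixed regs: nothing satisfies both, so we get Conflict.
+        _ => Conflict,
+    }
+}
+```
+
+Folding this merge over the operand constraints of every use in a
+bundle is what yields the per-bundle requirement used above, and a
+result of `Conflict` is what triggers the split-at-first-conflict
+behavior just described.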
+
+### Allocation-Map Probing
+
+If we did not immediately dispose of the bundle as described above,
+then we *can* use a register (either `Any`, which accepts a register
+as one of several options, or `Reg`, which must have one, or `Fixed`,
+which must have a particular one).
+
+We determine which physical registers' allocation maps we will
+probe, and in what order. If a particular fixed register is required,
+we probe only that register. Otherwise, we probe all registers in the
+required class.
+
+The order in which we probe, if we are not constrained to a single
+register, is carefully chosen. First, if there is a hint register from
+the spillset (this is set by the last allocation into a register of
+any other bundle in this spillset), we probe that. Then, we probe all
+preferred registers; then all non-preferred registers.
+
+For each of the preferred and non-preferred register sequences, we
+probe in an *offset* manner: we start at some index partway through
+the sequence, determined by some heuristic number that is random and
+well-distributed. (In practice, we use the sum of the bundle index and
+the instruction index of the start of the first range in the bundle.)
+We then march through the sequence and wrap around, stopping before we
+hit our starting point again.
+
+The purpose of this offset is to distribute the contention and speed
+up the allocation process. In the common case where there are enough
+registers to hold values without spilling (for small functions), we
+are more likely to choose a free register right away if we throw the
+dart at random than if we start *every* probe at register 0, in
+order. This has a large allocation performance impact in practice.
+
+For each register in probe order, we probe the allocation map, and
+gather, simultaneously, several results: (i) whether the entire range
+is free; (ii) if not, the vector of all conflicting bundles, *and* the
+highest weight among those bundles; (iii) if not, the *first* conflict
+point.
+
+We do this by iterating over all liveranges in the preg's btree that
+overlap with each range in the current bundle. This iteration is
+somewhat subtle due to multiple "equal" keys (see above where we
+describe the use of the btree). It is also adaptive for performance
+reasons: it initially obtains an iterator into the btree corresponding
+to the start of the first range in the bundle, and concurrently
+iterates through both the btree and the bundle. However, if there is a
+large gap in the bundle, this might require skipping many irrelevant
+entries in the btree. So, if we skip too many entries (heuristically,
+16, right now), we do another lookup from scratch in the btree for the
+start of the next range in the bundle. This balances between the two
+cases: dense bundle, where O(1) iteration through the btree is faster,
+and sparse bundle, where O(log n) lookup for each entry is better.
+
+### Decision: Allocate, Evict, or Split
+
+First, the "allocate" case is easy: if, during our register probe
+loop, we find a physical register whose allocations do not overlap
+this bundle, then we allocate this register; done!
+
+If not, then we need to decide whether to evict some conflicting
+bundles and retry, or to split the current bundle into smaller pieces
+that may have better luck.
+
+A bit about our split strategy first: contrary to the IonMonkey
+allocator which inspired much of our design, we do *not* have a list
+of split strategies that split one bundle into many pieces at
+once. 
Instead, each iteration of the allocation loop splits at most +*once*. This simplifies the splitting code greatly, but also turns out +to be a nice heuristic: we split at the point that the bundle first +encounters a conflict for a particular preg assignment, then we hint +that preg for the first (pre-conflict) piece when we retry. In this +way, we always make forward progress -- one piece of the bundle is +always allocated -- and splits are informed by the actual situation at +hand, rather than best guesses. Also note that while this may appear +at first to be a greedy algorithm, it still allows backtracking: the +first half of the split bundle, which we *can* now assign to a preg, +does not necessarily remain on that preg forever (it can still be +evicted later). It is just a split that is known to make at least one +part of the allocation problem solvable. + +To determine whether to split or evict, we track our best options: as +we probe, we track the "lowest cost eviction option", which is a set +of bundles and the maximum weight in that set of bundles. We also +track the "lowest cost split option", which is the cost (more below), +the point at which to split, and the register for this option. + +For each register we probe, if there is a conflict but none of the +conflicts are fixed allocations, we receive a vector of bundles that +conflicted, and also separately, the first conflicting program +point. We update the lowest-cost eviction option if the cost (max +weight) of the conflicting bundles is less than the current best. We +update the lowest-cost split option if the cost is less as well, +according to the following definition of cost: a split's cost is the +cost of its move, as defined by the weight of a normal def operand at +the split program point, plus the cost of all bundles beyond the split +point (which will still be conflicts even after the split). + +If there is a conflict with a fixed allocation, then eviction is not +an option, but we can still compute the candidate split point and cost +in the same way as above. + +Finally, as an optimization, we pass in the current best cost to the +btree probe inner loop; if, while probing, we have already exceeded +the best cost, we stop early (this improves allocation time without +affecting the result). + +Once we have the best cost for split and evict options, we split if +(i) the bundle is not already a minimal bundle, and (ii) we've already +evicted once in this toplevel iteration without success, or the weight +of the current bundle is less than the eviction cost. We then requeue +*both* resulting halves of the bundle with the preg that resulted in +this option as the register hint. Otherwise, we evict all conflicting +bundles and try again. + +Note that the split cost does not actually play into the above (split +vs. evict) decision; it is only used to choose *which* split is +best. This is equivalent to saying: we never evict if the current +bundle is less important than the evicted bundles, even if the split +is more expensive still. This is important for forward progress, and +the case where the split would be even more expensive should be very +very rare (it would have to come from a costly move in the middle of +an inner loop). + +### How to Split + +The actual split procedure is fairly simple. We are given a bundle and +a split-point. We create a new bundle to take on the second half +("rest") of the original. We find the point in the liverange vector +that corresponds to the split, and distribute appropriately. 
If the +split-point lands in the middle of a liverange, then we split that +liverange as well. + +In the case that a new liverange is created, we add the liverange to +the corresponding vreg liverange vector as well. Note that, as described +above, the vreg's liverange vector is unsorted while splitting is +occurring (because we do not need to traverse it or do any lookups +during this phase); so we just append. + +The splitting code also supports a "minimal split", in which it simply +peels off the first use. This is used to ensure forward progress when +a bundle has conflicting requirements within it (see above). + +#### Spill Bundle and Splitting + +Once a split occurs, however, it turns out that we can improve results +by doing a little cleanup. Once we distribute a bundle's liveranges +across two half-bundles, we postprocess by trimming a bit. + +In particular, if we see that the "loose ends" around the split point +extend beyond uses, we will create and move ranges to a spill +bundle. That is: if the last liverange in the first-half bundle +extends beyond its last use, we trim that part off into an empty (no +uses) liverange and place that liverange in the spill +bundle. Likewise, if the first liverange in the second-half bundle +starts before its first use, we trim that part off into an empty +liverange and place it in the spill bundle. + +This is, empirically, an improvement: it reduces register contention +and makes splitting more effective. The intuition is twofold: (i) it +is better to put all of the "flow-through" parts of a vreg's liveness +into one bundle that is never split, and can be spilled to the stack +if needed, to avoid unnecessary moves; and (ii) if contention is high +enough to cause splitting, it is more likely there will be an actual +stack spill, and if this is the case, it is better to do the store +just after the last use and reload just before the first use of the +respective bundles. + +Unfortunately, this heuristic choice does interact somewhat poorly +with program moves: moves between two normal (non-pinned) vregs do not +create ghost uses or defs, and so these points of the ranges can be +spilled, turning a normal register move into a move from or to the +stack. However, empirically, we have found that adding such ghost +uses/defs actually regresses some cases as well, because it pulls +values back into registers when we could have had a stack-to-stack +move (that might even be a no-op if the same spillset); overall, it +seems better to trim. It also improves allocation performance by +reducing contention in the registers during the core loop (before +second-chance allocation). + +## Second-Chance Allocation: Spilled Bundles + +Once the main allocation loop terminates, when all bundles have either +been allocated or punted to the "spilled bundles" vector, we do +second-chance allocation. This is a simpler loop that never evicts and +never splits. Instead, each bundle gets one second chance, in which it +can probe pregs and attempt to allocate. If it fails, it will actually +live on the stack. + +This is correct because we are careful to only place bundles on the +spilled-bundles vector that are *allowed* to live on the +stack. Specifically, only the canonical spill bundles (which will +contain only empty ranges) and other bundles that have an "any" or +"unknown" requirement are placed here (but *not* "stack" requirements; +those *must* be on the stack, so do not undergo second-chance +allocation). 
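+
+The loop itself is deliberately simple. The following is a toy sketch
+(with hypothetical, simplified inputs; the real data structures carry
+much more state): each spilled bundle probes the candidate registers
+once, takes the first conflict-free one, and otherwise falls back to
+its spillset's slot.
+
+```rust
+/// One entry per spilled bundle: for each candidate preg, `true`
+/// means probing found a conflict with an existing assignment.
+fn second_chance(conflicts_per_bundle: &[Vec<bool>]) -> Vec<Option<usize>> {
+    conflicts_per_bundle
+        .iter()
+        // `position` returns the first preg index with no conflict, or
+        // `None`, in which case the bundle lives on the stack and its
+        // spillset is marked as requiring a slot.
+        .map(|pregs| pregs.iter().position(|&conflict| !conflict))
+        .collect()
+}
+
+fn main() {
+    // Bundle 0 fits in preg 1; bundle 1 fits nowhere and spills.
+    let result = second_chance(&[vec![true, false], vec![true, true]]);
+    assert_eq!(result, vec![Some(1), None]);
+}
+```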
+ +At the end of this process, we have marked spillsets as required +whenever at least one bundle in the spillset actually requires a stack +slot. We can then allocate slots to the spillsets. + +## Spillslot Allocation + +We must allocate space on the stack, denoted by an abstract index +space, to each spillset that requires it, and for the liveranges in +which it requires it. + +To facilitate this, we keep a btree per spillslot in the same way we +do per preg. We will allocate spillsets to slots in a way that avoids +interference. + +Note that we actually overapproximate the required ranges for each +spillset in order to improve the behavior of a later phase (redundant +move elimination). Specifically, when we allocate a slot for a +spillset, we reserve that slot for *all* of the liveranges of *every* +vreg that is assigned to that spillset (due to merging rules that +initially merge one-vreg bundles into final merged bundles, there will +be no overlaps here). In other words, we rule out interleaving of +completely different values in the same slot, though bundle merging +does mean that potentially many (non-interfering) vregs may share +it. This provides the important property that if a vreg has been +reloaded, but not modified, its spillslot *still contains the +up-to-date value* (because the slot is reserved for all liveranges of +the vreg). This enables us to avoid another store to the spillslot +later if there is another spilled range. + +We perform probing in a way that is somewhat different than for +registers, because the spillslot space is conceptually infinite. We +can thus optimize for slightly better allocation performance by giving +up and allocating a new slot at any time. + +For each size class, we keep a linked list of slots. When we need to +allocate a spillset to a slot, we traverse down the list and try a +fixed number of slots. If we find one that fits the spillset's ranges, +we allocate, and we remove the slot from its current place in the list +and append to the end. In this way, it is deprioritized from probing +"for a while", which tends to reduce contention. This is a simple way +to round-robin between slots. If we don't find one that fits after a +fixed number of probes, we allocate a new slot. + +And with that, we have valid allocations for all vregs for all points +that they are live! Now we just need to modify the program to reify +these choices. + +## Allocation Assignment + +The first step in reifying the allocation is to iterate through all +mentions of a vreg and fill in the resulting `Allocation` array with +the appropriate allocations. We do this by simply traversing +liveranges per vreg, looking up the allocation by observing the bundle +(and spillset if no specific allocation for the bundle), and for each +use, filling in the slot according to the saved progpoint/slot info in +the use data. + +## Move Generation + +The more difficult half of the reification step is generating the +*moves* that will put the values in the right spots. + +There are two sources of moves that we must generate. The first are +moves between different ranges of the same vreg, as the split pieces +of that vreg's original bundle may have been assigned to different +locations. The second are moves that result from move semantics in the +input program: either assignments from blockparam args on branches to +the target block's params, or program move instructions. 
(Recall that +we reify program moves in a unified way with all other moves, so the +client should not generate any machine code for their original moves +in the pre-allocation program.) + +Moves are tricky to handle efficiently because they join two +potentially very different locations in the program (in the case of +control-flow-edge moves). In order to avoid the need for random +lookups, which are a cache-locality nightmare even if we have O(log n) +lookups, we instead take a scan-sort-scan approach. + +First, we scan over each vreg's liveranges, find the allocation for +each, and for each move that comes *to* or *from* this liverange, +generate a "half-move". The key idea is that we generate a record for +each "side" of the move, and these records are keyed in a way that +after a sort, the "from" and "to" ends will be consecutive. We can +sort the vector of halfmoves once (this is expensive, but not as +expensive as many separate pointer-chasing lookups), then scan it +again to actually generate the move instructions. + +To enable the sort to work, half-moves are sorted by a key that is +equivalent to the tuple (from-block, to-block, to-vreg, kind), where +`kind` is "source" or "dest". For each key, the payload is an +allocation. The fields in this tuple are carefully chosen: we know all +of them at every location we generate a halfmove, without expensive +lookups, and sorting by this key will make the source and all dests +(there can be more than one) contiguous in the final order. + +Half-moves are generated for several situations. First, at the start +of every block covered by a liverange, we can generate "dest" +half-moves for blockparams, and at the end of every block covered by a +liverange, we can generate "source" half-moves for blockparam args on +branches. Incidentally, this is the reason that `blockparam_ins` and +`blockparam_outs` are sorted tuple-vectors whose tuples begin with +(vreg, block, ...): this is the order in which we do the toplevel scan +over allocations. + +Second, at every block edge, if the vreg is live in any pred (at +block-start) or succ (at block-end), we generate a half-move to +transfer the vreg to its own location in the connected block. + +This completes the "edge-moves". We sort the half-move array and then +have all of the alloc-to-alloc pairs on a given (from-block, to-block) +edge. + +There are also two kinds of moves that happen within blocks. First, +when a live-range ends and another begins for the same vreg in the +same block (i.e., a split in the middle of a block), we know both +sides of the move immediately (because it is the same vreg and we can +look up the adjacent allocation easily), and we can generate that +move. + +Second, program moves occur within blocks. Here we need to do a +similar thing as for block-edge half-moves, but keyed on program point +instead. This is why the `prog_move_srcs` and `prog_move_dsts` arrays +are initially sorted by their (vreg, inst) keys: we can directly fill +in their allocation slots during our main scan. Note that when sorted +this way, the source and dest for a given move instruction will be at +different indices. After the main scan, we *re-sort* the arrays by +just the instruction, so the two sides of a move line up at the same +index; we can then traverse both arrays, zipped together, and generate +moves. + +Finally, we generate moves to fix up multi-fixed-reg-constraint +situations, and make reused inputs work, as described earlier. 
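+
+To make the half-move technique described above concrete, here is a
+self-contained sketch. The field widths and names are invented for the
+example (the real key layout differs), but the mechanism is the same:
+pack the (from-block, to-block, to-vreg, kind) tuple into a single
+integer, sort once, and then each "source" record is immediately
+followed by all of its "dest" records.
+
+```rust
+#[derive(Clone, Copy, Debug)]
+struct HalfMove {
+    key: u64,
+    alloc: u32, // stand-in for an `Allocation`
+}
+
+const SOURCE: u64 = 0;
+const DEST: u64 = 1;
+
+// Illustrative packing: 21 bits each for from-block, to-block and
+// to-vreg, and one low bit for the kind, so sources sort first.
+fn key(from_block: u64, to_block: u64, to_vreg: u64, kind: u64) -> u64 {
+    (from_block << 43) | (to_block << 22) | (to_vreg << 1) | kind
+}
+
+/// Turn half-moves into (src-alloc, dst-alloc) pairs with one sort
+/// and one scan. Assumes every dest has a matching source record.
+fn resolve(mut halfmoves: Vec<HalfMove>) -> Vec<(u32, u32)> {
+    halfmoves.sort_by_key(|hm| hm.key);
+    let mut out = vec![];
+    let mut i = 0;
+    while i < halfmoves.len() {
+        let src = halfmoves[i];
+        let mut j = i + 1;
+        // All records with the same (from, to, vreg) but kind = DEST
+        // follow the source directly after sorting.
+        while j < halfmoves.len() && (halfmoves[j].key & !1) == (src.key & !1) {
+            out.push((src.alloc, halfmoves[j].alloc));
+            j += 1;
+        }
+        i = j;
+    }
+    out
+}
+
+fn main() {
+    // One edge (block 1 -> block 2) carrying vreg 7 from alloc 10 to
+    // allocs 20 and 21 (two dests), emitted in arbitrary order.
+    let hm = vec![
+        HalfMove { key: key(1, 2, 7, DEST), alloc: 20 },
+        HalfMove { key: key(1, 2, 7, SOURCE), alloc: 10 },
+        HalfMove { key: key(1, 2, 7, DEST), alloc: 21 },
+    ];
+    assert_eq!(resolve(hm), vec![(10, 20), (10, 21)]);
+}
+```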
+ +## Move Resolution + +During this whole discussion, we have described "generating moves", +but we have not said what that meant. Note that in many cases, there +are several moves at a particular program point that semantically +happen *in parallel*. For example, if multiple vregs change +allocations between two instructions, all of those moves happen as +part of one parallel permutation. Similarly, blockparams have +parallel-assignment semantics. We thus enqueue all the moves that we +generate at program points and resolve them into sequences of +sequential moves that can actually be lowered to move instructions in +the machine code. + +First, a word on *move priorities*. There are different kinds of moves +that are generated between instructions, and we have to ensure that +some happen before others, i.e., *not* in parallel. For example, a +vreg might change allocation (due to a split) before an instruction, +then be copied to an output register for an output with a reused-input +policy. The latter move must happen *after* the vreg has been moved +into its location for this instruction. + +To enable this, we define "move priorities", which are a logical +extension of program points (i.e., they are sub-points) that enable +finer-grained ordering of moves. We currently have the following +priorities: + +- In-edge moves, to place edge-moves before the first instruction in a + block. +- Block-param metadata, used for the checker only. +- Regular, used for vreg movement between allocations. +- Post-regular, used for checker metadata related to pinned-vreg moves. +- Multi-fixed-reg, used for moves that handle the + single-vreg-in-multiple-fixed-pregs constraint case. +- Reused-input, used for implementing outputs with reused-input policies. +- Out-edge moves, to place edge-moves after the last instruction + (prior to the branch) in a block. + +Every move is statically given one of these priorities by the code +that generates it. + +We collect moves with (prog-point, prio) keys, and we sort by those +keys. We then have, for each such key, a set of moves that +semantically happen in parallel. + +We then resolve those moves using a parallel-move resolver, as we now +describe. + +### Parallel-Move Resolver + +The fundamental issue that arises when resolving parallel moves to +sequential moves is *overlap*: some of the moves may overwrite +registers that other moves use as sources. We must carefully order +moves so that this does not clobber values incorrectly. + +We first check if such overlap occurs. If it does not (this is +actually the most common case), the sequence of parallel moves can be +emitted as sequential moves directly. Done! + +Otherwise, we have to order the moves carefully. Furthermore, if there +is a *cycle* anywhere among the moves, we will need a scratch +register. (Consider, e.g., t0 := t1 and t1 := t0 in parallel: with +only move instructions and no direct "exchange" instruction, we cannot +reify this without a third register.) + +We first compute a mapping from each move instruction to the move +instruction, if any, that it must precede. Note that there can be only +one such move for a given move, because each destination can be +written only once; so a move might be constrained only before the one +move that overwrites its source. (This will be important in a bit!) + +Our task is now to find an ordering of moves that respects these +dependencies. 
To do so, we perform a depth-first search on the graph +induced by the dependencies, which will generate a sequence of +sequential moves in reverse order. We keep a stack of moves; we start +with any move that has not been visited yet; in each iteration, if the +top-of-stack has no out-edge to another move (does not need to come +before any others), then push it to a result vector, followed by all +others on the stack (in popped order). If it does have an out-edge and +the target is already visited and not on the stack anymore (so already +emitted), likewise, emit this move and the rest on the stack. If it +has an out-edge to a move not yet visited, push on the stack and +continue. Otherwise, if out-edge to a move currently on the stack, we +have found a cycle. In this case, we emit the moves on the stack with +a modification: the first move writes to a scratch register, and we +emit an additional move that moves from the scratch to the first +move's dest. This breaks the cycle. + +The astute reader may notice that this sounds like a canonical +application of Tarjan's algorithm for finding SCCs (strongly-connected +components). Why don't we have the full complexity of that algorithm? +In particular, *why* can we emit the cycle *right away* once we find +it, rather than ensuring that we have gotten all of the SCC first? + +The answer is that because there is only *one* out-edge at most (a +move can only require preceding *one* other move), all SCCs must be +simple cycles. This means that once we have found a cycle, no other +nodes (moves) can be part of the SCC, because every node's single +out-edge is already accounted for. This is what allows us to avoid a +fully general SCC algorithm. + +Once the vector of moves in-reverse has been constructed, we reverse +it and return. + +Note that this "move resolver" is fuzzed separately with a simple +symbolic move simulator (the `moves` fuzz-target). + +### Stack-to-Stack Moves + +There is one potentially difficult situation that could arise from the +move-resolution logic so far: if a vreg moves from one spillslot to +another, this implies a memory-to-memory move, which most machine +architectures cannot handle natively. It would be much nicer if we +could ensure within the regalloc that this never occurs. + +This is in fact possible to do in a postprocessing step. We iterate +through the sequential moves, tracking whether the scratch register is +in use (has been written). When we see a stack-to-stack move: (i) if +the scratch register is not in use, generate a stack-to-scratch move +and scratch-to-stack move; otherwise, (ii) if the scratch register is +in use, allocate an "extra spillslot" if one has not already been +allocated, move the scratch reg to that, do the above stack-to-scratch +/ scratch-to-stack sequence, then reload the scratch reg from the +extra spillslot. + +## Redundant-Spill/Load Elimination + +As a final step before returning the vector of program edits to the +client, we perform one optimization: redundant-spill/load elimination. + +To understand the need for this, consider what will occur when a vreg +is (i) defined once, (ii) used many times, and (iii) spilled multiple +times between some of the uses: with the design described above, we +will move the value from the preg to the stack after every segment of +uses, and then reload it when the next use occurs. 
However, only the +first spill is actually needed; as we noted above, we allocate +spillslots so that the slot that corresponded to the vreg at the first +spill will always be reserved for that vreg as long as it is live. If +no other defs or mods occur, the value in the slot can be reloaded, +and need not be written back every time. + +This inefficiency is a result of our invariant that a vreg lives in +exactly one place at a time, and these locations are joined by +moves. This is a simple and effective design to use for most of the +allocation pipeline, but falls flat here. It is especially inefficient +when the unnecessary spill occurs in an inner loop. (E.g.: value +defined at top of function is spilled, then used once in the middle of +an inner loop body.) + +The opposite case can also sometimes occur, though it is rarer: a +value is loaded into a register, spilled, and then reloaded into the +same register. This can happen when hinting is successful at getting +several segments of a vreg to use the same preg, but splitting has +trimmed part of the liverange between uses and put it in the spill +bundle, and the spill bundle did not get a reg. + +In order to resolve this inefficiency, we implement a general +redundant-spill/load elimination pass (an even more general solution +would be a full redundant-move elimination pass, but we focus on moves +that are spills/loads to contain the complexity for now). This pass +tracks, for every allocation (reg or spillslot), whether it is a copy +of another allocation. This state is invalidated whenever either that +allocation or the allocation of which it is a copy is +overwritten. When we see a move instruction, if the destination is +already a copy of the source, we elide the move. (There are some +additional complexities to preserve checker metadata which we do not +describe here.) + +Note that this could, in principle, be done as a fixpoint analysis +over the CFG; it must be, if we try to preserve state across +blocks. This is because a location is only a copy of another if that +is true on every incoming edge. However, to avoid the cost and +complexity of doing such an analysis, we instead take the much simpler +approach of doing only an intra-block analysis. This turns out to be +sufficient to remove most redundant moves, especially in the common +case of a single use of an otherwise-spilled value. + +Note that we could do better *if* we accepted only SSA code, because +we would know that a value could not be redefined once written. We +should consider this again once we clean up and remove the non-SSA +support. + +# Future Plans + +## SSA-Only Cleanup + +When the major user (Cranelift via the regalloc.rs shim) migrates to +generate SSA code and native regalloc2 operands, there are many bits +of complexity we can remove, as noted throughout this +writeup. Briefly, we could (i) remove special handling of program +moves, (ii) remove the pinned-vreg hack, (iii) simplify redundant-move +elimination, (iv) remove special handling of "mod" operands, and (v) +probably simplify plenty of code given the invariant that a def always +starts a range. + +More importantly, we expect this change to result in potentially much +better allocation performance. 
The use of special pinned vregs and
+moves to/from them instead of fixed-reg constraints, explicit moves
+for every reused-input constraint, and already-sequentialized series
+of move instructions on edges for phi nodes, are all expensive ways of
+encoding regalloc2's native input primitives that have to be
+reverse-engineered. Removing that translation layer would be
+ideal. Also, allowing regalloc2 to handle phi-node (blockparam)
+lowering in a way that is integrated with other moves will likely
+generate better code than the way that program-move handling interacts
+with Cranelift's manually lowered phi-moves at the moment.
+
+## Better Split Heuristics
+
+We have spent quite some effort trying to improve splitting behavior,
+and it is now generally decent, but more work could be done here,
+especially with regard to the interaction between splits and the loop
+nest.
+
+## Native Debuginfo Output
+
+Cranelift currently computes value locations (in registers and
+stack-slots) for detailed debuginfo with an expensive post-pass, after
+regalloc is complete. This is because the existing register allocator
+does not support returning this information directly. However,
+providing such information by generating it while we scan over
+liveranges in each vreg would be relatively simple, and has the
+potential to be much faster and more reliable for Cranelift. We should
+investigate adding an interface for this to regalloc2 and using it.
+
+# Appendix: Comparison to IonMonkey Allocator
+
+There are a number of differences between the [IonMonkey
+allocator](https://searchfox.org/mozilla-central/source/js/src/jit/BacktrackingAllocator.cpp)
+and this one. While this allocator initially began as an attempt to
+clone IonMonkey's, it has drifted significantly as we optimized the
+design (especially after we built the regalloc.rs shim and had to
+adapt to its code style); it is easier at this point to name the
+similarities than the differences.
+
+* The core abstractions of "liverange", "bundle", "vreg", "preg", and
+  "operand" (with policies/constraints) are the same.
+
+* The overall allocator pipeline is the same, and the top-level
+  structure of each stage should look similar. Both allocators begin
+  by computing liveranges, then merging bundles, then handling bundles
+  and splitting/evicting as necessary, then doing second-chance
+  allocation, then reifying the decisions.
+
+* The cost functions are very similar, though the heuristics that make
+  decisions based on them are not.
+
+Several notable high-level differences are:
+
+* There are [many different fuzz targets](fuzz/fuzz_targets/) that
+  exercise the allocator, including a full symbolic checker
+  (`ion_checker` target) based on the [symbolic checker in
+  regalloc.rs](https://cfallin.org/blog/2021/03/15/cranelift-isel-3/)
+  and, e.g., a targeted fuzzer for the parallel move-resolution
+  algorithm (`moves`) and the SSA generator used for generating cases
+  for the other fuzz targets (`ssagen`).
+
+* The data-structure invariants are simplified. While the IonMonkey
+  allocator allowed for LiveRanges and Bundles to overlap in certain
+  cases, this allocator sticks to a strict invariant: ranges do not
+  overlap in bundles, and bundles do not overlap. There are other
+  examples too: e.g., the definition of minimal bundles is very simple
+  and does not depend on scanning the code at all. In general, we
+  should be able to state simple invariants and see by inspection (as
+  well as fuzzing -- see above) that they hold. 
+
+* The data structures themselves are simplified. Where IonMonkey uses
+  linked lists in many places, this allocator stores simple inline
+  smallvecs of liveranges on bundles and vregs, and smallvecs of uses
+  on liveranges. We also (i) find a way to construct liveranges
+  in-order immediately, without any need for splicing, unlike
+  IonMonkey, and (ii) relax sorting invariants where possible to allow
+  for cheap append operations in many cases.
+
+* The splitting heuristics are significantly reworked. Whereas
+  IonMonkey has an all-at-once approach to splitting an entire bundle,
+  and has a list of complex heuristics to choose where to split, this
+  allocator does conflict-based splitting, and tries to decide whether
+  to split or evict and which split to take based on cost heuristics.
+
+* The liverange computation is exact, whereas IonMonkey approximates
+  using a single-pass algorithm that makes vregs live across entire
+  loop bodies. We have found that precise liveness improves allocation
+  performance and generated code quality, even though the liveness
+  itself is slightly more expensive to compute.
+
+* Many of the algorithms in the IonMonkey allocator are built with
+  helper functions that do linear scans. These "small quadratic" loops
+  are likely not a huge issue in practice, but nevertheless have the
+  potential to be in corner cases. As much as possible, all work in
+  this allocator is done in linear scans.
+
+* There are novel schemes for solving certain interesting design
+  challenges. One example: in IonMonkey, liveranges are connected
+  across blocks by, when reaching one end of a control-flow edge in a
+  scan, doing a lookup of the allocation at the other end. This is in
+  principle a linear lookup (so quadratic overall). We instead
+  generate a vector of "half-moves", keyed on the edge and from/to
+  vregs, with each holding one of the allocations. By sorting and then
+  scanning this vector, we can generate all edge moves in one linear
+  scan. There are a number of other examples of simplifications: for
+  example, we handle multiple conflicting
+  physical-register-constrained uses of a vreg in a single instruction
+  by recording a copy to do in a side-table, then removing constraints
+  for the core regalloc. Ion instead has to tweak its definition of
+  minimal bundles and create two liveranges that overlap (!) to
+  represent the two uses.
+
+* Using block parameters rather than phi-nodes significantly
+  simplifies handling of inter-block data movement. IonMonkey had to
+  special-case phis in many ways because they are actually quite
+  weird: their uses happen semantically in other blocks, and their
+  defs happen in parallel at the top of the block. Block parameters
+  naturally and explicitly represent these semantics in a direct way.
+
+* The allocator supports irreducible control flow and arbitrary block
+  ordering (its only CFG requirement is that critical edges are
+  split).
+
+* The allocator supports non-SSA code, and has native support for
+  handling program moves specially.
+
+# Appendix: Performance-Tuning Lessons
+
+In the course of optimizing the allocator's performance, we found a
+number of general principles:
+
+* We got substantial performance speedups from using vectors rather
+  than linked lists everywhere. 
This is well-known, but nevertheless,
+  it took some thought to work out how to avoid the need for any
+  splicing, and it turns out that even when our design is slightly
+  less efficient asymptotically (e.g., append-and-re-sort rather than
+  linear-time merge of two sorted liverange lists when merging
+  bundles), it is faster.
+
+* We initially used a direct translation of IonMonkey's splay tree as
+  an allocation map for each PReg. This turned out to be significantly
+  (!) less efficient than Rust's built-in BTree data structures, for
+  the usual cache-efficiency vs. pointer-chasing reasons.
+
+* We initially used dense bitvecs, as IonMonkey does, for
+  livein/liveout bits. It turned out that a chunked sparse design (see
+  below) was much more efficient.
+
+* Precise liveness significantly improves performance because it
+  reduces the size of liveranges (i.e., interference), and probing
+  registers with liveranges is the most significant hot inner
+  loop. Paying a fraction of a percent runtime for the iterative
+  dataflow algorithm to get precise bitsets is more than worth it.
+
+* The randomized probing of registers was a huge win: as above, the
+  probing is very expensive, and reducing the average number of probes
+  it takes to find a free register is very important.
+
+* In general, single-pass algorithms and design of data structures to
+  enable them are important. For example, the half-move technique
+  avoids the need to do any O(log n) search at all, and is relatively
+  cache-efficient. As another example, a side-effect of the precise
+  liveness was that we could then process operands within blocks in
+  actual instruction order (in reverse), which allowed us to simply
+  append liveranges to in-progress vreg liverange vectors and then
+  reverse at the end. The expensive part is a single pass; only the
+  bitset computation is a fixpoint loop.
+
+* Sorts are better than always-sorted data structures (like btrees):
+  they amortize all the comparison and update cost to one phase, and
+  this phase is much more cache-friendly than a bunch of spread-out
+  updates.
+
+* Take care of basic data structures and their operator definitions!
+  We initially used the auto-derived comparator on ProgPoint, and let
+  ProgPoint be a normal struct (with a u32 inst index and a
+  Before/After enum). The comparator for this, used in many sorting
+  inner loops, was a compound thing with conditionals. Instead, pack
+  them in a u32 and do a simple compare (and save half the memory as
+  well). Likewise, the half-move key is a single value packed in a
+  u64; this is far more efficient than the tuple comparator on a
+  4-tuple, and the half-move sort (which can be a few percent or more
+  of total allocation time) became multiple times cheaper.
+
+# Appendix: Data Structure: Chunked Sparse BitVec
+
+We use a "chunked sparse bitvec" to store liveness information, which
+is just a set of VReg indices. The design is fairly simple: the
+toplevel is a HashMap from "chunk" to a `u64`, and each `u64`
+represents 64 contiguous indices.
+
+The intuition is that while the vreg sets are likely sparse overall,
+they will probably be dense within small regions of the index
+space. For example, in the Nth block in a function, the values that
+flow from block N-1 will largely be almost-contiguous vreg indices, if
+vregs are allocated in sequence down the function body. Or, at least,
+they will be some local vregs together with a few defined at the top
+of the function; two separate chunks will cover that. 
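+
+A minimal sketch of the idea (with our own names; the in-tree
+implementation differs in detail but has the same shape) looks like
+this:
+
+```rust
+use std::collections::HashMap;
+
+/// Set of (vreg) indices: the map key is `index / 64`, and each
+/// value holds presence bits for 64 contiguous indices.
+#[derive(Clone, Debug, Default)]
+struct ChunkedBitSet {
+    chunks: HashMap<u32, u64>,
+}
+
+impl ChunkedBitSet {
+    /// Insert `index`; returns `true` if it was newly inserted.
+    fn insert(&mut self, index: u32) -> bool {
+        let chunk = self.chunks.entry(index / 64).or_insert(0);
+        let bit = 1u64 << (index % 64);
+        let newly = (*chunk & bit) == 0;
+        *chunk |= bit;
+        newly
+    }
+
+    fn contains(&self, index: u32) -> bool {
+        self.chunks
+            .get(&(index / 64))
+            .map_or(false, |chunk| chunk & (1u64 << (index % 64)) != 0)
+    }
+}
+
+fn main() {
+    let mut live_in = ChunkedBitSet::default();
+    assert!(live_in.insert(3));
+    assert!(!live_in.insert(3)); // already present
+    assert!(live_in.insert(1000)); // lands in a different chunk
+    assert!(live_in.contains(3) && live_in.contains(1000));
+    assert!(!live_in.contains(4));
+}
+```
+
+A dense group of vregs in one block touches only a handful of chunks,
+while overall memory use stays proportional to the number of 64-index
+regions actually populated.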
+ +We tried a number of other designs as well. Initially we used a simple +dense bitvec, but this was prohibitively expensive: O(n^2) space when +the real need is closer to O(n) (i.e., a classic sparse matrix). We +also tried a hybrid scheme that kept a vector of indices when small +and used either a bitvec or a hashset when large. This did not perform +as well because (i) it was less memory-efficient (the chunking helps +with this) and (ii) insertions are more expensive when they always +require a full hashset/hashmap insert. + +# Appendix: Fuzzing + +We have five fuzz targets: `ssagen`, `domtree`, `moves`, `ion`, and +`ion_checker`. + +## SSAGen + +The SSAGen target tests our SSA generator, which generates cases for +the full allocator fuzz targets. The SSA generator is careful to +always generate a valid CFG, with split critical edges, and valid SSA, +so that we never have to throw out a test input before we reach the +allocator itself. (An alternative fuzzing approach randomly generates +programs and then throws out those that do not meet certain conditions +before using them as legitimate testcases; this is much simpler, but +less efficient.) + +To generate a valid CFG, with no unreachable blocks and with no +critical edges, the generator (i) glues together units of either one +or three blocks (A->B, A->C), forming either a straight-through +section or a conditional. These are glued together into a "spine", and +the conditionals (the "C" block), where they exist, are then linked to +a random target block chosen among the main blocks of these one- or +three-block units. The targets are chosen either randomly, for +potentially irreducible CFGs, or in a way that ensures proper nesting +of loop backedges, if a structured CFG is requested. + +SSA is generated by first choosing which vregs will be defined in each +block, and which will be defined as blockparams vs. instruction +defs. Instructions are then generated, with operands chosen among the +"available" vregs: those defined so far in the current block and all +of those in any other block that dominates this one. + +The SSAGen fuzz target runs the above code generator against an SSA +validator, and thus ensures that it will only generate valid SSA code. + +## Domtree + +The `domtree` fuzz target computes dominance using the algorithm that +we use elsewhere in our CFG analysis, and then walks a +randomly-generated path through the CFG. It checks that the dominance +definition ("a dom b if any path from entry to b must pass through a") +is consistent with this particular randomly-chosen path. + +## Moves + +The `moves` fuzz target tests the parallel move resolver. It generates +a random sequence of parallel moves, careful to ensure that each +destination is written only once. It then runs the parallel move +resolver, and then *abstractly interprets* the resulting sequential +series of moves, thus determining which inputs flow to which +outputs. This must match the original set of parallel moves. + +## Ion and Ion-checker + +The `ion` fuzz target runs the allocator over test programs generated +by SSAGen. It does not validate the output; it only tests that the +allocator runs to completion and does not panic. This was used mainly +during development, and is now less useful than the checker-based +target. 
+ +The `ion_checker` fuzz target runs the allocator's result through a +symbolic checker, which is adapted from the one developed for +regalloc.rs (see [this blog +post](https://cfallin.org/blog/2021/01/22/cranelift-isel-2/) for more +details). This is the most useful fuzz target in the fuzzing suite, +and has found many bugs in development. diff --git a/doc/TODO b/doc/TODO new file mode 100644 index 00000000..3e430305 --- /dev/null +++ b/doc/TODO @@ -0,0 +1,34 @@ +# Features + +- Large-input support (> 1M vregs, > 1M blocks) + - Two operand impls: u64-based and u32-based. Always accept + u64-based `Operand` publicly (do not expose this in interface). + - Trait to generalize over them and support both internally + (parameterize the whole allocator impl) + - On data-structure init, choose one or the other based on max vreg + index + - Update halfmove keys: u128 rather than u64 + +- Support allocation of register pairs (or overlapping registers generally) + +- Rematerialization +- Stack-location constraints that place operands in user-defined stack + locations (distinct from SpillSlots) (e.g., stack args) + +# Performance + +- Investigate better register hinting +- Investigate more principled cost functions and split locations, + especially around loop nests + +- Investigate ways to improve bundle-merging; e.g., merge moves before + other types of connections + +- Add limited inter-block redundant-move elimination: propagate across + splits but not joins. + +- Optimize allocations (some reports of 5-7% of time spent in allocator) + +# Cleanup + +- Remove support for non-SSA code once no longer necessary \ No newline at end of file diff --git a/fuzz/.gitignore b/fuzz/.gitignore new file mode 100644 index 00000000..a0925114 --- /dev/null +++ b/fuzz/.gitignore @@ -0,0 +1,3 @@ +target +corpus +artifacts diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml new file mode 100644 index 00000000..199eb9d1 --- /dev/null +++ b/fuzz/Cargo.toml @@ -0,0 +1,51 @@ +[package] +name = "regalloc2-fuzz" +version = "0.0.0" +authors = ["Chris Fallin "] +license = "MPL-2.0 AND Apache-2.0 WITH LLVM-exception" +publish = false +edition = "2018" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +regalloc2 = { path = "../", features = ["fuzzing"] } +libfuzzer-sys = "0.3" +arbitrary = { version = "^0.4.6", features = ["derive"] } +log = { version = "0.4.8", default-features = false } +env_logger = "0.8.3" + +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[[bin]] +name = "domtree" +path = "fuzz_targets/domtree.rs" +test = false +doc = false + +[[bin]] +name = "ssagen" +path = "fuzz_targets/ssagen.rs" +test = false +doc = false + +[[bin]] +name = "ion" +path = "fuzz_targets/ion.rs" +test = false +doc = false + +[[bin]] +name = "moves" +path = "fuzz_targets/moves.rs" +test = false +doc = false + +[[bin]] +name = "ion_checker" +path = "fuzz_targets/ion_checker.rs" +test = false +doc = false diff --git a/fuzz/fuzz_targets/domtree.rs b/fuzz/fuzz_targets/domtree.rs new file mode 100644 index 00000000..c89b443c --- /dev/null +++ b/fuzz/fuzz_targets/domtree.rs @@ -0,0 +1,133 @@ +/* + * Released under the terms of the Apache 2.0 license with LLVM + * exception. See `LICENSE` for details. 
+ */ + +#![no_main] +use libfuzzer_sys::arbitrary::{Arbitrary, Result, Unstructured}; +use libfuzzer_sys::fuzz_target; +use std::collections::HashSet; + +use regalloc2::{ + fuzzing::{domtree, postorder}, + Block, +}; + +#[derive(Clone, Debug)] +struct CFG { + num_blocks: usize, + preds: Vec>, + succs: Vec>, +} + +impl Arbitrary for CFG { + fn arbitrary(u: &mut Unstructured) -> Result { + let num_blocks = u.int_in_range(1..=1000)?; + let mut succs = vec![]; + for _ in 0..num_blocks { + let mut block_succs = vec![]; + for _ in 0..u.int_in_range(0..=5)? { + block_succs.push(Block::new(u.int_in_range(0..=(num_blocks - 1))?)); + } + succs.push(block_succs); + } + let mut preds = vec![]; + for _ in 0..num_blocks { + preds.push(vec![]); + } + for from in 0..num_blocks { + for succ in &succs[from] { + preds[succ.index()].push(Block::new(from)); + } + } + Ok(CFG { + num_blocks, + preds, + succs, + }) + } +} + +#[derive(Clone, Debug)] +struct Path { + blocks: Vec, +} + +impl Path { + fn choose_from_cfg(cfg: &CFG, u: &mut Unstructured) -> Result { + let succs = u.int_in_range(0..=(2 * cfg.num_blocks))?; + let mut block = Block::new(0); + let mut blocks = vec![]; + blocks.push(block); + for _ in 0..succs { + if cfg.succs[block.index()].is_empty() { + break; + } + block = *u.choose(&cfg.succs[block.index()])?; + blocks.push(block); + } + Ok(Path { blocks }) + } +} + +fn check_idom_violations(idom: &[Block], path: &Path) { + // "a dom b" means that any path from the entry block through the CFG that + // contains a and b will contain a before b. + // + // To test this, for any given block b_i, we have the set S of b_0 .. b_{i-1}, + // and we walk up the domtree from b_i to get all blocks that dominate b_i; + // each such block must appear in S. (Otherwise, we have a counterexample + // for which dominance says it should appear in the path prefix, but it does + // not.) + let mut visited = HashSet::new(); + visited.insert(Block::new(0)); + for block in &path.blocks { + let mut parent = idom[block.index()]; + let mut domset = HashSet::new(); + domset.insert(*block); + while parent.is_valid() { + assert!(visited.contains(&parent)); + domset.insert(parent); + let next = idom[parent.index()]; + parent = next; + } + + // Check that `dominates()` returns true for every block in domset, + // and false for every other block. + for domblock in 0..idom.len() { + let domblock = Block::new(domblock); + assert_eq!( + domset.contains(&domblock), + domtree::dominates(idom, domblock, *block) + ); + } + visited.insert(*block); + } +} + +#[derive(Clone, Debug)] +struct TestCase { + cfg: CFG, + path: Path, +} + +impl Arbitrary for TestCase { + fn arbitrary(u: &mut Unstructured) -> Result { + let cfg = CFG::arbitrary(u)?; + let path = Path::choose_from_cfg(&cfg, u)?; + Ok(TestCase { cfg, path }) + } +} + +fuzz_target!(|testcase: TestCase| { + let postord = postorder::calculate(testcase.cfg.num_blocks, Block::new(0), |block| { + &testcase.cfg.succs[block.index()] + }); + let idom = domtree::calculate( + testcase.cfg.num_blocks, + |block| &testcase.cfg.preds[block.index()], + &postord[..], + Block::new(0), + ); + check_idom_violations(&idom[..], &testcase.path); +}); diff --git a/fuzz/fuzz_targets/ion.rs b/fuzz/fuzz_targets/ion.rs new file mode 100644 index 00000000..485c36bf --- /dev/null +++ b/fuzz/fuzz_targets/ion.rs @@ -0,0 +1,16 @@ +/* + * Released under the terms of the Apache 2.0 license with LLVM + * exception. See `LICENSE` for details. 
+ */ + +#![no_main] +use libfuzzer_sys::fuzz_target; + +use regalloc2::fuzzing::func::Func; + +fuzz_target!(|func: Func| { + let _ = env_logger::try_init(); + log::trace!("func:\n{:?}", func); + let env = regalloc2::fuzzing::func::machine_env(); + let _out = regalloc2::fuzzing::ion::run(&func, &env, false).expect("regalloc did not succeed"); +}); diff --git a/fuzz/fuzz_targets/ion_checker.rs b/fuzz/fuzz_targets/ion_checker.rs new file mode 100644 index 00000000..d467a03c --- /dev/null +++ b/fuzz/fuzz_targets/ion_checker.rs @@ -0,0 +1,48 @@ +/* + * Released under the terms of the Apache 2.0 license with LLVM + * exception. See `LICENSE` for details. + */ + +#![no_main] +use libfuzzer_sys::arbitrary::{Arbitrary, Result, Unstructured}; +use libfuzzer_sys::fuzz_target; + +use regalloc2::fuzzing::checker::Checker; +use regalloc2::fuzzing::func::{Func, Options}; + +#[derive(Clone, Debug)] +struct TestCase { + func: Func, +} + +impl Arbitrary for TestCase { + fn arbitrary(u: &mut Unstructured) -> Result { + Ok(TestCase { + func: Func::arbitrary_with_options( + u, + &Options { + reused_inputs: true, + fixed_regs: true, + clobbers: true, + control_flow: true, + reducible: false, + block_params: true, + always_local_uses: false, + reftypes: true, + }, + )?, + }) + } +} + +fuzz_target!(|testcase: TestCase| { + let func = testcase.func; + let _ = env_logger::try_init(); + log::trace!("func:\n{:?}", func); + let env = regalloc2::fuzzing::func::machine_env(); + let out = regalloc2::fuzzing::ion::run(&func, &env, true).expect("regalloc did not succeed"); + + let mut checker = Checker::new(&func); + checker.prepare(&out); + checker.run().expect("checker failed"); +}); diff --git a/fuzz/fuzz_targets/moves.rs b/fuzz/fuzz_targets/moves.rs new file mode 100644 index 00000000..e62342f4 --- /dev/null +++ b/fuzz/fuzz_targets/moves.rs @@ -0,0 +1,81 @@ +/* + * Released under the terms of the Apache 2.0 license with LLVM + * exception. See `LICENSE` for details. + */ + +#![no_main] +use libfuzzer_sys::arbitrary::{Arbitrary, Result, Unstructured}; +use libfuzzer_sys::fuzz_target; + +use regalloc2::fuzzing::moves::ParallelMoves; +use regalloc2::{Allocation, PReg, RegClass}; +use std::collections::HashSet; + +#[derive(Clone, Debug)] +struct TestCase { + moves: Vec<(Allocation, Allocation)>, +} + +impl Arbitrary for TestCase { + fn arbitrary(u: &mut Unstructured) -> Result { + let mut ret = TestCase { moves: vec![] }; + let mut written = HashSet::new(); + while bool::arbitrary(u)? { + let reg1 = u.int_in_range(0..=30)?; + let reg2 = u.int_in_range(0..=30)?; + if written.contains(®2) { + break; + } + written.insert(reg2); + ret.moves.push(( + Allocation::reg(PReg::new(reg1, RegClass::Int)), + Allocation::reg(PReg::new(reg2, RegClass::Int)), + )); + } + Ok(ret) + } +} + +fuzz_target!(|testcase: TestCase| { + let _ = env_logger::try_init(); + let scratch = Allocation::reg(PReg::new(31, RegClass::Int)); + let mut par = ParallelMoves::new(scratch); + for &(src, dst) in &testcase.moves { + par.add(src, dst, ()); + } + let moves = par.resolve(); + + // Compute the final source reg for each dest reg in the original + // parallel-move set. + let mut final_src_per_dest: Vec> = vec![None; 32]; + for &(src, dst) in &testcase.moves { + if let (Some(preg_src), Some(preg_dst)) = (src.as_reg(), dst.as_reg()) { + final_src_per_dest[preg_dst.hw_enc()] = Some(preg_src.hw_enc()); + } + } + + // Simulate the sequence of moves. 
+ let mut regfile: Vec> = vec![None; 32]; + for i in 0..32 { + regfile[i] = Some(i); + } + for (src, dst, _) in moves { + if let (Some(preg_src), Some(preg_dst)) = (src.as_reg(), dst.as_reg()) { + let data = regfile[preg_src.hw_enc()]; + regfile[preg_dst.hw_enc()] = data; + } else { + panic!("Bad allocation in move list"); + } + } + + // Assert that the expected register-moves occurred. + // N.B.: range up to 31 (not 32) to skip scratch register. + for i in 0..31 { + if let Some(orig_src) = final_src_per_dest[i] { + assert_eq!(regfile[i], Some(orig_src)); + } else { + // Should be untouched. + assert_eq!(regfile[i], Some(i)); + } + } +}); diff --git a/fuzz/fuzz_targets/ssagen.rs b/fuzz/fuzz_targets/ssagen.rs new file mode 100644 index 00000000..bed2253c --- /dev/null +++ b/fuzz/fuzz_targets/ssagen.rs @@ -0,0 +1,42 @@ +/* + * Released under the terms of the Apache 2.0 license with LLVM + * exception. See `LICENSE` for details. + */ + +#![no_main] +use libfuzzer_sys::arbitrary::{Arbitrary, Result, Unstructured}; +use libfuzzer_sys::fuzz_target; + +use regalloc2::fuzzing::cfg::CFGInfo; +use regalloc2::fuzzing::func::{Func, Options}; +use regalloc2::fuzzing::ssa::validate_ssa; + +#[derive(Debug)] +struct TestCase { + f: Func, +} + +impl Arbitrary for TestCase { + fn arbitrary(u: &mut Unstructured) -> Result { + Ok(TestCase { + f: Func::arbitrary_with_options( + u, + &Options { + reused_inputs: true, + fixed_regs: true, + clobbers: true, + control_flow: true, + reducible: false, + always_local_uses: false, + block_params: true, + reftypes: true, + }, + )?, + }) + } +} + +fuzz_target!(|t: TestCase| { + let cfginfo = CFGInfo::new(&t.f).expect("could not create CFG info"); + validate_ssa(&t.f, &cfginfo).expect("invalid SSA"); +}); diff --git a/fuzz/smoketest/ion_checker.bin b/fuzz/smoketest/ion_checker.bin new file mode 100644 index 00000000..5156f227 Binary files /dev/null and b/fuzz/smoketest/ion_checker.bin differ diff --git a/src/cfg.rs b/src/cfg.rs new file mode 100644 index 00000000..f2abc47d --- /dev/null +++ b/src/cfg.rs @@ -0,0 +1,153 @@ +/* + * Released under the terms of the Apache 2.0 license with LLVM + * exception. See `LICENSE` for details. + */ + +//! Lightweight CFG analyses. + +use crate::{domtree, postorder, Block, Function, Inst, OperandKind, ProgPoint, RegAllocError}; +use smallvec::{smallvec, SmallVec}; + +#[derive(Clone, Debug)] +pub struct CFGInfo { + /// Postorder traversal of blocks. + pub postorder: Vec, + /// Domtree parents, indexed by block. + pub domtree: Vec, + /// For each instruction, the block it belongs to. + pub insn_block: Vec, + /// For each vreg, the instruction that defines it, if any. + pub vreg_def_inst: Vec, + /// For each vreg, the block that defines it as a blockparam, if + /// any. (Every vreg must have a valid entry in either + /// `vreg_def_inst` or `vreg_def_blockparam`.) + pub vreg_def_blockparam: Vec<(Block, u32)>, + /// For each block, the first instruction. + pub block_entry: Vec, + /// For each block, the last instruction. + pub block_exit: Vec, + /// For each block, what is the approximate loop depth? + /// + /// This measure is fully precise iff the input CFG is reducible + /// and blocks are in RPO, so that loop backedges are precisely + /// those whose block target indices are less than their source + /// indices. Otherwise, it will be approximate, but should still + /// be usable for heuristic purposes. 
+ pub approx_loop_depth: Vec, +} + +impl CFGInfo { + pub fn new(f: &F) -> Result { + let postorder = postorder::calculate(f.num_blocks(), f.entry_block(), |block| { + f.block_succs(block) + }); + let domtree = domtree::calculate( + f.num_blocks(), + |block| f.block_preds(block), + &postorder[..], + f.entry_block(), + ); + let mut insn_block = vec![Block::invalid(); f.num_insts()]; + let mut vreg_def_inst = vec![Inst::invalid(); f.num_vregs()]; + let mut vreg_def_blockparam = vec![(Block::invalid(), 0); f.num_vregs()]; + let mut block_entry = vec![ProgPoint::before(Inst::invalid()); f.num_blocks()]; + let mut block_exit = vec![ProgPoint::before(Inst::invalid()); f.num_blocks()]; + let mut backedge_in = vec![0; f.num_blocks()]; + let mut backedge_out = vec![0; f.num_blocks()]; + + for block in 0..f.num_blocks() { + let block = Block::new(block); + for (i, param) in f.block_params(block).iter().enumerate() { + vreg_def_blockparam[param.vreg()] = (block, i as u32); + } + for inst in f.block_insns(block).iter() { + insn_block[inst.index()] = block; + for operand in f.inst_operands(inst) { + match operand.kind() { + OperandKind::Def => { + vreg_def_inst[operand.vreg().vreg()] = inst; + } + _ => {} + } + } + } + block_entry[block.index()] = ProgPoint::before(f.block_insns(block).first()); + block_exit[block.index()] = ProgPoint::after(f.block_insns(block).last()); + + // Check critical edge condition: if there is more than + // one predecessor, each must have only one successor + // (this block). + let preds = f.block_preds(block).len() + if block == f.entry_block() { 1 } else { 0 }; + if preds > 1 { + for &pred in f.block_preds(block) { + let succs = f.block_succs(pred).len(); + if succs > 1 { + return Err(RegAllocError::CritEdge(pred, block)); + } + } + } + + // Check branch-arg condition: if any successors have more + // than one predecessor (given above, there will only be + // one such successor), then the last instruction of this + // block (the branch) cannot have any args other than the + // blockparams. 
+ let mut require_no_branch_args = false; + for &succ in f.block_succs(block) { + let preds = f.block_preds(succ).len() + if succ == f.entry_block() { 1 } else { 0 }; + if preds > 1 { + require_no_branch_args = true; + } + } + if require_no_branch_args { + let last = f.block_insns(block).last(); + if f.branch_blockparam_arg_offset(block, last) > 0 { + return Err(RegAllocError::DisallowedBranchArg(last)); + } + } + + for &succ in f.block_succs(block) { + if succ.index() <= block.index() { + backedge_in[succ.index()] += 1; + backedge_out[block.index()] += 1; + } + } + } + + let mut approx_loop_depth = vec![]; + let mut backedge_stack: SmallVec<[usize; 4]> = smallvec![]; + let mut cur_depth = 0; + for block in 0..f.num_blocks() { + if backedge_in[block] > 0 { + cur_depth += 1; + backedge_stack.push(backedge_in[block]); + } + + approx_loop_depth.push(cur_depth); + + while backedge_stack.len() > 0 && backedge_out[block] > 0 { + backedge_out[block] -= 1; + *backedge_stack.last_mut().unwrap() -= 1; + if *backedge_stack.last().unwrap() == 0 { + cur_depth -= 1; + backedge_stack.pop(); + } + } + } + + Ok(CFGInfo { + postorder, + domtree, + insn_block, + vreg_def_inst, + vreg_def_blockparam, + block_entry, + block_exit, + approx_loop_depth, + }) + } + + pub fn dominates(&self, a: Block, b: Block) -> bool { + domtree::dominates(&self.domtree[..], a, b) + } +} diff --git a/src/checker.rs b/src/checker.rs new file mode 100644 index 00000000..146dbeeb --- /dev/null +++ b/src/checker.rs @@ -0,0 +1,745 @@ +/* + * The following code is derived from `lib/src/checker.rs` in the + * regalloc.rs project + * (https://github.com/bytecodealliance/regalloc.rs). regalloc.rs is + * also licensed under Apache-2.0 with the LLVM exception, as the rest + * of regalloc2's non-Ion-derived code is. + */ + +//! Checker: verifies that spills/reloads/moves retain equivalent +//! dataflow to original, VReg-based code. +//! +//! The basic idea is that we track symbolic values as they flow +//! through spills and reloads. The symbolic values represent +//! particular virtual registers in the original function body +//! presented to the register allocator. Any instruction in the +//! original function body (i.e., not added by the allocator) +//! conceptually generates a symbolic value "Vn" when storing to (or +//! modifying) a virtual register. +//! +//! These symbolic values are precise but partial: in other words, if +//! a physical register is described as containing a virtual register +//! at a program point, it must actually contain the value of this +//! register (modulo any analysis bugs); but it may resolve to +//! `Conflicts` even in cases where one *could* statically prove that +//! it contains a certain register, because the analysis is not +//! perfectly path-sensitive or value-sensitive. However, all +//! assignments *produced by our register allocator* should be +//! analyzed fully precisely. +//! +//! Operand constraints (fixed register, register, any) are also checked +//! at each operand. +//! +//! The dataflow analysis state at each program point is: +//! +//! - map of: Allocation -> lattice value (top > Vn symbols (unordered) > bottom) +//! +//! And the transfer functions for instructions are (where `A` is the +//! above map from allocated physical registers to symbolic values): +//! +//! - `Edit::Move` inserted by RA: [ alloc_d := alloc_s ] +//! +//! A[alloc_d] := A[alloc_s] +//! +//! - phi-node [ V_i := phi block_j:V_j, block_k:V_k, ... ] +//! with allocations [ A_i := phi block_j:A_j, block_k:A_k, ... ] +//! 
(N.B.: phi-nodes are not semantically present in the final +//! machine code, but we include their allocations so that this +//! checker can work) +//! +//! A[A_i] := meet(A[A_j], A[A_k], ...) +//! +//! - statement in pre-regalloc function [ V_i := op V_j, V_k, ... ] +//! with allocated form [ A_i := op A_j, A_k, ... ] +//! +//! A[A_i] := `V_i` +//! +//! In other words, a statement, even after allocation, generates +//! a symbol that corresponds to its original virtual-register +//! def. +//! +//! (N.B.: moves in pre-regalloc function fall into this last case +//! -- they are "just another operation" and generate a new +//! symbol) +//! +//! At control-flow join points, the symbols meet using a very simple +//! lattice meet-function: two different symbols in the same +//! allocation meet to "conflicted"; otherwise, the symbol meets with +//! itself to produce itself (reflexivity). +//! +//! To check correctness, we first find the dataflow fixpoint with the +//! above lattice and transfer/meet functions. Then, at each op, we +//! examine the dataflow solution at the preceding program point, and +//! check that the allocation for each op arg (input/use) contains the +//! symbol corresponding to the original virtual register specified +//! for this arg. + +#![allow(dead_code)] + +use crate::{ + Allocation, AllocationKind, Block, Edit, Function, Inst, InstPosition, Operand, + OperandConstraint, OperandKind, OperandPos, Output, PReg, ProgPoint, SpillSlot, VReg, +}; + +use std::collections::{HashMap, HashSet, VecDeque}; +use std::default::Default; +use std::hash::Hash; +use std::result::Result; + +/// A set of errors detected by the regalloc checker. +#[derive(Clone, Debug)] +pub struct CheckerErrors { + errors: Vec, +} + +/// A single error detected by the regalloc checker. +#[derive(Clone, Debug)] +pub enum CheckerError { + MissingAllocation { + inst: Inst, + op: Operand, + }, + UnknownValueInAllocation { + inst: Inst, + op: Operand, + alloc: Allocation, + }, + ConflictedValueInAllocation { + inst: Inst, + op: Operand, + alloc: Allocation, + }, + IncorrectValueInAllocation { + inst: Inst, + op: Operand, + alloc: Allocation, + actual: VReg, + }, + ConstraintViolated { + inst: Inst, + op: Operand, + alloc: Allocation, + }, + AllocationIsNotReg { + inst: Inst, + op: Operand, + alloc: Allocation, + }, + AllocationIsNotFixedReg { + inst: Inst, + op: Operand, + alloc: Allocation, + }, + AllocationIsNotReuse { + inst: Inst, + op: Operand, + alloc: Allocation, + expected_alloc: Allocation, + }, + AllocationIsNotStack { + inst: Inst, + op: Operand, + alloc: Allocation, + }, + ConflictedValueInStackmap { + inst: Inst, + slot: SpillSlot, + }, + NonRefValueInStackmap { + inst: Inst, + slot: SpillSlot, + vreg: VReg, + }, +} + +/// Abstract state for an allocation. +/// +/// Forms a lattice with \top (`Unknown`), \bot (`Conflicted`), and a +/// number of mutually unordered value-points in between, one per real +/// or virtual register. Any two different registers meet to \bot. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum CheckerValue { + /// "top" value: this storage slot has no known value. + Unknown, + /// "bottom" value: this storage slot has a conflicted value. + Conflicted, + /// Reg: this storage slot has a value that originated as a def + /// into the given virtual register. + /// + /// The boolean flag indicates whether the value is + /// reference-typed. 
+ Reg(VReg, bool), +} + +impl Default for CheckerValue { + fn default() -> CheckerValue { + CheckerValue::Unknown + } +} + +impl CheckerValue { + /// Meet function of the abstract-interpretation value lattice. + fn meet(&self, other: &CheckerValue) -> CheckerValue { + match (self, other) { + (&CheckerValue::Unknown, _) => *other, + (_, &CheckerValue::Unknown) => *self, + (&CheckerValue::Conflicted, _) => *self, + (_, &CheckerValue::Conflicted) => *other, + (&CheckerValue::Reg(r1, ref1), &CheckerValue::Reg(r2, ref2)) + if r1 == r2 && ref1 == ref2 => + { + CheckerValue::Reg(r1, ref1) + } + _ => { + log::trace!("{:?} and {:?} meet to Conflicted", self, other); + CheckerValue::Conflicted + } + } + } +} + +/// State that steps through program points as we scan over the instruction stream. +#[derive(Clone, Debug, PartialEq, Eq)] +struct CheckerState { + allocations: HashMap, +} + +impl Default for CheckerState { + fn default() -> CheckerState { + CheckerState { + allocations: HashMap::new(), + } + } +} + +impl std::fmt::Display for CheckerValue { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + CheckerValue::Unknown => write!(f, "?"), + CheckerValue::Conflicted => write!(f, "!"), + CheckerValue::Reg(r, false) => write!(f, "{}", r), + CheckerValue::Reg(r, true) => write!(f, "{}/ref", r), + } + } +} + +fn merge_map( + into: &mut HashMap, + from: &HashMap, +) { + for (k, v) in from { + let into_v = into.entry(*k).or_insert(Default::default()); + let merged = into_v.meet(v); + *into_v = merged; + } +} + +impl CheckerState { + /// Create a new checker state. + fn new() -> CheckerState { + Default::default() + } + + /// Merge this checker state with another at a CFG join-point. + fn meet_with(&mut self, other: &CheckerState) { + merge_map(&mut self.allocations, &other.allocations); + } + + fn check_val( + &self, + inst: Inst, + op: Operand, + alloc: Allocation, + val: CheckerValue, + allocs: &[Allocation], + ) -> Result<(), CheckerError> { + if alloc == Allocation::none() { + return Err(CheckerError::MissingAllocation { inst, op }); + } + + match val { + CheckerValue::Unknown => { + return Err(CheckerError::UnknownValueInAllocation { inst, op, alloc }); + } + CheckerValue::Conflicted => { + return Err(CheckerError::ConflictedValueInAllocation { inst, op, alloc }); + } + CheckerValue::Reg(r, _) if r != op.vreg() => { + return Err(CheckerError::IncorrectValueInAllocation { + inst, + op, + alloc, + actual: r, + }); + } + _ => {} + } + + self.check_constraint(inst, op, alloc, allocs)?; + + Ok(()) + } + + /// Check an instruction against this state. This must be called + /// twice: once with `InstPosition::Before`, and once with + /// `InstPosition::After` (after updating state with defs). + fn check(&self, pos: InstPosition, checkinst: &CheckerInst) -> Result<(), CheckerError> { + match checkinst { + &CheckerInst::Op { + inst, + ref operands, + ref allocs, + .. + } => { + // Skip Use-checks at the After point if there are any + // reused inputs: the Def which reuses the input + // happens early. + let has_reused_input = operands + .iter() + .any(|op| matches!(op.constraint(), OperandConstraint::Reuse(_))); + if has_reused_input && pos == InstPosition::After { + return Ok(()); + } + + // For each operand, check (i) that the allocation + // contains the expected vreg, and (ii) that it meets + // the requirements of the OperandConstraint. 
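The `meet` function above determines the states that the per-operand checks below consult at control-flow joins; here is a minimal standalone sketch of its lattice behaviour (a tiny stand-in enum rather than the crate's `CheckerValue`, ignoring the reference-typed flag):

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Val {
    Unknown,
    Conflicted,
    Reg(u32),
}

fn meet(a: Val, b: Val) -> Val {
    match (a, b) {
        // "top" is the identity element.
        (Val::Unknown, x) | (x, Val::Unknown) => x,
        // "bottom" absorbs everything.
        (Val::Conflicted, _) | (_, Val::Conflicted) => Val::Conflicted,
        // Equal symbols meet to themselves; distinct symbols collapse.
        (Val::Reg(r1), Val::Reg(r2)) if r1 == r2 => Val::Reg(r1),
        _ => Val::Conflicted,
    }
}

fn main() {
    assert_eq!(meet(Val::Unknown, Val::Reg(3)), Val::Reg(3));
    assert_eq!(meet(Val::Reg(3), Val::Reg(3)), Val::Reg(3));
    assert_eq!(meet(Val::Reg(3), Val::Reg(4)), Val::Conflicted);
    assert_eq!(meet(Val::Conflicted, Val::Unknown), Val::Conflicted);
}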
+ for (op, alloc) in operands.iter().zip(allocs.iter()) { + let is_here = match (op.pos(), pos) { + (OperandPos::Early, InstPosition::Before) => true, + (OperandPos::Late, InstPosition::After) => true, + _ => false, + }; + if !is_here { + continue; + } + if op.kind() == OperandKind::Def { + continue; + } + + let val = self + .allocations + .get(alloc) + .cloned() + .unwrap_or(Default::default()); + log::trace!( + "checker: checkinst {:?}: op {:?}, alloc {:?}, checker value {:?}", + checkinst, + op, + alloc, + val + ); + self.check_val(inst, *op, *alloc, val, allocs)?; + } + } + &CheckerInst::Safepoint { inst, ref slots } => { + for &slot in slots { + let alloc = Allocation::stack(slot); + let val = self + .allocations + .get(&alloc) + .cloned() + .unwrap_or(Default::default()); + log::trace!( + "checker: checkinst {:?}: safepoint slot {}, checker value {:?}", + checkinst, + slot, + val + ); + + match val { + CheckerValue::Unknown => {} + CheckerValue::Conflicted => { + return Err(CheckerError::ConflictedValueInStackmap { inst, slot }); + } + CheckerValue::Reg(vreg, false) => { + return Err(CheckerError::NonRefValueInStackmap { inst, slot, vreg }); + } + CheckerValue::Reg(_, true) => {} + } + } + } + _ => {} + } + Ok(()) + } + + /// Update according to instruction. + fn update<'a, F: Function>(&mut self, checkinst: &CheckerInst, checker: &Checker<'a, F>) { + match checkinst { + &CheckerInst::Move { into, from } => { + let val = self + .allocations + .get(&from) + .cloned() + .unwrap_or(Default::default()); + log::trace!( + "checker: checkinst {:?} updating: move {:?} -> {:?} val {:?}", + checkinst, + from, + into, + val + ); + self.allocations.insert(into, val); + } + &CheckerInst::Op { + ref operands, + ref allocs, + ref clobbers, + .. + } => { + for (op, alloc) in operands.iter().zip(allocs.iter()) { + if op.kind() != OperandKind::Def { + continue; + } + let reftyped = checker.reftyped_vregs.contains(&op.vreg()); + self.allocations + .insert(*alloc, CheckerValue::Reg(op.vreg(), reftyped)); + } + for clobber in clobbers { + self.allocations.remove(&Allocation::reg(*clobber)); + } + } + &CheckerInst::DefAlloc { alloc, vreg } => { + let reftyped = checker.reftyped_vregs.contains(&vreg); + self.allocations + .insert(alloc, CheckerValue::Reg(vreg, reftyped)); + } + &CheckerInst::Safepoint { ref slots, .. 
} => { + for (alloc, value) in &mut self.allocations { + if let CheckerValue::Reg(_, true) = *value { + if alloc.is_reg() { + *value = CheckerValue::Conflicted; + } else if alloc.is_stack() && !slots.contains(&alloc.as_stack().unwrap()) { + *value = CheckerValue::Conflicted; + } + } + } + } + } + } + + fn check_constraint( + &self, + inst: Inst, + op: Operand, + alloc: Allocation, + allocs: &[Allocation], + ) -> Result<(), CheckerError> { + match op.constraint() { + OperandConstraint::Any => {} + OperandConstraint::Reg => { + if alloc.kind() != AllocationKind::Reg { + return Err(CheckerError::AllocationIsNotReg { inst, op, alloc }); + } + } + OperandConstraint::Stack => { + if alloc.kind() != AllocationKind::Stack { + return Err(CheckerError::AllocationIsNotStack { inst, op, alloc }); + } + } + OperandConstraint::FixedReg(preg) => { + if alloc != Allocation::reg(preg) { + return Err(CheckerError::AllocationIsNotFixedReg { inst, op, alloc }); + } + } + OperandConstraint::Reuse(idx) => { + if alloc.kind() != AllocationKind::Reg { + return Err(CheckerError::AllocationIsNotReg { inst, op, alloc }); + } + if alloc != allocs[idx] { + return Err(CheckerError::AllocationIsNotReuse { + inst, + op, + alloc, + expected_alloc: allocs[idx], + }); + } + } + } + Ok(()) + } +} + +/// An instruction representation in the checker's BB summary. +#[derive(Clone, Debug)] +pub(crate) enum CheckerInst { + /// A move between allocations (these could be registers or + /// spillslots). + Move { into: Allocation, from: Allocation }, + + /// A regular instruction with fixed use and def slots. Contains + /// both the original operands (as given to the regalloc) and the + /// allocation results. + Op { + inst: Inst, + operands: Vec, + allocs: Vec, + clobbers: Vec, + }, + + /// Define an allocation's contents. Like BlockParams but for one + /// allocation. Used sometimes when moves are elided but ownership + /// of a value is logically transferred to a new vreg. + DefAlloc { alloc: Allocation, vreg: VReg }, + + /// A safepoint, with the given SpillSlots specified as containing + /// reftyped values. All other reftyped values become invalid. + Safepoint { inst: Inst, slots: Vec }, +} + +#[derive(Debug)] +pub struct Checker<'a, F: Function> { + f: &'a F, + bb_in: HashMap, + bb_insts: HashMap>, + reftyped_vregs: HashSet, +} + +impl<'a, F: Function> Checker<'a, F> { + /// Create a new checker for the given function, initializing CFG + /// info immediately. The client should call the `add_*()` + /// methods to add abstract instructions to each BB before + /// invoking `run()` to check for errors. + pub fn new(f: &'a F) -> Checker<'a, F> { + let mut bb_in = HashMap::new(); + let mut bb_insts = HashMap::new(); + let mut reftyped_vregs = HashSet::new(); + + for block in 0..f.num_blocks() { + let block = Block::new(block); + bb_in.insert(block, Default::default()); + bb_insts.insert(block, vec![]); + } + + for &vreg in f.reftype_vregs() { + reftyped_vregs.insert(vreg); + } + + Checker { + f, + bb_in, + bb_insts, + reftyped_vregs, + } + } + + /// Build the list of checker instructions based on the given func + /// and allocation results. + pub fn prepare(&mut self, out: &Output) { + log::trace!("checker: out = {:?}", out); + // Preprocess safepoint stack-maps into per-inst vecs. + let mut safepoint_slots: HashMap> = HashMap::new(); + for &(progpoint, slot) in &out.safepoint_slots { + safepoint_slots + .entry(progpoint.inst()) + .or_insert_with(|| vec![]) + .push(slot); + } + + // For each original instruction, create an `Op`. 
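The loop that follows interleaves allocator edits with the original instructions using a single cursor over `out.edits`, which only works if program points order as `before(i) < after(i) < before(i+1)`. A small hypothetical test of that assumption, relying only on `ProgPoint` being comparable (as its use with `<=` in `handle_edits` below implies):

#[test]
fn progpoint_ordering_sketch() {
    use crate::{Inst, ProgPoint};
    let i0 = Inst::new(0);
    let i1 = Inst::new(1);
    // Edits attached before an instruction drain before those attached
    // after it, which in turn drain before anything on the next inst.
    assert!(ProgPoint::before(i0) < ProgPoint::after(i0));
    assert!(ProgPoint::after(i0) < ProgPoint::before(i1));
}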
+ let mut last_inst = None; + let mut insert_idx = 0; + for block in 0..self.f.num_blocks() { + let block = Block::new(block); + for inst in self.f.block_insns(block).iter() { + assert!(last_inst.is_none() || inst > last_inst.unwrap()); + last_inst = Some(inst); + + // Any inserted edits before instruction. + self.handle_edits(block, out, &mut insert_idx, ProgPoint::before(inst)); + + // If this is a safepoint, then check the spillslots at this point. + if self.f.requires_refs_on_stack(inst) { + let slots = safepoint_slots.remove(&inst).unwrap_or_else(|| vec![]); + + let checkinst = CheckerInst::Safepoint { inst, slots }; + self.bb_insts.get_mut(&block).unwrap().push(checkinst); + } + + // Skip if this is a branch: the blockparams do not + // exist in post-regalloc code, and the edge-moves + // have to be inserted before the branch rather than + // after. + if !self.f.is_branch(inst) { + // Instruction itself. + let operands: Vec<_> = self.f.inst_operands(inst).iter().cloned().collect(); + let allocs: Vec<_> = out.inst_allocs(inst).iter().cloned().collect(); + let clobbers: Vec<_> = self.f.inst_clobbers(inst).iter().cloned().collect(); + let checkinst = CheckerInst::Op { + inst, + operands, + allocs, + clobbers, + }; + log::trace!("checker: adding inst {:?}", checkinst); + self.bb_insts.get_mut(&block).unwrap().push(checkinst); + } + + // Any inserted edits after instruction. + self.handle_edits(block, out, &mut insert_idx, ProgPoint::after(inst)); + } + } + } + + fn handle_edits(&mut self, block: Block, out: &Output, idx: &mut usize, pos: ProgPoint) { + while *idx < out.edits.len() && out.edits[*idx].0 <= pos { + let &(edit_pos, ref edit) = &out.edits[*idx]; + *idx += 1; + if edit_pos < pos { + continue; + } + log::trace!("checker: adding edit {:?} at pos {:?}", edit, pos); + match edit { + &Edit::Move { from, to, to_vreg } => { + self.bb_insts + .get_mut(&block) + .unwrap() + .push(CheckerInst::Move { into: to, from }); + if let Some(vreg) = to_vreg { + self.bb_insts + .get_mut(&block) + .unwrap() + .push(CheckerInst::DefAlloc { alloc: to, vreg }); + } + } + &Edit::DefAlloc { alloc, vreg } => { + self.bb_insts + .get_mut(&block) + .unwrap() + .push(CheckerInst::DefAlloc { alloc, vreg }); + } + } + } + } + + /// Perform the dataflow analysis to compute checker state at each BB entry. 
+ fn analyze(&mut self) { + let mut queue = VecDeque::new(); + let mut queue_set = HashSet::new(); + for block in 0..self.f.num_blocks() { + let block = Block::new(block); + queue.push_back(block); + queue_set.insert(block); + } + + while !queue.is_empty() { + let block = queue.pop_front().unwrap(); + queue_set.remove(&block); + let mut state = self.bb_in.get(&block).cloned().unwrap(); + log::trace!("analyze: block {} has state {:?}", block.index(), state); + for inst in self.bb_insts.get(&block).unwrap() { + state.update(inst, self); + log::trace!("analyze: inst {:?} -> state {:?}", inst, state); + } + + for &succ in self.f.block_succs(block) { + let cur_succ_in = self.bb_in.get(&succ).unwrap(); + let mut new_state = state.clone(); + new_state.meet_with(cur_succ_in); + let changed = &new_state != cur_succ_in; + if changed { + log::trace!( + "analyze: block {} state changed from {:?} to {:?}; pushing onto queue", + succ.index(), + cur_succ_in, + new_state + ); + self.bb_in.insert(succ, new_state); + if !queue_set.contains(&succ) { + queue.push_back(succ); + queue_set.insert(succ); + } + } + } + } + } + + /// Using BB-start state computed by `analyze()`, step the checker state + /// through each BB and check each instruction's register allocations + /// for errors. + fn find_errors(&self) -> Result<(), CheckerErrors> { + let mut errors = vec![]; + for (block, input) in &self.bb_in { + let mut state = input.clone(); + for inst in self.bb_insts.get(block).unwrap() { + if let Err(e) = state.check(InstPosition::Before, inst) { + log::trace!("Checker error: {:?}", e); + errors.push(e); + } + state.update(inst, self); + if let Err(e) = state.check(InstPosition::After, inst) { + log::trace!("Checker error: {:?}", e); + errors.push(e); + } + } + } + + if errors.is_empty() { + Ok(()) + } else { + Err(CheckerErrors { errors }) + } + } + + /// Find any errors, returning `Err(CheckerErrors)` with all errors found + /// or `Ok(())` otherwise. + pub fn run(mut self) -> Result<(), CheckerErrors> { + self.analyze(); + let result = self.find_errors(); + + log::trace!("=== CHECKER RESULT ==="); + fn print_state(state: &CheckerState) { + let mut s = vec![]; + for (alloc, state) in &state.allocations { + s.push(format!("{} := {}", alloc, state)); + } + log::trace!(" {{ {} }}", s.join(", ")) + } + for vreg in self.f.reftype_vregs() { + log::trace!(" REF: {}", vreg); + } + for bb in 0..self.f.num_blocks() { + let bb = Block::new(bb); + log::trace!("block{}:", bb.index()); + let insts = self.bb_insts.get(&bb).unwrap(); + let mut state = self.bb_in.get(&bb).unwrap().clone(); + print_state(&state); + for inst in insts { + match inst { + &CheckerInst::Op { + inst, + ref operands, + ref allocs, + ref clobbers, + } => { + log::trace!( + " inst{}: {:?} ({:?}) clobbers:{:?}", + inst.index(), + operands, + allocs, + clobbers + ); + } + &CheckerInst::Move { from, into } => { + log::trace!(" {} -> {}", from, into); + } + &CheckerInst::DefAlloc { alloc, vreg } => { + log::trace!(" defalloc: {}:{}", vreg, alloc); + } + &CheckerInst::Safepoint { ref slots, .. 
} => { + let mut slotargs = vec![]; + for &slot in slots { + slotargs.push(format!("{}", slot)); + } + log::trace!(" safepoint: {}", slotargs.join(", ")); + } + } + state.update(inst, &self); + print_state(&state); + } + } + + result + } +} diff --git a/src/domtree.rs b/src/domtree.rs new file mode 100644 index 00000000..4300e04f --- /dev/null +++ b/src/domtree.rs @@ -0,0 +1,118 @@ +/* + * Derives from the dominator tree implementation in regalloc.rs, which is + * licensed under the Apache Public License 2.0 with LLVM Exception. See: + * https://github.com/bytecodealliance/regalloc.rs + */ + +// This is an implementation of the algorithm described in +// +// A Simple, Fast Dominance Algorithm +// Keith D. Cooper, Timothy J. Harvey, and Ken Kennedy +// Department of Computer Science, Rice University, Houston, Texas, USA +// TR-06-33870 +// https://www.cs.rice.edu/~keith/EMBED/dom.pdf + +use crate::Block; + +// Helper +fn merge_sets( + idom: &[Block], // map from Block to Block + block_to_rpo: &[Option], + mut node1: Block, + mut node2: Block, +) -> Block { + while node1 != node2 { + if node1.is_invalid() || node2.is_invalid() { + return Block::invalid(); + } + let rpo1 = block_to_rpo[node1.index()].unwrap(); + let rpo2 = block_to_rpo[node2.index()].unwrap(); + if rpo1 > rpo2 { + node1 = idom[node1.index()]; + } else if rpo2 > rpo1 { + node2 = idom[node2.index()]; + } + } + assert!(node1 == node2); + node1 +} + +pub fn calculate<'a, PredFn: Fn(Block) -> &'a [Block]>( + num_blocks: usize, + preds: PredFn, + post_ord: &[Block], + start: Block, +) -> Vec { + // We have post_ord, which is the postorder sequence. + + // Compute maps from RPO to block number and vice-versa. + let mut block_to_rpo = vec![None; num_blocks]; + block_to_rpo.resize(num_blocks, None); + for (i, rpo_block) in post_ord.iter().rev().enumerate() { + block_to_rpo[rpo_block.index()] = Some(i as u32); + } + + let mut idom = vec![Block::invalid(); num_blocks]; + + // The start node must have itself as a parent. + idom[start.index()] = start; + + let mut changed = true; + while changed { + changed = false; + // Consider blocks in reverse postorder. Skip any that are unreachable. + for &node in post_ord.iter().rev() { + let rponum = block_to_rpo[node.index()].unwrap(); + + let mut parent = Block::invalid(); + for &pred in preds(node).iter() { + let pred_rpo = match block_to_rpo[pred.index()] { + Some(r) => r, + None => { + // Skip unreachable preds. + continue; + } + }; + if pred_rpo < rponum { + parent = pred; + break; + } + } + + if parent.is_valid() { + for &pred in preds(node).iter() { + if pred == parent { + continue; + } + if idom[pred.index()].is_invalid() { + continue; + } + parent = merge_sets(&idom, &block_to_rpo[..], parent, pred); + } + } + + if parent.is_valid() && parent != idom[node.index()] { + idom[node.index()] = parent; + changed = true; + } + } + } + + // Now set the start node's dominator-tree parent to "invalid"; + // this allows the loop in `dominates` to terminate. + idom[start.index()] = Block::invalid(); + + idom +} + +pub fn dominates(idom: &[Block], a: Block, mut b: Block) -> bool { + loop { + if a == b { + return true; + } + if b.is_invalid() { + return false; + } + b = idom[b.index()]; + } +} diff --git a/src/fuzzing/func.rs b/src/fuzzing/func.rs new file mode 100644 index 00000000..6151a7c7 --- /dev/null +++ b/src/fuzzing/func.rs @@ -0,0 +1,603 @@ +/* + * Released under the terms of the Apache 2.0 license with LLVM + * exception. See `LICENSE` for details. 
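The dominator computation in `src/domtree.rs` above is compact enough that a worked example helps. A hypothetical unit-test-style sketch (it could live in a `#[cfg(test)]` module at the bottom of that file) that builds a diamond CFG, feeds `calculate` a hand-written postorder, and checks `dominates`:

#[cfg(test)]
mod test {
    use super::*;
    use crate::Block;

    #[test]
    fn diamond_dominators() {
        // Diamond CFG: 0 -> {1, 2}, 1 -> 3, 2 -> 3.
        let preds: Vec<Vec<Block>> = vec![
            vec![],
            vec![Block::new(0)],
            vec![Block::new(0)],
            vec![Block::new(1), Block::new(2)],
        ];
        // One valid postorder of this CFG starting at block 0.
        let post_ord = vec![Block::new(3), Block::new(1), Block::new(2), Block::new(0)];

        let idom = calculate(4, |b| &preds[b.index()][..], &post_ord[..], Block::new(0));

        // The entry's own idom is left invalid so `dominates` terminates.
        assert!(idom[0].is_invalid());
        // The join block is immediately dominated by the entry, not by
        // either side of the diamond.
        assert_eq!(idom[3], Block::new(0));
        assert!(dominates(&idom[..], Block::new(0), Block::new(3)));
        assert!(!dominates(&idom[..], Block::new(1), Block::new(3)));
        assert!(!dominates(&idom[..], Block::new(2), Block::new(3)));
    }
}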
+ */ + +use crate::{ + domtree, postorder, Allocation, Block, Function, Inst, InstRange, MachineEnv, Operand, + OperandConstraint, OperandKind, OperandPos, PReg, RegClass, VReg, +}; + +use arbitrary::Result as ArbitraryResult; +use arbitrary::{Arbitrary, Unstructured}; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum InstOpcode { + Phi, + Op, + Call, + Ret, + Branch, +} + +#[derive(Clone, Debug)] +pub struct InstData { + op: InstOpcode, + operands: Vec, + clobbers: Vec, + is_safepoint: bool, +} + +impl InstData { + pub fn op(def: usize, uses: &[usize]) -> InstData { + let mut operands = vec![Operand::reg_def(VReg::new(def, RegClass::Int))]; + for &u in uses { + operands.push(Operand::reg_use(VReg::new(u, RegClass::Int))); + } + InstData { + op: InstOpcode::Op, + operands, + clobbers: vec![], + is_safepoint: false, + } + } + pub fn branch(uses: &[usize]) -> InstData { + let mut operands = vec![]; + for &u in uses { + operands.push(Operand::reg_use(VReg::new(u, RegClass::Int))); + } + InstData { + op: InstOpcode::Branch, + operands, + clobbers: vec![], + is_safepoint: false, + } + } + pub fn ret() -> InstData { + InstData { + op: InstOpcode::Ret, + operands: vec![], + clobbers: vec![], + is_safepoint: false, + } + } +} + +#[derive(Clone)] +pub struct Func { + insts: Vec, + blocks: Vec, + block_preds: Vec>, + block_succs: Vec>, + block_params: Vec>, + num_vregs: usize, + reftype_vregs: Vec, +} + +impl Function for Func { + fn num_insts(&self) -> usize { + self.insts.len() + } + + fn num_blocks(&self) -> usize { + self.blocks.len() + } + + fn entry_block(&self) -> Block { + assert!(self.blocks.len() > 0); + Block::new(0) + } + + fn block_insns(&self, block: Block) -> InstRange { + self.blocks[block.index()] + } + + fn block_succs(&self, block: Block) -> &[Block] { + &self.block_succs[block.index()][..] + } + + fn block_preds(&self, block: Block) -> &[Block] { + &self.block_preds[block.index()][..] + } + + fn block_params(&self, block: Block) -> &[VReg] { + &self.block_params[block.index()][..] + } + + fn is_call(&self, insn: Inst) -> bool { + self.insts[insn.index()].op == InstOpcode::Call + } + + fn is_ret(&self, insn: Inst) -> bool { + self.insts[insn.index()].op == InstOpcode::Ret + } + + fn is_branch(&self, insn: Inst) -> bool { + self.insts[insn.index()].op == InstOpcode::Branch + } + + fn branch_blockparam_arg_offset(&self, _: Block, _: Inst) -> usize { + // Branch blockparam args always start at zero for this + // Function implementation. + 0 + } + + fn requires_refs_on_stack(&self, insn: Inst) -> bool { + self.insts[insn.index()].is_safepoint + } + + fn reftype_vregs(&self) -> &[VReg] { + &self.reftype_vregs[..] + } + + fn is_move(&self, _: Inst) -> Option<(Operand, Operand)> { + None + } + + fn inst_operands(&self, insn: Inst) -> &[Operand] { + &self.insts[insn.index()].operands[..] + } + + fn inst_clobbers(&self, insn: Inst) -> &[PReg] { + &self.insts[insn.index()].clobbers[..] 
+ } + + fn num_vregs(&self) -> usize { + self.num_vregs + } + + fn spillslot_size(&self, regclass: RegClass) -> usize { + match regclass { + RegClass::Int => 1, + RegClass::Float => 2, + } + } +} + +struct FuncBuilder { + postorder: Vec, + idom: Vec, + f: Func, + insts_per_block: Vec>, +} + +impl FuncBuilder { + fn new() -> Self { + FuncBuilder { + postorder: vec![], + idom: vec![], + f: Func { + block_preds: vec![], + block_succs: vec![], + block_params: vec![], + insts: vec![], + blocks: vec![], + num_vregs: 0, + reftype_vregs: vec![], + }, + insts_per_block: vec![], + } + } + + pub fn add_block(&mut self) -> Block { + let b = Block::new(self.f.blocks.len()); + self.f + .blocks + .push(InstRange::forward(Inst::new(0), Inst::new(0))); + self.f.block_preds.push(vec![]); + self.f.block_succs.push(vec![]); + self.f.block_params.push(vec![]); + self.insts_per_block.push(vec![]); + b + } + + pub fn add_inst(&mut self, block: Block, data: InstData) { + self.insts_per_block[block.index()].push(data); + } + + pub fn add_edge(&mut self, from: Block, to: Block) { + self.f.block_succs[from.index()].push(to); + self.f.block_preds[to.index()].push(from); + } + + pub fn set_block_params(&mut self, block: Block, params: &[VReg]) { + self.f.block_params[block.index()] = params.iter().cloned().collect(); + } + + fn compute_doms(&mut self) { + self.postorder = postorder::calculate(self.f.blocks.len(), Block::new(0), |block| { + &self.f.block_succs[block.index()][..] + }); + self.idom = domtree::calculate( + self.f.blocks.len(), + |block| &self.f.block_preds[block.index()][..], + &self.postorder[..], + Block::new(0), + ); + } + + fn finalize(mut self) -> Func { + for (blocknum, blockrange) in self.f.blocks.iter_mut().enumerate() { + let begin_inst = self.f.insts.len(); + for inst in &self.insts_per_block[blocknum] { + self.f.insts.push(inst.clone()); + } + let end_inst = self.f.insts.len(); + *blockrange = InstRange::forward(Inst::new(begin_inst), Inst::new(end_inst)); + } + + self.f + } +} + +impl Arbitrary for OperandConstraint { + fn arbitrary(u: &mut Unstructured) -> ArbitraryResult { + Ok(*u.choose(&[OperandConstraint::Any, OperandConstraint::Reg])?) + } +} + +fn choose_dominating_block( + idom: &[Block], + mut block: Block, + allow_self: bool, + u: &mut Unstructured, +) -> ArbitraryResult { + assert!(block.is_valid()); + let orig_block = block; + loop { + if (allow_self || block != orig_block) && bool::arbitrary(u)? { + break; + } + if idom[block.index()].is_invalid() { + break; + } + block = idom[block.index()]; + } + let block = if block != orig_block || allow_self { + block + } else { + Block::invalid() + }; + Ok(block) +} + +#[derive(Clone, Copy, Debug)] +pub struct Options { + pub reused_inputs: bool, + pub fixed_regs: bool, + pub clobbers: bool, + pub control_flow: bool, + pub reducible: bool, + pub block_params: bool, + pub always_local_uses: bool, + pub reftypes: bool, +} + +impl std::default::Default for Options { + fn default() -> Self { + Options { + reused_inputs: false, + fixed_regs: false, + clobbers: false, + control_flow: true, + reducible: false, + block_params: true, + always_local_uses: false, + reftypes: false, + } + } +} + +impl Arbitrary for Func { + fn arbitrary(u: &mut Unstructured) -> ArbitraryResult { + Func::arbitrary_with_options(u, &Options::default()) + } +} + +impl Func { + pub fn arbitrary_with_options(u: &mut Unstructured, opts: &Options) -> ArbitraryResult { + // General strategy: + // 1. Create an arbitrary CFG. + // 2. Create a list of vregs to define in each block. 
+ // 3. Define some of those vregs in each block as blockparams.f. + // 4. Populate blocks with ops that define the rest of the vregs. + // - For each use, choose an available vreg: either one + // already defined (via blockparam or inst) in this block, + // or one defined in a dominating block. + + let mut builder = FuncBuilder::new(); + for _ in 0..u.int_in_range(1..=100)? { + builder.add_block(); + } + let num_blocks = builder.f.blocks.len(); + + // Generate a CFG. Create a "spine" of either single blocks, + // with links to the next; or fork patterns, with the left + // fork linking to the next and the right fork in `out_blocks` + // to be connected below. This creates an arbitrary CFG with + // split critical edges, which is a property that we require + // for the regalloc. + let mut from = 0; + let mut out_blocks = vec![]; + let mut in_blocks = vec![]; + // For reducibility, if selected: enforce strict nesting of backedges + let mut max_backedge_src = 0; + let mut min_backedge_dest = num_blocks; + while from < num_blocks { + in_blocks.push(from); + if num_blocks > 3 && from < num_blocks - 3 && bool::arbitrary(u)? && opts.control_flow { + // To avoid critical edges, we use from+1 as an edge + // block, and advance `from` an extra block; `from+2` + // will be the next normal iteration. + builder.add_edge(Block::new(from), Block::new(from + 1)); + builder.add_edge(Block::new(from), Block::new(from + 2)); + builder.add_edge(Block::new(from + 2), Block::new(from + 3)); + out_blocks.push(from + 1); + from += 2; + } else if from < num_blocks - 1 { + builder.add_edge(Block::new(from), Block::new(from + 1)); + } + from += 1; + } + for pred in out_blocks { + let mut succ = *u.choose(&in_blocks[..])?; + if opts.reducible && (pred >= succ) { + if pred < max_backedge_src || succ > min_backedge_dest { + // If the chosen edge would result in an + // irreducible CFG, just make this a diamond + // instead. + succ = pred + 2; + } else { + max_backedge_src = pred; + min_backedge_dest = succ; + } + } + builder.add_edge(Block::new(pred), Block::new(succ)); + } + + builder.compute_doms(); + + for block in 0..num_blocks { + builder.f.block_preds[block].clear(); + } + for block in 0..num_blocks { + for &succ in &builder.f.block_succs[block] { + builder.f.block_preds[succ.index()].push(Block::new(block)); + } + } + + builder.compute_doms(); + + let mut vregs_by_block = vec![]; + let mut vregs_by_block_to_be_defined = vec![]; + let mut block_params = vec![vec![]; num_blocks]; + for block in 0..num_blocks { + let mut vregs = vec![]; + for _ in 0..u.int_in_range(5..=15)? { + let vreg = VReg::new(builder.f.num_vregs, RegClass::Int); + builder.f.num_vregs += 1; + vregs.push(vreg); + if opts.reftypes && bool::arbitrary(u)? { + builder.f.reftype_vregs.push(vreg); + } + } + vregs_by_block.push(vregs.clone()); + vregs_by_block_to_be_defined.push(vec![]); + let mut max_block_params = u.int_in_range(0..=std::cmp::min(3, vregs.len() / 3))?; + for &vreg in &vregs { + if block > 0 && opts.block_params && bool::arbitrary(u)? 
&& max_block_params > 0 { + block_params[block].push(vreg); + max_block_params -= 1; + } else { + vregs_by_block_to_be_defined.last_mut().unwrap().push(vreg); + } + } + vregs_by_block_to_be_defined.last_mut().unwrap().reverse(); + builder.set_block_params(Block::new(block), &block_params[block][..]); + } + + for block in 0..num_blocks { + let mut avail = block_params[block].clone(); + let mut remaining_nonlocal_uses = u.int_in_range(0..=3)?; + while let Some(vreg) = vregs_by_block_to_be_defined[block].pop() { + let def_constraint = OperandConstraint::arbitrary(u)?; + let def_pos = if bool::arbitrary(u)? { + OperandPos::Early + } else { + OperandPos::Late + }; + let mut operands = vec![Operand::new( + vreg, + def_constraint, + OperandKind::Def, + def_pos, + )]; + let mut allocations = vec![Allocation::none()]; + for _ in 0..u.int_in_range(0..=3)? { + let vreg = if avail.len() > 0 + && (opts.always_local_uses + || remaining_nonlocal_uses == 0 + || bool::arbitrary(u)?) + { + *u.choose(&avail[..])? + } else if !opts.always_local_uses { + let def_block = choose_dominating_block( + &builder.idom[..], + Block::new(block), + /* allow_self = */ false, + u, + )?; + if !def_block.is_valid() { + // No vregs already defined, and no pred blocks that dominate us + // (perhaps we are the entry block): just stop generating inputs. + break; + } + remaining_nonlocal_uses -= 1; + *u.choose(&vregs_by_block[def_block.index()])? + } else { + break; + }; + let use_constraint = OperandConstraint::arbitrary(u)?; + operands.push(Operand::new( + vreg, + use_constraint, + OperandKind::Use, + OperandPos::Early, + )); + allocations.push(Allocation::none()); + } + let mut clobbers: Vec = vec![]; + if operands.len() > 1 && opts.reused_inputs && bool::arbitrary(u)? { + // Make the def a reused input. + let op = operands[0]; + assert_eq!(op.kind(), OperandKind::Def); + let reused = u.int_in_range(1..=(operands.len() - 1))?; + operands[0] = Operand::new( + op.vreg(), + OperandConstraint::Reuse(reused), + op.kind(), + OperandPos::Late, + ); + // Make sure reused input is a Reg. + let op = operands[reused]; + operands[reused] = Operand::new( + op.vreg(), + OperandConstraint::Reg, + op.kind(), + OperandPos::Early, + ); + } else if opts.fixed_regs && bool::arbitrary(u)? { + let mut fixed = vec![]; + for _ in 0..u.int_in_range(0..=operands.len() - 1)? { + // Pick an operand and make it a fixed reg. + let fixed_reg = PReg::new(u.int_in_range(0..=30)?, RegClass::Int); + if fixed.contains(&fixed_reg) { + break; + } + fixed.push(fixed_reg); + let i = u.int_in_range(0..=(operands.len() - 1))?; + let op = operands[i]; + operands[i] = Operand::new( + op.vreg(), + OperandConstraint::FixedReg(fixed_reg), + op.kind(), + op.pos(), + ); + } + } else if opts.clobbers && bool::arbitrary(u)? { + for _ in 0..u.int_in_range(0..=5)? { + let reg = u.int_in_range(0..=30)?; + if clobbers.iter().any(|r| r.hw_enc() == reg) { + break; + } + clobbers.push(PReg::new(reg, RegClass::Int)); + } + } + + let is_safepoint = opts.reftypes + && operands + .iter() + .all(|op| !builder.f.reftype_vregs.contains(&op.vreg())) + && bool::arbitrary(u)?; + + let op = *u.choose(&[InstOpcode::Op, InstOpcode::Call])?; + builder.add_inst( + Block::new(block), + InstData { + op, + operands, + clobbers, + is_safepoint, + }, + ); + avail.push(vreg); + } + + // Define the branch with blockparam args that must end + // the block. 
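The operand encodings the generator above exercises (a late def that reuses an input, register-only early uses, and uses pinned to fixed physical registers) can be sketched in isolation using only constructors this file already calls; a small hypothetical test:

#[test]
fn operand_constraint_sketch() {
    use crate::{Operand, OperandConstraint, OperandKind, OperandPos, PReg, RegClass, VReg};

    let v0 = VReg::new(0, RegClass::Int);
    let v1 = VReg::new(1, RegClass::Int);

    // A late def that must reuse the allocation of input operand #1.
    let def = Operand::new(v0, OperandConstraint::Reuse(1), OperandKind::Def, OperandPos::Late);
    // The reused input itself must live in a register.
    let use_reg = Operand::new(v1, OperandConstraint::Reg, OperandKind::Use, OperandPos::Early);
    // A use pinned to a specific physical register.
    let use_fixed = Operand::new(
        v1,
        OperandConstraint::FixedReg(PReg::new(3, RegClass::Int)),
        OperandKind::Use,
        OperandPos::Early,
    );

    assert_eq!(def.kind(), OperandKind::Def);
    assert!(matches!(def.constraint(), OperandConstraint::Reuse(1)));
    assert!(matches!(use_reg.constraint(), OperandConstraint::Reg));
    assert!(matches!(use_fixed.constraint(), OperandConstraint::FixedReg(_)));
}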
+ if builder.f.block_succs[block].len() > 0 { + let mut args = vec![]; + for &succ in &builder.f.block_succs[block] { + for _ in 0..builder.f.block_params[succ.index()].len() { + let dom_block = choose_dominating_block( + &builder.idom[..], + Block::new(block), + false, + u, + )?; + let vreg = if dom_block.is_valid() && bool::arbitrary(u)? { + u.choose(&vregs_by_block[dom_block.index()][..])? + } else { + u.choose(&avail[..])? + }; + args.push(vreg.vreg()); + } + } + builder.add_inst(Block::new(block), InstData::branch(&args[..])); + } else { + builder.add_inst(Block::new(block), InstData::ret()); + } + } + + Ok(builder.finalize()) + } +} + +impl std::fmt::Debug for Func { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{{\n")?; + for vreg in self.reftype_vregs() { + write!(f, " REF: {}\n", vreg)?; + } + for (i, blockrange) in self.blocks.iter().enumerate() { + let succs = self.block_succs[i] + .iter() + .map(|b| b.index()) + .collect::>(); + let preds = self.block_preds[i] + .iter() + .map(|b| b.index()) + .collect::>(); + let params = self.block_params[i] + .iter() + .map(|v| format!("v{}", v.vreg())) + .collect::>() + .join(", "); + write!( + f, + " block{}({}): # succs:{:?} preds:{:?}\n", + i, params, succs, preds + )?; + for inst in blockrange.iter() { + if self.requires_refs_on_stack(inst) { + write!(f, " -- SAFEPOINT --\n")?; + } + write!( + f, + " inst{}: {:?} ops:{:?} clobber:{:?}\n", + inst.index(), + self.insts[inst.index()].op, + self.insts[inst.index()].operands, + self.insts[inst.index()].clobbers + )?; + } + } + write!(f, "}}\n")?; + Ok(()) + } +} + +pub fn machine_env() -> MachineEnv { + // Reg 31 is the scratch reg. + let regs: Vec = (0..31).map(|i| PReg::new(i, RegClass::Int)).collect(); + let preferred_regs_by_class: [Vec; 2] = [regs.iter().cloned().take(24).collect(), vec![]]; + let non_preferred_regs_by_class: [Vec; 2] = + [regs.iter().cloned().skip(24).collect(), vec![]]; + let scratch_by_class: [PReg; 2] = [PReg::new(31, RegClass::Int), PReg::new(0, RegClass::Float)]; + MachineEnv { + regs, + preferred_regs_by_class, + non_preferred_regs_by_class, + scratch_by_class, + } +} diff --git a/src/fuzzing/mod.rs b/src/fuzzing/mod.rs new file mode 100644 index 00000000..e6a225e6 --- /dev/null +++ b/src/fuzzing/mod.rs @@ -0,0 +1,32 @@ +/* + * Released under the terms of the Apache 2.0 license with LLVM + * exception. See `LICENSE` for details. + */ + +//! Utilities for fuzzing. + +pub mod func; + +// Re-exports for fuzz targets. + +pub mod domtree { + pub use crate::domtree::*; +} +pub mod postorder { + pub use crate::postorder::*; +} +pub mod moves { + pub use crate::moves::*; +} +pub mod cfg { + pub use crate::cfg::*; +} +pub mod ssa { + pub use crate::ssa::*; +} +pub mod ion { + pub use crate::ion::*; +} +pub mod checker { + pub use crate::checker::*; +} diff --git a/src/index.rs b/src/index.rs new file mode 100644 index 00000000..21dd9766 --- /dev/null +++ b/src/index.rs @@ -0,0 +1,181 @@ +#[macro_export] +macro_rules! 
define_index { + ($ix:ident) => { + #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] + pub struct $ix(pub u32); + impl $ix { + #[inline(always)] + pub fn new(i: usize) -> Self { + Self(i as u32) + } + #[inline(always)] + pub fn index(self) -> usize { + assert!(self.is_valid()); + self.0 as usize + } + #[inline(always)] + pub fn invalid() -> Self { + Self(u32::MAX) + } + #[inline(always)] + pub fn is_invalid(self) -> bool { + self == Self::invalid() + } + #[inline(always)] + pub fn is_valid(self) -> bool { + self != Self::invalid() + } + #[inline(always)] + pub fn next(self) -> $ix { + assert!(self.is_valid()); + Self(self.0 + 1) + } + #[inline(always)] + pub fn prev(self) -> $ix { + assert!(self.is_valid()); + Self(self.0 - 1) + } + + #[inline(always)] + pub fn raw_u32(self) -> u32 { + self.0 + } + } + + impl crate::index::ContainerIndex for $ix {} + }; +} + +pub trait ContainerIndex: Clone + Copy + std::fmt::Debug + PartialEq + Eq {} + +pub trait ContainerComparator { + type Ix: ContainerIndex; + fn compare(&self, a: Self::Ix, b: Self::Ix) -> std::cmp::Ordering; +} + +define_index!(Inst); +define_index!(Block); + +#[derive(Clone, Copy, Debug)] +pub struct InstRange(Inst, Inst, bool); + +impl InstRange { + #[inline(always)] + pub fn forward(from: Inst, to: Inst) -> Self { + assert!(from.index() <= to.index()); + InstRange(from, to, true) + } + + #[inline(always)] + pub fn backward(from: Inst, to: Inst) -> Self { + assert!(from.index() >= to.index()); + InstRange(to, from, false) + } + + #[inline(always)] + pub fn first(self) -> Inst { + assert!(self.len() > 0); + if self.is_forward() { + self.0 + } else { + self.1.prev() + } + } + + #[inline(always)] + pub fn last(self) -> Inst { + assert!(self.len() > 0); + if self.is_forward() { + self.1.prev() + } else { + self.0 + } + } + + #[inline(always)] + pub fn rest(self) -> InstRange { + assert!(self.len() > 0); + if self.is_forward() { + InstRange::forward(self.0.next(), self.1) + } else { + InstRange::backward(self.1.prev(), self.0) + } + } + + #[inline(always)] + pub fn len(self) -> usize { + self.1.index() - self.0.index() + } + + #[inline(always)] + pub fn is_forward(self) -> bool { + self.2 + } + + #[inline(always)] + pub fn rev(self) -> Self { + Self(self.0, self.1, !self.2) + } + + #[inline(always)] + pub fn iter(self) -> InstRangeIter { + InstRangeIter(self) + } +} + +#[derive(Clone, Copy, Debug)] +pub struct InstRangeIter(InstRange); + +impl Iterator for InstRangeIter { + type Item = Inst; + #[inline(always)] + fn next(&mut self) -> Option { + if self.0.len() == 0 { + None + } else { + let ret = self.0.first(); + self.0 = self.0.rest(); + Some(ret) + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_inst_range() { + let range = InstRange::forward(Inst::new(0), Inst::new(0)); + assert_eq!(range.len(), 0); + + let range = InstRange::forward(Inst::new(0), Inst::new(5)); + assert_eq!(range.first().index(), 0); + assert_eq!(range.last().index(), 4); + assert_eq!(range.len(), 5); + assert_eq!( + range.iter().collect::>(), + vec![ + Inst::new(0), + Inst::new(1), + Inst::new(2), + Inst::new(3), + Inst::new(4) + ] + ); + let range = range.rev(); + assert_eq!(range.first().index(), 4); + assert_eq!(range.last().index(), 0); + assert_eq!(range.len(), 5); + assert_eq!( + range.iter().collect::>(), + vec![ + Inst::new(4), + Inst::new(3), + Inst::new(2), + Inst::new(1), + Inst::new(0) + ] + ); + } +} diff --git a/src/indexset.rs b/src/indexset.rs new file mode 100644 index 00000000..35d90ddc --- /dev/null 
+++ b/src/indexset.rs @@ -0,0 +1,367 @@ +/* + * Released under the terms of the Apache 2.0 license with LLVM + * exception. See `LICENSE` for details. + */ + +//! Index sets: sets of integers that represent indices into a space. + +use fxhash::FxHashMap; +use std::cell::Cell; + +const SMALL_ELEMS: usize = 12; + +/// A hybrid large/small-mode sparse mapping from integer indices to +/// elements. +/// +/// The trailing `(u32, u64)` elements in each variant is a one-item +/// cache to allow fast access when streaming through. +#[derive(Clone, Debug)] +enum AdaptiveMap { + Small { + len: u32, + keys: [u32; SMALL_ELEMS], + values: [u64; SMALL_ELEMS], + }, + Large(FxHashMap), +} + +const INVALID: u32 = 0xffff_ffff; + +impl AdaptiveMap { + fn new() -> Self { + Self::Small { + len: 0, + keys: [INVALID; SMALL_ELEMS], + values: [0; SMALL_ELEMS], + } + } + + /// Expand into `Large` mode if we are at capacity and have no + /// zero-value pairs that can be trimmed. + #[inline(never)] + fn expand(&mut self) { + match self { + &mut Self::Small { + ref mut len, + ref mut keys, + ref mut values, + } => { + // Note: we *may* remain as `Small` if there are any + // zero elements. Try removing them first, before we + // commit to a memory allocation. + if values.iter().any(|v| *v == 0) { + let mut out = 0; + for i in 0..(*len as usize) { + if values[i] == 0 { + continue; + } + if out < i { + keys[out] = keys[i]; + values[out] = values[i]; + } + out += 1; + } + *len = out as u32; + } else { + let mut map = FxHashMap::default(); + for i in 0..(*len as usize) { + map.insert(keys[i], values[i]); + } + *self = Self::Large(map); + } + } + _ => {} + } + } + #[inline(always)] + fn get_or_insert<'a>(&'a mut self, key: u32) -> &'a mut u64 { + // Check whether the key is present and we are in small mode; + // if no to both, we need to expand first. + let (needs_expand, small_mode_idx) = match self { + &mut Self::Small { len, ref keys, .. } => { + // Perform this scan but do not return right away; + // doing so runs into overlapping-borrow issues + // because the current non-lexical lifetimes + // implementation is not able to see that the `self` + // mutable borrow on return is only on the + // early-return path. + let small_mode_idx = keys.iter().take(len as usize).position(|k| *k == key); + let needs_expand = small_mode_idx.is_none() && len == SMALL_ELEMS as u32; + (needs_expand, small_mode_idx) + } + _ => (false, None), + }; + + if needs_expand { + assert!(small_mode_idx.is_none()); + self.expand(); + } + + match self { + &mut Self::Small { + ref mut len, + ref mut keys, + ref mut values, + } => { + // If we found the key already while checking whether + // we need to expand above, use that index to return + // early. + if let Some(i) = small_mode_idx { + return &mut values[i]; + } + // Otherwise, the key must not be present; add a new + // entry. 
+ assert!(*len < SMALL_ELEMS as u32); + let idx = *len; + *len += 1; + keys[idx as usize] = key; + values[idx as usize] = 0; + &mut values[idx as usize] + } + &mut Self::Large(ref mut map) => map.entry(key).or_insert(0), + } + } + #[inline(always)] + fn get_mut(&mut self, key: u32) -> Option<&mut u64> { + match self { + &mut Self::Small { + len, + ref keys, + ref mut values, + } => { + for i in 0..len { + if keys[i as usize] == key { + return Some(&mut values[i as usize]); + } + } + None + } + &mut Self::Large(ref mut map) => map.get_mut(&key), + } + } + #[inline(always)] + fn get(&self, key: u32) -> Option { + match self { + &Self::Small { + len, + ref keys, + ref values, + } => { + for i in 0..len { + if keys[i as usize] == key { + let value = values[i as usize]; + return Some(value); + } + } + None + } + &Self::Large(ref map) => { + let value = map.get(&key).cloned(); + value + } + } + } + fn iter<'a>(&'a self) -> AdaptiveMapIter<'a> { + match self { + &Self::Small { + len, + ref keys, + ref values, + } => AdaptiveMapIter::Small(&keys[0..len as usize], &values[0..len as usize]), + &Self::Large(ref map) => AdaptiveMapIter::Large(map.iter()), + } + } +} + +enum AdaptiveMapIter<'a> { + Small(&'a [u32], &'a [u64]), + Large(std::collections::hash_map::Iter<'a, u32, u64>), +} + +impl<'a> std::iter::Iterator for AdaptiveMapIter<'a> { + type Item = (u32, u64); + fn next(&mut self) -> Option { + match self { + &mut Self::Small(ref mut keys, ref mut values) => { + if keys.is_empty() { + None + } else { + let (k, v) = ((*keys)[0], (*values)[0]); + *keys = &(*keys)[1..]; + *values = &(*values)[1..]; + Some((k, v)) + } + } + &mut Self::Large(ref mut it) => it.next().map(|(&k, &v)| (k, v)), + } + } +} + +/// A conceptually infinite-length set of indices that allows union +/// and efficient iteration over elements. 
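The `IndexSet` defined next layers a bitset on top of this adaptive map: index `i` is bit `i % 64` of the 64-bit word stored under key `i / 64`. A standalone sketch of that packing and of the set-bit enumeration trick, using a plain `HashMap` rather than the crate's types:

use std::collections::HashMap;

fn set_bit(words: &mut HashMap<u32, u64>, idx: usize) {
    *words.entry((idx / 64) as u32).or_insert(0) |= 1u64 << (idx % 64);
}

fn get_bit(words: &HashMap<u32, u64>, idx: usize) -> bool {
    words
        .get(&((idx / 64) as u32))
        .map_or(false, |w| w & (1u64 << (idx % 64)) != 0)
}

// Enumerate the set bits of one word by repeatedly taking the lowest set
// bit (`trailing_zeros`) and clearing it (`bits &= bits - 1`), the same
// trick the `SetBitsIter` below relies on.
fn word_indices(word_key: u32, mut bits: u64) -> Vec<usize> {
    let mut out = vec![];
    while bits != 0 {
        let bit = bits.trailing_zeros() as usize;
        bits &= bits - 1; // clear the lowest set bit
        out.push(word_key as usize * 64 + bit);
    }
    out
}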
+#[derive(Clone)] +pub struct IndexSet { + elems: AdaptiveMap, + cache: Cell<(u32, u64)>, +} + +const BITS_PER_WORD: usize = 64; + +impl IndexSet { + pub fn new() -> Self { + Self { + elems: AdaptiveMap::new(), + cache: Cell::new((INVALID, 0)), + } + } + + #[inline(always)] + fn elem(&mut self, bit_index: usize) -> &mut u64 { + let word_index = (bit_index / BITS_PER_WORD) as u32; + if self.cache.get().0 == word_index { + self.cache.set((INVALID, 0)); + } + self.elems.get_or_insert(word_index) + } + + #[inline(always)] + fn maybe_elem_mut(&mut self, bit_index: usize) -> Option<&mut u64> { + let word_index = (bit_index / BITS_PER_WORD) as u32; + if self.cache.get().0 == word_index { + self.cache.set((INVALID, 0)); + } + self.elems.get_mut(word_index) + } + + #[inline(always)] + fn maybe_elem(&self, bit_index: usize) -> Option { + let word_index = (bit_index / BITS_PER_WORD) as u32; + if self.cache.get().0 == word_index { + Some(self.cache.get().1) + } else { + self.elems.get(word_index) + } + } + + #[inline(always)] + pub fn set(&mut self, idx: usize, val: bool) { + let bit = idx % BITS_PER_WORD; + if val { + *self.elem(idx) |= 1 << bit; + } else if let Some(word) = self.maybe_elem_mut(idx) { + *word &= !(1 << bit); + } + } + + pub fn assign(&mut self, other: &Self) { + self.elems = other.elems.clone(); + self.cache = other.cache.clone(); + } + + #[inline(always)] + pub fn get(&self, idx: usize) -> bool { + let bit = idx % BITS_PER_WORD; + if let Some(word) = self.maybe_elem(idx) { + (word & (1 << bit)) != 0 + } else { + false + } + } + + pub fn union_with(&mut self, other: &Self) -> bool { + let mut changed = 0; + for (word_idx, bits) in other.elems.iter() { + if bits == 0 { + continue; + } + let word_idx = word_idx as usize; + let self_word = self.elem(word_idx * BITS_PER_WORD); + changed |= bits & !*self_word; + *self_word |= bits; + } + changed != 0 + } + + pub fn iter<'a>(&'a self) -> impl Iterator + 'a { + self.elems.iter().flat_map(|(word_idx, bits)| { + let word_idx = word_idx as usize; + set_bits(bits).map(move |i| BITS_PER_WORD * word_idx + i) + }) + } + + /// Is the adaptive data structure in "small" mode? This is meant + /// for testing assertions only. + pub(crate) fn is_small(&self) -> bool { + match &self.elems { + &AdaptiveMap::Small { .. } => true, + _ => false, + } + } +} + +fn set_bits(bits: u64) -> impl Iterator { + let iter = SetBitsIter(bits); + iter +} + +pub struct SetBitsIter(u64); + +impl Iterator for SetBitsIter { + type Item = usize; + fn next(&mut self) -> Option { + // Build an `Option` so that on the nonzero path, + // the compiler can optimize the trailing-zeroes operator + // using that knowledge. + std::num::NonZeroU64::new(self.0).map(|nz| { + let bitidx = nz.trailing_zeros(); + self.0 &= self.0 - 1; // clear highest set bit + bitidx as usize + }) + } +} + +impl std::fmt::Debug for IndexSet { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let vals = self.iter().collect::>(); + write!(f, "{:?}", vals) + } +} + +#[cfg(test)] +mod test { + use super::IndexSet; + + #[test] + fn test_set_bits_iter() { + let mut vec = IndexSet::new(); + let mut sum = 0; + for i in 0..1024 { + if i % 17 == 0 { + vec.set(i, true); + sum += i; + } + } + + let mut checksum = 0; + for bit in vec.iter() { + assert!(bit % 17 == 0); + checksum += bit; + } + + assert_eq!(sum, checksum); + } + + #[test] + fn test_expand_remove_zero_elems() { + let mut vec = IndexSet::new(); + // Set 12 different words (this is the max small-mode size). 
+ for i in 0..12 { + vec.set(64 * i, true); + } + // Now clear a bit, and set a bit in a different word. We + // should still be in small mode. + vec.set(64 * 5, false); + vec.set(64 * 100, true); + assert!(vec.is_small()); + } +} diff --git a/src/ion/LICENSE b/src/ion/LICENSE new file mode 100644 index 00000000..14e2f777 --- /dev/null +++ b/src/ion/LICENSE @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. 
Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. 
However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. 
* +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/src/ion/data_structures.rs b/src/ion/data_structures.rs new file mode 100644 index 00000000..dbf1d777 --- /dev/null +++ b/src/ion/data_structures.rs @@ -0,0 +1,535 @@ +/* + * The following license applies to this file, which was initially + * derived from the files `js/src/jit/BacktrackingAllocator.h` and + * `js/src/jit/BacktrackingAllocator.cpp` in Mozilla Firefox: + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Since the initial port, the design has been substantially evolved + * and optimized. + */ + +//! Data structures for backtracking allocator. + +use super::liveranges::SpillWeight; +use crate::cfg::CFGInfo; +use crate::index::ContainerComparator; +use crate::indexset::IndexSet; +use crate::{ + define_index, Allocation, Block, Edit, Function, Inst, MachineEnv, Operand, PReg, ProgPoint, + RegClass, SpillSlot, VReg, +}; +use smallvec::SmallVec; +use std::cmp::Ordering; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::fmt::Debug; + +/// A range from `from` (inclusive) to `to` (exclusive). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct CodeRange { + pub from: ProgPoint, + pub to: ProgPoint, +} + +impl CodeRange { + #[inline(always)] + pub fn is_empty(&self) -> bool { + self.from == self.to + } + #[inline(always)] + pub fn contains(&self, other: &Self) -> bool { + other.from >= self.from && other.to <= self.to + } + #[inline(always)] + pub fn contains_point(&self, other: ProgPoint) -> bool { + other >= self.from && other < self.to + } + #[inline(always)] + pub fn overlaps(&self, other: &Self) -> bool { + other.to > self.from && other.from < self.to + } + #[inline(always)] + pub fn len(&self) -> usize { + self.to.inst().index() - self.from.inst().index() + } +} + +impl std::cmp::PartialOrd for CodeRange { + #[inline(always)] + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} +impl std::cmp::Ord for CodeRange { + #[inline(always)] + fn cmp(&self, other: &Self) -> Ordering { + if self.to <= other.from { + Ordering::Less + } else if self.from >= other.to { + Ordering::Greater + } else { + Ordering::Equal + } + } +} + +define_index!(LiveBundleIndex); +define_index!(LiveRangeIndex); +define_index!(SpillSetIndex); +define_index!(UseIndex); +define_index!(VRegIndex); +define_index!(PRegIndex); +define_index!(SpillSlotIndex); + +/// Used to carry small sets of bundles, e.g. for conflict sets. 
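+/// The inline capacity of 4 below keeps these typically-small sets free of
+/// heap allocation; larger sets simply spill over to the heap.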
+pub type LiveBundleVec = SmallVec<[LiveBundleIndex; 4]>; + +#[derive(Clone, Copy, Debug)] +pub struct LiveRangeListEntry { + pub range: CodeRange, + pub index: LiveRangeIndex, +} + +pub type LiveRangeList = SmallVec<[LiveRangeListEntry; 4]>; +pub type UseList = SmallVec<[Use; 2]>; + +#[derive(Clone, Debug)] +pub struct LiveRange { + pub range: CodeRange, + + pub vreg: VRegIndex, + pub bundle: LiveBundleIndex, + pub uses_spill_weight_and_flags: u32, + + pub uses: UseList, + + pub merged_into: LiveRangeIndex, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[repr(u32)] +pub enum LiveRangeFlag { + StartsAtDef = 1, +} + +impl LiveRange { + #[inline(always)] + pub fn set_flag(&mut self, flag: LiveRangeFlag) { + self.uses_spill_weight_and_flags |= (flag as u32) << 29; + } + #[inline(always)] + pub fn clear_flag(&mut self, flag: LiveRangeFlag) { + self.uses_spill_weight_and_flags &= !((flag as u32) << 29); + } + #[inline(always)] + pub fn assign_flag(&mut self, flag: LiveRangeFlag, val: bool) { + let bit = if val { (flag as u32) << 29 } else { 0 }; + self.uses_spill_weight_and_flags &= 0xe000_0000; + self.uses_spill_weight_and_flags |= bit; + } + #[inline(always)] + pub fn has_flag(&self, flag: LiveRangeFlag) -> bool { + self.uses_spill_weight_and_flags & ((flag as u32) << 29) != 0 + } + #[inline(always)] + pub fn flag_word(&self) -> u32 { + self.uses_spill_weight_and_flags & 0xe000_0000 + } + #[inline(always)] + pub fn merge_flags(&mut self, flag_word: u32) { + self.uses_spill_weight_and_flags |= flag_word; + } + #[inline(always)] + pub fn uses_spill_weight(&self) -> SpillWeight { + let bits = (self.uses_spill_weight_and_flags & 0x1fff_ffff) << 2; + SpillWeight::from_f32(f32::from_bits(bits)) + } + #[inline(always)] + pub fn set_uses_spill_weight(&mut self, weight: SpillWeight) { + let weight_bits = (weight.to_f32().to_bits() >> 2) & 0x1fff_ffff; + self.uses_spill_weight_and_flags = + (self.uses_spill_weight_and_flags & 0xe000_0000) | weight_bits; + } +} + +#[derive(Clone, Copy, Debug)] +pub struct Use { + pub operand: Operand, + pub pos: ProgPoint, + pub slot: u8, + pub weight: u16, +} + +impl Use { + #[inline(always)] + pub fn new(operand: Operand, pos: ProgPoint, slot: u8) -> Self { + Self { + operand, + pos, + slot, + // Weight is updated on insertion into LR. 
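+ // (`insert_use_into_liverange` computes it from the operand constraint
+ // and loop depth and stores the packed `SpillWeight::to_bits()` form here.)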
+ weight: 0, + } + } +} + +pub const SLOT_NONE: u8 = u8::MAX; + +#[derive(Clone, Debug)] +pub struct LiveBundle { + pub ranges: LiveRangeList, + pub spillset: SpillSetIndex, + pub allocation: Allocation, + pub prio: u32, // recomputed after every bulk update + pub spill_weight_and_props: u32, +} + +impl LiveBundle { + #[inline(always)] + pub fn set_cached_spill_weight_and_props( + &mut self, + spill_weight: u32, + minimal: bool, + fixed: bool, + stack: bool, + ) { + debug_assert!(spill_weight < ((1 << 29) - 1)); + self.spill_weight_and_props = spill_weight + | (if minimal { 1 << 31 } else { 0 }) + | (if fixed { 1 << 30 } else { 0 }) + | (if stack { 1 << 29 } else { 0 }); + } + + #[inline(always)] + pub fn cached_minimal(&self) -> bool { + self.spill_weight_and_props & (1 << 31) != 0 + } + + #[inline(always)] + pub fn cached_fixed(&self) -> bool { + self.spill_weight_and_props & (1 << 30) != 0 + } + + #[inline(always)] + pub fn cached_stack(&self) -> bool { + self.spill_weight_and_props & (1 << 29) != 0 + } + + #[inline(always)] + pub fn set_cached_fixed(&mut self) { + self.spill_weight_and_props |= 1 << 30; + } + + #[inline(always)] + pub fn set_cached_stack(&mut self) { + self.spill_weight_and_props |= 1 << 29; + } + + #[inline(always)] + pub fn cached_spill_weight(&self) -> u32 { + self.spill_weight_and_props & ((1 << 29) - 1) + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct BundleProperties { + pub minimal: bool, + pub fixed: bool, +} + +#[derive(Clone, Debug)] +pub struct SpillSet { + pub vregs: SmallVec<[VRegIndex; 2]>, + pub slot: SpillSlotIndex, + pub reg_hint: PReg, + pub class: RegClass, + pub spill_bundle: LiveBundleIndex, + pub required: bool, + pub size: u8, +} + +#[derive(Clone, Debug)] +pub struct VRegData { + pub ranges: LiveRangeList, + pub blockparam: Block, + pub is_ref: bool, + pub is_pinned: bool, +} + +#[derive(Clone, Debug)] +pub struct PRegData { + pub reg: PReg, + pub allocations: LiveRangeSet, +} + +#[derive(Clone, Debug)] +pub struct Env<'a, F: Function> { + pub func: &'a F, + pub env: &'a MachineEnv, + pub cfginfo: CFGInfo, + pub liveins: Vec, + pub liveouts: Vec, + /// Blockparam outputs: from-vreg, (end of) from-block, (start of) + /// to-block, to-vreg. The field order is significant: these are sorted so + /// that a scan over vregs, then blocks in each range, can scan in + /// order through this (sorted) list and add allocs to the + /// half-move list. + pub blockparam_outs: Vec<(VRegIndex, Block, Block, VRegIndex)>, + /// Blockparam inputs: to-vreg, (start of) to-block, (end of) + /// from-block. As above for `blockparam_outs`, field order is + /// significant. + pub blockparam_ins: Vec<(VRegIndex, Block, Block)>, + /// Blockparam allocs: block, idx, vreg, alloc. Info to describe + /// blockparam locations at block entry, for metadata purposes + /// (e.g. for the checker). + pub blockparam_allocs: Vec<(Block, u32, VRegIndex, Allocation)>, + + pub ranges: Vec, + pub bundles: Vec, + pub spillsets: Vec, + pub vregs: Vec, + pub vreg_regs: Vec, + pub pregs: Vec, + pub allocation_queue: PrioQueue, + pub clobbers: Vec, // Sorted list of insts with clobbers. + pub safepoints: Vec, // Sorted list of safepoint insts. + pub safepoints_per_vreg: HashMap>, + + pub spilled_bundles: Vec, + pub spillslots: Vec, + pub slots_by_size: Vec, + + pub extra_spillslot: Vec>, + + // Program moves: these are moves in the provided program that we + // handle with our internal machinery, in order to avoid the + // overhead of ordinary operand processing. 
We expect the client + // to not generate any code for instructions that return + // `Some(..)` for `.is_move()`, and instead use the edits that we + // provide to implement those moves (or some simplified version of + // them) post-regalloc. + // + // (from-vreg, inst, from-alloc), sorted by (from-vreg, inst) + pub prog_move_srcs: Vec<((VRegIndex, Inst), Allocation)>, + // (to-vreg, inst, to-alloc), sorted by (to-vreg, inst) + pub prog_move_dsts: Vec<((VRegIndex, Inst), Allocation)>, + // (from-vreg, to-vreg) for bundle-merging. + pub prog_move_merges: Vec<(LiveRangeIndex, LiveRangeIndex)>, + + // When multiple fixed-register constraints are present on a + // single VReg at a single program point (this can happen for, + // e.g., call args that use the same value multiple times), we + // remove all but one of the fixed-register constraints, make a + // note here, and add a clobber with that PReg instread to keep + // the register available. When we produce the final edit-list, we + // will insert a copy from wherever the VReg's primary allocation + // was to the approprate PReg. + // + // (progpoint, copy-from-preg, copy-to-preg, to-slot) + pub multi_fixed_reg_fixups: Vec<(ProgPoint, PRegIndex, PRegIndex, usize)>, + + pub inserted_moves: Vec, + + // Output: + pub edits: Vec<(u32, InsertMovePrio, Edit)>, + pub allocs: Vec, + pub inst_alloc_offsets: Vec, + pub num_spillslots: u32, + pub safepoint_slots: Vec<(ProgPoint, SpillSlot)>, + + pub allocated_bundle_count: usize, + + pub stats: Stats, + + // For debug output only: a list of textual annotations at every + // ProgPoint to insert into the final allocated program listing. + pub debug_annotations: std::collections::HashMap>, + pub annotations_enabled: bool, +} + +#[derive(Clone, Debug)] +pub struct SpillSlotData { + pub ranges: LiveRangeSet, + pub class: RegClass, + pub alloc: Allocation, + pub next_spillslot: SpillSlotIndex, +} + +#[derive(Clone, Debug)] +pub struct SpillSlotList { + pub first_spillslot: SpillSlotIndex, + pub last_spillslot: SpillSlotIndex, +} + +#[derive(Clone, Debug)] +pub struct PrioQueue { + pub heap: std::collections::BinaryHeap, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub struct PrioQueueEntry { + pub prio: u32, + pub bundle: LiveBundleIndex, + pub reg_hint: PReg, +} + +#[derive(Clone, Debug)] +pub struct LiveRangeSet { + pub btree: BTreeMap, +} + +#[derive(Clone, Copy, Debug)] +pub struct LiveRangeKey { + pub from: u32, + pub to: u32, +} + +impl LiveRangeKey { + #[inline(always)] + pub fn from_range(range: &CodeRange) -> Self { + Self { + from: range.from.to_index(), + to: range.to.to_index(), + } + } + + #[inline(always)] + pub fn to_range(&self) -> CodeRange { + CodeRange { + from: ProgPoint::from_index(self.from), + to: ProgPoint::from_index(self.to), + } + } +} + +impl std::cmp::PartialEq for LiveRangeKey { + #[inline(always)] + fn eq(&self, other: &Self) -> bool { + self.to > other.from && self.from < other.to + } +} +impl std::cmp::Eq for LiveRangeKey {} +impl std::cmp::PartialOrd for LiveRangeKey { + #[inline(always)] + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} +impl std::cmp::Ord for LiveRangeKey { + #[inline(always)] + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + if self.to <= other.from { + std::cmp::Ordering::Less + } else if self.from >= other.to { + std::cmp::Ordering::Greater + } else { + std::cmp::Ordering::Equal + } + } +} + +pub struct PrioQueueComparator<'a> { + pub prios: &'a [usize], +} +impl<'a> ContainerComparator for 
PrioQueueComparator<'a> { + type Ix = LiveBundleIndex; + fn compare(&self, a: Self::Ix, b: Self::Ix) -> std::cmp::Ordering { + self.prios[a.index()].cmp(&self.prios[b.index()]) + } +} + +impl PrioQueue { + pub fn new() -> Self { + PrioQueue { + heap: std::collections::BinaryHeap::new(), + } + } + + #[inline(always)] + pub fn insert(&mut self, bundle: LiveBundleIndex, prio: usize, reg_hint: PReg) { + self.heap.push(PrioQueueEntry { + prio: prio as u32, + bundle, + reg_hint, + }); + } + + #[inline(always)] + pub fn is_empty(self) -> bool { + self.heap.is_empty() + } + + #[inline(always)] + pub fn pop(&mut self) -> Option<(LiveBundleIndex, PReg)> { + self.heap.pop().map(|entry| (entry.bundle, entry.reg_hint)) + } +} + +impl LiveRangeSet { + pub(crate) fn new() -> Self { + Self { + btree: BTreeMap::new(), + } + } +} + +#[derive(Clone, Debug)] +pub struct InsertedMove { + pub pos: ProgPoint, + pub prio: InsertMovePrio, + pub from_alloc: Allocation, + pub to_alloc: Allocation, + pub to_vreg: Option, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum InsertMovePrio { + InEdgeMoves, + BlockParam, + Regular, + PostRegular, + MultiFixedReg, + ReusedInput, + OutEdgeMoves, +} + +#[derive(Clone, Copy, Debug, Default)] +pub struct Stats { + pub livein_blocks: usize, + pub livein_iterations: usize, + pub initial_liverange_count: usize, + pub merged_bundle_count: usize, + pub prog_moves: usize, + pub prog_moves_dead_src: usize, + pub prog_move_merge_attempt: usize, + pub prog_move_merge_success: usize, + pub process_bundle_count: usize, + pub process_bundle_reg_probes_fixed: usize, + pub process_bundle_reg_success_fixed: usize, + pub process_bundle_bounding_range_probe_start_any: usize, + pub process_bundle_bounding_range_probes_any: usize, + pub process_bundle_bounding_range_success_any: usize, + pub process_bundle_reg_probe_start_any: usize, + pub process_bundle_reg_probes_any: usize, + pub process_bundle_reg_success_any: usize, + pub evict_bundle_event: usize, + pub evict_bundle_count: usize, + pub splits: usize, + pub splits_clobbers: usize, + pub splits_hot: usize, + pub splits_conflicts: usize, + pub splits_defs: usize, + pub splits_all: usize, + pub final_liverange_count: usize, + pub final_bundle_count: usize, + pub spill_bundle_count: usize, + pub spill_bundle_reg_probes: usize, + pub spill_bundle_reg_success: usize, + pub blockparam_ins_count: usize, + pub blockparam_outs_count: usize, + pub blockparam_allocs_count: usize, + pub halfmoves_count: usize, + pub edits_count: usize, +} diff --git a/src/ion/dump.rs b/src/ion/dump.rs new file mode 100644 index 00000000..0048f801 --- /dev/null +++ b/src/ion/dump.rs @@ -0,0 +1,141 @@ +//! Debugging output. 
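+//!
+//! Roughly: `dump_state` traces all bundles, vregs and live ranges;
+//! `annotate` records per-`ProgPoint` notes when annotations are enabled;
+//! and `dump_results` logs the final allocated program listing.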
+ +use super::Env; +use crate::{Block, Function, ProgPoint}; + +impl<'a, F: Function> Env<'a, F> { + pub fn dump_state(&self) { + log::trace!("Bundles:"); + for (i, b) in self.bundles.iter().enumerate() { + log::trace!( + "bundle{}: spillset={:?} alloc={:?}", + i, + b.spillset, + b.allocation + ); + for entry in &b.ranges { + log::trace!( + " * range {:?} -- {:?}: range{}", + entry.range.from, + entry.range.to, + entry.index.index() + ); + } + } + log::trace!("VRegs:"); + for (i, v) in self.vregs.iter().enumerate() { + log::trace!("vreg{}:", i); + for entry in &v.ranges { + log::trace!( + " * range {:?} -- {:?}: range{}", + entry.range.from, + entry.range.to, + entry.index.index() + ); + } + } + log::trace!("Ranges:"); + for (i, r) in self.ranges.iter().enumerate() { + log::trace!( + "range{}: range={:?} vreg={:?} bundle={:?} weight={:?}", + i, + r.range, + r.vreg, + r.bundle, + r.uses_spill_weight(), + ); + for u in &r.uses { + log::trace!(" * use at {:?} (slot {}): {:?}", u.pos, u.slot, u.operand); + } + } + } + + pub fn annotate(&mut self, progpoint: ProgPoint, s: String) { + if self.annotations_enabled { + self.debug_annotations + .entry(progpoint) + .or_insert_with(|| vec![]) + .push(s); + } + } + + pub fn dump_results(&self) { + log::info!("=== REGALLOC RESULTS ==="); + for block in 0..self.func.num_blocks() { + let block = Block::new(block); + log::info!( + "block{}: [succs {:?} preds {:?}]", + block.index(), + self.func + .block_succs(block) + .iter() + .map(|b| b.index()) + .collect::>(), + self.func + .block_preds(block) + .iter() + .map(|b| b.index()) + .collect::>() + ); + for inst in self.func.block_insns(block).iter() { + for annotation in self + .debug_annotations + .get(&ProgPoint::before(inst)) + .map(|v| &v[..]) + .unwrap_or(&[]) + { + log::info!(" inst{}-pre: {}", inst.index(), annotation); + } + let ops = self + .func + .inst_operands(inst) + .iter() + .map(|op| format!("{}", op)) + .collect::>(); + let clobbers = self + .func + .inst_clobbers(inst) + .iter() + .map(|preg| format!("{}", preg)) + .collect::>(); + let allocs = (0..ops.len()) + .map(|i| format!("{}", self.get_alloc(inst, i))) + .collect::>(); + let opname = if self.func.is_branch(inst) { + "br" + } else if self.func.is_call(inst) { + "call" + } else if self.func.is_ret(inst) { + "ret" + } else { + "op" + }; + let args = ops + .iter() + .zip(allocs.iter()) + .map(|(op, alloc)| format!("{} [{}]", op, alloc)) + .collect::>(); + let clobbers = if clobbers.is_empty() { + "".to_string() + } else { + format!(" [clobber: {}]", clobbers.join(", ")) + }; + log::info!( + " inst{}: {} {}{}", + inst.index(), + opname, + args.join(", "), + clobbers + ); + for annotation in self + .debug_annotations + .get(&ProgPoint::after(inst)) + .map(|v| &v[..]) + .unwrap_or(&[]) + { + log::info!(" inst{}-post: {}", inst.index(), annotation); + } + } + } + } +} diff --git a/src/ion/liveranges.rs b/src/ion/liveranges.rs new file mode 100644 index 00000000..501d9f5c --- /dev/null +++ b/src/ion/liveranges.rs @@ -0,0 +1,1249 @@ +/* + * The following license applies to this file, which was initially + * derived from the files `js/src/jit/BacktrackingAllocator.h` and + * `js/src/jit/BacktrackingAllocator.cpp` in Mozilla Firefox: + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Since the initial port, the design has been substantially evolved + * and optimized. + */ + +//! 
Live-range computation. + +use super::{ + CodeRange, Env, InsertMovePrio, LiveBundle, LiveBundleIndex, LiveRange, LiveRangeFlag, + LiveRangeIndex, LiveRangeKey, LiveRangeListEntry, LiveRangeSet, PRegData, PRegIndex, RegClass, + SpillSetIndex, Use, VRegData, VRegIndex, SLOT_NONE, +}; +use crate::indexset::IndexSet; +use crate::{ + Allocation, Block, Function, Inst, InstPosition, Operand, OperandConstraint, OperandKind, + OperandPos, PReg, ProgPoint, RegAllocError, VReg, +}; +use fxhash::FxHashSet; +use smallvec::{smallvec, SmallVec}; +use std::collections::{HashSet, VecDeque}; + +/// A spill weight computed for a certain Use. +#[derive(Clone, Copy, Debug)] +pub struct SpillWeight(f32); + +#[inline(always)] +pub fn spill_weight_from_constraint( + constraint: OperandConstraint, + loop_depth: usize, + is_def: bool, +) -> SpillWeight { + // A bonus of 1000 for one loop level, 4000 for two loop levels, + // 16000 for three loop levels, etc. Avoids exponentiation. + let loop_depth = std::cmp::min(10, loop_depth); + let hot_bonus: f32 = (0..loop_depth).fold(1000.0, |a, _| a * 4.0); + let def_bonus: f32 = if is_def { 2000.0 } else { 0.0 }; + let constraint_bonus: f32 = match constraint { + OperandConstraint::Any => 1000.0, + OperandConstraint::Reg | OperandConstraint::FixedReg(_) => 2000.0, + _ => 0.0, + }; + SpillWeight(hot_bonus + def_bonus + constraint_bonus) +} + +impl SpillWeight { + /// Convert a floating-point weight to a u16 that can be compactly + /// stored in a `Use`. We simply take the top 16 bits of the f32; this + /// is equivalent to the bfloat16 format + /// (https://en.wikipedia.org/wiki/Bfloat16_floating-point_format). + pub fn to_bits(self) -> u16 { + (self.0.to_bits() >> 15) as u16 + } + + /// Convert a value that was returned from + /// `SpillWeight::to_bits()` back into a `SpillWeight`. Note that + /// some precision may be lost when round-tripping from a spill + /// weight to packed bits and back. + pub fn from_bits(bits: u16) -> SpillWeight { + let x = f32::from_bits((bits as u32) << 15); + SpillWeight(x) + } + + /// Get a zero spill weight. + pub fn zero() -> SpillWeight { + SpillWeight(0.0) + } + + /// Convert to a raw floating-point value. + pub fn to_f32(self) -> f32 { + self.0 + } + + /// Create a `SpillWeight` from a raw floating-point value. + pub fn from_f32(x: f32) -> SpillWeight { + SpillWeight(x) + } + + pub fn to_int(self) -> u32 { + self.0 as u32 + } +} + +impl std::ops::Add for SpillWeight { + type Output = SpillWeight; + fn add(self, other: SpillWeight) -> Self { + SpillWeight(self.0 + other.0) + } +} + +impl<'a, F: Function> Env<'a, F> { + pub fn create_pregs_and_vregs(&mut self) { + // Create PRegs from the env. + self.pregs.resize( + PReg::MAX_INDEX, + PRegData { + reg: PReg::invalid(), + allocations: LiveRangeSet::new(), + }, + ); + for &preg in &self.env.regs { + self.pregs[preg.index()].reg = preg; + } + // Create VRegs from the vreg count. + for idx in 0..self.func.num_vregs() { + // We'll fill in the real details when we see the def. + let reg = VReg::new(idx, RegClass::Int); + self.add_vreg( + reg, + VRegData { + ranges: smallvec![], + blockparam: Block::invalid(), + is_ref: false, + is_pinned: false, + }, + ); + } + for v in self.func.reftype_vregs() { + self.vregs[v.vreg()].is_ref = true; + } + for v in self.func.pinned_vregs() { + self.vregs[v.vreg()].is_pinned = true; + } + // Create allocations too. 
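+ // One `Allocation::none()` slot is reserved per operand, and
+ // `inst_alloc_offsets[i]` records where instruction `i`'s slots begin in
+ // the flat `allocs` array; the real allocations are filled in later.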
+ for inst in 0..self.func.num_insts() { + let start = self.allocs.len() as u32; + self.inst_alloc_offsets.push(start); + for _ in 0..self.func.inst_operands(Inst::new(inst)).len() { + self.allocs.push(Allocation::none()); + } + } + } + + pub fn add_vreg(&mut self, reg: VReg, data: VRegData) -> VRegIndex { + let idx = self.vregs.len(); + self.vregs.push(data); + self.vreg_regs.push(reg); + VRegIndex::new(idx) + } + + pub fn create_bundle(&mut self) -> LiveBundleIndex { + let bundle = self.bundles.len(); + self.bundles.push(LiveBundle { + allocation: Allocation::none(), + ranges: smallvec![], + spillset: SpillSetIndex::invalid(), + prio: 0, + spill_weight_and_props: 0, + }); + LiveBundleIndex::new(bundle) + } + + pub fn create_liverange(&mut self, range: CodeRange) -> LiveRangeIndex { + let idx = self.ranges.len(); + + self.ranges.push(LiveRange { + range, + vreg: VRegIndex::invalid(), + bundle: LiveBundleIndex::invalid(), + uses_spill_weight_and_flags: 0, + + uses: smallvec![], + + merged_into: LiveRangeIndex::invalid(), + }); + + LiveRangeIndex::new(idx) + } + + /// Mark `range` as live for the given `vreg`. + /// + /// Returns the liverange that contains the given range. + pub fn add_liverange_to_vreg(&mut self, vreg: VRegIndex, range: CodeRange) -> LiveRangeIndex { + log::trace!("add_liverange_to_vreg: vreg {:?} range {:?}", vreg, range); + + // Invariant: as we are building liveness information, we + // *always* process instructions bottom-to-top, and as a + // consequence, new liveranges are always created before any + // existing liveranges for a given vreg. We assert this here, + // then use it to avoid an O(n) merge step (which would lead + // to O(n^2) liveness construction cost overall). + // + // We store liveranges in reverse order in the `.ranges` + // array, then reverse them at the end of + // `compute_liveness()`. + + assert!( + self.vregs[vreg.index()].ranges.is_empty() + || range.to + <= self.ranges[self.vregs[vreg.index()] + .ranges + .last() + .unwrap() + .index + .index()] + .range + .from + ); + + if self.vregs[vreg.index()].ranges.is_empty() + || range.to + < self.ranges[self.vregs[vreg.index()] + .ranges + .last() + .unwrap() + .index + .index()] + .range + .from + { + // Is not contiguous with previously-added (immediately + // following) range; create a new range. + let lr = self.create_liverange(range); + self.ranges[lr.index()].vreg = vreg; + self.vregs[vreg.index()] + .ranges + .push(LiveRangeListEntry { range, index: lr }); + lr + } else { + // Is contiguous with previously-added range; just extend + // its range and return it. + let lr = self.vregs[vreg.index()].ranges.last().unwrap().index; + assert!(range.to == self.ranges[lr.index()].range.from); + self.ranges[lr.index()].range.from = range.from; + lr + } + } + + pub fn insert_use_into_liverange(&mut self, into: LiveRangeIndex, mut u: Use) { + let operand = u.operand; + let constraint = operand.constraint(); + let block = self.cfginfo.insn_block[u.pos.inst().index()]; + let loop_depth = self.cfginfo.approx_loop_depth[block.index()] as usize; + let weight = spill_weight_from_constraint( + constraint, + loop_depth, + operand.kind() != OperandKind::Use, + ); + u.weight = weight.to_bits(); + + log::trace!( + "insert use {:?} into lr {:?} with weight {:?}", + u, + into, + weight, + ); + + // N.B.: we do *not* update `requirement` on the range, + // because those will be computed during the multi-fixed-reg + // fixup pass later (after all uses are inserted). 
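+ //
+ // The use's weight is, however, folded into the range's cached
+ // `uses_spill_weight` below, so later spill-cost computations can read a
+ // cached total instead of re-walking every use.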
+ + self.ranges[into.index()].uses.push(u); + + // Update stats. + let range_weight = self.ranges[into.index()].uses_spill_weight() + weight; + self.ranges[into.index()].set_uses_spill_weight(range_weight); + log::trace!( + " -> now range has weight {:?}", + self.ranges[into.index()].uses_spill_weight(), + ); + } + + pub fn find_vreg_liverange_for_pos( + &self, + vreg: VRegIndex, + pos: ProgPoint, + ) -> Option { + for entry in &self.vregs[vreg.index()].ranges { + if entry.range.contains_point(pos) { + return Some(entry.index); + } + } + None + } + + pub fn add_liverange_to_preg(&mut self, range: CodeRange, reg: PReg) { + log::trace!("adding liverange to preg: {:?} to {}", range, reg); + let preg_idx = PRegIndex::new(reg.index()); + self.pregs[preg_idx.index()] + .allocations + .btree + .insert(LiveRangeKey::from_range(&range), LiveRangeIndex::invalid()); + } + + pub fn is_live_in(&mut self, block: Block, vreg: VRegIndex) -> bool { + self.liveins[block.index()].get(vreg.index()) + } + + pub fn compute_liveness(&mut self) -> Result<(), RegAllocError> { + // Create initial LiveIn and LiveOut bitsets. + for _ in 0..self.func.num_blocks() { + self.liveins.push(IndexSet::new()); + self.liveouts.push(IndexSet::new()); + } + + // Run a worklist algorithm to precisely compute liveins and + // liveouts. + let mut workqueue = VecDeque::new(); + let mut workqueue_set = FxHashSet::default(); + // Initialize workqueue with postorder traversal. + for &block in &self.cfginfo.postorder[..] { + workqueue.push_back(block); + workqueue_set.insert(block); + } + + while !workqueue.is_empty() { + let block = workqueue.pop_front().unwrap(); + workqueue_set.remove(&block); + + log::trace!("computing liveins for block{}", block.index()); + + self.stats.livein_iterations += 1; + + let mut live = self.liveouts[block.index()].clone(); + log::trace!(" -> initial liveout set: {:?}", live); + + for inst in self.func.block_insns(block).rev().iter() { + if let Some((src, dst)) = self.func.is_move(inst) { + live.set(dst.vreg().vreg(), false); + live.set(src.vreg().vreg(), true); + } + + for pos in &[OperandPos::Late, OperandPos::Early] { + for op in self.func.inst_operands(inst) { + if op.pos() == *pos { + let was_live = live.get(op.vreg().vreg()); + log::trace!("op {:?} was_live = {}", op, was_live); + match op.kind() { + OperandKind::Use | OperandKind::Mod => { + live.set(op.vreg().vreg(), true); + } + OperandKind::Def => { + live.set(op.vreg().vreg(), false); + } + } + } + } + } + } + for &blockparam in self.func.block_params(block) { + live.set(blockparam.vreg(), false); + } + + for &pred in self.func.block_preds(block) { + if self.liveouts[pred.index()].union_with(&live) { + if !workqueue_set.contains(&pred) { + workqueue_set.insert(pred); + workqueue.push_back(pred); + } + } + } + + log::trace!("computed liveins at block{}: {:?}", block.index(), live); + self.liveins[block.index()] = live; + } + + // Check that there are no liveins to the entry block. (The + // client should create a virtual intsruction that defines any + // PReg liveins if necessary.) + if self.liveins[self.func.entry_block().index()] + .iter() + .next() + .is_some() + { + log::trace!( + "non-empty liveins to entry block: {:?}", + self.liveins[self.func.entry_block().index()] + ); + return Err(RegAllocError::EntryLivein); + } + + for &vreg in self.func.reftype_vregs() { + self.safepoints_per_vreg.insert(vreg.vreg(), HashSet::new()); + } + + // Create Uses and Defs referring to VRegs, and place the Uses + // in LiveRanges. 
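+ // As an illustration: in a block ending `v2 = add v0, v1; ret v2`, the
+ // backward scan first sees the use of v2 at the `ret` and opens a range
+ // back to the block entry, then sees the def of v2 and trims that range
+ // to start at the def; the uses of v0 and v1 likewise open ranges back
+ // to the entry, to be trimmed if their defs are found in this block.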
+ // + // We already computed precise liveouts and liveins for every + // block above, so we don't need to run an iterative algorithm + // here; instead, every block's computation is purely local, + // from end to start. + + // Track current LiveRange for each vreg. + // + // Invariant: a stale range may be present here; ranges are + // only valid if `live.get(vreg)` is true. + let mut vreg_ranges: Vec = + vec![LiveRangeIndex::invalid(); self.func.num_vregs()]; + + for i in (0..self.func.num_blocks()).rev() { + let block = Block::new(i); + + self.stats.livein_blocks += 1; + + // Init our local live-in set. + let mut live = self.liveouts[block.index()].clone(); + + // Initially, registers are assumed live for the whole block. + for vreg in live.iter() { + let range = CodeRange { + from: self.cfginfo.block_entry[block.index()], + to: self.cfginfo.block_exit[block.index()].next(), + }; + log::trace!( + "vreg {:?} live at end of block --> create range {:?}", + VRegIndex::new(vreg), + range + ); + let lr = self.add_liverange_to_vreg(VRegIndex::new(vreg), range); + vreg_ranges[vreg] = lr; + } + + // Create vreg data for blockparams. + for param in self.func.block_params(block) { + self.vreg_regs[param.vreg()] = *param; + self.vregs[param.vreg()].blockparam = block; + } + + let insns = self.func.block_insns(block); + + // If the last instruction is a branch (rather than + // return), create blockparam_out entries. + if self.func.is_branch(insns.last()) { + let operands = self.func.inst_operands(insns.last()); + let mut i = self.func.branch_blockparam_arg_offset(block, insns.last()); + for &succ in self.func.block_succs(block) { + for &blockparam in self.func.block_params(succ) { + let from_vreg = VRegIndex::new(operands[i].vreg().vreg()); + let blockparam_vreg = VRegIndex::new(blockparam.vreg()); + self.blockparam_outs + .push((from_vreg, block, succ, blockparam_vreg)); + i += 1; + } + } + } + + // For each instruction, in reverse order, process + // operands and clobbers. + for inst in insns.rev().iter() { + if self.func.inst_clobbers(inst).len() > 0 { + self.clobbers.push(inst); + } + + // Mark clobbers with CodeRanges on PRegs. + for i in 0..self.func.inst_clobbers(inst).len() { + // don't borrow `self` + let clobber = self.func.inst_clobbers(inst)[i]; + // Clobber range is at After point only: an + // instruction can still take an input in a reg + // that it later clobbers. (In other words, the + // clobber is like a normal def that never gets + // used.) + let range = CodeRange { + from: ProgPoint::after(inst), + to: ProgPoint::before(inst.next()), + }; + self.add_liverange_to_preg(range, clobber); + } + + // Does the instruction have any input-reusing + // outputs? This is important below to establish + // proper interference wrt other inputs. + let mut reused_input = None; + for op in self.func.inst_operands(inst) { + if let OperandConstraint::Reuse(i) = op.constraint() { + reused_input = Some(i); + break; + } + } + + // If this is a move, handle specially. + if let Some((src, dst)) = self.func.is_move(inst) { + // We can completely skip the move if it is + // trivial (vreg to same vreg). + if src.vreg() != dst.vreg() { + log::trace!(" -> move inst{}: src {} -> dst {}", inst.index(), src, dst); + + assert_eq!(src.class(), dst.class()); + assert_eq!(src.kind(), OperandKind::Use); + assert_eq!(src.pos(), OperandPos::Early); + assert_eq!(dst.kind(), OperandKind::Def); + assert_eq!(dst.pos(), OperandPos::Late); + + // If both src and dest are pinned, emit the + // move right here, right now. 
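+ // Both locations are fixed by the pinned pregs, so there is no
+ // allocation decision to make: we just patch up the liveranges and
+ // insert the move directly, at MultiFixedReg priority.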
+ if self.vregs[src.vreg().vreg()].is_pinned + && self.vregs[dst.vreg().vreg()].is_pinned + { + // Update LRs. + if !live.get(src.vreg().vreg()) { + let lr = self.add_liverange_to_vreg( + VRegIndex::new(src.vreg().vreg()), + CodeRange { + from: self.cfginfo.block_entry[block.index()], + to: ProgPoint::after(inst), + }, + ); + live.set(src.vreg().vreg(), true); + vreg_ranges[src.vreg().vreg()] = lr; + } + if live.get(dst.vreg().vreg()) { + let lr = vreg_ranges[dst.vreg().vreg()]; + self.ranges[lr.index()].range.from = ProgPoint::after(inst); + live.set(dst.vreg().vreg(), false); + } else { + self.add_liverange_to_vreg( + VRegIndex::new(dst.vreg().vreg()), + CodeRange { + from: ProgPoint::after(inst), + to: ProgPoint::before(inst.next()), + }, + ); + } + + let src_preg = match src.constraint() { + OperandConstraint::FixedReg(r) => r, + _ => unreachable!(), + }; + let dst_preg = match dst.constraint() { + OperandConstraint::FixedReg(r) => r, + _ => unreachable!(), + }; + self.insert_move( + ProgPoint::before(inst), + InsertMovePrio::MultiFixedReg, + Allocation::reg(src_preg), + Allocation::reg(dst_preg), + Some(dst.vreg()), + ); + } + // If exactly one of source and dest (but not + // both) is a pinned-vreg, convert this into a + // ghost use on the other vreg with a FixedReg + // constraint. + else if self.vregs[src.vreg().vreg()].is_pinned + || self.vregs[dst.vreg().vreg()].is_pinned + { + log::trace!( + " -> exactly one of src/dst is pinned; converting to ghost use" + ); + let (preg, vreg, pinned_vreg, kind, pos, progpoint) = + if self.vregs[src.vreg().vreg()].is_pinned { + // Source is pinned: this is a def on the dst with a pinned preg. + ( + self.func.is_pinned_vreg(src.vreg()).unwrap(), + dst.vreg(), + src.vreg(), + OperandKind::Def, + OperandPos::Late, + ProgPoint::after(inst), + ) + } else { + // Dest is pinned: this is a use on the src with a pinned preg. + ( + self.func.is_pinned_vreg(dst.vreg()).unwrap(), + src.vreg(), + dst.vreg(), + OperandKind::Use, + OperandPos::Early, + ProgPoint::after(inst), + ) + }; + let constraint = OperandConstraint::FixedReg(preg); + let operand = Operand::new(vreg, constraint, kind, pos); + + log::trace!( + concat!( + " -> preg {:?} vreg {:?} kind {:?} ", + "pos {:?} progpoint {:?} constraint {:?} operand {:?}" + ), + preg, + vreg, + kind, + pos, + progpoint, + constraint, + operand + ); + + // Get the LR for the vreg; if none, create one. + let mut lr = vreg_ranges[vreg.vreg()]; + if !live.get(vreg.vreg()) { + let from = match kind { + OperandKind::Use => self.cfginfo.block_entry[block.index()], + OperandKind::Def => progpoint, + _ => unreachable!(), + }; + let to = progpoint.next(); + lr = self.add_liverange_to_vreg( + VRegIndex::new(vreg.vreg()), + CodeRange { from, to }, + ); + log::trace!(" -> dead; created LR"); + } + log::trace!(" -> LR {:?}", lr); + + self.insert_use_into_liverange( + lr, + Use::new(operand, progpoint, SLOT_NONE), + ); + + if kind == OperandKind::Def { + live.set(vreg.vreg(), false); + if self.ranges[lr.index()].range.from + == self.cfginfo.block_entry[block.index()] + { + self.ranges[lr.index()].range.from = progpoint; + } + self.ranges[lr.index()].set_flag(LiveRangeFlag::StartsAtDef); + } else { + live.set(vreg.vreg(), true); + vreg_ranges[vreg.vreg()] = lr; + } + + // Handle liveness of the other vreg. Note + // that this is somewhat special. 
For the + // destination case, we want the pinned + // vreg's LR to start just *after* the + // operand we inserted above, because + // otherwise it would overlap, and + // interfere, and prevent allocation. For + // the source case, we want to "poke a + // hole" in the LR: if it's live going + // downward, end it just after the operand + // and restart it before; if it isn't + // (this is the last use), start it + // before. + if kind == OperandKind::Def { + log::trace!(" -> src on pinned vreg {:?}", pinned_vreg); + // The *other* vreg is a def, so the pinned-vreg + // mention is a use. If already live, + // end the existing LR just *after* + // the `progpoint` defined above and + // start a new one just *before* the + // `progpoint` defined above, + // preserving the start. If not, start + // a new one live back to the top of + // the block, starting just before + // `progpoint`. + if live.get(pinned_vreg.vreg()) { + let pinned_lr = vreg_ranges[pinned_vreg.vreg()]; + let orig_start = self.ranges[pinned_lr.index()].range.from; + log::trace!( + " -> live with LR {:?}; truncating to start at {:?}", + pinned_lr, + progpoint.next() + ); + self.ranges[pinned_lr.index()].range.from = progpoint.next(); + let new_lr = self.add_liverange_to_vreg( + VRegIndex::new(pinned_vreg.vreg()), + CodeRange { + from: orig_start, + to: progpoint.prev(), + }, + ); + vreg_ranges[pinned_vreg.vreg()] = new_lr; + log::trace!(" -> created LR {:?} with remaining range from {:?} to {:?}", new_lr, orig_start, progpoint); + + // Add an edit right now to indicate that at + // this program point, the given + // preg is now known as that vreg, + // not the preg, but immediately + // after, it is known as the preg + // again. This is used by the + // checker. + self.insert_move( + ProgPoint::after(inst), + InsertMovePrio::Regular, + Allocation::reg(preg), + Allocation::reg(preg), + Some(dst.vreg()), + ); + self.insert_move( + ProgPoint::before(inst.next()), + InsertMovePrio::MultiFixedReg, + Allocation::reg(preg), + Allocation::reg(preg), + Some(src.vreg()), + ); + } else { + if inst > self.cfginfo.block_entry[block.index()].inst() { + let new_lr = self.add_liverange_to_vreg( + VRegIndex::new(pinned_vreg.vreg()), + CodeRange { + from: self.cfginfo.block_entry[block.index()], + to: ProgPoint::before(inst), + }, + ); + vreg_ranges[pinned_vreg.vreg()] = new_lr; + live.set(pinned_vreg.vreg(), true); + log::trace!( + " -> was not live; created new LR {:?}", + new_lr + ); + } + + // Add an edit right now to indicate that at + // this program point, the given + // preg is now known as that vreg, + // not the preg. This is used by + // the checker. + self.insert_move( + ProgPoint::after(inst), + InsertMovePrio::BlockParam, + Allocation::reg(preg), + Allocation::reg(preg), + Some(dst.vreg()), + ); + } + } else { + log::trace!(" -> dst on pinned vreg {:?}", pinned_vreg); + // The *other* vreg is a use, so the pinned-vreg + // mention is a def. Truncate its LR + // just *after* the `progpoint` + // defined above. + if live.get(pinned_vreg.vreg()) { + let pinned_lr = vreg_ranges[pinned_vreg.vreg()]; + self.ranges[pinned_lr.index()].range.from = progpoint.next(); + log::trace!( + " -> was live with LR {:?}; truncated start to {:?}", + pinned_lr, + progpoint.next() + ); + live.set(pinned_vreg.vreg(), false); + + // Add a no-op edit right now to indicate that + // at this program point, the + // given preg is now known as that + // preg, not the vreg. This is + // used by the checker. 
+ self.insert_move( + ProgPoint::before(inst.next()), + InsertMovePrio::PostRegular, + Allocation::reg(preg), + Allocation::reg(preg), + Some(dst.vreg()), + ); + } + // Otherwise, if dead, no need to create + // a dummy LR -- there is no + // reservation to make (the other vreg + // will land in the reg with the + // fixed-reg operand constraint, but + // it's a dead move anyway). + } + } else { + // Redefine src and dst operands to have + // positions of After and Before respectively + // (see note below), and to have Any + // constraints if they were originally Reg. + let src_constraint = match src.constraint() { + OperandConstraint::Reg => OperandConstraint::Any, + x => x, + }; + let dst_constraint = match dst.constraint() { + OperandConstraint::Reg => OperandConstraint::Any, + x => x, + }; + let src = Operand::new( + src.vreg(), + src_constraint, + OperandKind::Use, + OperandPos::Late, + ); + let dst = Operand::new( + dst.vreg(), + dst_constraint, + OperandKind::Def, + OperandPos::Early, + ); + + if self.annotations_enabled { + self.annotate( + ProgPoint::after(inst), + format!( + " prog-move v{} ({:?}) -> v{} ({:?})", + src.vreg().vreg(), + src_constraint, + dst.vreg().vreg(), + dst_constraint, + ), + ); + } + + // N.B.: in order to integrate with the move + // resolution that joins LRs in general, we + // conceptually treat the move as happening + // between the move inst's After and the next + // inst's Before. Thus the src LR goes up to + // (exclusive) next-inst-pre, and the dst LR + // starts at next-inst-pre. We have to take + // care in our move insertion to handle this + // like other inter-inst moves, i.e., at + // `Regular` priority, so it properly happens + // in parallel with other inter-LR moves. + // + // Why the progpoint between move and next + // inst, and not the progpoint between prev + // inst and move? Because a move can be the + // first inst in a block, but cannot be the + // last; so the following progpoint is always + // within the same block, while the previous + // one may be an inter-block point (and the + // After of the prev inst in a different + // block). + + // Handle the def w.r.t. liveranges: trim the + // start of the range and mark it dead at this + // point in our backward scan. + let pos = ProgPoint::before(inst.next()); + let mut dst_lr = vreg_ranges[dst.vreg().vreg()]; + if !live.get(dst.vreg().vreg()) { + let from = pos; + let to = pos.next(); + dst_lr = self.add_liverange_to_vreg( + VRegIndex::new(dst.vreg().vreg()), + CodeRange { from, to }, + ); + log::trace!(" -> invalid LR for def; created {:?}", dst_lr); + } + log::trace!(" -> has existing LR {:?}", dst_lr); + // Trim the LR to start here. + if self.ranges[dst_lr.index()].range.from + == self.cfginfo.block_entry[block.index()] + { + log::trace!(" -> started at block start; trimming to {:?}", pos); + self.ranges[dst_lr.index()].range.from = pos; + } + self.ranges[dst_lr.index()].set_flag(LiveRangeFlag::StartsAtDef); + live.set(dst.vreg().vreg(), false); + vreg_ranges[dst.vreg().vreg()] = LiveRangeIndex::invalid(); + self.vreg_regs[dst.vreg().vreg()] = dst.vreg(); + + // Handle the use w.r.t. liveranges: make it live + // and create an initial LR back to the start of + // the block. 
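+ // (The range provisionally reaches back to the block entry; if the
+ // backward scan later meets a def of `src`, the def handling trims the
+ // range to start at that def instead.)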
+ let pos = ProgPoint::after(inst); + let src_lr = if !live.get(src.vreg().vreg()) { + let range = CodeRange { + from: self.cfginfo.block_entry[block.index()], + to: pos.next(), + }; + let src_lr = self.add_liverange_to_vreg( + VRegIndex::new(src.vreg().vreg()), + range, + ); + vreg_ranges[src.vreg().vreg()] = src_lr; + src_lr + } else { + vreg_ranges[src.vreg().vreg()] + }; + + log::trace!(" -> src LR {:?}", src_lr); + + // Add to live-set. + let src_is_dead_after_move = !live.get(src.vreg().vreg()); + live.set(src.vreg().vreg(), true); + + // Add to program-moves lists. + self.prog_move_srcs.push(( + (VRegIndex::new(src.vreg().vreg()), inst), + Allocation::none(), + )); + self.prog_move_dsts.push(( + (VRegIndex::new(dst.vreg().vreg()), inst.next()), + Allocation::none(), + )); + self.stats.prog_moves += 1; + if src_is_dead_after_move { + self.stats.prog_moves_dead_src += 1; + self.prog_move_merges.push((src_lr, dst_lr)); + } + } + } + + continue; + } + + // Process defs and uses. + for &cur_pos in &[InstPosition::After, InstPosition::Before] { + for i in 0..self.func.inst_operands(inst).len() { + // don't borrow `self` + let operand = self.func.inst_operands(inst)[i]; + let pos = match (operand.kind(), operand.pos()) { + (OperandKind::Mod, _) => ProgPoint::before(inst), + (OperandKind::Def, OperandPos::Early) => ProgPoint::before(inst), + (OperandKind::Def, OperandPos::Late) => ProgPoint::after(inst), + (OperandKind::Use, OperandPos::Late) => ProgPoint::after(inst), + // If this is a branch, extend `pos` to + // the end of the block. (Branch uses are + // blockparams and need to be live at the + // end of the block.) + (OperandKind::Use, _) if self.func.is_branch(inst) => { + self.cfginfo.block_exit[block.index()] + } + // If there are any reused inputs in this + // instruction, and this is *not* the + // reused input, force `pos` to + // `After`. (See note below for why; it's + // very subtle!) + (OperandKind::Use, OperandPos::Early) + if reused_input.is_some() && reused_input.unwrap() != i => + { + ProgPoint::after(inst) + } + (OperandKind::Use, OperandPos::Early) => ProgPoint::before(inst), + }; + + if pos.pos() != cur_pos { + continue; + } + + log::trace!( + "processing inst{} operand at {:?}: {:?}", + inst.index(), + pos, + operand + ); + + match operand.kind() { + OperandKind::Def | OperandKind::Mod => { + log::trace!("Def of {} at {:?}", operand.vreg(), pos); + + // Fill in vreg's actual data. + self.vreg_regs[operand.vreg().vreg()] = operand.vreg(); + + // Get or create the LiveRange. + let mut lr = vreg_ranges[operand.vreg().vreg()]; + log::trace!(" -> has existing LR {:?}", lr); + // If there was no liverange (dead def), create a trivial one. + if !live.get(operand.vreg().vreg()) { + let from = match operand.kind() { + OperandKind::Def => pos, + OperandKind::Mod => self.cfginfo.block_entry[block.index()], + _ => unreachable!(), + }; + let to = match operand.kind() { + OperandKind::Def => pos.next(), + OperandKind::Mod => pos.next().next(), // both Before and After positions + _ => unreachable!(), + }; + lr = self.add_liverange_to_vreg( + VRegIndex::new(operand.vreg().vreg()), + CodeRange { from, to }, + ); + log::trace!(" -> invalid; created {:?}", lr); + vreg_ranges[operand.vreg().vreg()] = lr; + live.set(operand.vreg().vreg(), true); + } + // Create the use in the LiveRange. + self.insert_use_into_liverange(lr, Use::new(operand, pos, i as u8)); + // If def (not mod), this reg is now dead, + // scanning backward; make it so. 
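+ // (A Mod operand reads as well as writes the vreg, so it stays live
+ // across the instruction; only a true Def ends liveness in this
+ // backward walk, mirroring the liveness pass above.)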
+ if operand.kind() == OperandKind::Def { + // Trim the range for this vreg to start + // at `pos` if it previously ended at the + // start of this block (i.e. was not + // merged into some larger LiveRange due + // to out-of-order blocks). + if self.ranges[lr.index()].range.from + == self.cfginfo.block_entry[block.index()] + { + log::trace!( + " -> started at block start; trimming to {:?}", + pos + ); + self.ranges[lr.index()].range.from = pos; + } + + self.ranges[lr.index()].set_flag(LiveRangeFlag::StartsAtDef); + + // Remove from live-set. + live.set(operand.vreg().vreg(), false); + vreg_ranges[operand.vreg().vreg()] = LiveRangeIndex::invalid(); + } + } + OperandKind::Use => { + // Create/extend the LiveRange if it + // doesn't already exist, and add the use + // to the range. + let mut lr = vreg_ranges[operand.vreg().vreg()]; + if !live.get(operand.vreg().vreg()) { + let range = CodeRange { + from: self.cfginfo.block_entry[block.index()], + to: pos.next(), + }; + lr = self.add_liverange_to_vreg( + VRegIndex::new(operand.vreg().vreg()), + range, + ); + vreg_ranges[operand.vreg().vreg()] = lr; + } + assert!(lr.is_valid()); + + log::trace!("Use of {:?} at {:?} -> {:?}", operand, pos, lr,); + + self.insert_use_into_liverange(lr, Use::new(operand, pos, i as u8)); + + // Add to live-set. + live.set(operand.vreg().vreg(), true); + } + } + } + } + + if self.func.requires_refs_on_stack(inst) { + log::trace!("inst{} is safepoint", inst.index()); + self.safepoints.push(inst); + for vreg in live.iter() { + if let Some(safepoints) = self.safepoints_per_vreg.get_mut(&vreg) { + log::trace!("vreg v{} live at safepoint inst{}", vreg, inst.index()); + safepoints.insert(inst); + } + } + } + } + + // Block parameters define vregs at the very beginning of + // the block. Remove their live vregs from the live set + // here. + for vreg in self.func.block_params(block) { + if live.get(vreg.vreg()) { + live.set(vreg.vreg(), false); + } else { + // Create trivial liverange if blockparam is dead. + let start = self.cfginfo.block_entry[block.index()]; + self.add_liverange_to_vreg( + VRegIndex::new(vreg.vreg()), + CodeRange { + from: start, + to: start.next(), + }, + ); + } + // add `blockparam_ins` entries. + let vreg_idx = VRegIndex::new(vreg.vreg()); + for &pred in self.func.block_preds(block) { + self.blockparam_ins.push((vreg_idx, block, pred)); + } + } + } + + self.safepoints.sort_unstable(); + + // Make ranges in each vreg and uses in each range appear in + // sorted order. We built them in reverse order above, so this + // is a simple reversal, *not* a full sort. + // + // The ordering invariant is always maintained for uses and + // always for ranges in bundles (which are initialized later), + // but not always for ranges in vregs; those are sorted only + // when needed, here and then again at the end of allocation + // when resolving moves. + + for vreg in &mut self.vregs { + vreg.ranges.reverse(); + let mut last = None; + for entry in &mut vreg.ranges { + // Ranges may have been truncated above at defs. We + // need to update with the final range here. + entry.range = self.ranges[entry.index.index()].range; + // Assert in-order and non-overlapping. + assert!(last.is_none() || last.unwrap() <= entry.range.from); + last = Some(entry.range.to); + } + } + + for range in 0..self.ranges.len() { + self.ranges[range].uses.reverse(); + debug_assert!(self.ranges[range] + .uses + .windows(2) + .all(|win| win[0].pos <= win[1].pos)); + } + + // Insert safepoint virtual stack uses, if needed. 
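+ // Each reftyped vreg gets a synthetic `OperandConstraint::Stack` use at
+ // every safepoint covered by one of its ranges, so that the value is
+ // kept in a spill slot there and its slot can be reported via
+ // `safepoint_slots`.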
+ for vreg in self.func.reftype_vregs() { + if self.vregs[vreg.vreg()].is_pinned { + continue; + } + let vreg = VRegIndex::new(vreg.vreg()); + let mut inserted = false; + let mut safepoint_idx = 0; + for range_idx in 0..self.vregs[vreg.index()].ranges.len() { + let LiveRangeListEntry { range, index } = + self.vregs[vreg.index()].ranges[range_idx]; + while safepoint_idx < self.safepoints.len() + && ProgPoint::before(self.safepoints[safepoint_idx]) < range.from + { + safepoint_idx += 1; + } + while safepoint_idx < self.safepoints.len() + && range.contains_point(ProgPoint::before(self.safepoints[safepoint_idx])) + { + // Create a virtual use. + let pos = ProgPoint::before(self.safepoints[safepoint_idx]); + let operand = Operand::new( + self.vreg_regs[vreg.index()], + OperandConstraint::Stack, + OperandKind::Use, + OperandPos::Early, + ); + + log::trace!( + "Safepoint-induced stack use of {:?} at {:?} -> {:?}", + operand, + pos, + index, + ); + + self.insert_use_into_liverange(index, Use::new(operand, pos, SLOT_NONE)); + safepoint_idx += 1; + + inserted = true; + } + + if inserted { + self.ranges[index.index()] + .uses + .sort_unstable_by_key(|u| u.pos); + } + + if safepoint_idx >= self.safepoints.len() { + break; + } + } + } + + // Do a fixed-reg cleanup pass: if there are any LiveRanges with + // multiple uses (or defs) at the same ProgPoint and there is + // more than one FixedReg constraint at that ProgPoint, we + // need to record all but one of them in a special fixup list + // and handle them later; otherwise, bundle-splitting to + // create minimal bundles becomes much more complex (we would + // have to split the multiple uses at the same progpoint into + // different bundles, which breaks invariants related to + // disjoint ranges and bundles). 
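+ // E.g. (hypothetical): a call that uses v0 twice, once fixed to p0 and
+ // once fixed to p1, keeps only the p0 constraint; the p1 use is relaxed
+ // to `Reg`, p1 is recorded as an extra clobber, and the fixup entry
+ // (point, p0, p1, slot) later becomes a copy from v0's actual location
+ // into p1.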
+ let mut seen_fixed_for_vreg: SmallVec<[VReg; 16]> = smallvec![]; + let mut first_preg: SmallVec<[PRegIndex; 16]> = smallvec![]; + let mut extra_clobbers: SmallVec<[(PReg, Inst); 8]> = smallvec![]; + for vreg in 0..self.vregs.len() { + for range_idx in 0..self.vregs[vreg].ranges.len() { + let entry = self.vregs[vreg].ranges[range_idx]; + let range = entry.index; + log::trace!( + "multi-fixed-reg cleanup: vreg {:?} range {:?}", + VRegIndex::new(vreg), + range, + ); + let mut last_point = None; + let mut fixup_multi_fixed_vregs = |pos: ProgPoint, + slot: usize, + op: &mut Operand, + fixups: &mut Vec<( + ProgPoint, + PRegIndex, + PRegIndex, + usize, + )>| { + if last_point.is_some() && Some(pos) != last_point { + seen_fixed_for_vreg.clear(); + first_preg.clear(); + } + last_point = Some(pos); + + if let OperandConstraint::FixedReg(preg) = op.constraint() { + let vreg_idx = VRegIndex::new(op.vreg().vreg()); + let preg_idx = PRegIndex::new(preg.index()); + log::trace!( + "at pos {:?}, vreg {:?} has fixed constraint to preg {:?}", + pos, + vreg_idx, + preg_idx + ); + if let Some(idx) = seen_fixed_for_vreg.iter().position(|r| *r == op.vreg()) + { + let orig_preg = first_preg[idx]; + if orig_preg != preg_idx { + log::trace!(" -> duplicate; switching to constraint Reg"); + fixups.push((pos, orig_preg, preg_idx, slot)); + *op = Operand::new( + op.vreg(), + OperandConstraint::Reg, + op.kind(), + op.pos(), + ); + log::trace!( + " -> extra clobber {} at inst{}", + preg, + pos.inst().index() + ); + extra_clobbers.push((preg, pos.inst())); + } + } else { + seen_fixed_for_vreg.push(op.vreg()); + first_preg.push(preg_idx); + } + } + }; + + for u in &mut self.ranges[range.index()].uses { + let pos = u.pos; + let slot = u.slot as usize; + fixup_multi_fixed_vregs( + pos, + slot, + &mut u.operand, + &mut self.multi_fixed_reg_fixups, + ); + } + + for &(clobber, inst) in &extra_clobbers { + let range = CodeRange { + from: ProgPoint::before(inst), + to: ProgPoint::before(inst.next()), + }; + self.add_liverange_to_preg(range, clobber); + } + + extra_clobbers.clear(); + first_preg.clear(); + seen_fixed_for_vreg.clear(); + } + } + + self.clobbers.sort_unstable(); + self.blockparam_ins.sort_unstable(); + self.blockparam_outs.sort_unstable(); + self.prog_move_srcs.sort_unstable_by_key(|(pos, _)| *pos); + self.prog_move_dsts.sort_unstable_by_key(|(pos, _)| *pos); + + log::trace!("prog_move_srcs = {:?}", self.prog_move_srcs); + log::trace!("prog_move_dsts = {:?}", self.prog_move_dsts); + + self.stats.initial_liverange_count = self.ranges.len(); + self.stats.blockparam_ins_count = self.blockparam_ins.len(); + self.stats.blockparam_outs_count = self.blockparam_outs.len(); + + Ok(()) + } +} diff --git a/src/ion/merge.rs b/src/ion/merge.rs new file mode 100644 index 00000000..f3eb808b --- /dev/null +++ b/src/ion/merge.rs @@ -0,0 +1,439 @@ +/* + * The following license applies to this file, which was initially + * derived from the files `js/src/jit/BacktrackingAllocator.h` and + * `js/src/jit/BacktrackingAllocator.cpp` in Mozilla Firefox: + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Since the initial port, the design has been substantially evolved + * and optimized. + */ + +//! Bundle merging. 
+ +use super::{ + Env, LiveBundleIndex, LiveRangeIndex, LiveRangeKey, Requirement, SpillSet, SpillSetIndex, + SpillSlotIndex, VRegIndex, +}; +use crate::{Function, Inst, OperandConstraint, PReg}; +use smallvec::smallvec; + +impl<'a, F: Function> Env<'a, F> { + pub fn merge_bundles(&mut self, from: LiveBundleIndex, to: LiveBundleIndex) -> bool { + if from == to { + // Merge bundle into self -- trivial merge. + return true; + } + log::trace!( + "merging from bundle{} to bundle{}", + from.index(), + to.index() + ); + + // Both bundles must deal with the same RegClass. + let from_rc = self.spillsets[self.bundles[from.index()].spillset.index()].class; + let to_rc = self.spillsets[self.bundles[to.index()].spillset.index()].class; + if from_rc != to_rc { + log::trace!(" -> mismatching reg classes"); + return false; + } + + // If either bundle is already assigned (due to a pinned vreg), don't merge. + if self.bundles[from.index()].allocation.is_some() + || self.bundles[to.index()].allocation.is_some() + { + log::trace!("one of the bundles is already assigned (pinned)"); + return false; + } + + #[cfg(debug_assertions)] + { + // Sanity check: both bundles should contain only ranges with appropriate VReg classes. + for entry in &self.bundles[from.index()].ranges { + let vreg = self.ranges[entry.index.index()].vreg; + assert_eq!(from_rc, self.vreg_regs[vreg.index()].class()); + } + for entry in &self.bundles[to.index()].ranges { + let vreg = self.ranges[entry.index.index()].vreg; + assert_eq!(to_rc, self.vreg_regs[vreg.index()].class()); + } + } + + // Check for overlap in LiveRanges and for conflicting + // requirements. + let ranges_from = &self.bundles[from.index()].ranges[..]; + let ranges_to = &self.bundles[to.index()].ranges[..]; + let mut idx_from = 0; + let mut idx_to = 0; + let mut range_count = 0; + while idx_from < ranges_from.len() && idx_to < ranges_to.len() { + range_count += 1; + if range_count > 200 { + log::trace!( + "reached merge complexity (range_count = {}); exiting", + range_count + ); + // Limit merge complexity. + return false; + } + + if ranges_from[idx_from].range.from >= ranges_to[idx_to].range.to { + idx_to += 1; + } else if ranges_to[idx_to].range.from >= ranges_from[idx_from].range.to { + idx_from += 1; + } else { + // Overlap -- cannot merge. + log::trace!( + " -> overlap between {:?} and {:?}, exiting", + ranges_from[idx_from].index, + ranges_to[idx_to].index + ); + return false; + } + } + + // Check for a requirements conflict. + if self.bundles[from.index()].cached_stack() + || self.bundles[from.index()].cached_fixed() + || self.bundles[to.index()].cached_stack() + || self.bundles[to.index()].cached_fixed() + { + let req = self + .compute_requirement(from) + .merge(self.compute_requirement(to)); + if req == Requirement::Conflict { + log::trace!(" -> conflicting requirements; aborting merge"); + return false; + } + } + + log::trace!(" -> committing to merge"); + + // If we reach here, then the bundles do not overlap -- merge + // them! We do this with a merge-sort-like scan over both + // lists, building a new range list and replacing the list on + // `to` when we're done. + if ranges_from.is_empty() { + // `from` bundle is empty -- trivial merge. + log::trace!(" -> from bundle{} is empty; trivial merge", from.index()); + return true; + } + if ranges_to.is_empty() { + // `to` bundle is empty -- just move the list over from + // `from` and set `bundle` up-link on all ranges. 
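The overlap test above walks the two range lists, each sorted by start point and internally non-overlapping, with two cursors, so a merge attempt costs O(a + b) comparisons. A self-contained sketch of that scan, assuming a simplified half-open Range stand-in rather than the allocator's CodeRange:

// Half-open interval of program points (simplified stand-in).
#[derive(Clone, Copy, Debug)]
struct Range { from: u32, to: u32 }

// Two-pointer scan over two sorted, internally non-overlapping lists:
// returns true as soon as any pair of ranges intersects.
fn lists_overlap(a: &[Range], b: &[Range]) -> bool {
    let (mut i, mut j) = (0, 0);
    while i < a.len() && j < b.len() {
        if a[i].from >= b[j].to {
            j += 1;          // b[j] ends at or before a[i] starts
        } else if b[j].from >= a[i].to {
            i += 1;          // a[i] ends at or before b[j] starts
        } else {
            return true;     // intervals intersect
        }
    }
    false
}

fn main() {
    let a = [Range { from: 0, to: 4 }, Range { from: 10, to: 12 }];
    let b = [Range { from: 4, to: 10 }];
    assert!(!lists_overlap(&a, &b)); // abutting, not overlapping
    let c = [Range { from: 3, to: 5 }];
    assert!(lists_overlap(&a, &c));
}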
+ log::trace!(" -> to bundle{} is empty; trivial merge", to.index()); + let list = std::mem::replace(&mut self.bundles[from.index()].ranges, smallvec![]); + for entry in &list { + self.ranges[entry.index.index()].bundle = to; + + if self.annotations_enabled { + self.annotate( + entry.range.from, + format!( + " MERGE range{} v{} from bundle{} to bundle{}", + entry.index.index(), + self.ranges[entry.index.index()].vreg.index(), + from.index(), + to.index(), + ), + ); + } + } + self.bundles[to.index()].ranges = list; + + if self.bundles[from.index()].cached_stack() { + self.bundles[to.index()].set_cached_stack(); + } + if self.bundles[from.index()].cached_fixed() { + self.bundles[to.index()].set_cached_fixed(); + } + + return true; + } + + log::trace!( + "merging: ranges_from = {:?} ranges_to = {:?}", + ranges_from, + ranges_to + ); + + // Two non-empty lists of LiveRanges: concatenate and + // sort. This is faster than a mergesort-like merge into a new + // list, empirically. + let from_list = std::mem::replace(&mut self.bundles[from.index()].ranges, smallvec![]); + for entry in &from_list { + self.ranges[entry.index.index()].bundle = to; + } + self.bundles[to.index()] + .ranges + .extend_from_slice(&from_list[..]); + self.bundles[to.index()] + .ranges + .sort_unstable_by_key(|entry| entry.range.from); + + if self.annotations_enabled { + log::trace!("merging: merged = {:?}", self.bundles[to.index()].ranges); + let mut last_range = None; + for i in 0..self.bundles[to.index()].ranges.len() { + let entry = self.bundles[to.index()].ranges[i]; + if last_range.is_some() { + assert!(last_range.unwrap() < entry.range); + } + last_range = Some(entry.range); + + if self.ranges[entry.index.index()].bundle == from { + self.annotate( + entry.range.from, + format!( + " MERGE range{} v{} from bundle{} to bundle{}", + entry.index.index(), + self.ranges[entry.index.index()].vreg.index(), + from.index(), + to.index(), + ), + ); + } + + log::trace!( + " -> merged result for bundle{}: range{}", + to.index(), + entry.index.index(), + ); + } + } + + if self.bundles[from.index()].spillset != self.bundles[to.index()].spillset { + let from_vregs = std::mem::replace( + &mut self.spillsets[self.bundles[from.index()].spillset.index()].vregs, + smallvec![], + ); + let to_vregs = &mut self.spillsets[self.bundles[to.index()].spillset.index()].vregs; + for vreg in from_vregs { + if !to_vregs.contains(&vreg) { + to_vregs.push(vreg); + } + } + } + + if self.bundles[from.index()].cached_stack() { + self.bundles[to.index()].set_cached_stack(); + } + if self.bundles[from.index()].cached_fixed() { + self.bundles[to.index()].set_cached_fixed(); + } + + true + } + + pub fn merge_vreg_bundles(&mut self) { + // Create a bundle for every vreg, initially. + log::trace!("merge_vreg_bundles: creating vreg bundles"); + for vreg in 0..self.vregs.len() { + let vreg = VRegIndex::new(vreg); + if self.vregs[vreg.index()].ranges.is_empty() { + continue; + } + + // If this is a pinned vreg, go ahead and add it to the + // commitment map, and avoid creating a bundle entirely. 
+ if self.vregs[vreg.index()].is_pinned { + for entry in &self.vregs[vreg.index()].ranges { + let preg = self + .func + .is_pinned_vreg(self.vreg_regs[vreg.index()]) + .unwrap(); + let key = LiveRangeKey::from_range(&entry.range); + self.pregs[preg.index()] + .allocations + .btree + .insert(key, LiveRangeIndex::invalid()); + } + continue; + } + + let bundle = self.create_bundle(); + self.bundles[bundle.index()].ranges = self.vregs[vreg.index()].ranges.clone(); + log::trace!("vreg v{} gets bundle{}", vreg.index(), bundle.index()); + for entry in &self.bundles[bundle.index()].ranges { + log::trace!( + " -> with LR range{}: {:?}", + entry.index.index(), + entry.range + ); + self.ranges[entry.index.index()].bundle = bundle; + } + + let mut fixed = false; + let mut stack = false; + for entry in &self.bundles[bundle.index()].ranges { + for u in &self.ranges[entry.index.index()].uses { + if let OperandConstraint::FixedReg(_) = u.operand.constraint() { + fixed = true; + } + if let OperandConstraint::Stack = u.operand.constraint() { + stack = true; + } + if fixed && stack { + break; + } + } + } + if fixed { + self.bundles[bundle.index()].set_cached_fixed(); + } + if stack { + self.bundles[bundle.index()].set_cached_stack(); + } + + // Create a spillslot for this bundle. + let ssidx = SpillSetIndex::new(self.spillsets.len()); + let reg = self.vreg_regs[vreg.index()]; + let size = self.func.spillslot_size(reg.class()) as u8; + self.spillsets.push(SpillSet { + vregs: smallvec![vreg], + slot: SpillSlotIndex::invalid(), + size, + required: false, + class: reg.class(), + reg_hint: PReg::invalid(), + spill_bundle: LiveBundleIndex::invalid(), + }); + self.bundles[bundle.index()].spillset = ssidx; + } + + for inst in 0..self.func.num_insts() { + let inst = Inst::new(inst); + + // Attempt to merge Reuse-constraint operand outputs with the + // corresponding inputs. + for op in self.func.inst_operands(inst) { + if let OperandConstraint::Reuse(reuse_idx) = op.constraint() { + let src_vreg = op.vreg(); + let dst_vreg = self.func.inst_operands(inst)[reuse_idx].vreg(); + if self.vregs[src_vreg.vreg()].is_pinned + || self.vregs[dst_vreg.vreg()].is_pinned + { + continue; + } + + log::trace!( + "trying to merge reused-input def: src {} to dst {}", + src_vreg, + dst_vreg + ); + let src_bundle = + self.ranges[self.vregs[src_vreg.vreg()].ranges[0].index.index()].bundle; + assert!(src_bundle.is_valid()); + let dest_bundle = + self.ranges[self.vregs[dst_vreg.vreg()].ranges[0].index.index()].bundle; + assert!(dest_bundle.is_valid()); + self.merge_bundles(/* from */ dest_bundle, /* to */ src_bundle); + } + } + } + + // Attempt to merge blockparams with their inputs. + for i in 0..self.blockparam_outs.len() { + let (from_vreg, _, _, to_vreg) = self.blockparam_outs[i]; + log::trace!( + "trying to merge blockparam v{} with input v{}", + to_vreg.index(), + from_vreg.index() + ); + let to_bundle = self.ranges[self.vregs[to_vreg.index()].ranges[0].index.index()].bundle; + assert!(to_bundle.is_valid()); + let from_bundle = + self.ranges[self.vregs[from_vreg.index()].ranges[0].index.index()].bundle; + assert!(from_bundle.is_valid()); + log::trace!( + " -> from bundle{} to bundle{}", + from_bundle.index(), + to_bundle.index() + ); + self.merge_bundles(from_bundle, to_bundle); + } + + // Attempt to merge move srcs/dsts. 
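`resolve_merged_lr`, used by the move-merging loop just below, chases `merged_into` links to the surviving representative range, with an iteration cap as a safety valve. A tiny standalone sketch of that chase over plain indices (NONE here is a hypothetical stand-in for the invalid index):

const NONE: usize = usize::MAX;

// Follow merged_into links until reaching a range that was not merged
// away, bounding the walk so a malformed chain cannot loop forever.
fn resolve(merged_into: &[usize], mut lr: usize) -> usize {
    let mut iter = 0;
    while iter < 100 && merged_into[lr] != NONE {
        lr = merged_into[lr];
        iter += 1;
    }
    lr
}

fn main() {
    // LR 0 was merged into 2, and 2 into 3; 3 is the representative.
    let merged_into = [2, NONE, 3, NONE];
    assert_eq!(resolve(&merged_into, 0), 3);
    assert_eq!(resolve(&merged_into, 1), 1);
}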
+ for i in 0..self.prog_move_merges.len() { + let (src, dst) = self.prog_move_merges[i]; + log::trace!("trying to merge move src LR {:?} to dst LR {:?}", src, dst); + let src = self.resolve_merged_lr(src); + let dst = self.resolve_merged_lr(dst); + log::trace!( + "resolved LR-construction merging chains: move-merge is now src LR {:?} to dst LR {:?}", + src, + dst + ); + + let dst_vreg = self.vreg_regs[self.ranges[dst.index()].vreg.index()]; + let src_vreg = self.vreg_regs[self.ranges[src.index()].vreg.index()]; + if self.vregs[src_vreg.vreg()].is_pinned && self.vregs[dst_vreg.vreg()].is_pinned { + continue; + } + if self.vregs[src_vreg.vreg()].is_pinned { + let dest_bundle = self.ranges[dst.index()].bundle; + let spillset = self.bundles[dest_bundle.index()].spillset; + self.spillsets[spillset.index()].reg_hint = + self.func.is_pinned_vreg(src_vreg).unwrap(); + continue; + } + if self.vregs[dst_vreg.vreg()].is_pinned { + let src_bundle = self.ranges[src.index()].bundle; + let spillset = self.bundles[src_bundle.index()].spillset; + self.spillsets[spillset.index()].reg_hint = + self.func.is_pinned_vreg(dst_vreg).unwrap(); + continue; + } + + let src_bundle = self.ranges[src.index()].bundle; + assert!(src_bundle.is_valid()); + let dest_bundle = self.ranges[dst.index()].bundle; + assert!(dest_bundle.is_valid()); + self.stats.prog_move_merge_attempt += 1; + if self.merge_bundles(/* from */ dest_bundle, /* to */ src_bundle) { + self.stats.prog_move_merge_success += 1; + } + } + + log::trace!("done merging bundles"); + } + + pub fn resolve_merged_lr(&self, mut lr: LiveRangeIndex) -> LiveRangeIndex { + let mut iter = 0; + while iter < 100 && self.ranges[lr.index()].merged_into.is_valid() { + lr = self.ranges[lr.index()].merged_into; + iter += 1; + } + lr + } + + pub fn compute_bundle_prio(&self, bundle: LiveBundleIndex) -> u32 { + // The priority is simply the total "length" -- the number of + // instructions covered by all LiveRanges. + let mut total = 0; + for entry in &self.bundles[bundle.index()].ranges { + total += entry.range.len() as u32; + } + total + } + + pub fn queue_bundles(&mut self) { + for bundle in 0..self.bundles.len() { + log::trace!("enqueueing bundle{}", bundle); + if self.bundles[bundle].ranges.is_empty() { + log::trace!(" -> no ranges; skipping"); + continue; + } + let bundle = LiveBundleIndex::new(bundle); + let prio = self.compute_bundle_prio(bundle); + log::trace!(" -> prio {}", prio); + self.bundles[bundle.index()].prio = prio; + self.recompute_bundle_properties(bundle); + self.allocation_queue + .insert(bundle, prio as usize, PReg::invalid()); + } + self.stats.merged_bundle_count = self.allocation_queue.heap.len(); + } +} diff --git a/src/ion/mod.rs b/src/ion/mod.rs new file mode 100644 index 00000000..e2d73b58 --- /dev/null +++ b/src/ion/mod.rs @@ -0,0 +1,145 @@ +/* + * The following license applies to this file, which was initially + * derived from the files `js/src/jit/BacktrackingAllocator.h` and + * `js/src/jit/BacktrackingAllocator.cpp` in Mozilla Firefox: + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Since the initial port, the design has been substantially evolved + * and optimized. + */ + +//! Backtracking register allocator. See doc/DESIGN.md for details of +//! its design. 
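`compute_bundle_prio` and `queue_bundles` above order work by bundle "length" -- the total extent covered by a bundle's ranges -- so larger, harder-to-place bundles are considered first. A rough standalone sketch of that ordering, using a std BinaryHeap in place of the allocator's PrioQueue and a simplified Range stand-in (the real priority counts instructions via range.len()):

use std::collections::BinaryHeap;

// Simplified stand-in: a span measured in program points.
#[derive(Clone, Copy)]
struct Range { from: u32, to: u32 }

// Priority of a bundle = total extent covered by its ranges.
fn bundle_prio(ranges: &[Range]) -> u32 {
    ranges.iter().map(|r| r.to - r.from).sum()
}

fn main() {
    // Three hypothetical bundles with their range lists.
    let bundles = vec![
        vec![Range { from: 0, to: 2 }],                             // prio 2
        vec![Range { from: 0, to: 8 }, Range { from: 20, to: 24 }], // prio 12
        vec![Range { from: 4, to: 9 }],                             // prio 5
    ];

    // A max-heap keyed by priority plays the role of the allocation
    // queue: pop order is highest priority first.
    let mut queue: BinaryHeap<(u32, usize)> = bundles
        .iter()
        .enumerate()
        .map(|(i, rs)| (bundle_prio(rs), i))
        .collect();

    let order: Vec<usize> = std::iter::from_fn(|| queue.pop().map(|(_, i)| i)).collect();
    assert_eq!(order, vec![1, 2, 0]);
}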
+ +use crate::cfg::CFGInfo; +use crate::{Function, MachineEnv, Output, PReg, ProgPoint, RegAllocError, RegClass}; +use std::collections::HashMap; + +pub(crate) mod data_structures; +pub use data_structures::Stats; +use data_structures::*; +pub(crate) mod reg_traversal; +use reg_traversal::*; +pub(crate) mod requirement; +use requirement::*; +pub(crate) mod redundant_moves; +use redundant_moves::*; +pub(crate) mod liveranges; +use liveranges::*; +pub(crate) mod merge; +pub(crate) mod process; +use process::*; +pub(crate) mod dump; +pub(crate) mod moves; +pub(crate) mod spill; +pub(crate) mod stackmap; + +impl<'a, F: Function> Env<'a, F> { + pub(crate) fn new( + func: &'a F, + env: &'a MachineEnv, + cfginfo: CFGInfo, + annotations_enabled: bool, + ) -> Self { + let n = func.num_insts(); + Self { + func, + env, + cfginfo, + + liveins: Vec::with_capacity(func.num_blocks()), + liveouts: Vec::with_capacity(func.num_blocks()), + blockparam_outs: vec![], + blockparam_ins: vec![], + blockparam_allocs: vec![], + bundles: Vec::with_capacity(n), + ranges: Vec::with_capacity(4 * n), + spillsets: Vec::with_capacity(n), + vregs: Vec::with_capacity(n), + vreg_regs: Vec::with_capacity(n), + pregs: vec![], + allocation_queue: PrioQueue::new(), + clobbers: vec![], + safepoints: vec![], + safepoints_per_vreg: HashMap::new(), + spilled_bundles: vec![], + spillslots: vec![], + slots_by_size: vec![], + allocated_bundle_count: 0, + + extra_spillslot: vec![None, None], + + prog_move_srcs: Vec::with_capacity(n / 2), + prog_move_dsts: Vec::with_capacity(n / 2), + prog_move_merges: Vec::with_capacity(n / 2), + + multi_fixed_reg_fixups: vec![], + inserted_moves: vec![], + edits: Vec::with_capacity(n), + allocs: Vec::with_capacity(4 * n), + inst_alloc_offsets: vec![], + num_spillslots: 0, + safepoint_slots: vec![], + + stats: Stats::default(), + + debug_annotations: std::collections::HashMap::new(), + annotations_enabled, + } + } + + pub(crate) fn init(&mut self) -> Result<(), RegAllocError> { + self.create_pregs_and_vregs(); + self.compute_liveness()?; + self.merge_vreg_bundles(); + self.queue_bundles(); + if log::log_enabled!(log::Level::Trace) { + self.dump_state(); + } + Ok(()) + } + + pub(crate) fn run(&mut self) -> Result<(), RegAllocError> { + self.process_bundles()?; + self.try_allocating_regs_for_spilled_bundles(); + self.allocate_spillslots(); + self.apply_allocations_and_insert_moves(); + self.resolve_inserted_moves(); + self.compute_stackmaps(); + Ok(()) + } +} + +pub fn run( + func: &F, + mach_env: &MachineEnv, + enable_annotations: bool, +) -> Result { + let cfginfo = CFGInfo::new(func)?; + + let mut env = Env::new(func, mach_env, cfginfo, enable_annotations); + env.init()?; + + env.run()?; + + if enable_annotations { + env.dump_results(); + } + + Ok(Output { + edits: env + .edits + .into_iter() + .map(|(pos, _, edit)| (ProgPoint::from_index(pos), edit)) + .collect(), + allocs: env.allocs, + inst_alloc_offsets: env.inst_alloc_offsets, + num_spillslots: env.num_spillslots as usize, + debug_locations: vec![], + safepoint_slots: env.safepoint_slots, + stats: env.stats, + }) +} diff --git a/src/ion/moves.rs b/src/ion/moves.rs new file mode 100644 index 00000000..56336c4e --- /dev/null +++ b/src/ion/moves.rs @@ -0,0 +1,1164 @@ +/* + * The following license applies to this file, which was initially + * derived from the files `js/src/jit/BacktrackingAllocator.h` and + * `js/src/jit/BacktrackingAllocator.cpp` in Mozilla Firefox: + * + * This Source Code Form is subject to the terms of the Mozilla Public + * 
License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Since the initial port, the design has been substantially evolved + * and optimized. + */ + +//! Move resolution. + +use super::{ + Env, InsertMovePrio, InsertedMove, LiveRangeFlag, LiveRangeIndex, RedundantMoveEliminator, + VRegIndex, SLOT_NONE, +}; + +use crate::moves::ParallelMoves; +use crate::{ + Allocation, Block, Edit, Function, Inst, InstPosition, OperandConstraint, OperandKind, + OperandPos, ProgPoint, RegClass, VReg, +}; +use smallvec::{smallvec, SmallVec}; +use std::fmt::Debug; + +impl<'a, F: Function> Env<'a, F> { + pub fn is_start_of_block(&self, pos: ProgPoint) -> bool { + let block = self.cfginfo.insn_block[pos.inst().index()]; + pos == self.cfginfo.block_entry[block.index()] + } + pub fn is_end_of_block(&self, pos: ProgPoint) -> bool { + let block = self.cfginfo.insn_block[pos.inst().index()]; + pos == self.cfginfo.block_exit[block.index()] + } + + pub fn insert_move( + &mut self, + pos: ProgPoint, + prio: InsertMovePrio, + from_alloc: Allocation, + to_alloc: Allocation, + to_vreg: Option, + ) { + log::trace!( + "insert_move: pos {:?} prio {:?} from_alloc {:?} to_alloc {:?}", + pos, + prio, + from_alloc, + to_alloc + ); + match (from_alloc.as_reg(), to_alloc.as_reg()) { + (Some(from), Some(to)) => { + assert_eq!(from.class(), to.class()); + } + _ => {} + } + self.inserted_moves.push(InsertedMove { + pos, + prio, + from_alloc, + to_alloc, + to_vreg, + }); + } + + pub fn get_alloc(&self, inst: Inst, slot: usize) -> Allocation { + let inst_allocs = &self.allocs[self.inst_alloc_offsets[inst.index()] as usize..]; + inst_allocs[slot] + } + + pub fn set_alloc(&mut self, inst: Inst, slot: usize, alloc: Allocation) { + let inst_allocs = &mut self.allocs[self.inst_alloc_offsets[inst.index()] as usize..]; + inst_allocs[slot] = alloc; + } + + pub fn get_alloc_for_range(&self, range: LiveRangeIndex) -> Allocation { + log::trace!("get_alloc_for_range: {:?}", range); + let bundle = self.ranges[range.index()].bundle; + log::trace!(" -> bundle: {:?}", bundle); + let bundledata = &self.bundles[bundle.index()]; + log::trace!(" -> allocation {:?}", bundledata.allocation); + if bundledata.allocation != Allocation::none() { + bundledata.allocation + } else { + log::trace!(" -> spillset {:?}", bundledata.spillset); + log::trace!( + " -> spill slot {:?}", + self.spillsets[bundledata.spillset.index()].slot + ); + self.spillslots[self.spillsets[bundledata.spillset.index()].slot.index()].alloc + } + } + + pub fn apply_allocations_and_insert_moves(&mut self) { + log::trace!("apply_allocations_and_insert_moves"); + log::trace!("blockparam_ins: {:?}", self.blockparam_ins); + log::trace!("blockparam_outs: {:?}", self.blockparam_outs); + + // Now that all splits are done, we can pay the cost once to + // sort VReg range lists and update with the final ranges. + for vreg in &mut self.vregs { + for entry in &mut vreg.ranges { + entry.range = self.ranges[entry.index.index()].range; + } + vreg.ranges.sort_unstable_by_key(|entry| entry.range.from); + } + + /// We create "half-moves" in order to allow a single-scan + /// strategy with a subsequent sort. Basically, the key idea + /// is that as our single scan through a range for a vreg hits + /// upon the source or destination of an edge-move, we emit a + /// "half-move". These half-moves are carefully keyed in a + /// particular sort order (the field order below is + /// significant!) 
so that all half-moves on a given (from, to) + /// block-edge appear contiguously, and then all moves from a + /// given vreg appear contiguously. Within a given from-vreg, + /// pick the first `Source` (there should only be one, but + /// imprecision in liveranges due to loop handling sometimes + /// means that a blockparam-out is also recognized as a normal-out), + /// and then for each `Dest`, copy the source-alloc to that + /// dest-alloc. + #[derive(Clone, Debug, PartialEq, Eq)] + struct HalfMove { + key: u64, + alloc: Allocation, + } + #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] + #[repr(u8)] + enum HalfMoveKind { + Source = 0, + Dest = 1, + } + fn half_move_key( + from_block: Block, + to_block: Block, + to_vreg: VRegIndex, + kind: HalfMoveKind, + ) -> u64 { + assert!(from_block.index() < 1 << 21); + assert!(to_block.index() < 1 << 21); + assert!(to_vreg.index() < 1 << 21); + ((from_block.index() as u64) << 43) + | ((to_block.index() as u64) << 22) + | ((to_vreg.index() as u64) << 1) + | (kind as u8 as u64) + } + impl HalfMove { + fn from_block(&self) -> Block { + Block::new(((self.key >> 43) & ((1 << 21) - 1)) as usize) + } + fn to_block(&self) -> Block { + Block::new(((self.key >> 22) & ((1 << 21) - 1)) as usize) + } + fn to_vreg(&self) -> VRegIndex { + VRegIndex::new(((self.key >> 1) & ((1 << 21) - 1)) as usize) + } + fn kind(&self) -> HalfMoveKind { + if self.key & 1 == 1 { + HalfMoveKind::Dest + } else { + HalfMoveKind::Source + } + } + } + + let mut half_moves: Vec = Vec::with_capacity(6 * self.func.num_insts()); + let mut reuse_input_insts = Vec::with_capacity(self.func.num_insts() / 2); + + let mut blockparam_in_idx = 0; + let mut blockparam_out_idx = 0; + let mut prog_move_src_idx = 0; + let mut prog_move_dst_idx = 0; + for vreg in 0..self.vregs.len() { + let vreg = VRegIndex::new(vreg); + + let pinned_alloc = if self.vregs[vreg.index()].is_pinned { + self.func.is_pinned_vreg(self.vreg_regs[vreg.index()]) + } else { + None + }; + + // For each range in each vreg, insert moves or + // half-moves. We also scan over `blockparam_ins` and + // `blockparam_outs`, which are sorted by (block, vreg), + // and over program-move srcs/dsts to fill in allocations. + let mut prev = LiveRangeIndex::invalid(); + for range_idx in 0..self.vregs[vreg.index()].ranges.len() { + let entry = self.vregs[vreg.index()].ranges[range_idx]; + let alloc = pinned_alloc + .map(|preg| Allocation::reg(preg)) + .unwrap_or_else(|| self.get_alloc_for_range(entry.index)); + let range = entry.range; + log::trace!( + "apply_allocations: vreg {:?} LR {:?} with range {:?} has alloc {:?} (pinned {:?})", + vreg, + entry.index, + range, + alloc, + pinned_alloc, + ); + debug_assert!(alloc != Allocation::none()); + + if self.annotations_enabled { + self.annotate( + range.from, + format!( + " <<< start v{} in {} (range{}) (bundle{})", + vreg.index(), + alloc, + entry.index.index(), + self.ranges[entry.index.index()].bundle.raw_u32(), + ), + ); + self.annotate( + range.to, + format!( + " end v{} in {} (range{}) (bundle{}) >>>", + vreg.index(), + alloc, + entry.index.index(), + self.ranges[entry.index.index()].bundle.raw_u32(), + ), + ); + } + + // Does this range follow immediately after a prior + // range in the same block? If so, insert a move (if + // the allocs differ). We do this directly rather than + // with half-moves because we eagerly know both sides + // already (and also, half-moves are specific to + // inter-block transfers). 
+ // + // Note that we do *not* do this if there is also a + // def as the first use in the new range: it's + // possible that an old liverange covers the Before + // pos of an inst, a new liverange covers the After + // pos, and the def also happens at After. In this + // case we don't want to an insert a move after the + // instruction copying the old liverange. + // + // Note also that we assert that the new range has to + // start at the Before-point of an instruction; we + // can't insert a move that logically happens just + // before After (i.e. in the middle of a single + // instruction). + // + // Also note that this case is not applicable to + // pinned vregs (because they are always in one PReg). + if pinned_alloc.is_none() && prev.is_valid() { + let prev_alloc = self.get_alloc_for_range(prev); + let prev_range = self.ranges[prev.index()].range; + let first_is_def = + self.ranges[entry.index.index()].has_flag(LiveRangeFlag::StartsAtDef); + debug_assert!(prev_alloc != Allocation::none()); + + if prev_range.to == range.from + && !self.is_start_of_block(range.from) + && !first_is_def + { + log::trace!( + "prev LR {} abuts LR {} in same block; moving {} -> {} for v{}", + prev.index(), + entry.index.index(), + prev_alloc, + alloc, + vreg.index() + ); + assert_eq!(range.from.pos(), InstPosition::Before); + self.insert_move( + range.from, + InsertMovePrio::Regular, + prev_alloc, + alloc, + Some(self.vreg_regs[vreg.index()]), + ); + } + } + + // The block-to-block edge-move logic is not + // applicable to pinned vregs, which are always in one + // PReg (so never need moves within their own vreg + // ranges). + if pinned_alloc.is_none() { + // Scan over blocks whose ends are covered by this + // range. For each, for each successor that is not + // already in this range (hence guaranteed to have the + // same allocation) and if the vreg is live, add a + // Source half-move. + let mut block = self.cfginfo.insn_block[range.from.inst().index()]; + while block.is_valid() && block.index() < self.func.num_blocks() { + if range.to < self.cfginfo.block_exit[block.index()].next() { + break; + } + log::trace!("examining block with end in range: block{}", block.index()); + for &succ in self.func.block_succs(block) { + log::trace!( + " -> has succ block {} with entry {:?}", + succ.index(), + self.cfginfo.block_entry[succ.index()] + ); + if range.contains_point(self.cfginfo.block_entry[succ.index()]) { + continue; + } + log::trace!(" -> out of this range, requires half-move if live"); + if self.is_live_in(succ, vreg) { + log::trace!(" -> live at input to succ, adding halfmove"); + half_moves.push(HalfMove { + key: half_move_key(block, succ, vreg, HalfMoveKind::Source), + alloc, + }); + } + } + + // Scan forward in `blockparam_outs`, adding all + // half-moves for outgoing values to blockparams + // in succs. 
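The half-move mechanism hinges on the packed 64-bit key defined above: sorting by it makes all half-moves for one (from-block, to-block, vreg) edge contiguous, with the Source sorting just before its Dests. A standalone sketch demonstrating that grouping property, using the same bit layout (21 bits per field, one kind bit):

// Kind occupies the low bit, so Source (0) sorts before Dest (1).
#[derive(Clone, Copy)]
enum Kind { Source = 0, Dest = 1 }

fn key(from_block: u64, to_block: u64, vreg: u64, kind: Kind) -> u64 {
    assert!(from_block < (1 << 21) && to_block < (1 << 21) && vreg < (1 << 21));
    (from_block << 43) | (to_block << 22) | (vreg << 1) | (kind as u64)
}

fn main() {
    // Half-moves produced in arbitrary order during the per-vreg scan...
    let mut moves = vec![
        key(2, 3, 7, Kind::Dest),
        key(1, 3, 7, Kind::Source),
        key(1, 3, 7, Kind::Dest),
        key(2, 3, 7, Kind::Source),
    ];
    // ...sort by key: each (from, to, vreg) edge becomes contiguous and the
    // Source precedes its Dests, which is what the edge-move emission loop
    // relies on.
    moves.sort_unstable();
    assert_eq!(
        moves,
        vec![
            key(1, 3, 7, Kind::Source),
            key(1, 3, 7, Kind::Dest),
            key(2, 3, 7, Kind::Source),
            key(2, 3, 7, Kind::Dest),
        ]
    );
}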
+ log::trace!( + "scanning blockparam_outs for v{} block{}: blockparam_out_idx = {}", + vreg.index(), + block.index(), + blockparam_out_idx, + ); + while blockparam_out_idx < self.blockparam_outs.len() { + let (from_vreg, from_block, to_block, to_vreg) = + self.blockparam_outs[blockparam_out_idx]; + if (from_vreg, from_block) > (vreg, block) { + break; + } + if (from_vreg, from_block) == (vreg, block) { + log::trace!( + " -> found: from v{} block{} to v{} block{}", + from_vreg.index(), + from_block.index(), + to_vreg.index(), + to_vreg.index() + ); + half_moves.push(HalfMove { + key: half_move_key( + from_block, + to_block, + to_vreg, + HalfMoveKind::Source, + ), + alloc, + }); + + if self.annotations_enabled { + self.annotate( + self.cfginfo.block_exit[block.index()], + format!( + "blockparam-out: block{} to block{}: v{} to v{} in {}", + from_block.index(), + to_block.index(), + from_vreg.index(), + to_vreg.index(), + alloc + ), + ); + } + } + + blockparam_out_idx += 1; + } + + block = block.next(); + } + + // Scan over blocks whose beginnings are covered by + // this range and for which the vreg is live at the + // start of the block. For each, for each predecessor, + // add a Dest half-move. + let mut block = self.cfginfo.insn_block[range.from.inst().index()]; + if self.cfginfo.block_entry[block.index()] < range.from { + block = block.next(); + } + while block.is_valid() && block.index() < self.func.num_blocks() { + if self.cfginfo.block_entry[block.index()] >= range.to { + break; + } + + // Add half-moves for blockparam inputs. + log::trace!( + "scanning blockparam_ins at vreg {} block {}: blockparam_in_idx = {}", + vreg.index(), + block.index(), + blockparam_in_idx + ); + while blockparam_in_idx < self.blockparam_ins.len() { + let (to_vreg, to_block, from_block) = + self.blockparam_ins[blockparam_in_idx]; + if (to_vreg, to_block) > (vreg, block) { + break; + } + if (to_vreg, to_block) == (vreg, block) { + half_moves.push(HalfMove { + key: half_move_key( + from_block, + to_block, + to_vreg, + HalfMoveKind::Dest, + ), + alloc, + }); + log::trace!( + "match: blockparam_in: v{} in block{} from block{} into {}", + to_vreg.index(), + to_block.index(), + from_block.index(), + alloc, + ); + #[cfg(debug_assertions)] + { + if log::log_enabled!(log::Level::Trace) { + self.annotate( + self.cfginfo.block_entry[block.index()], + format!( + "blockparam-in: block{} to block{}:into v{} in {}", + from_block.index(), + to_block.index(), + to_vreg.index(), + alloc + ), + ); + } + } + } + blockparam_in_idx += 1; + } + + if !self.is_live_in(block, vreg) { + block = block.next(); + continue; + } + + log::trace!( + "scanning preds at vreg {} block {} for ends outside the range", + vreg.index(), + block.index() + ); + + // Now find any preds whose ends are not in the + // same range, and insert appropriate moves. + for &pred in self.func.block_preds(block) { + log::trace!( + "pred block {} has exit {:?}", + pred.index(), + self.cfginfo.block_exit[pred.index()] + ); + if range.contains_point(self.cfginfo.block_exit[pred.index()]) { + continue; + } + log::trace!(" -> requires half-move"); + half_moves.push(HalfMove { + key: half_move_key(pred, block, vreg, HalfMoveKind::Dest), + alloc, + }); + } + + block = block.next(); + } + + // If this is a blockparam vreg and the start of block + // is in this range, add to blockparam_allocs. 
+ let (blockparam_block, blockparam_idx) = + self.cfginfo.vreg_def_blockparam[vreg.index()]; + if blockparam_block.is_valid() + && range.contains_point(self.cfginfo.block_entry[blockparam_block.index()]) + { + self.blockparam_allocs.push(( + blockparam_block, + blockparam_idx, + vreg, + alloc, + )); + } + } + + // Scan over def/uses and apply allocations. + for use_idx in 0..self.ranges[entry.index.index()].uses.len() { + let usedata = self.ranges[entry.index.index()].uses[use_idx]; + log::trace!("applying to use: {:?}", usedata); + debug_assert!(range.contains_point(usedata.pos)); + let inst = usedata.pos.inst(); + let slot = usedata.slot; + let operand = usedata.operand; + // Safepoints add virtual uses with no slots; + // avoid these. + if slot != SLOT_NONE { + self.set_alloc(inst, slot as usize, alloc); + } + if let OperandConstraint::Reuse(_) = operand.constraint() { + reuse_input_insts.push(inst); + } + } + + // Scan over program move srcs/dsts to fill in allocations. + + // Move srcs happen at `After` of a given + // inst. Compute [from, to) semi-inclusive range of + // inst indices for which we should fill in the source + // with this LR's allocation. + // + // range from inst-Before or inst-After covers cur + // inst's After; so includes move srcs from inst. + let move_src_start = (vreg, range.from.inst()); + // range to (exclusive) inst-Before or inst-After + // covers only prev inst's After; so includes move + // srcs to (exclusive) inst. + let move_src_end = (vreg, range.to.inst()); + log::trace!( + "vreg {:?} range {:?}: looking for program-move sources from {:?} to {:?}", + vreg, + range, + move_src_start, + move_src_end + ); + while prog_move_src_idx < self.prog_move_srcs.len() + && self.prog_move_srcs[prog_move_src_idx].0 < move_src_start + { + log::trace!(" -> skipping idx {}", prog_move_src_idx); + prog_move_src_idx += 1; + } + while prog_move_src_idx < self.prog_move_srcs.len() + && self.prog_move_srcs[prog_move_src_idx].0 < move_src_end + { + log::trace!( + " -> setting idx {} ({:?}) to alloc {:?}", + prog_move_src_idx, + self.prog_move_srcs[prog_move_src_idx].0, + alloc + ); + self.prog_move_srcs[prog_move_src_idx].1 = alloc; + prog_move_src_idx += 1; + } + + // move dsts happen at Before point. + // + // Range from inst-Before includes cur inst, while inst-After includes only next inst. + let move_dst_start = if range.from.pos() == InstPosition::Before { + (vreg, range.from.inst()) + } else { + (vreg, range.from.inst().next()) + }; + // Range to (exclusive) inst-Before includes prev + // inst, so to (exclusive) cur inst; range to + // (exclusive) inst-After includes cur inst, so to + // (exclusive) next inst. 
+ let move_dst_end = if range.to.pos() == InstPosition::Before { + (vreg, range.to.inst()) + } else { + (vreg, range.to.inst().next()) + }; + log::trace!( + "vreg {:?} range {:?}: looking for program-move dests from {:?} to {:?}", + vreg, + range, + move_dst_start, + move_dst_end + ); + while prog_move_dst_idx < self.prog_move_dsts.len() + && self.prog_move_dsts[prog_move_dst_idx].0 < move_dst_start + { + log::trace!(" -> skipping idx {}", prog_move_dst_idx); + prog_move_dst_idx += 1; + } + while prog_move_dst_idx < self.prog_move_dsts.len() + && self.prog_move_dsts[prog_move_dst_idx].0 < move_dst_end + { + log::trace!( + " -> setting idx {} ({:?}) to alloc {:?}", + prog_move_dst_idx, + self.prog_move_dsts[prog_move_dst_idx].0, + alloc + ); + self.prog_move_dsts[prog_move_dst_idx].1 = alloc; + prog_move_dst_idx += 1; + } + + prev = entry.index; + } + } + + // Sort the half-moves list. For each (from, to, + // from-vreg) tuple, find the from-alloc and all the + // to-allocs, and insert moves on the block edge. + half_moves.sort_unstable_by_key(|h| h.key); + log::trace!("halfmoves: {:?}", half_moves); + self.stats.halfmoves_count = half_moves.len(); + + let mut i = 0; + while i < half_moves.len() { + // Find a Source. + while i < half_moves.len() && half_moves[i].kind() != HalfMoveKind::Source { + i += 1; + } + if i >= half_moves.len() { + break; + } + let src = &half_moves[i]; + i += 1; + + // Find all Dests. + let dest_key = src.key | 1; + let first_dest = i; + while i < half_moves.len() && half_moves[i].key == dest_key { + i += 1; + } + let last_dest = i; + + log::trace!( + "halfmove match: src {:?} dests {:?}", + src, + &half_moves[first_dest..last_dest] + ); + + // Determine the ProgPoint where moves on this (from, to) + // edge should go: + // - If there is more than one in-edge to `to`, then + // `from` must have only one out-edge; moves go at tail of + // `from` just before last Branch/Ret. + // - Otherwise, there must be at most one in-edge to `to`, + // and moves go at start of `to`. + let from_last_insn = self.func.block_insns(src.from_block()).last(); + let to_first_insn = self.func.block_insns(src.to_block()).first(); + let from_is_ret = self.func.is_ret(from_last_insn); + let to_is_entry = self.func.entry_block() == src.to_block(); + let from_outs = + self.func.block_succs(src.from_block()).len() + if from_is_ret { 1 } else { 0 }; + let to_ins = + self.func.block_preds(src.to_block()).len() + if to_is_entry { 1 } else { 0 }; + + let (insertion_point, prio) = if to_ins > 1 && from_outs <= 1 { + ( + // N.B.: though semantically the edge moves happen + // after the branch, we must insert them before + // the branch because otherwise, of course, they + // would never execute. This is correct even in + // the presence of branches that read register + // inputs (e.g. conditional branches on some RISCs + // that branch on reg zero/not-zero, or any + // indirect branch), but for a very subtle reason: + // all cases of such branches will (or should) + // have multiple successors, and thus due to + // critical-edge splitting, their successors will + // have only the single predecessor, and we prefer + // to insert at the head of the successor in that + // case (rather than here). We make this a + // requirement, in fact: the user of this library + // shall not read registers in a branch + // instruction of there is only one successor per + // the given CFG information. 
+ ProgPoint::before(from_last_insn), + InsertMovePrio::OutEdgeMoves, + ) + } else if to_ins <= 1 { + ( + ProgPoint::before(to_first_insn), + InsertMovePrio::InEdgeMoves, + ) + } else { + panic!( + "Critical edge: can't insert moves between blocks {:?} and {:?}", + src.from_block(), + src.to_block() + ); + }; + + let mut last = None; + for dest in first_dest..last_dest { + let dest = &half_moves[dest]; + if last == Some(dest.alloc) { + continue; + } + self.insert_move( + insertion_point, + prio, + src.alloc, + dest.alloc, + Some(self.vreg_regs[dest.to_vreg().index()]), + ); + last = Some(dest.alloc); + } + } + + // Handle multi-fixed-reg constraints by copying. + for (progpoint, from_preg, to_preg, slot) in + std::mem::replace(&mut self.multi_fixed_reg_fixups, vec![]) + { + log::trace!( + "multi-fixed-move constraint at {:?} from p{} to p{}", + progpoint, + from_preg.index(), + to_preg.index() + ); + self.insert_move( + progpoint, + InsertMovePrio::MultiFixedReg, + Allocation::reg(self.pregs[from_preg.index()].reg), + Allocation::reg(self.pregs[to_preg.index()].reg), + None, + ); + self.set_alloc( + progpoint.inst(), + slot, + Allocation::reg(self.pregs[to_preg.index()].reg), + ); + } + + // Handle outputs that reuse inputs: copy beforehand, then set + // input's alloc to output's. + // + // Note that the output's allocation may not *actually* be + // valid until InstPosition::After, but the reused input may + // occur at InstPosition::Before. This may appear incorrect, + // but we make it work by ensuring that all *other* inputs are + // extended to InstPosition::After so that the def will not + // interfere. (The liveness computation code does this -- we + // do not require the user to do so.) + // + // One might ask: why not insist that input-reusing defs occur + // at InstPosition::Before? this would be correct, but would + // mean that the reused input and the reusing output + // interfere, *guaranteeing* that every such case would + // require a move. This is really bad on ISAs (like x86) where + // reused inputs are ubiquitous. + // + // Another approach might be to put the def at Before, and + // trim the reused input's liverange back to the previous + // instruction's After. This is kind of OK until (i) a block + // boundary occurs between the prior inst and this one, or + // (ii) any moves/spills/reloads occur between the two + // instructions. We really do need the input to be live at + // this inst's Before. + // + // In principle what we really need is a "BeforeBefore" + // program point, but we don't want to introduce that + // everywhere and pay the cost of twice as many ProgPoints + // throughout the allocator. + // + // Or we could introduce a separate move instruction -- this + // is the approach that regalloc.rs takes with "mod" operands + // -- but that is also costly. + // + // So we take this approach (invented by IonMonkey -- somewhat + // hard to discern, though see [0] for a comment that makes + // this slightly less unclear) to avoid interference between + // the actual reused input and reusing output, ensure + // interference (hence no incorrectness) between other inputs + // and the reusing output, and not require a separate explicit + // move instruction. 
+ // + // [0] https://searchfox.org/mozilla-central/rev/3a798ef9252896fb389679f06dd3203169565af0/js/src/jit/shared/Lowering-shared-inl.h#108-110 + for inst in reuse_input_insts { + let mut input_reused: SmallVec<[usize; 4]> = smallvec![]; + for output_idx in 0..self.func.inst_operands(inst).len() { + let operand = self.func.inst_operands(inst)[output_idx]; + if let OperandConstraint::Reuse(input_idx) = operand.constraint() { + debug_assert!(!input_reused.contains(&input_idx)); + debug_assert_eq!(operand.pos(), OperandPos::Late); + input_reused.push(input_idx); + let input_alloc = self.get_alloc(inst, input_idx); + let output_alloc = self.get_alloc(inst, output_idx); + log::trace!( + "reuse-input inst {:?}: output {} has alloc {:?}, input {} has alloc {:?}", + inst, + output_idx, + output_alloc, + input_idx, + input_alloc + ); + if input_alloc != output_alloc { + #[cfg(debug_assertions)] + { + if log::log_enabled!(log::Level::Trace) { + self.annotate( + ProgPoint::before(inst), + format!( + " reuse-input-copy: {} -> {}", + input_alloc, output_alloc + ), + ); + } + } + let input_operand = self.func.inst_operands(inst)[input_idx]; + self.insert_move( + ProgPoint::before(inst), + InsertMovePrio::ReusedInput, + input_alloc, + output_alloc, + Some(input_operand.vreg()), + ); + self.set_alloc(inst, input_idx, output_alloc); + } + } + } + } + + // Sort the prog-moves lists and insert moves to reify the + // input program's move operations. + self.prog_move_srcs + .sort_unstable_by_key(|((_, inst), _)| *inst); + self.prog_move_dsts + .sort_unstable_by_key(|((_, inst), _)| inst.prev()); + let prog_move_srcs = std::mem::replace(&mut self.prog_move_srcs, vec![]); + let prog_move_dsts = std::mem::replace(&mut self.prog_move_dsts, vec![]); + assert_eq!(prog_move_srcs.len(), prog_move_dsts.len()); + for (&((_, from_inst), from_alloc), &((to_vreg, to_inst), to_alloc)) in + prog_move_srcs.iter().zip(prog_move_dsts.iter()) + { + log::trace!( + "program move at inst {:?}: alloc {:?} -> {:?} (v{})", + from_inst, + from_alloc, + to_alloc, + to_vreg.index(), + ); + assert!(from_alloc.is_some()); + assert!(to_alloc.is_some()); + assert_eq!(from_inst, to_inst.prev()); + // N.B.: these moves happen with the *same* priority as + // LR-to-LR moves, because they work just like them: they + // connect a use at one progpoint (move-After) with a def + // at an adjacent progpoint (move+1-Before), so they must + // happen in parallel with all other LR-to-LR moves. + self.insert_move( + ProgPoint::before(to_inst), + InsertMovePrio::Regular, + from_alloc, + to_alloc, + Some(self.vreg_regs[to_vreg.index()]), + ); + } + } + + pub fn resolve_inserted_moves(&mut self) { + // For each program point, gather all moves together. Then + // resolve (see cases below). + let mut i = 0; + self.inserted_moves + .sort_unstable_by_key(|m| (m.pos.to_index(), m.prio)); + + // Redundant-move elimination state tracker. + let mut redundant_moves = RedundantMoveEliminator::default(); + + fn redundant_move_process_side_effects<'a, F: Function>( + this: &Env<'a, F>, + redundant_moves: &mut RedundantMoveEliminator, + from: ProgPoint, + to: ProgPoint, + ) { + // If any safepoints in range, clear and return. + // Also, if we cross a block boundary, clear and return. 
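The redundant-move eliminator used here can be thought of as a map from location to the symbolic value it currently holds: a move whose destination already holds the source's value is elided, and any write outside the tracked moves (defs, clobbers, safepoints, block boundaries) must invalidate state, which is why `redundant_move_process_side_effects` clears the tracker. A hypothetical, much-simplified model of that idea:

use std::collections::HashMap;

// Track which symbolic value each location holds (locations named by
// plain strings here, purely for illustration).
#[derive(Default)]
struct Tracker {
    contents: HashMap<&'static str, u32>,
    next_id: u32,
}

impl Tracker {
    fn value_of(&mut self, loc: &'static str) -> u32 {
        if let Some(&v) = self.contents.get(loc) {
            v
        } else {
            self.next_id += 1;
            let v = self.next_id;
            self.contents.insert(loc, v);
            v
        }
    }

    // Returns true if the move is redundant and can be elided.
    fn process_move(&mut self, from: &'static str, to: &'static str) -> bool {
        let v = self.value_of(from);
        if self.contents.get(to) == Some(&v) {
            true
        } else {
            self.contents.insert(to, v);
            false
        }
    }

    // Invalidate a location written by something other than a tracked move.
    fn clear_loc(&mut self, loc: &'static str) {
        self.contents.remove(loc);
    }
}

fn main() {
    let mut t = Tracker::default();
    assert!(!t.process_move("spillslot0", "rax")); // reload: not redundant
    assert!(t.process_move("spillslot0", "rax"));  // same move again: elided
    t.clear_loc("rax");                            // e.g. rax clobbered
    assert!(!t.process_move("spillslot0", "rax")); // must be redone
}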
+ if this.cfginfo.insn_block[from.inst().index()] + != this.cfginfo.insn_block[to.inst().index()] + { + redundant_moves.clear(); + return; + } + for inst in from.inst().index()..=to.inst().index() { + if this.func.requires_refs_on_stack(Inst::new(inst)) { + redundant_moves.clear(); + return; + } + } + + let start_inst = if from.pos() == InstPosition::Before { + from.inst() + } else { + from.inst().next() + }; + let end_inst = if to.pos() == InstPosition::Before { + to.inst() + } else { + to.inst().next() + }; + for inst in start_inst.index()..end_inst.index() { + let inst = Inst::new(inst); + for (i, op) in this.func.inst_operands(inst).iter().enumerate() { + match op.kind() { + OperandKind::Def | OperandKind::Mod => { + let alloc = this.get_alloc(inst, i); + redundant_moves.clear_alloc(alloc); + } + _ => {} + } + } + for reg in this.func.inst_clobbers(inst) { + redundant_moves.clear_alloc(Allocation::reg(*reg)); + } + } + } + + let mut last_pos = ProgPoint::before(Inst::new(0)); + + while i < self.inserted_moves.len() { + let start = i; + let pos = self.inserted_moves[i].pos; + let prio = self.inserted_moves[i].prio; + while i < self.inserted_moves.len() + && self.inserted_moves[i].pos == pos + && self.inserted_moves[i].prio == prio + { + i += 1; + } + let moves = &self.inserted_moves[start..i]; + + redundant_move_process_side_effects(self, &mut redundant_moves, last_pos, pos); + last_pos = pos; + + // Gather all the moves with Int class and Float class + // separately. These cannot interact, so it is safe to + // have two separate ParallelMove instances. They need to + // be separate because moves between the two classes are + // impossible. (We could enhance ParallelMoves to + // understand register classes and take multiple scratch + // regs, but this seems simpler.) + let mut int_moves: SmallVec<[InsertedMove; 8]> = smallvec![]; + let mut float_moves: SmallVec<[InsertedMove; 8]> = smallvec![]; + let mut self_moves: SmallVec<[InsertedMove; 8]> = smallvec![]; + + for m in moves { + if m.from_alloc.is_reg() && m.to_alloc.is_reg() { + assert_eq!(m.from_alloc.class(), m.to_alloc.class()); + } + if m.from_alloc == m.to_alloc { + if m.to_vreg.is_some() { + self_moves.push(m.clone()); + } + continue; + } + match m.from_alloc.class() { + RegClass::Int => { + int_moves.push(m.clone()); + } + RegClass::Float => { + float_moves.push(m.clone()); + } + } + } + + for &(regclass, moves) in + &[(RegClass::Int, &int_moves), (RegClass::Float, &float_moves)] + { + // All moves in `moves` semantically happen in + // parallel. Let's resolve these to a sequence of moves + // that can be done one at a time. + let scratch = self.env.scratch_by_class[regclass as u8 as usize]; + let mut parallel_moves = ParallelMoves::new(Allocation::reg(scratch)); + log::trace!("parallel moves at pos {:?} prio {:?}", pos, prio); + for m in moves { + if (m.from_alloc != m.to_alloc) || m.to_vreg.is_some() { + log::trace!(" {} -> {}", m.from_alloc, m.to_alloc,); + parallel_moves.add(m.from_alloc, m.to_alloc, m.to_vreg); + } + } + + let resolved = parallel_moves.resolve(); + + // If (i) the scratch register is used, and (ii) a + // stack-to-stack move exists, then we need to + // allocate an additional scratch spillslot to which + // we can temporarily spill the scratch reg when we + // lower the stack-to-stack move to a + // stack-to-scratch-to-stack sequence. 
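The stack-to-stack lowering described above expands to at most four edits. A small sketch of the expansion, assuming a hypothetical simplified location type in place of `Allocation`:

// Hypothetical simplified location type, for illustration only.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum Loc { Reg(u8), Slot(u32) }

// Lower one stack-to-stack move via the scratch register; if the scratch
// already holds a live value from the parallel-move resolution, park it in
// the extra spill slot around the copy.
fn lower_stack_to_stack(src: Loc, dst: Loc, scratch: u8, scratch_live: bool, extra: Loc) -> Vec<(Loc, Loc)> {
    assert!(matches!(src, Loc::Slot(_)) && matches!(dst, Loc::Slot(_)));
    let s = Loc::Reg(scratch);
    if !scratch_live {
        vec![(src, s), (s, dst)]
    } else {
        vec![(s, extra), (src, s), (s, dst), (extra, s)]
    }
}

fn main() {
    // Scratch is live: save it, copy via scratch, restore it.
    let edits = lower_stack_to_stack(Loc::Slot(0), Loc::Slot(1), 12, true, Loc::Slot(9));
    assert_eq!(edits.len(), 4);
}

The extra slot itself is only allocated lazily, once per register class, the first time this situation arises, as the code below shows.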
+ let scratch_used = resolved.iter().any(|&(src, dst, _)| { + src == Allocation::reg(scratch) || dst == Allocation::reg(scratch) + }); + let stack_stack_move = resolved + .iter() + .any(|&(src, dst, _)| src.is_stack() && dst.is_stack()); + let extra_slot = if scratch_used && stack_stack_move { + if self.extra_spillslot[regclass as u8 as usize].is_none() { + let slot = self.allocate_spillslot(regclass); + self.extra_spillslot[regclass as u8 as usize] = Some(slot); + } + self.extra_spillslot[regclass as u8 as usize] + } else { + None + }; + + let mut scratch_used_yet = false; + for (src, dst, to_vreg) in resolved { + log::trace!(" resolved: {} -> {} ({:?})", src, dst, to_vreg); + let action = redundant_moves.process_move(src, dst, to_vreg); + if !action.elide { + if dst == Allocation::reg(scratch) { + scratch_used_yet = true; + } + if src.is_stack() && dst.is_stack() { + if !scratch_used_yet { + self.add_edit( + pos, + prio, + Edit::Move { + from: src, + to: Allocation::reg(scratch), + to_vreg, + }, + ); + self.add_edit( + pos, + prio, + Edit::Move { + from: Allocation::reg(scratch), + to: dst, + to_vreg, + }, + ); + } else { + assert!(extra_slot.is_some()); + self.add_edit( + pos, + prio, + Edit::Move { + from: Allocation::reg(scratch), + to: extra_slot.unwrap(), + to_vreg: None, + }, + ); + self.add_edit( + pos, + prio, + Edit::Move { + from: src, + to: Allocation::reg(scratch), + to_vreg, + }, + ); + self.add_edit( + pos, + prio, + Edit::Move { + from: Allocation::reg(scratch), + to: dst, + to_vreg, + }, + ); + self.add_edit( + pos, + prio, + Edit::Move { + from: extra_slot.unwrap(), + to: Allocation::reg(scratch), + to_vreg: None, + }, + ); + } + } else { + self.add_edit( + pos, + prio, + Edit::Move { + from: src, + to: dst, + to_vreg, + }, + ); + } + } else { + log::trace!(" -> redundant move elided"); + } + if let Some((alloc, vreg)) = action.def_alloc { + log::trace!( + " -> converted to DefAlloc: alloc {} vreg {}", + alloc, + vreg + ); + self.add_edit(pos, prio, Edit::DefAlloc { alloc, vreg }); + } + } + } + + for m in &self_moves { + log::trace!( + "self move at pos {:?} prio {:?}: {} -> {} to_vreg {:?}", + pos, + prio, + m.from_alloc, + m.to_alloc, + m.to_vreg + ); + let action = redundant_moves.process_move(m.from_alloc, m.to_alloc, m.to_vreg); + assert!(action.elide); + if let Some((alloc, vreg)) = action.def_alloc { + log::trace!(" -> DefAlloc: alloc {} vreg {}", alloc, vreg); + self.add_edit(pos, prio, Edit::DefAlloc { alloc, vreg }); + } + } + } + + // Add edits to describe blockparam locations too. This is + // required by the checker. This comes after any edge-moves. 
+ self.blockparam_allocs + .sort_unstable_by_key(|&(block, idx, _, _)| (block, idx)); + self.stats.blockparam_allocs_count = self.blockparam_allocs.len(); + let mut i = 0; + while i < self.blockparam_allocs.len() { + let start = i; + let block = self.blockparam_allocs[i].0; + while i < self.blockparam_allocs.len() && self.blockparam_allocs[i].0 == block { + i += 1; + } + let params = &self.blockparam_allocs[start..i]; + let vregs = params + .iter() + .map(|(_, _, vreg_idx, _)| self.vreg_regs[vreg_idx.index()]) + .collect::>(); + let allocs = params + .iter() + .map(|(_, _, _, alloc)| *alloc) + .collect::>(); + assert_eq!(vregs.len(), self.func.block_params(block).len()); + assert_eq!(allocs.len(), self.func.block_params(block).len()); + for (vreg, alloc) in vregs.into_iter().zip(allocs.into_iter()) { + self.add_edit( + self.cfginfo.block_entry[block.index()], + InsertMovePrio::BlockParam, + Edit::DefAlloc { alloc, vreg }, + ); + } + } + + // Ensure edits are in sorted ProgPoint order. N.B.: this must + // be a stable sort! We have to keep the order produced by the + // parallel-move resolver for all moves within a single sort + // key. + self.edits.sort_by_key(|&(pos, prio, _)| (pos, prio)); + self.stats.edits_count = self.edits.len(); + + // Add debug annotations. + if self.annotations_enabled { + for i in 0..self.edits.len() { + let &(pos, _, ref edit) = &self.edits[i]; + match edit { + &Edit::Move { from, to, to_vreg } => { + self.annotate( + ProgPoint::from_index(pos), + format!("move {} -> {} ({:?})", from, to, to_vreg), + ); + } + &Edit::DefAlloc { alloc, vreg } => { + let s = format!("defalloc {:?} := {:?}", alloc, vreg); + self.annotate(ProgPoint::from_index(pos), s); + } + } + } + } + } + + pub fn add_edit(&mut self, pos: ProgPoint, prio: InsertMovePrio, edit: Edit) { + match &edit { + &Edit::Move { from, to, to_vreg } if from == to && to_vreg.is_none() => return, + &Edit::Move { from, to, .. } if from.is_reg() && to.is_reg() => { + assert_eq!(from.as_reg().unwrap().class(), to.as_reg().unwrap().class()); + } + _ => {} + } + + self.edits.push((pos.to_index(), prio, edit)); + } +} diff --git a/src/ion/process.rs b/src/ion/process.rs new file mode 100644 index 00000000..8c10a9bb --- /dev/null +++ b/src/ion/process.rs @@ -0,0 +1,1079 @@ +/* + * The following license applies to this file, which was initially + * derived from the files `js/src/jit/BacktrackingAllocator.h` and + * `js/src/jit/BacktrackingAllocator.cpp` in Mozilla Firefox: + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Since the initial port, the design has been substantially evolved + * and optimized. + */ + +//! Main allocation loop that processes bundles. 
+ +use super::{ + spill_weight_from_constraint, CodeRange, Env, LiveBundleIndex, LiveBundleVec, LiveRangeFlag, + LiveRangeIndex, LiveRangeKey, LiveRangeList, LiveRangeListEntry, PRegIndex, RegTraversalIter, + Requirement, SpillWeight, UseList, +}; +use crate::{ + Allocation, Function, Inst, InstPosition, OperandConstraint, OperandKind, PReg, ProgPoint, + RegAllocError, +}; +use fxhash::FxHashSet; +use smallvec::smallvec; +use std::fmt::Debug; + +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum AllocRegResult { + Allocated(Allocation), + Conflict(LiveBundleVec, ProgPoint), + ConflictWithFixed(u32, ProgPoint), + ConflictHighCost, +} + +impl<'a, F: Function> Env<'a, F> { + pub fn process_bundles(&mut self) -> Result<(), RegAllocError> { + while let Some((bundle, reg_hint)) = self.allocation_queue.pop() { + self.stats.process_bundle_count += 1; + self.process_bundle(bundle, reg_hint)?; + } + self.stats.final_liverange_count = self.ranges.len(); + self.stats.final_bundle_count = self.bundles.len(); + self.stats.spill_bundle_count = self.spilled_bundles.len(); + + Ok(()) + } + + pub fn try_to_allocate_bundle_to_reg( + &mut self, + bundle: LiveBundleIndex, + reg: PRegIndex, + // if the max bundle weight in the conflict set exceeds this + // cost (if provided), just return + // `AllocRegResult::ConflictHighCost`. + max_allowable_cost: Option, + ) -> AllocRegResult { + log::trace!("try_to_allocate_bundle_to_reg: {:?} -> {:?}", bundle, reg); + let mut conflicts = smallvec![]; + let mut conflict_set = FxHashSet::default(); + let mut max_conflict_weight = 0; + // Traverse the BTreeMap in order by requesting the whole + // range spanned by the bundle and iterating over that + // concurrently with our ranges. Because our ranges are in + // order, and the BTreeMap is as well, this allows us to have + // an overall O(n log n) + O(b) complexity, where the PReg has + // n current ranges and the bundle has b ranges, rather than + // O(b * n log n) with the simple probe-for-each-bundle-range + // approach. + // + // Note that the comparator function on a CodeRange tests for + // *overlap*, so we are checking whether the BTree contains + // any preg range that *overlaps* with range `range`, not + // literally the range `range`. + let bundle_ranges = &self.bundles[bundle.index()].ranges; + let from_key = LiveRangeKey::from_range(&CodeRange { + from: bundle_ranges.first().unwrap().range.from, + to: bundle_ranges.first().unwrap().range.from, + }); + let mut preg_range_iter = self.pregs[reg.index()] + .allocations + .btree + .range(from_key..) + .peekable(); + log::trace!( + "alloc map for {:?} in range {:?}..: {:?}", + reg, + from_key, + self.pregs[reg.index()].allocations.btree + ); + let mut first_conflict: Option = None; + + 'ranges: for entry in bundle_ranges { + log::trace!(" -> range LR {:?}: {:?}", entry.index, entry.range); + let key = LiveRangeKey::from_range(&entry.range); + + let mut skips = 0; + 'alloc: loop { + log::trace!(" -> PReg range {:?}", preg_range_iter.peek()); + + // Advance our BTree traversal until it is >= this bundle + // range (i.e., skip PReg allocations in the BTree that + // are completely before this bundle range). 
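The "comparator tests for overlap" trick above works because the per-PReg BTree only ever stores pairwise non-overlapping ranges, so an ordering that calls two overlapping ranges "equal" stays consistent and lets an ordinary BTreeMap probe find any conflicting reservation. A standalone sketch of such a key (a simplified illustration of the idea, not the allocator's `LiveRangeKey`):

use std::cmp::Ordering;
use std::collections::BTreeMap;

// Half-open range key: two keys compare Equal whenever they overlap.
// This is only a consistent ordering because the map is kept free of
// overlapping keys.
#[derive(Clone, Copy, Debug)]
struct RangeKey { from: u32, to: u32 }

impl Ord for RangeKey {
    fn cmp(&self, other: &Self) -> Ordering {
        if self.to <= other.from {
            Ordering::Less
        } else if other.to <= self.from {
            Ordering::Greater
        } else {
            Ordering::Equal // ranges overlap
        }
    }
}
impl PartialOrd for RangeKey {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl PartialEq for RangeKey {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}
impl Eq for RangeKey {}

fn main() {
    let mut reservations: BTreeMap<RangeKey, &'static str> = BTreeMap::new();
    reservations.insert(RangeKey { from: 0, to: 4 }, "bundle A");
    reservations.insert(RangeKey { from: 10, to: 14 }, "clobber");
    // Probing with any overlapping range finds the conflicting entry...
    assert_eq!(reservations.get(&RangeKey { from: 2, to: 6 }), Some(&"bundle A"));
    // ...while an abutting (non-overlapping) range does not conflict.
    assert!(reservations.get(&RangeKey { from: 4, to: 10 }).is_none());
}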
+ + if preg_range_iter.peek().is_some() && *preg_range_iter.peek().unwrap().0 < key { + log::trace!( + "Skipping PReg range {:?}", + preg_range_iter.peek().unwrap().0 + ); + preg_range_iter.next(); + skips += 1; + if skips >= 16 { + let from_pos = entry.range.from; + let from_key = LiveRangeKey::from_range(&CodeRange { + from: from_pos, + to: from_pos, + }); + preg_range_iter = self.pregs[reg.index()] + .allocations + .btree + .range(from_key..) + .peekable(); + skips = 0; + } + continue 'alloc; + } + skips = 0; + + // If there are no more PReg allocations, we're done! + if preg_range_iter.peek().is_none() { + log::trace!(" -> no more PReg allocations; so no conflict possible!"); + break 'ranges; + } + + // If the current PReg range is beyond this range, there is no conflict; continue. + if *preg_range_iter.peek().unwrap().0 > key { + log::trace!( + " -> next PReg allocation is at {:?}; moving to next VReg range", + preg_range_iter.peek().unwrap().0 + ); + break 'alloc; + } + + // Otherwise, there is a conflict. + let preg_key = *preg_range_iter.peek().unwrap().0; + assert_eq!(preg_key, key); // Assert that this range overlaps. + let preg_range = preg_range_iter.next().unwrap().1; + + log::trace!(" -> btree contains range {:?} that overlaps", preg_range); + if preg_range.is_valid() { + log::trace!(" -> from vreg {:?}", self.ranges[preg_range.index()].vreg); + // range from an allocated bundle: find the bundle and add to + // conflicts list. + let conflict_bundle = self.ranges[preg_range.index()].bundle; + log::trace!(" -> conflict bundle {:?}", conflict_bundle); + if !conflict_set.contains(&conflict_bundle) { + conflicts.push(conflict_bundle); + conflict_set.insert(conflict_bundle); + max_conflict_weight = std::cmp::max( + max_conflict_weight, + self.bundles[conflict_bundle.index()].cached_spill_weight(), + ); + if max_allowable_cost.is_some() + && max_conflict_weight > max_allowable_cost.unwrap() + { + log::trace!(" -> reached high cost, retrying early"); + return AllocRegResult::ConflictHighCost; + } + } + + if first_conflict.is_none() { + first_conflict = Some(ProgPoint::from_index(std::cmp::max( + preg_key.from, + key.from, + ))); + } + } else { + log::trace!(" -> conflict with fixed reservation"); + // range from a direct use of the PReg (due to clobber). + return AllocRegResult::ConflictWithFixed( + max_conflict_weight, + ProgPoint::from_index(preg_key.from), + ); + } + } + } + + if conflicts.len() > 0 { + return AllocRegResult::Conflict(conflicts, first_conflict.unwrap()); + } + + // We can allocate! Add our ranges to the preg's BTree. + let preg = self.pregs[reg.index()].reg; + log::trace!(" -> bundle {:?} assigned to preg {:?}", bundle, preg); + self.bundles[bundle.index()].allocation = Allocation::reg(preg); + for entry in &self.bundles[bundle.index()].ranges { + self.pregs[reg.index()] + .allocations + .btree + .insert(LiveRangeKey::from_range(&entry.range), entry.index); + } + + AllocRegResult::Allocated(Allocation::reg(preg)) + } + + pub fn evict_bundle(&mut self, bundle: LiveBundleIndex) { + log::trace!( + "evicting bundle {:?}: alloc {:?}", + bundle, + self.bundles[bundle.index()].allocation + ); + let preg = match self.bundles[bundle.index()].allocation.as_reg() { + Some(preg) => preg, + None => { + log::trace!( + " -> has no allocation! 
{:?}", + self.bundles[bundle.index()].allocation + ); + return; + } + }; + let preg_idx = PRegIndex::new(preg.index()); + self.bundles[bundle.index()].allocation = Allocation::none(); + for entry in &self.bundles[bundle.index()].ranges { + log::trace!(" -> removing LR {:?} from reg {:?}", entry.index, preg_idx); + self.pregs[preg_idx.index()] + .allocations + .btree + .remove(&LiveRangeKey::from_range(&entry.range)); + } + let prio = self.bundles[bundle.index()].prio; + log::trace!(" -> prio {}; back into queue", prio); + self.allocation_queue + .insert(bundle, prio as usize, PReg::invalid()); + } + + pub fn bundle_spill_weight(&self, bundle: LiveBundleIndex) -> u32 { + self.bundles[bundle.index()].cached_spill_weight() + } + + pub fn maximum_spill_weight_in_bundle_set(&self, bundles: &LiveBundleVec) -> u32 { + log::trace!("maximum_spill_weight_in_bundle_set: {:?}", bundles); + let m = bundles + .iter() + .map(|&b| { + let w = self.bundles[b.index()].cached_spill_weight(); + log::trace!("bundle{}: {}", b.index(), w); + w + }) + .max() + .unwrap_or(0); + log::trace!(" -> max: {}", m); + m + } + + pub fn recompute_bundle_properties(&mut self, bundle: LiveBundleIndex) { + log::trace!("recompute bundle properties: bundle {:?}", bundle); + + let minimal; + let mut fixed = false; + let mut stack = false; + let bundledata = &self.bundles[bundle.index()]; + let first_range = bundledata.ranges[0].index; + let first_range_data = &self.ranges[first_range.index()]; + + self.bundles[bundle.index()].prio = self.compute_bundle_prio(bundle); + + if first_range_data.vreg.is_invalid() { + log::trace!(" -> no vreg; minimal and fixed"); + minimal = true; + fixed = true; + } else { + for u in &first_range_data.uses { + log::trace!(" -> use: {:?}", u); + if let OperandConstraint::FixedReg(_) = u.operand.constraint() { + log::trace!(" -> fixed use at {:?}: {:?}", u.pos, u.operand); + fixed = true; + } + if let OperandConstraint::Stack = u.operand.constraint() { + log::trace!(" -> stack use at {:?}: {:?}", u.pos, u.operand); + stack = true; + } + if stack && fixed { + break; + } + } + // Minimal if the range covers only one instruction. Note + // that it could cover just one ProgPoint, + // i.e. X.Before..X.After, or two ProgPoints, + // i.e. X.Before..X+1.Before. 
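+            // For example, inst5.Before..inst5.After and
+            // inst5.Before..inst6.Before are both minimal (the check
+            // below sees bundle_start.inst() == bundle_end.prev().inst()
+            // == inst5), while inst5.Before..inst7.Before is not.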
+ log::trace!(" -> first range has range {:?}", first_range_data.range); + let bundle_start = self.bundles[bundle.index()] + .ranges + .first() + .unwrap() + .range + .from; + let bundle_end = self.bundles[bundle.index()].ranges.last().unwrap().range.to; + minimal = bundle_start.inst() == bundle_end.prev().inst(); + log::trace!(" -> minimal: {}", minimal); + } + + let spill_weight = if minimal { + if fixed { + log::trace!(" -> fixed and minimal: spill weight 2000000"); + 2_000_000 + } else { + log::trace!(" -> non-fixed and minimal: spill weight 1000000"); + 1_000_000 + } + } else { + let mut total = SpillWeight::zero(); + for entry in &self.bundles[bundle.index()].ranges { + let range_data = &self.ranges[entry.index.index()]; + log::trace!( + " -> uses spill weight: +{:?}", + range_data.uses_spill_weight() + ); + total = total + range_data.uses_spill_weight(); + } + + if self.bundles[bundle.index()].prio > 0 { + let final_weight = (total.to_f32() as u32) / self.bundles[bundle.index()].prio; + log::trace!( + " -> dividing by prio {}; final weight {}", + self.bundles[bundle.index()].prio, + final_weight + ); + final_weight + } else { + 0 + } + }; + + self.bundles[bundle.index()].set_cached_spill_weight_and_props( + spill_weight, + minimal, + fixed, + stack, + ); + } + + pub fn minimal_bundle(&self, bundle: LiveBundleIndex) -> bool { + self.bundles[bundle.index()].cached_minimal() + } + + pub fn recompute_range_properties(&mut self, range: LiveRangeIndex) { + let rangedata = &mut self.ranges[range.index()]; + let mut w = SpillWeight::zero(); + for u in &rangedata.uses { + w = w + SpillWeight::from_bits(u.weight); + log::trace!("range{}: use {:?}", range.index(), u); + } + rangedata.set_uses_spill_weight(w); + if rangedata.uses.len() > 0 && rangedata.uses[0].operand.kind() == OperandKind::Def { + // Note that we *set* the flag here, but we never *clear* + // it: it may be set by a progmove as well (which does not + // create an explicit use or def), and we want to preserve + // that. We will never split or trim ranges in a way that + // removes a def at the front and requires the flag to be + // cleared. + rangedata.set_flag(LiveRangeFlag::StartsAtDef); + } + } + + pub fn find_conflict_split_point(&self, bundle: LiveBundleIndex) -> ProgPoint { + // Find the first use whose requirement causes the merge up to + // this point to go to Conflict. + let mut req = Requirement::Unknown; + for entry in &self.bundles[bundle.index()].ranges { + for u in &self.ranges[entry.index.index()].uses { + let this_req = Requirement::from_operand(u.operand); + req = req.merge(this_req); + if req == Requirement::Conflict { + return u.pos; + } + } + } + + // Fallback: start of bundle. 
+ self.bundles[bundle.index()] + .ranges + .first() + .unwrap() + .range + .from + } + + pub fn get_or_create_spill_bundle( + &mut self, + bundle: LiveBundleIndex, + create_if_absent: bool, + ) -> Option { + let ssidx = self.bundles[bundle.index()].spillset; + let idx = self.spillsets[ssidx.index()].spill_bundle; + if idx.is_valid() { + Some(idx) + } else if create_if_absent { + let idx = self.create_bundle(); + self.spillsets[ssidx.index()].spill_bundle = idx; + self.bundles[idx.index()].spillset = ssidx; + self.spilled_bundles.push(idx); + Some(idx) + } else { + None + } + } + + pub fn split_and_requeue_bundle( + &mut self, + bundle: LiveBundleIndex, + mut split_at: ProgPoint, + reg_hint: PReg, + ) { + self.stats.splits += 1; + log::trace!( + "split bundle {:?} at {:?} and requeue with reg hint (for first part) {:?}", + bundle, + split_at, + reg_hint, + ); + + // Split `bundle` at `split_at`, creating new LiveRanges and + // bundles (and updating vregs' linked lists appropriately), + // and enqueue the new bundles. + + let spillset = self.bundles[bundle.index()].spillset; + + assert!(!self.bundles[bundle.index()].ranges.is_empty()); + // Split point *at* start is OK; this means we peel off + // exactly one use to create a minimal bundle. + let bundle_start = self.bundles[bundle.index()] + .ranges + .first() + .unwrap() + .range + .from; + assert!(split_at >= bundle_start); + let bundle_end = self.bundles[bundle.index()].ranges.last().unwrap().range.to; + assert!(split_at < bundle_end); + + // Is the split point *at* the start? If so, peel off the + // first use: set the split point just after it, or just + // before it if it comes after the start of the bundle. + if split_at == bundle_start { + // Find any uses; if none, just chop off one instruction. + let mut first_use = None; + 'outer: for entry in &self.bundles[bundle.index()].ranges { + for u in &self.ranges[entry.index.index()].uses { + first_use = Some(u.pos); + break 'outer; + } + } + log::trace!(" -> first use loc is {:?}", first_use); + split_at = match first_use { + Some(pos) => { + if pos.inst() == bundle_start.inst() { + ProgPoint::before(pos.inst().next()) + } else { + ProgPoint::before(pos.inst()) + } + } + None => ProgPoint::before( + self.bundles[bundle.index()] + .ranges + .first() + .unwrap() + .range + .from + .inst() + .next(), + ), + }; + log::trace!( + "split point is at bundle start; advancing to {:?}", + split_at + ); + } else { + // Don't split in the middle of an instruction -- this could + // create impossible moves (we cannot insert a move between an + // instruction's uses and defs). + if split_at.pos() == InstPosition::After { + split_at = split_at.next(); + } + if split_at >= bundle_end { + split_at = split_at.prev().prev(); + } + } + + assert!(split_at > bundle_start && split_at < bundle_end); + + // We need to find which LRs fall on each side of the split, + // which LR we need to split down the middle, then update the + // current bundle, create a new one, and (re)-queue both. 
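+        // Worked example with hypothetical ranges: given LR entries
+        // [10..14), [14..20), [22..30) and split_at = 16, the scan below
+        // picks last_lr_in_old_bundle_idx = first_lr_in_new_bundle_idx = 1,
+        // so [14..20) is the range cut in two: the old bundle keeps
+        // [10..14), [14..16) and the new bundle gets [16..20), [22..30).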
+ + log::trace!(" -> LRs: {:?}", self.bundles[bundle.index()].ranges); + + let mut last_lr_in_old_bundle_idx = 0; // last LR-list index in old bundle + let mut first_lr_in_new_bundle_idx = 0; // first LR-list index in new bundle + for (i, entry) in self.bundles[bundle.index()].ranges.iter().enumerate() { + if split_at > entry.range.from { + last_lr_in_old_bundle_idx = i; + first_lr_in_new_bundle_idx = i; + } + if split_at < entry.range.to { + first_lr_in_new_bundle_idx = i; + break; + } + } + + log::trace!( + " -> last LR in old bundle: LR {:?}", + self.bundles[bundle.index()].ranges[last_lr_in_old_bundle_idx] + ); + log::trace!( + " -> first LR in new bundle: LR {:?}", + self.bundles[bundle.index()].ranges[first_lr_in_new_bundle_idx] + ); + + // Take the sublist of LRs that will go in the new bundle. + let mut new_lr_list: LiveRangeList = self.bundles[bundle.index()] + .ranges + .iter() + .cloned() + .skip(first_lr_in_new_bundle_idx) + .collect(); + self.bundles[bundle.index()] + .ranges + .truncate(last_lr_in_old_bundle_idx + 1); + + // If the first entry in `new_lr_list` is a LR that is split + // down the middle, replace it with a new LR and chop off the + // end of the same LR in the original list. + if split_at > new_lr_list[0].range.from { + assert_eq!(last_lr_in_old_bundle_idx, first_lr_in_new_bundle_idx); + let orig_lr = new_lr_list[0].index; + let new_lr = self.create_liverange(CodeRange { + from: split_at, + to: new_lr_list[0].range.to, + }); + self.ranges[new_lr.index()].vreg = self.ranges[orig_lr.index()].vreg; + log::trace!(" -> splitting LR {:?} into {:?}", orig_lr, new_lr); + let first_use = self.ranges[orig_lr.index()] + .uses + .iter() + .position(|u| u.pos >= split_at) + .unwrap_or(self.ranges[orig_lr.index()].uses.len()); + let rest_uses: UseList = self.ranges[orig_lr.index()] + .uses + .iter() + .cloned() + .skip(first_use) + .collect(); + self.ranges[new_lr.index()].uses = rest_uses; + self.ranges[orig_lr.index()].uses.truncate(first_use); + self.recompute_range_properties(orig_lr); + self.recompute_range_properties(new_lr); + new_lr_list[0].index = new_lr; + new_lr_list[0].range = self.ranges[new_lr.index()].range; + self.ranges[orig_lr.index()].range.to = split_at; + self.bundles[bundle.index()].ranges[last_lr_in_old_bundle_idx].range = + self.ranges[orig_lr.index()].range; + + // Perform a lazy split in the VReg data. We just + // append the new LR and its range; we will sort by + // start of range, and fix up range ends, once when we + // iterate over the VReg's ranges after allocation + // completes (this is the only time when order + // matters). + self.vregs[self.ranges[new_lr.index()].vreg.index()] + .ranges + .push(LiveRangeListEntry { + range: self.ranges[new_lr.index()].range, + index: new_lr, + }); + } + + let new_bundle = self.create_bundle(); + log::trace!(" -> creating new bundle {:?}", new_bundle); + self.bundles[new_bundle.index()].spillset = spillset; + for entry in &new_lr_list { + self.ranges[entry.index.index()].bundle = new_bundle; + } + self.bundles[new_bundle.index()].ranges = new_lr_list; + + // Finally, handle moving LRs to the spill bundle when + // appropriate: If the first range in `new_bundle` or last + // range in `bundle` has "empty space" beyond the first or + // last use (respectively), trim it and put an empty LR into + // the spill bundle. (We are careful to treat the "starts at + // def" flag as an implicit first def even if no def-type Use + // is present.) 
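+        // For example (hypothetical points): if the old bundle's last
+        // range is inst10.Before..inst20.Before but its last use is at
+        // inst12, the loop below trims the range to end at inst13.Before
+        // and parks an empty inst13.Before..inst20.Before range in the
+        // spill bundle, where it can sit on the stack without holding a
+        // register.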
+ while let Some(entry) = self.bundles[bundle.index()].ranges.last().cloned() { + let end = entry.range.to; + let vreg = self.ranges[entry.index.index()].vreg; + let last_use = self.ranges[entry.index.index()].uses.last().map(|u| u.pos); + if last_use.is_none() { + let spill = self + .get_or_create_spill_bundle(bundle, /* create_if_absent = */ true) + .unwrap(); + log::trace!( + " -> bundle {:?} range {:?}: no uses; moving to spill bundle {:?}", + bundle, + entry.index, + spill + ); + self.bundles[spill.index()].ranges.push(entry); + self.bundles[bundle.index()].ranges.pop(); + self.ranges[entry.index.index()].bundle = spill; + continue; + } + let last_use = last_use.unwrap(); + let split = ProgPoint::before(last_use.inst().next()); + if split < end { + let spill = self + .get_or_create_spill_bundle(bundle, /* create_if_absent = */ true) + .unwrap(); + self.bundles[bundle.index()] + .ranges + .last_mut() + .unwrap() + .range + .to = split; + self.ranges[self.bundles[bundle.index()] + .ranges + .last() + .unwrap() + .index + .index()] + .range + .to = split; + let range = CodeRange { + from: split, + to: end, + }; + let empty_lr = self.create_liverange(range); + self.bundles[spill.index()].ranges.push(LiveRangeListEntry { + range, + index: empty_lr, + }); + self.ranges[empty_lr.index()].bundle = spill; + self.vregs[vreg.index()].ranges.push(LiveRangeListEntry { + range, + index: empty_lr, + }); + log::trace!( + " -> bundle {:?} range {:?}: last use implies split point {:?}", + bundle, + entry.index, + split + ); + log::trace!( + " -> moving trailing empty region to new spill bundle {:?} with new LR {:?}", + spill, + empty_lr + ); + } + break; + } + while let Some(entry) = self.bundles[new_bundle.index()].ranges.first().cloned() { + if self.ranges[entry.index.index()].has_flag(LiveRangeFlag::StartsAtDef) { + break; + } + let start = entry.range.from; + let vreg = self.ranges[entry.index.index()].vreg; + let first_use = self.ranges[entry.index.index()].uses.first().map(|u| u.pos); + if first_use.is_none() { + let spill = self + .get_or_create_spill_bundle(new_bundle, /* create_if_absent = */ true) + .unwrap(); + log::trace!( + " -> bundle {:?} range {:?}: no uses; moving to spill bundle {:?}", + new_bundle, + entry.index, + spill + ); + self.bundles[spill.index()].ranges.push(entry); + self.bundles[new_bundle.index()].ranges.drain(..1); + self.ranges[entry.index.index()].bundle = spill; + continue; + } + let first_use = first_use.unwrap(); + let split = ProgPoint::before(first_use.inst()); + if split > start { + let spill = self + .get_or_create_spill_bundle(new_bundle, /* create_if_absent = */ true) + .unwrap(); + self.bundles[new_bundle.index()] + .ranges + .first_mut() + .unwrap() + .range + .from = split; + self.ranges[self.bundles[new_bundle.index()] + .ranges + .first() + .unwrap() + .index + .index()] + .range + .from = split; + let range = CodeRange { + from: start, + to: split, + }; + let empty_lr = self.create_liverange(range); + self.bundles[spill.index()].ranges.push(LiveRangeListEntry { + range, + index: empty_lr, + }); + self.ranges[empty_lr.index()].bundle = spill; + self.vregs[vreg.index()].ranges.push(LiveRangeListEntry { + range, + index: empty_lr, + }); + log::trace!( + " -> bundle {:?} range {:?}: first use implies split point {:?}", + bundle, + entry.index, + first_use, + ); + log::trace!( + " -> moving leading empty region to new spill bundle {:?} with new LR {:?}", + spill, + empty_lr + ); + } + break; + } + + if self.bundles[bundle.index()].ranges.len() > 0 { + 
self.recompute_bundle_properties(bundle); + let prio = self.bundles[bundle.index()].prio; + self.allocation_queue + .insert(bundle, prio as usize, reg_hint); + } + if self.bundles[new_bundle.index()].ranges.len() > 0 { + self.recompute_bundle_properties(new_bundle); + let prio = self.bundles[new_bundle.index()].prio; + self.allocation_queue + .insert(new_bundle, prio as usize, reg_hint); + } + } + + pub fn process_bundle( + &mut self, + bundle: LiveBundleIndex, + reg_hint: PReg, + ) -> Result<(), RegAllocError> { + let req = self.compute_requirement(bundle); + // Grab a hint from either the queue or our spillset, if any. + let hint_reg = if reg_hint != PReg::invalid() { + reg_hint + } else { + self.spillsets[self.bundles[bundle.index()].spillset.index()].reg_hint + }; + log::trace!("process_bundle: bundle {:?} hint {:?}", bundle, hint_reg,); + + if let Requirement::Conflict = req { + // We have to split right away. We'll find a point to + // split that would allow at least the first half of the + // split to be conflict-free. + assert!( + !self.minimal_bundle(bundle), + "Minimal bundle with conflict!" + ); + let split_point = self.find_conflict_split_point(bundle); + self.split_and_requeue_bundle( + bundle, + /* split_at_point = */ split_point, + reg_hint, + ); + return Ok(()); + } + + // If no requirement at all (because no uses), and *if* a + // spill bundle is already present, then move the LRs over to + // the spill bundle right away. + match req { + Requirement::Unknown | Requirement::Any(_) => { + if let Some(spill) = + self.get_or_create_spill_bundle(bundle, /* create_if_absent = */ false) + { + let mut list = + std::mem::replace(&mut self.bundles[bundle.index()].ranges, smallvec![]); + for entry in &list { + self.ranges[entry.index.index()].bundle = spill; + } + self.bundles[spill.index()].ranges.extend(list.drain(..)); + return Ok(()); + } + } + _ => {} + } + + // Try to allocate! + let mut attempts = 0; + loop { + attempts += 1; + log::trace!("attempt {}, req {:?}", attempts, req); + debug_assert!(attempts < 100 * self.func.num_insts()); + + let (class, fixed_preg) = match req { + Requirement::Fixed(preg) => (preg.class(), Some(preg)), + Requirement::Register(class) => (class, None), + Requirement::Stack(_) => { + // If we must be on the stack, mark our spillset + // as required immediately. + self.spillsets[self.bundles[bundle.index()].spillset.index()].required = true; + return Ok(()); + } + + Requirement::Any(_) | Requirement::Unknown => { + self.spilled_bundles.push(bundle); + return Ok(()); + } + + Requirement::Conflict => { + unreachable!() + } + }; + // Scan all pregs, or the one fixed preg, and attempt to allocate. + + let mut lowest_cost_evict_conflict_set: Option = None; + let mut lowest_cost_evict_conflict_cost: Option = None; + + let mut lowest_cost_split_conflict_cost: Option = None; + let mut lowest_cost_split_conflict_point = ProgPoint::before(Inst::new(0)); + let mut lowest_cost_split_conflict_reg = PReg::invalid(); + + // Heuristic: start the scan for an available + // register at an offset influenced both by our + // location in the code and by the bundle we're + // considering. This has the effect of spreading + // demand more evenly across registers. 
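+            // For instance, two bundles that both start at inst 100 but
+            // have bundle indices 7 and 8 get scan offsets 107 and 108,
+            // so their probes start at adjacent registers rather than
+            // both contending for the same first preferred register.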
+ let scan_offset = self.ranges[self.bundles[bundle.index()].ranges[0].index.index()] + .range + .from + .inst() + .index() + + bundle.index(); + + self.stats.process_bundle_reg_probe_start_any += 1; + for preg in RegTraversalIter::new( + self.env, + class, + hint_reg, + PReg::invalid(), + scan_offset, + fixed_preg, + ) { + self.stats.process_bundle_reg_probes_any += 1; + let preg_idx = PRegIndex::new(preg.index()); + log::trace!("trying preg {:?}", preg_idx); + + let scan_limit_cost = match ( + lowest_cost_evict_conflict_cost, + lowest_cost_split_conflict_cost, + ) { + (Some(a), Some(b)) => Some(std::cmp::max(a, b)), + _ => None, + }; + match self.try_to_allocate_bundle_to_reg(bundle, preg_idx, scan_limit_cost) { + AllocRegResult::Allocated(alloc) => { + self.stats.process_bundle_reg_success_any += 1; + log::trace!(" -> allocated to any {:?}", preg_idx); + self.spillsets[self.bundles[bundle.index()].spillset.index()].reg_hint = + alloc.as_reg().unwrap(); + return Ok(()); + } + AllocRegResult::Conflict(bundles, first_conflict_point) => { + log::trace!( + " -> conflict with bundles {:?}, first conflict at {:?}", + bundles, + first_conflict_point + ); + + let conflict_cost = self.maximum_spill_weight_in_bundle_set(&bundles); + + if lowest_cost_evict_conflict_cost.is_none() + || conflict_cost < lowest_cost_evict_conflict_cost.unwrap() + { + lowest_cost_evict_conflict_cost = Some(conflict_cost); + lowest_cost_evict_conflict_set = Some(bundles); + } + + let loop_depth = self.cfginfo.approx_loop_depth + [self.cfginfo.insn_block[first_conflict_point.inst().index()].index()]; + let move_cost = spill_weight_from_constraint( + OperandConstraint::Reg, + loop_depth as usize, + /* is_def = */ true, + ) + .to_int(); + if lowest_cost_split_conflict_cost.is_none() + || (conflict_cost + move_cost) + < lowest_cost_split_conflict_cost.unwrap() + { + lowest_cost_split_conflict_cost = Some(conflict_cost + move_cost); + lowest_cost_split_conflict_point = first_conflict_point; + lowest_cost_split_conflict_reg = preg; + } + } + AllocRegResult::ConflictWithFixed(max_cost, point) => { + log::trace!(" -> conflict with fixed alloc; cost of other bundles up to point is {}, conflict at {:?}", max_cost, point); + + let loop_depth = self.cfginfo.approx_loop_depth + [self.cfginfo.insn_block[point.inst().index()].index()]; + let move_cost = spill_weight_from_constraint( + OperandConstraint::Reg, + loop_depth as usize, + /* is_def = */ true, + ) + .to_int(); + + if lowest_cost_split_conflict_cost.is_none() + || (max_cost + move_cost) < lowest_cost_split_conflict_cost.unwrap() + { + lowest_cost_split_conflict_cost = Some(max_cost + move_cost); + lowest_cost_split_conflict_point = point; + lowest_cost_split_conflict_reg = preg; + } + } + AllocRegResult::ConflictHighCost => { + // Simply don't consider -- we already have + // a lower-cost conflict bundle option + // to evict. + continue; + } + } + } + + // Otherwise, we *require* a register, but didn't fit into + // any with current bundle assignments. Hence, we will need + // to either split or attempt to evict some bundles. + + log::trace!( + " -> lowest cost evict: set {:?}, cost {:?}", + lowest_cost_evict_conflict_set, + lowest_cost_evict_conflict_cost, + ); + log::trace!( + " -> lowest cost split: cost {:?}, point {:?}, reg {:?}", + lowest_cost_split_conflict_cost, + lowest_cost_split_conflict_point, + lowest_cost_split_conflict_reg + ); + + // If we reach here, we *must* have an option either to split or evict. 
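+            // (Each probe above either allocated and returned, recorded
+            // an evict and/or split candidate, or returned
+            // ConflictHighCost, which can only happen after both
+            // candidate costs were already recorded.)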
+ assert!( + lowest_cost_split_conflict_cost.is_some() + || lowest_cost_evict_conflict_cost.is_some() + ); + + let our_spill_weight = self.bundle_spill_weight(bundle); + log::trace!(" -> our spill weight: {}", our_spill_weight); + + // We detect the "too-many-live-registers" case here and + // return an error cleanly, rather than panicking, because + // the regalloc.rs fuzzer depends on the register + // allocator to correctly reject impossible-to-allocate + // programs in order to discard invalid test cases. + if self.minimal_bundle(bundle) + && (attempts >= 2 + || lowest_cost_evict_conflict_cost.is_none() + || lowest_cost_evict_conflict_cost.unwrap() >= our_spill_weight) + { + if let Requirement::Register(class) = req { + // Check if this is a too-many-live-registers situation. + let range = self.bundles[bundle.index()].ranges[0].range; + log::trace!("checking for too many live regs"); + let mut min_bundles_assigned = 0; + let mut fixed_assigned = 0; + let mut total_regs = 0; + for preg in self.env.preferred_regs_by_class[class as u8 as usize] + .iter() + .chain(self.env.non_preferred_regs_by_class[class as u8 as usize].iter()) + { + log::trace!(" -> PR {:?}", preg); + let start = LiveRangeKey::from_range(&CodeRange { + from: range.from.prev(), + to: range.from.prev(), + }); + for (key, lr) in self.pregs[preg.index()].allocations.btree.range(start..) { + let preg_range = key.to_range(); + if preg_range.to <= range.from { + continue; + } + if preg_range.from >= range.to { + break; + } + if lr.is_valid() { + if self.minimal_bundle(self.ranges[lr.index()].bundle) { + log::trace!(" -> min bundle {:?}", lr); + min_bundles_assigned += 1; + } else { + log::trace!(" -> non-min bundle {:?}", lr); + } + } else { + log::trace!(" -> fixed bundle"); + fixed_assigned += 1; + } + } + total_regs += 1; + } + log::trace!( + " -> total {}, fixed {}, min {}", + total_regs, + fixed_assigned, + min_bundles_assigned + ); + if min_bundles_assigned + fixed_assigned >= total_regs { + return Err(RegAllocError::TooManyLiveRegs); + } + } + + panic!("Could not allocate minimal bundle, but the allocation problem should be possible to solve"); + } + + // If our bundle's weight is less than or equal to(*) the + // evict cost, choose to split. Also pick splitting if + // we're on our second or more attempt and we didn't + // allocate. Also pick splitting if the conflict set is + // empty, meaning a fixed conflict that can't be evicted. + // + // (*) the "equal to" part is very important: it prevents + // an infinite loop where two bundles with equal spill + // cost continually evict each other in an infinite + // allocation loop. In such a case, the first bundle in + // wins, and the other splits. + // + // Note that we don't split if the bundle is minimal. + if !self.minimal_bundle(bundle) + && (attempts >= 2 + || lowest_cost_evict_conflict_cost.is_none() + || our_spill_weight <= lowest_cost_evict_conflict_cost.unwrap()) + { + log::trace!( + " -> deciding to split: our spill weight is {}", + self.bundle_spill_weight(bundle) + ); + let bundle_start = self.bundles[bundle.index()].ranges[0].range.from; + let mut split_at_point = + std::cmp::max(lowest_cost_split_conflict_point, bundle_start); + let requeue_with_reg = lowest_cost_split_conflict_reg; + + // Adjust `split_at_point` if it is within a deeper loop + // than the bundle start -- hoist it to just before the + // first loop header it encounters. 
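+                // For example, if the bundle starts at loop depth 0 and
+                // the chosen conflict point is in a block at depth 2,
+                // the scan below resets the split point to the entry of
+                // the first block after the bundle start whose depth
+                // exceeds 0, i.e. just as the code enters the loop nest
+                // rather than somewhere deep inside it.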
+ let bundle_start_depth = self.cfginfo.approx_loop_depth + [self.cfginfo.insn_block[bundle_start.inst().index()].index()]; + let split_at_depth = self.cfginfo.approx_loop_depth + [self.cfginfo.insn_block[split_at_point.inst().index()].index()]; + if split_at_depth > bundle_start_depth { + for block in (self.cfginfo.insn_block[bundle_start.inst().index()].index() + 1) + ..=self.cfginfo.insn_block[split_at_point.inst().index()].index() + { + if self.cfginfo.approx_loop_depth[block] > bundle_start_depth { + split_at_point = self.cfginfo.block_entry[block]; + break; + } + } + } + + self.split_and_requeue_bundle(bundle, split_at_point, requeue_with_reg); + return Ok(()); + } else { + // Evict all bundles in `conflicting bundles` and try again. + self.stats.evict_bundle_event += 1; + for &bundle in &lowest_cost_evict_conflict_set.unwrap() { + log::trace!(" -> evicting {:?}", bundle); + self.evict_bundle(bundle); + self.stats.evict_bundle_count += 1; + } + } + } + } +} diff --git a/src/ion/redundant_moves.rs b/src/ion/redundant_moves.rs new file mode 100644 index 00000000..44f15d75 --- /dev/null +++ b/src/ion/redundant_moves.rs @@ -0,0 +1,142 @@ +//! Redundant-move elimination. + +use crate::{Allocation, VReg}; +use fxhash::FxHashMap; +use smallvec::{smallvec, SmallVec}; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum RedundantMoveState { + Copy(Allocation, Option), + Orig(VReg), + None, +} +#[derive(Clone, Debug, Default)] +pub struct RedundantMoveEliminator { + allocs: FxHashMap, + reverse_allocs: FxHashMap>, +} +#[derive(Copy, Clone, Debug)] +pub struct RedundantMoveAction { + pub elide: bool, + pub def_alloc: Option<(Allocation, VReg)>, +} + +impl RedundantMoveEliminator { + pub fn process_move( + &mut self, + from: Allocation, + to: Allocation, + to_vreg: Option, + ) -> RedundantMoveAction { + // Look up the src and dest. + let from_state = self + .allocs + .get(&from) + .map(|&p| p) + .unwrap_or(RedundantMoveState::None); + let to_state = self + .allocs + .get(&to) + .map(|&p| p) + .unwrap_or(RedundantMoveState::None); + + log::trace!( + " -> redundant move tracker: from {} to {} to_vreg {:?}", + from, + to, + to_vreg + ); + log::trace!( + " -> from_state {:?} to_state {:?}", + from_state, + to_state + ); + + if from == to && to_vreg.is_some() { + self.clear_alloc(to); + self.allocs + .insert(to, RedundantMoveState::Orig(to_vreg.unwrap())); + return RedundantMoveAction { + elide: true, + def_alloc: Some((to, to_vreg.unwrap())), + }; + } + + let src_vreg = match from_state { + RedundantMoveState::Copy(_, opt_r) => opt_r, + RedundantMoveState::Orig(r) => Some(r), + _ => None, + }; + log::trace!(" -> src_vreg {:?}", src_vreg); + let dst_vreg = to_vreg.or(src_vreg); + log::trace!(" -> dst_vreg {:?}", dst_vreg); + let existing_dst_vreg = match to_state { + RedundantMoveState::Copy(_, opt_r) => opt_r, + RedundantMoveState::Orig(r) => Some(r), + _ => None, + }; + log::trace!(" -> existing_dst_vreg {:?}", existing_dst_vreg); + + let elide = match (from_state, to_state) { + (_, RedundantMoveState::Copy(orig_alloc, _)) if orig_alloc == from => true, + (RedundantMoveState::Copy(new_alloc, _), _) if new_alloc == to => true, + _ => false, + }; + log::trace!(" -> elide {}", elide); + + let def_alloc = if dst_vreg != existing_dst_vreg && dst_vreg.is_some() { + Some((to, dst_vreg.unwrap())) + } else { + None + }; + log::trace!(" -> def_alloc {:?}", def_alloc); + + // Invalidate all existing copies of `to` if `to` actually changed value. 
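+        // Example with hypothetical allocations: after `move p0 -> stack3`
+        // the tracker records stack3 as Copy(from = p0), so a second
+        // `move p0 -> stack3` matches the Copy state above and is elided;
+        // a non-elided move means `to`'s value really changes, so
+        // everything previously recorded as a copy of `to` is invalidated
+        // below.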
+ if !elide { + self.clear_alloc(to); + } + + // Set up forward and reverse mapping. Don't track stack-to-stack copies. + if from.is_reg() || to.is_reg() { + self.allocs + .insert(to, RedundantMoveState::Copy(from, dst_vreg)); + log::trace!( + " -> create mapping {} -> {:?}", + to, + RedundantMoveState::Copy(from, dst_vreg) + ); + self.reverse_allocs + .entry(from) + .or_insert_with(|| smallvec![]) + .push(to); + } + + RedundantMoveAction { elide, def_alloc } + } + + pub fn clear(&mut self) { + log::trace!(" redundant move eliminator cleared"); + self.allocs.clear(); + self.reverse_allocs.clear(); + } + + pub fn clear_alloc(&mut self, alloc: Allocation) { + log::trace!(" redundant move eliminator: clear {:?}", alloc); + if let Some(ref mut existing_copies) = self.reverse_allocs.get_mut(&alloc) { + for to_inval in existing_copies.iter() { + log::trace!(" -> clear existing copy: {:?}", to_inval); + if let Some(val) = self.allocs.get_mut(to_inval) { + match val { + RedundantMoveState::Copy(_, Some(vreg)) => { + *val = RedundantMoveState::Orig(*vreg); + } + _ => *val = RedundantMoveState::None, + } + } + self.allocs.remove(to_inval); + } + existing_copies.clear(); + } + self.allocs.remove(&alloc); + } +} diff --git a/src/ion/reg_traversal.rs b/src/ion/reg_traversal.rs new file mode 100644 index 00000000..0b457cba --- /dev/null +++ b/src/ion/reg_traversal.rs @@ -0,0 +1,123 @@ +use crate::{MachineEnv, PReg, RegClass}; + +/// This iterator represents a traversal through all allocatable +/// registers of a given class, in a certain order designed to +/// minimize allocation contention. +/// +/// The order in which we try registers is somewhat complex: +/// - First, if there is a hint, we try that. +/// - Then, we try registers in a traversal order that is based on an +/// "offset" (usually the bundle index) spreading pressure evenly +/// among registers to reduce commitment-map contention. +/// - Within that scan, we try registers in two groups: first, +/// prferred registers; then, non-preferred registers. (In normal +/// usage, these consist of caller-save and callee-save registers +/// respectively, to minimize clobber-saves; but they need not.) 
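+///
+/// A minimal usage sketch (`env`, `hint`, and `bundle_idx` stand in for
+/// values the caller already has):
+///
+/// ```ignore
+/// for preg in RegTraversalIter::new(env, RegClass::Int, hint, PReg::invalid(),
+///                                   bundle_idx, /* fixed = */ None) {
+///     // probe `preg`, stopping at the first conflict-free assignment
+/// }
+/// ```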
+ +pub struct RegTraversalIter<'a> { + env: &'a MachineEnv, + class: usize, + hints: [Option; 2], + hint_idx: usize, + pref_idx: usize, + non_pref_idx: usize, + offset_pref: usize, + offset_non_pref: usize, + is_fixed: bool, + fixed: Option, +} + +impl<'a> RegTraversalIter<'a> { + pub fn new( + env: &'a MachineEnv, + class: RegClass, + hint_reg: PReg, + hint2_reg: PReg, + offset: usize, + fixed: Option, + ) -> Self { + let mut hint_reg = if hint_reg != PReg::invalid() { + Some(hint_reg) + } else { + None + }; + let mut hint2_reg = if hint2_reg != PReg::invalid() { + Some(hint2_reg) + } else { + None + }; + + if hint_reg.is_none() { + hint_reg = hint2_reg; + hint2_reg = None; + } + let hints = [hint_reg, hint2_reg]; + let class = class as u8 as usize; + let offset_pref = if env.preferred_regs_by_class[class].len() > 0 { + offset % env.preferred_regs_by_class[class].len() + } else { + 0 + }; + let offset_non_pref = if env.non_preferred_regs_by_class[class].len() > 0 { + offset % env.non_preferred_regs_by_class[class].len() + } else { + 0 + }; + Self { + env, + class, + hints, + hint_idx: 0, + pref_idx: 0, + non_pref_idx: 0, + offset_pref, + offset_non_pref, + is_fixed: fixed.is_some(), + fixed, + } + } +} + +impl<'a> std::iter::Iterator for RegTraversalIter<'a> { + type Item = PReg; + + fn next(&mut self) -> Option { + if self.is_fixed { + let ret = self.fixed; + self.fixed = None; + return ret; + } + + fn wrap(idx: usize, limit: usize) -> usize { + if idx >= limit { + idx - limit + } else { + idx + } + } + if self.hint_idx < 2 && self.hints[self.hint_idx].is_some() { + let h = self.hints[self.hint_idx]; + self.hint_idx += 1; + return h; + } + while self.pref_idx < self.env.preferred_regs_by_class[self.class].len() { + let arr = &self.env.preferred_regs_by_class[self.class][..]; + let r = arr[wrap(self.pref_idx + self.offset_pref, arr.len())]; + self.pref_idx += 1; + if Some(r) == self.hints[0] || Some(r) == self.hints[1] { + continue; + } + return Some(r); + } + while self.non_pref_idx < self.env.non_preferred_regs_by_class[self.class].len() { + let arr = &self.env.non_preferred_regs_by_class[self.class][..]; + let r = arr[wrap(self.non_pref_idx + self.offset_non_pref, arr.len())]; + self.non_pref_idx += 1; + if Some(r) == self.hints[0] || Some(r) == self.hints[1] { + continue; + } + return Some(r); + } + None + } +} diff --git a/src/ion/requirement.rs b/src/ion/requirement.rs new file mode 100644 index 00000000..1540fe4e --- /dev/null +++ b/src/ion/requirement.rs @@ -0,0 +1,94 @@ +//! Requirements computation. 
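+//!
+//! Requirements merge like a small lattice: `Unknown` is the identity,
+//! `Conflict` is absorbing, and compatible concrete requirements merge
+//! to the stricter of the two. For instance (with `p3` standing in for
+//! some int-class `PReg`):
+//!
+//! ```ignore
+//! assert_eq!(Requirement::Register(RegClass::Int).merge(Requirement::Fixed(p3)),
+//!            Requirement::Fixed(p3));
+//! assert_eq!(Requirement::Register(RegClass::Int).merge(Requirement::Stack(RegClass::Int)),
+//!            Requirement::Conflict);
+//! ```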
+ +use super::{Env, LiveBundleIndex}; +use crate::{Function, Operand, OperandConstraint, PReg, RegClass}; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Requirement { + Unknown, + Fixed(PReg), + Register(RegClass), + Stack(RegClass), + Any(RegClass), + Conflict, +} +impl Requirement { + #[inline(always)] + pub fn class(self) -> RegClass { + match self { + Requirement::Unknown => panic!("No class for unknown Requirement"), + Requirement::Fixed(preg) => preg.class(), + Requirement::Register(class) | Requirement::Any(class) | Requirement::Stack(class) => { + class + } + Requirement::Conflict => panic!("No class for conflicted Requirement"), + } + } + #[inline(always)] + pub fn merge(self, other: Requirement) -> Requirement { + match (self, other) { + (Requirement::Unknown, other) | (other, Requirement::Unknown) => other, + (Requirement::Conflict, _) | (_, Requirement::Conflict) => Requirement::Conflict, + (other, Requirement::Any(rc)) | (Requirement::Any(rc), other) => { + if other.class() == rc { + other + } else { + Requirement::Conflict + } + } + (Requirement::Stack(rc1), Requirement::Stack(rc2)) => { + if rc1 == rc2 { + self + } else { + Requirement::Conflict + } + } + (Requirement::Register(rc), Requirement::Fixed(preg)) + | (Requirement::Fixed(preg), Requirement::Register(rc)) => { + if rc == preg.class() { + Requirement::Fixed(preg) + } else { + Requirement::Conflict + } + } + (Requirement::Register(rc1), Requirement::Register(rc2)) => { + if rc1 == rc2 { + self + } else { + Requirement::Conflict + } + } + (Requirement::Fixed(a), Requirement::Fixed(b)) if a == b => self, + _ => Requirement::Conflict, + } + } + #[inline(always)] + pub fn from_operand(op: Operand) -> Requirement { + match op.constraint() { + OperandConstraint::FixedReg(preg) => Requirement::Fixed(preg), + OperandConstraint::Reg | OperandConstraint::Reuse(_) => { + Requirement::Register(op.class()) + } + OperandConstraint::Stack => Requirement::Stack(op.class()), + _ => Requirement::Any(op.class()), + } + } +} + +impl<'a, F: Function> Env<'a, F> { + pub fn compute_requirement(&self, bundle: LiveBundleIndex) -> Requirement { + let mut req = Requirement::Unknown; + log::trace!("compute_requirement: {:?}", bundle); + for entry in &self.bundles[bundle.index()].ranges { + log::trace!(" -> LR {:?}", entry.index); + for u in &self.ranges[entry.index.index()].uses { + log::trace!(" -> use {:?}", u); + let r = Requirement::from_operand(u.operand); + req = req.merge(r); + log::trace!(" -> req {:?}", req); + } + } + log::trace!(" -> final: {:?}", req); + req + } +} diff --git a/src/ion/spill.rs b/src/ion/spill.rs new file mode 100644 index 00000000..e625f89a --- /dev/null +++ b/src/ion/spill.rs @@ -0,0 +1,218 @@ +/* + * The following license applies to this file, which was initially + * derived from the files `js/src/jit/BacktrackingAllocator.h` and + * `js/src/jit/BacktrackingAllocator.cpp` in Mozilla Firefox: + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Since the initial port, the design has been substantially evolved + * and optimized. + */ + +//! Spillslot allocation. 
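+//!
+//! Spillsets that remain spilled are packed into spillslots: slots are
+//! kept in per-size lists and reused when their committed ranges do not
+//! overlap the spillset's ranges. Slot indices come from a bump
+//! allocator that aligns up to the (power-of-two) slot size; for
+//! example, with the running offset at 3 and a size-2 request,
+//! `(3 + 2 - 1) & !(2 - 1) = 4`, so the slot covers indices 4..6 and the
+//! offset advances to 6.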
+ +use super::{ + AllocRegResult, Env, LiveRangeKey, LiveRangeSet, PReg, PRegIndex, RegClass, RegTraversalIter, + SpillSetIndex, SpillSlotData, SpillSlotIndex, SpillSlotList, +}; +use crate::{Allocation, Function, SpillSlot}; + +impl<'a, F: Function> Env<'a, F> { + pub fn try_allocating_regs_for_spilled_bundles(&mut self) { + log::trace!("allocating regs for spilled bundles"); + for i in 0..self.spilled_bundles.len() { + let bundle = self.spilled_bundles[i]; // don't borrow self + + let class = self.spillsets[self.bundles[bundle.index()].spillset.index()].class; + let hint = self.spillsets[self.bundles[bundle.index()].spillset.index()].reg_hint; + + // This may be an empty-range bundle whose ranges are not + // sorted; sort all range-lists again here. + self.bundles[bundle.index()] + .ranges + .sort_unstable_by_key(|entry| entry.range.from); + + let mut success = false; + self.stats.spill_bundle_reg_probes += 1; + for preg in + RegTraversalIter::new(self.env, class, hint, PReg::invalid(), bundle.index(), None) + { + log::trace!("trying bundle {:?} to preg {:?}", bundle, preg); + let preg_idx = PRegIndex::new(preg.index()); + if let AllocRegResult::Allocated(_) = + self.try_to_allocate_bundle_to_reg(bundle, preg_idx, None) + { + self.stats.spill_bundle_reg_success += 1; + success = true; + break; + } + } + if !success { + log::trace!( + "spilling bundle {:?}: marking spillset {:?} as required", + bundle, + self.bundles[bundle.index()].spillset + ); + self.spillsets[self.bundles[bundle.index()].spillset.index()].required = true; + } + } + } + + pub fn spillslot_can_fit_spillset( + &mut self, + spillslot: SpillSlotIndex, + spillset: SpillSetIndex, + ) -> bool { + for &vreg in &self.spillsets[spillset.index()].vregs { + for entry in &self.vregs[vreg.index()].ranges { + if self.spillslots[spillslot.index()] + .ranges + .btree + .contains_key(&LiveRangeKey::from_range(&entry.range)) + { + return false; + } + } + } + true + } + + pub fn allocate_spillset_to_spillslot( + &mut self, + spillset: SpillSetIndex, + spillslot: SpillSlotIndex, + ) { + self.spillsets[spillset.index()].slot = spillslot; + for i in 0..self.spillsets[spillset.index()].vregs.len() { + // don't borrow self + let vreg = self.spillsets[spillset.index()].vregs[i]; + log::trace!( + "spillslot {:?} alloc'ed to spillset {:?}: vreg {:?}", + spillslot, + spillset, + vreg, + ); + for entry in &self.vregs[vreg.index()].ranges { + log::trace!( + "spillslot {:?} getting range {:?} from LR {:?} from vreg {:?}", + spillslot, + entry.range, + entry.index, + vreg, + ); + self.spillslots[spillslot.index()] + .ranges + .btree + .insert(LiveRangeKey::from_range(&entry.range), entry.index); + } + } + } + + pub fn allocate_spillslots(&mut self) { + for spillset in 0..self.spillsets.len() { + log::trace!("allocate spillslot: {}", spillset); + let spillset = SpillSetIndex::new(spillset); + if !self.spillsets[spillset.index()].required { + continue; + } + // Get or create the spillslot list for this size. + let size = self.spillsets[spillset.index()].size as usize; + if size >= self.slots_by_size.len() { + self.slots_by_size.resize( + size + 1, + SpillSlotList { + first_spillslot: SpillSlotIndex::invalid(), + last_spillslot: SpillSlotIndex::invalid(), + }, + ); + } + // Try a few existing spillslots. 
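+            // (The probe below is bounded: at most 10 slots from the
+            // front of the size-class list are tried, and each rejected
+            // slot is rotated to the back of the list so later queries
+            // probe different candidates first.)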
+ let mut spillslot_iter = self.slots_by_size[size].first_spillslot; + let mut first_slot = SpillSlotIndex::invalid(); + let mut prev = SpillSlotIndex::invalid(); + let mut success = false; + for _attempt in 0..10 { + if spillslot_iter.is_invalid() { + break; + } + if spillslot_iter == first_slot { + // We've started looking at slots we placed at the end; end search. + break; + } + if first_slot.is_invalid() { + first_slot = spillslot_iter; + } + + if self.spillslot_can_fit_spillset(spillslot_iter, spillset) { + self.allocate_spillset_to_spillslot(spillset, spillslot_iter); + success = true; + break; + } + // Remove the slot and place it at the end of the respective list. + let next = self.spillslots[spillslot_iter.index()].next_spillslot; + if prev.is_valid() { + self.spillslots[prev.index()].next_spillslot = next; + } else { + self.slots_by_size[size].first_spillslot = next; + } + if !next.is_valid() { + self.slots_by_size[size].last_spillslot = prev; + } + + let last = self.slots_by_size[size].last_spillslot; + if last.is_valid() { + self.spillslots[last.index()].next_spillslot = spillslot_iter; + } else { + self.slots_by_size[size].first_spillslot = spillslot_iter; + } + self.slots_by_size[size].last_spillslot = spillslot_iter; + + prev = spillslot_iter; + spillslot_iter = next; + } + + if !success { + // Allocate a new spillslot. + let spillslot = SpillSlotIndex::new(self.spillslots.len()); + let next = self.slots_by_size[size].first_spillslot; + self.spillslots.push(SpillSlotData { + ranges: LiveRangeSet::new(), + next_spillslot: next, + alloc: Allocation::none(), + class: self.spillsets[spillset.index()].class, + }); + self.slots_by_size[size].first_spillslot = spillslot; + if !next.is_valid() { + self.slots_by_size[size].last_spillslot = spillslot; + } + + self.allocate_spillset_to_spillslot(spillset, spillslot); + } + } + + // Assign actual slot indices to spillslots. + for i in 0..self.spillslots.len() { + self.spillslots[i].alloc = self.allocate_spillslot(self.spillslots[i].class); + } + + log::trace!("spillslot allocator done"); + } + + pub fn allocate_spillslot(&mut self, class: RegClass) -> Allocation { + let size = self.func.spillslot_size(class) as u32; + let mut offset = self.num_spillslots; + // Align up to `size`. + debug_assert!(size.is_power_of_two()); + offset = (offset + size - 1) & !(size - 1); + let slot = if self.func.multi_spillslot_named_by_last_slot() { + offset + size - 1 + } else { + offset + }; + offset += size; + self.num_spillslots = offset; + Allocation::stack(SpillSlot::new(slot as usize, class)) + } +} diff --git a/src/ion/stackmap.rs b/src/ion/stackmap.rs new file mode 100644 index 00000000..c48475cc --- /dev/null +++ b/src/ion/stackmap.rs @@ -0,0 +1,73 @@ +/* + * The following license applies to this file, which was initially + * derived from the files `js/src/jit/BacktrackingAllocator.h` and + * `js/src/jit/BacktrackingAllocator.cpp` in Mozilla Firefox: + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Since the initial port, the design has been substantially evolved + * and optimized. + */ + +//! Stackmap computation. + +use super::{Env, ProgPoint, VRegIndex}; +use crate::Function; + +impl<'a, F: Function> Env<'a, F> { + pub fn compute_stackmaps(&mut self) { + // For each ref-typed vreg, iterate through ranges and find + // safepoints in-range. Add the SpillSlot to the stackmap. 
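+        // Sketch with hypothetical numbers: with safepoints at insts 4
+        // and 9 and ranges [2..6) and [8..12) for a ref-typed vreg, the
+        // two sorted sequences are walked together below and we emit
+        // (inst4, slot of range [2..6)) and (inst9, slot of range
+        // [8..12)).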
+ + if self.func.reftype_vregs().is_empty() { + return; + } + + // Given `safepoints_per_vreg` from the liveness computation, + // all we have to do is, for each vreg in this map, step + // through the LiveRanges along with a sorted list of + // safepoints; and for each safepoint in the current range, + // emit the allocation into the `safepoint_slots` list. + + log::trace!("safepoints_per_vreg = {:?}", self.safepoints_per_vreg); + + for vreg in self.func.reftype_vregs() { + log::trace!("generating safepoint info for vreg {}", vreg); + let vreg = VRegIndex::new(vreg.vreg()); + let mut safepoints: Vec = self + .safepoints_per_vreg + .get(&vreg.index()) + .unwrap() + .iter() + .map(|&inst| ProgPoint::before(inst)) + .collect(); + safepoints.sort_unstable(); + log::trace!(" -> live over safepoints: {:?}", safepoints); + + let mut safepoint_idx = 0; + for entry in &self.vregs[vreg.index()].ranges { + let range = entry.range; + let alloc = self.get_alloc_for_range(entry.index); + log::trace!(" -> range {:?}: alloc {}", range, alloc); + while safepoint_idx < safepoints.len() && safepoints[safepoint_idx] < range.to { + if safepoints[safepoint_idx] < range.from { + safepoint_idx += 1; + continue; + } + log::trace!(" -> covers safepoint {:?}", safepoints[safepoint_idx]); + + let slot = alloc + .as_stack() + .expect("Reference-typed value not in spillslot at safepoint"); + self.safepoint_slots.push((safepoints[safepoint_idx], slot)); + safepoint_idx += 1; + } + } + } + + self.safepoint_slots.sort_unstable(); + log::trace!("final safepoint slots info: {:?}", self.safepoint_slots); + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 00000000..19809089 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,1262 @@ +/* + * The following license applies to this file, which derives many + * details (register and constraint definitions, for example) from the + * files `BacktrackingAllocator.h`, `BacktrackingAllocator.cpp`, + * `LIR.h`, and possibly definitions in other related files in + * `js/src/jit/`: + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#![allow(dead_code)] + +pub(crate) mod cfg; +pub(crate) mod domtree; +pub mod indexset; +pub(crate) mod ion; +pub(crate) mod moves; +pub(crate) mod postorder; +pub(crate) mod ssa; + +#[macro_use] +mod index; +pub use index::{Block, Inst, InstRange, InstRangeIter}; + +pub mod checker; + +#[cfg(feature = "fuzzing")] +pub mod fuzzing; + +/// Register classes. +/// +/// Every value has a "register class", which is like a type at the +/// register-allocator level. Every register must belong to only one +/// class; i.e., they are disjoint. +/// +/// For tight bit-packing throughout our data structures, we support +/// only two classes, "int" and "float". This will usually be enough +/// on modern machines, as they have one class of general-purpose +/// integer registers of machine width (e.g. 64 bits), and another +/// class of float/vector registers used both for FP and for vector +/// operations. If needed, we could adjust bitpacking to allow for +/// more classes in the future. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum RegClass { + Int = 0, + Float = 1, +} + +/// A physical register. Contains a physical register number and a class. 
+/// +/// The `hw_enc` field contains the physical register number and is in +/// a logically separate index space per class; in other words, Int +/// register 0 is different than Float register 0. +/// +/// Because of bit-packed encodings throughout the implementation, +/// `hw_enc` must fit in 5 bits, i.e., at most 32 registers per class. +/// +/// The value returned by `index()`, in contrast, is in a single index +/// space shared by all classes, in order to enable uniform reasoning +/// about physical registers. This is done by putting the class bit at +/// the MSB, or equivalently, declaring that indices 0..31 are the 32 +/// integer registers and indices 32..63 are the 32 float registers. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct PReg { + hw_enc: u8, + class: RegClass, +} + +impl PReg { + pub const MAX_BITS: usize = 5; + pub const MAX: usize = (1 << Self::MAX_BITS) - 1; + pub const MAX_INDEX: usize = 1 << (Self::MAX_BITS + 1); // including RegClass bit + + /// Create a new PReg. The `hw_enc` range is 6 bits. + #[inline(always)] + pub const fn new(hw_enc: usize, class: RegClass) -> Self { + // We don't have const panics yet (rust-lang/rust#85194) so we + // need to use a little indexing trick here. We unfortunately + // can't use the `static-assertions` crate because we need + // this to work both for const `hw_enc` and for runtime + // values. + const HW_ENC_MUST_BE_IN_BOUNDS: &[bool; PReg::MAX + 1] = &[true; PReg::MAX + 1]; + let _ = HW_ENC_MUST_BE_IN_BOUNDS[hw_enc]; + + PReg { + hw_enc: hw_enc as u8, + class, + } + } + + /// The physical register number, as encoded by the ISA for the particular register class. + #[inline(always)] + pub fn hw_enc(self) -> usize { + let hw_enc = self.hw_enc as usize; + hw_enc + } + + /// The register class. + #[inline(always)] + pub fn class(self) -> RegClass { + self.class + } + + /// Get an index into the (not necessarily contiguous) index space of + /// all physical registers. Allows one to maintain an array of data for + /// all PRegs and index it efficiently. + #[inline(always)] + pub fn index(self) -> usize { + ((self.class as u8 as usize) << 5) | (self.hw_enc as usize) + } + + /// Construct a PReg from the value returned from `.index()`. + #[inline(always)] + pub fn from_index(index: usize) -> Self { + let class = (index >> 5) & 1; + let class = match class { + 0 => RegClass::Int, + 1 => RegClass::Float, + _ => unreachable!(), + }; + let index = index & Self::MAX; + PReg::new(index, class) + } + + /// Return the "invalid PReg", which can be used to initialize + /// data structures. + #[inline(always)] + pub fn invalid() -> Self { + PReg::new(Self::MAX, RegClass::Int) + } +} + +impl std::fmt::Debug for PReg { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "PReg(hw = {}, class = {:?}, index = {})", + self.hw_enc(), + self.class(), + self.index() + ) + } +} + +impl std::fmt::Display for PReg { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let class = match self.class() { + RegClass::Int => "i", + RegClass::Float => "f", + }; + write!(f, "p{}{}", self.hw_enc(), class) + } +} + +/// A virtual register. Contains a virtual register number and a +/// class. +/// +/// A virtual register ("vreg") corresponds to an SSA value for SSA +/// input, or just a register when we allow for non-SSA input. 
All +/// dataflow in the input program is specified via flow through a +/// virtual register; even uses of specially-constrained locations, +/// such as fixed physical registers, are done by using vregs, because +/// we need the vreg's live range in order to track the use of that +/// location. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct VReg { + bits: u32, +} + +impl VReg { + pub const MAX_BITS: usize = 20; + pub const MAX: usize = (1 << Self::MAX_BITS) - 1; + + #[inline(always)] + pub const fn new(virt_reg: usize, class: RegClass) -> Self { + // See comment in `PReg::new()`: we are emulating a const + // assert here until const panics are stable. + const VIRT_REG_MUST_BE_IN_BOUNDS: &[bool; VReg::MAX + 1] = &[true; VReg::MAX + 1]; + let _ = VIRT_REG_MUST_BE_IN_BOUNDS[virt_reg]; + + VReg { + bits: ((virt_reg as u32) << 1) | (class as u8 as u32), + } + } + + #[inline(always)] + pub fn vreg(self) -> usize { + let vreg = (self.bits >> 1) as usize; + vreg + } + + #[inline(always)] + pub fn class(self) -> RegClass { + match self.bits & 1 { + 0 => RegClass::Int, + 1 => RegClass::Float, + _ => unreachable!(), + } + } + + #[inline(always)] + pub fn invalid() -> Self { + VReg::new(Self::MAX, RegClass::Int) + } +} + +impl std::fmt::Debug for VReg { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "VReg(vreg = {}, class = {:?})", + self.vreg(), + self.class() + ) + } +} + +impl std::fmt::Display for VReg { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "v{}", self.vreg()) + } +} + +/// A spillslot is a space in the stackframe used by the allocator to +/// temporarily store a value. +/// +/// The allocator is responsible for allocating indices in this space, +/// and will specify how many spillslots have been used when the +/// allocation is completed. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct SpillSlot { + bits: u32, +} + +impl SpillSlot { + /// Create a new SpillSlot of a given class. + #[inline(always)] + pub fn new(slot: usize, class: RegClass) -> Self { + assert!(slot < (1 << 24)); + SpillSlot { + bits: (slot as u32) | (class as u8 as u32) << 24, + } + } + + /// Get the spillslot index for this spillslot. + #[inline(always)] + pub fn index(self) -> usize { + (self.bits & 0x00ffffff) as usize + } + + /// Get the class for this spillslot. + #[inline(always)] + pub fn class(self) -> RegClass { + match (self.bits >> 24) as u8 { + 0 => RegClass::Int, + 1 => RegClass::Float, + _ => unreachable!(), + } + } + + /// Get the spillslot `offset` slots away. + #[inline(always)] + pub fn plus(self, offset: usize) -> Self { + SpillSlot::new(self.index() + offset, self.class()) + } + + /// Get the invalid spillslot, used for initializing data structures. + #[inline(always)] + pub fn invalid() -> Self { + SpillSlot { bits: 0xffff_ffff } + } + + /// Is this the invalid spillslot? + #[inline(always)] + pub fn is_invalid(self) -> bool { + self == Self::invalid() + } + + /// Is this a valid spillslot (not `SpillSlot::invalid()`)? + #[inline(always)] + pub fn is_valid(self) -> bool { + self != Self::invalid() + } +} + +impl std::fmt::Display for SpillSlot { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "stack{}", self.index()) + } +} + +/// An `OperandConstraint` specifies where a vreg's value must be +/// placed at a particular reference to that vreg via an +/// `Operand`. 
The constraint may be loose -- "any register of a given +/// class", for example -- or very specific, such as "this particular +/// physical register". The allocator's result will always satisfy all +/// given constraints; however, if the input has a combination of +/// constraints that are impossible to satisfy, then allocation may +/// fail or the allocator may panic (providing impossible constraints +/// is usually a programming error in the client, rather than a +/// function of bad input). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum OperandConstraint { + /// Any location is fine (register or stack slot). + Any, + /// Operand must be in a register. Register is read-only for Uses. + Reg, + /// Operand must be on the stack. + Stack, + /// Operand must be in a fixed register. + FixedReg(PReg), + /// On defs only: reuse a use's register. + Reuse(usize), +} + +impl std::fmt::Display for OperandConstraint { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Self::Any => write!(f, "any"), + Self::Reg => write!(f, "reg"), + Self::Stack => write!(f, "stack"), + Self::FixedReg(preg) => write!(f, "fixed({})", preg), + Self::Reuse(idx) => write!(f, "reuse({})", idx), + } + } +} + +/// The "kind" of the operand: whether it reads a vreg (Use), writes a +/// vreg (Def), or reads and then writes (Mod, for "modify"). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum OperandKind { + Def = 0, + Mod = 1, + Use = 2, +} + +/// The "position" of the operand: where it has its read/write +/// effects. These are positions "in" the instruction, and "early" and +/// "late" are relative to the instruction's main effect or +/// computation. In other words, the allocator assumes that the +/// instruction (i) performs all reads and writes of "early" operands, +/// (ii) does its work, and (iii) performs all reads and writes of its +/// "late" operands. +/// +/// A "write" (def) at "early" or a "read" (use) at "late" may be +/// slightly nonsensical, given the above, if the read is necessary +/// for the computation or the write is a result of it. A way to think +/// of it is that the value (even if a result of execution) *could* +/// have been read or written at the given location without causing +/// any register-usage conflicts. In other words, these write-early or +/// use-late operands ensure that the particular allocations are valid +/// for longer than usual and that a register is not reused between +/// the use (normally complete at "Early") and the def (normally +/// starting at "Late"). See `Operand` for more. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum OperandPos { + Early = 0, + Late = 1, +} + +/// An `Operand` encodes everything about a mention of a register in +/// an instruction: virtual register number, and any constraint that +/// applies to the register at this program point. +/// +/// An Operand may be a use or def (this corresponds to `LUse` and +/// `LAllocation` in Ion). +/// +/// Generally, regalloc2 considers operands to have their effects at +/// one of two points that exist in an instruction: "Early" or +/// "Late". All operands at a given program-point are assigned +/// non-conflicting locations based on their constraints. Each operand +/// has a "kind", one of use/def/mod, corresponding to +/// read/write/read-write, respectively. +/// +/// Usually, an instruction's inputs will be "early uses" and outputs +/// will be "late defs", though there are valid use-cases for other +/// combinations too. 
For example, a single "instruction" seen by the +/// regalloc that lowers into multiple machine instructions and reads +/// some of its inputs after it starts to write outputs must either +/// make those input(s) "late uses" or those output(s) "early defs" so +/// that the conflict (overlap) is properly accounted for. See +/// comments on the constructors below for more. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Operand { + /// Bit-pack into 32 bits. + /// + /// constraint:3 kind:2 pos:1 class:1 preg:5 vreg:20 + /// + /// where `constraint` is an `OperandConstraint`, `kind` is an + /// `OperandKind`, `pos` is an `OperandPos`, `class` is a + /// `RegClass`, `preg` is a `PReg` or an index for a reused-input + /// constraint, and `vreg` is a vreg index. + bits: u32, +} + +impl Operand { + /// Construct a new operand. + #[inline(always)] + pub fn new( + vreg: VReg, + constraint: OperandConstraint, + kind: OperandKind, + pos: OperandPos, + ) -> Self { + let (preg_field, constraint_field): (u32, u32) = match constraint { + OperandConstraint::Any => (0, 0), + OperandConstraint::Reg => (0, 1), + OperandConstraint::Stack => (0, 2), + OperandConstraint::FixedReg(preg) => { + assert_eq!(preg.class(), vreg.class()); + (preg.hw_enc() as u32, 3) + } + OperandConstraint::Reuse(which) => { + assert!(which <= PReg::MAX); + (which as u32, 4) + } + }; + let class_field = vreg.class() as u8 as u32; + let pos_field = pos as u8 as u32; + let kind_field = kind as u8 as u32; + Operand { + bits: vreg.vreg() as u32 + | (preg_field << 20) + | (class_field << 25) + | (pos_field << 26) + | (kind_field << 27) + | (constraint_field << 29), + } + } + + /// Create an `Operand` that designates a use of a VReg that must + /// be in a register, and that is used at the "before" point, + /// i.e., can be overwritten by a result. + #[inline(always)] + pub fn reg_use(vreg: VReg) -> Self { + Operand::new( + vreg, + OperandConstraint::Reg, + OperandKind::Use, + OperandPos::Early, + ) + } + + /// Create an `Operand` that designates a use of a VReg that must + /// be in a register, and that is used up until the "after" point, + /// i.e., must not conflict with any results. + #[inline(always)] + pub fn reg_use_at_end(vreg: VReg) -> Self { + Operand::new( + vreg, + OperandConstraint::Reg, + OperandKind::Use, + OperandPos::Late, + ) + } + + /// Create an `Operand` that designates a definition of a VReg + /// that must be in a register, and that occurs at the "after" + /// point, i.e. may reuse a register that carried a use into this + /// instruction. + #[inline(always)] + pub fn reg_def(vreg: VReg) -> Self { + Operand::new( + vreg, + OperandConstraint::Reg, + OperandKind::Def, + OperandPos::Late, + ) + } + + /// Create an `Operand` that designates a definition of a VReg + /// that must be in a register, and that occurs early at the + /// "before" point, i.e., must not conflict with any input to the + /// instruction. + #[inline(always)] + pub fn reg_def_at_start(vreg: VReg) -> Self { + Operand::new( + vreg, + OperandConstraint::Reg, + OperandKind::Def, + OperandPos::Early, + ) + } + + /// Create an `Operand` that designates a def (and use) of a + /// temporary *within* the instruction. This register is assumed + /// to be written by the instruction, and will not conflict with + /// any input or output, but should not be used after the + /// instruction completes. 
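+    ///
+    /// A minimal sketch of what this currently expands to (the body
+    /// below notes that a temp is treated as a def-at-start):
+    ///
+    /// ```
+    /// use regalloc2::{Operand, OperandConstraint, OperandKind, OperandPos, RegClass, VReg};
+    /// let tmp = VReg::new(42, RegClass::Int);
+    /// let op = Operand::reg_temp(tmp);
+    /// assert_eq!(op.kind(), OperandKind::Def);
+    /// assert_eq!(op.pos(), OperandPos::Early);
+    /// assert_eq!(op.constraint(), OperandConstraint::Reg);
+    /// ```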
+ /// + /// Note that within a single instruction, the dedicated scratch + /// register (as specified in the `MachineEnv`) is also always + /// available for use. The register allocator may use the register + /// *between* instructions in order to implement certain sequences + /// of moves, but will never hold a value live in the scratch + /// register across an instruction. + #[inline(always)] + pub fn reg_temp(vreg: VReg) -> Self { + // For now a temp is equivalent to a def-at-start operand, + // which gives the desired semantics but does not enforce the + // "not reused later" constraint. + Operand::new( + vreg, + OperandConstraint::Reg, + OperandKind::Def, + OperandPos::Early, + ) + } + + /// Create an `Operand` that designates a def of a vreg that must + /// reuse the register assigned to an input to the + /// instruction. The input is identified by `idx` (is the `idx`th + /// `Operand` for the instruction) and must be constraint to a + /// register, i.e., be the result of `Operand::reg_use(vreg)`. + #[inline(always)] + pub fn reg_reuse_def(vreg: VReg, idx: usize) -> Self { + Operand::new( + vreg, + OperandConstraint::Reuse(idx), + OperandKind::Def, + OperandPos::Late, + ) + } + + /// Create an `Operand` that designates a use of a vreg and + /// ensures that it is placed in the given, fixed PReg at the + /// use. It is guaranteed that the `Allocation` resulting for this + /// operand will be `preg`. + #[inline(always)] + pub fn reg_fixed_use(vreg: VReg, preg: PReg) -> Self { + Operand::new( + vreg, + OperandConstraint::FixedReg(preg), + OperandKind::Use, + OperandPos::Early, + ) + } + + /// Create an `Operand` that designates a def of a vreg and + /// ensures that it is placed in the given, fixed PReg at the + /// def. It is guaranteed that the `Allocation` resulting for this + /// operand will be `preg`. + #[inline(always)] + pub fn reg_fixed_def(vreg: VReg, preg: PReg) -> Self { + Operand::new( + vreg, + OperandConstraint::FixedReg(preg), + OperandKind::Def, + OperandPos::Late, + ) + } + + /// Get the virtual register designated by an operand. Every + /// operand must name some virtual register, even if it constrains + /// the operand to a fixed physical register as well; the vregs + /// are used to track dataflow. + #[inline(always)] + pub fn vreg(self) -> VReg { + let vreg_idx = ((self.bits as usize) & VReg::MAX) as usize; + VReg::new(vreg_idx, self.class()) + } + + /// Get the register class used by this operand. + #[inline(always)] + pub fn class(self) -> RegClass { + let class_field = (self.bits >> 25) & 1; + match class_field { + 0 => RegClass::Int, + 1 => RegClass::Float, + _ => unreachable!(), + } + } + + /// Get the "kind" of this operand: a definition (write), a use + /// (read), or a "mod" / modify (a read followed by a write). + #[inline(always)] + pub fn kind(self) -> OperandKind { + let kind_field = (self.bits >> 27) & 3; + match kind_field { + 0 => OperandKind::Def, + 1 => OperandKind::Mod, + 2 => OperandKind::Use, + _ => unreachable!(), + } + } + + /// Get the "position" of this operand, i.e., where its read + /// and/or write occurs: either before the instruction executes, + /// or after it does. Ordinarily, uses occur at "before" and defs + /// at "after", though there are cases where this is not true. 
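+    ///
+    /// A minimal sketch of the ordinary case described above:
+    ///
+    /// ```
+    /// use regalloc2::{Operand, OperandPos, RegClass, VReg};
+    /// let v = VReg::new(7, RegClass::Int);
+    /// assert_eq!(Operand::reg_use(v).pos(), OperandPos::Early);
+    /// assert_eq!(Operand::reg_def(v).pos(), OperandPos::Late);
+    /// ```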
+ #[inline(always)] + pub fn pos(self) -> OperandPos { + let pos_field = (self.bits >> 26) & 1; + match pos_field { + 0 => OperandPos::Early, + 1 => OperandPos::Late, + _ => unreachable!(), + } + } + + /// Get the "constraint" of this operand, i.e., what requirements + /// its allocation must fulfill. + #[inline(always)] + pub fn constraint(self) -> OperandConstraint { + let constraint_field = (self.bits >> 29) & 7; + let preg_field = ((self.bits >> 20) as usize) & PReg::MAX; + match constraint_field { + 0 => OperandConstraint::Any, + 1 => OperandConstraint::Reg, + 2 => OperandConstraint::Stack, + 3 => OperandConstraint::FixedReg(PReg::new(preg_field, self.class())), + 4 => OperandConstraint::Reuse(preg_field), + _ => unreachable!(), + } + } + + /// Get the raw 32-bit encoding of this operand's fields. + #[inline(always)] + pub fn bits(self) -> u32 { + self.bits + } + + /// Construct an `Operand` from the raw 32-bit encoding returned + /// from `bits()`. + #[inline(always)] + pub fn from_bits(bits: u32) -> Self { + debug_assert!(bits >> 29 <= 4); + Operand { bits } + } +} + +impl std::fmt::Debug for Operand { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + std::fmt::Display::fmt(self, f) + } +} + +impl std::fmt::Display for Operand { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match (self.kind(), self.pos()) { + (OperandKind::Def, OperandPos::Late) + | (OperandKind::Mod | OperandKind::Use, OperandPos::Early) => { + write!(f, "{:?}", self.kind())?; + } + _ => { + write!(f, "{:?}@{:?}", self.kind(), self.pos())?; + } + } + write!( + f, + ": {}{} {}", + self.vreg(), + match self.class() { + RegClass::Int => "i", + RegClass::Float => "f", + }, + self.constraint() + ) + } +} + +/// An Allocation represents the end result of regalloc for an +/// Operand. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Allocation { + /// Bit-pack in 32 bits. + /// + /// kind:3 unused:1 index:28 + bits: u32, +} + +impl std::fmt::Debug for Allocation { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + std::fmt::Display::fmt(self, f) + } +} + +impl std::fmt::Display for Allocation { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self.kind() { + AllocationKind::None => write!(f, "none"), + AllocationKind::Reg => write!(f, "{}", self.as_reg().unwrap()), + AllocationKind::Stack => write!(f, "{}", self.as_stack().unwrap()), + } + } +} + +impl Allocation { + /// Construct a new Allocation. + #[inline(always)] + pub(crate) fn new(kind: AllocationKind, index: usize) -> Self { + assert!(index < (1 << 28)); + Self { + bits: ((kind as u8 as u32) << 29) | (index as u32), + } + } + + /// Get the "none" allocation, which is distinct from the other + /// possibilities and is used to initialize data structures. + #[inline(always)] + pub fn none() -> Allocation { + Allocation::new(AllocationKind::None, 0) + } + + /// Create an allocation into a register. + #[inline(always)] + pub fn reg(preg: PReg) -> Allocation { + Allocation::new(AllocationKind::Reg, preg.index()) + } + + /// Create an allocation into a spillslot. + #[inline(always)] + pub fn stack(slot: SpillSlot) -> Allocation { + Allocation::new(AllocationKind::Stack, slot.bits as usize) + } + + /// Get the allocation's "kind": none, register, or stack (spillslot). 
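+    ///
+    /// A minimal sketch of the three cases:
+    ///
+    /// ```
+    /// use regalloc2::{Allocation, AllocationKind, PReg, RegClass, SpillSlot};
+    /// assert_eq!(Allocation::none().kind(), AllocationKind::None);
+    /// assert_eq!(Allocation::reg(PReg::new(3, RegClass::Int)).kind(), AllocationKind::Reg);
+    /// assert_eq!(Allocation::stack(SpillSlot::new(0, RegClass::Int)).kind(), AllocationKind::Stack);
+    /// ```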
+ #[inline(always)] + pub fn kind(self) -> AllocationKind { + match (self.bits >> 29) & 7 { + 0 => AllocationKind::None, + 1 => AllocationKind::Reg, + 2 => AllocationKind::Stack, + _ => unreachable!(), + } + } + + /// Is the allocation "none"? + #[inline(always)] + pub fn is_none(self) -> bool { + self.kind() == AllocationKind::None + } + + /// Is the allocation not "none"? + #[inline(always)] + pub fn is_some(self) -> bool { + self.kind() != AllocationKind::None + } + + /// Is the allocation a register? + #[inline(always)] + pub fn is_reg(self) -> bool { + self.kind() == AllocationKind::Reg + } + + /// Is the allocation on the stack (a spillslot)? + #[inline(always)] + pub fn is_stack(self) -> bool { + self.kind() == AllocationKind::Stack + } + + /// Get the index of the spillslot or register. If register, this + /// is an index that can be used by `PReg::from_index()`. + #[inline(always)] + pub fn index(self) -> usize { + (self.bits & ((1 << 28) - 1)) as usize + } + + /// Get the allocation as a physical register, if any. + #[inline(always)] + pub fn as_reg(self) -> Option { + if self.kind() == AllocationKind::Reg { + Some(PReg::from_index(self.index())) + } else { + None + } + } + + /// Get the allocation as a spillslot, if any. + #[inline(always)] + pub fn as_stack(self) -> Option { + if self.kind() == AllocationKind::Stack { + Some(SpillSlot { + bits: self.index() as u32, + }) + } else { + None + } + } + + /// Get the raw bits for the packed encoding of this allocation. + #[inline(always)] + pub fn bits(self) -> u32 { + self.bits + } + + /// Construct an allocation from its packed encoding. + #[inline(always)] + pub fn from_bits(bits: u32) -> Self { + debug_assert!(bits >> 29 >= 5); + Self { bits } + } +} + +/// An allocation is one of two "kinds" (or "none"): register or +/// spillslot/stack. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[repr(u8)] +pub enum AllocationKind { + None = 0, + Reg = 1, + Stack = 2, +} + +impl Allocation { + /// Get the register class of an allocation's value. + #[inline(always)] + pub fn class(self) -> RegClass { + match self.kind() { + AllocationKind::None => panic!("Allocation::None has no class"), + AllocationKind::Reg => self.as_reg().unwrap().class(), + AllocationKind::Stack => self.as_stack().unwrap().class(), + } + } +} + +/// A trait defined by the regalloc client to provide access to its +/// machine-instruction / CFG representation. +/// +/// (This trait's design is inspired by, and derives heavily from, the +/// trait of the same name in regalloc.rs.) +pub trait Function { + // ------------- + // CFG traversal + // ------------- + + /// How many instructions are there? + fn num_insts(&self) -> usize; + + /// How many blocks are there? + fn num_blocks(&self) -> usize; + + /// Get the index of the entry block. + fn entry_block(&self) -> Block; + + /// Provide the range of instruction indices contained in each block. + fn block_insns(&self, block: Block) -> InstRange; + + /// Get CFG successors for a given block. + fn block_succs(&self, block: Block) -> &[Block]; + + /// Get the CFG predecessors for a given block. + fn block_preds(&self, block: Block) -> &[Block]; + + /// Get the block parameters for a given block. + fn block_params(&self, block: Block) -> &[VReg]; + + /// Determine whether an instruction is a call instruction. This is used + /// only for splitting heuristics. + fn is_call(&self, insn: Inst) -> bool; + + /// Determine whether an instruction is a return instruction. 
+ fn is_ret(&self, insn: Inst) -> bool; + + /// Determine whether an instruction is the end-of-block + /// branch. If so, its operands at the indices given by + /// `branch_blockparam_arg_offset()` below *must* be the block + /// parameters for each of its block's `block_succs` successor + /// blocks, in order. + fn is_branch(&self, insn: Inst) -> bool; + + /// If `insn` is a branch at the end of `block`, returns the + /// operand index at which outgoing blockparam arguments are + /// found. Starting at this index, blockparam arguments for each + /// successor block's blockparams, in order, must be found. + /// + /// It is an error if `self.inst_operands(insn).len() - + /// self.branch_blockparam_arg_offset(insn)` is not exactly equal + /// to the sum of blockparam counts for all successor blocks. + fn branch_blockparam_arg_offset(&self, block: Block, insn: Inst) -> usize; + + /// Determine whether an instruction requires all reference-typed + /// values to be placed onto the stack. For these instructions, + /// stackmaps will be provided. + /// + /// This is usually associated with the concept of a "safepoint", + /// though strictly speaking, a safepoint could also support + /// reference-typed values in registers if there were a way to + /// denote their locations and if this were acceptable to the + /// client. Usually garbage-collector implementations want to see + /// roots on the stack, so we do that for now. + fn requires_refs_on_stack(&self, _: Inst) -> bool { + false + } + + /// Determine whether an instruction is a move; if so, return the + /// Operands for (src, dst). + fn is_move(&self, insn: Inst) -> Option<(Operand, Operand)>; + + // -------------------------- + // Instruction register slots + // -------------------------- + + /// Get the Operands for an instruction. + fn inst_operands(&self, insn: Inst) -> &[Operand]; + + /// Get the clobbers for an instruction; these are the registers + /// that, after the instruction has executed, hold values that are + /// arbitrary, separately from the usual outputs to the + /// instruction. It is invalid to read a register that has been + /// clobbered; the register allocator is free to assume that + /// clobbered registers are filled with garbage and available for + /// reuse. It will avoid storing any value in a clobbered register + /// that must be live across the instruction. + /// + /// Another way of seeing this is that a clobber is equivalent to + /// an "early def" of a fresh vreg that is not used anywhere else + /// in the program, with a fixed-register constraint that places + /// it in a given PReg chosen by the client prior to regalloc. + /// + /// Every register written by an instruction must either + /// correspond to (be assigned to) an Operand of kind `Def` or + /// `Mod`, or else must be a "clobber". + /// + /// This can be used to, for example, describe ABI-specified + /// registers that are not preserved by a call instruction, or + /// fixed physical registers written by an instruction but not + /// used as a vreg output, or fixed physical registers used as + /// temps within an instruction out of necessity. + fn inst_clobbers(&self, insn: Inst) -> &[PReg]; + + /// Get the number of `VReg` in use in this function. + fn num_vregs(&self) -> usize; + + /// Get the VRegs that are pointer/reference types. This has the + /// following effects for each such vreg: + /// + /// - At all safepoint instructions, the vreg will be in a + /// SpillSlot, not in a register. 
+ /// - The vreg *may not* be used as a register operand on + /// safepoint instructions: this is because a vreg can only live + /// in one place at a time. The client should copy the value to an + /// integer-typed vreg and use this to pass a pointer as an input + /// to a safepoint instruction (such as a function call). + /// - At all safepoint instructions, all live vregs' locations + /// will be included in a list in the `Output` below, so that + /// pointer-inspecting/updating functionality (such as a moving + /// garbage collector) may observe and edit their values. + fn reftype_vregs(&self) -> &[VReg] { + &[] + } + + /// Get the VRegs for which we should generate value-location + /// metadata for debugging purposes. This can be used to generate + /// e.g. DWARF with valid prgram-point ranges for each value + /// expression in a way that is more efficient than a post-hoc + /// analysis of the allocator's output. + /// + /// Each tuple is (vreg, inclusive_start, exclusive_end, + /// label). In the `Output` there will be (label, inclusive_start, + /// exclusive_end, alloc)` tuples. The ranges may not exactly + /// match -- specifically, the returned metadata may cover only a + /// subset of the requested ranges -- if the value is not live for + /// the entire requested ranges. + fn debug_value_labels(&self) -> &[(Inst, Inst, VReg, u32)] { + &[] + } + + /// Is the given vreg pinned to a preg? If so, every use of the + /// vreg is automatically assigned to the preg, and live-ranges of + /// the vreg allocate the preg exclusively (are not spilled + /// elsewhere). The user must take care not to have too many live + /// pinned vregs such that allocation is no longer possible; + /// liverange computation will check that this is the case (that + /// there are enough remaining allocatable pregs of every class to + /// hold all Reg-constrained operands). + fn is_pinned_vreg(&self, _: VReg) -> Option { + None + } + + /// Return a list of all pinned vregs. + fn pinned_vregs(&self) -> &[VReg] { + &[] + } + + // -------------- + // Spills/reloads + // -------------- + + /// How many logical spill slots does the given regclass require? E.g., on + /// a 64-bit machine, spill slots may nominally be 64-bit words, but a + /// 128-bit vector value will require two slots. The regalloc will always + /// align on this size. + /// + /// (This trait method's design and doc text derives from + /// regalloc.rs' trait of the same name.) + fn spillslot_size(&self, regclass: RegClass) -> usize; + + /// When providing a spillslot number for a multi-slot spillslot, + /// do we provide the first or the last? This is usually related + /// to which direction the stack grows and different clients may + /// have different preferences. + fn multi_spillslot_named_by_last_slot(&self) -> bool { + false + } +} + +/// A position before or after an instruction at which we can make an +/// edit. +/// +/// Note that this differs from `OperandPos` in that the former +/// describes specifically a constraint on an operand, while this +/// describes a program point. `OperandPos` could grow more options in +/// the future, for example if we decide that an "early write" or +/// "late read" phase makes sense, while `InstPosition` will always +/// describe these two insertion points. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[repr(u8)] +pub enum InstPosition { + Before = 0, + After = 1, +} + +/// A program point: a single point before or after a given instruction. 
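+///
+/// A minimal sketch of how program points relate to instructions,
+/// using the constructors defined below:
+///
+/// ```
+/// use regalloc2::{Inst, ProgPoint};
+/// let i5 = Inst::new(5);
+/// assert_eq!(ProgPoint::before(i5).next(), ProgPoint::after(i5));
+/// assert_eq!(ProgPoint::after(i5).next(), ProgPoint::before(Inst::new(6)));
+/// ```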
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct ProgPoint {
+    bits: u32,
+}
+
+impl std::fmt::Debug for ProgPoint {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(
+            f,
+            "progpoint{}{}",
+            self.inst().index(),
+            match self.pos() {
+                InstPosition::Before => "-pre",
+                InstPosition::After => "-post",
+            }
+        )
+    }
+}
+
+impl ProgPoint {
+    /// Create a new ProgPoint before or after the given instruction.
+    #[inline(always)]
+    pub fn new(inst: Inst, pos: InstPosition) -> Self {
+        let bits = ((inst.0 as u32) << 1) | (pos as u8 as u32);
+        Self { bits }
+    }
+
+    /// Create a new ProgPoint before the given instruction.
+    #[inline(always)]
+    pub fn before(inst: Inst) -> Self {
+        Self::new(inst, InstPosition::Before)
+    }
+
+    /// Create a new ProgPoint after the given instruction.
+    #[inline(always)]
+    pub fn after(inst: Inst) -> Self {
+        Self::new(inst, InstPosition::After)
+    }
+
+    /// Get the instruction that this ProgPoint is before or after.
+    #[inline(always)]
+    pub fn inst(self) -> Inst {
+        // Cast to i32 to do an arithmetic right-shift, which will
+        // preserve an `Inst::invalid()` (which is -1, or all-ones).
+        Inst::new(((self.bits as i32) >> 1) as usize)
+    }
+
+    /// Get the "position" (Before or After) relative to the
+    /// instruction.
+    #[inline(always)]
+    pub fn pos(self) -> InstPosition {
+        match self.bits & 1 {
+            0 => InstPosition::Before,
+            1 => InstPosition::After,
+            _ => unreachable!(),
+        }
+    }
+
+    /// Get the "next" program point: for After, this is the Before of
+    /// the next instruction, while for Before, this is After of the
+    /// same instruction.
+    #[inline(always)]
+    pub fn next(self) -> ProgPoint {
+        Self {
+            bits: self.bits + 1,
+        }
+    }
+
+    /// Get the "previous" program point, the inverse of `.next()`
+    /// above.
+    #[inline(always)]
+    pub fn prev(self) -> ProgPoint {
+        Self {
+            bits: self.bits - 1,
+        }
+    }
+
+    /// Convert to a raw encoding in 32 bits.
+    #[inline(always)]
+    pub fn to_index(self) -> u32 {
+        self.bits
+    }
+
+    /// Construct from the raw 32-bit encoding.
+    #[inline(always)]
+    pub fn from_index(index: u32) -> Self {
+        Self { bits: index }
+    }
+}
+
+/// An instruction to insert into the program to perform some data movement.
+#[derive(Clone, Debug)]
+pub enum Edit {
+    /// Move one allocation to another. Each allocation may be a
+    /// register or a stack slot (spillslot). However, stack-to-stack
+    /// moves will never be generated.
+    ///
+    /// `to_vreg`, if defined, is useful as metadata: it indicates
+    /// that the moved value is a def of a new vreg.
+    ///
+    /// `Move` edits will be generated even if src and dst allocation
+    /// are the same if the vreg changes; this allows proper metadata
+    /// tracking even when moves are elided.
+    Move {
+        from: Allocation,
+        to: Allocation,
+        to_vreg: Option<VReg>,
+    },
+
+    /// Define a particular Allocation to contain a particular VReg. Useful
+    /// for the checker.
+    DefAlloc { alloc: Allocation, vreg: VReg },
+}
+
+/// A machine environment tells the register allocator which registers
+/// are available to allocate and what register may be used as a
+/// scratch register for each class, and some other miscellaneous info
+/// as well.
+#[derive(Clone, Debug)]
+pub struct MachineEnv {
+    /// Physical registers. Every register that might be mentioned in
+    /// any constraint must be listed here, even if it is not
+    /// allocatable (present in one of
+    /// `{preferred,non_preferred}_regs_by_class`).
+    pub regs: Vec<PReg>,
+
+    /// Preferred physical registers for each class. These are the
+    /// registers that will be allocated first, if free.
+    pub preferred_regs_by_class: [Vec<PReg>; 2],
+
+    /// Non-preferred physical registers for each class. These are the
+    /// registers that will be allocated if a preferred register is
+    /// not available; using one of these is considered suboptimal,
+    /// but still better than spilling.
+    pub non_preferred_regs_by_class: [Vec<PReg>; 2],
+
+    /// One scratch register per class. This is needed to perform
+    /// moves between registers when cyclic move patterns occur. The
+    /// register should not be placed in either the preferred or
+    /// non-preferred list (i.e., it is not otherwise allocatable).
+    ///
+    /// Note that the register allocator will freely use this register
+    /// between instructions, but *within* the machine code generated
+    /// by a single (regalloc-level) instruction, the client is free
+    /// to use the scratch register. E.g., if one "instruction" causes
+    /// the emission of two machine-code instructions, this lowering
+    /// can use the scratch register between them.
+    pub scratch_by_class: [PReg; 2],
+}
+
+/// The output of the register allocator.
+#[derive(Clone, Debug)]
+pub struct Output {
+    /// How many spillslots are needed in the frame?
+    pub num_spillslots: usize,
+
+    /// Edits (insertions or removals). Guaranteed to be sorted by
+    /// program point.
+    pub edits: Vec<(ProgPoint, Edit)>,
+
+    /// Allocations for each operand. Mapping from instruction to
+    /// allocations provided by `inst_alloc_offsets` below.
+    pub allocs: Vec<Allocation>,
+
+    /// Allocation offset in `allocs` for each instruction.
+    pub inst_alloc_offsets: Vec<u32>,
+
+    /// Safepoint records: at a given program point, a reference-typed value lives in the given SpillSlot.
+    pub safepoint_slots: Vec<(ProgPoint, SpillSlot)>,
+
+    /// Debug info: a labeled value (as applied to vregs by
+    /// `Function::debug_value_labels()` on the input side) is located
+    /// in the given allocation from the first program point
+    /// (inclusive) to the second (exclusive). Guaranteed to be sorted
+    /// by label and program point, and the ranges are guaranteed to
+    /// be disjoint.
+    pub debug_locations: Vec<(u32, ProgPoint, ProgPoint, Allocation)>,
+
+    /// Internal stats from the allocator.
+    pub stats: ion::Stats,
+}
+
+impl Output {
+    /// Get the allocations assigned to a given instruction.
+    pub fn inst_allocs(&self, inst: Inst) -> &[Allocation] {
+        let start = self.inst_alloc_offsets[inst.index()] as usize;
+        let end = if inst.index() + 1 == self.inst_alloc_offsets.len() {
+            self.allocs.len()
+        } else {
+            self.inst_alloc_offsets[inst.index() + 1] as usize
+        };
+        &self.allocs[start..end]
+    }
+}
+
+/// An error that prevents allocation.
+#[derive(Clone, Debug)]
+pub enum RegAllocError {
+    /// Critical edge is not split between given blocks.
+    CritEdge(Block, Block),
+    /// Invalid SSA for given vreg at given inst: multiple defs or
+    /// illegal use. `inst` may be `Inst::invalid()` if this concerns
+    /// a block param.
+    SSA(VReg, Inst),
+    /// Invalid basic block: does not end in branch/ret, or contains a
+    /// branch/ret in the middle.
+    BB(Block),
+    /// Invalid branch: operand count does not match sum of block
+    /// params of successor blocks.
+    Branch(Inst),
+    /// A VReg is live-in on entry; this is not allowed.
+    EntryLivein,
+    /// A branch has non-blockparam arg(s) and at least one of the
+    /// successor blocks has more than one predecessor, forcing
+    /// edge-moves before this branch. This is disallowed because it
+    /// places a use after the edge moves occur; insert an edge block
+    /// to avoid the situation.
+    DisallowedBranchArg(Inst),
+    /// Too many pinned VRegs + Reg-constrained Operands are live at
+    /// once, making allocation impossible.
+    TooManyLiveRegs,
+}
+
+impl std::fmt::Display for RegAllocError {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+
+impl std::error::Error for RegAllocError {}
+
+/// Run the allocator.
+pub fn run<F: Function>(
+    func: &F,
+    env: &MachineEnv,
+    options: &RegallocOptions,
+) -> Result<Output, RegAllocError> {
+    ion::run(func, env, options.verbose_log)
+}
+
+/// Options for allocation.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct RegallocOptions {
+    /// Add extra verbosity to debug logs.
+    pub verbose_log: bool,
+}
diff --git a/src/moves.rs b/src/moves.rs
new file mode 100644
index 00000000..0bb388ea
--- /dev/null
+++ b/src/moves.rs
@@ -0,0 +1,206 @@
+/*
+ * Released under the terms of the Apache 2.0 license with LLVM
+ * exception. See `LICENSE` for details.
+ */
+
+use crate::Allocation;
+use smallvec::{smallvec, SmallVec};
+
+pub type MoveVec<T> = SmallVec<[(Allocation, Allocation, T); 16]>;
+
+/// A `ParallelMoves` represents a list of alloc-to-alloc moves that
+/// must happen in parallel -- i.e., all reads of sources semantically
+/// happen before all writes of destinations, and destinations are
+/// allowed to overwrite sources. It can compute a list of sequential
+/// moves that will produce the equivalent data movement, possibly
+/// using a scratch register if one is necessary.
+pub struct ParallelMoves<T: Clone + Copy + Default> {
+    parallel_moves: MoveVec<T>,
+    scratch: Allocation,
+}
+
+impl<T: Clone + Copy + Default> ParallelMoves<T> {
+    pub fn new(scratch: Allocation) -> Self {
+        Self {
+            parallel_moves: smallvec![],
+            scratch,
+        }
+    }
+
+    pub fn add(&mut self, from: Allocation, to: Allocation, t: T) {
+        self.parallel_moves.push((from, to, t));
+    }
+
+    fn sources_overlap_dests(&self) -> bool {
+        // Assumes `parallel_moves` has already been sorted in `resolve()` below.
+        for &(_, dst, _) in &self.parallel_moves {
+            if self
+                .parallel_moves
+                .binary_search_by_key(&dst, |&(src, _, _)| src)
+                .is_ok()
+            {
+                return true;
+            }
+        }
+        false
+    }
+
+    pub fn resolve(mut self) -> MoveVec<T> {
+        // Easy case: zero or one move. Just return our vec.
+        if self.parallel_moves.len() <= 1 {
+            return self.parallel_moves;
+        }
+
+        // Sort moves by source so that we can efficiently test for
+        // presence.
+        self.parallel_moves.sort_by_key(|&(src, dst, _)| (src, dst));
+
+        // Do any dests overlap sources? If not, we can also just
+        // return the list.
+        if !self.sources_overlap_dests() {
+            return self.parallel_moves;
+        }
+
+        // General case: some moves overwrite dests that other moves
+        // read as sources. We'll use a general algorithm.
+        //
+        // *Important property*: because we expect that each register
+        // has only one writer (otherwise the effect of the parallel
+        // move is undefined), each move can only block one other move
+        // (with its one source corresponding to the one writer of
+        // that source). Thus, we *can only have simple cycles* (those
+        // that are a ring of nodes, i.e., with only one path from a
+        // node back to itself); there are no SCCs that are more
+        // complex than that. We leverage this fact below to avoid
+        // having to do a full Tarjan SCC DFS (with lowest-index
+        // computation, etc.): instead, as soon as we find a cycle, we
+        // know we have the full cycle and we can do a cyclic move
+        // sequence and continue.
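+
+        // As a usage sketch (with placeholder allocations `alloc_a`,
+        // `alloc_b` and `scratch_alloc`), a caller drives this type as
+        // follows; a swap is the smallest cycle that forces use of the
+        // scratch allocation:
+        //
+        //     let mut par = ParallelMoves::new(scratch_alloc);
+        //     par.add(alloc_a, alloc_b, ()); // B := A
+        //     par.add(alloc_b, alloc_a, ()); // A := B
+        //     let sequence = par.resolve();  // sequential moves via `scratch_alloc`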
+ + // Sort moves by destination and check that each destination + // has only one writer. + self.parallel_moves.sort_by_key(|&(_, dst, _)| dst); + if cfg!(debug_assertions) { + let mut last_dst = None; + for &(_, dst, _) in &self.parallel_moves { + if last_dst.is_some() { + debug_assert!(last_dst.unwrap() != dst); + } + last_dst = Some(dst); + } + } + + // Construct a mapping from move indices to moves they must + // come before. Any given move must come before a move that + // overwrites its destination; we have moves sorted by dest + // above so we can efficiently find such a move, if any. + let mut must_come_before: SmallVec<[Option; 16]> = + smallvec![None; self.parallel_moves.len()]; + for (i, &(src, _, _)) in self.parallel_moves.iter().enumerate() { + if let Ok(move_to_dst_idx) = self + .parallel_moves + .binary_search_by_key(&src, |&(_, dst, _)| dst) + { + must_come_before[i] = Some(move_to_dst_idx); + } + } + + // Do a simple stack-based DFS and emit moves in postorder, + // then reverse at the end for RPO. Unlike Tarjan's SCC + // algorithm, we can emit a cycle as soon as we find one, as + // noted above. + let mut ret: MoveVec = smallvec![]; + let mut stack: SmallVec<[usize; 16]> = smallvec![]; + let mut visited: SmallVec<[bool; 16]> = smallvec![false; self.parallel_moves.len()]; + let mut onstack: SmallVec<[bool; 16]> = smallvec![false; self.parallel_moves.len()]; + + stack.push(0); + onstack[0] = true; + loop { + if stack.is_empty() { + if let Some(next) = visited.iter().position(|&flag| !flag) { + stack.push(next); + onstack[next] = true; + } else { + break; + } + } + + let top = *stack.last().unwrap(); + visited[top] = true; + match must_come_before[top] { + None => { + ret.push(self.parallel_moves[top]); + onstack[top] = false; + stack.pop(); + while let Some(top) = stack.pop() { + ret.push(self.parallel_moves[top]); + onstack[top] = false; + } + } + Some(next) if visited[next] && !onstack[next] => { + ret.push(self.parallel_moves[top]); + onstack[top] = false; + stack.pop(); + while let Some(top) = stack.pop() { + ret.push(self.parallel_moves[top]); + onstack[top] = false; + } + } + Some(next) if !visited[next] && !onstack[next] => { + stack.push(next); + onstack[next] = true; + continue; + } + Some(next) => { + // Found a cycle -- emit a cyclic-move sequence + // for the cycle on the top of stack, then normal + // moves below it. Recall that these moves will be + // reversed in sequence, so from the original + // parallel move set + // + // { B := A, C := B, A := B } + // + // we will generate something like: + // + // A := scratch + // B := A + // C := B + // scratch := C + // + // which will become: + // + // scratch := C + // C := B + // B := A + // A := scratch + let mut last_dst = None; + let mut scratch_src = None; + while let Some(move_idx) = stack.pop() { + onstack[move_idx] = false; + let (mut src, dst, dst_t) = self.parallel_moves[move_idx]; + if last_dst.is_none() { + scratch_src = Some(src); + src = self.scratch; + } else { + assert_eq!(last_dst.unwrap(), src); + } + ret.push((src, dst, dst_t)); + + last_dst = Some(dst); + + if move_idx == next { + break; + } + } + if let Some(src) = scratch_src { + ret.push((src, self.scratch, T::default())); + } + } + } + } + + ret.reverse(); + ret + } +} diff --git a/src/postorder.rs b/src/postorder.rs new file mode 100644 index 00000000..96e9787f --- /dev/null +++ b/src/postorder.rs @@ -0,0 +1,56 @@ +/* + * Released under the terms of the Apache 2.0 license with LLVM + * exception. See `LICENSE` for details. + */ + +//! 
Fast postorder computation. + +use crate::Block; +use smallvec::{smallvec, SmallVec}; + +pub fn calculate<'a, SuccFn: Fn(Block) -> &'a [Block]>( + num_blocks: usize, + entry: Block, + succ_blocks: SuccFn, +) -> Vec { + let mut ret = vec![]; + + // State: visited-block map, and explicit DFS stack. + let mut visited = vec![]; + visited.resize(num_blocks, false); + + struct State<'a> { + block: Block, + succs: &'a [Block], + next_succ: usize, + } + let mut stack: SmallVec<[State; 64]> = smallvec![]; + + visited[entry.index()] = true; + stack.push(State { + block: entry, + succs: succ_blocks(entry), + next_succ: 0, + }); + + while let Some(ref mut state) = stack.last_mut() { + // Perform one action: push to new succ, skip an already-visited succ, or pop. + if state.next_succ < state.succs.len() { + let succ = state.succs[state.next_succ]; + state.next_succ += 1; + if !visited[succ.index()] { + visited[succ.index()] = true; + stack.push(State { + block: succ, + succs: succ_blocks(succ), + next_succ: 0, + }); + } + } else { + ret.push(state.block); + stack.pop(); + } + } + + ret +} diff --git a/src/ssa.rs b/src/ssa.rs new file mode 100644 index 00000000..d8df647d --- /dev/null +++ b/src/ssa.rs @@ -0,0 +1,98 @@ +/* + * Released under the terms of the Apache 2.0 license with LLVM + * exception. See `LICENSE` for details. + */ + +//! SSA-related utilities. + +use crate::cfg::CFGInfo; + +use crate::{Block, Function, Inst, OperandKind, RegAllocError}; + +pub fn validate_ssa(f: &F, cfginfo: &CFGInfo) -> Result<(), RegAllocError> { + // Walk the blocks in arbitrary order. Check, for every use, that + // the def is either in the same block in an earlier inst, or is + // defined (by inst or blockparam) in some other block that + // dominates this one. Also check that for every block param and + // inst def, that this is the only def. + let mut defined = vec![false; f.num_vregs()]; + for block in 0..f.num_blocks() { + let block = Block::new(block); + for blockparam in f.block_params(block) { + if defined[blockparam.vreg()] { + return Err(RegAllocError::SSA(*blockparam, Inst::invalid())); + } + defined[blockparam.vreg()] = true; + } + for iix in f.block_insns(block).iter() { + let operands = f.inst_operands(iix); + for operand in operands { + match operand.kind() { + OperandKind::Use => { + let def_block = if cfginfo.vreg_def_inst[operand.vreg().vreg()].is_valid() { + cfginfo.insn_block[cfginfo.vreg_def_inst[operand.vreg().vreg()].index()] + } else { + cfginfo.vreg_def_blockparam[operand.vreg().vreg()].0 + }; + if def_block.is_invalid() { + return Err(RegAllocError::SSA(operand.vreg(), iix)); + } + if !cfginfo.dominates(def_block, block) { + return Err(RegAllocError::SSA(operand.vreg(), iix)); + } + } + OperandKind::Def => { + if defined[operand.vreg().vreg()] { + return Err(RegAllocError::SSA(operand.vreg(), iix)); + } + defined[operand.vreg().vreg()] = true; + } + OperandKind::Mod => { + // Mod (modify) operands are not used in SSA, + // but can be used by non-SSA code (e.g. with + // the regalloc.rs compatibility shim). + return Err(RegAllocError::SSA(operand.vreg(), iix)); + } + } + } + } + } + + // Check that the length of branch args matches the sum of the + // number of blockparams in their succs, and that the end of every + // block ends in this branch or in a ret, and that there are no + // other branches or rets in the middle of the block. 
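+    //
+    // A concrete sketch (hypothetical block): if a branch's block has
+    // successors S0 and S1, where S0 carries 2 blockparams and S1
+    // carries 1, then the check below requires
+    //
+    //     f.inst_operands(branch).len() == 2 + 1
+    //
+    // i.e., the branch's operands are exactly the outgoing blockparam
+    // arguments for S0 followed by those for S1.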
+ for block in 0..f.num_blocks() { + let block = Block::new(block); + let insns = f.block_insns(block); + for insn in insns.iter() { + if insn == insns.last() { + if !(f.is_branch(insn) || f.is_ret(insn)) { + return Err(RegAllocError::BB(block)); + } + if f.is_branch(insn) { + let expected = f + .block_succs(block) + .iter() + .map(|&succ| f.block_params(succ).len()) + .sum(); + if f.inst_operands(insn).len() != expected { + return Err(RegAllocError::Branch(insn)); + } + } + } else { + if f.is_branch(insn) || f.is_ret(insn) { + return Err(RegAllocError::BB(block)); + } + } + } + } + + // Check that the entry block has no block args: otherwise it is + // undefined what their value would be. + if f.block_params(f.entry_block()).len() > 0 { + return Err(RegAllocError::BB(f.entry_block())); + } + + Ok(()) +} diff --git a/test/Cargo.toml b/test/Cargo.toml new file mode 100644 index 00000000..bfbb291d --- /dev/null +++ b/test/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "regalloc2-test" +version = "0.0.1" +authors = ["Chris Fallin ", "Mozilla SpiderMonkey Developers"] +edition = "2018" +license = "Apache-2.0 WITH LLVM-exception AND MPL-2.0" +description = "small test driver for benchmarking regalloc2" +repository = "https://github.com/bytecodealliance/regalloc2" + +[dependencies] +regalloc2 = { version = "*", path = "../", features = ["fuzzing"] } + +# Keep this in sync with libfuzzer_sys's crate version: +arbitrary = { version = "^0.4.6" } +rand = { version = "0.8" } +rand_chacha = { version = "0.3" } +env_logger = { version = "*" } + +[dev-dependencies] +criterion = "0.3" + +[profile.release] +debug = true + +[[bench]] +name = "regalloc" +harness = false + diff --git a/test/benches/regalloc.rs b/test/benches/regalloc.rs new file mode 100644 index 00000000..85cee8c5 --- /dev/null +++ b/test/benches/regalloc.rs @@ -0,0 +1,56 @@ +//! Criterion-based benchmark target that computes insts/second for +//! arbitrary inputs. + +use arbitrary::{Arbitrary, Unstructured}; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; +use regalloc2::fuzzing::func::{machine_env, Func}; +use regalloc2::ion; +use regalloc2::Function; + +fn create_random_func(seed: u64, size: usize) -> Func { + let mut bytes: Vec = vec![]; + bytes.resize(size, 0); + let mut rng = ChaCha8Rng::seed_from_u64(seed); + rng.fill(&mut bytes[..]); + loop { + let mut u = Unstructured::new(&bytes[..]); + match Func::arbitrary(&mut u) { + Ok(f) => { + return f; + } + Err(arbitrary::Error::NotEnoughData) => { + let len = bytes.len(); + bytes.resize(len + 1024, 0); + rng.fill(&mut bytes[len..]); + } + Err(e) => panic!("unexpected error: {:?}", e), + } + } +} + +fn run_regalloc(c: &mut Criterion) { + const SIZE: usize = 1000 * 1000; + env_logger::init(); + let env = machine_env(); + let mut group = c.benchmark_group("benches"); + for iter in 0..3 { + let func = create_random_func(iter, SIZE); + eprintln!("==== {} instructions", func.insts()); + group.throughput(Throughput::Elements(func.insts() as u64)); + group.bench_with_input(BenchmarkId::from_parameter(iter), &iter, |b, _| { + b.iter(|| { + // For fair comparison with regalloc.rs, which needs + // to clone its Func on every alloc, we clone + // too. Seems to make a few percent difference. 
+ let func = func.clone(); + ion::run(&func, &env).expect("regalloc did not succeed"); + }); + }); + } + group.finish(); +} + +criterion_group!(benches, run_regalloc); +criterion_main!(benches); diff --git a/test/src/main.rs b/test/src/main.rs new file mode 100644 index 00000000..c6fd7792 --- /dev/null +++ b/test/src/main.rs @@ -0,0 +1,50 @@ +/* + * Released under the terms of the Apache 2.0 license with LLVM + * exception. See `LICENSE` for details. + */ + +use arbitrary::{Arbitrary, Unstructured}; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; +use regalloc2::fuzzing::func::{machine_env, Func}; +use regalloc2::ion; +use regalloc2::Function; + +fn create_random_func(seed: u64, size: usize) -> Func { + let mut bytes: Vec = vec![]; + bytes.resize(size, 0); + let mut rng = ChaCha8Rng::seed_from_u64(seed); + rng.fill(&mut bytes[..]); + loop { + let mut u = Unstructured::new(&bytes[..]); + match Func::arbitrary(&mut u) { + Ok(f) => { + return f; + } + Err(arbitrary::Error::NotEnoughData) => { + let len = bytes.len(); + bytes.resize(len + 1024, 0); + rng.fill(&mut bytes[len..]); + } + Err(e) => panic!("unexpected error: {:?}", e), + } + } +} + +fn main() { + const SIZE: usize = 1000 * 1000; + env_logger::init(); + let env = machine_env(); + for iter in 0..3 { + let func = create_random_func(iter, SIZE); + eprintln!("==== {} instructions", func.insts()); + let mut stats: ion::Stats = ion::Stats::default(); + for i in 0..1000 { + let out = ion::run(&func, &env).expect("regalloc did not succeed"); + if i == 0 { + stats = out.stats; + } + } + eprintln!("Stats: {:?}", stats); + } +}
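+
+// A minimal smoke-test sketch reusing the helpers above; the seed and
+// size here are arbitrary illustrative values.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn small_random_func_allocates() {
+        let env = machine_env();
+        let func = create_random_func(42, 10 * 1000);
+        ion::run(&func, &env).expect("regalloc did not succeed");
+    }
+}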