cmd/compile: schedule carry chain arithmetic disjointly

This results in a 1.7-2.4x improvement in native go crypto/elliptic multiplication operations on PPC64, and similar improvements might be possible on other architectures which use flags or similar to represent the carry bit in SSA form. If it is possible, schedule carry chains independently of each other to avoid clobbering the carry flag. This is very expensive. This is done by: 1. Identifying carry bit using, but not creating ops, and lowering their priority below all other ops which do not need to be placed at the top of a block. This effectively ensures only one carry chain will be placed at a time in most important cases (crypto/elliptic/internal/fiat contains most of them). 2. Raising the priority of carry bit generating ops to schedule later in a block to ensure they are placed as soon as they are ready. Likewise, tuple ops which separate carrying ops are scored similar to 2 above. This prevents unrelated ops from being scheduled between carry-dependent operations. This occurs when unrelated ops are ready to schedule alongside such tuple ops. This reduces the chances a flag clobbering op might be placed between two carry-dependent operations. With PPC64 Add64/Sub64 lowering into SSA and this patch, the net performance difference in crypto/elliptic benchmarks on P9/ppc64le are: name old time/op new time/op delta ScalarBaseMult/P256 46.3µs ± 0% 46.9µs ± 0% +1.34% ScalarBaseMult/P224 356µs ± 0% 209µs ± 0% -41.14% ScalarBaseMult/P384 1.20ms ± 0% 0.57ms ± 0% -52.14% ScalarBaseMult/P521 3.38ms ± 0% 1.44ms ± 0% -57.27% ScalarMult/P256 199µs ± 0% 199µs ± 0% -0.17% ScalarMult/P224 357µs ± 0% 212µs ± 0% -40.56% ScalarMult/P384 1.20ms ± 0% 0.58ms ± 0% -51.86% ScalarMult/P521 3.37ms ± 0% 1.44ms ± 0% -57.32% MarshalUnmarshal/P256/Uncompressed 2.59µs ± 0% 2.52µs ± 0% -2.63% MarshalUnmarshal/P256/Compressed 2.58µs ± 0% 2.52µs ± 0% -2.06% MarshalUnmarshal/P224/Uncompressed 1.54µs ± 0% 1.40µs ± 0% -9.42% MarshalUnmarshal/P224/Compressed 1.54µs ± 0% 1.39µs ± 0% -9.87% MarshalUnmarshal/P384/Uncompressed 2.40µs ± 0% 1.80µs ± 0% -24.93% MarshalUnmarshal/P384/Compressed 2.35µs ± 0% 1.81µs ± 0% -23.03% MarshalUnmarshal/P521/Uncompressed 3.79µs ± 0% 2.58µs ± 0% -31.81% MarshalUnmarshal/P521/Compressed 3.80µs ± 0% 2.60µs ± 0% -31.67% Note, P256 uses an asm implementation, thus, little variation is expected. Updates #40171 Change-Id: I810850e8ff429505424c92d6fe37f99aaa0c6e84 Reviewed-on: https://go-review.googlesource.com/c/go/+/393656 Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Paul Murphy <murp@ibm.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Filippo Valsorda <valsorda@google.com>
golang · May 8, 2022 · c386269 · c386269
1 parent 1efe387
commit c386269
Showing 1 changed file with 78 additions and 1 deletion.
diff --git a/src/cmd/compile/internal/ssa/schedule.go b/src/cmd/compile/internal/ssa/schedule.go
@@ -14,6 +14,7 @@ const (
 	ScorePhi = iota // towards top of block
 	ScoreArg
 	ScoreNilCheck
+	ScoreCarryChainTail
 	ScoreReadTuple
 	ScoreVarDef
 	ScoreMemory
@@ -158,7 +159,35 @@ func schedule(f *Func) {
 				// this value is already live. This also removes its
 				// false dependency on the other part of the tuple.
 				// Also ensures tuple is never spilled.
-				score[v.ID] = ScoreReadTuple
+				if (v.Op == OpSelect1 || v.Op == OpSelect0) && v.Args[0].Op.isCarry() {
+					// Score tuple ops of carry ops later to ensure they do not
+					// delay scheduling the tuple-generating op. If such tuple ops
+					// are not placed more readily, unrelated carry clobbering ops
+					// may be placed inbetween two carry-dependent operations.
+					score[v.ID] = ScoreFlags
+				} else {
+					score[v.ID] = ScoreReadTuple
+				}
+			case v.Op.isCarry():
+				if w := v.getCarryProducer(); w != nil {
+					// The producing op is not the final user of the carry bit. Its
+					// current score is one of unscored, Flags, or CarryChainTail.
+					// These occur if the producer has not been scored, another user
+					// of the producers carry flag was scored (there are >1 users of
+					// the carry out flag), or it was visited earlier and already
+					// scored CarryChainTail (and prove w is not a tail).
+					score[w.ID] = ScoreFlags
+				}
+				// Verify v has not been scored. If v has not been visited, v may be the
+				// the final (tail) operation in a carry chain. If v is not, v will be
+				// rescored above when v's carry-using op is scored. When scoring is done,
+				// only tail operations will retain the CarryChainTail score.
+				if score[v.ID] != ScoreFlags {
+					// Score the tail of carry chain operations to a lower (earlier in the
+					// block) priority. This creates a priority inversion which allows only
+					// one chain to be scheduled, if possible.
+					score[v.ID] = ScoreCarryChainTail
+				}
 			case v.Type.IsFlags() || v.Type.IsTuple() && v.Type.FieldType(1).IsFlags():
 				// Schedule flag register generation as late as possible.
 				// This makes sure that we only have one live flags
@@ -263,6 +292,14 @@ func schedule(f *Func) {
 
 			v := heap.Pop(priq).(*Value)
 
+			if f.pass.debug > 1 && score[v.ID] == ScoreCarryChainTail && v.Op.isCarry() {
+				// Add some debugging noise if the chain of carrying ops will not
+				// likely be scheduled without potential carry flag clobbers.
+				if !isCarryChainReady(v, uses) {
+					f.Warnl(v.Pos, "carry chain ending with %v not ready", v)
+				}
+			}
+
 			// Add it to the schedule.
 			// Do not emit tuple-reading ops until we're ready to emit the tuple-generating op.
 			//TODO: maybe remove ReadTuple score above, if it does not help on performance
@@ -519,6 +556,46 @@ func storeOrder(values []*Value, sset *sparseSet, storeNumber []int32) []*Value
 	return order
 }
 
+// Return whether all dependent carry ops can be scheduled after this.
+func isCarryChainReady(v *Value, uses []int32) bool {
+	// A chain can be scheduled in it's entirety if
+	// the use count of each dependent op is 1. If none,
+	// schedule the first.
+	j := 1 // The first op uses[k.ID] == 0. Dependent ops are always >= 1.
+	for k := v; k != nil; k = k.getCarryProducer() {
+		j += int(uses[k.ID]) - 1
+	}
+	return j == 0
+}
+
+// Return whether op is an operation which produces a carry bit value, but does not consume it.
+func (op Op) isCarryCreator() bool {
+	switch op {
+	case OpPPC64SUBC, OpPPC64ADDC, OpPPC64SUBCconst, OpPPC64ADDCconst:
+		return true
+	}
+	return false
+}
+
+// Return whether op consumes or creates a carry a bit value.
+func (op Op) isCarry() bool {
+	switch op {
+	case OpPPC64SUBE, OpPPC64ADDE, OpPPC64SUBZEzero, OpPPC64ADDZEzero:
+		return true
+	}
+	return op.isCarryCreator()
+}
+
+// Return the producing *Value of the carry bit of this op, or nil if none.
+func (v *Value) getCarryProducer() *Value {
+	if v.Op.isCarry() && !v.Op.isCarryCreator() {
+		// PPC64 carry dependencies are conveyed through their final argument.
+		// Likewise, there is always an OpSelect1 between them.
+		return v.Args[len(v.Args)-1].Args[0]
+	}
+	return nil
+}
+
 type bySourcePos []*Value
 
 func (s bySourcePos) Len() int           { return len(s) }