tinyspeck · ajm188 · Oct 4, 2021 · Jun 10, 2021 · May 18, 2021 · Sep 24, 2021
diff --git a/go/vt/key/key.go b/go/vt/key/key.go
@@ -20,6 +20,7 @@ import (
  "bytes"
  "encoding/binary"
  "encoding/hex"
+ "errors"
  "fmt"
  "math"
  "regexp"
@@ -191,8 +192,30 @@ func KeyRangeEqual(left, right *topodatapb.KeyRange) bool {
  if right == nil {
  return len(left.Start) == 0 && len(left.End) == 0
  }
- return bytes.Equal(left.Start, right.Start) &&
- bytes.Equal(left.End, right.End)
+ return bytes.Equal(addPadding(left.Start), addPadding(right.Start)) &&
+ bytes.Equal(addPadding(left.End), addPadding(right.End))
+}
+
+// addPadding adds padding to make sure keyrange represents an 8 byte integer.
+// From Vitess docs:
+// A hash vindex produces an 8-byte number.
+// This means that all numbers less than 0x8000000000000000 will fall in shard -80.
+// Any number with the highest bit set will be >= 0x8000000000000000, and will therefore
+// belong to shard 80-.
+// This means that from a keyrange perspective -80 == 00-80 == 0000-8000 == 000000-800000
+// If we don't add this padding, we could run into issues when transitioning from keyranges
+// that use 2 bytes to 4 bytes.
+func addPadding(kr []byte) []byte {
+ paddedKr := make([]byte, 8)
+
+ for i := 0; i < len(kr); i++ {
+ paddedKr = append(paddedKr, kr[i])
+ }
+
+ for i := len(kr); i < 8; i++ {
+ paddedKr = append(paddedKr, 0)
+ }
+ return paddedKr
 }
 
 // KeyRangeStartSmaller returns true if right's keyrange start is _after_ left's start
@@ -214,7 +237,19 @@ func KeyRangeStartEqual(left, right *topodatapb.KeyRange) bool {
  if right == nil {
  return len(left.Start) == 0
  }
- return bytes.Equal(left.Start, right.Start)
+ return bytes.Equal(addPadding(left.Start), addPadding(right.Start))
+}
+
+// KeyRangeContiguous returns true if the end of the left key range exactly
+// matches the start of the right key range (i.e they are contigious)
+func KeyRangeContiguous(left, right *topodatapb.KeyRange) bool {
+ if left == nil {
+ return right == nil || (len(right.Start) == 0 && len(right.End) == 0)
+ }
+ if right == nil {
+ return len(left.Start) == 0 && len(left.End) == 0
+ }
+ return bytes.Equal(addPadding(left.End), addPadding(right.Start))
 }
 
 // KeyRangeEndEqual returns true if both key ranges have the same end
@@ -225,7 +260,7 @@ func KeyRangeEndEqual(left, right *topodatapb.KeyRange) bool {
  if right == nil {
  return len(left.End) == 0
  }
- return bytes.Equal(left.End, right.End)
+ return bytes.Equal(addPadding(left.End), addPadding(right.End))
 }
 
 // For more info on the following functions, see:
@@ -346,3 +381,74 @@ var krRegexp = regexp.MustCompile(`^[0-9a-fA-F]*-[0-9a-fA-F]*$`)
 func IsKeyRange(kr string) bool {
  return krRegexp.MatchString(kr)
 }
+
+// GenerateShardRanges returns shard ranges assuming a keyspace with N shards.
+func GenerateShardRanges(shards int) ([]string, error) {
+ var format string
+ var maxShards int
+
+ switch {
+ case shards <= 0:
+ return nil, errors.New("shards must be greater than zero")
+ case shards <= 256:
+ format = "%02x"
+ maxShards = 256
+ case shards <= 65536:
+ format = "%04x"
+ maxShards = 65536
+ default:
+ return nil, errors.New("this function does not support more than 65336 shards in a single keyspace")
+ }
+
+ rangeFormatter := func(start, end int) string {
+ var (
+ startKid string
+ endKid string
+ )
+
+ if start != 0 {
+ startKid = fmt.Sprintf(format, start)
+ }
+
+ if end != maxShards {
+ endKid = fmt.Sprintf(format, end)
+ }
+
+ return fmt.Sprintf("%s-%s", startKid, endKid)
+ }
+
+ start := 0
+ end := 0
+
+ // If shards does not divide evenly into maxShards, then there is some lossiness,
+ // where each shard is smaller than it should technically be (if, for example, size == 25.6).
+ // If we choose to keep everything in ints, then we have two choices:
+ // - Have every shard in #numshards be a uniform size, tack on an additional shard
+ // at the end of the range to account for the loss. This is bad because if you ask for
+ // 7 shards, you'll actually get 7 uniform shards with 1 small shard, for 8 total shards.
+ // It's also bad because one shard will have much different data distribution than the rest.
+ // - Expand the final shard to include whatever is left in the keyrange. This will give the
+ // correct number of shards, which is good, but depending on how lossy each individual shard is,
+ // you could end with that final shard being significantly larger than the rest of the shards,
+ // so this doesn't solve the data distribution problem.
+ //
+ // By tracking the "real" end (both in the real number sense, and in the truthfulness of the value sense),
+ // we can re-truncate the integer end on each iteration, which spreads the lossiness more
+ // evenly across the shards.
+ //
+ // This implementation has no impact on shard numbers that are powers of 2, even at large numbers,
+ // which you can see in the tests.
+ size := float64(maxShards) / float64(shards)
+ realEnd := float64(0)
+ shardRanges := make([]string, 0, shards)
+
+ for i := 1; i <= shards; i++ {
+ realEnd = float64(i) * size
+
+ end = int(realEnd)
+ shardRanges = append(shardRanges, rangeFormatter(start, end))
+ start = end
+ }
+
+ return shardRanges, nil
+}