vitessio · harshit-gangal · Aug 23, 2020 · Aug 11, 2020 · Aug 20, 2020 · Aug 21, 2020
diff --git a/go/test/endtoend/vtgate/main_test.go b/go/test/endtoend/vtgate/main_test.go
@@ -114,12 +114,28 @@ create table t6_id2_idx(
 	keyspace_id varbinary(50),
 	primary key(id1),
 	key(id2)
+) Engine=InnoDB;
+
+create table t7_xxhash(
+	uid varchar(50),
+	phone bigint,
+    msg varchar(100),
+    primary key(uid)
+) Engine=InnoDB;
+
+create table t7_xxhash_idx(
+	phone bigint,
+	keyspace_id varbinary(50),
+	primary key(phone, keyspace_id)
 ) Engine=InnoDB;`
 
 	VSchema = `
 {
   "sharded": true,
   "vindexes": {
+    "unicode_loose_xxhash" : {
+	  "type": "unicode_loose_xxhash"
+    },
     "unicode_loose_md5" : {
 	  "type": "unicode_loose_md5"
     },
@@ -175,6 +191,16 @@ create table t6_id2_idx(
         "ignore_nulls": "true"
       },
       "owner": "t6"
+    },
+    "t7_xxhash_vdx": {
+      "type": "consistent_lookup",
+      "params": {
+        "table": "t7_xxhash_idx",
+        "from": "phone",
+        "to": "keyspace_id",
+        "ignore_nulls": "true"
+      },
+      "owner": "t7_xxhash"
     }
   },
   "tables": {
@@ -307,6 +333,26 @@ create table t6_id2_idx(
           "type": "VARCHAR"
         }
       ]
+    },
+	"t7_xxhash": {
+      "column_vindexes": [
+        {
+          "column": "uid",
+          "name": "unicode_loose_xxhash"
+        },
+        {
+          "column": "phone",
+          "name": "t7_xxhash_vdx"
+        }
+      ]
+    },
+    "t7_xxhash_idx": {
+      "column_vindexes": [
+        {
+          "column": "phone",
+          "name": "unicode_loose_xxhash"
+        }
+      ]
     }
   }
 }`

diff --git a/go/test/endtoend/vtgate/misc_test.go b/go/test/endtoend/vtgate/misc_test.go
@@ -257,6 +257,25 @@ func TestExplainPassthrough(t *testing.T) {
 	// but we are trying to make the test less fragile
 }
 
+// TestXXHash exercises t7_xxhash, a table whose primary vindex is
+// unicode_loose_xxhash and which owns the consistent lookup vindex
+// t7_xxhash_vdx. Insert, update of the lookup column and delete must each be
+// reflected in the backing lookup table t7_xxhash_idx.
+func TestXXHash(t *testing.T) {
+	defer cluster.PanicHandler(t)
+	ctx := context.Background()
+	conn, err := mysql.Connect(ctx, &vtParams)
+	// NoError includes the error text in the failure output, unlike Nil.
+	require.NoError(t, err)
+	defer conn.Close()
+
+	// Insert: the row is reachable through the lookup column and the lookup
+	// table records the row's keyspace ID.
+	exec(t, conn, "insert into t7_xxhash(uid, phone, msg) values('u-1', 1, 'message')")
+	assertMatches(t, conn, "select uid, phone, msg from t7_xxhash where phone = 1", `[[VARCHAR("u-1") INT64(1) VARCHAR("message")]]`)
+	assertMatches(t, conn, "select phone, keyspace_id from t7_xxhash_idx", `[[INT64(1) VARBINARY("\x1cU^f\xbfyE^")]]`)
+
+	// Update of the lookup column: the old phone value no longer resolves, the
+	// new one does, and the keyspace ID stays the same because uid is unchanged.
+	exec(t, conn, "update t7_xxhash set phone = 2 where uid = 'u-1'")
+	assertMatches(t, conn, "select uid, phone, msg from t7_xxhash where phone = 1", `[]`)
+	assertMatches(t, conn, "select uid, phone, msg from t7_xxhash where phone = 2", `[[VARCHAR("u-1") INT64(2) VARCHAR("message")]]`)
+	assertMatches(t, conn, "select phone, keyspace_id from t7_xxhash_idx", `[[INT64(2) VARBINARY("\x1cU^f\xbfyE^")]]`)
+
+	// Delete: both the row and its lookup entry must be gone.
+	exec(t, conn, "delete from t7_xxhash where uid = 'u-1'")
+	assertMatches(t, conn, "select uid, phone, msg from t7_xxhash where uid = 'u-1'", `[]`)
+	assertMatches(t, conn, "select phone, keyspace_id from t7_xxhash_idx", `[]`)
+}
+
 func assertMatches(t *testing.T, conn *mysql.Conn, query, expected string) {
 	t.Helper()
 	qr := exec(t, conn, query)

diff --git a/go/vt/vtgate/vindexes/binarymd5.go b/go/vt/vtgate/vindexes/binarymd5.go
@@ -62,7 +62,7 @@ func (vind *BinaryMD5) NeedsVCursor() bool {
 func (vind *BinaryMD5) Verify(_ VCursor, ids []sqltypes.Value, ksids [][]byte) ([]bool, error) {
 	out := make([]bool, len(ids))
 	for i := range ids {
-		out[i] = bytes.Equal(binHash(ids[i].ToBytes()), ksids[i])
+		// ids[i] verifies against ksids[i] only when its MD5 digest is byte-for-byte equal.
+		out[i] = bytes.Equal(vMD5Hash(ids[i].ToBytes()), ksids[i])
 	}
 	return out, nil
 }
@@ -71,12 +71,12 @@ func (vind *BinaryMD5) Verify(_ VCursor, ids []sqltypes.Value, ksids [][]byte) (
 func (vind *BinaryMD5) Map(cursor VCursor, ids []sqltypes.Value) ([]key.Destination, error) {
 	out := make([]key.Destination, len(ids))
 	for i, id := range ids {
-		out[i] = key.DestinationKeyspaceID(binHash(id.ToBytes()))
+		// The keyspace ID of an id is the MD5 digest of its raw bytes.
+		out[i] = key.DestinationKeyspaceID(vMD5Hash(id.ToBytes()))
 	}
 	return out, nil
 }
 
+// vMD5Hash returns the MD5 digest of source as a byte slice.
-func binHash(source []byte) []byte {
+func vMD5Hash(source []byte) []byte {
 	sum := md5.Sum(source)
 	return sum[:]
 }

diff --git a/go/vt/vtgate/vindexes/binarymd5_test.go b/go/vt/vtgate/vindexes/binarymd5_test.go
@@ -17,6 +17,7 @@ limitations under the License.
 package vindexes
 
 import (
+	"fmt"
 	"reflect"
 	"testing"
 
@@ -88,3 +89,37 @@ func TestSQLValue(t *testing.T) {
 		t.Errorf("Map(%#v): %#v, want %#v", val, out, want)
 	}
 }
+
+// BenchmarkMD5Hash runs one sub-benchmark of vMD5Hash per input size, from
+// 8 bytes up to 4000 bytes.
+func BenchmarkMD5Hash(b *testing.B) {
+	type sizeCase struct {
+		name string
+		n    int
+	}
+	cases := []sizeCase{
+		{"8B", 8},
+		{"32B", 32},
+		{"64B", 64},
+		{"512B", 512},
+		{"1KB", 1e3},
+		{"4KB", 4e3},
+	}
+
+	for _, tc := range cases {
+		// Deterministic payload: position i holds the low byte of i.
+		input := make([]byte, tc.n)
+		for i := 0; i < len(input); i++ {
+			input[i] = byte(i)
+		}
+
+		b.Run(fmt.Sprintf("md5Hash,direct,bytes,n=%s", tc.name), func(b *testing.B) {
+			benchmarkMD5HashBytes(b, input)
+		})
+	}
+}
+
+// sinkMD5 receives every digest produced by the benchmark loop so that the
+// result of each vMD5Hash call stays live.
+var sinkMD5 []byte
+
+// benchmarkMD5HashBytes hashes input b.N times with vMD5Hash and registers
+// len(input) as the number of bytes processed per iteration.
+func benchmarkMD5HashBytes(b *testing.B, input []byte) {
+	size := int64(len(input))
+	b.SetBytes(size)
+	for remaining := b.N; remaining > 0; remaining-- {
+		sinkMD5 = vMD5Hash(input)
+	}
+}
diff --git a/go/vt/vtgate/vindexes/lookup_unicodeloosemd5_hash.go b/go/vt/vtgate/vindexes/lookup_unicodeloosemd5_hash.go
@@ -382,7 +382,7 @@ func (lhu *LookupUnicodeLooseMD5HashUnique) MarshalJSON() ([]byte, error) {
 }
 
 func unicodeHashValue(value sqltypes.Value) (sqltypes.Value, error) {
-	hash, err := unicodeHash(value)
+	hash, err := unicodeHash(vMD5Hash, value)
 	if err != nil {
 		return sqltypes.NULL, err
 	}

diff --git a/go/vt/vtgate/vindexes/unicode.go b/go/vt/vtgate/vindexes/unicode.go
@@ -0,0 +1,89 @@
+/*
+Copyright 2020 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vindexes
+
+import (
+	"bytes"
+	"fmt"
+	"sync"
+	"unicode/utf8"
+
+	"vitess.io/vitess/go/sqltypes"
+
+	"golang.org/x/text/collate"
+	"golang.org/x/text/language"
+)
+
+// Shared functions for Unicode string normalization
+// for Vindexes.
+
+// unicodeHash normalizes key into its collation key and returns hashFunc
+// applied to that key. The collator and buffer come from collatorPool and are
+// returned to it when the function exits.
+//
+// hashFunc must not retain its argument: the slice it receives points into
+// the pooled buffer. An error is returned when key is not valid UTF-8.
+func unicodeHash(hashFunc func([]byte) []byte, key sqltypes.Value) ([]byte, error) {
+	collator := collatorPool.Get().(*pooledCollator)
+	defer collatorPool.Put(collator)
+
+	// Collator.Key appends each generated key to the buffer. Reset it so a
+	// reused buffer does not keep growing with keys from earlier calls.
+	collator.buf.Reset()
+
+	norm, err := normalize(collator.col, collator.buf, key.ToBytes())
+	if err != nil {
+		return nil, err
+	}
+	// norm aliases the pooled buffer, so hash it before the deferred Put
+	// makes the pair available to another caller.
+	return hashFunc(norm), nil
+}
+
+// normalize returns the collation key of in, computed by col with buf as its
+// key buffer. Trailing spaces are removed before the key is computed, and
+// input that is not valid UTF-8 is rejected with an error.
+func normalize(col *collate.Collator, buf *collate.Buffer, in []byte) ([]byte, error) {
+	// The collator must never be handed invalid UTF-8.
+	if !utf8.Valid(in) {
+		return nil, fmt.Errorf("cannot normalize string containing invalid UTF-8: %q", string(in))
+	}
+
+	// MySQL ignores trailing spaces, so strip them before building the key.
+	// Ref: http://dev.mysql.com/doc/refman/5.6/en/char.html.
+	trimmed := bytes.TrimRight(in, " ")
+
+	// The collation key is the form that supports lexical comparison.
+	collationKey := col.Key(buf, trimmed)
+	return collationKey, nil
+}
+
+// pooledCollator pairs a Collator and a Buffer.
+// These pairs are pooled to avoid reallocating for every request,
+// which would otherwise be required because they can't be used concurrently.
+//
+// Note that you must ensure no active references into the buffer remain
+// before you return this pair back to the pool.
+// That is, either do your processing on the result first, or make a copy.
+type pooledCollator struct {
+	// col computes collation keys for normalization.
+	col *collate.Collator
+	// buf is the key buffer handed to col alongside the input.
+	buf *collate.Buffer
+}
+
+// collatorPool hands out reusable collator/buffer pairs and builds a new one
+// with newPooledCollator whenever it has none available.
+var collatorPool = sync.Pool{New: newPooledCollator}
+
+// newPooledCollator builds a fresh pair for collatorPool: a Loose English
+// collator together with an empty key buffer.
+func newPooledCollator() interface{} {
+	// Ref: http://www.unicode.org/reports/tr10/#Introduction.
+	// Unicode appears to define a universal (default) ordering, which
+	// individual locales are entitled to override, sometimes in
+	// conflicting ways. The Go library insists on a locale, so English
+	// was picked on the assumption that it leaves the universal order
+	// untouched; no easy way was found to verify that.
+	// At level 1 the locale differences are not an issue anyway, because
+	// the conservative comparison makes them all equal.
+	pair := new(pooledCollator)
+	pair.col = collate.New(language.English, collate.Loose)
+	pair.buf = new(collate.Buffer)
+	return pair
+}