diff --git a/go/test/endtoend/vtgate/main_test.go b/go/test/endtoend/vtgate/main_test.go index dea6fcc5f3a..84c066ef3b9 100644 --- a/go/test/endtoend/vtgate/main_test.go +++ b/go/test/endtoend/vtgate/main_test.go @@ -114,12 +114,28 @@ create table t6_id2_idx( keyspace_id varbinary(50), primary key(id1), key(id2) +) Engine=InnoDB; + +create table t7_xxhash( + uid varchar(50), + phone bigint, + msg varchar(100), + primary key(uid) +) Engine=InnoDB; + +create table t7_xxhash_idx( + phone bigint, + keyspace_id varbinary(50), + primary key(phone, keyspace_id) ) Engine=InnoDB;` VSchema = ` { "sharded": true, "vindexes": { + "unicode_loose_xxhash" : { + "type": "unicode_loose_xxhash" + }, "unicode_loose_md5" : { "type": "unicode_loose_md5" }, @@ -175,6 +191,16 @@ create table t6_id2_idx( "ignore_nulls": "true" }, "owner": "t6" + }, + "t7_xxhash_vdx": { + "type": "consistent_lookup", + "params": { + "table": "t7_xxhash_idx", + "from": "phone", + "to": "keyspace_id", + "ignore_nulls": "true" + }, + "owner": "t7_xxhash" } }, "tables": { @@ -307,6 +333,26 @@ create table t6_id2_idx( "type": "VARCHAR" } ] + }, + "t7_xxhash": { + "column_vindexes": [ + { + "column": "uid", + "name": "unicode_loose_xxhash" + }, + { + "column": "phone", + "name": "t7_xxhash_vdx" + } + ] + }, + "t7_xxhash_idx": { + "column_vindexes": [ + { + "column": "phone", + "name": "unicode_loose_xxhash" + } + ] } } }` diff --git a/go/test/endtoend/vtgate/misc_test.go b/go/test/endtoend/vtgate/misc_test.go index ec48411b6b0..db8fc24e825 100644 --- a/go/test/endtoend/vtgate/misc_test.go +++ b/go/test/endtoend/vtgate/misc_test.go @@ -257,6 +257,25 @@ func TestExplainPassthrough(t *testing.T) { // but we are trying to make the test less fragile } +func TestXXHash(t *testing.T) { + defer cluster.PanicHandler(t) + ctx := context.Background() + conn, err := mysql.Connect(ctx, &vtParams) + require.Nil(t, err) + defer conn.Close() + + exec(t, conn, "insert into t7_xxhash(uid, phone, msg) values('u-1', 1, 'message')") + assertMatches(t, conn, "select uid, phone, msg from t7_xxhash where phone = 1", `[[VARCHAR("u-1") INT64(1) VARCHAR("message")]]`) + assertMatches(t, conn, "select phone, keyspace_id from t7_xxhash_idx", `[[INT64(1) VARBINARY("\x1cU^f\xbfyE^")]]`) + exec(t, conn, "update t7_xxhash set phone = 2 where uid = 'u-1'") + assertMatches(t, conn, "select uid, phone, msg from t7_xxhash where phone = 1", `[]`) + assertMatches(t, conn, "select uid, phone, msg from t7_xxhash where phone = 2", `[[VARCHAR("u-1") INT64(2) VARCHAR("message")]]`) + assertMatches(t, conn, "select phone, keyspace_id from t7_xxhash_idx", `[[INT64(2) VARBINARY("\x1cU^f\xbfyE^")]]`) + exec(t, conn, "delete from t7_xxhash where uid = 'u-1'") + assertMatches(t, conn, "select uid, phone, msg from t7_xxhash where uid = 'u-1'", `[]`) + assertMatches(t, conn, "select phone, keyspace_id from t7_xxhash_idx", `[]`) +} + func assertMatches(t *testing.T, conn *mysql.Conn, query, expected string) { t.Helper() qr := exec(t, conn, query) diff --git a/go/vt/vtgate/vindexes/binarymd5.go b/go/vt/vtgate/vindexes/binarymd5.go index c2327b9ae22..7cd99626919 100644 --- a/go/vt/vtgate/vindexes/binarymd5.go +++ b/go/vt/vtgate/vindexes/binarymd5.go @@ -62,7 +62,7 @@ func (vind *BinaryMD5) NeedsVCursor() bool { func (vind *BinaryMD5) Verify(_ VCursor, ids []sqltypes.Value, ksids [][]byte) ([]bool, error) { out := make([]bool, len(ids)) for i := range ids { - out[i] = bytes.Equal(binHash(ids[i].ToBytes()), ksids[i]) + out[i] = bytes.Equal(vMD5Hash(ids[i].ToBytes()), ksids[i]) } return out, 
nil } @@ -71,12 +71,12 @@ func (vind *BinaryMD5) Verify(_ VCursor, ids []sqltypes.Value, ksids [][]byte) ( func (vind *BinaryMD5) Map(cursor VCursor, ids []sqltypes.Value) ([]key.Destination, error) { out := make([]key.Destination, len(ids)) for i, id := range ids { - out[i] = key.DestinationKeyspaceID(binHash(id.ToBytes())) + out[i] = key.DestinationKeyspaceID(vMD5Hash(id.ToBytes())) } return out, nil } -func binHash(source []byte) []byte { +func vMD5Hash(source []byte) []byte { sum := md5.Sum(source) return sum[:] } diff --git a/go/vt/vtgate/vindexes/binarymd5_test.go b/go/vt/vtgate/vindexes/binarymd5_test.go index 7e214d8862d..7570f19dd52 100644 --- a/go/vt/vtgate/vindexes/binarymd5_test.go +++ b/go/vt/vtgate/vindexes/binarymd5_test.go @@ -17,6 +17,7 @@ limitations under the License. package vindexes import ( + "fmt" "reflect" "testing" @@ -88,3 +89,37 @@ func TestSQLValue(t *testing.T) { t.Errorf("Map(%#v): %#v, want %#v", val, out, want) } } + +func BenchmarkMD5Hash(b *testing.B) { + for _, benchSize := range []struct { + name string + n int + }{ + {"8B", 8}, + {"32B", 32}, + {"64B", 64}, + {"512B", 512}, + {"1KB", 1e3}, + {"4KB", 4e3}, + } { + input := make([]byte, benchSize.n) + for i := range input { + input[i] = byte(i) + } + + name := fmt.Sprintf("md5Hash,direct,bytes,n=%s", benchSize.name) + b.Run(name, func(b *testing.B) { + benchmarkMD5HashBytes(b, input) + }) + + } +} + +var sinkMD5 []byte + +func benchmarkMD5HashBytes(b *testing.B, input []byte) { + b.SetBytes(int64(len(input))) + for i := 0; i < b.N; i++ { + sinkMD5 = vMD5Hash(input) + } +} diff --git a/go/vt/vtgate/vindexes/lookup_unicodeloosemd5_hash.go b/go/vt/vtgate/vindexes/lookup_unicodeloosemd5_hash.go index a3a8401fc00..d3a2c8b00c7 100644 --- a/go/vt/vtgate/vindexes/lookup_unicodeloosemd5_hash.go +++ b/go/vt/vtgate/vindexes/lookup_unicodeloosemd5_hash.go @@ -382,7 +382,7 @@ func (lhu *LookupUnicodeLooseMD5HashUnique) MarshalJSON() ([]byte, error) { } func unicodeHashValue(value sqltypes.Value) (sqltypes.Value, error) { - hash, err := unicodeHash(value) + hash, err := unicodeHash(vMD5Hash, value) if err != nil { return sqltypes.NULL, err } diff --git a/go/vt/vtgate/vindexes/unicode.go b/go/vt/vtgate/vindexes/unicode.go new file mode 100644 index 00000000000..9c274495cbb --- /dev/null +++ b/go/vt/vtgate/vindexes/unicode.go @@ -0,0 +1,89 @@ +/* +Copyright 2020 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package vindexes + +import ( + "bytes" + "fmt" + "sync" + "unicode/utf8" + + "vitess.io/vitess/go/sqltypes" + + "golang.org/x/text/collate" + "golang.org/x/text/language" +) + +// Shared functions for Unicode string normalization +// for Vindexes. 
+ +func unicodeHash(hashFunc func([]byte) []byte, key sqltypes.Value) ([]byte, error) { + collator := collatorPool.Get().(*pooledCollator) + defer collatorPool.Put(collator) + + norm, err := normalize(collator.col, collator.buf, key.ToBytes()) + if err != nil { + return nil, err + } + return hashFunc(norm), nil +} + +func normalize(col *collate.Collator, buf *collate.Buffer, in []byte) ([]byte, error) { + // We cannot pass invalid UTF-8 to the collator. + if !utf8.Valid(in) { + return nil, fmt.Errorf("cannot normalize string containing invalid UTF-8: %q", string(in)) + } + + // Ref: http://dev.mysql.com/doc/refman/5.6/en/char.html. + // Trailing spaces are ignored by MySQL. + in = bytes.TrimRight(in, " ") + + // We use the collation key which can be used to + // perform lexical comparisons. + return col.Key(buf, in), nil +} + +// pooledCollator pairs a Collator and a Buffer. +// These pairs are pooled to avoid reallocating for every request, +// which would otherwise be required because they can't be used concurrently. +// +// Note that you must ensure no active references into the buffer remain +// before you return this pair back to the pool. +// That is, either do your processing on the result first, or make a copy. +type pooledCollator struct { + col *collate.Collator + buf *collate.Buffer +} + +var collatorPool = sync.Pool{New: newPooledCollator} + +func newPooledCollator() interface{} { + // Ref: http://www.unicode.org/reports/tr10/#Introduction. + // Unicode seems to define a universal (or default) order. + // But various locales have conflicting order, + // which they have the right to override. + // Unfortunately, the Go library requires you to specify a locale. + // So, I chose English assuming that it won't override + // the Unicode universal order. But I couldn't find an easy + // way to verify this. + // Also, the locale differences are not an issue for level 1, + // because the conservative comparison makes them all equal. + return &pooledCollator{ + col: collate.New(language.English, collate.Loose), + buf: new(collate.Buffer), + } +} diff --git a/go/vt/vtgate/vindexes/unicode_test.go b/go/vt/vtgate/vindexes/unicode_test.go new file mode 100644 index 00000000000..c79083247e0 --- /dev/null +++ b/go/vt/vtgate/vindexes/unicode_test.go @@ -0,0 +1,138 @@ +/* +Copyright 2020 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package vindexes + +import ( + "strings" + "testing" + "time" +) + +func TestNormalization(t *testing.T) { + tcases := []struct { + in, out string + }{{ + in: "Test", + out: "\x18\x16\x16L\x17\xf3\x18\x16", + }, { + in: "TEST", + out: "\x18\x16\x16L\x17\xf3\x18\x16", + }, { + in: "Te\u0301st", + out: "\x18\x16\x16L\x17\xf3\x18\x16", + }, { + in: "Tést", + out: "\x18\x16\x16L\x17\xf3\x18\x16", + }, { + in: "Bést", + out: "\x16\x05\x16L\x17\xf3\x18\x16", + }, { + in: "Test ", + out: "\x18\x16\x16L\x17\xf3\x18\x16", + }, { + in: " Test", + out: "\x01\t\x18\x16\x16L\x17\xf3\x18\x16", + }, { + in: "Test\t", + out: "\x18\x16\x16L\x17\xf3\x18\x16\x01\x00", + }, { + in: "TéstLooong", + out: "\x18\x16\x16L\x17\xf3\x18\x16\x17\x11\x17q\x17q\x17q\x17O\x16\x91", + }, { + in: "T", + out: "\x18\x16", + }} + collator := newPooledCollator().(*pooledCollator) + for _, tcase := range tcases { + norm, err := normalize(collator.col, collator.buf, []byte(tcase.in)) + if err != nil { + t.Errorf("normalize(%#v) error: %v", tcase.in, err) + } + out := string(norm) + if out != tcase.out { + t.Errorf("normalize(%#v): %#v, want %#v", tcase.in, out, tcase.out) + } + } +} + +func TestInvalidUnicodeNormalization(t *testing.T) { + // These strings are known to contain invalid UTF-8. + inputs := []string{ + "\x99\xeb\x9d\x18\xa4G\x84\x04]\x87\xf3\xc6|\xf2'F", + "D\x86\x15\xbb\xda\b1?j\x8e\xb6h\xd2\v\xf5\x05", + "\x8a[\xdf,\u007fĄE\x92\xd2W+\xcd\x06h\xd2", + } + wantErr := "invalid UTF-8" + collator := newPooledCollator().(*pooledCollator) + + for _, in := range inputs { + // We've observed that infinite looping is a possible failure mode for the + // collator when given invalid UTF-8, so we detect that with a timer. + done := make(chan struct{}) + go func() { + defer close(done) + _, err := normalize(collator.col, collator.buf, []byte(in)) + if err == nil { + t.Errorf("normalize(%q) error = nil, expected error", in) + } + if !strings.Contains(err.Error(), wantErr) { + t.Errorf("normalize(%q) error = %q, want %q", in, err.Error(), wantErr) + } + }() + timer := time.NewTimer(100 * time.Millisecond) + select { + case <-done: + timer.Stop() + case <-timer.C: + t.Errorf("invalid input caused infinite loop: %q", in) + } + } +} + +// BenchmarkNormalizeSafe is the naive case where we create a new collator +// and buffer every time. +func BenchmarkNormalizeSafe(b *testing.B) { + input := []byte("testing") + + for i := 0; i < b.N; i++ { + collator := newPooledCollator().(*pooledCollator) + normalize(collator.col, collator.buf, input) + } +} + +// BenchmarkNormalizeShared is the ideal case where the collator and buffer +// are shared between iterations, assuming no concurrency. +func BenchmarkNormalizeShared(b *testing.B) { + input := []byte("testing") + collator := newPooledCollator().(*pooledCollator) + + for i := 0; i < b.N; i++ { + normalize(collator.col, collator.buf, input) + } +} + +// BenchmarkNormalizePooled should get us close to the performance of +// BenchmarkNormalizeShared, except that this way is safe for concurrent use. 
+func BenchmarkNormalizePooled(b *testing.B) { + input := []byte("testing") + + for i := 0; i < b.N; i++ { + collator := collatorPool.Get().(*pooledCollator) + normalize(collator.col, collator.buf, input) + collatorPool.Put(collator) + } +} diff --git a/go/vt/vtgate/vindexes/unicodeloosemd5.go b/go/vt/vtgate/vindexes/unicodeloosemd5.go index c62cf0ff80c..65fb537f412 100644 --- a/go/vt/vtgate/vindexes/unicodeloosemd5.go +++ b/go/vt/vtgate/vindexes/unicodeloosemd5.go @@ -19,14 +19,9 @@ package vindexes import ( "bytes" "fmt" - "sync" - "unicode/utf8" "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/vt/key" - - "golang.org/x/text/collate" - "golang.org/x/text/language" ) var ( @@ -71,7 +66,7 @@ func (vind *UnicodeLooseMD5) NeedsVCursor() bool { func (vind *UnicodeLooseMD5) Verify(_ VCursor, ids []sqltypes.Value, ksids [][]byte) ([]bool, error) { out := make([]bool, len(ids)) for i := range ids { - data, err := unicodeHash(ids[i]) + data, err := unicodeHash(vMD5Hash, ids[i]) if err != nil { return nil, fmt.Errorf("UnicodeLooseMD5.Verify: %v", err) } @@ -84,7 +79,7 @@ func (vind *UnicodeLooseMD5) Verify(_ VCursor, ids []sqltypes.Value, ksids [][]b func (vind *UnicodeLooseMD5) Map(cursor VCursor, ids []sqltypes.Value) ([]key.Destination, error) { out := make([]key.Destination, 0, len(ids)) for _, id := range ids { - data, err := unicodeHash(id) + data, err := unicodeHash(vMD5Hash, id) if err != nil { return nil, fmt.Errorf("UnicodeLooseMD5.Map: %v", err) } @@ -93,63 +88,6 @@ func (vind *UnicodeLooseMD5) Map(cursor VCursor, ids []sqltypes.Value) ([]key.De return out, nil } -func unicodeHash(key sqltypes.Value) ([]byte, error) { - collator := collatorPool.Get().(*pooledCollator) - defer collatorPool.Put(collator) - - norm, err := normalize(collator.col, collator.buf, key.ToBytes()) - if err != nil { - return nil, err - } - return binHash(norm), nil -} - -func normalize(col *collate.Collator, buf *collate.Buffer, in []byte) ([]byte, error) { - // We cannot pass invalid UTF-8 to the collator. - if !utf8.Valid(in) { - return nil, fmt.Errorf("cannot normalize string containing invalid UTF-8: %q", string(in)) - } - - // Ref: http://dev.mysql.com/doc/refman/5.6/en/char.html. - // Trailing spaces are ignored by MySQL. - in = bytes.TrimRight(in, " ") - - // We use the collation key which can be used to - // perform lexical comparisons. - return col.Key(buf, in), nil -} - -// pooledCollator pairs a Collator and a Buffer. -// These pairs are pooled to avoid reallocating for every request, -// which would otherwise be required because they can't be used concurrently. -// -// Note that you must ensure no active references into the buffer remain -// before you return this pair back to the pool. -// That is, either do your processing on the result first, or make a copy. -type pooledCollator struct { - col *collate.Collator - buf *collate.Buffer -} - -var collatorPool = sync.Pool{New: newPooledCollator} - -func newPooledCollator() interface{} { - // Ref: http://www.unicode.org/reports/tr10/#Introduction. - // Unicode seems to define a universal (or default) order. - // But various locales have conflicting order, - // which they have the right to override. - // Unfortunately, the Go library requires you to specify a locale. - // So, I chose English assuming that it won't override - // the Unicode universal order. But I couldn't find an easy - // way to verify this. - // Also, the locale differences are not an issue for level 1, - // because the conservative comparison makes them all equal. 
- return &pooledCollator{ - col: collate.New(language.English, collate.Loose), - buf: new(collate.Buffer), - } -} - func init() { Register("unicode_loose_md5", NewUnicodeLooseMD5) } diff --git a/go/vt/vtgate/vindexes/unicodeloosemd5_test.go b/go/vt/vtgate/vindexes/unicodeloosemd5_test.go index 04695d2e341..c851f5e6960 100644 --- a/go/vt/vtgate/vindexes/unicodeloosemd5_test.go +++ b/go/vt/vtgate/vindexes/unicodeloosemd5_test.go @@ -18,27 +18,25 @@ package vindexes import ( "reflect" - "strings" "testing" - "time" "github.com/stretchr/testify/assert" "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/vt/key" ) -var charVindex SingleColumn +var charVindexMD5 SingleColumn func init() { vindex, _ := CreateVindex("unicode_loose_md5", "utf8ch", nil) - charVindex = vindex.(SingleColumn) + charVindexMD5 = vindex.(SingleColumn) } func TestUnicodeLooseMD5Info(t *testing.T) { - assert.Equal(t, 1, charVindex.Cost()) - assert.Equal(t, "utf8ch", charVindex.String()) - assert.True(t, charVindex.IsUnique()) - assert.False(t, charVindex.NeedsVCursor()) + assert.Equal(t, 1, charVindexMD5.Cost()) + assert.Equal(t, "utf8ch", charVindexMD5.String()) + assert.True(t, charVindexMD5.IsUnique()) + assert.False(t, charVindexMD5.NeedsVCursor()) } func TestUnicodeLooseMD5Map(t *testing.T) { @@ -76,7 +74,7 @@ func TestUnicodeLooseMD5Map(t *testing.T) { out: "\xac\x0f\x91y\xf5\x1d\xb8\u007f\xe8\xec\xc0\xcf@ʹz", }} for _, tcase := range tcases { - got, err := charVindex.Map(nil, []sqltypes.Value{sqltypes.NewVarBinary(tcase.in)}) + got, err := charVindexMD5.Map(nil, []sqltypes.Value{sqltypes.NewVarBinary(tcase.in)}) if err != nil { t.Error(err) } @@ -90,127 +88,12 @@ func TestUnicodeLooseMD5Map(t *testing.T) { func TestUnicodeLooseMD5Verify(t *testing.T) { ids := []sqltypes.Value{sqltypes.NewVarBinary("Test"), sqltypes.NewVarBinary("TEst"), sqltypes.NewVarBinary("different")} ksids := [][]byte{[]byte("\v^۴\x01\xfdu$96\x90I\x1dd\xf1\xf5"), []byte("\v^۴\x01\xfdu$96\x90I\x1dd\xf1\xf5"), []byte("\v^۴\x01\xfdu$96\x90I\x1dd\xf1\xf5")} - got, err := charVindex.Verify(nil, ids, ksids) + got, err := charVindexMD5.Verify(nil, ids, ksids) if err != nil { t.Fatal(err) } want := []bool{true, true, false} if !reflect.DeepEqual(got, want) { - t.Errorf("binaryMD5.Verify: %v, want %v", got, want) - } -} - -func TestNormalization(t *testing.T) { - tcases := []struct { - in, out string - }{{ - in: "Test", - out: "\x18\x16\x16L\x17\xf3\x18\x16", - }, { - in: "TEST", - out: "\x18\x16\x16L\x17\xf3\x18\x16", - }, { - in: "Te\u0301st", - out: "\x18\x16\x16L\x17\xf3\x18\x16", - }, { - in: "Tést", - out: "\x18\x16\x16L\x17\xf3\x18\x16", - }, { - in: "Bést", - out: "\x16\x05\x16L\x17\xf3\x18\x16", - }, { - in: "Test ", - out: "\x18\x16\x16L\x17\xf3\x18\x16", - }, { - in: " Test", - out: "\x01\t\x18\x16\x16L\x17\xf3\x18\x16", - }, { - in: "Test\t", - out: "\x18\x16\x16L\x17\xf3\x18\x16\x01\x00", - }, { - in: "TéstLooong", - out: "\x18\x16\x16L\x17\xf3\x18\x16\x17\x11\x17q\x17q\x17q\x17O\x16\x91", - }, { - in: "T", - out: "\x18\x16", - }} - collator := newPooledCollator().(*pooledCollator) - for _, tcase := range tcases { - norm, err := normalize(collator.col, collator.buf, []byte(tcase.in)) - if err != nil { - t.Errorf("normalize(%#v) error: %v", tcase.in, err) - } - out := string(norm) - if out != tcase.out { - t.Errorf("normalize(%#v): %#v, want %#v", tcase.in, out, tcase.out) - } - } -} - -func TestInvalidUnicodeNormalization(t *testing.T) { - // These strings are known to contain invalid UTF-8. 
- inputs := []string{ - "\x99\xeb\x9d\x18\xa4G\x84\x04]\x87\xf3\xc6|\xf2'F", - "D\x86\x15\xbb\xda\b1?j\x8e\xb6h\xd2\v\xf5\x05", - "\x8a[\xdf,\u007fĄE\x92\xd2W+\xcd\x06h\xd2", - } - wantErr := "invalid UTF-8" - collator := newPooledCollator().(*pooledCollator) - - for _, in := range inputs { - // We've observed that infinite looping is a possible failure mode for the - // collator when given invalid UTF-8, so we detect that with a timer. - done := make(chan struct{}) - go func() { - defer close(done) - _, err := normalize(collator.col, collator.buf, []byte(in)) - if err == nil { - t.Errorf("normalize(%q) error = nil, expected error", in) - } - if !strings.Contains(err.Error(), wantErr) { - t.Errorf("normalize(%q) error = %q, want %q", in, err.Error(), wantErr) - } - }() - timer := time.NewTimer(100 * time.Millisecond) - select { - case <-done: - timer.Stop() - case <-timer.C: - t.Errorf("invalid input caused infinite loop: %q", in) - } - } -} - -// BenchmarkNormalizeSafe is the naive case where we create a new collator -// and buffer every time. -func BenchmarkNormalizeSafe(b *testing.B) { - input := []byte("testing") - - for i := 0; i < b.N; i++ { - collator := newPooledCollator().(*pooledCollator) - normalize(collator.col, collator.buf, input) - } -} - -// BenchmarkNormalizeShared is the ideal case where the collator and buffer -// are shared between iterations, assuming no concurrency. -func BenchmarkNormalizeShared(b *testing.B) { - input := []byte("testing") - collator := newPooledCollator().(*pooledCollator) - - for i := 0; i < b.N; i++ { - normalize(collator.col, collator.buf, input) - } -} - -// BenchmarkNormalizePooled should get us close to the performance of -// BenchmarkNormalizeShared, except that this way is safe for concurrent use. -func BenchmarkNormalizePooled(b *testing.B) { - input := []byte("testing") - - for i := 0; i < b.N; i++ { - collator := collatorPool.Get().(*pooledCollator) - normalize(collator.col, collator.buf, input) - collatorPool.Put(collator) + t.Errorf("UnicodeLooseMD5.Verify: %v, want %v", got, want) } } diff --git a/go/vt/vtgate/vindexes/unicodeloosexxhash.go b/go/vt/vtgate/vindexes/unicodeloosexxhash.go new file mode 100644 index 00000000000..ffd630a2c56 --- /dev/null +++ b/go/vt/vtgate/vindexes/unicodeloosexxhash.go @@ -0,0 +1,93 @@ +/* +Copyright 2020 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package vindexes + +import ( + "bytes" + "fmt" + + "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/vt/key" +) + +var ( + _ SingleColumn = (*UnicodeLooseXXHash)(nil) +) + +// UnicodeLooseXXHash is a vindex that normalizes and hashes unicode strings +// to a keyspace id. It conservatively converts the string to its base +// characters before hashing. This is also known as UCA level 1. +// Ref: http://www.unicode.org/reports/tr10/#Multi_Level_Comparison. +// This is compatible with MySQL's utf8_unicode_ci collation. +type UnicodeLooseXXHash struct { + name string +} + +// NewUnicodeLooseXXHash creates a new UnicodeLooseXXHash struct. 
+func NewUnicodeLooseXXHash(name string, _ map[string]string) (Vindex, error) { + return &UnicodeLooseXXHash{name: name}, nil +} + +// String returns the name of the vindex. +func (vind *UnicodeLooseXXHash) String() string { + return vind.name +} + +// Cost returns the cost as 1. +func (vind *UnicodeLooseXXHash) Cost() int { + return 1 +} + +// IsUnique returns true since the Vindex is unique. +func (vind *UnicodeLooseXXHash) IsUnique() bool { + return true +} + +// NeedsVCursor satisfies the Vindex interface. +func (vind *UnicodeLooseXXHash) NeedsVCursor() bool { + return false +} + +// Verify returns true if ids maps to ksids. +func (vind *UnicodeLooseXXHash) Verify(_ VCursor, ids []sqltypes.Value, ksids [][]byte) ([]bool, error) { + out := make([]bool, len(ids)) + for i := range ids { + data, err := unicodeHash(vXXHash, ids[i]) + if err != nil { + return nil, fmt.Errorf("UnicodeLooseXXHash.Verify: %v", err) + } + out[i] = bytes.Equal(data, ksids[i]) + } + return out, nil +} + +// Map can map ids to key.Destination objects. +func (vind *UnicodeLooseXXHash) Map(cursor VCursor, ids []sqltypes.Value) ([]key.Destination, error) { + out := make([]key.Destination, 0, len(ids)) + for _, id := range ids { + data, err := unicodeHash(vXXHash, id) + if err != nil { + return nil, fmt.Errorf("UnicodeLooseXXHash.Map: %v", err) + } + out = append(out, key.DestinationKeyspaceID(data)) + } + return out, nil +} + +func init() { + Register("unicode_loose_xxhash", NewUnicodeLooseXXHash) +} diff --git a/go/vt/vtgate/vindexes/unicodeloosexxhash_test.go b/go/vt/vtgate/vindexes/unicodeloosexxhash_test.go new file mode 100644 index 00000000000..f8d132029cc --- /dev/null +++ b/go/vt/vtgate/vindexes/unicodeloosexxhash_test.go @@ -0,0 +1,99 @@ +/* +Copyright 2020 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package vindexes + +import ( + "reflect" + "testing" + + "github.com/stretchr/testify/assert" + "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/vt/key" +) + +var charVindexXXHash SingleColumn + +func init() { + vindex, _ := CreateVindex("unicode_loose_xxhash", "utf8ch", nil) + charVindexXXHash = vindex.(SingleColumn) +} + +func TestUnicodeLooseXXHashInfo(t *testing.T) { + assert.Equal(t, 1, charVindexXXHash.Cost()) + assert.Equal(t, "utf8ch", charVindexXXHash.String()) + assert.True(t, charVindexXXHash.IsUnique()) + assert.False(t, charVindexXXHash.NeedsVCursor()) +} + +func TestUnicodeLooseXXHashMap(t *testing.T) { + tcases := []struct { + in, out string + }{{ + in: "Test", + out: "B\xd2\x13a\bzL\a", + }, { + in: "TEst", + out: "B\xd2\x13a\bzL\a", + }, { + in: "Te\u0301st", + out: "B\xd2\x13a\bzL\a", + }, { + in: "Tést", + out: "B\xd2\x13a\bzL\a", + }, { + in: "Bést", + out: "\x92iu\xb9\xce.\xc3\x16", + }, { + in: "Test ", + out: "B\xd2\x13a\bzL\a", + }, { + in: " Test", + out: "Oˋ\xe3N\xc0Wu", + }, { + in: "Test\t", + out: " \xaf\x87\xfc6\xe3\xfdQ", + }, { + in: "TéstLooong", + out: "\xd3\xea\x879B\xb4\x84\xa7", + }, { + in: "T", + out: "\xf8\x1c;\xe2\xd5\x01\xfe\x18", + }} + for _, tcase := range tcases { + got, err := charVindexXXHash.Map(nil, []sqltypes.Value{sqltypes.NewVarBinary(tcase.in)}) + if err != nil { + t.Error(err) + } + out := string(got[0].(key.DestinationKeyspaceID)) + if out != tcase.out { + t.Errorf("Map(%#v): %#v, want %#v", tcase.in, out, tcase.out) + } + } +} + +func TestUnicodeLooseXXHashVerify(t *testing.T) { + ids := []sqltypes.Value{sqltypes.NewVarBinary("Test"), sqltypes.NewVarBinary("TEst"), sqltypes.NewVarBinary("different")} + ksids := [][]byte{[]byte("B\xd2\x13a\bzL\a"), []byte("B\xd2\x13a\bzL\a"), []byte(" \xaf\x87\xfc6\xe3\xfdQ")} + got, err := charVindexXXHash.Verify(nil, ids, ksids) + if err != nil { + t.Fatal(err) + } + want := []bool{true, true, false} + if !reflect.DeepEqual(got, want) { + t.Errorf("UnicodeLooseXXHash.Verify: %v, want %v", got, want) + } +} diff --git a/go/vt/vtgate/vindexes/xxhash_test.go b/go/vt/vtgate/vindexes/xxhash_test.go index 049ce56d862..1586f9a4d89 100644 --- a/go/vt/vtgate/vindexes/xxhash_test.go +++ b/go/vt/vtgate/vindexes/xxhash_test.go @@ -22,7 +22,6 @@ import ( "reflect" "testing" - "github.com/cespare/xxhash/v2" "github.com/stretchr/testify/assert" "vitess.io/vitess/go/sqltypes" @@ -113,6 +112,7 @@ func BenchmarkXXHash(b *testing.B) { n int }{ {"8B", 8}, + {"32B", 32}, {"64B", 64}, {"512B", 512}, {"1KB", 1e3}, @@ -125,17 +125,17 @@ func BenchmarkXXHash(b *testing.B) { name := fmt.Sprintf("xxHash,direct,bytes,n=%s", benchSize.name) b.Run(name, func(b *testing.B) { - benchmarkHashBytes(b, input) + benchmarkXXHashBytes(b, input) }) } } -var sink uint64 +var sinkXXHash []byte -func benchmarkHashBytes(b *testing.B, input []byte) { +func benchmarkXXHashBytes(b *testing.B, input []byte) { b.SetBytes(int64(len(input))) for i := 0; i < b.N; i++ { - sink = xxhash.Sum64(input) + sinkXXHash = vXXHash(input) } }
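
Note on how the new pieces fit together: UnicodeLooseXXHash reuses the collation-based normalization that unicode.go now shares with UnicodeLooseMD5; only the final hash function differs (vXXHash instead of vMD5Hash). The standalone sketch below, which is illustrative commentary and not part of the patch, shows the property the tests above rely on: case and accent variants collapse to the same collation key before hashing. It calls xxhash.Sum64 directly and encodes the 64-bit sum little-endian as a stand-in for the vXXHash helper, whose definition lives in xxhash.go and is not included in this diff, so treat the exact keyspace-id bytes as an assumption.

package main

import (
	"encoding/binary"
	"fmt"

	"github.com/cespare/xxhash/v2"
	"golang.org/x/text/collate"
	"golang.org/x/text/language"
)

func main() {
	// Same collator settings as newPooledCollator: UCA level 1 ("loose")
	// comparison with the English locale.
	col := collate.New(language.English, collate.Loose)
	buf := new(collate.Buffer)

	for _, s := range []string{"Test", "TEST", "Tést"} {
		// The collation key is what normalize() returns (none of these
		// inputs has trailing spaces to trim); all three produce the same
		// key, matching the TestNormalization cases above.
		key := col.Key(buf, []byte(s))

		// Stand-in for vXXHash (defined in xxhash.go, outside this diff):
		// hash the key and encode the 64-bit sum as 8 bytes. The byte order
		// used by the real helper is an assumption here.
		var ksid [8]byte
		binary.LittleEndian.PutUint64(ksid[:], xxhash.Sum64(key))

		fmt.Printf("%q -> key %x -> ksid %x\n", s, key, ksid)

		// Release the buffer storage only after we are done with key,
		// mirroring the "no active references" rule on pooledCollator.
		buf.Reset()
	}
}

The printed keyspace ids will only match the TestUnicodeLooseXXHashMap expectations if the byte-order assumption matches the real vXXHash; the collation keys themselves should match the TestNormalization values regardless.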