Improve hash index #2887

Merged 25 commits on Jan 16, 2019
Changes from 8 commits

Commits (25)
d05f253 saving state (Jan 5, 2019)
0043d6c added new fingerprint func using BLAKE2b (Jan 8, 2019)
437ed96 renamed function to Hash256 for clarity. (Jan 8, 2019)
5617e04 replaced 64 fingerprint hash with Hash256 (Jan 8, 2019)
89eeabb pickTokenizer use hash tokenizer when list is lossy. (Jan 10, 2019)
4750b64 added tokenizer identifier list for enforcing tokenizer. (Jan 10, 2019)
8433ccc compare func using hash index if available and eq won't compare values (Jan 10, 2019)
7682c55 fixed minor comment glitches (Jan 10, 2019)
47fee98 use tokenizer identifier consts, change hash to non-lossy. (Jan 10, 2019)
4cf914b using non-lossy hash so no need for extra logic in handleCompareFunction (Jan 10, 2019)
990c9bc simplify pickTokenizer and (Jan 10, 2019)
2af6293 simplify pickTokenizer (Jan 10, 2019)
c9fa41a using tokenizer id (Jan 10, 2019)
c46b520 added id value for custom tokenizers, IdentCustom (Jan 10, 2019)
a6d461c using tokenizer ids when possible (Jan 10, 2019)
be0064a Merge branch 'master' of github.com:/dgraph-io/dgraph into srfrog/iss… (Jan 11, 2019)
2d51a3d added hash index tests (Jan 11, 2019)
16e8a9f Manish's review. Fixed a new bug introduced by this PR during IdentCu… (manishrjain, Jan 14, 2019)
0f164a2 Remove Long term for exact index warning. (manishrjain, Jan 14, 2019)
3db5ba5 fixed logic (Jan 14, 2019)
5658ded pickTokenizer return error when comparison func doesn't have non-loss… (Jan 14, 2019)
8b914de added warning for eq comparison without non-lossy tokenizer (Jan 14, 2019)
41a3d4d re-fixed this slippery lil bug (Jan 15, 2019)
3bf1b92 removed extra glog (Jan 16, 2019)
91f24de Merge branch 'master' of github.com:/dgraph-io/dgraph into srfrog/iss… (Jan 16, 2019)
2 changes: 1 addition & 1 deletion posting/lists.go
@@ -159,7 +159,7 @@ func Cleanup() {
// to lru cache and returns it.
//
// plist := Get(key, group)
// ... // Use plist
// ... Use plist
// TODO: This should take a node id and index. And just append all indices to a list.
// When doing a commit, it should update all the sync index watermarks.
// worker pkg would push the indices to the watermarks held by lists.
30 changes: 26 additions & 4 deletions tok/tok.go
@@ -21,14 +21,34 @@ import (
"plugin"
"time"

farm "github.com/dgryski/go-farm"
"github.com/golang/glog"
geom "github.com/twpayne/go-geom"

"github.com/dgraph-io/dgraph/types"
"github.com/dgraph-io/dgraph/x"
)

// Tokenizer identifiers are unique and can't be reused.
// The range 0x00 - 0x79 is system reserved.
// The range 0x80 - 0xff is for custom tokenizers.
// TODO: use these everywhere where we must ensure a system tokenizer.
const (
IdentNone = 0x0
IdentTerm = 0x1
IdentExact = 0x2
IdentYear = 0x4
IdentGeo = 0x5
IdentInt = 0x6
IdentFloat = 0x7
IdentFullText = 0x8
IdentBool = 0x9
IdentTrigram = 0xA
IdentHash = 0xB
IdentMonth = 0x41
IdentDay = 0x42
IdentHour = 0x43
)

// Tokenizer defines what a tokenizer must provide.
type Tokenizer interface {

@@ -344,9 +364,11 @@ func (t HashTokenizer) Tokens(v interface{}) ([]string, error) {
if !ok {
return nil, x.Errorf("Hash tokenizer only supported for string types")
}
var hash [8]byte
binary.BigEndian.PutUint64(hash[:], farm.Hash64([]byte(term)))
return []string{string(hash[:])}, nil
hash := x.Hash256([]byte(term))
if len(hash) == 0 {
return nil, x.Errorf("Hash tokenizer failed to create hash")
}
return []string{string(hash)}, nil
}
func (t HashTokenizer) Identifier() byte { return 0xB }
func (t HashTokenizer) IsSortable() bool { return false }
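
For context on the change above: the old Tokens implementation packed a farm.Hash64 fingerprint into 8 bytes, while the new one emits the full 32-byte BLAKE2b-256 digest from x.Hash256. A minimal standalone sketch of the two encodings (not PR code; it only assumes the go-farm and x/crypto/blake2b packages already referenced in this diff):

package main

import (
	"encoding/binary"
	"fmt"

	farm "github.com/dgryski/go-farm"
	"golang.org/x/crypto/blake2b"
)

// oldHashToken sketches the previous behaviour: an 8-byte farmhash fingerprint,
// lossy enough that equality matches must re-check the stored values.
func oldHashToken(term string) string {
	var buf [8]byte
	binary.BigEndian.PutUint64(buf[:], farm.Hash64([]byte(term)))
	return string(buf[:])
}

// newHashToken sketches the PR's behaviour: a 32-byte BLAKE2b-256 digest,
// wide enough for the later commits to treat the hash index as non-lossy.
func newHashToken(term string) string {
	h := blake2b.Sum256([]byte(term))
	return string(h[:])
}

func main() {
	fmt.Printf("old token: %d bytes, new token: %d bytes\n",
		len(oldHashToken("Alice")), len(newHashToken("Alice")))
}
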
44 changes: 38 additions & 6 deletions worker/task.go
@@ -964,19 +964,39 @@ func (qs *queryState) handleRegexFunction(ctx context.Context, arg funcArgs) err
}

func (qs *queryState) handleCompareFunction(ctx context.Context, arg funcArgs) error {
span := otrace.FromContext(ctx)
stop := x.SpanTimer(span, "handleCompareFunction")
defer stop()
if span != nil {
span.Annotatef(nil, "Number of uids: %d. args.srcFn: %+v", arg.srcFn.n, arg.srcFn)
}

attr := arg.q.Attr
span.Annotatef(nil, "Attr: %s. Fname: %s", attr, arg.srcFn.fname)
tokenizer, err := pickTokenizer(attr, arg.srcFn.fname)
// We should already have checked this in getInequalityTokens.
x.Check(err)
// Only if the tokenizer that we used IsLossy, then we need to fetch
// and compare the actual values.
span.Annotatef(nil, "Tokenizer: %s, Lossy: %t", tokenizer.Name(), tokenizer.IsLossy())
if tokenizer.IsLossy() {
// Need to evaluate inequality for entries in the first bucket.
typ, err := schema.State().TypeOf(attr)
if err != nil || !typ.IsScalar() {
return x.Errorf("Attribute not scalar: %s %v", attr, typ)
}

var keyFn func(int, uint64) []byte
if tokenizer.Identifier() == tok.IdentHash {
keyFn = func(row int, _ uint64) []byte {
return x.IndexKey(attr, arg.srcFn.tokens[row])
}
} else {
keyFn = func(_ int, uid uint64) []byte {
return x.DataKey(attr, uid)
}
}

x.AssertTrue(len(arg.out.UidMatrix) > 0)
rowsToFilter := 0
if arg.srcFn.fname == eq {
@@ -1000,8 +1020,9 @@ func (qs *queryState) handleCompareFunction(ctx context.Context, arg funcArgs) e
algo.ApplyFilter(arg.out.UidMatrix[row], func(uid uint64, i int) bool {
switch lang {
case "":
// TODO: use hash index in list
if isList {
pl, err := posting.GetNoStore(x.DataKey(attr, uid))
pl, err := posting.GetNoStore(keyFn(row, uid))
if err != nil {
filterErr = err
return false
@@ -1023,11 +1044,15 @@ func (qs *queryState) handleCompareFunction(ctx context.Context, arg funcArgs) e
return false
}

pl, err := posting.GetNoStore(x.DataKey(attr, uid))
pl, err := posting.GetNoStore(keyFn(row, uid))
if err != nil {
filterErr = err
return false
}
if arg.q.SrcFunc.Name == "eq" {
span.Annotate(nil, fmt.Sprintf("--- eq token: %d:%s", row, arg.srcFn.eqTokens[row].Value))
return true
}
sv, err := pl.Value(arg.q.ReadTs)
if err != nil {
if err != posting.ErrNoValue {
@@ -1039,7 +1064,7 @@ func (qs *queryState) handleCompareFunction(ctx context.Context, arg funcArgs) e
return err == nil &&
types.CompareVals(arg.q.SrcFunc.Name, dst, arg.srcFn.eqTokens[row])
case ".":
pl, err := posting.GetNoStore(x.DataKey(attr, uid))
pl, err := posting.GetNoStore(keyFn(row, uid))
if err != nil {
filterErr = err
return false
@@ -1058,17 +1083,24 @@ func (qs *queryState) handleCompareFunction(ctx context.Context, arg funcArgs) e
}
return false
default:
sv, err := fetchValue(uid, attr, arg.q.Langs, typ, arg.q.ReadTs)
pl, err := posting.GetNoStore(keyFn(row, uid))
if err != nil {
if err != posting.ErrNoValue {
filterErr = err
}
return false
}
if sv.Value == nil {
src, err := pl.ValueFor(arg.q.ReadTs, arg.q.Langs)
if err != nil {
filterErr = err
return false
}
dst, err := types.Convert(src, typ)
if err != nil {
filterErr = err
return false
}
return types.CompareVals(arg.q.SrcFunc.Name, sv, arg.srcFn.eqTokens[row])
return types.CompareVals(arg.q.SrcFunc.Name, dst, arg.srcFn.eqTokens[row])
}
})
if filterErr != nil {
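
The heart of the task.go change is the keyFn selection: when the picked tokenizer is the hash tokenizer, the comparison can read the posting list stored under the index key built from the row's token, instead of fetching each UID's value under its data key. A rough standalone sketch of that dispatch (indexKey and dataKey below are hypothetical stand-ins for x.IndexKey and x.DataKey, and the surrounding funcArgs plumbing is omitted):

package main

import "fmt"

// indexKey and dataKey are illustrative stand-ins, not the real x.IndexKey/x.DataKey.
func indexKey(attr, token string) []byte { return []byte("idx:" + attr + ":" + token) }
func dataKey(attr string, uid uint64) []byte {
	return []byte(fmt.Sprintf("data:%s:%d", attr, uid))
}

// pickKeyFn mirrors the dispatch in handleCompareFunction: with a hash index the
// lookup key comes from the row's token, otherwise from the UID being filtered.
func pickKeyFn(usingHashIndex bool, attr string, tokens []string) func(row int, uid uint64) []byte {
	if usingHashIndex {
		return func(row int, _ uint64) []byte { return indexKey(attr, tokens[row]) }
	}
	return func(_ int, uid uint64) []byte { return dataKey(attr, uid) }
}

func main() {
	keyFn := pickKeyFn(true, "name", []string{"(32-byte hash token)"})
	fmt.Printf("%s\n", keyFn(0, 42))
}
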
24 changes: 15 additions & 9 deletions worker/tokens.go
@@ -87,18 +87,22 @@ func pickTokenizer(attr string, f string) (tok.Tokenizer, error) {

tokenizers := schema.State().Tokenizer(attr)

var tokenizer tok.Tokenizer
for _, t := range tokenizers {
tokIdx := -1
for i, t := range tokenizers {
if !t.IsLossy() {
tokenizer = t
tokIdx = i
break
}
// prefer hash over other lossy tokenizers.
if t.Identifier() == tok.IdentHash {
tokIdx = i
}
}

// If the function is eq and we found a non-lossy or hash tokenizer, let's
// return it to avoid the second lookup.
if f == "eq" && tokenizer != nil {
return tokenizer, nil
if f == "eq" && tokIdx != -1 {
return tokenizers[tokIdx], nil
}

// Lets try to find a sortable tokenizer.
@@ -178,21 +182,23 @@ func getInequalityTokens(readTs uint64, attr, f string,
continue
}
// if its lossy then we handle inequality comparison later
// on in handleCompareAttr
// on in handleCompareFunction
if tokenizer.IsLossy() {
out = append(out, k.Term)
} else {
// for non Lossy lets compare for inequality (gt & lt)
// to see if key needs to be included
if f == "gt" {
switch {
case f == "gt":
if bytes.Compare([]byte(k.Term), ineqTokenInBytes) > 0 {
out = append(out, k.Term)
}
} else if f == "lt" {
case f == "lt":
if bytes.Compare([]byte(k.Term), ineqTokenInBytes) < 0 {
out = append(out, k.Term)
}
} else { //for le or ge or any other fn consider the key
default:
// for le or ge or any other fn consider the key
out = append(out, k.Term)
}
}
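
The tokens.go change reduces to a preference order inside pickTokenizer: the first non-lossy tokenizer wins outright, otherwise the hash tokenizer is remembered ahead of other lossy ones, and for eq the remembered tokenizer is returned to skip the second lookup. A standalone sketch of that selection (the tokenizer struct here is a reduced stand-in for the tok.Tokenizer interface, not the real type):

package main

import "fmt"

// tokenizer is a reduced stand-in carrying only the properties this selection uses.
type tokenizer struct {
	name  string
	lossy bool
	ident byte
}

const identHash = 0xB // matches tok.IdentHash introduced in this PR

// pickForEq mirrors the new loop: the first non-lossy tokenizer wins, otherwise
// the hash tokenizer is preferred over the other lossy tokenizers.
func pickForEq(tokenizers []tokenizer) (tokenizer, bool) {
	idx := -1
	for i, t := range tokenizers {
		if !t.lossy {
			idx = i
			break
		}
		if t.ident == identHash {
			idx = i
		}
	}
	if idx < 0 {
		return tokenizer{}, false
	}
	return tokenizers[idx], true
}

func main() {
	ts := []tokenizer{
		{name: "term", lossy: true, ident: 0x1},
		{name: "hash", lossy: true, ident: identHash},
	}
	if t, ok := pickForEq(ts); ok {
		fmt.Println("picked for eq:", t.name) // picked for eq: hash
	}
}
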
24 changes: 24 additions & 0 deletions x/hash.go
@@ -0,0 +1,24 @@
/*
* Copyright 2018 Dgraph Labs, Inc. and Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package x

import "golang.org/x/crypto/blake2b"

// Hash256 returns the 32-byte BLAKE2b-256 digest of data.
func Hash256(data []byte) []byte {
h := blake2b.Sum256(data)
return h[:]
}
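
For reference, a small usage sketch of the new helper (it assumes only the golang.org/x/crypto/blake2b dependency added above; the digest is always 32 bytes regardless of input length):

package main

import (
	"encoding/hex"
	"fmt"

	"golang.org/x/crypto/blake2b"
)

// hash256 reproduces x.Hash256 locally: the BLAKE2b-256 digest of data as a slice.
func hash256(data []byte) []byte {
	h := blake2b.Sum256(data)
	return h[:]
}

func main() {
	d := hash256([]byte("dgraph"))
	fmt.Println(len(d), hex.EncodeToString(d)) // prints 32 and the hex digest
}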