Improve hash index #2887

Merged · 25 commits · Jan 16, 2019
Changes from 23 commits

Commits
d05f253
saving state
Jan 5, 2019
0043d6c
added new fingerprint func using BLAKE2b
Jan 8, 2019
437ed96
renamed function to Hash256 for clarity.
Jan 8, 2019
5617e04
replaced 64 fingerprint hash with Hash256
Jan 8, 2019
89eeabb
pickTokenizer use hash tokenizer when list is lossy.
Jan 10, 2019
4750b64
added tokenizer identifier list for enforcing tokenizer.
Jan 10, 2019
8433ccc
compare func using hash index if available and eq won't compare values
Jan 10, 2019
7682c55
fixed minor comment glitches
Jan 10, 2019
47fee98
use tokenizer identifier consts, change hash to non-lossy.
Jan 10, 2019
4cf914b
using non-lossy hash so no need for extra logic in handleCompareFunction
Jan 10, 2019
990c9bc
simplify pickTokenizer and
Jan 10, 2019
2af6293
simplify pickTokenizer
Jan 10, 2019
c9fa41a
using tokenizer id
Jan 10, 2019
c46b520
added id value for custom tokenizers, IdentCustom
Jan 10, 2019
a6d461c
using tokenizer ids when possible
Jan 10, 2019
be0064a
Merge branch 'master' of github.com:/dgraph-io/dgraph into srfrog/iss…
Jan 11, 2019
2d51a3d
added hash index tests
Jan 11, 2019
16e8a9f
Manish's review. Fixed a new bug introduced by this PR during IdentCu…
manishrjain Jan 14, 2019
0f164a2
Remove Long term for exact index warning.
manishrjain Jan 14, 2019
3db5ba5
fixed logic
Jan 14, 2019
5658ded
pickTokenizer return error when comparison func doesn't have non-loss…
Jan 14, 2019
8b914de
added warning for eq comparison without non-lossy tokenizer
Jan 14, 2019
41a3d4d
re-fixed this slippery lil bug
Jan 15, 2019
3bf1b92
removed extra glog
Jan 16, 2019
91f24de
Merge branch 'master' of github.com:/dgraph-io/dgraph into srfrog/iss…
Jan 16, 2019
6 changes: 0 additions & 6 deletions posting/index.go
@@ -55,12 +55,6 @@ func indexTokens(attr, lang string, src types.Val) ([]string, error) {
// Schema will know the mapping from attr to tokenizer.
var tokens []string
for _, it := range schema.State().Tokenizer(attr) {
if it.Name() == "exact" && schemaType == types.StringID && len(sv.Value.(string)) > 100 {
// Exact index can only be applied for strings so we can safely try to convert Value to
// string.
glog.Infof("Long term for exact index on predicate: [%s]. "+
"Consider switching to hash for better performance.\n", attr)
}
toks, err := tok.BuildTokens(sv.Value, tok.GetLangTokenizer(it, lang))
if err != nil {
return tokens, err
2 changes: 1 addition & 1 deletion posting/lists.go
@@ -159,7 +159,7 @@ func Cleanup() {
// to lru cache and returns it.
//
// plist := Get(key, group)
// ... // Use plist
// ... Use plist
// TODO: This should take a node id and index. And just append all indices to a list.
// When doing a commit, it should update all the sync index watermarks.
// worker pkg would push the indices to the watermarks held by lists.
114 changes: 114 additions & 0 deletions systest/queries_test.go
@@ -46,6 +46,7 @@ func TestQuery(t *testing.T) {
t.Run("schema predicate names", wrap(SchemaQueryTestPredicate1))
t.Run("schema specific predicate fields", wrap(SchemaQueryTestPredicate2))
t.Run("schema specific predicate field", wrap(SchemaQueryTestPredicate3))
t.Run("hash index queries", wrap(QueryHashIndex))
t.Run("cleanup", wrap(SchemaQueryCleanup))
}

@@ -316,3 +317,116 @@ func SchemaQueryTestHTTP(t *testing.T, c *dgo.Dgraph) {
}`
CompareJSON(t, js, string(m["data"]))
}

func QueryHashIndex(t *testing.T, c *dgo.Dgraph) {
ctx := context.Background()

require.NoError(t, c.Alter(ctx, &api.Operation{
Schema: `
name: string @index(hash) @lang .
`,
}))

txn := c.NewTxn()
_, err := txn.Mutate(ctx, &api.Mutation{
SetNquads: []byte(`
_:p0 <name> "" .
_:p1 <name> "0" .
_:p2 <name> "srfrog" .
_:p3 <name> "Lorem ipsum" .
_:p4 <name> "Lorem ipsum dolor sit amet" .
_:p5 <name> "Lorem ipsum dolor sit amet, consectetur adipiscing elit" .
_:p6 <name> "Lorem ipsum"@en .
_:p7 <name> "Lorem ipsum dolor sit amet"@en .
_:p8 <name> "Lorem ipsum dolor sit amet, consectetur adipiscing elit"@en .
_:p9 <name> "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed varius tellus ut sem bibendum, eu tristique augue congue. Praesent eget odio tincidunt, pellentesque ante sit amet, tempus sem. Donec et tellus et diam facilisis egestas ut ac risus. Proin feugiat risus tristique erat condimentum placerat. Nulla eget ligula tempus, blandit leo vel, accumsan tortor. Phasellus et felis in diam ultricies porta nec in ipsum. Phasellus id leo sagittis, bibendum enim ut, pretium lectus. Quisque ac ex viverra, suscipit turpis sed, scelerisque metus. Sed non dui facilisis, viverra leo eget, vulputate erat. Etiam nec enim sed nisi imperdiet cursus. Suspendisse sed ligula non nisi pharetra varius." .
_:pa <name> ""@fr .
`),
})
require.NoError(t, err)
require.NoError(t, txn.Commit(ctx))

tests := []struct {
in, out string
}{
{
in: `schema(pred: [name]) {}`,
out: `
{
"schema": [
{
"index": true,
"lang": true,
"predicate": "name",
"tokenizer": [
"hash"
],
"type": "string"
}
]
}`,
},
{
in: `{q(func:eq(name,"")){name}}`,
out: `{"q": [{"name":""}]}`,
},
{
in: `{q(func:eq(name,"0")){name}}`,
out: `{"q": [{"name":"0"}]}`,
},
{
in: `{q(func:eq(name,"srfrog")){name}}`,
out: `{"q": [{"name":"srfrog"}]}`,
},
{
in: `{q(func:eq(name,"Lorem ipsum")){name}}`,
out: `{"q": [{"name":"Lorem ipsum"}]}`,
},
{
in: `{q(func:eq(name,"Lorem ipsum dolor sit amet")){name}}`,
out: `{"q": [{"name":"Lorem ipsum dolor sit amet"}]}`,
},
{
in: `{q(func:eq(name@en,"Lorem ipsum")){name@en}}`,
out: `{"q": [{"name@en":"Lorem ipsum"}]}`,
},
{
in: `{q(func:eq(name@.,"Lorem ipsum dolor sit amet")){name@en}}`,
out: `{"q": [{"name@en":"Lorem ipsum dolor sit amet"}]}`,
},
{
in: `{q(func:eq(name,["srfrog"])){name}}`,
out: `{"q": [{"name":"srfrog"}]}`,
},
{
in: `{q(func:eq(name,["srfrog","srf","srfrogg","sr","s"])){name}}`,
out: `{"q": [{"name":"srfrog"}]}`,
},
{
in: `{q(func:eq(name,["Lorem ipsum","Lorem ipsum dolor sit amet, consectetur adipiscing elit",""])){name}}`,
out: `{"q": [{"name":""},{"name":"Lorem ipsum"},{"name":"Lorem ipsum dolor sit amet, consectetur adipiscing elit"}]}`,
},
{
in: `{q(func:eq(name,["Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum"])){name}}`,
out: `{"q": [{"name":"Lorem ipsum"}]}`,
},
{
in: `{q(func:eq(name@en,["Lorem ipsum","Lorem ipsum dolor sit amet, consectetur adipiscing elit",""])){name@en}}`,
out: `{"q": [{"name@en":"Lorem ipsum"},{"name@en":"Lorem ipsum dolor sit amet, consectetur adipiscing elit"}]}`,
},
{
in: `{q(func:eq(name@en,["Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum"])){name@en}}`,
out: `{"q": [{"name@en":"Lorem ipsum"}]}`,
},
{
in: `{q(func:eq(name@.,"")){name@fr}}`,
out: `{"q": [{"name@fr":""}]}`,
},
}

for _, tc := range tests {
resp, err := c.NewTxn().Query(ctx, tc.in)
require.NoError(t, err)
CompareJSON(t, tc.out, string(resp.Json))
}
}
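
The new QueryHashIndex cases above exercise the hash index end to end through the test harness. For orientation, here is a minimal standalone sketch of the same flow — altering the schema to use the hash index and running one eq query through dgo. This is not code from the PR; the localhost:9080 endpoint and the bare-bones error handling are assumptions.

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/dgraph-io/dgo"
	"github.com/dgraph-io/dgo/protos/api"
	"google.golang.org/grpc"
)

func main() {
	// Assumes a Dgraph alpha listening on localhost:9080.
	conn, err := grpc.Dial("localhost:9080", grpc.WithInsecure())
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()
	dg := dgo.NewDgraphClient(api.NewDgraphClient(conn))

	ctx := context.Background()
	// With the hash index, eq lookups compare fixed-size hash tokens instead of full strings.
	if err := dg.Alter(ctx, &api.Operation{Schema: `name: string @index(hash) @lang .`}); err != nil {
		log.Fatal(err)
	}

	resp, err := dg.NewTxn().Query(ctx, `{q(func: eq(name, "srfrog")) { name }}`)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(resp.Json))
}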
67 changes: 49 additions & 18 deletions tok/tok.go
@@ -21,14 +21,36 @@ import (
"plugin"
"time"

farm "github.com/dgryski/go-farm"
"github.com/golang/glog"
geom "github.com/twpayne/go-geom"
"golang.org/x/crypto/blake2b"

"github.com/dgraph-io/dgraph/types"
"github.com/dgraph-io/dgraph/x"
)

// Tokenizer identifiers are unique and can't be reused.
// The range 0x00 - 0x7f is system reserved.
// The range 0x80 - 0xff is for custom tokenizers.
// TODO: use these everywhere where we must ensure a system tokenizer.
const (
IdentNone = 0x0
IdentTerm = 0x1
IdentExact = 0x2
IdentYear = 0x4
IdentMonth = 0x41
IdentDay = 0x42
IdentHour = 0x43
IdentGeo = 0x5
IdentInt = 0x6
IdentFloat = 0x7
IdentFullText = 0x8
IdentBool = 0x9
IdentTrigram = 0xA
IdentHash = 0xB
IdentCustom = 0x80
)

// Tokenizer defines what a tokenizer must provide.
type Tokenizer interface {

@@ -102,7 +124,7 @@ func LoadCustomTokenizer(soFile string) {
tokenizer := symb.(func() interface{})().(PluginTokenizer)

id := tokenizer.Identifier()
x.AssertTruef(id >= 0x80,
x.AssertTruef(id >= IdentCustom,
"custom tokenizer identifier byte must be >= 0x80, but was %#x", id)
registerTokenizer(CustomTokenizer{PluginTokenizer: tokenizer})
}
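
A brief aside on IdentCustom: the assertion above means any plugin tokenizer must pick an identifier in the custom range 0x80–0xff. A hypothetical plugin satisfying that contract might look like the sketch below; the tokenizer name and its token logic are made up for illustration and are not part of this PR.

package main

import "strings"

// LowerWordTokenizer is a hypothetical custom tokenizer, compiled into a *.so file
// with -buildmode=plugin and loaded via LoadCustomTokenizer.
type LowerWordTokenizer struct{}

func (LowerWordTokenizer) Name() string     { return "lowerword" }
func (LowerWordTokenizer) Type() string     { return "string" }
func (LowerWordTokenizer) Identifier() byte { return 0x80 } // == IdentCustom, first byte of the custom range

func (LowerWordTokenizer) Tokens(value interface{}) ([]string, error) {
	str, ok := value.(string)
	if !ok {
		return nil, nil
	}
	var toks []string
	for _, w := range strings.Fields(str) {
		toks = append(toks, strings.ToLower(w))
	}
	return toks, nil
}

// Tokenizer is the exported symbol LoadCustomTokenizer looks up in the plugin.
func Tokenizer() interface{} { return LowerWordTokenizer{} }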
@@ -128,7 +150,7 @@ func (t GeoTokenizer) Type() string { return "geo" }
func (t GeoTokenizer) Tokens(v interface{}) ([]string, error) {
return types.IndexGeoTokens(v.(geom.T))
}
func (t GeoTokenizer) Identifier() byte { return 0x5 }
func (t GeoTokenizer) Identifier() byte { return IdentGeo }
func (t GeoTokenizer) IsSortable() bool { return false }
func (t GeoTokenizer) IsLossy() bool { return true }

Expand All @@ -139,7 +161,7 @@ func (t IntTokenizer) Type() string { return "int" }
func (t IntTokenizer) Tokens(v interface{}) ([]string, error) {
return []string{encodeInt(v.(int64))}, nil
}
func (t IntTokenizer) Identifier() byte { return 0x6 }
func (t IntTokenizer) Identifier() byte { return IdentInt }
func (t IntTokenizer) IsSortable() bool { return true }
func (t IntTokenizer) IsLossy() bool { return false }

Expand All @@ -150,7 +172,7 @@ func (t FloatTokenizer) Type() string { return "float" }
func (t FloatTokenizer) Tokens(v interface{}) ([]string, error) {
return []string{encodeInt(int64(v.(float64)))}, nil
}
func (t FloatTokenizer) Identifier() byte { return 0x7 }
func (t FloatTokenizer) Identifier() byte { return IdentFloat }
func (t FloatTokenizer) IsSortable() bool { return true }
func (t FloatTokenizer) IsLossy() bool { return true }

Expand All @@ -164,7 +186,7 @@ func (t YearTokenizer) Tokens(v interface{}) ([]string, error) {
binary.BigEndian.PutUint16(buf[0:2], uint16(tval.Year()))
return []string{string(buf)}, nil
}
func (t YearTokenizer) Identifier() byte { return 0x4 }
func (t YearTokenizer) Identifier() byte { return IdentYear }
func (t YearTokenizer) IsSortable() bool { return true }
func (t YearTokenizer) IsLossy() bool { return true }

Expand All @@ -179,7 +201,7 @@ func (t MonthTokenizer) Tokens(v interface{}) ([]string, error) {
binary.BigEndian.PutUint16(buf[2:4], uint16(tval.Month()))
return []string{string(buf)}, nil
}
func (t MonthTokenizer) Identifier() byte { return 0x41 }
func (t MonthTokenizer) Identifier() byte { return IdentMonth }
func (t MonthTokenizer) IsSortable() bool { return true }
func (t MonthTokenizer) IsLossy() bool { return true }

Expand All @@ -195,7 +217,7 @@ func (t DayTokenizer) Tokens(v interface{}) ([]string, error) {
binary.BigEndian.PutUint16(buf[4:6], uint16(tval.Day()))
return []string{string(buf)}, nil
}
func (t DayTokenizer) Identifier() byte { return 0x42 }
func (t DayTokenizer) Identifier() byte { return IdentDay }
func (t DayTokenizer) IsSortable() bool { return true }
func (t DayTokenizer) IsLossy() bool { return true }

Expand All @@ -212,7 +234,7 @@ func (t HourTokenizer) Tokens(v interface{}) ([]string, error) {
binary.BigEndian.PutUint16(buf[6:8], uint16(tval.Hour()))
return []string{string(buf)}, nil
}
func (t HourTokenizer) Identifier() byte { return 0x43 }
func (t HourTokenizer) Identifier() byte { return IdentHour }
func (t HourTokenizer) IsSortable() bool { return true }
func (t HourTokenizer) IsLossy() bool { return true }

Expand All @@ -228,7 +250,7 @@ func (t TermTokenizer) Tokens(v interface{}) ([]string, error) {
tokens := termAnalyzer.Analyze([]byte(str))
return uniqueTerms(tokens), nil
}
func (t TermTokenizer) Identifier() byte { return 0x1 }
func (t TermTokenizer) Identifier() byte { return IdentTerm }
func (t TermTokenizer) IsSortable() bool { return false }
func (t TermTokenizer) IsLossy() bool { return true }

Expand All @@ -242,7 +264,7 @@ func (t ExactTokenizer) Tokens(v interface{}) ([]string, error) {
}
return nil, x.Errorf("Exact indices only supported for string types")
}
func (t ExactTokenizer) Identifier() byte { return 0x2 }
func (t ExactTokenizer) Identifier() byte { return IdentExact }
func (t ExactTokenizer) IsSortable() bool { return true }
func (t ExactTokenizer) IsLossy() bool { return false }

Expand All @@ -265,7 +287,7 @@ func (t FullTextTokenizer) Tokens(v interface{}) ([]string, error) {
// finally, return the terms.
return uniqueTerms(tokens), nil
}
func (t FullTextTokenizer) Identifier() byte { return 0x8 }
func (t FullTextTokenizer) Identifier() byte { return IdentFullText }
func (t FullTextTokenizer) IsSortable() bool { return false }
func (t FullTextTokenizer) IsLossy() bool { return true }

Expand Down Expand Up @@ -307,7 +329,7 @@ func (t BoolTokenizer) Tokens(v interface{}) ([]string, error) {
}
return []string{encodeInt(b)}, nil
}
func (t BoolTokenizer) Identifier() byte { return 0x9 }
func (t BoolTokenizer) Identifier() byte { return IdentBool }
func (t BoolTokenizer) IsSortable() bool { return false }
func (t BoolTokenizer) IsLossy() bool { return false }

Expand All @@ -331,7 +353,7 @@ func (t TrigramTokenizer) Tokens(v interface{}) ([]string, error) {
}
return nil, nil
}
func (t TrigramTokenizer) Identifier() byte { return 0xA }
func (t TrigramTokenizer) Identifier() byte { return IdentTrigram }
func (t TrigramTokenizer) IsSortable() bool { return false }
func (t TrigramTokenizer) IsLossy() bool { return true }

Expand All @@ -344,13 +366,22 @@ func (t HashTokenizer) Tokens(v interface{}) ([]string, error) {
if !ok {
return nil, x.Errorf("Hash tokenizer only supported for string types")
}
var hash [8]byte
binary.BigEndian.PutUint64(hash[:], farm.Hash64([]byte(term)))
// Blake2 is a hash function equivalent of SHA series, but faster. SHA is the best hash function
// for doing checksum of content, because they have low collision ratios. See issue #2776.
hash := blake2b.Sum256([]byte(term))
if len(hash) == 0 {
return nil, x.Errorf("Hash tokenizer failed to create hash")
}
return []string{string(hash[:])}, nil
}
func (t HashTokenizer) Identifier() byte { return 0xB }
func (t HashTokenizer) Identifier() byte { return IdentHash }
func (t HashTokenizer) IsSortable() bool { return false }
func (t HashTokenizer) IsLossy() bool { return true }

// We have switched HashTokenizer to be non-lossy. This allows us to avoid having to retrieve values
// for the returned results, and compare them against the value in the query, which is slow. There
// is very low probability of collisions with a 256-bit hash. We use that fact to speed up equality
// query operations using the hash index.
func (t HashTokenizer) IsLossy() bool { return false }

// PluginTokenizer is implemented by external plugins loaded dynamically via
// *.so files. It follows the implementation semantics of the Tokenizer
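
To make the HashTokenizer change above concrete: each indexed string now maps to a fixed 32-byte BLAKE2b-256 token, which is why the tokenizer can be declared non-lossy. A small self-contained sketch, assuming only golang.org/x/crypto/blake2b (which the diff already imports):

package main

import (
	"encoding/hex"
	"fmt"

	"golang.org/x/crypto/blake2b"
)

func main() {
	// Distinct strings collide only with negligible probability at 256 bits,
	// which is what lets eq() trust the hash index without re-reading values.
	for _, term := range []string{"Lorem ipsum", "Lorem ipsum dolor sit amet"} {
		tok := blake2b.Sum256([]byte(term))
		fmt.Printf("%q -> %s\n", term, hex.EncodeToString(tok[:]))
	}
}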
9 changes: 9 additions & 0 deletions worker/task.go
@@ -962,12 +962,21 @@ func (qs *queryState) handleRegexFunction(ctx context.Context, arg funcArgs) err
}

func (qs *queryState) handleCompareFunction(ctx context.Context, arg funcArgs) error {
span := otrace.FromContext(ctx)
stop := x.SpanTimer(span, "handleCompareFunction")
defer stop()
if span != nil {
span.Annotatef(nil, "Number of uids: %d. args.srcFn: %+v", arg.srcFn.n, arg.srcFn)
}

attr := arg.q.Attr
span.Annotatef(nil, "Attr: %s. Fname: %s", attr, arg.srcFn.fname)
tokenizer, err := pickTokenizer(attr, arg.srcFn.fname)
// We should already have checked this in getInequalityTokens.
x.Check(err)
// Only if the tokenizer that we used IsLossy, then we need to fetch
// and compare the actual values.
span.Annotatef(nil, "Tokenizer: %s, Lossy: %t", tokenizer.Name(), tokenizer.IsLossy())
if tokenizer.IsLossy() {
// Need to evaluate inequality for entries in the first bucket.
typ, err := schema.State().TypeOf(attr)
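
The handleCompareFunction hunk above is where the non-lossy hash pays off: only a lossy tokenizer forces Dgraph to fetch the stored values and compare them against the query value. A toy sketch of that branch, illustrative only and not the PR's code:

package main

import "fmt"

// tokenizer captures the one property the branch above inspects.
type tokenizer interface {
	Name() string
	IsLossy() bool
}

type hashTok struct{}

func (hashTok) Name() string  { return "hash" }
func (hashTok) IsLossy() bool { return false } // 256-bit BLAKE2b token, non-lossy

type termTok struct{}

func (termTok) Name() string  { return "term" }
func (termTok) IsLossy() bool { return true }

// needsValueFetch mirrors the check in handleCompareFunction: lossy tokenizers
// require fetching and comparing actual values; non-lossy ones can trust the index.
func needsValueFetch(t tokenizer) bool { return t.IsLossy() }

func main() {
	for _, t := range []tokenizer{hashTok{}, termTok{}} {
		fmt.Printf("%s index: fetch values for eq = %v\n", t.Name(), needsValueFetch(t))
	}
}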