Support vector data type, query + searcher (#1857)

+ Add support for indexing and querying vector data (array of float32) + Corresponding to each field of type vector, there will be associated properties like Dimensions and Similarity, which will be used to determine validity of a vector and a criteria for scoring search hits, respectively. + Supports vector reader interface (searcher) and the `knn` construct within the SearchRequest. Related PR: * blevesearch/bleve_index_api#34 --------- Co-authored-by: Abhi Dangeti <abhinav@couchbase.com> Co-authored-by: Aditi Ahuja <aditi.ahuja@couchbase.com>
blevesearch · Nov 3, 2023 · 80d9b18 · 80d9b18
1 parent 840fde5
commit 80d9b18
Show file tree

Hide file tree

Showing 22 changed files with 1,348 additions and 171 deletions.
diff --git a/document/field_vector.go b/document/field_vector.go
@@ -0,0 +1,138 @@
+//  Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build vectors
+// +build vectors
+
+package document
+
+import (
+	"fmt"
+	"reflect"
+
+	"github.com/blevesearch/bleve/v2/size"
+	index "github.com/blevesearch/bleve_index_api"
+)
+
+var reflectStaticSizeVectorField int
+
+func init() {
+	var f VectorField
+	reflectStaticSizeVectorField = int(reflect.TypeOf(f).Size())
+}
+
+const DefaultVectorIndexingOptions = index.IndexField
+
+type VectorField struct {
+	name              string
+	dims              int    // Dimensionality of the vector
+	similarity        string // Similarity metric to use for scoring
+	options           index.FieldIndexingOptions
+	value             []float32
+	numPlainTextBytes uint64
+}
+
+func (n *VectorField) Size() int {
+	return reflectStaticSizeVectorField + size.SizeOfPtr +
+		len(n.name) +
+		int(numBytesFloat32s(n.value))
+}
+
+func (n *VectorField) Name() string {
+	return n.name
+}
+
+func (n *VectorField) ArrayPositions() []uint64 {
+	return nil
+}
+
+func (n *VectorField) Options() index.FieldIndexingOptions {
+	return n.options
+}
+
+func (n *VectorField) NumPlainTextBytes() uint64 {
+	return n.numPlainTextBytes
+}
+
+func (n *VectorField) AnalyzedLength() int {
+	// vectors aren't analyzed
+	return 0
+}
+
+func (n *VectorField) EncodedFieldType() byte {
+	return 'v'
+}
+
+func (n *VectorField) AnalyzedTokenFrequencies() index.TokenFrequencies {
+	// vectors aren't analyzed
+	return nil
+}
+
+func (n *VectorField) Analyze() {
+	// vectors aren't analyzed
+}
+
+func (n *VectorField) Value() []byte {
+	return nil
+}
+
+func (n *VectorField) GoString() string {
+	return fmt.Sprintf("&document.VectorField{Name:%s, Options: %s, "+
+		"Value: %+v}", n.name, n.options, n.value)
+}
+
+// For the sake of not polluting the API, we are keeping arrayPositions as a
+// parameter, but it is not used.
+func NewVectorField(name string, arrayPositions []uint64,
+	vector []float32, dims int, similarity string) *VectorField {
+	return NewVectorFieldWithIndexingOptions(name, arrayPositions,
+		vector, dims, similarity, DefaultVectorIndexingOptions)
+}
+
+// For the sake of not polluting the API, we are keeping arrayPositions as a
+// parameter, but it is not used.
+func NewVectorFieldWithIndexingOptions(name string, arrayPositions []uint64,
+	vector []float32, dims int, similarity string,
+	options index.FieldIndexingOptions) *VectorField {
+	options = options | DefaultVectorIndexingOptions
+
+	return &VectorField{
+		name:              name,
+		dims:              dims,
+		similarity:        similarity,
+		options:           options,
+		value:             vector,
+		numPlainTextBytes: numBytesFloat32s(vector),
+	}
+}
+
+func numBytesFloat32s(value []float32) uint64 {
+	return uint64(len(value) * size.SizeOfFloat32)
+}
+
+// -----------------------------------------------------------------------------
+// Following methods help in implementing the bleve_index_api's VectorField
+// interface.
+
+func (n *VectorField) Vector() []float32 {
+	return n.value
+}
+
+func (n *VectorField) Dims() int {
+	return n.dims
+}
+
+func (n *VectorField) Similarity() string {
+	return n.similarity
+}
diff --git a/geo/parse.go b/geo/parse.go
@@ -18,6 +18,8 @@ import (
 	"reflect"
 	"strconv"
 	"strings"
+
+	"github.com/blevesearch/bleve/v2/util"
 )
 
 // ExtractGeoPoint takes an arbitrary interface{} and tries it's best to
@@ -61,12 +63,12 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
 			first := thingVal.Index(0)
 			if first.CanInterface() {
 				firstVal := first.Interface()
-				lon, foundLon = extractNumericVal(firstVal)
+				lon, foundLon = util.ExtractNumericValFloat64(firstVal)
 			}
 			second := thingVal.Index(1)
 			if second.CanInterface() {
 				secondVal := second.Interface()
-				lat, foundLat = extractNumericVal(secondVal)
+				lat, foundLat = util.ExtractNumericValFloat64(secondVal)
 			}
 		}
 	}
@@ -105,12 +107,12 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
 	// is it a map
 	if l, ok := thing.(map[string]interface{}); ok {
 		if lval, ok := l["lon"]; ok {
-			lon, foundLon = extractNumericVal(lval)
+			lon, foundLon = util.ExtractNumericValFloat64(lval)
 		} else if lval, ok := l["lng"]; ok {
-			lon, foundLon = extractNumericVal(lval)
+			lon, foundLon = util.ExtractNumericValFloat64(lval)
 		}
 		if lval, ok := l["lat"]; ok {
-			lat, foundLat = extractNumericVal(lval)
+			lat, foundLat = util.ExtractNumericValFloat64(lval)
 		}
 	}
 
@@ -121,19 +123,19 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
 			if strings.HasPrefix(strings.ToLower(fieldName), "lon") {
 				if thingVal.Field(i).CanInterface() {
 					fieldVal := thingVal.Field(i).Interface()
-					lon, foundLon = extractNumericVal(fieldVal)
+					lon, foundLon = util.ExtractNumericValFloat64(fieldVal)
 				}
 			}
 			if strings.HasPrefix(strings.ToLower(fieldName), "lng") {
 				if thingVal.Field(i).CanInterface() {
 					fieldVal := thingVal.Field(i).Interface()
-					lon, foundLon = extractNumericVal(fieldVal)
+					lon, foundLon = util.ExtractNumericValFloat64(fieldVal)
 				}
 			}
 			if strings.HasPrefix(strings.ToLower(fieldName), "lat") {
 				if thingVal.Field(i).CanInterface() {
 					fieldVal := thingVal.Field(i).Interface()
-					lat, foundLat = extractNumericVal(fieldVal)
+					lat, foundLat = util.ExtractNumericValFloat64(fieldVal)
 				}
 			}
 		}
@@ -157,25 +159,6 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
 	return lon, lat, foundLon && foundLat
 }
 
-// extract numeric value (if possible) and returns a float64
-func extractNumericVal(v interface{}) (float64, bool) {
-	val := reflect.ValueOf(v)
-	if !val.IsValid() {
-		return 0, false
-	}
-	typ := val.Type()
-	switch typ.Kind() {
-	case reflect.Float32, reflect.Float64:
-		return val.Float(), true
-	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
-		return float64(val.Int()), true
-	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
-		return float64(val.Uint()), true
-	}
-
-	return 0, false
-}
-
 // various support interfaces which can be used to find lat/lon
 type loner interface {
 	Lon() float64
@@ -209,12 +192,12 @@ func extractCoordinates(thing interface{}) []float64 {
 			first := thingVal.Index(0)
 			if first.CanInterface() {
 				firstVal := first.Interface()
-				lon, foundLon = extractNumericVal(firstVal)
+				lon, foundLon = util.ExtractNumericValFloat64(firstVal)
 			}
 			second := thingVal.Index(1)
 			if second.CanInterface() {
 				secondVal := second.Interface()
-				lat, foundLat = extractNumericVal(secondVal)
+				lat, foundLat = util.ExtractNumericValFloat64(secondVal)
 			}
 
 			if !foundLon || !foundLat {

diff --git a/index/scorch/snapshot_index_vr.go b/index/scorch/snapshot_index_vr.go
@@ -0,0 +1,156 @@
+//  Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build vectors
+// +build vectors
+
+package scorch
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"reflect"
+
+	"github.com/blevesearch/bleve/v2/size"
+	index "github.com/blevesearch/bleve_index_api"
+	segment_api "github.com/blevesearch/scorch_segment_api/v2"
+)
+
+var reflectStaticSizeIndexSnapshotVectorReader int
+
+func init() {
+	var istfr IndexSnapshotVectorReader
+	reflectStaticSizeIndexSnapshotVectorReader = int(reflect.TypeOf(istfr).Size())
+}
+
+type IndexSnapshotVectorReader struct {
+	vector        []float32
+	field         string
+	k             int64
+	snapshot      *IndexSnapshot
+	postings      []segment_api.VecPostingsList
+	iterators     []segment_api.VecPostingsIterator
+	segmentOffset int
+	currPosting   segment_api.VecPosting
+	currID        index.IndexInternalID
+	ctx           context.Context
+}
+
+func (i *IndexSnapshotVectorReader) Size() int {
+	sizeInBytes := reflectStaticSizeIndexSnapshotVectorReader + size.SizeOfPtr +
+		len(i.vector) + len(i.field) + len(i.currID)
+
+	for _, entry := range i.postings {
+		sizeInBytes += entry.Size()
+	}
+
+	for _, entry := range i.iterators {
+		sizeInBytes += entry.Size()
+	}
+
+	if i.currPosting != nil {
+		sizeInBytes += i.currPosting.Size()
+	}
+
+	return sizeInBytes
+}
+
+func (i *IndexSnapshotVectorReader) Next(preAlloced *index.VectorDoc) (
+	*index.VectorDoc, error) {
+	rv := preAlloced
+	if rv == nil {
+		rv = &index.VectorDoc{}
+	}
+
+	for i.segmentOffset < len(i.iterators) {
+		next, err := i.iterators[i.segmentOffset].Next()
+		if err != nil {
+			return nil, err
+		}
+		if next != nil {
+			// make segment number into global number by adding offset
+			globalOffset := i.snapshot.offsets[i.segmentOffset]
+			nnum := next.Number()
+			rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset)
+			rv.Score = float64(next.Score())
+
+			i.currID = rv.ID
+			i.currPosting = next
+
+			return rv, nil
+		}
+		i.segmentOffset++
+	}
+
+	return nil, nil
+}
+
+func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID,
+	preAlloced *index.VectorDoc) (*index.VectorDoc, error) {
+
+	if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 {
+		i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k)
+		if err != nil {
+			return nil, err
+		}
+		// close the current term field reader before replacing it with a new one
+		_ = i.Close()
+		*i = *(i2.(*IndexSnapshotVectorReader))
+	}
+
+	num, err := docInternalToNumber(ID)
+	if err != nil {
+		return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err)
+	}
+	segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num)
+	if segIndex >= len(i.snapshot.segment) {
+		return nil, fmt.Errorf("computed segment index %d out of bounds %d",
+			segIndex, len(i.snapshot.segment))
+	}
+	// skip directly to the target segment
+	i.segmentOffset = segIndex
+	next, err := i.iterators[i.segmentOffset].Advance(ldocNum)
+	if err != nil {
+		return nil, err
+	}
+	if next == nil {
+		// we jumped directly to the segment that should have contained it
+		// but it wasn't there, so reuse Next() which should correctly
+		// get the next hit after it (we moved i.segmentOffset)
+		return i.Next(preAlloced)
+	}
+
+	if preAlloced == nil {
+		preAlloced = &index.VectorDoc{}
+	}
+	preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+
+		i.snapshot.offsets[segIndex])
+	i.currID = preAlloced.ID
+	i.currPosting = next
+	return preAlloced, nil
+}
+
+func (i *IndexSnapshotVectorReader) Count() uint64 {
+	var rv uint64
+	for _, posting := range i.postings {
+		rv += posting.Count()
+	}
+	return rv
+}
+
+func (i *IndexSnapshotVectorReader) Close() error {
+	// TODO Consider if any scope of recycling here.
+	return nil
+}