Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Commit

Permalink
change: use chromem's new (pending) WhereDocument filters
Browse files Browse the repository at this point in the history
  • Loading branch information
iwilltry42 committed Aug 14, 2024
1 parent 7ac9bb3 commit 9ec5b6d
Show file tree
Hide file tree
Showing 10 changed files with 68 additions and 38 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ toolchain go1.22.4
replace (
github.com/hupe1980/golc => github.com/iwilltry42/golc v0.0.113-0.20240802113826-d065a3c5b0c7 // nbformat extension
github.com/ledongthuc/pdf => github.com/iwilltry42/pdf v0.0.0-20240517145113-99fbaebc5dd3 // fix for reading some PDFs: https://github.com/ledongthuc/pdf/pull/36 + https://github.com/iwilltry42/pdf/pull/2
github.com/philippgille/chromem-go => github.com/iwilltry42/chromem-go v0.0.0-20240813194839-d838df05b583 // OpenAI Compat Fixes
github.com/philippgille/chromem-go => github.com/iwilltry42/chromem-go v0.0.0-20240814131328-6335997e8f1a // OpenAI Compat Fixes
github.com/tmc/langchaingo => github.com/StrongMonkey/langchaingo v0.0.0-20240617180437-9af4bee04c8b // Context-Aware Markdown Splitting
)

Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -201,8 +201,8 @@ github.com/hupe1980/go-tiktoken v0.0.9 h1:qNs/XGTe7UHDUaFkU+jAPbhGzyi9BusOpxrNC8
github.com/hupe1980/go-tiktoken v0.0.9/go.mod h1:NME6d8hrE+Jo+kLUZHhXShYV8e40hYkm4BbSLQKtvAo=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/iwilltry42/chromem-go v0.0.0-20240813194839-d838df05b583 h1:xTsr6cysGZGpu9xYaLiYItFu47Lh54jC49OwYX7fE2M=
github.com/iwilltry42/chromem-go v0.0.0-20240813194839-d838df05b583/go.mod h1:hTd+wGEm/fFPQl7ilfCwQXkgEUxceYh86iIdoKMolPo=
github.com/iwilltry42/chromem-go v0.0.0-20240814131328-6335997e8f1a h1:0D+3L3JfgkSTLXrD0lVLyD9tVcGmI+7zQdgzQb9ijnY=
github.com/iwilltry42/chromem-go v0.0.0-20240814131328-6335997e8f1a/go.mod h1:hTd+wGEm/fFPQl7ilfCwQXkgEUxceYh86iIdoKMolPo=
github.com/iwilltry42/golc v0.0.113-0.20240802113826-d065a3c5b0c7 h1:2AzzbKVW1iP2F+ovqJKq801l6tgxYPt9m2zFKbs+i/Y=
github.com/iwilltry42/golc v0.0.113-0.20240802113826-d065a3c5b0c7/go.mod h1:w692KzkSTSvXROfyu+jYauNXB4YaL1s8zHPDMnNW88o=
github.com/iwilltry42/pdf v0.0.0-20240517145113-99fbaebc5dd3 h1:rCVwFT7Q+HxpijWfSzKTYX4pCDMS7oy/I/WzU30VXyI=
Expand Down
39 changes: 36 additions & 3 deletions pkg/datastore/retrieve.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package datastore
import (
"context"
"github.com/gptscript-ai/knowledge/pkg/datastore/types"
"github.com/philippgille/chromem-go"
"log/slog"

"github.com/gptscript-ai/knowledge/pkg/datastore/defaults"
Expand All @@ -29,9 +30,41 @@ func (s *Datastore) Retrieve(ctx context.Context, datasetIDs []string, query str
}
retrievalFlow.FillDefaults(topK)

return retrievalFlow.Run(ctx, s, query, datasetIDs, &flows.RetrievalFlowOpts{Keywords: opts.Keywords})
var whereDocs []chromem.WhereDocument
if len(opts.Keywords) > 0 {
whereDoc := chromem.WhereDocument{
Operator: chromem.WhereDocumentOperatorOr,
WhereDocuments: []chromem.WhereDocument{},
}
whereDocNot := chromem.WhereDocument{
Operator: chromem.WhereDocumentOperatorAnd,
WhereDocuments: []chromem.WhereDocument{},
}
for _, kw := range opts.Keywords {
if kw[0] == '!' {
whereDocNot.WhereDocuments = append(whereDocNot.WhereDocuments, chromem.WhereDocument{
Operator: chromem.WhereDocumentOperatorContains,
Value: kw[1:],
})
} else {
whereDoc.WhereDocuments = append(whereDoc.WhereDocuments, chromem.WhereDocument{
Operator: chromem.WhereDocumentOperatorContains,
Value: kw,
})
}
}
if len(whereDoc.WhereDocuments) > 0 {
whereDocs = append(whereDocs, whereDoc)
}
if len(whereDocNot.WhereDocuments) > 0 {
whereDocs = append(whereDocs, whereDocNot)
}

}

return retrievalFlow.Run(ctx, s, query, datasetIDs, &flows.RetrievalFlowOpts{Where: nil, WhereDocument: whereDocs})
}

func (s *Datastore) SimilaritySearch(ctx context.Context, query string, numDocuments int, datasetID string, keywords ...string) ([]vectorstore.Document, error) {
return s.Vectorstore.SimilaritySearch(ctx, query, numDocuments, datasetID, keywords...)
func (s *Datastore) SimilaritySearch(ctx context.Context, query string, numDocuments int, datasetID string, where map[string]string, whereDocument []chromem.WhereDocument) ([]vectorstore.Document, error) {
return s.Vectorstore.SimilaritySearch(ctx, query, numDocuments, datasetID, where, whereDocument)
}
7 changes: 4 additions & 3 deletions pkg/datastore/retrievers/retrievers.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ import (
"context"
"fmt"
"github.com/gptscript-ai/knowledge/pkg/datastore/store"
"github.com/philippgille/chromem-go"
"log/slog"

"github.com/gptscript-ai/knowledge/pkg/datastore/defaults"
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore"
)

type Retriever interface {
Retrieve(ctx context.Context, store store.Store, query string, datasetIDs []string, keywords ...string) ([]vs.Document, error)
Retrieve(ctx context.Context, store store.Store, query string, datasetIDs []string, where map[string]string, whereDocument []chromem.WhereDocument) ([]vs.Document, error)
Name() string
}

Expand Down Expand Up @@ -42,7 +43,7 @@ func (r *BasicRetriever) Name() string {
return BasicRetrieverName
}

func (r *BasicRetriever) Retrieve(ctx context.Context, store store.Store, query string, datasetIDs []string, keywords ...string) ([]vs.Document, error) {
func (r *BasicRetriever) Retrieve(ctx context.Context, store store.Store, query string, datasetIDs []string, where map[string]string, whereDocument []chromem.WhereDocument) ([]vs.Document, error) {

if len(datasetIDs) > 1 {
return nil, fmt.Errorf("basic retriever does not support querying multiple datasets")
Expand All @@ -60,5 +61,5 @@ func (r *BasicRetriever) Retrieve(ctx context.Context, store store.Store, query
log.Debug("[BasicRetriever] TopK not set, using default", "default", defaults.TopK)
r.TopK = defaults.TopK
}
return store.SimilaritySearch(ctx, query, r.TopK, datasetID, keywords...)
return store.SimilaritySearch(ctx, query, r.TopK, datasetID, where, whereDocument)
}
5 changes: 3 additions & 2 deletions pkg/datastore/retrievers/routing.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"github.com/gptscript-ai/knowledge/pkg/datastore/store"
"github.com/gptscript-ai/knowledge/pkg/llm"
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore"
"github.com/philippgille/chromem-go"
"log/slog"
)

Expand Down Expand Up @@ -35,7 +36,7 @@ type routingResp struct {
Result string `json:"result"`
}

func (r *RoutingRetriever) Retrieve(ctx context.Context, store store.Store, query string, datasetIDs []string, keywords ...string) ([]vs.Document, error) {
func (r *RoutingRetriever) Retrieve(ctx context.Context, store store.Store, query string, datasetIDs []string, where map[string]string, whereDocument []chromem.WhereDocument) ([]vs.Document, error) {
log := slog.With("component", "RoutingRetriever")

// TODO: properly handle the datasetIDs input
Expand Down Expand Up @@ -92,5 +93,5 @@ func (r *RoutingRetriever) Retrieve(ctx context.Context, store store.Store, quer

slog.Debug("Routing query to dataset", "query", query, "dataset", resp.Result)

return store.SimilaritySearch(ctx, query, r.TopK, resp.Result, keywords...)
return store.SimilaritySearch(ctx, query, r.TopK, resp.Result, where, whereDocument)
}
5 changes: 3 additions & 2 deletions pkg/datastore/retrievers/subquery.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"github.com/gptscript-ai/knowledge/pkg/datastore/store"
"github.com/gptscript-ai/knowledge/pkg/llm"
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore"
"github.com/philippgille/chromem-go"
"log/slog"
"strings"
)
Expand Down Expand Up @@ -37,7 +38,7 @@ type subqueryResp struct {
Results []string `json:"results"`
}

func (s SubqueryRetriever) Retrieve(ctx context.Context, store store.Store, query string, datasetIDs []string, keywords ...string) ([]vs.Document, error) {
func (s SubqueryRetriever) Retrieve(ctx context.Context, store store.Store, query string, datasetIDs []string, where map[string]string, whereDocument []chromem.WhereDocument) ([]vs.Document, error) {

if len(datasetIDs) > 1 {
return nil, fmt.Errorf("basic retriever does not support querying multiple datasets")
Expand Down Expand Up @@ -84,7 +85,7 @@ func (s SubqueryRetriever) Retrieve(ctx context.Context, store store.Store, quer

var resultDocs []vs.Document
for _, q := range queries {
docs, err := store.SimilaritySearch(ctx, q, s.TopK, datasetID, keywords...)
docs, err := store.SimilaritySearch(ctx, q, s.TopK, datasetID, where, whereDocument)
if err != nil {
return nil, err
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/datastore/store/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ import (
"context"
"github.com/gptscript-ai/knowledge/pkg/index"
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore"
"github.com/philippgille/chromem-go"
)

type Store interface {
ListDatasets(ctx context.Context) ([]index.Dataset, error)
GetDataset(ctx context.Context, datasetID string) (*index.Dataset, error)
SimilaritySearch(ctx context.Context, query string, numDocuments int, collection string, keywords ...string) ([]vs.Document, error)
SimilaritySearch(ctx context.Context, query string, numDocuments int, collection string, where map[string]string, whereDocument []chromem.WhereDocument) ([]vs.Document, error)
}
6 changes: 4 additions & 2 deletions pkg/flows/flows.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"github.com/gptscript-ai/knowledge/pkg/datastore/store"
"github.com/philippgille/chromem-go"
"io"
"log/slog"
"slices"
Expand Down Expand Up @@ -115,7 +116,8 @@ func (f *RetrievalFlow) FillDefaults(topK int) {
}

type RetrievalFlowOpts struct {
Keywords []string
Where map[string]string
WhereDocument []chromem.WhereDocument
}

func (f *RetrievalFlow) Run(ctx context.Context, store store.Store, query string, datasetIDs []string, opts *RetrievalFlowOpts) (*dstypes.RetrievalResponse, error) {
Expand All @@ -140,7 +142,7 @@ func (f *RetrievalFlow) Run(ctx context.Context, store store.Store, query string
}
for _, q := range queries {

docs, err := f.Retriever.Retrieve(ctx, store, q, datasetIDs, opts.Keywords...)
docs, err := f.Retriever.Retrieve(ctx, store, q, datasetIDs, opts.Where, opts.WhereDocument)
if err != nil {
return nil, fmt.Errorf("failed to retrieve documents for query %q using retriever %q: %w", q, f.Retriever.Name(), err)
}
Expand Down
28 changes: 9 additions & 19 deletions pkg/vectorstore/chromem/chromem.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,16 @@ package chromem
import (
"context"
"fmt"
"github.com/google/uuid"
"github.com/gptscript-ai/knowledge/pkg/env"
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore"
"github.com/gptscript-ai/knowledge/pkg/vectorstore/errors"
"github.com/philippgille/chromem-go"
"log/slog"
"maps"
"os"
"path/filepath"
"strconv"
"strings"

"github.com/google/uuid"
"github.com/gptscript-ai/knowledge/pkg/env"
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore"
"github.com/philippgille/chromem-go"
)

// VsChromemEmbeddingParallelThread can be set as an environment variable to control the number of parallel API calls used to create embeddings for documents. The default is 100.
Expand Down Expand Up @@ -109,7 +107,7 @@ func convertStringMapToAnyMap(m map[string]string) map[string]any {
return convertedMap
}

func (s *Store) SimilaritySearch(ctx context.Context, query string, numDocuments int, collection string, keywords ...string) ([]vs.Document, error) {
func (s *Store) SimilaritySearch(ctx context.Context, query string, numDocuments int, collection string, where map[string]string, whereDocument []chromem.WhereDocument) ([]vs.Document, error) {
col := s.db.GetCollection(collection, s.embeddingFunc)
if col == nil {
return nil, fmt.Errorf("%w: %q", errors.ErrCollectionNotFound, collection)
Expand All @@ -124,7 +122,9 @@ func (s *Store) SimilaritySearch(ctx context.Context, query string, numDocuments
slog.Debug("Reduced number of documents to search for", "numDocuments", numDocuments)
}

qr, err := col.Query(ctx, query, numDocuments, nil, nil)
slog.Debug("filtering documents", "where", where, "whereDocument", whereDocument)

qr, err := col.Query(ctx, query, numDocuments, where, whereDocument)
if err != nil {
return nil, err
}
Expand All @@ -135,17 +135,7 @@ func (s *Store) SimilaritySearch(ctx context.Context, query string, numDocuments

var sDocs []vs.Document

slog.Debug("filtering documents by keywords", "keywords", keywords)

resultLoop:
for _, qrd := range qr {
for _, keyword := range keywords {
if !strings.Contains(qrd.Content, keyword) {
slog.Debug("Document does not contain keyword", "keyword", keyword, "documentID", qrd.ID)
continue resultLoop
}
}

sDocs = append(sDocs, vs.Document{
Metadata: convertStringMapToAnyMap(qrd.Metadata),
SimilarityScore: qrd.Similarity,
Expand All @@ -162,7 +152,7 @@ func (s *Store) RemoveCollection(_ context.Context, collection string) error {
return s.db.DeleteCollection(collection)
}

func (s *Store) RemoveDocument(ctx context.Context, documentID string, collection string, where, whereDocument map[string]string) error {
func (s *Store) RemoveDocument(ctx context.Context, documentID string, collection string, where map[string]string, whereDocument []chromem.WhereDocument) error {
col := s.db.GetCollection(collection, s.embeddingFunc)
if col == nil {
return fmt.Errorf("%w: %q", errors.ErrCollectionNotFound, collection)
Expand Down
7 changes: 4 additions & 3 deletions pkg/vectorstore/vectorstores.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@ package vectorstore

import (
"context"
"github.com/philippgille/chromem-go"
)

type VectorStore interface {
CreateCollection(ctx context.Context, collection string) error
AddDocuments(ctx context.Context, docs []Document, collection string) ([]string, error) // @return documentIDs, error
SimilaritySearch(ctx context.Context, query string, numDocuments int, collection string, keywords ...string) ([]Document, error) //nolint:lll
AddDocuments(ctx context.Context, docs []Document, collection string) ([]string, error) // @return documentIDs, error
SimilaritySearch(ctx context.Context, query string, numDocuments int, collection string, where map[string]string, whereDocument []chromem.WhereDocument) ([]Document, error) //nolint:lll
RemoveCollection(ctx context.Context, collection string) error
RemoveDocument(ctx context.Context, documentID string, collection string, where, whereDocument map[string]string) error
RemoveDocument(ctx context.Context, documentID string, collection string, where map[string]string, whereDocument []chromem.WhereDocument) error

ImportCollectionsFromFile(ctx context.Context, path string, collections ...string) error
ExportCollectionsToFile(ctx context.Context, path string, collections ...string) error
Expand Down

0 comments on commit 9ec5b6d

Please sign in to comment.