diff --git a/pkg/client/client.go b/pkg/client/client.go index 2951ac14..2e02ccd8 100644 --- a/pkg/client/client.go +++ b/pkg/client/client.go @@ -12,6 +12,7 @@ type IngestPathsOpts struct { IgnoreExtensions []string Concurrency int Recursive bool + TextSplitterOpts *datastore.TextSplitterOpts } type RetrieveOpts struct { diff --git a/pkg/client/default.go b/pkg/client/default.go index 9d6de418..a137eb15 100644 --- a/pkg/client/default.go +++ b/pkg/client/default.go @@ -143,6 +143,9 @@ func (c *DefaultClient) IngestPaths(ctx context.Context, datasetID string, opts }, IsDuplicateFuncName: "file_metadata", } + if opts != nil { + payload.TextSplitterOpts = opts.TextSplitterOpts + } _, err = c.Ingest(ctx, datasetID, content, payload) return err } diff --git a/pkg/client/standalone.go b/pkg/client/standalone.go index 1a72ba16..7049e24e 100644 --- a/pkg/client/standalone.go +++ b/pkg/client/standalone.go @@ -23,15 +23,19 @@ func NewStandaloneClient(ds *datastore.Datastore) (*StandaloneClient, error) { } func (c *StandaloneClient) CreateDataset(ctx context.Context, datasetID string) (types.Dataset, error) { - ds := types.Dataset{ + ds := index.Dataset{ ID: datasetID, - EmbedDimension: nil, + EmbedDimension: 0, + } + r := types.Dataset{ + ID: ds.ID, + EmbedDimension: z.Pointer(ds.EmbedDimension), } err := c.Datastore.NewDataset(ctx, ds) if err != nil { - return ds, err + return r, err } - return ds, nil + return r, nil } func (c *StandaloneClient) DeleteDataset(ctx context.Context, datasetID string) error { @@ -43,7 +47,18 @@ func (c *StandaloneClient) GetDataset(ctx context.Context, datasetID string) (*i } func (c *StandaloneClient) ListDatasets(ctx context.Context) ([]types.Dataset, error) { - return c.Datastore.ListDatasets(ctx) + ds, err := c.Datastore.ListDatasets(ctx) + if err != nil { + return nil, err + } + r := make([]types.Dataset, len(ds)) + for i, d := range ds { + r[i] = types.Dataset{ + ID: d.ID, + EmbedDimension: z.Pointer(d.EmbedDimension), + } + } + return r, nil } func (c *StandaloneClient) Ingest(ctx context.Context, datasetID string, data []byte, opts datastore.IngestOpts) ([]string, error) { @@ -67,7 +82,8 @@ func (c *StandaloneClient) IngestPaths(ctx context.Context, datasetID string, op if err != nil { return fmt.Errorf("failed to open file %s: %w", path, err) } - _, err = c.Datastore.Ingest(ctx, datasetID, file, datastore.IngestOpts{ + + iopts := datastore.IngestOpts{ Filename: z.Pointer(filepath.Base(path)), FileMetadata: &index.FileMetadata{ Name: filepath.Base(path), @@ -76,7 +92,13 @@ func (c *StandaloneClient) IngestPaths(ctx context.Context, datasetID string, op ModifiedAt: finfo.ModTime(), }, IsDuplicateFunc: datastore.DedupeByFileMetadata, - }) + } + + if opts != nil { + iopts.TextSplitterOpts = opts.TextSplitterOpts + } + + _, err = c.Datastore.Ingest(ctx, datasetID, file, iopts) return err } @@ -94,7 +116,7 @@ func (c *StandaloneClient) DeleteDocuments(ctx context.Context, datasetID string } func (c *StandaloneClient) Retrieve(ctx context.Context, datasetID string, query string, opts RetrieveOpts) ([]vectorstore.Document, error) { - return c.Datastore.Retrieve(ctx, datasetID, types.Query{Prompt: query, TopK: z.Pointer(opts.TopK)}) + return c.Datastore.Retrieve(ctx, datasetID, query, opts.TopK) } func (c *StandaloneClient) AskDirectory(ctx context.Context, path string, query string, opts *IngestPathsOpts, ropts *RetrieveOpts) ([]vectorstore.Document, error) { diff --git a/pkg/cmd/client.go b/pkg/cmd/client.go index f1986d25..b576a7ad 100644 --- a/pkg/cmd/client.go +++ b/pkg/cmd/client.go @@ -2,15 +2,15 @@ package cmd import ( "github.com/gptscript-ai/knowledge/pkg/client" + "github.com/gptscript-ai/knowledge/pkg/config" "github.com/gptscript-ai/knowledge/pkg/datastore" - "github.com/gptscript-ai/knowledge/pkg/types" ) type Client struct { Server string `usage:"URL of the Knowledge API Server" default:"" env:"KNOW_SERVER_URL"` - types.OpenAIConfig - types.DatabaseConfig - types.VectorDBConfig + config.OpenAIConfig + config.DatabaseConfig + config.VectorDBConfig } func (s *Client) getClient() (client.Client, error) { diff --git a/pkg/cmd/ingest.go b/pkg/cmd/ingest.go index 76529b9e..aeaee58c 100644 --- a/pkg/cmd/ingest.go +++ b/pkg/cmd/ingest.go @@ -3,6 +3,7 @@ package cmd import ( "fmt" "github.com/gptscript-ai/knowledge/pkg/client" + "github.com/gptscript-ai/knowledge/pkg/datastore" "github.com/spf13/cobra" "strings" ) @@ -11,6 +12,7 @@ type ClientIngest struct { Client Dataset string `usage:"Target Dataset ID" short:"d" default:"default" env:"KNOW_TARGET_DATASET"` ClientIngestOpts + datastore.TextSplitterOpts } type ClientIngestOpts struct { @@ -38,6 +40,7 @@ func (s *ClientIngest) Run(cmd *cobra.Command, args []string) error { IgnoreExtensions: strings.Split(s.IgnoreExtensions, ","), Concurrency: s.Concurrency, Recursive: s.Recursive, + TextSplitterOpts: &s.TextSplitterOpts, } filesIngested, err := c.IngestPaths(cmd.Context(), datasetID, ingestOpts, filePath) diff --git a/pkg/cmd/server.go b/pkg/cmd/server.go index 26016350..b068b59c 100644 --- a/pkg/cmd/server.go +++ b/pkg/cmd/server.go @@ -2,9 +2,9 @@ package cmd import ( "fmt" + "github.com/gptscript-ai/knowledge/pkg/config" "github.com/gptscript-ai/knowledge/pkg/datastore" "github.com/gptscript-ai/knowledge/pkg/server" - "github.com/gptscript-ai/knowledge/pkg/types" "github.com/spf13/cobra" "os/signal" "syscall" @@ -16,9 +16,9 @@ type Server struct { ServerPort string `usage:"Server port" default:"8000" env:"KNOW_SERVER_PORT"` ServerAPIBase string `usage:"Server API base" default:"/v1" env:"KNOW_SERVER_API_BASE"` - types.OpenAIConfig - types.DatabaseConfig - types.VectorDBConfig + config.OpenAIConfig + config.DatabaseConfig + config.VectorDBConfig } func (s *Server) Run(cmd *cobra.Command, _ []string) error { diff --git a/pkg/types/config.go b/pkg/config/config.go similarity index 98% rename from pkg/types/config.go rename to pkg/config/config.go index bc81f5af..0e6d840d 100644 --- a/pkg/types/config.go +++ b/pkg/config/config.go @@ -1,4 +1,4 @@ -package types +package config type OpenAIConfig struct { APIBase string `usage:"OpenAI API base" default:"https://api.openai.com/v1" env:"OPENAI_BASE_URL"` // clicky-chats diff --git a/pkg/datastore/dataset.go b/pkg/datastore/dataset.go index 89f231a8..76ce9b9f 100644 --- a/pkg/datastore/dataset.go +++ b/pkg/datastore/dataset.go @@ -4,18 +4,16 @@ import ( "context" "errors" "fmt" - "github.com/acorn-io/z" "github.com/gptscript-ai/knowledge/pkg/index" - "github.com/gptscript-ai/knowledge/pkg/types" "github.com/gptscript-ai/knowledge/pkg/types/defaults" "gorm.io/gorm" "log/slog" ) -func (s *Datastore) NewDataset(ctx context.Context, dataset types.Dataset) error { +func (s *Datastore) NewDataset(ctx context.Context, dataset index.Dataset) error { // Set defaults - if dataset.EmbedDimension == nil || *dataset.EmbedDimension <= 0 { - dataset.EmbedDimension = z.Pointer(defaults.EmbeddingDimension) + if dataset.EmbedDimension <= 0 { + dataset.EmbedDimension = defaults.EmbeddingDimension } // Create dataset @@ -36,7 +34,7 @@ func (s *Datastore) NewDataset(ctx context.Context, dataset types.Dataset) error func (s *Datastore) DeleteDataset(ctx context.Context, datasetID string) error { // Delete dataset slog.Info("Deleting dataset", "id", datasetID) - tx := s.Index.WithContext(ctx).Delete(&types.Dataset{}, "id = ?", datasetID) + tx := s.Index.WithContext(ctx).Delete(&index.Dataset{}, "id = ?", datasetID) if tx.Error != nil { return tx.Error } @@ -63,13 +61,13 @@ func (s *Datastore) GetDataset(ctx context.Context, datasetID string) (*index.Da return dataset, nil } -func (s *Datastore) ListDatasets(ctx context.Context) ([]types.Dataset, error) { - tx := s.Index.WithContext(ctx).Find(&[]types.Dataset{}) +func (s *Datastore) ListDatasets(ctx context.Context) ([]index.Dataset, error) { + tx := s.Index.WithContext(ctx).Find(&[]index.Dataset{}) if tx.Error != nil { return nil, tx.Error } - var datasets []types.Dataset + var datasets []index.Dataset if err := tx.Scan(&datasets).Error; err != nil { return nil, err } diff --git a/pkg/datastore/datastore.go b/pkg/datastore/datastore.go index a3d6359a..55e901f4 100644 --- a/pkg/datastore/datastore.go +++ b/pkg/datastore/datastore.go @@ -5,8 +5,8 @@ import ( "fmt" "github.com/acorn-io/z" "github.com/adrg/xdg" + "github.com/gptscript-ai/knowledge/pkg/config" "github.com/gptscript-ai/knowledge/pkg/index" - "github.com/gptscript-ai/knowledge/pkg/types" "github.com/gptscript-ai/knowledge/pkg/vectorstore" "github.com/gptscript-ai/knowledge/pkg/vectorstore/chromem" cg "github.com/philippgille/chromem-go" @@ -41,7 +41,7 @@ func GetDatastorePaths(dsn, vectordbPath string) (string, string, error) { return dsn, vectordbPath, nil } -func NewDatastore(dsn string, automigrate bool, vectorDBPath string, openAIConfig types.OpenAIConfig) (*Datastore, error) { +func NewDatastore(dsn string, automigrate bool, vectorDBPath string, openAIConfig config.OpenAIConfig) (*Datastore, error) { dsn, vectorDBPath, err := GetDatastorePaths(dsn, vectorDBPath) if err != nil { return nil, fmt.Errorf("failed to determine datastore paths: %w", err) @@ -80,7 +80,7 @@ func NewDatastore(dsn string, automigrate bool, vectorDBPath string, openAIConfi } if defaultDS == nil { - err = ds.NewDataset(context.Background(), types.Dataset{ID: "default", EmbedDimension: nil}) + err = ds.NewDataset(context.Background(), index.Dataset{ID: "default"}) if err != nil { return nil, fmt.Errorf("failed to create default dataset: %w", err) } diff --git a/pkg/datastore/ingest.go b/pkg/datastore/ingest.go index e9c2482a..b44e6ca7 100644 --- a/pkg/datastore/ingest.go +++ b/pkg/datastore/ingest.go @@ -48,6 +48,7 @@ type IngestOpts struct { FileMetadata *index.FileMetadata IsDuplicateFuncName string IsDuplicateFunc IsDuplicateFunc + TextSplitterOpts *TextSplitterOpts } // Ingest loads a document from a reader and adds it to the dataset. @@ -118,7 +119,7 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, content []byte return nil, nil } - docs, err := GetDocuments(ctx, *opts.Filename, filetype, reader) + docs, err := GetDocuments(ctx, *opts.Filename, filetype, reader, opts.TextSplitterOpts) if err != nil { slog.Error("Failed to load documents", "error", err) return nil, fmt.Errorf("failed to load documents: %w", err) @@ -187,7 +188,12 @@ func mimetypeFromReader(reader io.Reader) (string, io.Reader, error) { return mtype.String(), newReader, err } -func GetDocuments(ctx context.Context, filename, filetype string, reader io.Reader) ([]vs.Document, error) { +func GetDocuments(ctx context.Context, filename, filetype string, reader io.Reader, textSplitterOpts *TextSplitterOpts) ([]vs.Document, error) { + if textSplitterOpts == nil { + textSplitterOpts = z.Pointer(NewTextSplitterOpts()) + } + lcgoTextSplitter := NewLcgoTextSplitter(*textSplitterOpts) + /* * Load documents from the content * For now, we're using documentloaders from both langchaingo and golc @@ -227,13 +233,13 @@ func GetDocuments(ctx context.Context, filename, filetype string, reader io.Read Metadata: rdoc.Metadata, } } - lcgodocs, err = lcgosplitter.SplitDocuments(defaultLcgoSplitter, lcgodocs) + lcgodocs, err = lcgosplitter.SplitDocuments(lcgoTextSplitter, lcgodocs) case ".html", "text/html": - lcgodocs, err = lcgodocloaders.NewHTML(reader).LoadAndSplit(ctx, defaultLcgoSplitter) + lcgodocs, err = lcgodocloaders.NewHTML(reader).LoadAndSplit(ctx, lcgoTextSplitter) case ".md", "text/markdown": - lcgodocs, err = lcgodocloaders.NewText(reader).LoadAndSplit(ctx, defaultLcgoSplitter) + lcgodocs, err = lcgodocloaders.NewText(reader).LoadAndSplit(ctx, lcgoTextSplitter) case ".txt", "text/plain": - lcgodocs, err = lcgodocloaders.NewText(reader).LoadAndSplit(ctx, defaultLcgoSplitter) + lcgodocs, err = lcgodocloaders.NewText(reader).LoadAndSplit(ctx, lcgoTextSplitter) case ".csv", "text/csv": golcdocs, err = golcdocloaders.NewCSV(reader).Load(ctx) if err != nil && errors.Is(err, csv.ErrBareQuote) { @@ -248,7 +254,7 @@ func GetDocuments(ctx context.Context, filename, filetype string, reader io.Read } } case ".json", "application/json": - lcgodocs, err = lcgodocloaders.NewText(reader).LoadAndSplit(ctx, defaultLcgoSplitter) + lcgodocs, err = lcgodocloaders.NewText(reader).LoadAndSplit(ctx, lcgoTextSplitter) case ".ipynb": golcdocs, err = golcdocloaders.NewNotebook(reader).Load(ctx) case ".docx", ".odt", ".rtf", "application/vnd.oasis.opendocument.text", "text/rtf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document": @@ -260,7 +266,7 @@ func GetDocuments(ctx context.Context, filename, filetype string, reader io.Read if nerr != nil { return nil, fmt.Errorf("failed to extract text from %s: %w", filetype, nerr) } - lcgodocs, err = lcgodocloaders.NewText(strings.NewReader(text)).LoadAndSplit(ctx, defaultLcgoSplitter) + lcgodocs, err = lcgodocloaders.NewText(strings.NewReader(text)).LoadAndSplit(ctx, lcgoTextSplitter) default: // TODO(@iwilltry42): Fallback to plaintext reader? Example: Makefile, Dockerfile, Source Files, etc. slog.Error("Unsupported file type", "filename", filename, "type", filetype) diff --git a/pkg/datastore/retrieve.go b/pkg/datastore/retrieve.go index 177ef166..ccb35c03 100644 --- a/pkg/datastore/retrieve.go +++ b/pkg/datastore/retrieve.go @@ -2,20 +2,18 @@ package datastore import ( "context" - "github.com/acorn-io/z" - "github.com/gptscript-ai/knowledge/pkg/types" "github.com/gptscript-ai/knowledge/pkg/types/defaults" "github.com/gptscript-ai/knowledge/pkg/vectorstore" "log/slog" ) -func (s *Datastore) Retrieve(ctx context.Context, datasetID string, query types.Query) ([]vectorstore.Document, error) { - if query.TopK == nil { - query.TopK = z.Pointer(defaults.TopK) +func (s *Datastore) Retrieve(ctx context.Context, datasetID string, query string, topk int) ([]vectorstore.Document, error) { + if topk <= 0 { + topk = defaults.TopK } slog.Debug("Retrieving content from dataset", "dataset", datasetID, "query", query) - docs, err := s.Vectorstore.SimilaritySearch(ctx, query.Prompt, *query.TopK, datasetID) + docs, err := s.Vectorstore.SimilaritySearch(ctx, query, topk, datasetID) if err != nil { return nil, err } diff --git a/pkg/datastore/textsplitter.go b/pkg/datastore/textsplitter.go index 12fc1b20..e78ad105 100644 --- a/pkg/datastore/textsplitter.go +++ b/pkg/datastore/textsplitter.go @@ -2,6 +2,29 @@ package datastore import lcgosplitter "github.com/tmc/langchaingo/textsplitter" -var ( - defaultLcgoSplitter = lcgosplitter.NewTokenSplitter(lcgosplitter.WithChunkSize(defaultChunkSize), lcgosplitter.WithChunkOverlap(defaultChunkOverlap), lcgosplitter.WithModelName(defaultTokenModel), lcgosplitter.WithEncodingName(defaultTokenEncoding)) -) +type TextSplitterOpts struct { + ChunkSize int `usage:"Textsplitter Chunk Size" default:"1024" env:"KNOW_TEXTSPLITTER_CHUNK_SIZE"` + ChunkOverlap int `usage:"Textsplitter Chunk Overlap" default:"256" env:"KNOW_TEXTSPLITTER_CHUNK_OVERLAP"` + ModelName string `usage:"Textsplitter Model Name" default:"gpt-4" env:"KNOW_TEXTSPLITTER_MODEL_NAME"` + EncodingName string `usage:"Textsplitter Encoding Name" default:"cl100k_base" env:"KNOW_TEXTSPLITTER_ENCODING_NAME"` +} + +// NewTextSplitterOpts returns the default options for a text splitter. +func NewTextSplitterOpts() TextSplitterOpts { + return TextSplitterOpts{ + ChunkSize: defaultChunkSize, + ChunkOverlap: defaultChunkOverlap, + ModelName: defaultTokenModel, + EncodingName: defaultTokenEncoding, + } +} + +// NewLcgoTextSplitter returns a new langchain-go text splitter. +func NewLcgoTextSplitter(opts TextSplitterOpts) lcgosplitter.TokenSplitter { + return lcgosplitter.NewTokenSplitter( + lcgosplitter.WithChunkSize(opts.ChunkSize), + lcgosplitter.WithChunkOverlap(opts.ChunkOverlap), + lcgosplitter.WithModelName(opts.ModelName), + lcgosplitter.WithEncodingName(opts.EncodingName), + ) +} diff --git a/pkg/server/routes.go b/pkg/server/routes.go index fe385261..53924f81 100644 --- a/pkg/server/routes.go +++ b/pkg/server/routes.go @@ -4,8 +4,10 @@ import ( "encoding/base64" "errors" "fmt" + "github.com/acorn-io/z" "github.com/gin-gonic/gin" "github.com/gptscript-ai/knowledge/pkg/datastore" + "github.com/gptscript-ai/knowledge/pkg/index" "github.com/gptscript-ai/knowledge/pkg/types" "log/slog" "net/http" @@ -28,7 +30,7 @@ func (s *Server) CreateDS(c *gin.Context) { } // Create Dataset - if err := s.NewDataset(c, dataset); err != nil { + if err := s.NewDataset(c, index.Dataset{ID: dataset.ID, EmbedDimension: z.Dereference(dataset.EmbedDimension)}); err != nil { slog.Error("Failed to create dataset", "error", err) c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return @@ -77,7 +79,7 @@ func (s *Server) RetrieveFromDS(c *gin.Context) { return } - docs, err := s.Retrieve(c, id, query) + docs, err := s.Retrieve(c, id, query.Prompt, z.Dereference(query.TopK)) if err != nil { slog.Error("Failed to retrieve documents", "error", err) c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) @@ -119,8 +121,9 @@ func (s *Server) IngestIntoDS(c *gin.Context) { // ingest content docIDs, err := s.Ingest(c, id, data, datastore.IngestOpts{ - Filename: ingest.Filename, - FileMetadata: ingest.FileMetadata, + Filename: ingest.Filename, + FileMetadata: ingest.FileMetadata, + TextSplitterOpts: ingest.TextSplitterOpts, }) if err != nil { @@ -206,7 +209,15 @@ func (s *Server) ListDS(c *gin.Context) { return } - c.JSON(http.StatusOK, datasets) + datasetsResponse := make([]types.Dataset, len(datasets)) + for i, dataset := range datasets { + datasetsResponse[i] = types.Dataset{ + ID: dataset.ID, + EmbedDimension: z.Pointer(dataset.EmbedDimension), + } + } + + c.JSON(http.StatusOK, datasetsResponse) } // GetDS gets a dataset by ID. diff --git a/pkg/server/server.go b/pkg/server/server.go index 2ce66b59..882400fb 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -3,9 +3,9 @@ package server import ( "context" "github.com/gin-gonic/gin" + "github.com/gptscript-ai/knowledge/pkg/config" "github.com/gptscript-ai/knowledge/pkg/datastore" "github.com/gptscript-ai/knowledge/pkg/docs" - "github.com/gptscript-ai/knowledge/pkg/types" swaggerFiles "github.com/swaggo/files" ginSwagger "github.com/swaggo/gin-swagger" "log/slog" @@ -18,10 +18,10 @@ type Config struct { type Server struct { *datastore.Datastore - openAIConfig types.OpenAIConfig + openAIConfig config.OpenAIConfig } -func NewServer(d *datastore.Datastore, oaiconfig types.OpenAIConfig) *Server { +func NewServer(d *datastore.Datastore, oaiconfig config.OpenAIConfig) *Server { return &Server{Datastore: d, openAIConfig: oaiconfig} } diff --git a/pkg/types/types.go b/pkg/types/types.go index 7224a002..e4e64fbc 100644 --- a/pkg/types/types.go +++ b/pkg/types/types.go @@ -1,6 +1,9 @@ package types -import "github.com/gptscript-ai/knowledge/pkg/index" +import ( + "github.com/gptscript-ai/knowledge/pkg/datastore" + "github.com/gptscript-ai/knowledge/pkg/index" +) // Dataset represents a new knowledge vector space type Dataset struct { @@ -17,9 +20,10 @@ type Query struct { // Ingest represents incoming content that should be ingested type Ingest struct { - Filename *string `json:"filename" ` - Content string `json:"content" binding:"required,base64"` - FileMetadata *index.FileMetadata `json:"metadata"` + Filename *string `json:"filename" ` + Content string `json:"content" binding:"required,base64"` + FileMetadata *index.FileMetadata `json:"metadata"` + TextSplitterOpts *datastore.TextSplitterOpts `json:"text_splitter_opts"` } type IngestResponse struct {