-
Notifications
You must be signed in to change notification settings - Fork 7
feat: add dataset tools and libraries #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
dd42ac7
005dacd
67cc0ae
d1d0f38
5246522
b0f36a8
9a64849
cf4a0a7
2143262
8df2136
d655ee2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| test.gpt | ||
| bin/ | ||
| .idea/ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| .PHONY: build | ||
| build: | ||
| CGO_ENABLED=0 go build -o bin/gptscript-go-tool -tags "${GO_TAGS}" -ldflags "-s -w" . |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| Name: dataset-context | ||
| Share Tools: * from ../tool.gpt | ||
|
|
||
| #!sys.echo | ||
|
|
||
| Some of the tools that you call might return a dataset. | ||
| A dataset is represented by a simple JSON string that contains a dataset ID. | ||
| It will look something like {"gptscript_dataset_id": "1234"}. | ||
| You can use the dataset ID to get data from the dataset using the dataset tools. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,16 @@ | ||
| module github.com/gptscript-ai/datasets | ||
|
|
||
| go 1.23.2 | ||
|
|
||
| require ( | ||
| github.com/stretchr/testify v1.9.0 | ||
| github.com/tidwall/gjson v1.18.0 | ||
| ) | ||
|
|
||
| require ( | ||
| github.com/davecgh/go-spew v1.1.1 // indirect | ||
| github.com/pmezard/go-difflib v1.0.0 // indirect | ||
| github.com/tidwall/match v1.1.1 // indirect | ||
| github.com/tidwall/pretty v1.2.0 // indirect | ||
| gopkg.in/yaml.v3 v3.0.1 // indirect | ||
| ) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,16 @@ | ||
| github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= | ||
| github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= | ||
| github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= | ||
| github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= | ||
| github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= | ||
| github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= | ||
| github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= | ||
| github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= | ||
| github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= | ||
| github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= | ||
| github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs= | ||
| github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= | ||
| gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= | ||
| gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= | ||
| gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= | ||
| gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,112 @@ | ||
| package main | ||
|
|
||
| import ( | ||
| "fmt" | ||
| "os" | ||
| "strconv" | ||
| "strings" | ||
|
|
||
| "github.com/gptscript-ai/datasets/pkg/dataset" | ||
| ) | ||
|
|
||
| func main() { | ||
| if len(os.Args) < 2 { | ||
| fmt.Println("missing argument") | ||
|
||
| os.Exit(1) | ||
| } | ||
|
|
||
| workspace := os.Getenv("GPTSCRIPT_WORKSPACE_DIR") | ||
| if workspace == "" { | ||
| fmt.Println("missing GPTSCRIPT_WORKSPACE_DIR") | ||
| os.Exit(1) | ||
| } | ||
|
|
||
| arg := os.Args[1] | ||
|
|
||
| var ( | ||
| result string | ||
| err error | ||
| ) | ||
| switch arg { | ||
| case "info": | ||
| result, err = info(os.Getenv("ID"), workspace) | ||
| case "load_one": | ||
| result, err = loadOne(os.Getenv("ID"), os.Getenv("INDEX"), workspace) | ||
| case "load_range": | ||
| result, err = loadRange(os.Getenv("ID"), os.Getenv("START"), os.Getenv("END"), workspace) | ||
| case "load_all": | ||
| result, err = loadAll(os.Getenv("ID"), workspace) | ||
| } | ||
|
|
||
g-linville marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| if err != nil { | ||
| fmt.Println(err) | ||
| os.Exit(1) | ||
| } | ||
|
|
||
| fmt.Println(result) | ||
| } | ||
|
|
||
| func info(id, workspace string) (string, error) { | ||
| set, err := dataset.ParseDataset(id, workspace) | ||
| if err != nil { | ||
| return "", err | ||
| } | ||
|
|
||
| return fmt.Sprintf("Dataset ID: %s, length: %d", set.GetID(), set.Length()), nil | ||
| } | ||
|
|
||
| func loadOne(id, index, workspace string) (string, error) { | ||
| set, err := dataset.ParseDataset(id, workspace) | ||
| if err != nil { | ||
| return "", err | ||
| } | ||
|
|
||
| indexInt, err := strconv.Atoi(index) | ||
| if err != nil { | ||
| return "", fmt.Errorf("invalid index: %v", err) | ||
| } | ||
|
|
||
| data, err := set.Nth(indexInt) | ||
| if err != nil { | ||
| return "", err | ||
| } | ||
thedadams marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| return data, nil | ||
| } | ||
|
|
||
| func loadRange(id, start, end, workspace string) (string, error) { | ||
| set, err := dataset.ParseDataset(id, workspace) | ||
| if err != nil { | ||
| return "", err | ||
| } | ||
|
|
||
| startInt, err := strconv.Atoi(start) | ||
| if err != nil { | ||
| return "", fmt.Errorf("invalid start: %v", err) | ||
| } | ||
| endInt, err := strconv.Atoi(end) | ||
| if err != nil { | ||
| return "", fmt.Errorf("invalid end: %v", err) | ||
| } | ||
|
|
||
| data, err := set.Range(startInt, endInt) | ||
| if err != nil { | ||
| return "", err | ||
| } | ||
|
|
||
| return strings.Join(data, "\n"), nil | ||
| } | ||
|
|
||
| func loadAll(id, workspace string) (string, error) { | ||
| set, err := dataset.ParseDataset(id, workspace) | ||
| if err != nil { | ||
| return "", err | ||
| } | ||
|
|
||
| data, err := set.Range(0, set.Length()-1) | ||
| if err != nil { | ||
| return "", err | ||
| } | ||
|
|
||
| return strings.Join(data, "\n"), nil | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,225 @@ | ||
| package dataset | ||
|
|
||
| import ( | ||
| "encoding/json" | ||
| "fmt" | ||
| "os" | ||
| "strings" | ||
| ) | ||
|
|
||
// Dataset is an indexable, read-only collection of string-encoded elements.
type Dataset interface {
	// GetID returns the identifier of the dataset.
	GetID() string
	// Type returns a short name for the kind of dataset.
	Type() string

	// Length returns the number of elements in the dataset.
	Length() int
	// Nth returns the element at the zero-based position index, or an error
	// if index is out of bounds.
	Nth(index int) (string, error)
	// Range returns the elements from first through last, both inclusive, or
	// an error if the range is inverted or out of bounds.
	Range(first, last int) ([]string, error)
}
|
|
||
// ArrayDataset represents an array of generic JSON data.
// Each element is rendered as its JSON encoding when read.
type ArrayDataset struct {
	ID   string
	Data []any
}

// GetID returns the dataset's identifier.
func (d *ArrayDataset) GetID() string {
	return d.ID
}

// Type returns the constant "array".
func (d *ArrayDataset) Type() string {
	return "array"
}

// Length returns the number of elements in Data.
func (d *ArrayDataset) Length() int {
	return len(d.Data)
}

// Nth returns the JSON encoding of the element at zero-based index i.
// It fails if i is out of bounds or the element cannot be marshalled; the
// marshalling error is wrapped so callers can inspect it with errors.Is/As.
func (d *ArrayDataset) Nth(i int) (string, error) {
	if i < 0 || i >= len(d.Data) {
		return "", fmt.Errorf("index %d out of bounds for dataset %s", i, d.ID)
	}

	datum, err := json.Marshal(d.Data[i])
	if err != nil {
		return "", fmt.Errorf("error marshalling data at index %d in dataset %s: %w", i, d.ID, err)
	}

	return string(datum), nil
}

// Range returns the JSON encodings of elements i through j, both inclusive.
// It fails if i > j, if either bound is outside the data, or if any element
// in the range cannot be marshalled.
func (d *ArrayDataset) Range(i, j int) ([]string, error) {
	if i > j {
		return nil, fmt.Errorf("invalid range %d - %d for dataset %s", i, j, d.ID)
	}

	if i < 0 || j >= len(d.Data) {
		return nil, fmt.Errorf("range %d - %d out of bounds for dataset %s", i, j, d.ID)
	}

	// The result size is known up front (inclusive range), so allocate once.
	data := make([]string, 0, j-i+1)
	for k := i; k <= j; k++ {
		datum, err := d.Nth(k)
		if err != nil {
			return nil, err
		}
		data = append(data, datum)
	}

	return data, nil
}
|
|
||
| // FileDataset represents a single file in the workspace. | ||
| // This dataset supports three different iteration strategies: | ||
| // - LineMethod: each line in the file is a separate piece of data | ||
| // - SplitMethod: the file is split by a delimiter, specified in a metadata file | ||
| // - WholeMethod: the entire file is a single piece of data | ||
| type FileDataset struct { | ||
| Method IterationMethod | ||
| ID, Splitter string | ||
| Contents []byte | ||
| } | ||
|
|
||
| func (d *FileDataset) GetID() string { | ||
| return d.ID | ||
| } | ||
|
|
||
| func (d *FileDataset) Type() string { | ||
| return "file" | ||
| } | ||
|
|
||
| func (d *FileDataset) Length() int { | ||
| fileStr := string(d.Contents) | ||
| switch d.Method { | ||
| case LineMethod: | ||
| return len(strings.Split(fileStr, "\n")) | ||
| case SplitMethod: | ||
| return len(strings.Split(fileStr, d.Splitter)) | ||
| case WholeMethod: | ||
| return 1 | ||
| } | ||
| return 0 | ||
| } | ||
|
|
||
| func (d *FileDataset) Nth(i int) (string, error) { | ||
| fileStr := string(d.Contents) | ||
| switch d.Method { | ||
| case LineMethod: | ||
| lines := strings.Split(fileStr, "\n") | ||
| if i < 0 || i >= len(lines) { | ||
| return "", fmt.Errorf("index %d out of bounds for dataset %s", i, d.ID) | ||
| } | ||
| return lines[i], nil | ||
| case SplitMethod: | ||
| parts := strings.Split(fileStr, d.Splitter) | ||
| if i < 0 || i >= len(parts) { | ||
| return "", fmt.Errorf("index %d out of bounds for dataset %s", i, d.ID) | ||
| } | ||
| return parts[i], nil | ||
| case WholeMethod: | ||
| if i > 0 { | ||
| return "", fmt.Errorf("index %d out of bounds for dataset %s", i, d.ID) | ||
| } | ||
| return fileStr, nil | ||
| } | ||
| return "", fmt.Errorf("unknown iteration strategy %s for dataset %s", d.Method, d.ID) | ||
| } | ||
|
|
||
| func (d *FileDataset) Range(i, j int) ([]string, error) { | ||
| if i > j { | ||
| return nil, fmt.Errorf("invalid range %d - %d for dataset %s", i, j, d.ID) | ||
| } | ||
|
|
||
| fileStr := string(d.Contents) | ||
| switch d.Method { | ||
| case LineMethod: | ||
| lines := strings.Split(fileStr, "\n") | ||
| if i < 0 || j >= len(lines) { | ||
| return nil, fmt.Errorf("range %d - %d out of bounds for dataset %s", i, j, d.ID) | ||
| } | ||
| return lines[i : j+1], nil | ||
| case SplitMethod: | ||
| parts := strings.Split(fileStr, d.Splitter) | ||
| if i < 0 || j >= len(parts) { | ||
| return nil, fmt.Errorf("range %d - %d out of bounds for dataset %s", i, j, d.ID) | ||
| } | ||
| return parts[i : j+1], nil | ||
| case WholeMethod: | ||
| if i > 0 || j > 1 { | ||
| return nil, fmt.Errorf("range %d - %d out of bounds for dataset %s", i, j, d.ID) | ||
| } | ||
| return []string{fileStr}, nil | ||
| } | ||
| return nil, fmt.Errorf("unknown iteration strategy %s for dataset %s", d.Method, d.ID) | ||
| } | ||
|
|
||
// FolderDataset represents a folder in the workspace, where each file is a single piece of data.
// Files holds the paths that are read on demand by Nth and Range.
type FolderDataset struct {
	ID    string
	Files []string
}

// GetID returns the dataset's identifier.
func (d *FolderDataset) GetID() string {
	return d.ID
}

// Type returns the constant "folder".
func (d *FolderDataset) Type() string {
	return "folder"
}

// Length returns the number of files in the dataset.
func (d *FolderDataset) Length() int {
	return len(d.Files)
}

// Nth returns the contents of the file at zero-based index i. It fails if i
// is out of bounds, the file cannot be read, or the file exceeds the size
// limit.
func (d *FolderDataset) Nth(i int) (string, error) {
	data, _, err := d.nthWithCurrentSize(i, 0)
	return data, err
}

// nthWithCurrentSize reads the file at index i, refusing to do so when its
// size added to currentSize (the bytes already read by the caller) would
// exceed the limit. On success it returns the file contents and the updated
// running total. Underlying filesystem errors are wrapped so callers can
// inspect them with errors.Is/As.
func (d *FolderDataset) nthWithCurrentSize(i int, currentSize int64) (string, int64, error) {
	// maxTotalBytes caps the combined size of the files read in one call chain.
	const maxTotalBytes = 100 * 1024 * 1024 // 100 MiB

	if i < 0 || i >= len(d.Files) {
		return "", 0, fmt.Errorf("index %d out of bounds for dataset %s", i, d.ID)
	}

	fileName := d.Files[i]
	fileStat, err := os.Stat(fileName)
	if err != nil {
		return "", 0, fmt.Errorf("error getting info for file %s: %w", fileName, err)
	}

	if fileStat.Size()+currentSize > maxTotalBytes {
		return "", 0, fmt.Errorf("dataset %s is too large to read (combined file size must be under 100 MiB)", d.ID)
	}
	currentSize += fileStat.Size()

	contents, err := os.ReadFile(fileName)
	if err != nil {
		return "", 0, fmt.Errorf("error reading file %s: %w", fileName, err)
	}

	return string(contents), currentSize, nil
}

// Range returns the contents of files i through j, both inclusive. It fails
// if i > j, if either bound is out of range, if any file cannot be read, or
// if the combined size of the files read so far exceeds the size limit.
func (d *FolderDataset) Range(i, j int) ([]string, error) {
	if i > j {
		return nil, fmt.Errorf("invalid range %d - %d for dataset %s", i, j, d.ID)
	}

	if i < 0 || j >= len(d.Files) {
		return nil, fmt.Errorf("range %d - %d out of bounds for dataset %s", i, j, d.ID)
	}

	var (
		data     = make([]string, 0, j-i+1) // inclusive range, size known up front
		contents string
		size     int64
		err      error
	)
	for k := i; k <= j; k++ {
		// size carries the running byte total so the limit applies to the
		// whole range rather than to each file separately.
		contents, size, err = d.nthWithCurrentSize(k, size)
		if err != nil {
			return nil, err
		}
		data = append(data, contents)
	}

	return data, nil
}
Uh oh!
There was an error while loading. Please reload this page.