Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
test.gpt
bin/
.idea/
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# build compiles the package in the current directory into bin/gptscript-go-tool.
# CGO_ENABLED=0 disables cgo for the build, GO_TAGS is forwarded as the build
# tag list, and the "-s -w" linker flags omit the symbol table and DWARF debug
# information from the resulting binary.
.PHONY: build
build:
CGO_ENABLED=0 go build -o bin/gptscript-go-tool -tags "${GO_TAGS}" -ldflags "-s -w" .
9 changes: 9 additions & 0 deletions context/tool.gpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Name: dataset-context
Share Tools: * from ../tool.gpt

#!sys.echo

Some of the tools that you call might return a dataset.
A dataset is represented by a simple JSON string that contains a dataset ID.
It will look something like {"gptscript_dataset_id": "1234"}.
You can use the dataset ID to get data from the dataset using the dataset tools.
16 changes: 16 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
module github.com/gptscript-ai/datasets

go 1.23.2

require (
github.com/stretchr/testify v1.9.0
github.com/tidwall/gjson v1.18.0
)

require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
16 changes: 16 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs=
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
112 changes: 112 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
package main

import (
"fmt"
"os"
"strconv"
"strings"

"github.com/gptscript-ai/datasets/pkg/dataset"
)

// main is the command-line entry point. The first argument selects the action
// (info, load_one, load_range or load_all); the action's inputs are read from
// the ID, INDEX, START and END environment variables, and the workspace
// directory from GPTSCRIPT_WORKSPACE_DIR. The result is printed to stdout.
// Failures are also printed to stdout (via fmt.Println) before exiting with
// status 1.
func main() {
// os.Args[0] is the program name, so fewer than two entries means no action was given.
if len(os.Args) < 2 {
fmt.Println("missing argument")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add what we expect the argument to be? I see the switch below, but maybe something like missing action as first argument would be more descriptive so the LLM can recover?

Copy link
Member Author

@g-linville g-linville Oct 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Darren had some specific instructions about what to do for this, so I just updated it to be the way he wants.

os.Exit(1)
}

// Every action needs the workspace directory, so it is validated up front.
workspace := os.Getenv("GPTSCRIPT_WORKSPACE_DIR")
if workspace == "" {
fmt.Println("missing GPTSCRIPT_WORKSPACE_DIR")
os.Exit(1)
}

arg := os.Args[1]

var (
result string
err error
)
// NOTE(review): the switch has no default case, so an unrecognized action
// leaves result empty and err nil; the program then prints an empty line and
// exits with status 0.
switch arg {
case "info":
result, err = info(os.Getenv("ID"), workspace)
case "load_one":
result, err = loadOne(os.Getenv("ID"), os.Getenv("INDEX"), workspace)
case "load_range":
result, err = loadRange(os.Getenv("ID"), os.Getenv("START"), os.Getenv("END"), workspace)
case "load_all":
result, err = loadAll(os.Getenv("ID"), workspace)
}

if err != nil {
fmt.Println(err)
os.Exit(1)
}

fmt.Println(result)
}

// info hands id and workspace to dataset.ParseDataset and, on success,
// returns a one-line summary containing the dataset's ID and its length.
// Any parse error is returned unchanged.
func info(id, workspace string) (string, error) {
	ds, parseErr := dataset.ParseDataset(id, workspace)
	if parseErr != nil {
		return "", parseErr
	}

	summary := fmt.Sprintf("Dataset ID: %s, length: %d", ds.GetID(), ds.Length())
	return summary, nil
}

// loadOne returns the single element at position index of the dataset
// identified by id.
//
// The dataset is parsed first, so a parse failure is reported before a
// malformed index. index must be a base-10 integer; otherwise an
// "invalid index" error is returned. Errors from the dataset's Nth method
// are passed through unchanged.
func loadOne(id, index, workspace string) (string, error) {
	ds, parseErr := dataset.ParseDataset(id, workspace)
	if parseErr != nil {
		return "", parseErr
	}

	position, convErr := strconv.Atoi(index)
	if convErr != nil {
		return "", fmt.Errorf("invalid index: %v", convErr)
	}

	element, nthErr := ds.Nth(position)
	if nthErr != nil {
		return "", nthErr
	}
	return element, nil
}

// loadRange returns the elements of the dataset identified by id between the
// positions start and end, joined with newlines. Both bounds are handed to the
// dataset's Range method as given.
//
// The dataset is parsed first, then start, then end; the first failure wins.
// start and end must be base-10 integers, otherwise an "invalid start" or
// "invalid end" error is returned. Errors from Range are passed through.
func loadRange(id, start, end, workspace string) (string, error) {
	ds, parseErr := dataset.ParseDataset(id, workspace)
	if parseErr != nil {
		return "", parseErr
	}

	from, fromErr := strconv.Atoi(start)
	if fromErr != nil {
		return "", fmt.Errorf("invalid start: %v", fromErr)
	}

	to, toErr := strconv.Atoi(end)
	if toErr != nil {
		return "", fmt.Errorf("invalid end: %v", toErr)
	}

	elements, rangeErr := ds.Range(from, to)
	if rangeErr != nil {
		return "", rangeErr
	}

	joined := strings.Join(elements, "\n")
	return joined, nil
}

// loadAll returns every element of the dataset identified by id, joined with
// newlines.
//
// A dataset that reports a length of zero yields an empty string and no error.
// Without that guard the call below would become Range(0, -1), which the
// dataset implementations reject with a misleading "invalid range" error.
// Parse errors and Range errors are passed through unchanged.
func loadAll(id, workspace string) (string, error) {
	set, err := dataset.ParseDataset(id, workspace)
	if err != nil {
		return "", err
	}

	length := set.Length()
	if length == 0 {
		// Nothing to load; there is no valid inclusive range to request.
		return "", nil
	}

	// Range bounds are inclusive, so the last valid index is length-1.
	data, err := set.Range(0, length-1)
	if err != nil {
		return "", err
	}

	return strings.Join(data, "\n"), nil
}
225 changes: 225 additions & 0 deletions pkg/dataset/dataset.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
package dataset

import (
"encoding/json"
"fmt"
"os"
"strings"
)

// Dataset is the common read-only view over an indexed collection of string
// elements. The implementations in this file are ArrayDataset, FileDataset
// and FolderDataset.
type Dataset interface {
// GetID returns the dataset's identifier.
GetID() string
// Type returns a short name for the kind of dataset; the implementations
// in this file return "array", "file" and "folder".
Type() string
// Length returns the number of elements in the dataset.
Length() int
// Nth returns the element at zero-based index i, or an error when the
// element cannot be produced (for example when i is out of bounds).
Nth(i int) (string, error)
// Range returns the elements from index i through index j, both inclusive.
// The implementations in this file return an error when i > j or when the
// range falls outside the dataset.
Range(i, j int) ([]string, error)
}

// ArrayDataset is a dataset backed by an in-memory slice of arbitrary
// JSON-compatible values. Each element is rendered as its JSON encoding when
// it is read.
type ArrayDataset struct {
	ID   string
	Data []any
}

// GetID returns the dataset's identifier.
func (d *ArrayDataset) GetID() string {
	return d.ID
}

// Type always reports "array".
func (d *ArrayDataset) Type() string {
	return "array"
}

// Length returns the number of values held in Data.
func (d *ArrayDataset) Length() int {
	return len(d.Data)
}

// Nth returns the JSON encoding of the value at zero-based index i.
// It returns an error when i is outside [0, Length()) or when the value
// cannot be marshalled.
func (d *ArrayDataset) Nth(i int) (string, error) {
	if inBounds := i >= 0 && i < len(d.Data); !inBounds {
		return "", fmt.Errorf("index %d out of bounds for dataset %s", i, d.ID)
	}

	encoded, marshalErr := json.Marshal(d.Data[i])
	if marshalErr != nil {
		return "", fmt.Errorf("error marshalling data at index %d in dataset %s: %v", i, d.ID, marshalErr)
	}
	return string(encoded), nil
}

// Range returns the JSON encodings of the values at indexes i through j,
// both inclusive. It returns an error when i > j, when either bound lies
// outside the dataset, or when any value fails to marshal.
func (d *ArrayDataset) Range(i, j int) ([]string, error) {
	switch {
	case i > j:
		return nil, fmt.Errorf("invalid range %d - %d for dataset %s", i, j, d.ID)
	case i < 0 || j >= len(d.Data):
		return nil, fmt.Errorf("range %d - %d out of bounds for dataset %s", i, j, d.ID)
	}

	// The bounds are validated above, so the result has exactly j-i+1 entries.
	encodedItems := make([]string, 0, j-i+1)
	for idx := i; idx <= j; idx++ {
		item, nthErr := d.Nth(idx)
		if nthErr != nil {
			return nil, nthErr
		}
		encodedItems = append(encodedItems, item)
	}
	return encodedItems, nil
}

// FileDataset represents a single file in the workspace.
// This dataset supports three different iteration strategies:
//   - LineMethod: each line in the file is a separate piece of data
//   - SplitMethod: the file is split by a delimiter, specified in a metadata file
//   - WholeMethod: the entire file is a single piece of data
type FileDataset struct {
	Method       IterationMethod
	ID, Splitter string
	Contents     []byte
}

// GetID returns the dataset's identifier.
func (d *FileDataset) GetID() string {
	return d.ID
}

// Type always reports "file".
func (d *FileDataset) Type() string {
	return "file"
}

// splitContents breaks Contents into elements according to Method:
// newline-separated for LineMethod, Splitter-separated for SplitMethod, and a
// single element holding the whole file for WholeMethod. Any other Method
// yields an "unknown iteration strategy" error.
//
// Length, Nth and Range all go through this helper so that they agree on the
// element list and share one bounds check.
func (d *FileDataset) splitContents() ([]string, error) {
	fileStr := string(d.Contents)
	switch d.Method {
	case LineMethod:
		return strings.Split(fileStr, "\n"), nil
	case SplitMethod:
		return strings.Split(fileStr, d.Splitter), nil
	case WholeMethod:
		return []string{fileStr}, nil
	}
	return nil, fmt.Errorf("unknown iteration strategy %s for dataset %s", d.Method, d.ID)
}

// Length returns the number of elements the file splits into under Method.
// It returns 0 when Method is not one of the known strategies.
func (d *FileDataset) Length() int {
	parts, err := d.splitContents()
	if err != nil {
		return 0
	}
	return len(parts)
}

// Nth returns the element at zero-based index i.
// It returns an error when Method is unknown or when i is outside
// [0, Length()). Under WholeMethod only index 0 is valid; negative indexes
// are rejected like any other out-of-bounds index.
func (d *FileDataset) Nth(i int) (string, error) {
	parts, err := d.splitContents()
	if err != nil {
		return "", err
	}

	if i < 0 || i >= len(parts) {
		return "", fmt.Errorf("index %d out of bounds for dataset %s", i, d.ID)
	}
	return parts[i], nil
}

// Range returns the elements at indexes i through j, both inclusive.
// It returns an error when i > j, when Method is unknown, or when either
// bound is outside [0, Length()). Under WholeMethod the only valid range is
// 0 - 0.
func (d *FileDataset) Range(i, j int) ([]string, error) {
	// The ordering check comes first so that a reversed range is reported as
	// invalid even when it is also out of bounds.
	if i > j {
		return nil, fmt.Errorf("invalid range %d - %d for dataset %s", i, j, d.ID)
	}

	parts, err := d.splitContents()
	if err != nil {
		return nil, err
	}

	if i < 0 || j >= len(parts) {
		return nil, fmt.Errorf("range %d - %d out of bounds for dataset %s", i, j, d.ID)
	}
	return parts[i : j+1], nil
}

// FolderDataset is a dataset whose elements are files: each entry in Files is
// a path, and reading an element returns that file's full contents.
type FolderDataset struct {
	ID    string
	Files []string
}

// GetID returns the dataset's identifier.
func (d *FolderDataset) GetID() string {
	return d.ID
}

// Type always reports "folder".
func (d *FolderDataset) Type() string {
	return "folder"
}

// Length returns the number of file paths in the dataset.
func (d *FolderDataset) Length() int {
	return len(d.Files)
}

// Nth returns the contents of the file at zero-based index i. The size limit
// is applied to that one file on its own.
func (d *FolderDataset) Nth(i int) (string, error) {
	contents, _, err := d.nthWithCurrentSize(i, 0)
	return contents, err
}

// nthWithCurrentSize reads the file at index i while enforcing a cumulative
// size budget. currentSize is the number of bytes already read by the caller;
// the updated running total is returned alongside the file contents.
//
// It returns an error when i is out of bounds, when the file cannot be
// stat'ed or read, or when adding this file would push the total past
// 100 MiB. On error the returned total is 0.
func (d *FolderDataset) nthWithCurrentSize(i int, currentSize int64) (string, int64, error) {
	const maxCombinedBytes int64 = 100 * 1024 * 1024 // 100 MiB

	if i < 0 || i >= len(d.Files) {
		return "", 0, fmt.Errorf("index %d out of bounds for dataset %s", i, d.ID)
	}
	path := d.Files[i]

	stat, statErr := os.Stat(path)
	if statErr != nil {
		return "", 0, fmt.Errorf("error getting info for file %s: %v", path, statErr)
	}

	// Check the budget before reading so an oversized file is never loaded.
	runningTotal := stat.Size() + currentSize
	if runningTotal > maxCombinedBytes {
		return "", 0, fmt.Errorf("dataset %s is too large to read (combined file size must be under 100 MiB)", d.ID)
	}

	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return "", 0, fmt.Errorf("error reading file %s: %v", path, readErr)
	}
	return string(raw), runningTotal, nil
}

// Range returns the contents of the files at indexes i through j, both
// inclusive. The 100 MiB budget applies to the combined size of all files in
// the range. It returns an error when i > j, when either bound is outside the
// dataset, or when any individual read fails.
func (d *FolderDataset) Range(i, j int) ([]string, error) {
	switch {
	case i > j:
		return nil, fmt.Errorf("invalid range %d - %d for dataset %s", i, j, d.ID)
	case i < 0 || j >= len(d.Files):
		return nil, fmt.Errorf("range %d - %d out of bounds for dataset %s", i, j, d.ID)
	}

	var bytesSoFar int64
	files := make([]string, 0, j-i+1)
	for idx := i; idx <= j; idx++ {
		body, total, err := d.nthWithCurrentSize(idx, bytesSoFar)
		if err != nil {
			return nil, err
		}
		bytesSoFar = total
		files = append(files, body)
	}
	return files, nil
}
Loading