Skip to content

Commit

Permalink
feat(data): Add support for managed dataset and provide shortcut for …
Browse files Browse the repository at this point in the history
…common framework (#751)

* add

Signed-off-by: Jinjing.Zhou <allenzhou@tensorchord.ai>

* refactor a little bit

Signed-off-by: Jinjing.Zhou <allenzhou@tensorchord.ai>

* add

Signed-off-by: Jinjing.Zhou <allenzhou@tensorchord.ai>

* fix

Signed-off-by: Jinjing.Zhou <allenzhou@tensorchord.ai>

* fix

Signed-off-by: Jinjing.Zhou <allenzhou@tensorchord.ai>

* fix

Signed-off-by: Jinjing.Zhou <allenzhou@tensorchord.ai>

* lint

Signed-off-by: Jinjing.Zhou <allenzhou@tensorchord.ai>

* fix

Signed-off-by: Jinjing.Zhou <allenzhou@tensorchord.ai>

Signed-off-by: Jinjing.Zhou <allenzhou@tensorchord.ai>
  • Loading branch information
VoVAllen authored Aug 17, 2022
1 parent 7c2fed6 commit c49863e
Show file tree
Hide file tree
Showing 10 changed files with 274 additions and 4 deletions.
5 changes: 3 additions & 2 deletions examples/dgl/build.envd
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ def build():
# Select the shell environment you like
shell("zsh")

io.mount(src="~/.envd/data/dgl", dest="~/.dgl")
# io.mount(src="~/.envd/data/dgl", dest="~/.dgl")
io.mount(src=data.envd("dgl"), dest=data.path.dgl)

def build_gpu():
# Use ubuntu20.04 as base image and install python
Expand All @@ -25,4 +26,4 @@ def build_gpu():
# Select the shell environment you like
shell("zsh")

io.mount(src="~/.envd/data/dgl", dest="~/.dgl")
io.mount(src=data.envd("dgl"), dest=data.path.dgl)
22 changes: 22 additions & 0 deletions pkg/data/datasource.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright 2022 The envd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package data

// DataSource describes a dataset location that can be resolved to a
// directory on the host.
type DataSource interface {
	// Type returns a human-readable name for the kind of data source.
	Type() string
	// Hash returns a hash value identifying the data source.
	Hash() (uint32, error)

	// Init prepares the data source; it is expected to be called before
	// GetHostDir.
	Init() error
	// GetHostDir returns the host directory backing the data source.
	GetHostDir() (string, error)
}
54 changes: 54 additions & 0 deletions pkg/data/envd.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Copyright 2022 The envd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package data

import (
"fmt"

"github.com/tensorchord/envd/pkg/home"

"go.starlark.net/starlark"
)

// EnvdManagedDataSource is a data source whose host directory is created
// and tracked by the envd home manager, keyed by a dataset name.
type EnvdManagedDataSource struct {
	// name is the dataset name handed to the home manager.
	name string
	// hostDataDir is the directory returned by the home manager; it stays
	// empty until Init has succeeded.
	hostDataDir string
}

// Init asks the home manager to set up the data directory for this dataset
// and remembers the resulting host path.
func (e *EnvdManagedDataSource) Init() error {
	dir, err := home.GetManager().InitDataDir(e.name)
	if err != nil {
		return err
	}
	e.hostDataDir = dir
	return nil
}

// GetHostDir returns the host path recorded by Init. The error is always
// nil; the path is empty when Init has not been run.
func (e *EnvdManagedDataSource) GetHostDir() (string, error) {
	return e.hostDataDir, nil
}

// Type returns the descriptive name of this kind of data source.
func (e *EnvdManagedDataSource) Type() string {
	const typeName = "envd managed data source"
	return typeName
}

// Hash hashes the "envd://<name>" identifier of the dataset using the
// Starlark string hash.
func (e *EnvdManagedDataSource) Hash() (uint32, error) {
	identifier := fmt.Sprintf("envd://%s", e.name)
	return starlark.String(identifier).Hash()
}

// NewEnvdManagedDataSource returns a data source for the given dataset
// name. The host directory is not resolved until Init is called.
func NewEnvdManagedDataSource(name string) *EnvdManagedDataSource {
	ds := new(EnvdManagedDataSource)
	ds.name = name
	return ds
}
41 changes: 41 additions & 0 deletions pkg/home/data.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Copyright 2022 The envd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package home

import (
"os"
"path/filepath"

"github.com/cockroachdb/errors"
"github.com/sirupsen/logrus"
)

// dataManager is the part of the home Manager that deals with dataset
// directories.
type dataManager interface {
	// InitDataDir returns the data directory for the named dataset,
	// or an error if it could not be set up.
	InitDataDir(name string) (string, error)
}

// InitDataDir makes sure the data directory for the named dataset exists
// under "<cache dir>/data/<name>" and returns its path.
//
// An already existing directory is reused as is. An error is returned when
// the path cannot be inspected, when it exists but is not a directory, or
// when the directory cannot be created.
func (m *generalManager) InitDataDir(name string) (string, error) {
	newDataDir := filepath.Join(m.CacheDir(), "data", name)

	info, err := os.Stat(newDataDir)
	switch {
	case err == nil:
		if !info.IsDir() {
			return "", errors.Newf(
				"data path %s exists but is not a directory", newDataDir)
		}
		logrus.Infof("Data dir %s already exists, skipping creation", newDataDir)
		return newDataDir, nil
	case !os.IsNotExist(err):
		// Any failure other than "not found" (e.g. permission denied) must
		// not be mistaken for an existing directory.
		return "", errors.Wrap(err, "failed to stat data dir")
	}

	// MkdirAll also creates the intermediate "data" directory (and any
	// nested segments in name) when they are missing.
	// 0777 is requested to avoid UID/GID issues; the effective mode is
	// still subject to the process umask.
	if err := os.MkdirAll(newDataDir, 0777); err != nil {
		return "", errors.Wrap(err, "failed to create data dir")
	}
	return newDataDir, nil
}
1 change: 1 addition & 0 deletions pkg/home/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ type Manager interface {
configManager
contextManager
cacheManager
dataManager
}

type generalManager struct {
Expand Down
21 changes: 21 additions & 0 deletions pkg/lang/frontend/starlark/data/const.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Copyright 2022 The envd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package data

const (
	// ruleEnvdManagedDataSource is the name of the builtin that creates an
	// envd managed data source.
	ruleEnvdManagedDataSource = "data.envd"
	// huggingFaceDatasetPath is the value exposed as data.path.huggingface.
	huggingFaceDatasetPath = "~/.cache/huggingface/datasets"
	// dglFaceDatasetPath is the value exposed as data.path.dgl.
	// NOTE(review): the "Face" in the name looks copied from the
	// huggingface constant — confirm before renaming.
	dglFaceDatasetPath = "~/.dgl"
)
54 changes: 54 additions & 0 deletions pkg/lang/frontend/starlark/data/rule.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Copyright 2022 The envd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package data

import (
"github.com/sirupsen/logrus"
"go.starlark.net/starlark"
"go.starlark.net/starlarkstruct"

envddata "github.com/tensorchord/envd/pkg/data"
)

// logger tags every entry emitted by this package with frontend=starlark.
var logger = logrus.WithField("frontend", "starlark")

// Module is the Starlark `data` module. It exposes the `envd` builtin for
// creating envd managed data sources and a nested `path` module holding
// predefined dataset paths for common frameworks.
var Module = &starlarkstruct.Module{
	Name: "data",
	Members: starlark.StringDict{
		"envd": starlark.NewBuiltin(
			ruleEnvdManagedDataSource,
			ruleValueEnvdManagedDataSource,
		),
		"path": &starlarkstruct.Module{
			Name: "path",
			Members: starlark.StringDict{
				"dgl":         starlark.String(dglFaceDatasetPath),
				"huggingface": starlark.String(huggingFaceDatasetPath),
			},
		},
	},
}

// ruleValueEnvdManagedDataSource implements the `data.envd` builtin. It
// unpacks the optional `name` argument and returns a DataSourceValue that
// wraps an envd managed data source for that dataset name.
//
// The returned error is whatever argument unpacking reports.
func ruleValueEnvdManagedDataSource(thread *starlark.Thread, _ *starlark.Builtin,
	args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
	var name starlark.String

	if err := starlark.UnpackArgs(ruleEnvdManagedDataSource, args, kwargs,
		"name?", &name); err != nil {
		return nil, err
	}

	// GoString yields the raw text. String() would return the quoted
	// Starlark literal, which would put double quotes into the data
	// directory name.
	nameStr := name.GoString()

	logger.Debugf("rule `%s` is invoked, name=%s",
		ruleEnvdManagedDataSource, nameStr)

	return NewDataSourceValue(envddata.NewEnvdManagedDataSource(nameStr)), nil
}
53 changes: 53 additions & 0 deletions pkg/lang/frontend/starlark/data/util.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Copyright 2022 The envd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package data

import (
"go.starlark.net/starlark"

envddata "github.com/tensorchord/envd/pkg/data"
)

// DataSourceValue wraps a DataSource so it can be passed around as a value
// inside Starlark build scripts. Every method delegates to the wrapped
// source unless noted otherwise.
type DataSourceValue struct {
	source envddata.DataSource
}

// Init initializes the wrapped data source.
func (d DataSourceValue) Init() error {
	err := d.source.Init()
	return err
}

// GetHostDir returns the host directory reported by the wrapped source.
func (d DataSourceValue) GetHostDir() (string, error) {
	hostDir, err := d.source.GetHostDir()
	return hostDir, err
}

// String returns the wrapped source's type name as the printable form of
// the value.
func (d DataSourceValue) String() string {
	text := d.source.Type()
	return text
}

// Type returns the wrapped source's type name.
func (d DataSourceValue) Type() string {
	typeName := d.source.Type()
	return typeName
}

// Freeze is a no-op.
func (d DataSourceValue) Freeze() {
}

// Truth always reports true.
func (d DataSourceValue) Truth() starlark.Bool {
	return starlark.True
}

// Hash returns the hash of the wrapped source.
func (d DataSourceValue) Hash() (uint32, error) {
	sum, err := d.source.Hash()
	return sum, err
}

// NewDataSourceValue wraps source in a DataSourceValue and returns a
// pointer to it.
func NewDataSourceValue(source envddata.DataSource) *DataSourceValue {
	wrapped := DataSourceValue{source: source}
	return &wrapped
}
2 changes: 2 additions & 0 deletions pkg/lang/frontend/starlark/interpreter.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"go.starlark.net/starlark"

"github.com/tensorchord/envd/pkg/lang/frontend/starlark/config"
"github.com/tensorchord/envd/pkg/lang/frontend/starlark/data"
"github.com/tensorchord/envd/pkg/lang/frontend/starlark/install"
"github.com/tensorchord/envd/pkg/lang/frontend/starlark/io"
"github.com/tensorchord/envd/pkg/lang/frontend/starlark/runtime"
Expand Down Expand Up @@ -57,6 +58,7 @@ func NewInterpreter(buildContextDir string) Interpreter {
"config": config.Module,
"io": io.Module,
"runtime": runtime.Module,
"data": data.Module,
},
buildContextDir: buildContextDir,
}
Expand Down
25 changes: 23 additions & 2 deletions pkg/lang/frontend/starlark/io/io.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@ import (
"path/filepath"
"strings"

"github.com/cockroachdb/errors"

"github.com/sirupsen/logrus"
"go.starlark.net/starlark"
"go.starlark.net/starlarkstruct"

"github.com/tensorchord/envd/pkg/lang/frontend/starlark/data"
"github.com/tensorchord/envd/pkg/lang/ir"
)

Expand All @@ -40,14 +43,32 @@ var Module = &starlarkstruct.Module{

func ruleFuncMount(thread *starlark.Thread, _ *starlark.Builtin,
args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
var source, destination starlark.String
var source starlark.Value
var destination starlark.String

if err := starlark.UnpackArgs(ruleMount, args, kwargs,
"src?", &source, "dest?", &destination); err != nil {
return nil, err
}

sourceStr := source.GoString()
var sourceStr string
var err error

if v, ok := source.(*data.DataSourceValue); ok {
err = v.Init()
if err != nil {
return starlark.None, err
}
sourceStr, err = v.GetHostDir()
if err != nil {
return starlark.None, err
}
} else if vs, ok := source.(starlark.String); ok {
sourceStr = vs.GoString()
} else {
return starlark.None, errors.New("invalid data source")
}

destinationStr := destination.GoString()

logger.Debugf("rule `%s` is invoked, src=%s, dest=%s",
Expand Down

0 comments on commit c49863e

Please sign in to comment.