From 7858ba790f279dddb92a971f31149ae0764f2d51 Mon Sep 17 00:00:00 2001 From: Brendan O'Brien Date: Tue, 7 Nov 2017 17:42:28 -0500 Subject: [PATCH] feat(core.Init): initialize a dataset from a url This feature expands dataset initialization to allow creating a dataset from a URL that points to the data to add. URL-based initialization is available either through the API, by setting the url form value, or through the CLI, by setting the --url flag. --- api/handlers/datasets.go | 26 ++++--- cmd/init.go | 152 +++++++++++---------------------------- cmd/repo.go | 7 +- core/datasets.go | 60 ++++++++++++---- core/datasets_test.go | 13 ++-- 5 files changed, 116 insertions(+), 142 deletions(-) diff --git a/api/handlers/datasets.go b/api/handlers/datasets.go index c3262c45c..4935192f2 100644 --- a/api/handlers/datasets.go +++ b/api/handlers/datasets.go @@ -84,7 +84,7 @@ func (h *DatasetHandlers) AddDatasetHandler(w http.ResponseWriter, r *http.Reque } func (h *DatasetHandlers) ZipDatasetHandler(w http.ResponseWriter, r *http.Request) { - res := &dataset.Dataset{} + res := &repo.DatasetRef{} args := &core.GetDatasetParams{ Path: datastore.NewKey(r.URL.Path[len("/download/"):]), Hash: r.FormValue("hash"), @@ -98,7 +98,7 @@ func (h *DatasetHandlers) ZipDatasetHandler(w http.ResponseWriter, r *http.Reque w.Header().Set("Content-Type", "application/zip") w.Header().Set("Content-Disposition", fmt.Sprintf("filename=\"%s.zip\"", "dataset")) - dsutil.WriteZipArchive(h.store, res, w) + dsutil.WriteZipArchive(h.store, res.Dataset, w) } func (h *DatasetHandlers) listDatasetsHandler(w http.ResponseWriter, r *http.Request) { @@ -116,7 +116,7 @@ func (h *DatasetHandlers) listDatasetsHandler(w http.ResponseWriter, r *http.Req } func (h *DatasetHandlers) getDatasetHandler(w http.ResponseWriter, r *http.Request) { - res := &dataset.Dataset{} + res := &repo.DatasetRef{} args := &core.GetDatasetParams{ Path: datastore.NewKey(r.URL.Path[len("/datasets/"):]), Hash: r.FormValue("hash"), @@ -126,7 +126,7 @@ func (h
*DatasetHandlers) getDatasetHandler(w http.ResponseWriter, r *http.Reque util.WriteErrResponse(w, http.StatusInternalServerError, err) return } - util.WriteResponse(w, res) + util.WriteResponse(w, res.Dataset) } func (h *DatasetHandlers) saveDatasetHandler(w http.ResponseWriter, r *http.Request) { @@ -179,23 +179,27 @@ func (h *DatasetHandlers) saveStructureHandler(w http.ResponseWriter, r *http.Re } func (h *DatasetHandlers) initDatasetFileHandler(w http.ResponseWriter, r *http.Request) { + var f cafs.File infile, header, err := r.FormFile("file") - if err != nil { + if err != nil && err != http.ErrMissingFile { util.WriteErrResponse(w, http.StatusBadRequest, err) return + } else { + f = memfs.NewMemfileReader(header.Filename, infile) } p := &core.InitDatasetParams{ + Url: r.FormValue("url"), Name: r.FormValue("name"), - Data: memfs.NewMemfileReader(header.Filename, infile), + Data: f, } - res := &dataset.Dataset{} + res := &repo.DatasetRef{} if err := h.InitDataset(p, res); err != nil { h.log.Infof("error initializing dataset: %s", err.Error()) util.WriteErrResponse(w, http.StatusInternalServerError, err) return } - util.WriteResponse(w, res) + util.WriteResponse(w, res.Dataset) } func (h *DatasetHandlers) deleteDatasetHandler(w http.ResponseWriter, r *http.Request) { @@ -204,8 +208,8 @@ func (h *DatasetHandlers) deleteDatasetHandler(w http.ResponseWriter, r *http.Re Path: datastore.NewKey(r.URL.Path[len("/datasets"):]), } - ds := &dataset.Dataset{} - if err := h.Get(&core.GetDatasetParams{Name: p.Name, Path: p.Path}, ds); err != nil { + ref := &repo.DatasetRef{} + if err := h.Get(&core.GetDatasetParams{Name: p.Name, Path: p.Path}, ref); err != nil { return } @@ -216,7 +220,7 @@ func (h *DatasetHandlers) deleteDatasetHandler(w http.ResponseWriter, r *http.Re return } - util.WriteResponse(w, ds) + util.WriteResponse(w, ref.Dataset) } func (h *DatasetHandlers) getStructuredDataHandler(w http.ResponseWriter, r *http.Request) { diff --git a/cmd/init.go b/cmd/init.go 
index a89a1fe3e..b12faa587 100644 --- a/cmd/init.go +++ b/cmd/init.go @@ -17,25 +17,20 @@ package cmd import ( "flag" "fmt" - "io/ioutil" "os" "path/filepath" - "time" - "github.com/ipfs/go-datastore" - "github.com/qri-io/dataset" - "github.com/qri-io/dataset/detect" - "github.com/qri-io/dataset/dsfs" + "github.com/qri-io/qri/core" "github.com/qri-io/qri/repo" "github.com/spf13/cobra" ) var ( - initFile string - initMetaFile string - initName string - initPassive bool - initRescursive bool + initFile string + initMetaFile string + initName string + initUrl string + initPassive bool ) // initCmd represents the init command @@ -44,124 +39,63 @@ var initCmd = &cobra.Command{ Short: "Initialize a dataset, adding it to your local collection of datasets", Long: ``, Run: func(cmd *cobra.Command, args []string) { - if initFile == "" { - ErrExit(fmt.Errorf("please provide a file argument")) - } + var dataFile, metaFile *os.File - path, err := filepath.Abs(initFile) - ExitIfErr(err) + if initFile == "" && initUrl == "" { + ErrExit(fmt.Errorf("please provide either a file or a url argument")) + } - r := GetRepo(false) - // ns := LoadNamespaceGraph() - ds, err := GetIpfsFilestore(false) - ExitIfErr(err) + if initName == "" { + ErrExit(fmt.Errorf("please provide a --name")) + } - if initRescursive { - files, err := ioutil.ReadDir(path) + if initFile != "" { + filepath, err := filepath.Abs(initFile) ExitIfErr(err) - foundFiles := map[string]datastore.Key{} - for _, fi := range files { - if fi.IsDir() { - continue - } else { - initName = fi.Name() - st, err := detect.FromFile(initName) - ExitIfErr(err) - // Add to the namespace as the filename - // TODO - require this be a proper, no-space alphanumeric type thing - - datahash, err := ds.AddPath(filepath.Join(path, fi.Name()), true) - ExitIfErr(err) - datakey := datastore.NewKey("/ipfs/" + datahash) - - // rkey, dskey, err := datasets.AddFileStructure(ds, filepath.Join(path, fi.Name()), rsc) - d := &dataset.Dataset{ - Timestamp: 
time.Now().In(time.UTC), - Structure: st, - Data: datakey, - } - - dspath, err := dsfs.SaveDataset(ds, d, true) - ExitIfErr(err) - - foundFiles[initName] = dspath - r.PutName(initName, dspath) - } - } - } else { - file, err := os.Stat(path) + dataFile, err = os.Open(filepath) ExitIfErr(err) + } - // TODO - extract a default name from the file name - // TODO - require this be a proper, no-space alphanumeric type thing - if !initPassive && initName == "" { - initName = InputText(fmt.Sprintf("choose a variable name for %s", file.Name()), file.Name()) - if err != nil { - return - } - } else if initName == "" { - initName = repo.CoerceDatasetName(file.Name()) - } - - if !repo.ValidDatasetName(initName) { - ErrExit(fmt.Errorf("invalid dataset name: %s", initName)) - } - - st, err := detect.FromFile(path) + if initMetaFile != "" { + filepath, err := filepath.Abs(initMetaFile) ExitIfErr(err) - - datahash, err := ds.AddPath(path, true) + metaFile, err = os.Open(filepath) ExitIfErr(err) - datakey := datastore.NewKey("/ipfs/" + datahash) - - d := &dataset.Dataset{} - - // parse any provided metadata - if initMetaFile != "" { - mdata, err := ioutil.ReadFile(initMetaFile) - if err != nil { - ErrExit(fmt.Errorf("error opening metadata file: %s", err.Error())) - } - if err := d.UnmarshalJSON(mdata); err != nil { - ErrExit(fmt.Errorf("error parsing metadata file: %s", err.Error())) - } - } - - if d.Structure == nil { - d.Structure = &dataset.Structure{} - } - - // structure may have been set by the metadata file above - // by calling assign on ourselves with inferred structure in - // the middle, any user-contributed schema metadata will overwrite - // inferred metadata, but inferred schema properties will populate - // empty fields - d.Structure.Assign(st, d.Structure) - d.Timestamp = time.Now().In(time.UTC) - d.Data = datakey - d.Length = int(file.Size()) + } - dspath, err := dsfs.SaveDataset(ds, d, true) - ExitIfErr(err) + r := GetRepo(false) + store, err := 
GetIpfsFilestore(false) + ExitIfErr(err) + req := core.NewDatasetRequests(store, r) - // Add to the namespace as the filename - // TODO - require this be a proper, no-space alphanumeric type thing - // ns[initName] = dspath - err = r.PutName(initName, dspath) - ExitIfErr(err) + p := &core.InitDatasetParams{ + Name: initName, + Url: initUrl, + DataFilename: filepath.Base(initFile), + } - PrintSuccess("initialized dataset %s: %s", initName, dspath) - // PrintDatasetDetailedInfo(ds) + // this is because passing nil to interfaces is bad: https://golang.org/doc/faq#nil_error + if dataFile != nil { + p.Data = dataFile } + if metaFile != nil { + p.Metadata = metaFile + } + + ref := &repo.DatasetRef{} + err = req.InitDataset(p, ref) + ExitIfErr(err) + // req.Get(&core.GetDatasetParams{ Name: p.Name }, res) + PrintSuccess("initialized dataset %s: %s", ref.Name, ref.Path.String()) }, } func init() { flag.Parse() RootCmd.AddCommand(initCmd) + initCmd.Flags().StringVarP(&initUrl, "url", "u", "", "url to file to initialize from") initCmd.Flags().StringVarP(&initFile, "file", "f", "", "data file to initialize from") initCmd.Flags().StringVarP(&initName, "name", "n", "", "name to give dataset") initCmd.Flags().StringVarP(&initMetaFile, "meta", "m", "", "dataset metadata") - initCmd.Flags().BoolVarP(&initRescursive, "recursive", "r", false, "recursive add from a directory") initCmd.Flags().BoolVarP(&initPassive, "passive", "p", false, "disable interactive init") } diff --git a/cmd/repo.go b/cmd/repo.go index c4885887f..51286b7cf 100644 --- a/cmd/repo.go +++ b/cmd/repo.go @@ -19,7 +19,12 @@ func GetRepo(online bool) repo.Repo { fs, err := GetIpfsFilestore(online) ExitIfErr(err) - r, err := fs_repo.NewRepo(fs, viper.GetString(QriRepoPath), fs.Node().PeerHost.ID().Pretty()) + id := "" + if fs.Node().PeerHost != nil { + id = fs.Node().PeerHost.ID().Pretty() + } + + r, err := fs_repo.NewRepo(fs, viper.GetString(QriRepoPath), id) ExitIfErr(err) return r } diff --git a/core/datasets.go 
b/core/datasets.go index ce74164ef..075b38196 100644 --- a/core/datasets.go +++ b/core/datasets.go @@ -4,7 +4,10 @@ import ( "bytes" "encoding/json" "fmt" + "io" "io/ioutil" + "net/http" + "path/filepath" "strings" "time" @@ -79,35 +82,58 @@ type GetDatasetParams struct { Hash string } -func (d *DatasetRequests) Get(p *GetDatasetParams, res *dataset.Dataset) error { +func (d *DatasetRequests) Get(p *GetDatasetParams, res *repo.DatasetRef) error { ds, err := dsfs.LoadDataset(d.store, p.Path) if err != nil { return fmt.Errorf("error loading dataset: %s", err.Error()) } - *res = *ds + name := p.Name + if p.Path.String() != "" { + name, _ = d.repo.GetName(p.Path) + } + + *res = repo.DatasetRef{ + Name: name, + Path: p.Path, + Dataset: ds, + } return nil } type InitDatasetParams struct { - Data cafs.File - Metadata cafs.File - Name string + Name string + Url string + DataFilename string + Data io.Reader + Metadata io.Reader } -func (r *DatasetRequests) InitDataset(p *InitDatasetParams, res *dataset.Dataset) error { - // TODO - split this into some sort of re-readable reader instead - // of reading the entire file - if p.Data == nil { - return fmt.Errorf("data file is required") +func (r *DatasetRequests) InitDataset(p *InitDatasetParams, res *repo.DatasetRef) error { + var rdr io.Reader + var filename = p.DataFilename + if p.Url != "" { + res, err := http.Get(p.Url) + if err != nil { + return fmt.Errorf("error fetching url: %s", err.Error()) + } + filename = filepath.Base(p.Url) + defer res.Body.Close() + rdr = res.Body + } else if p.Data != nil { + rdr = p.Data + } else { + return fmt.Errorf("either a file or a url is required to create a dataset") } - data, err := ioutil.ReadAll(p.Data) + // TODO - split this into some sort of re-readable reader instead + // of reading the entire file + data, err := ioutil.ReadAll(rdr) if err != nil { return fmt.Errorf("error reading file: %s", err.Error()) } - st, err := detect.FromReader(p.Data.FileName(), bytes.NewReader(data)) + 
st, err := detect.FromReader(filename, bytes.NewReader(data)) if err != nil { return fmt.Errorf("error determining dataset schema: %s", err.Error()) } @@ -121,9 +147,9 @@ func (r *DatasetRequests) InitDataset(p *InitDatasetParams, res *dataset.Dataset return fmt.Errorf("error putting data file in store: %s", err.Error()) } - adr := detect.Camelize(p.Data.FileName()) + adr := detect.Camelize(filename) if p.Name != "" { - adr = detect.Camelize(p.Data.FileName()) + adr = detect.Camelize(filename) } ds := &dataset.Dataset{} @@ -165,7 +191,11 @@ func (r *DatasetRequests) InitDataset(p *InitDatasetParams, res *dataset.Dataset return fmt.Errorf("error reading dataset: %s", err.Error()) } - *res = *ds + *res = repo.DatasetRef{ + Name: p.Name, + Path: dskey, + Dataset: ds, + } return nil } diff --git a/core/datasets_test.go b/core/datasets_test.go index ccb8fc334..a9c56a01d 100644 --- a/core/datasets_test.go +++ b/core/datasets_test.go @@ -11,12 +11,13 @@ import ( func TestDatasetRequestsInit(t *testing.T) { cases := []struct { p *InitDatasetParams - res *dataset.Dataset + res *repo.DatasetRef err string }{ - {&InitDatasetParams{}, nil, "data file is required"}, - {&InitDatasetParams{Data: badDataFile}, nil, "error determining dataset schema: line 3, column 0: wrong number of fields in line"}, - {&InitDatasetParams{Data: jobsByAutomationFile}, nil, ""}, + {&InitDatasetParams{}, nil, "either a file or a url is required to create a dataset"}, + {&InitDatasetParams{Data: badDataFile}, nil, "error determining dataset schema: no file extension provided"}, + {&InitDatasetParams{DataFilename: badDataFile.FileName(), Data: badDataFile}, nil, "error determining dataset schema: EOF"}, + {&InitDatasetParams{DataFilename: jobsByAutomationFile.FileName(), Data: jobsByAutomationFile}, nil, ""}, } mr, ms, err := NewTestRepo() @@ -27,7 +28,7 @@ func TestDatasetRequestsInit(t *testing.T) { req := NewDatasetRequests(ms, mr) for i, c := range cases { - got := &dataset.Dataset{} + got := 
&repo.DatasetRef{} err := req.InitDataset(c.p, got) if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { @@ -97,7 +98,7 @@ func TestDatasetRequestsGet(t *testing.T) { req := NewDatasetRequests(ms, mr) for i, c := range cases { - got := &dataset.Dataset{} + got := &repo.DatasetRef{} err := req.Get(c.p, got) if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { t.Errorf("case %d error mismatch: expected: %s, got: %s", i, c.err, err)