Skip to content

Commit

Permalink
feat(preview): preview subcommand
Browse files Browse the repository at this point in the history
Previews already exist on the API and in cloud contexts; this adds the preview subcommand to the CLI.
  • Loading branch information
b5 committed Jul 25, 2020
1 parent bc87f00 commit 3303428
Show file tree
Hide file tree
Showing 13 changed files with 461 additions and 21 deletions.
71 changes: 71 additions & 0 deletions base/base_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ func addNowTransformDataset(t *testing.T, r repo.Repo) dsref.Ref {
Schema: dataset.BaseSchemaArray,
},
Transform: &dataset.Transform{},
Readme: &dataset.Readme{},
}

script := `
Expand All @@ -157,6 +158,76 @@ def transform(ds, ctx):
ds.Transform.SetScriptFile(qfs.NewMemfileBytes("transform.star", []byte(script)))
ds.SetBodyFile(qfs.NewMemfileBytes("data.json", []byte("[]")))

readme := "# Oh hey there!\nI'm a readme! hello!\n"
ds.Readme.SetScriptFile(qfs.NewMemfileBytes("readme.md", []byte(readme)))

saved, err := CreateDataset(ctx, r, r.Filesystem().DefaultWriteFS(), ds, nil, SaveSwitches{Pin: true, ShouldRender: true})
if err != nil {
t.Fatal(err.Error())
}
return dsref.ConvertDatasetToVersionInfo(saved).SimpleRef()
}

func addTurnstileDataset(t *testing.T, r repo.Repo) dsref.Ref {
ctx := context.Background()

ds := &dataset.Dataset{
Name: "turnstile_daily_counts_2020",
Peername: "peer",
Commit: &dataset.Commit{
Title: "update data for week ending April 18, 2020",
},
Meta: &dataset.Meta{
Title: "Turnstile Daily Counts 2020",
Description: "NYC Subway Turnstile Counts Data aggregated by day and station complex for the year 2020. Updated weekly.",
},
Structure: &dataset.Structure{
Format: "json",
Schema: dataset.BaseSchemaArray,
},
Transform: &dataset.Transform{},
Readme: &dataset.Readme{},
}

script := `
load("time.star", "time")
def transform(ds, ctx):
ds.set_body([str(time.now())])`
ds.Transform.SetScriptFile(qfs.NewMemfileBytes("transform.star", []byte(script)))
ds.SetBodyFile(qfs.NewMemfileBytes("data.json", []byte("[]")))

readme := `# nyc-transit-data/turnstile_daily_counts_2020
NYC Subway Turnstile Counts Data aggregated by day and station complex for the year 2020. Updated weekly.
## Where the Data Came From
This aggregation was created from weekly raw turnstile counts published by the New York MTA at [http://web.mta.info/developers/turnstile.html](http://web.mta.info/developers/turnstile.html)
The raw data were imported into a postgresql database for processing, and aggregated to calendar days for each station complex.
The process is outlined in [this blog post](https://medium.com/qri-io/taming-the-mtas-unruly-turnstile-data-c945f5f96ba0), and the code for the data pipeline is [available on github](https://github.com/qri-io/data-stories-scripts/tree/master/nyc-turnstile-counts).
## Caveats
This aggregation is a best-effort to make a clean and usable dataset of station-level counts. There were some assumptions and important decisions made to arrive at the finished product.
- The dataset excludes turnstile observation windows (4 hours) that resulted in entries or exits of over 10,000. This threshold excludes the obviously spurious numbers that come from the counters rolling over, but could include false readings that are within the threshold.
- The turnstile counts were aggregated to calendar day using the timestamp of the *end* of the 4-hour observation window + 2 hours. An observation window that ends at 2am would count for the same day, but a window ending between midnight and 1:59am would count for the previous day.
- The last date in the dataset contains a small number of entries and exits that will be aggregated into the next week's worth of data, and should not be used.
## PATH and Roosevelt Island Tramway
The dataset also includes turnstile counts for the PATH train system and the Roosevelt Island Tramway
## Spurious Data in early versions
Versions prior to QmPkGqJ318gcok69Noj3gw3coby8FDrab3x1hBisFcU3Yq were built with a pipeline that had a major error, causing inaccurate numbers near the transition between weekly input files.`
ds.Readme.SetScriptFile(qfs.NewMemfileBytes("readme.md", []byte(readme)))

ref, err := CreateDataset(ctx, r, r.Filesystem().DefaultWriteFS(), ds, nil, SaveSwitches{Pin: true, ShouldRender: true})
if err != nil {
t.Fatal(err.Error())
Expand Down
37 changes: 22 additions & 15 deletions base/dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,21 +70,8 @@ func OpenDataset(ctx context.Context, fsys qfs.Filesystem, ds *dataset.Dataset)
}
}

if ds.Readme != nil && ds.Readme.ScriptFile() == nil {
readmeTimeoutCtx, cancel := context.WithTimeout(ctx, OpenFileTimeoutDuration)
defer cancel()

if err = ds.Readme.OpenScriptFile(readmeTimeoutCtx, fsys); err != nil {
if errors.Is(err, context.DeadlineExceeded) {
err = nil
} else if strings.Contains(err.Error(), "not found") {
log.Debug("skipping not-found readme script")
err = nil
} else {
log.Debug(err)
return err
}
}
if err = openReadme(ctx, fsys, ds); err != nil {
return err
}

if ds.Viz != nil && ds.Viz.RenderedFile() == nil {
Expand All @@ -110,6 +97,26 @@ func isMerkleDagError(err error) bool {
return err.Error() == "merkledag: not found"
}

// openReadme ensures the readme script file attached to ds is open, loading
// it from fsys within OpenFileTimeoutDuration. Opening the readme is
// best-effort: timeouts and not-found errors are swallowed and the readme is
// simply skipped; any other error is returned. A nil Readme component or an
// already-open script file is a no-op.
func openReadme(ctx context.Context, fsys qfs.Filesystem, ds *dataset.Dataset) error {
	if ds.Readme == nil || ds.Readme.ScriptFile() != nil {
		return nil
	}

	readmeTimeoutCtx, cancel := context.WithTimeout(ctx, OpenFileTimeoutDuration)
	defer cancel()

	err := ds.Readme.OpenScriptFile(readmeTimeoutCtx, fsys)
	switch {
	case err == nil:
		return nil
	case errors.Is(err, context.DeadlineExceeded):
		// a slow store must not fail the whole dataset open
		return nil
	case strings.Contains(err.Error(), "not found"):
		log.Debug("skipping not-found readme script")
		return nil
	default:
		log.Debug(err)
		return err
	}
}

// CloseDataset ensures all open dataset files are closed
func CloseDataset(ds *dataset.Dataset) (err error) {
if ds.BodyFile() != nil {
Expand Down
1 change: 1 addition & 0 deletions base/dsfs/dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -646,6 +646,7 @@ func generateCommitDescriptions(store cafs.Filestore, prev, ds *dataset.Dataset,
err = ds.Readme.OpenScriptFile(ctx, fs)
if err != nil {
log.Errorf("ds.Readme.ScriptPath %q open err: %s", ds.Readme.ScriptPath, err)
err = nil
} else {
tfFile := ds.Readme.ScriptFile()
ds.Readme.ScriptBytes, err = ioutil.ReadAll(tfFile)
Expand Down
33 changes: 30 additions & 3 deletions base/preview.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,23 @@ import (
"context"
"encoding/json"
"fmt"
"io"
"io/ioutil"

"github.com/qri-io/dataset"
"github.com/qri-io/qri/base/dsfs"
"github.com/qri-io/qri/dsref"
"github.com/qri-io/qri/repo"
)

// MaxNumDatasetRowsInPreview is the highest number of rows a dataset preview
// can contain
const MaxNumDatasetRowsInPreview = 100
const (
	// MaxNumDatasetRowsInPreview is the highest number of rows a dataset
	// preview can contain.
	MaxNumDatasetRowsInPreview = 100
	// MaxReadmePreviewBytes determines the maximum size in bytes of a readme
	// preview: three bytes less than 1000 to make room for an ellipsis.
	MaxReadmePreviewBytes = 997
)

// CreatePreview generates a preview for a dataset version
func CreatePreview(ctx context.Context, r repo.Repo, ref dsref.Ref) (ds *dataset.Dataset, err error) {
Expand All @@ -27,6 +34,26 @@ func CreatePreview(ctx context.Context, r repo.Repo, ref dsref.Ref) (ds *dataset
return nil, err
}

if ds.Readme != nil {
if err := openReadme(ctx, r.Filesystem(), ds); err != nil {
log.Errorf("OpeningReadme: %s", err.Error())
return nil, err
}

if readmeFile := ds.Readme.ScriptFile(); readmeFile != nil {
ds.Readme.ScriptBytes, err = ioutil.ReadAll(io.LimitReader(readmeFile, MaxReadmePreviewBytes))
if err != nil {
log.Errorf("Reading Readme: %s", err.Error())
return nil, err
}

if len(ds.Readme.ScriptBytes) == MaxReadmePreviewBytes {
ds.Readme.ScriptBytes = append(ds.Readme.ScriptBytes, []byte(`...`)...)
}
ds.Readme.SetScriptFile(nil)
}
}

if err = ds.OpenBodyFile(ctx, r.Store()); err != nil {
log.Errorf("CreatePreview opening body file: %s", err.Error())
return nil, err
Expand Down
139 changes: 139 additions & 0 deletions base/preview_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
package base

import (
"context"
"encoding/json"
"testing"
"time"

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/qri-io/dataset"
"github.com/qri-io/qri/base/dsfs"
)

// TestCreatePreview checks that CreatePreview assembles the expected preview
// for a dataset whose readme exceeds MaxReadmePreviewBytes (truncated with a
// trailing ellipsis) and for one with a short readme (kept whole).
func TestCreatePreview(t *testing.T) {

	// stub the commit timestamp so saved dataset hashes are deterministic
	prevTs := dsfs.Timestamp
	dsfs.Timestamp = func() time.Time { return new(time.Time).In(time.UTC) }
	defer func() { dsfs.Timestamp = prevTs }()

	r := newTestRepo(t)
	turnstileRef := addTurnstileDataset(t, r)
	ctx := context.Background()

	got, err := CreatePreview(ctx, r, turnstileRef)
	if err != nil {
		t.Fatal(err)
	}

	// the stubbed dsfs.Timestamp above always returns the zero time, so that
	// is what saved commits carry. the previous
	// time.Parse(time.RFC3339, "0001-01-01 00:00:00 +0000 UTC") call only
	// yielded this value by accident: the layout mismatch made the parse fail
	// (error discarded) and fall back to the zero time.
	ts := time.Time{}

	expect := &dataset.Dataset{
		Qri:      "ds:0",
		Peername: "peer",
		Name:     "turnstile_daily_counts_2020",
		Path:     "/map/QmXrDtzEV7JXSZogXAqsmcj3497nZWRGMyJzEe1tmYV1cd",
		Commit: &dataset.Commit{
			Message:   "created dataset",
			Path:      "/map/QmbuPg3d9Nguze3uwEpGEtgYBZNoRVz4UQewwr3HcXTadu",
			Qri:       "cm:0",
			Signature: "Wj+Q8k+XVYSRc2kRNxfv1d6zJ/8Q+atH3bxOeQH/rYICovHI2D2OqUvI7Oaag4ka9f7vdjxnargmADDl3EuMUlx6vHsWbX64pQ2uMSOM7jya6T7o7URR9vyesko1rVTb8xVyDbZEcDY3+2hf2ZDgVCD5M0WSnqUTGRxT4O1kgOqIPn6GnzudYmNkV/jyi+U/uGzOUM6Au92gysc+vfIsXxgAYuJv3NVrHNYjI504L15nBAfnHPOYfWaUjBtJiyUN36auvP43+/aOxO/O8iK3TfPepO6ne+DMmSXymvqrbcBuuOQLUu8aOO7Z6YTDnUU/bl+9z349CxjhJ1nne8V3SA==",
			Timestamp: ts,
			Title:     "update data for week ending April 18, 2020",
		},
		Meta: &dataset.Meta{
			Description: "NYC Subway Turnstile Counts Data aggregated by day and station complex for the year 2020. Updated weekly.",
			Qri:         "md:0",
			Title:       "Turnstile Daily Counts 2020",
		},
		Readme: &dataset.Readme{
			Qri:        "rm:0",
			ScriptPath: "/map/QmQ93yKwktz778AiTjYPKwj1qqbvHDsWpYVff3Eicqn6Z5",
			// long readme: expect the first MaxReadmePreviewBytes bytes plus "..."
			ScriptBytes: []byte(`# nyc-transit-data/turnstile_daily_counts_2020
NYC Subway Turnstile Counts Data aggregated by day and station complex for the year 2020. Updated weekly.
## Where the Data Came From
This aggregation was created from weekly raw turnstile counts published by the New York MTA at [http://web.mta.info/developers/turnstile.html](http://web.mta.info/developers/turnstile.html)
The raw data were imported into a postgresql database for processing, and aggregated to calendar days for each station complex.
The process is outlined in [this blog post](https://medium.com/qri-io/taming-the-mtas-unruly-turnstile-data-c945f5f96ba0), and the code for the data pipeline is [available on github](https://github.com/qri-io/data-stories-scripts/tree/master/nyc-turnstile-counts).
## Caveats
This aggregation is a best-effort to make a clean and usable dataset of station-level counts. There were some assumptions and important decisions made to arrive at the finished product.
- The dataset excludes tur...`),
		},
		Transform: &dataset.Transform{
			Qri:        "tf:0",
			ScriptPath: "/map/QmXSce6KDQHLvi4AKDU8z7s4ouynKXKpD6TY7wJgF6reWM",
		},
		Structure: &dataset.Structure{
			Checksum: "QmTgK2uYPscacJ9KaBS8tryXRF5mvjuRbubF7h9bG2GgoN",
			Depth:    1,
			Format:   "json",
			Length:   2,
			Qri:      "st:0",
			Schema:   map[string]interface{}{"type": string("array")},
		},
		Body:     json.RawMessage(`[]`),
		BodyPath: "/map/QmTgK2uYPscacJ9KaBS8tryXRF5mvjuRbubF7h9bG2GgoN",
	}

	if diff := cmp.Diff(expect, got, cmpopts.IgnoreUnexported(dataset.Dataset{}, dataset.Meta{}, dataset.Readme{}, dataset.Transform{})); diff != "" {
		t.Errorf("result mismatch. (-want +got):\n%s", diff)
	}

	nowTfRef := addNowTransformDataset(t, r)

	got, err = CreatePreview(ctx, r, nowTfRef)
	if err != nil {
		t.Fatal(err)
	}

	expect = &dataset.Dataset{
		Qri:      "ds:0",
		Peername: "peer",
		Name:     "now_tf",
		Path:     "/map/QmShMXWEJ56XyiRUWk8q7Nvzphk7n7Jm7hg32Uf92S6yfq",
		Commit: &dataset.Commit{
			Message:   "created dataset",
			Path:      "/map/QmQ5gzPfZgw1PaSSS7hfzy3sp597pKfWqspsHwsdsR5DEG",
			Qri:       "cm:0",
			Signature: "Wj+Q8k+XVYSRc2kRNxfv1d6zJ/8Q+atH3bxOeQH/rYICovHI2D2OqUvI7Oaag4ka9f7vdjxnargmADDl3EuMUlx6vHsWbX64pQ2uMSOM7jya6T7o7URR9vyesko1rVTb8xVyDbZEcDY3+2hf2ZDgVCD5M0WSnqUTGRxT4O1kgOqIPn6GnzudYmNkV/jyi+U/uGzOUM6Au92gysc+vfIsXxgAYuJv3NVrHNYjI504L15nBAfnHPOYfWaUjBtJiyUN36auvP43+/aOxO/O8iK3TfPepO6ne+DMmSXymvqrbcBuuOQLUu8aOO7Z6YTDnUU/bl+9z349CxjhJ1nne8V3SA==",
			Timestamp: ts,
			Title:     "created dataset",
		},
		Meta: &dataset.Meta{
			Qri:   "md:0",
			Title: "example transform",
		},
		Readme: &dataset.Readme{
			Qri:        "rm:0",
			ScriptPath: "/map/QmfTcGiaJqhddaEGebrfAWH25YZkpPL7MMTC9swzNnb1FS",
			// short readme: expect the full text, no truncation
			ScriptBytes: []byte("# Oh hey there!\nI'm a readme! hello!\n"),
		},
		Transform: &dataset.Transform{
			Qri:        "tf:0",
			ScriptPath: "/map/QmXSce6KDQHLvi4AKDU8z7s4ouynKXKpD6TY7wJgF6reWM",
		},
		Structure: &dataset.Structure{
			Checksum: "QmTgK2uYPscacJ9KaBS8tryXRF5mvjuRbubF7h9bG2GgoN",
			Depth:    1,
			Format:   "json",
			Length:   2,
			Qri:      "st:0",
			Schema:   map[string]interface{}{"type": string("array")},
		},
		Body:     json.RawMessage(`[]`),
		BodyPath: "/map/QmTgK2uYPscacJ9KaBS8tryXRF5mvjuRbubF7h9bG2GgoN",
	}

	if diff := cmp.Diff(expect, got, cmpopts.IgnoreUnexported(dataset.Dataset{}, dataset.Meta{}, dataset.Readme{}, dataset.Transform{})); diff != "" {
		t.Errorf("result mismatch. (-want +got):\n%s", diff)
	}
}
Loading

0 comments on commit 3303428

Please sign in to comment.