Skip to content

Commit

Permalink
feat(preview): preview subcommand
Browse files Browse the repository at this point in the history
Previews already exist on the API and in cloud contexts; this adds the preview subcommand to the CLI.
  • Loading branch information
b5 committed Jul 25, 2020
1 parent bc87f00 commit 3303428
Show file tree
Hide file tree
Showing 13 changed files with 461 additions and 21 deletions.
71 changes: 71 additions & 0 deletions base/base_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ func addNowTransformDataset(t *testing.T, r repo.Repo) dsref.Ref {
Schema: dataset.BaseSchemaArray,
},
Transform: &dataset.Transform{},
Readme: &dataset.Readme{},
}

script := `
Expand All @@ -157,6 +158,76 @@ def transform(ds, ctx):
ds.Transform.SetScriptFile(qfs.NewMemfileBytes("transform.star", []byte(script)))
ds.SetBodyFile(qfs.NewMemfileBytes("data.json", []byte("[]")))

readme := "# Oh hey there!\nI'm a readme! hello!\n"
ds.Readme.SetScriptFile(qfs.NewMemfileBytes("readme.md", []byte(readme)))

saved, err := CreateDataset(ctx, r, r.Filesystem().DefaultWriteFS(), ds, nil, SaveSwitches{Pin: true, ShouldRender: true})
if err != nil {
t.Fatal(err.Error())
}
return dsref.ConvertDatasetToVersionInfo(saved).SimpleRef()
}

func addTurnstileDataset(t *testing.T, r repo.Repo) dsref.Ref {
ctx := context.Background()

ds := &dataset.Dataset{
Name: "turnstile_daily_counts_2020",
Peername: "peer",
Commit: &dataset.Commit{
Title: "update data for week ending April 18, 2020",
},
Meta: &dataset.Meta{
Title: "Turnstile Daily Counts 2020",
Description: "NYC Subway Turnstile Counts Data aggregated by day and station complex for the year 2020. Updated weekly.",
},
Structure: &dataset.Structure{
Format: "json",
Schema: dataset.BaseSchemaArray,
},
Transform: &dataset.Transform{},
Readme: &dataset.Readme{},
}

script := `
load("time.star", "time")
def transform(ds, ctx):
ds.set_body([str(time.now())])`
ds.Transform.SetScriptFile(qfs.NewMemfileBytes("transform.star", []byte(script)))
ds.SetBodyFile(qfs.NewMemfileBytes("data.json", []byte("[]")))

readme := `# nyc-transit-data/turnstile_daily_counts_2020
NYC Subway Turnstile Counts Data aggregated by day and station complex for the year 2020. Updated weekly.
## Where the Data Came From
This aggregation was created from weekly raw turnstile counts published by the New York MTA at [http://web.mta.info/developers/turnstile.html](http://web.mta.info/developers/turnstile.html)
The raw data were imported into a postgresql database for processing, and aggregated to calendar days for each station complex.
The process is outlined in [this blog post](https://medium.com/qri-io/taming-the-mtas-unruly-turnstile-data-c945f5f96ba0), and the code for the data pipeline is [available on github](https://github.com/qri-io/data-stories-scripts/tree/master/nyc-turnstile-counts).
## Caveats
This aggregation is a best-effort to make a clean and usable dataset of station-level counts. There were some assumptions and important decisions made to arrive at the finished product.
- The dataset excludes turnstile observation windows (4 hours) that resulted in entries or exits of over 10,000. This threshold excludes the obviously spurious numbers that come from the counters rolling over, but could include false readings that are within the threshold.
- The turnstile counts were aggregated to calendar day using the timestamp of the *end* of the 4-hour observation window + 2 hours. An observation window that ends at 2am would count for the same day, but a window ending between midnight and 1:59am would count for the previous day.
- The last date in the dataset contains a small number of entries and exits that will be aggregated into the next week's worth of data, and should not be used.
## PATH and Roosevelt Island Tramway
The dataset also includes turnstile counts for the PATH train system and the Roosevelt Island Tramway
## Spurious Data in early versions
Versions prior to QmPkGqJ318gcok69Noj3gw3coby8FDrab3x1hBisFcU3Yq were built with a pipeline that had a major error, causing inaccurate numbers near the transition between weekly input files.`
ds.Readme.SetScriptFile(qfs.NewMemfileBytes("readme.md", []byte(readme)))

ref, err := CreateDataset(ctx, r, r.Filesystem().DefaultWriteFS(), ds, nil, SaveSwitches{Pin: true, ShouldRender: true})
if err != nil {
t.Fatal(err.Error())
Expand Down
37 changes: 22 additions & 15 deletions base/dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,21 +70,8 @@ func OpenDataset(ctx context.Context, fsys qfs.Filesystem, ds *dataset.Dataset)
}
}

if ds.Readme != nil && ds.Readme.ScriptFile() == nil {
readmeTimeoutCtx, cancel := context.WithTimeout(ctx, OpenFileTimeoutDuration)
defer cancel()

if err = ds.Readme.OpenScriptFile(readmeTimeoutCtx, fsys); err != nil {
if errors.Is(err, context.DeadlineExceeded) {
err = nil
} else if strings.Contains(err.Error(), "not found") {
log.Debug("skipping not-found readme script")
err = nil
} else {
log.Debug(err)
return err
}
}
if err = openReadme(ctx, fsys, ds); err != nil {
return err
}

if ds.Viz != nil && ds.Viz.RenderedFile() == nil {
Expand All @@ -110,6 +97,26 @@ func isMerkleDagError(err error) bool {
return err.Error() == "merkledag: not found"
}

// openReadme ensures the readme script file attached to ds is open, loading
// it from fsys within OpenFileTimeoutDuration. Opening the readme is
// best-effort: timeouts and not-found errors are swallowed and the readme is
// simply skipped; any other error is returned. A nil Readme component or an
// already-open script file is a no-op.
func openReadme(ctx context.Context, fsys qfs.Filesystem, ds *dataset.Dataset) error {
	if ds.Readme == nil || ds.Readme.ScriptFile() != nil {
		return nil
	}

	readmeTimeoutCtx, cancel := context.WithTimeout(ctx, OpenFileTimeoutDuration)
	defer cancel()

	err := ds.Readme.OpenScriptFile(readmeTimeoutCtx, fsys)
	switch {
	case err == nil:
		return nil
	case errors.Is(err, context.DeadlineExceeded):
		// a slow store must not fail the whole dataset open
		return nil
	case strings.Contains(err.Error(), "not found"):
		log.Debug("skipping not-found readme script")
		return nil
	default:
		log.Debug(err)
		return err
	}
}

// CloseDataset ensures all open dataset files are closed
func CloseDataset(ds *dataset.Dataset) (err error) {
if ds.BodyFile() != nil {
Expand Down
1 change: 1 addition & 0 deletions base/dsfs/dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -646,6 +646,7 @@ func generateCommitDescriptions(store cafs.Filestore, prev, ds *dataset.Dataset,
err = ds.Readme.OpenScriptFile(ctx, fs)
if err != nil {
log.Errorf("ds.Readme.ScriptPath %q open err: %s", ds.Readme.ScriptPath, err)
err = nil
} else {
tfFile := ds.Readme.ScriptFile()
ds.Readme.ScriptBytes, err = ioutil.ReadAll(tfFile)
Expand Down
33 changes: 30 additions & 3 deletions base/preview.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,23 @@ import (
"context"
"encoding/json"
"fmt"
"io"
"io/ioutil"

"github.com/qri-io/dataset"
"github.com/qri-io/qri/base/dsfs"
"github.com/qri-io/qri/dsref"
"github.com/qri-io/qri/repo"
)

// MaxNumDatasetRowsInPreview is the highest number of rows a dataset preview
// can contain
const MaxNumDatasetRowsInPreview = 100
const (
	// MaxNumDatasetRowsInPreview is the highest number of rows a dataset
	// preview can contain.
	MaxNumDatasetRowsInPreview = 100
	// MaxReadmePreviewBytes determines the maximum size in bytes of a readme
	// preview: three bytes less than 1000 to make room for an ellipsis.
	MaxReadmePreviewBytes = 997
)

// CreatePreview generates a preview for a dataset version
func CreatePreview(ctx context.Context, r repo.Repo, ref dsref.Ref) (ds *dataset.Dataset, err error) {
Expand All @@ -27,6 +34,26 @@ func CreatePreview(ctx context.Context, r repo.Repo, ref dsref.Ref) (ds *dataset
return nil, err
}

if ds.Readme != nil {
if err := openReadme(ctx, r.Filesystem(), ds); err != nil {
log.Errorf("OpeningReadme: %s", err.Error())
return nil, err
}

if readmeFile := ds.Readme.ScriptFile(); readmeFile != nil {
ds.Readme.ScriptBytes, err = ioutil.ReadAll(io.LimitReader(readmeFile, MaxReadmePreviewBytes))
if err != nil {
log.Errorf("Reading Readme: %s", err.Error())
return nil, err
}

if len(ds.Readme.ScriptBytes) == MaxReadmePreviewBytes {
ds.Readme.ScriptBytes = append(ds.Readme.ScriptBytes, []byte(`...`)...)
}
ds.Readme.SetScriptFile(nil)
}
}

if err = ds.OpenBodyFile(ctx, r.Store()); err != nil {
log.Errorf("CreatePreview opening body file: %s", err.Error())
return nil, err
Expand Down
139 changes: 139 additions & 0 deletions base/preview_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
package base

import (
"context"
"encoding/json"
"testing"
"time"

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/qri-io/dataset"
"github.com/qri-io/qri/base/dsfs"
)

// TestCreatePreview checks that CreatePreview assembles the expected preview
// for a dataset whose readme exceeds MaxReadmePreviewBytes (truncated with a
// trailing ellipsis) and for one with a short readme (kept whole).
func TestCreatePreview(t *testing.T) {

	// stub the commit timestamp so saved dataset hashes are deterministic
	prevTs := dsfs.Timestamp
	dsfs.Timestamp = func() time.Time { return new(time.Time).In(time.UTC) }
	defer func() { dsfs.Timestamp = prevTs }()

	r := newTestRepo(t)
	turnstileRef := addTurnstileDataset(t, r)
	ctx := context.Background()

	got, err := CreatePreview(ctx, r, turnstileRef)
	if err != nil {
		t.Fatal(err)
	}

	// the stubbed dsfs.Timestamp above always returns the zero time, so that
	// is what saved commits carry. the previous
	// time.Parse(time.RFC3339, "0001-01-01 00:00:00 +0000 UTC") call only
	// yielded this value by accident: the layout mismatch made the parse fail
	// (error discarded) and fall back to the zero time.
	ts := time.Time{}

	expect := &dataset.Dataset{
		Qri:      "ds:0",
		Peername: "peer",
		Name:     "turnstile_daily_counts_2020",
		Path:     "/map/QmXrDtzEV7JXSZogXAqsmcj3497nZWRGMyJzEe1tmYV1cd",
		Commit: &dataset.Commit{
			Message:   "created dataset",
			Path:      "/map/QmbuPg3d9Nguze3uwEpGEtgYBZNoRVz4UQewwr3HcXTadu",
			Qri:       "cm:0",
			Signature: "Wj+Q8k+XVYSRc2kRNxfv1d6zJ/8Q+atH3bxOeQH/rYICovHI2D2OqUvI7Oaag4ka9f7vdjxnargmADDl3EuMUlx6vHsWbX64pQ2uMSOM7jya6T7o7URR9vyesko1rVTb8xVyDbZEcDY3+2hf2ZDgVCD5M0WSnqUTGRxT4O1kgOqIPn6GnzudYmNkV/jyi+U/uGzOUM6Au92gysc+vfIsXxgAYuJv3NVrHNYjI504L15nBAfnHPOYfWaUjBtJiyUN36auvP43+/aOxO/O8iK3TfPepO6ne+DMmSXymvqrbcBuuOQLUu8aOO7Z6YTDnUU/bl+9z349CxjhJ1nne8V3SA==",
			Timestamp: ts,
			Title:     "update data for week ending April 18, 2020",
		},
		Meta: &dataset.Meta{
			Description: "NYC Subway Turnstile Counts Data aggregated by day and station complex for the year 2020. Updated weekly.",
			Qri:         "md:0",
			Title:       "Turnstile Daily Counts 2020",
		},
		Readme: &dataset.Readme{
			Qri:        "rm:0",
			ScriptPath: "/map/QmQ93yKwktz778AiTjYPKwj1qqbvHDsWpYVff3Eicqn6Z5",
			// long readme: expect the first MaxReadmePreviewBytes bytes plus "..."
			ScriptBytes: []byte(`# nyc-transit-data/turnstile_daily_counts_2020
NYC Subway Turnstile Counts Data aggregated by day and station complex for the year 2020. Updated weekly.
## Where the Data Came From
This aggregation was created from weekly raw turnstile counts published by the New York MTA at [http://web.mta.info/developers/turnstile.html](http://web.mta.info/developers/turnstile.html)
The raw data were imported into a postgresql database for processing, and aggregated to calendar days for each station complex.
The process is outlined in [this blog post](https://medium.com/qri-io/taming-the-mtas-unruly-turnstile-data-c945f5f96ba0), and the code for the data pipeline is [available on github](https://github.com/qri-io/data-stories-scripts/tree/master/nyc-turnstile-counts).
## Caveats
This aggregation is a best-effort to make a clean and usable dataset of station-level counts. There were some assumptions and important decisions made to arrive at the finished product.
- The dataset excludes tur...`),
		},
		Transform: &dataset.Transform{
			Qri:        "tf:0",
			ScriptPath: "/map/QmXSce6KDQHLvi4AKDU8z7s4ouynKXKpD6TY7wJgF6reWM",
		},
		Structure: &dataset.Structure{
			Checksum: "QmTgK2uYPscacJ9KaBS8tryXRF5mvjuRbubF7h9bG2GgoN",
			Depth:    1,
			Format:   "json",
			Length:   2,
			Qri:      "st:0",
			Schema:   map[string]interface{}{"type": string("array")},
		},
		Body:     json.RawMessage(`[]`),
		BodyPath: "/map/QmTgK2uYPscacJ9KaBS8tryXRF5mvjuRbubF7h9bG2GgoN",
	}

	if diff := cmp.Diff(expect, got, cmpopts.IgnoreUnexported(dataset.Dataset{}, dataset.Meta{}, dataset.Readme{}, dataset.Transform{})); diff != "" {
		t.Errorf("result mismatch. (-want +got):\n%s", diff)
	}

	nowTfRef := addNowTransformDataset(t, r)

	got, err = CreatePreview(ctx, r, nowTfRef)
	if err != nil {
		t.Fatal(err)
	}

	expect = &dataset.Dataset{
		Qri:      "ds:0",
		Peername: "peer",
		Name:     "now_tf",
		Path:     "/map/QmShMXWEJ56XyiRUWk8q7Nvzphk7n7Jm7hg32Uf92S6yfq",
		Commit: &dataset.Commit{
			Message:   "created dataset",
			Path:      "/map/QmQ5gzPfZgw1PaSSS7hfzy3sp597pKfWqspsHwsdsR5DEG",
			Qri:       "cm:0",
			Signature: "Wj+Q8k+XVYSRc2kRNxfv1d6zJ/8Q+atH3bxOeQH/rYICovHI2D2OqUvI7Oaag4ka9f7vdjxnargmADDl3EuMUlx6vHsWbX64pQ2uMSOM7jya6T7o7URR9vyesko1rVTb8xVyDbZEcDY3+2hf2ZDgVCD5M0WSnqUTGRxT4O1kgOqIPn6GnzudYmNkV/jyi+U/uGzOUM6Au92gysc+vfIsXxgAYuJv3NVrHNYjI504L15nBAfnHPOYfWaUjBtJiyUN36auvP43+/aOxO/O8iK3TfPepO6ne+DMmSXymvqrbcBuuOQLUu8aOO7Z6YTDnUU/bl+9z349CxjhJ1nne8V3SA==",
			Timestamp: ts,
			Title:     "created dataset",
		},
		Meta: &dataset.Meta{
			Qri:   "md:0",
			Title: "example transform",
		},
		Readme: &dataset.Readme{
			Qri:        "rm:0",
			ScriptPath: "/map/QmfTcGiaJqhddaEGebrfAWH25YZkpPL7MMTC9swzNnb1FS",
			// short readme: expect the full text, no truncation
			ScriptBytes: []byte("# Oh hey there!\nI'm a readme! hello!\n"),
		},
		Transform: &dataset.Transform{
			Qri:        "tf:0",
			ScriptPath: "/map/QmXSce6KDQHLvi4AKDU8z7s4ouynKXKpD6TY7wJgF6reWM",
		},
		Structure: &dataset.Structure{
			Checksum: "QmTgK2uYPscacJ9KaBS8tryXRF5mvjuRbubF7h9bG2GgoN",
			Depth:    1,
			Format:   "json",
			Length:   2,
			Qri:      "st:0",
			Schema:   map[string]interface{}{"type": string("array")},
		},
		Body:     json.RawMessage(`[]`),
		BodyPath: "/map/QmTgK2uYPscacJ9KaBS8tryXRF5mvjuRbubF7h9bG2GgoN",
	}

	if diff := cmp.Diff(expect, got, cmpopts.IgnoreUnexported(dataset.Dataset{}, dataset.Meta{}, dataset.Readme{}, dataset.Transform{})); diff != "" {
		t.Errorf("result mismatch. (-want +got):\n%s", diff)
	}
}
Loading

0 comments on commit 3303428

Please sign in to comment.