From ca16f3c08d360fd7cbfe1f048456270b2c29d6ca Mon Sep 17 00:00:00 2001 From: Asmir Avdicevic Date: Tue, 24 Nov 2020 00:20:48 +0100 Subject: [PATCH 1/3] feat(api): change report API --- api/api.go | 1 + api/datasets.go | 38 +++ changes/changes.go | 568 +++++++++++++++++++++++++++++++++ changes/changes_test.go | 672 ++++++++++++++++++++++++++++++++++++++++ fsi/status.go | 2 + go.mod | 2 +- go.sum | 2 + lib/changes.go | 49 +++ lib/lib.go | 2 +- 9 files changed, 1334 insertions(+), 2 deletions(-) create mode 100644 changes/changes.go create mode 100644 changes/changes_test.go create mode 100644 lib/changes.go diff --git a/api/api.go b/api/api.go index e2ac706c4..d1198cb44 100644 --- a/api/api.go +++ b/api/api.go @@ -167,6 +167,7 @@ func NewServerRoutes(s Server) *http.ServeMux { m.Handle("/get/", s.middleware(dsh.GetHandler)) m.Handle("/rename", s.middleware(dsh.RenameHandler)) m.Handle("/diff", s.middleware(dsh.DiffHandler)) + m.Handle("/changes", s.middleware(dsh.ChangesHandler)) // Deprecated, use /get/username/name?component=body or /get/username/name/body.csv m.Handle("/body/", s.middleware(dsh.BodyHandler)) m.Handle("/stats/", s.middleware(dsh.StatsHandler)) diff --git a/api/datasets.go b/api/datasets.go index d5eb71d84..730a38740 100644 --- a/api/datasets.go +++ b/api/datasets.go @@ -97,6 +97,20 @@ func (h *DatasetHandlers) DiffHandler(w http.ResponseWriter, r *http.Request) { } } +// ChangesHandler is a dataset single endpoint +func (h *DatasetHandlers) ChangesHandler(w http.ResponseWriter, r *http.Request) { + switch r.Method { + case http.MethodPost, http.MethodGet: + if h.ReadOnly { + readOnlyResponse(w, "/changereport") + return + } + h.changesHandler(w, r) + default: + util.NotFoundHandler(w, r) + } +} + // PeerListHandler is a dataset list endpoint func (h *DatasetHandlers) PeerListHandler(w http.ResponseWriter, r *http.Request) { switch r.Method { @@ -340,6 +354,30 @@ func (h *DatasetHandlers) diffHandler(w http.ResponseWriter, r *http.Request) { util.WritePageResponse(w, res, r, util.Page{}) } +func (h *DatasetHandlers) changesHandler(w http.ResponseWriter, r *http.Request) { + req := &lib.ChangeReportParams{} + switch r.Header.Get("Content-Type") { + case "application/json": + if err := json.NewDecoder(r.Body).Decode(req); err != nil { + util.WriteErrResponse(w, http.StatusBadRequest, fmt.Errorf("error decoding body into params: %s", err.Error())) + return + } + default: + req = &lib.ChangeReportParams{ + LeftRefstr: r.FormValue("left_path"), + RightRefstr: r.FormValue("right_path"), + } + } + + res := &lib.ChangeReport{} + if err := h.ChangeReport(req, res); err != nil { + util.WriteErrResponse(w, http.StatusInternalServerError, fmt.Errorf("error generating change report: %s", err.Error())) + return + } + + util.WritePageResponse(w, res, r, util.Page{}) +} + func (h *DatasetHandlers) peerListHandler(w http.ResponseWriter, r *http.Request) { log.Info(r.URL.Path) p := lib.ListParamsFromRequest(r) diff --git a/changes/changes.go b/changes/changes.go new file mode 100644 index 000000000..27c4a3441 --- /dev/null +++ b/changes/changes.go @@ -0,0 +1,568 @@ +package changes + +import ( + "context" + "encoding/json" + "errors" + "fmt" + + golog "github.com/ipfs/go-log" + "github.com/qri-io/dataset" + "github.com/qri-io/dataset/tabular" + "github.com/qri-io/qri/dsref" + qerr "github.com/qri-io/qri/errors" + "github.com/qri-io/qri/fsi" + "github.com/qri-io/qri/stats" +) + +var ( + log = golog.Logger("changes") +) + +// ChangeReportComponent is a generic component used to populate the 
change report +type ChangeReportComponent struct { + Left interface{} `json:"left"` + Right interface{} `json:"right"` + About map[string]interface{} `json:"about,omitempty"` +} + +// ChangeReportDeltaComponent is a subcomponent that can hold +// delta information between left and right +type ChangeReportDeltaComponent struct { + ChangeReportComponent + Title string `json:"title,omitempty"` + Delta interface{} `json:"delta"` +} + +// StatsChangeComponent represents the stats change report +type StatsChangeComponent struct { + Summary *ChangeReportDeltaComponent `json:"summary"` + Columns []*ChangeReportDeltaComponent `json:"columns"` +} + +// ChangeReportResponse is the result of a call to changereport +type ChangeReportResponse struct { + VersionInfo *ChangeReportComponent `json:"version_info,omitempty"` + Commit *ChangeReportComponent `json:"commit,omitempty"` + Meta *ChangeReportComponent `json:"meta,omitempty"` + Readme *ChangeReportComponent `json:"readme,omitempty"` + Structure *ChangeReportComponent `json:"structure,omitempty"` + Transform *ChangeReportComponent `json:"transform,omitempty"` + Stats *StatsChangeComponent `json:"stats,omitempty"` +} + +// StatsChangeSummaryFields represents the stats summary +type StatsChangeSummaryFields struct { + Entries int `json:"entries"` + Columns int `json:"columns"` + // NullValues int `json:"nullValues"` + TotalSize int `json:"totalSize"` +} + +// EmptyObject is used mostly as a placeholder in palces where it is required +// that a key is present in the response even if empty and not be nil +type EmptyObject map[string]interface{} + +// Service can generate a change report between two datasets +type Service struct { + loader dsref.Loader + stats *stats.Service +} + +// New allocates a Change service +func New(loader dsref.Loader, stats *stats.Service) *Service { + return &Service{ + loader: loader, + stats: stats, + } +} + +func (svc *Service) parseColumns(colItems *tabular.Columns, data *dataset.Dataset) (interface{}, error) { + var sErr error + if data.Structure != nil { + *colItems, _, sErr = tabular.ColumnsFromJSONSchema(data.Structure.Schema) + if sErr != nil { + return nil, sErr + } + return StatsChangeSummaryFields{ + Entries: data.Structure.Entries, + Columns: len(*colItems), + TotalSize: data.Structure.Length, + }, nil + } + return EmptyObject{}, nil +} + +// maybeLoadStats attempts to load stats if not alredy present +// errors out if it fails as stats are required and some datasets might not yet have +// a stats component attached to it +func (svc *Service) maybeLoadStats(ctx context.Context, ds *dataset.Dataset) error { + if ds.Stats != nil { + return nil + } + var statsErr error + ds.Stats, statsErr = svc.stats.Stats(ctx, ds) + if statsErr != nil { + return qerr.New(statsErr, "missing stats components") + } + return nil +} + +// parseStats uses json serializing > deserializing to easily parse the stats +// interface as we have little type safety in the dataset.stats component right now +func (svc *Service) parseStats(ds *dataset.Dataset) ([]EmptyObject, error) { + statsStr, err := json.Marshal(ds.Stats.Stats) + if err != nil { + log.Debugf("failed to load stats: %s", err.Error()) + return nil, qerr.New(err, "failed to load stats") + } + stats := []EmptyObject{} + err = json.Unmarshal(statsStr, &stats) + if err != nil { + log.Debugf("failed to parse stats: %s", err.Error()) + return nil, qerr.New(err, "failed to parse stats") + } + + return stats, nil +} + +func (svc *Service) statsDiff(ctx context.Context, leftDs 
*dataset.Dataset, rightDs *dataset.Dataset) (*StatsChangeComponent, error) { + res := &StatsChangeComponent{} + + res.Summary = &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{}, + } + + var leftColItems, rightColItems tabular.Columns + var sErr error + res.Summary.Left, sErr = svc.parseColumns(&leftColItems, leftDs) + if sErr != nil { + return &StatsChangeComponent{}, sErr + } + leftColCount := len(leftColItems) + + res.Summary.Right, sErr = svc.parseColumns(&rightColItems, rightDs) + if sErr != nil { + return &StatsChangeComponent{}, sErr + } + rightColCount := len(rightColItems) + + if leftDs.Structure != nil && rightDs.Structure != nil { + res.Summary.Delta = StatsChangeSummaryFields{ + Entries: res.Summary.Right.(StatsChangeSummaryFields).Entries - res.Summary.Left.(StatsChangeSummaryFields).Entries, + Columns: rightColCount - leftColCount, + TotalSize: res.Summary.Right.(StatsChangeSummaryFields).TotalSize - res.Summary.Left.(StatsChangeSummaryFields).TotalSize, + } + } else if leftDs.Structure != nil { + res.Summary.Delta = StatsChangeSummaryFields{ + Entries: -res.Summary.Left.(StatsChangeSummaryFields).Entries, + Columns: rightColCount - leftColCount, + TotalSize: -res.Summary.Left.(StatsChangeSummaryFields).TotalSize, + } + } else if rightDs.Structure != nil { + res.Summary.Delta = StatsChangeSummaryFields{ + Entries: res.Summary.Right.(StatsChangeSummaryFields).Entries, + Columns: rightColCount - leftColCount, + TotalSize: res.Summary.Right.(StatsChangeSummaryFields).TotalSize, + } + } else { + res.Summary.Delta = StatsChangeSummaryFields{ + Entries: 0, + Columns: 0, + TotalSize: 0, + } + } + + err := svc.maybeLoadStats(ctx, leftDs) + if err != nil { + return nil, err + } + err = svc.maybeLoadStats(ctx, rightDs) + if err != nil { + return nil, err + } + + leftStats, err := svc.parseStats(leftDs) + if err != nil { + return nil, err + } + rightStats, err := svc.parseStats(rightDs) + if err != nil { + return nil, err + } + + res.Columns, err = svc.matchColumns(leftColCount, rightColCount, leftColItems, rightColItems, leftStats, rightStats) + if err != nil { + log.Debugf("failed to calculate stats change report: %s", err.Error()) + return nil, qerr.New(err, "failed to calculate stats change report") + } + + return res, nil +} + +// matchColumns attempts to match up columns from the left and right side based on the column name +// this is not ideal as datasets without a header have generic column names and in case of adding a column +// before the end might shift the alignment and break comparison due to type differences of columns which +// are not properly handled yet +func (svc *Service) matchColumns(leftColCount, rightColCount int, leftColItems, rightColItems tabular.Columns, leftStats, rightStats []EmptyObject) ([]*ChangeReportDeltaComponent, error) { + maxColCount := leftColCount + if rightColCount > maxColCount { + maxColCount = rightColCount + } + + columns := make([]*ChangeReportDeltaComponent, maxColCount) + + type cIndex struct { + LPos int + RPos int + } + + colIndex := map[string]*cIndex{} + for i := 0; i < maxColCount; i++ { + if i < leftColCount { + if c, ok := colIndex[leftColItems[i].Title]; ok && c != nil { + colIndex[leftColItems[i].Title].LPos = i + } else { + colIndex[leftColItems[i].Title] = &cIndex{ + LPos: i, + RPos: -1, + } + } + } + if i < rightColCount { + if c, ok := colIndex[rightColItems[i].Title]; ok && c != nil { + colIndex[rightColItems[i].Title].RPos = i + } else { + colIndex[rightColItems[i].Title] = &cIndex{ + LPos: -1, 
+ RPos: i, + } + } + } + } + + i := 0 + for k, v := range colIndex { + columns[i] = &ChangeReportDeltaComponent{ + Title: k, + } + var lCol, rCol *tabular.Column + if v.LPos >= 0 { + columns[i].Left = leftStats[v.LPos] + lCol = &leftColItems[v.LPos] + } else { + columns[i].Left = EmptyObject{} + } + if v.RPos >= 0 { + columns[i].Right = rightStats[v.RPos] + rCol = &rightColItems[v.RPos] + } else { + columns[i].Right = EmptyObject{} + } + deltaCol, aboutCol, err := svc.columnStatsDelta(columns[i].Left, columns[i].Right, lCol, rCol, v.LPos >= 0, v.RPos >= 0) + if err != nil { + log.Debugf("error calculating stats delta: %s", err.Error()) + return nil, qerr.New(err, fmt.Sprintf("failed to calculate stats column delta for %q", columns[i].Title)) + } + columns[i].Delta = deltaCol + columns[i].About = aboutCol + i++ + } + + return columns, nil +} + +func parseStatsMap(stats interface{}) (map[string]interface{}, error) { + statsMap := map[string]interface{}{} + serialized, err := json.Marshal(stats) + if err != nil { + log.Debugf("error serializing stats") + return nil, err + } + err = json.Unmarshal(serialized, &statsMap) + if err != nil { + log.Debugf("error deserializing stats") + return nil, err + } + return statsMap, nil +} + +func (svc *Service) columnStatsDelta(left, right interface{}, lCol, rCol *tabular.Column, hasLeft, hasRight bool) (map[string]interface{}, map[string]interface{}, error) { + var deltaCol map[string]interface{} + aboutCol := map[string]interface{}{} + var leftStatsMap, rightStatsMap map[string]interface{} + var err error + if hasLeft { + leftStatsMap, err = parseStatsMap(left) + if err != nil { + log.Debugf("error parsing stats map") + return nil, nil, err + } + } + if hasRight { + rightStatsMap, err = parseStatsMap(right) + if err != nil { + log.Debugf("error parsing stats map") + return nil, nil, err + } + } + + //determine shape + if (!hasRight || (hasRight && (rCol.Type.HasType("number") || rCol.Type.HasType("integer")))) && + (!hasLeft || (hasLeft && (lCol.Type.HasType("number") || lCol.Type.HasType("integer")))) { + deltaCol = map[string]interface{}{ + "count": float64(0), + "max": float64(0), + "min": float64(0), + "median": float64(0), + "mean": float64(0), + } + } else if (!hasRight || (hasRight && rCol.Type.HasType("string"))) && (!hasLeft || (hasLeft && lCol.Type.HasType("string"))) { + deltaCol = map[string]interface{}{ + "count": float64(0), + "maxLength": float64(0), + "minLength": float64(0), + "unique": float64(0), + } + } else if (!hasRight || (hasRight && rCol.Type.HasType("boolean"))) && (!hasLeft || (hasLeft && lCol.Type.HasType("boolean"))) { + deltaCol = map[string]interface{}{ + "count": float64(0), + "trueCount": float64(0), + "falseCount": float64(0), + } + } else { + log.Debugf("incompatible column types: %+v / %+v", rCol.Type, lCol.Type) + // TODO(arqu): improve handling of columns with different types + return nil, nil, errors.New("incompatible column types") + } + + // assign values + for k := range deltaCol { + if hasLeft { + if leftStatsMap[k] == nil { + log.Debugf("%s was nil", k) + } else { + deltaCol[k] = deltaCol[k].(float64) - leftStatsMap[k].(float64) + } + } + if hasRight { + if rightStatsMap[k] == nil { + log.Debugf("%s was nil", k) + } else { + deltaCol[k] = deltaCol[k].(float64) + rightStatsMap[k].(float64) + } + } + } + + if hasLeft && !hasRight { + aboutCol["status"] = fsi.STRemoved + } else if !hasLeft && hasRight { + aboutCol["status"] = fsi.STAdd + } else if hasLeft && hasRight { + sum := float64(0) + for k := range deltaCol 
{ + sum += deltaCol[k].(float64) + } + if sum == 0 { + aboutCol["status"] = fsi.STUnmodified + } else { + aboutCol["status"] = fsi.STChange + } + } else { + aboutCol["status"] = fsi.STMissing + } + + return deltaCol, aboutCol, nil +} + +// Report computes the change report of two sources +// This takes some assumptions - we work only with tabular data, with header rows and functional structure.json +func (svc *Service) Report(ctx context.Context, leftRef, rightRef dsref.Ref, loadSource string) (*ChangeReportResponse, error) { + leftDs, err := svc.loader.LoadDataset(ctx, leftRef, loadSource) + if err != nil { + return nil, err + } + if rightRef.Path == "" { + rightRef.Path = leftDs.PreviousPath + } + rightDs, err := svc.loader.LoadDataset(ctx, rightRef, loadSource) + if err != nil { + return nil, err + } + + res := &ChangeReportResponse{} + + leftVi := dsref.ConvertDatasetToVersionInfo(leftDs) + rightVi := dsref.ConvertDatasetToVersionInfo(rightDs) + + res.VersionInfo = &ChangeReportComponent{} + res.VersionInfo.Left = leftVi + res.VersionInfo.Right = rightVi + res.VersionInfo.About = EmptyObject{} + + if leftVi.Path == rightVi.Path { + res.VersionInfo.About["status"] = fsi.STUnmodified + } else { + res.VersionInfo.About["status"] = fsi.STChange + } + + if leftDs.Commit != nil || rightDs.Commit != nil { + res.Commit = &ChangeReportComponent{} + if leftDs.Commit != nil { + res.Commit.Left = leftDs.Commit + } else { + res.Commit.Left = EmptyObject{} + } + if rightDs.Commit != nil { + res.Commit.Right = rightDs.Commit + } else { + res.Commit.Right = EmptyObject{} + } + res.Commit.About = EmptyObject{} + + if leftDs.Commit != nil && rightDs.Commit == nil { + res.Commit.About["status"] = fsi.STRemoved + } else if leftDs.Commit == nil && rightDs.Commit != nil { + res.Commit.About["status"] = fsi.STAdd + } else if leftDs.Commit != nil && rightDs.Commit != nil { + if leftDs.Commit.Path == rightDs.Commit.Path { + res.Commit.About["status"] = fsi.STUnmodified + } else { + res.Commit.About["status"] = fsi.STChange + } + } else { + res.Commit.About["status"] = fsi.STMissing + } + } + + if leftDs.Meta != nil || rightDs.Meta != nil { + res.Meta = &ChangeReportComponent{} + hasLeftMeta := leftDs.Meta != nil && !leftDs.Meta.IsEmpty() + hasRightMeta := rightDs.Meta != nil && !rightDs.Meta.IsEmpty() + + if hasLeftMeta { + res.Meta.Left = leftDs.Meta + } else { + res.Meta.Left = EmptyObject{} + } + if hasRightMeta { + res.Meta.Right = rightDs.Meta + } else { + res.Meta.Right = EmptyObject{} + } + res.Meta.About = EmptyObject{} + + if hasLeftMeta && !hasRightMeta { + res.Meta.About["status"] = fsi.STRemoved + } else if !hasLeftMeta && hasRightMeta { + res.Meta.About["status"] = fsi.STAdd + } else if hasLeftMeta && hasRightMeta { + if leftDs.Meta.Path == rightDs.Meta.Path { + res.Meta.About["status"] = fsi.STUnmodified + } else { + res.Meta.About["status"] = fsi.STChange + } + } else { + res.Meta.About["status"] = fsi.STMissing + } + } + + if leftDs.Readme != nil || rightDs.Readme != nil { + res.Readme = &ChangeReportComponent{} + if leftDs.Readme != nil { + res.Readme.Left = string(leftDs.Readme.ScriptBytes) + } else { + res.Readme.Left = "" + } + if rightDs.Readme != nil { + res.Readme.Right = string(rightDs.Readme.ScriptBytes) + } else { + res.Readme.Right = "" + } + res.Readme.About = EmptyObject{} + + if res.Readme.Left != "" && res.Readme.Right == "" { + res.Readme.About["status"] = fsi.STRemoved + } else if res.Readme.Left == "" && res.Readme.Right != "" { + res.Readme.About["status"] = fsi.STAdd + } 
else if res.Readme.Left != "" && res.Readme.Right != "" { + if res.Readme.Left == res.Readme.Right { + res.Readme.About["status"] = fsi.STUnmodified + } else { + res.Readme.About["status"] = fsi.STChange + } + } else { + res.Readme.About["status"] = fsi.STMissing + } + } + + if leftDs.Structure != nil || rightDs.Structure != nil { + res.Structure = &ChangeReportComponent{} + if leftDs.Structure != nil { + if leftDs.Structure.Format != "csv" { + return nil, errors.New("changes are only supported for CSV datasets") + } + res.Structure.Left = leftDs.Structure + } else { + res.Structure.Left = EmptyObject{} + } + if rightDs.Structure != nil { + if rightDs.Structure.Format != "csv" { + return nil, errors.New("changes are only supported for CSV datasets") + } + res.Structure.Right = rightDs.Structure + } else { + res.Structure.Right = EmptyObject{} + } + res.Structure.About = EmptyObject{} + + if leftDs.Structure != nil && rightDs.Structure == nil { + res.Structure.About["status"] = fsi.STRemoved + } else if leftDs.Structure == nil && rightDs.Structure != nil { + res.Structure.About["status"] = fsi.STAdd + } else if leftDs.Structure != nil && rightDs.Structure != nil { + if leftDs.Structure.Path == rightDs.Structure.Path { + res.Structure.About["status"] = fsi.STUnmodified + } else { + res.Structure.About["status"] = fsi.STChange + } + } else { + res.Structure.About["status"] = fsi.STMissing + } + } + + if leftDs.Transform != nil || rightDs.Transform != nil { + res.Transform = &ChangeReportComponent{} + if leftDs.Transform != nil { + res.Transform.Left = string(leftDs.Transform.ScriptBytes) + } else { + res.Transform.Left = "" + } + if rightDs.Transform != nil { + res.Transform.Right = string(rightDs.Transform.ScriptBytes) + } else { + res.Transform.Right = "" + } + res.Transform.About = EmptyObject{} + + if res.Transform.Left != "" && res.Transform.Right == "" { + res.Transform.About["status"] = fsi.STRemoved + } else if res.Transform.Left == "" && res.Transform.Right != "" { + res.Transform.About["status"] = fsi.STAdd + } else if res.Transform.Left != "" && res.Transform.Right != "" { + if res.Transform.Left == res.Transform.Right { + res.Transform.About["status"] = fsi.STUnmodified + } else { + res.Transform.About["status"] = fsi.STChange + } + } else { + res.Transform.About["status"] = fsi.STMissing + } + } + + res.Stats, err = svc.statsDiff(ctx, leftDs, rightDs) + if err != nil { + return nil, err + } + return res, nil +} diff --git a/changes/changes_test.go b/changes/changes_test.go new file mode 100644 index 000000000..c43407256 --- /dev/null +++ b/changes/changes_test.go @@ -0,0 +1,672 @@ +package changes + +import ( + "context" + "io/ioutil" + "os" + "sort" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/qri-io/dataset" + "github.com/qri-io/dataset/tabular" + "github.com/qri-io/qfs" + "github.com/qri-io/qri/base" + "github.com/qri-io/qri/base/dsfs" + "github.com/qri-io/qri/dsref" + "github.com/qri-io/qri/fsi" + "github.com/qri-io/qri/repo" + repotest "github.com/qri-io/qri/repo/test" + "github.com/qri-io/qri/stats" +) + +func newTestService(t *testing.T, r repo.Repo, workDir string) *Service { + cache, err := stats.NewLocalCache(workDir, 1000<<8) + if err != nil { + t.Fatal(err) + } + statsSvc := stats.New(cache) + loader := base.NewLocalDatasetLoader(r.Filesystem()) + + return New(loader, statsSvc) +} + +func updateDataset(t *testing.T, r repo.Repo, ds *dataset.Dataset, newBody string) dsref.Ref { + ctx := context.Background() + currRef := 
dsref.ConvertDatasetToVersionInfo(ds).SimpleRef() + + ds.SetBodyFile(qfs.NewMemfileBytes("body.csv", []byte(newBody))) + ds.PreviousPath = currRef.Path + + // force recalculate structure as that is what we rely on for the change reports + ds.Structure = nil + if err := base.InferStructure(ds); err != nil { + t.Fatal(err.Error()) + } + + res, err := base.CreateDataset(ctx, r, r.Filesystem().DefaultWriteFS(), ds, nil, dsfs.SaveSwitches{Pin: true, ShouldRender: true}) + if err != nil { + t.Fatal(err.Error()) + } + return dsref.ConvertDatasetToVersionInfo(res).SimpleRef() +} + +func getBaseCols() []*ChangeReportDeltaComponent { + return []*ChangeReportDeltaComponent{ + &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{ + Left: EmptyObject{ + "count": float64(5), + "max": float64(65.25), + "min": float64(44.4), + "mean": float64(52.04), + "median": float64(50.65), + "histogram": map[string]interface{}{ + "bins": []interface{}{ + float64(44.4), + float64(50.65), + float64(55.5), + float64(65.25), + float64(66.25), + }, + "frequencies": []interface{}{ + float64(2), + float64(1), + float64(1), + float64(1), + }, + }, + "type": "numeric", + }, + Right: EmptyObject{ + "count": float64(5), + "max": float64(5000.65), + "min": float64(44), + "mean": float64(1238.06), + "median": float64(440.4), + "histogram": map[string]interface{}{ + "bins": []interface{}{ + float64(44), + float64(55), + float64(440.4), + float64(650.25), + float64(5000.65), + float64(5001.65), + }, + "frequencies": []interface{}{ + float64(1), + float64(1), + float64(1), + float64(1), + float64(1), + }, + }, + "type": "numeric", + }, + About: map[string]interface{}{ + "status": fsi.STChange, + }, + }, + Title: "avg_age", + Delta: map[string]interface{}{ + "count": float64(0), + "max": float64(4935.4), + "mean": float64(1186.02), + "median": float64(389.75), + "min": float64(-0.3999999999999986), + }, + }, + &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{ + Left: EmptyObject{ + "count": float64(5), + "maxLength": float64(8), + "minLength": float64(7), + "unique": float64(5), + "frequencies": map[string]interface{}{ + "chatham": float64(1), + "chicago": float64(1), + "new york": float64(1), + "raleigh": float64(1), + "toronto": float64(1), + }, + "type": "string", + }, + Right: EmptyObject{ + "count": float64(5), + "maxLength": float64(8), + "minLength": float64(7), + "unique": float64(5), + "frequencies": map[string]interface{}{ + "chatham": float64(1), + "chicago": float64(1), + "new york": float64(1), + "raleigh": float64(1), + "toronto": float64(1), + }, + "type": "string", + }, + About: map[string]interface{}{ + "status": fsi.STUnmodified, + }, + }, + Title: "city", + Delta: map[string]interface{}{ + "count": float64(0), + "maxLength": float64(0), + "minLength": float64(0), + "unique": float64(0), + }, + }, + &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{ + Left: EmptyObject{ + "count": float64(5), + "falseCount": float64(1), + "trueCount": float64(4), + "type": "boolean", + }, + Right: EmptyObject{ + "count": float64(5), + "falseCount": float64(5), + "trueCount": float64(0), + "type": "boolean", + }, + About: map[string]interface{}{ + "status": fsi.STUnmodified, + }, + }, + Title: "in_usa", + Delta: map[string]interface{}{ + "count": float64(0), + "falseCount": float64(4), + "trueCount": float64(-4), + }, + }, + &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{ + Left: EmptyObject{ + "count": float64(5), + "max": 
float64(40000000), + "min": float64(35000), + "mean": float64(9817000), + "median": float64(300000), + "histogram": map[string]interface{}{ + "bins": []interface{}{ + float64(35000), + float64(250000), + float64(300000), + float64(8500000), + float64(40000000), + float64(40000001), + }, + "frequencies": []interface{}{ + float64(1), + float64(1), + float64(1), + float64(1), + float64(1), + }, + }, + "type": "numeric", + }, + Right: EmptyObject{ + "count": float64(5), + "max": float64(4000000), + "min": float64(3500), + "mean": float64(981700), + "median": float64(30000), + "histogram": map[string]interface{}{ + "bins": []interface{}{ + float64(3500), + float64(25000), + float64(30000), + float64(850000), + float64(4000000), + float64(4000001), + }, + "frequencies": []interface{}{ + float64(1), + float64(1), + float64(1), + float64(1), + float64(1), + }, + }, + "type": "numeric", + }, + About: map[string]interface{}{ + "status": fsi.STChange, + }, + }, + Title: "pop", + Delta: map[string]interface{}{ + "count": float64(0), + "max": float64(-36000000), + "mean": float64(-8835300), + "median": float64(-270000), + "min": float64(-31500), + }, + }, + } +} + +func TestStatsDiff(t *testing.T) { + ctx := context.Background() + + workDir, err := ioutil.TempDir("", "qri_test_changes_service") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(workDir) + + mr, err := repotest.NewTestRepo() + if err != nil { + t.Fatal(err) + } + + svc := newTestService(t, mr, workDir) + + ref := dsref.MustParse("peer/cities") + if _, err := mr.ResolveRef(ctx, &ref); err != nil { + t.Fatal(err) + } + + ds, err := dsfs.LoadDataset(ctx, mr.Filesystem(), ref.Path) + if err != nil { + t.Fatal(err) + } + if err = base.OpenDataset(ctx, mr.Filesystem(), ds); err != nil { + t.Fatal(err) + } + + ds.Name = "cities" + leftDs := *ds + + // alter body file + const alteredBodyData = `city,pop,avg_age,in_usa +toronto,4000000,55.0,false +new york,850000,44.0,false +chicago,30000,440.4,false +chatham,3500,650.25,false +raleigh,25000,5000.65,false` + + updateDataset(t, mr, ds, alteredBodyData) + + res, err := svc.statsDiff(ctx, &leftDs, ds) + if err != nil { + t.Fatal(err) + } + + // output order is not strict so we need to enfore it here + sort.SliceStable(res.Columns, func(i, j int) bool { + return res.Columns[i].Title < res.Columns[j].Title + }) + + expect := &StatsChangeComponent{ + Summary: &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{ + Left: StatsChangeSummaryFields{Entries: 5, Columns: 4, TotalSize: 155}, + Right: StatsChangeSummaryFields{Entries: 5, Columns: 4, TotalSize: 157}, + }, + Delta: StatsChangeSummaryFields{ + Entries: 0, + Columns: 0, + TotalSize: 2, + }, + }, + Columns: getBaseCols(), + } + + if diff := cmp.Diff(res, expect); diff != "" { + t.Errorf("stat component result mismatch. 
(-want +got):%s\n", diff) + } +} + +func TestParseColumns(t *testing.T) { + ctx := context.Background() + + workDir, err := ioutil.TempDir("", "qri_test_changes_service") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(workDir) + + mr, err := repotest.NewTestRepo() + if err != nil { + t.Fatal(err) + } + + svc := newTestService(t, mr, workDir) + + ref := dsref.MustParse("peer/cities") + if _, err := mr.ResolveRef(ctx, &ref); err != nil { + t.Fatal(err) + } + + ds, err := dsfs.LoadDataset(ctx, mr.Filesystem(), ref.Path) + if err != nil { + t.Fatal(err) + } + if err = base.OpenDataset(ctx, mr.Filesystem(), ds); err != nil { + t.Fatal(err) + } + + var colItems tabular.Columns + summary, err := svc.parseColumns(&colItems, ds) + if err != nil { + t.Fatal(err) + } + + expectColItems := tabular.Columns{ + tabular.Column{ + Title: "city", + Type: &tabular.ColType{"string"}, + }, + tabular.Column{ + Title: "pop", + Type: &tabular.ColType{"integer"}, + }, + tabular.Column{ + Title: "avg_age", + Type: &tabular.ColType{"number"}, + }, + tabular.Column{ + Title: "in_usa", + Type: &tabular.ColType{"boolean"}, + }, + } + + if diff := cmp.Diff(colItems, expectColItems); diff != "" { + t.Errorf("column items result mismatch. (-want +got):%s\n", diff) + } + + expectSummary := StatsChangeSummaryFields{ + Entries: 5, + Columns: 4, + TotalSize: 155, + } + + if diff := cmp.Diff(summary, expectSummary); diff != "" { + t.Errorf("stats summary result mismatch. (-want +got):%s\n", diff) + } +} + +func TestMaybeLoadStats(t *testing.T) { + ctx := context.Background() + + workDir, err := ioutil.TempDir("", "qri_test_changes_service") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(workDir) + + mr, err := repotest.NewTestRepo() + if err != nil { + t.Fatal(err) + } + + svc := newTestService(t, mr, workDir) + + ref := dsref.MustParse("peer/cities") + if _, err := mr.ResolveRef(ctx, &ref); err != nil { + t.Fatal(err) + } + + ds, err := dsfs.LoadDataset(ctx, mr.Filesystem(), ref.Path) + if err != nil { + t.Fatal(err) + } + if err = base.OpenDataset(ctx, mr.Filesystem(), ds); err != nil { + t.Fatal(err) + } + + if ds.Stats == nil { + t.Fatal("stats are nil") + } + + ds.Stats = nil + + svc.maybeLoadStats(ctx, ds) + if ds.Stats == nil { + t.Fatal("stats are nil") + } +} + +func TestMatchColumns(t *testing.T) { + ctx := context.Background() + + workDir, err := ioutil.TempDir("", "qri_test_changes_service") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(workDir) + + mr, err := repotest.NewTestRepo() + if err != nil { + t.Fatal(err) + } + + svc := newTestService(t, mr, workDir) + + ref := dsref.MustParse("peer/cities") + if _, err := mr.ResolveRef(ctx, &ref); err != nil { + t.Fatal(err) + } + + ds, err := dsfs.LoadDataset(ctx, mr.Filesystem(), ref.Path) + if err != nil { + t.Fatal(err) + } + if err = base.OpenDataset(ctx, mr.Filesystem(), ds); err != nil { + t.Fatal(err) + } + + ds.Name = "cities" + leftDs := *ds + + // alter body file + const alteredBodyData = `city,pop,avg_age,in_usa +toronto,4000000,55.0,false +new york,850000,44.0,false +chicago,30000,440.4,false +chatham,3500,650.25,false +raleigh,25000,5000.65,false` + + updateDataset(t, mr, ds, alteredBodyData) + + var leftColItems tabular.Columns + _, err = svc.parseColumns(&leftColItems, &leftDs) + if err != nil { + t.Fatal(err) + } + leftStats, err := svc.parseStats(&leftDs) + if err != nil { + t.Fatal(err) + } + + var rightColItems tabular.Columns + _, err = svc.parseColumns(&rightColItems, ds) + if err != nil { + t.Fatal(err) + } + 
rightStats, err := svc.parseStats(ds) + if err != nil { + t.Fatal(err) + } + + report, err := svc.matchColumns(4, 4, leftColItems, rightColItems, leftStats, rightStats) + if err != nil { + t.Fatal(err) + } + + // output order is not strict so we need to enfore it here + sort.SliceStable(report, func(i, j int) bool { + return report[i].Title < report[j].Title + }) + + expect := getBaseCols() + + if diff := cmp.Diff(report, expect); diff != "" { + t.Errorf("column items result mismatch. (-want +got):%s\n", diff) + } + + // alter body file - remove column + const alteredBodyDataColumns1 = `city,avg_age,in_usa +toronto,55.0,false +new york,44.0,false +chicago,440.4,false +chatham,650.25,false +raleigh,5000.65,false` + + ds.Name = "cities" + + updateDataset(t, mr, ds, alteredBodyDataColumns1) + if err = base.OpenDataset(ctx, mr.Filesystem(), ds); err != nil { + t.Fatal(err) + } + + _, err = svc.parseColumns(&rightColItems, ds) + if err != nil { + t.Fatal(err) + } + rightStats, err = svc.parseStats(ds) + if err != nil { + t.Fatal(err) + } + + report, err = svc.matchColumns(4, 3, leftColItems, rightColItems, leftStats, rightStats) + if err != nil { + t.Fatal(err) + } + + // output order is not strict so we need to enfore it here + sort.SliceStable(report, func(i, j int) bool { + return report[i].Title < report[j].Title + }) + + expect = getBaseCols() + expect[3] = &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{ + Left: EmptyObject{ + "count": float64(5), + "max": float64(40000000), + "min": float64(35000), + "mean": float64(9817000), + "median": float64(300000), + "histogram": map[string]interface{}{ + "bins": []interface{}{ + float64(35000), + float64(250000), + float64(300000), + float64(8500000), + float64(40000000), + float64(40000001), + }, + "frequencies": []interface{}{ + float64(1), + float64(1), + float64(1), + float64(1), + float64(1), + }, + }, + "type": "numeric", + }, + Right: EmptyObject{}, + About: map[string]interface{}{ + "status": fsi.STRemoved, + }, + }, + Title: "pop", + Delta: map[string]interface{}{ + "count": float64(-5), + "max": float64(-40000000), + "mean": float64(-9817000), + "median": float64(-300000), + "min": float64(-35000), + }, + } + + if diff := cmp.Diff(report, expect); diff != "" { + t.Errorf("column items result mismatch. 
(-want +got):%s\n", diff) + } + + // alter body file - add column + const alteredBodyDataColumns2 = `city,pop,avg_age,in_usa,score +toronto,4000000,55.0,false,1 +new york,850000,44.0,false,2 +chicago,30000,440.4,false,3 +chatham,3500,650.25,false,4 +raleigh,25000,5000.65,false,5` + + ds.Name = "cities" + + updateDataset(t, mr, ds, alteredBodyDataColumns2) + + _, err = svc.parseColumns(&rightColItems, ds) + if err != nil { + t.Fatal(err) + } + rightStats, err = svc.parseStats(ds) + if err != nil { + t.Fatal(err) + } + + report, err = svc.matchColumns(4, 5, leftColItems, rightColItems, leftStats, rightStats) + if err != nil { + t.Fatal(err) + } + + // output order is not strict so we need to enfore it here + sort.SliceStable(report, func(i, j int) bool { + return report[i].Title < report[j].Title + }) + + expect = getBaseCols() + expect = append(expect, &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{ + Left: EmptyObject{}, + Right: EmptyObject{ + "count": float64(5), + "max": float64(5), + "min": float64(1), + "mean": float64(3), + "median": float64(3), + "histogram": map[string]interface{}{ + "bins": []interface{}{ + float64(1), + float64(2), + float64(3), + float64(4), + float64(5), + float64(6), + }, + "frequencies": []interface{}{ + float64(1), + float64(1), + float64(1), + float64(1), + float64(1), + }, + }, + "type": "numeric", + }, + About: map[string]interface{}{ + "status": fsi.STAdd, + }, + }, + Title: "score", + Delta: map[string]interface{}{ + "count": float64(5), + "max": float64(5), + "mean": float64(3), + "median": float64(3), + "min": float64(1), + }, + }) + + if diff := cmp.Diff(report, expect); diff != "" { + t.Errorf("column items result mismatch. (-want +got):%s\n", diff) + } +} diff --git a/fsi/status.go b/fsi/status.go index f3b6a9fe9..bfe7bde42 100644 --- a/fsi/status.go +++ b/fsi/status.go @@ -25,6 +25,8 @@ var ( STRemoved = "removed" // STParseError is a component that didn't parse STParseError = "parse error" + // STMissing is a component that is missing + STMissing = "missing" // STConflictError is a component with a conflict STConflictError = "conflict error" // ErrWorkingDirectoryDirty is the error for when the working directory is not clean diff --git a/go.mod b/go.mod index 3644e4add..050b7c7cc 100644 --- a/go.mod +++ b/go.mod @@ -39,7 +39,7 @@ require ( github.com/olekukonko/tablewriter v0.0.4 github.com/pkg/errors v0.9.1 github.com/qri-io/dag v0.2.2-0.20201110155527-8fad5beb70f5 - github.com/qri-io/dataset v0.2.1-0.20201124144731-82162a0f76e6 + github.com/qri-io/dataset v0.2.1-0.20201201155506-9b4fc79ffde8 github.com/qri-io/deepdiff v0.2.1-0.20200807143746-d02d9f531f5b github.com/qri-io/didmod v0.0.0-20201123165422-8b2e224c993a github.com/qri-io/doggos v0.1.0 diff --git a/go.sum b/go.sum index c5b0aa732..ac6f4f7bd 100644 --- a/go.sum +++ b/go.sum @@ -1128,6 +1128,8 @@ github.com/qri-io/dag v0.2.2-0.20201110155527-8fad5beb70f5 h1:xeMaT6fLTvdrFOOP2N github.com/qri-io/dag v0.2.2-0.20201110155527-8fad5beb70f5/go.mod h1:1AwOy3yhcZTAXzaF4wGSdnrp87u3PBOrsWXUjOtQCXo= github.com/qri-io/dataset v0.2.1-0.20201124144731-82162a0f76e6 h1:a9CYZQ+DCzwqg8BgEN5oKboBoxueaYf0EKPnXeR/Mhk= github.com/qri-io/dataset v0.2.1-0.20201124144731-82162a0f76e6/go.mod h1:HtwGskdCECbOON0iVQHEEm6fykwDqharlqabc1ssj3Y= +github.com/qri-io/dataset v0.2.1-0.20201201155506-9b4fc79ffde8 h1:/9pbWabRT9BbjFp1AjdAsXKT2NQp+mGmyvnTylPyEHY= +github.com/qri-io/dataset v0.2.1-0.20201201155506-9b4fc79ffde8/go.mod h1:HtwGskdCECbOON0iVQHEEm6fykwDqharlqabc1ssj3Y= 
github.com/qri-io/deepdiff v0.2.1-0.20200807143746-d02d9f531f5b h1:T8qEIv+qLi5mVWvSS329wJ+HbN7cfMwCWjRVzh/+upo= github.com/qri-io/deepdiff v0.2.1-0.20200807143746-d02d9f531f5b/go.mod h1:NrL/b7YvexgpGb4HEO3Rlx5RrMLDfxuKDf/XDAq5ac0= github.com/qri-io/didmod v0.0.0-20201123165422-8b2e224c993a h1:40BIa59lae2xZ7iieb3UU4/X57jZsWZ6QgqwdjDQhig= diff --git a/lib/changes.go b/lib/changes.go new file mode 100644 index 000000000..d056279db --- /dev/null +++ b/lib/changes.go @@ -0,0 +1,49 @@ +package lib + +import ( + "context" + + "github.com/qri-io/qri/changes" + "github.com/qri-io/qri/dsref" +) + +// ChangeReportParams defines parameters for diffing two sources +type ChangeReportParams struct { + LeftRefstr string `json:"left"` + RightRefstr string `json:"right"` +} + +// ChangeReport is a simple utility type declaration +type ChangeReport = changes.ChangeReportResponse + +// ChangeReport resolves the requested datasets and tries to generate a change report +func (m *DatasetMethods) ChangeReport(p *ChangeReportParams, res *ChangeReport) error { + ctx := context.TODO() + reportSource := "" + + if m.inst.rpc != nil { + return checkRPCError(m.inst.rpc.Call("DatasetMethods.ChangeReport", p, res)) + } + + right, _, err := m.inst.ParseAndResolveRef(ctx, p.RightRefstr, reportSource) + if err != nil { + return err + } + + var left dsref.Ref + if p.LeftRefstr != "" { + if left, _, err = m.inst.ParseAndResolveRef(ctx, p.LeftRefstr, reportSource); err != nil { + return err + } + } else { + left = dsref.Ref{Username: right.Username, Name: right.Name} + } + + report, err := changes.New(m.inst, m.inst.stats).Report(ctx, left, right, reportSource) + if err != nil { + return err + } + + *res = *report + return nil +} diff --git a/lib/lib.go b/lib/lib.go index 60aea7513..0cc5fb338 100644 --- a/lib/lib.go +++ b/lib/lib.go @@ -384,7 +384,7 @@ func NewInstance(ctx context.Context, repoPath string, opts ...Option) (qri *Ins // if logAll is enabled, turn on debug level logging for all qri packages. Packages need to // be explicitly enumerated here if o.logAll { - allPackages := []string{"qriapi", "qrip2p", "base", "cmd", "config", "dsref", "dsfs", "friendly", "fsi", "lib", "logbook", "profile", "repo"} + allPackages := []string{"qriapi", "qrip2p", "base", "changes", "cmd", "config", "dsref", "dsfs", "friendly", "fsi", "lib", "logbook", "profile", "repo"} for _, name := range allPackages { golog.SetLogLevel(name, "debug") } From f6153d403e0342249f982d33299e3bb1a6697a00 Mon Sep 17 00:00:00 2001 From: b5 Date: Wed, 9 Dec 2020 09:14:57 -0500 Subject: [PATCH 2/3] test(changes): refactor tests to use testRunner pattern, test Report reduce boilerplate by creating a test runner for the changes package, switch to a nil cache for the underlying Stats service, which removes the need for temp directories in each test. Adding a number of conveince methods to the runner makes tests more readable. Add an easy-path test to the Report function for a 20% test coverage bump. 
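For illustration, a test written against the runner reduces to roughly the following shape. This is a sketch only: TestSummaryEntries is a hypothetical test that is not part of this patch, but every helper it calls is added in the diff below.

func TestSummaryEntries(t *testing.T) {
	run := newTestRunner(t)
	ds := run.MustLoadRef(t, run.MustResolve(t, "peer/cities"))

	// parseColumns reads entry/column counts straight from the structure component
	var cols tabular.Columns
	summary, err := run.Service.parseColumns(&cols, ds)
	if err != nil {
		t.Fatal(err)
	}
	if got := summary.(StatsChangeSummaryFields).Entries; got != 5 {
		t.Errorf("expected 5 entries, got %d", got)
	}
}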
--- changes/changes_test.go | 290 +++++++++++++++++++--------------------- 1 file changed, 138 insertions(+), 152 deletions(-) diff --git a/changes/changes_test.go b/changes/changes_test.go index c43407256..4654083b2 100644 --- a/changes/changes_test.go +++ b/changes/changes_test.go @@ -2,8 +2,6 @@ package changes import ( "context" - "io/ioutil" - "os" "sort" "testing" @@ -20,37 +18,6 @@ import ( "github.com/qri-io/qri/stats" ) -func newTestService(t *testing.T, r repo.Repo, workDir string) *Service { - cache, err := stats.NewLocalCache(workDir, 1000<<8) - if err != nil { - t.Fatal(err) - } - statsSvc := stats.New(cache) - loader := base.NewLocalDatasetLoader(r.Filesystem()) - - return New(loader, statsSvc) -} - -func updateDataset(t *testing.T, r repo.Repo, ds *dataset.Dataset, newBody string) dsref.Ref { - ctx := context.Background() - currRef := dsref.ConvertDatasetToVersionInfo(ds).SimpleRef() - - ds.SetBodyFile(qfs.NewMemfileBytes("body.csv", []byte(newBody))) - ds.PreviousPath = currRef.Path - - // force recalculate structure as that is what we rely on for the change reports - ds.Structure = nil - if err := base.InferStructure(ds); err != nil { - t.Fatal(err.Error()) - } - - res, err := base.CreateDataset(ctx, r, r.Filesystem().DefaultWriteFS(), ds, nil, dsfs.SaveSwitches{Pin: true, ShouldRender: true}) - if err != nil { - t.Fatal(err.Error()) - } - return dsref.ConvertDatasetToVersionInfo(res).SimpleRef() -} - func getBaseCols() []*ChangeReportDeltaComponent { return []*ChangeReportDeltaComponent{ &ChangeReportDeltaComponent{ @@ -253,47 +220,16 @@ func getBaseCols() []*ChangeReportDeltaComponent { func TestStatsDiff(t *testing.T) { ctx := context.Background() + run := newTestRunner(t) + svc := run.Service - workDir, err := ioutil.TempDir("", "qri_test_changes_service") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(workDir) - - mr, err := repotest.NewTestRepo() - if err != nil { - t.Fatal(err) - } + cities1 := run.MustResolve(t, "peer/cities") + cities2 := run.updateCitiesDataset(t) - svc := newTestService(t, mr, workDir) - - ref := dsref.MustParse("peer/cities") - if _, err := mr.ResolveRef(ctx, &ref); err != nil { - t.Fatal(err) - } + leftDs := run.MustLoadRef(t, cities1) + rightDs := run.MustLoadRef(t, cities2) - ds, err := dsfs.LoadDataset(ctx, mr.Filesystem(), ref.Path) - if err != nil { - t.Fatal(err) - } - if err = base.OpenDataset(ctx, mr.Filesystem(), ds); err != nil { - t.Fatal(err) - } - - ds.Name = "cities" - leftDs := *ds - - // alter body file - const alteredBodyData = `city,pop,avg_age,in_usa -toronto,4000000,55.0,false -new york,850000,44.0,false -chicago,30000,440.4,false -chatham,3500,650.25,false -raleigh,25000,5000.65,false` - - updateDataset(t, mr, ds, alteredBodyData) - - res, err := svc.statsDiff(ctx, &leftDs, ds) + res, err := svc.statsDiff(ctx, leftDs, rightDs) if err != nil { t.Fatal(err) } @@ -324,33 +260,11 @@ raleigh,25000,5000.65,false` } func TestParseColumns(t *testing.T) { - ctx := context.Background() - - workDir, err := ioutil.TempDir("", "qri_test_changes_service") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(workDir) + run := newTestRunner(t) + svc := run.Service - mr, err := repotest.NewTestRepo() - if err != nil { - t.Fatal(err) - } - - svc := newTestService(t, mr, workDir) - - ref := dsref.MustParse("peer/cities") - if _, err := mr.ResolveRef(ctx, &ref); err != nil { - t.Fatal(err) - } - - ds, err := dsfs.LoadDataset(ctx, mr.Filesystem(), ref.Path) - if err != nil { - t.Fatal(err) - } - if err = base.OpenDataset(ctx, 
mr.Filesystem(), ds); err != nil { - t.Fatal(err) - } + ref := run.MustResolve(t, "peer/cities") + ds := run.MustLoadRef(t, ref) var colItems tabular.Columns summary, err := svc.parseColumns(&colItems, ds) @@ -394,32 +308,11 @@ func TestParseColumns(t *testing.T) { func TestMaybeLoadStats(t *testing.T) { ctx := context.Background() + run := newTestRunner(t) + svc := run.Service - workDir, err := ioutil.TempDir("", "qri_test_changes_service") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(workDir) - - mr, err := repotest.NewTestRepo() - if err != nil { - t.Fatal(err) - } - - svc := newTestService(t, mr, workDir) - - ref := dsref.MustParse("peer/cities") - if _, err := mr.ResolveRef(ctx, &ref); err != nil { - t.Fatal(err) - } - - ds, err := dsfs.LoadDataset(ctx, mr.Filesystem(), ref.Path) - if err != nil { - t.Fatal(err) - } - if err = base.OpenDataset(ctx, mr.Filesystem(), ds); err != nil { - t.Fatal(err) - } + ref := run.MustResolve(t, "peer/cities") + ds := run.MustLoadRef(t, ref) if ds.Stats == nil { t.Fatal("stats are nil") @@ -435,32 +328,11 @@ func TestMaybeLoadStats(t *testing.T) { func TestMatchColumns(t *testing.T) { ctx := context.Background() + run := newTestRunner(t) + svc := run.Service - workDir, err := ioutil.TempDir("", "qri_test_changes_service") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(workDir) - - mr, err := repotest.NewTestRepo() - if err != nil { - t.Fatal(err) - } - - svc := newTestService(t, mr, workDir) - - ref := dsref.MustParse("peer/cities") - if _, err := mr.ResolveRef(ctx, &ref); err != nil { - t.Fatal(err) - } - - ds, err := dsfs.LoadDataset(ctx, mr.Filesystem(), ref.Path) - if err != nil { - t.Fatal(err) - } - if err = base.OpenDataset(ctx, mr.Filesystem(), ds); err != nil { - t.Fatal(err) - } + ref := run.MustResolve(t, "peer/cities") + ds := run.MustLoadRef(t, ref) ds.Name = "cities" leftDs := *ds @@ -473,10 +345,10 @@ chicago,30000,440.4,false chatham,3500,650.25,false raleigh,25000,5000.65,false` - updateDataset(t, mr, ds, alteredBodyData) + run.updateDataset(t, ds, alteredBodyData) var leftColItems tabular.Columns - _, err = svc.parseColumns(&leftColItems, &leftDs) + _, err := svc.parseColumns(&leftColItems, &leftDs) if err != nil { t.Fatal(err) } @@ -521,8 +393,8 @@ raleigh,5000.65,false` ds.Name = "cities" - updateDataset(t, mr, ds, alteredBodyDataColumns1) - if err = base.OpenDataset(ctx, mr.Filesystem(), ds); err != nil { + run.updateDataset(t, ds, alteredBodyDataColumns1) + if err = base.OpenDataset(ctx, run.Repo.Filesystem(), ds); err != nil { t.Fatal(err) } @@ -602,7 +474,7 @@ raleigh,25000,5000.65,false,5` ds.Name = "cities" - updateDataset(t, mr, ds, alteredBodyDataColumns2) + run.updateDataset(t, ds, alteredBodyDataColumns2) _, err = svc.parseColumns(&rightColItems, ds) if err != nil { @@ -670,3 +542,117 @@ raleigh,25000,5000.65,false,5` t.Errorf("column items result mismatch. 
(-want +got):%s\n", diff) } } + +func TestReport(t *testing.T) { + ctx := context.Background() + run := newTestRunner(t) + svc := run.Service + + cities1 := dsref.MustParse("peer/cities") + if _, err := run.Repo.ResolveRef(ctx, &cities1); err != nil { + t.Fatal(err) + } + + cities2 := run.updateCitiesDataset(t) + + _, err := svc.Report(ctx, cities1, cities2, "") + if err != nil { + t.Fatal(err) + } +} + +type testRunner struct { + Repo repo.Repo + Service *Service +} + +func newTestRunner(t *testing.T) (run *testRunner) { + t.Helper() + + r, err := repotest.NewTestRepo() + if err != nil { + t.Fatal(err) + } + + statsSvc := stats.New(nil) + loader := base.NewLocalDatasetLoader(r.Filesystem()) + + return &testRunner{ + Repo: r, + Service: New(loader, statsSvc), + } +} + +func (run *testRunner) MustResolve(t *testing.T, refstr string) dsref.Ref { + t.Helper() + ctx := context.Background() + + ref := dsref.MustParse("peer/cities") + if _, err := run.Repo.ResolveRef(ctx, &ref); err != nil { + t.Fatal(err) + } + return ref +} + +func (run *testRunner) MustLoadRef(t *testing.T, ref dsref.Ref) *dataset.Dataset { + t.Helper() + ctx := context.Background() + + ds, err := dsfs.LoadDataset(ctx, run.Repo.Filesystem(), ref.Path) + if err != nil { + t.Fatal(err) + } + if err = base.OpenDataset(ctx, run.Repo.Filesystem(), ds); err != nil { + t.Fatal(err) + } + return ds +} + +func (run *testRunner) updateDataset(t *testing.T, ds *dataset.Dataset, newBody string) dsref.Ref { + t.Helper() + r := run.Repo + + ctx := context.Background() + currRef := dsref.ConvertDatasetToVersionInfo(ds).SimpleRef() + + ds.SetBodyFile(qfs.NewMemfileBytes("body.csv", []byte(newBody))) + ds.PreviousPath = currRef.Path + + // force recalculate structure as that is what we rely on for the change reports + ds.Structure = nil + if err := base.InferStructure(ds); err != nil { + t.Fatal(err.Error()) + } + + res, err := base.CreateDataset(ctx, r, r.Filesystem().DefaultWriteFS(), ds, nil, dsfs.SaveSwitches{Pin: true, ShouldRender: true}) + if err != nil { + t.Fatal(err.Error()) + } + return dsref.ConvertDatasetToVersionInfo(res).SimpleRef() +} + +func (run *testRunner) updateCitiesDataset(t *testing.T) dsref.Ref { + t.Helper() + ctx := context.Background() + + ref := run.MustResolve(t, "peer/cities") + ds, err := dsfs.LoadDataset(ctx, run.Repo.Filesystem(), ref.Path) + if err != nil { + t.Fatal(err) + } + if err = base.OpenDataset(ctx, run.Repo.Filesystem(), ds); err != nil { + t.Fatal(err) + } + + ds.Name = "cities" + + // alter body file + const alteredBodyData = `city,pop,avg_age,in_usa +toronto,4000000,55.0,false +new york,850000,44.0,false +chicago,30000,440.4,false +chatham,3500,650.25,false +raleigh,25000,5000.65,false` + + return run.updateDataset(t, ds, alteredBodyData) +} From 9fb1b53c7efd5b72bd75f2ece15ac3f8c5b23c90 Mon Sep 17 00:00:00 2001 From: b5 Date: Wed, 9 Dec 2020 09:51:15 -0500 Subject: [PATCH 3/3] refactor(changes): export Service as an interface presents a cleaner API to package consumers --- changes/changes.go | 28 ++++++++++++++++++---------- changes/changes_test.go | 4 ++-- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/changes/changes.go b/changes/changes.go index 27c4a3441..5a6586ba0 100644 --- a/changes/changes.go +++ b/changes/changes.go @@ -63,21 +63,29 @@ type StatsChangeSummaryFields struct { // that a key is present in the response even if empty and not be nil type EmptyObject map[string]interface{} +// Service generates a change report between two datasets +type Service interface { + 
Report(ctx context.Context, leftRef, rightRef dsref.Ref, loadSource string) (*ChangeReportResponse, error) +} + // Service can generate a change report between two datasets -type Service struct { +type service struct { loader dsref.Loader stats *stats.Service } +// assert at compile time that service implements the Service interface +var _ Service = (*service)(nil) + // New allocates a Change service -func New(loader dsref.Loader, stats *stats.Service) *Service { - return &Service{ +func New(loader dsref.Loader, stats *stats.Service) Service { + return &service{ loader: loader, stats: stats, } } -func (svc *Service) parseColumns(colItems *tabular.Columns, data *dataset.Dataset) (interface{}, error) { +func (svc *service) parseColumns(colItems *tabular.Columns, data *dataset.Dataset) (interface{}, error) { var sErr error if data.Structure != nil { *colItems, _, sErr = tabular.ColumnsFromJSONSchema(data.Structure.Schema) @@ -96,7 +104,7 @@ func (svc *Service) parseColumns(colItems *tabular.Columns, data *dataset.Datase // maybeLoadStats attempts to load stats if not alredy present // errors out if it fails as stats are required and some datasets might not yet have // a stats component attached to it -func (svc *Service) maybeLoadStats(ctx context.Context, ds *dataset.Dataset) error { +func (svc *service) maybeLoadStats(ctx context.Context, ds *dataset.Dataset) error { if ds.Stats != nil { return nil } @@ -110,7 +118,7 @@ func (svc *Service) maybeLoadStats(ctx context.Context, ds *dataset.Dataset) err // parseStats uses json serializing > deserializing to easily parse the stats // interface as we have little type safety in the dataset.stats component right now -func (svc *Service) parseStats(ds *dataset.Dataset) ([]EmptyObject, error) { +func (svc *service) parseStats(ds *dataset.Dataset) ([]EmptyObject, error) { statsStr, err := json.Marshal(ds.Stats.Stats) if err != nil { log.Debugf("failed to load stats: %s", err.Error()) @@ -126,7 +134,7 @@ func (svc *Service) parseStats(ds *dataset.Dataset) ([]EmptyObject, error) { return stats, nil } -func (svc *Service) statsDiff(ctx context.Context, leftDs *dataset.Dataset, rightDs *dataset.Dataset) (*StatsChangeComponent, error) { +func (svc *service) statsDiff(ctx context.Context, leftDs *dataset.Dataset, rightDs *dataset.Dataset) (*StatsChangeComponent, error) { res := &StatsChangeComponent{} res.Summary = &ChangeReportDeltaComponent{ @@ -204,7 +212,7 @@ func (svc *Service) statsDiff(ctx context.Context, leftDs *dataset.Dataset, righ // this is not ideal as datasets without a header have generic column names and in case of adding a column // before the end might shift the alignment and break comparison due to type differences of columns which // are not properly handled yet -func (svc *Service) matchColumns(leftColCount, rightColCount int, leftColItems, rightColItems tabular.Columns, leftStats, rightStats []EmptyObject) ([]*ChangeReportDeltaComponent, error) { +func (svc *service) matchColumns(leftColCount, rightColCount int, leftColItems, rightColItems tabular.Columns, leftStats, rightStats []EmptyObject) ([]*ChangeReportDeltaComponent, error) { maxColCount := leftColCount if rightColCount > maxColCount { maxColCount = rightColCount @@ -287,7 +295,7 @@ func parseStatsMap(stats interface{}) (map[string]interface{}, error) { return statsMap, nil } -func (svc *Service) columnStatsDelta(left, right interface{}, lCol, rCol *tabular.Column, hasLeft, hasRight bool) (map[string]interface{}, map[string]interface{}, error) { +func (svc *service) 
columnStatsDelta(left, right interface{}, lCol, rCol *tabular.Column, hasLeft, hasRight bool) (map[string]interface{}, map[string]interface{}, error) { var deltaCol map[string]interface{} aboutCol := map[string]interface{}{} var leftStatsMap, rightStatsMap map[string]interface{} @@ -377,7 +385,7 @@ func (svc *Service) columnStatsDelta(left, right interface{}, lCol, rCol *tabula // Report computes the change report of two sources // This takes some assumptions - we work only with tabular data, with header rows and functional structure.json -func (svc *Service) Report(ctx context.Context, leftRef, rightRef dsref.Ref, loadSource string) (*ChangeReportResponse, error) { +func (svc *service) Report(ctx context.Context, leftRef, rightRef dsref.Ref, loadSource string) (*ChangeReportResponse, error) { leftDs, err := svc.loader.LoadDataset(ctx, leftRef, loadSource) if err != nil { return nil, err diff --git a/changes/changes_test.go b/changes/changes_test.go index 4654083b2..83b6d3562 100644 --- a/changes/changes_test.go +++ b/changes/changes_test.go @@ -563,7 +563,7 @@ func TestReport(t *testing.T) { type testRunner struct { Repo repo.Repo - Service *Service + Service *service } func newTestRunner(t *testing.T) (run *testRunner) { @@ -579,7 +579,7 @@ func newTestRunner(t *testing.T) (run *testRunner) { return &testRunner{ Repo: r, - Service: New(loader, statsSvc), + Service: New(loader, statsSvc).(*service), } }
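With the Service interface exported, a package consumer can build and use the change-report service without touching package internals. The following is a minimal sketch, not part of the patch: the consumer function name, the assumption that the caller already holds an open repo.Repo, the nil stats cache, and the empty loadSource (local resolution) are all illustrative choices that mirror the test wiring above.

package consumer

import (
	"context"
	"fmt"

	"github.com/qri-io/qri/base"
	"github.com/qri-io/qri/changes"
	"github.com/qri-io/qri/dsref"
	"github.com/qri-io/qri/repo"
	"github.com/qri-io/qri/stats"
)

// PrintChangeStatus builds a changes.Service from an existing repo and prints
// the version-level change status (e.g. fsi.STUnmodified vs fsi.STChange)
// between two dataset refs.
func PrintChangeStatus(ctx context.Context, r repo.Repo, left, right dsref.Ref) error {
	loader := base.NewLocalDatasetLoader(r.Filesystem())
	svc := changes.New(loader, stats.New(nil))

	report, err := svc.Report(ctx, left, right, "")
	if err != nil {
		return err
	}
	fmt.Println(report.VersionInfo.About["status"])
	return nil
}

The same report is reachable over HTTP through the new /changes route added in the first patch, using the left_path and right_path form values, for example (host, port, and the old-version path are illustrative):

curl 'http://localhost:2503/changes?left_path=peer/cities@/ipfs/QmOldVersion&right_path=peer/cities'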