Skip to content

Commit

Permalink
fix(save): If body is too large to diff, compare checksums
Browse files Browse the repository at this point in the history
Merge pull request #1253 from qri-io/compare-checksum
  • Loading branch information
dustmop authored Apr 5, 2020
2 parents c21419b + 37dc5c7 commit ddb7713
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 10 deletions.
34 changes: 24 additions & 10 deletions base/dsfs/dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,9 @@ import (
// BodySizeSmallEnoughToDiff sets how small a body must be to generate a message from it
var BodySizeSmallEnoughToDiff = 20000000 // 20M or less is small

// TODO(dustmop): Limiting the body size for diffs causes an undesirable side-effect: if a user
// has a dataset body larger than this size, then any `save` operation will create a commit, even
// if no changes have been made, since our change detection will see BodyTooBig and assume
// something changed. Some solutions to this:
// * above 20M start using a less-accurate non-structured checksum instead of deepdiff
// * calculate a structured checksum and compare that to the last commit's checksum
// * immitate "redo" (https://apenwarr.ca/log/20181113), store file attributes, see if any change
// * at a large enough size, like 4G, just assume the file is always changed. don't be expensive
// We should make this algorithm agree with how `status` works.
// If a user has a dataset larger than the above limit, then instead of diffing we compare the
// checksum against the previous version. We should make this algorithm agree with how `status`
// works.
// See issue: https://github.com/qri-io/qri/issues/1150

// LoadDataset reads a dataset from a cafs and dereferences structure, transform, and commitMsg if they exist,
Expand Down Expand Up @@ -599,8 +593,14 @@ func generateCommitDescriptions(store cafs.Filestore, prev, ds *dataset.Dataset,
}
}

prevChecksum := ""
nextChecksum := ""

if prevStructure, ok := prevData["structure"]; ok {
if prevObject, ok := prevStructure.(map[string]interface{}); ok {
if checksum, ok := prevObject["checksum"].(string); ok {
prevChecksum = checksum
}
delete(prevObject, "checksum")
delete(prevObject, "entries")
delete(prevObject, "length")
Expand All @@ -609,13 +609,27 @@ func generateCommitDescriptions(store cafs.Filestore, prev, ds *dataset.Dataset,
}
if nextStructure, ok := nextData["structure"]; ok {
if nextObject, ok := nextStructure.(map[string]interface{}); ok {
if checksum, ok := nextObject["checksum"].(string); ok {
nextChecksum = checksum
}
delete(nextObject, "checksum")
delete(nextObject, "entries")
delete(nextObject, "length")
delete(nextObject, "depth")
}
}

// If the body is too big to diff, compare the checksums. If they differ, assume the
// body has changed.
assumeBodyChanged := false
if bodyAct == BodyTooBig {
prevBody = nil
nextBody = nil
if prevChecksum != nextChecksum {
assumeBodyChanged = true
}
}

var headDiff, bodyDiff deepdiff.Deltas
var bodyStat *deepdiff.Stats

Expand All @@ -632,7 +646,7 @@ func generateCommitDescriptions(store cafs.Filestore, prev, ds *dataset.Dataset,
}
}

shortTitle, longMessage := friendly.DiffDescriptions(headDiff, bodyDiff, bodyStat, bodyAct == BodyTooBig)
shortTitle, longMessage := friendly.DiffDescriptions(headDiff, bodyDiff, bodyStat, assumeBodyChanged)
if shortTitle == "" {
if forceIfNoChanges {
return "forced update", "forced update", nil
Expand Down
47 changes: 47 additions & 0 deletions cmd/save_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/qri-io/dataset"
"github.com/qri-io/qfs"
"github.com/qri-io/qfs/localfs"
"github.com/qri-io/qri/base/dsfs"
"github.com/qri-io/qri/dscache"
"github.com/qri-io/qri/errors"
)
Expand Down Expand Up @@ -565,3 +566,49 @@ func copyFile(t *testing.T, source, destin string) {
t.Fatal(err)
}
}

func TestSaveLargeBodyIsSame(t *testing.T) {
run := NewTestRunner(t, "test_peer", "qri_test_save_large_body")
defer run.Delete()

prevBodySizeLimit := dsfs.BodySizeSmallEnoughToDiff
defer func() { dsfs.BodySizeSmallEnoughToDiff = prevBodySizeLimit }()
dsfs.BodySizeSmallEnoughToDiff = 100

// Save a first version
run.MustExec(t, "qri save --body testdata/movies/body_ten.csv test_peer/my_ds")

// Try to save another, but no changes
err := run.ExecCommand("qri save --body testdata/movies/body_ten.csv test_peer/my_ds")
if err == nil {
t.Fatal("expected error trying to save, did not get an error")
}
expect := `error saving: no changes`
if err.Error() != expect {
t.Errorf("error mismatch, expect: %s, got: %s", expect, err.Error())
}

// Save a second version by making changes
run.MustExec(t, "qri save --body testdata/movies/body_twenty.csv test_peer/my_ds")

output := run.MustExec(t, "qri log test_peer/my_ds")
expect = `1 Commit: /ipfs/QmYi3KLDXFjjx8j29E7Y2gBXf71efcUZVvBCfsw4nriRwZ
Date: Sun Dec 31 20:05:01 EST 2000
Storage: local
Size: 532 B
body changed
2 Commit: /ipfs/QmeDzJEv6mHkQ42NXVLUNRmDsoWFkePovmCydts9M5Age9
Date: Sun Dec 31 20:02:01 EST 2000
Storage: local
Size: 224 B
created dataset
`
if diff := cmp.Diff(expect, output); diff != "" {
t.Errorf("result mismatch (-want +got):%s\n", diff)
}

}

0 comments on commit ddb7713

Please sign in to comment.