Skip to content

Commit 3ff3b75

Browse files
committed
feat(dsfs): compute & store stats component at save time
1 parent a8f2977 commit 3ff3b75

File tree

17 files changed

+378
-136
lines changed

17 files changed

+378
-136
lines changed

base/body_test.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import (
99

1010
"github.com/qri-io/dataset"
1111
"github.com/qri-io/qfs"
12-
"github.com/qri-io/qri/base/dsfs"
1312
)
1413

1514
func TestReadBody(t *testing.T) {
@@ -39,9 +38,20 @@ func TestReadBody(t *testing.T) {
3938
}
4039
}
4140

41+
// BaseTabularSchema is the base schema for tabular data
42+
// NOTE: Do not use if possible, prefer github.com/qri-io/dataset/tabular
43+
// TODO(dustmop): Possibly move this to tabular package
44+
var BaseTabularSchema = map[string]interface{}{
45+
"type": "array",
46+
"items": map[string]interface{}{
47+
"type": "array",
48+
"items": []interface{}{},
49+
},
50+
}
51+
4252
func TestConvertBodyFormat(t *testing.T) {
4353
jsonStructure := &dataset.Structure{Format: "json", Schema: dataset.BaseSchemaArray}
44-
csvStructure := &dataset.Structure{Format: "csv", Schema: dsfs.BaseTabularSchema}
54+
csvStructure := &dataset.Structure{Format: "csv", Schema: BaseTabularSchema}
4555

4656
// CSV -> JSON
4757
body := qfs.NewMemfileBytes("", []byte("a,b,c"))

base/dsfs/commit.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,27 @@ const (
3535
BodyTooBig = BodyAction("too_big")
3636
)
3737

38+
// DerefCommit dereferences a dataset's Commit element if required
39+
// should be a no-op if ds.Commit is nil or isn't a reference
40+
func DerefCommit(ctx context.Context, store qfs.Filesystem, ds *dataset.Dataset) error {
41+
if ds.Commit != nil && ds.Commit.IsEmpty() && ds.Commit.Path != "" {
42+
cm, err := loadCommit(ctx, store, ds.Commit.Path)
43+
if err != nil {
44+
log.Debug(err.Error())
45+
return fmt.Errorf("loading dataset commit: %w", err)
46+
}
47+
cm.Path = ds.Commit.Path
48+
ds.Commit = cm
49+
}
50+
return nil
51+
}
52+
3853
// loadCommit assumes the provided path is valid
3954
func loadCommit(ctx context.Context, fs qfs.Filesystem, path string) (st *dataset.Commit, err error) {
4055
data, err := fileBytes(fs.Get(ctx, path))
4156
if err != nil {
4257
log.Debug(err.Error())
43-
return nil, fmt.Errorf("error loading commit file: %s", err.Error())
58+
return nil, fmt.Errorf("loading commit file: %s", err.Error())
4459
}
4560
return dataset.UnmarshalCommit(data)
4661
}
@@ -51,7 +66,7 @@ func generateCommitTitleAndMessage(ctx context.Context, fs qfs.Filesystem, privK
5166
shortTitle, longMessage, err := generateCommitDescriptions(ctx, fs, ds, prev, bodyAct, forceIfNoChanges)
5267
if err != nil {
5368
log.Debugf("generateCommitDescriptions err: %s", err)
54-
return fmt.Errorf("error saving: %s", err)
69+
return fmt.Errorf("error saving: %w", err)
5570
}
5671

5772
if shortTitle == defaultCreatedDescription && fileHint != "" {

base/dsfs/compute_fields.go

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
crypto "github.com/libp2p/go-libp2p-core/crypto"
1212
"github.com/qri-io/dataset"
1313
"github.com/qri-io/dataset/dsio"
14+
"github.com/qri-io/dataset/dsstats"
1415
"github.com/qri-io/jsonschema"
1516
"github.com/qri-io/qfs"
1617
)
@@ -24,6 +25,9 @@ type computeFieldsFile struct {
2425

2526
ds, prev *dataset.Dataset
2627

28+
// body statistics accumulator
29+
acc *dsstats.Accumulator
30+
2731
// buffer of entries for diffing small datasets. will be set to nil if
2832
// body reads more than BodySizeSmallEnoughToDiff bytes
2933
diffMessageBuf *dsio.EntryBuffer
@@ -39,6 +43,11 @@ type computeFieldsFile struct {
3943
bytesRead int
4044
}
4145

46+
var (
47+
_ doneProcessingFile = (*computeFieldsFile)(nil)
48+
_ statsComponentFile = (*computeFieldsFile)(nil)
49+
)
50+
4251
func newComputeFieldsFile(ctx context.Context, dsLk *sync.Mutex, fs qfs.Filesystem, pk crypto.PrivKey, ds, prev *dataset.Dataset, sw SaveSwitches) (qfs.File, error) {
4352
var (
4453
bf = ds.BodyFile()
@@ -123,12 +132,23 @@ func (cff *computeFieldsFile) Close() error {
123132
return nil
124133
}
125134

135+
type doneProcessingFile interface {
136+
DoneProcessing() <-chan error
137+
}
138+
126139
func (cff *computeFieldsFile) DoneProcessing() <-chan error {
127140
return cff.done
128141
}
129142

130-
type doneProcessingFile interface {
131-
DoneProcessing() <-chan error
143+
type statsComponentFile interface {
144+
StatsComponent() (*dataset.Stats, error)
145+
}
146+
147+
func (cff *computeFieldsFile) StatsComponent() (*dataset.Stats, error) {
148+
return &dataset.Stats{
149+
Qri: dataset.KindStats.String(),
150+
Stats: dsstats.ToMap(cff.acc),
151+
}, nil
132152
}
133153

134154
// , store cafs.Filestore, ds, prev *dataset.Dataset, bodyR io.Reader, pk crypto.PrivKey, sw SaveSwitches, done chan error
@@ -141,18 +161,19 @@ func (cff *computeFieldsFile) handleRows(ctx context.Context) {
141161
depth = 0
142162
)
143163

144-
cff.Lock()
145-
// assign timestamp early. saving process on large files can take many minutes
146-
cff.ds.Commit.Timestamp = Timestamp()
147-
cff.Unlock()
148-
149164
r, err := dsio.NewEntryReader(st, cff.pipeReader)
150165
if err != nil {
151166
log.Debugf("creating entry reader: %s", err)
152167
cff.done <- fmt.Errorf("creating entry reader: %w", err)
153168
return
154169
}
155170

171+
cff.Lock()
172+
// assign timestamp early. saving process on large files can take many minutes
173+
cff.ds.Commit.Timestamp = Timestamp()
174+
cff.acc = dsstats.NewAccumulator(st)
175+
cff.Unlock()
176+
156177
jsch, err := st.JSONSchema()
157178
if err != nil {
158179
cff.done <- err
@@ -188,6 +209,9 @@ func (cff *computeFieldsFile) handleRows(ctx context.Context) {
188209
depth = d
189210
}
190211
entries++
212+
if err := cff.acc.WriteEntry(ent); err != nil {
213+
return err
214+
}
191215

192216
if i%batchSize == 0 && i != 0 {
193217
numValErrs, flushErr := cff.flushBatch(ctx, batchBuf, st, jsch)

base/dsfs/dataset.go

Lines changed: 25 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -91,118 +91,25 @@ func LoadDatasetRefs(ctx context.Context, fs qfs.Filesystem, path string) (*data
9191
// DerefDataset attempts to fully dereference a dataset
9292
func DerefDataset(ctx context.Context, store qfs.Filesystem, ds *dataset.Dataset) error {
9393
log.Debugf("DerefDataset path=%q", ds.Path)
94-
if err := DerefDatasetMeta(ctx, store, ds); err != nil {
94+
if err := DerefMeta(ctx, store, ds); err != nil {
9595
return err
9696
}
97-
if err := DerefDatasetStructure(ctx, store, ds); err != nil {
97+
if err := DerefStructure(ctx, store, ds); err != nil {
9898
return err
9999
}
100-
if err := DerefDatasetTransform(ctx, store, ds); err != nil {
100+
if err := DerefTransform(ctx, store, ds); err != nil {
101101
return err
102102
}
103-
if err := DerefDatasetViz(ctx, store, ds); err != nil {
103+
if err := DerefViz(ctx, store, ds); err != nil {
104104
return err
105105
}
106-
if err := DerefDatasetReadme(ctx, store, ds); err != nil {
106+
if err := DerefReadme(ctx, store, ds); err != nil {
107107
return err
108108
}
109-
return DerefDatasetCommit(ctx, store, ds)
110-
}
111-
112-
// DerefDatasetStructure derferences a dataset's structure element if required
113-
// should be a no-op if ds.Structure is nil or isn't a reference
114-
func DerefDatasetStructure(ctx context.Context, store qfs.Filesystem, ds *dataset.Dataset) error {
115-
if ds.Structure != nil && ds.Structure.IsEmpty() && ds.Structure.Path != "" {
116-
st, err := loadStructure(ctx, store, ds.Structure.Path)
117-
if err != nil {
118-
log.Debug(err.Error())
119-
return fmt.Errorf("error loading dataset structure: %s", err.Error())
120-
}
121-
// assign path to retain internal reference to path
122-
// st.Assign(dataset.NewStructureRef(ds.Structure.Path))
123-
ds.Structure = st
124-
}
125-
return nil
126-
}
127-
128-
// DerefDatasetViz dereferences a dataset's Viz element if required
129-
// should be a no-op if ds.Viz is nil or isn't a reference
130-
func DerefDatasetViz(ctx context.Context, store qfs.Filesystem, ds *dataset.Dataset) error {
131-
if ds.Viz != nil && ds.Viz.IsEmpty() && ds.Viz.Path != "" {
132-
vz, err := loadViz(ctx, store, ds.Viz.Path)
133-
if err != nil {
134-
log.Debug(err.Error())
135-
return fmt.Errorf("error loading dataset viz: %s", err.Error())
136-
}
137-
// assign path to retain internal reference to path
138-
// vz.Assign(dataset.NewVizRef(ds.Viz.Path))
139-
ds.Viz = vz
140-
}
141-
return nil
142-
}
143-
144-
// DerefDatasetReadme dereferences a dataset's Readme element if required
145-
// should be a no-op if ds.Readme is nil or isn't a reference
146-
func DerefDatasetReadme(ctx context.Context, store qfs.Filesystem, ds *dataset.Dataset) error {
147-
if ds.Readme != nil && ds.Readme.IsEmpty() && ds.Readme.Path != "" {
148-
rm, err := loadReadme(ctx, store, ds.Readme.Path)
149-
if err != nil {
150-
log.Debug(err.Error())
151-
return fmt.Errorf("error loading dataset readme: %s", err.Error())
152-
}
153-
// assign path to retain internal reference to path
154-
// rm.Assign(dataset.NewVizRef(ds.Readme.Path))
155-
ds.Readme = rm
156-
}
157-
return nil
158-
}
159-
160-
// DerefDatasetTransform derferences a dataset's transform element if required
161-
// should be a no-op if ds.Structure is nil or isn't a reference
162-
func DerefDatasetTransform(ctx context.Context, store qfs.Filesystem, ds *dataset.Dataset) error {
163-
if ds.Transform != nil && ds.Transform.IsEmpty() && ds.Transform.Path != "" {
164-
t, err := loadTransform(ctx, store, ds.Transform.Path)
165-
if err != nil {
166-
log.Debug(err.Error())
167-
return fmt.Errorf("error loading dataset transform: %s", err.Error())
168-
}
169-
// assign path to retain internal reference to path
170-
// t.Assign(dataset.NewTransformRef(ds.Transform.Path))
171-
ds.Transform = t
172-
}
173-
return nil
174-
}
175-
176-
// DerefDatasetMeta derferences a dataset's transform element if required
177-
// should be a no-op if ds.Structure is nil or isn't a reference
178-
func DerefDatasetMeta(ctx context.Context, store qfs.Filesystem, ds *dataset.Dataset) error {
179-
if ds.Meta != nil && ds.Meta.IsEmpty() && ds.Meta.Path != "" {
180-
md, err := loadMeta(ctx, store, ds.Meta.Path)
181-
if err != nil {
182-
log.Debug(err.Error())
183-
return fmt.Errorf("error loading dataset metadata: %s", err.Error())
184-
}
185-
// assign path to retain internal reference to path
186-
// md.Assign(dataset.NewMetaRef(ds.Meta.Path))
187-
ds.Meta = md
188-
}
189-
return nil
190-
}
191-
192-
// DerefDatasetCommit derferences a dataset's Commit element if required
193-
// should be a no-op if ds.Structure is nil or isn't a reference
194-
func DerefDatasetCommit(ctx context.Context, store qfs.Filesystem, ds *dataset.Dataset) error {
195-
if ds.Commit != nil && ds.Commit.IsEmpty() && ds.Commit.Path != "" {
196-
cm, err := loadCommit(ctx, store, ds.Commit.Path)
197-
if err != nil {
198-
log.Debug(err.Error())
199-
return fmt.Errorf("error loading dataset commit: %s", err.Error())
200-
}
201-
// assign path to retain internal reference to path
202-
cm.Assign(dataset.NewCommitRef(ds.Commit.Path))
203-
ds.Commit = cm
109+
if err := DerefStats(ctx, store, ds); err != nil {
110+
return err
204111
}
205-
return nil
112+
return DerefCommit(ctx, store, ds)
206113
}
207114

208115
// SaveSwitches represents options for saving a dataset
@@ -349,6 +256,21 @@ func buildFileGraph(fs qfs.Filesystem, ds *dataset.Dataset, privKey crypto.PrivK
349256
files = append(files, stf)
350257
}
351258

259+
// stats relies on a structure component & a body file
260+
if statsCompFile, ok := bdf.(statsComponentFile); ok {
261+
hook := func(ctx context.Context, f qfs.File, added map[string]string) (io.Reader, error) {
262+
sa, err := statsCompFile.StatsComponent()
263+
if err != nil {
264+
return nil, err
265+
}
266+
ds.Stats = sa
267+
return JSONFile(f.FullPath(), sa)
268+
}
269+
270+
hookFile := qfs.NewWriteHookFile(qfs.NewMemfileBytes(PackageFileStats.Filename(), []byte{}), hook, PackageFileStructure.Filename())
271+
files = append(files, hookFile)
272+
}
273+
352274
if ds.Meta != nil {
353275
ds.Meta.DropTransientValues()
354276
mdf, err := JSONFile(PackageFileMeta.Filename(), ds.Meta)
@@ -454,6 +376,8 @@ func buildFileGraph(fs qfs.Filesystem, ds *dataset.Dataset, privKey crypto.PrivK
454376
ds.Viz = dataset.NewVizRef(pathMap[comp])
455377
case PackageFileMeta.Filename():
456378
ds.Meta = dataset.NewMetaRef(pathMap[comp])
379+
case PackageFileStats.Filename():
380+
ds.Stats = dataset.NewStatsRef(pathMap[comp])
457381
case bdf.FullPath():
458382
ds.BodyPath = pathMap[comp]
459383
}

base/dsfs/dataset_test.go

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ func TestCreateDataset(t *testing.T) {
136136
err string
137137
}{
138138
{"invalid_reference",
139-
"", nil, "error loading dataset commit: error loading commit file: path not found"},
139+
"", nil, "loading dataset commit: loading commit file: path not found"},
140140
{"invalid",
141141
"", nil, "commit is required"},
142142
{"strict_fail",
@@ -175,13 +175,13 @@ func TestCreateDataset(t *testing.T) {
175175
repoFiles int // expected total count of files in repo after test execution
176176
}{
177177
{"cities",
178-
"/mem/QmYUqpRqrxUvtXgJ3NnPafUmyShERR9WaqLojxsumvrYpo", nil, 7},
178+
"/mem/QmPUMUixUxM1e6SKtgMaV7U5kvuy25W8yV4KvkcURd6LFg", nil, 8},
179179
{"all_fields",
180-
"/mem/QmVFJmziXeSsjByztA62dPpeGjLykAerP5uFC26Yj1o5CN", nil, 16},
180+
"/mem/Qmcf46vxtuCsbMV4i9d2ifCJnMjEBHXturUyD2xUD6qrn9", nil, 18},
181181
{"cities_no_commit_title",
182-
"/mem/QmULA7AoxdWjEfrsdCNZgXRNXKJQfsQVrUHKWp1s1K1R6i", nil, 19},
182+
"/mem/QmXFRBAWTBQZVJGZxtaCAsEYKRRQLcKZJhn5UPsQ2LoJLu", nil, 21},
183183
{"craigslist",
184-
"/mem/QmXVLv5BKuP1C5TgmFjxF51q6kbqd75CGrFcUMGutaDENQ", nil, 23},
184+
"/mem/QmWm6rGimuUFXgw9CQ9p3fT3h9mCnAXkPr8PHM1dhJRASm", nil, 26},
185185
}
186186

187187
for _, c := range good {
@@ -204,10 +204,10 @@ func TestCreateDataset(t *testing.T) {
204204

205205
if tc.Expect != nil {
206206
if err := dataset.CompareDatasets(tc.Expect, ds); err != nil {
207-
// expb, _ := json.Marshal(tc.Expect)
208-
// fmt.Println(string(expb))
209-
// dsb, _ := json.Marshal(ds)
210-
// fmt.Println(string(dsb))
207+
expb, _ := json.Marshal(tc.Expect)
208+
fmt.Println(string(expb))
209+
dsb, _ := json.Marshal(ds)
210+
fmt.Println(string(dsb))
211211
t.Errorf("dataset comparison error: %s", err.Error())
212212
}
213213
}
@@ -289,8 +289,8 @@ func TestCreateDataset(t *testing.T) {
289289
t.Fatalf("CreateDataset expected error got 'nil'. commit: %v", ds.Commit)
290290
}
291291

292-
if len(fs.Files) != 23 {
293-
t.Errorf("invalid number of entries: %d != %d", 23, len(fs.Files))
292+
if len(fs.Files) != 26 {
293+
t.Errorf("invalid number of entries: %d != %d", 26, len(fs.Files))
294294
_, err := fs.Print()
295295
if err != nil {
296296
panic(err)
@@ -301,6 +301,17 @@ func TestCreateDataset(t *testing.T) {
301301
// case: previous dataset isn't valid
302302
}
303303

304+
// BaseTabularSchema is the base schema for tabular data
305+
// NOTE: Do not use if possible, prefer github.com/qri-io/dataset/tabular
306+
// TODO(dustmop): Possibly move this to tabular package
307+
var BaseTabularSchema = map[string]interface{}{
308+
"type": "array",
309+
"items": map[string]interface{}{
310+
"type": "array",
311+
"items": []interface{}{},
312+
},
313+
}
314+
304315
// Test that if the body is too large, the commit message just assumes the body changed
305316
func TestCreateDatasetBodyTooLarge(t *testing.T) {
306317
ctx := context.Background()

0 commit comments

Comments
 (0)