Skip to content

Commit

Permalink
fix(stats): int was not capturing all integer options, add int32
Browse files Browse the repository at this point in the history
…or `int64`

When switching on CSV entry reader value types, the `int` case wasn't matching every integer-typed value (such as `int32` or `int64`). This was causing any integers in CSVs to come back as null.

This fix adds `int32` and `int64` cases to the type switches in `newAccumulator()` and `numericAcc.Write()`.

Also adds tests for CSV, including one with a full JSON schema, since it looks like our CSV readers don't recognize "null", "NULL", or an empty field as null.
  • Loading branch information
ramfox committed Nov 8, 2019
1 parent a8646f4 commit e00791f
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 33 deletions.
6 changes: 5 additions & 1 deletion stats/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ func newAccumulator(val interface{}) accumulator {
return &nullAcc{}
case float64, float32:
return newNumericAcc("number")
case int:
case int, int32, int64:
return newNumericAcc("integer")
case string:
return newStringAcc()
Expand Down Expand Up @@ -342,6 +342,10 @@ func (acc *numericAcc) Write(e dsio.Entry) {
switch x := e.Value.(type) {
case int:
v = float64(x)
case int32:
v = float64(x)
case int64:
v = float64(x)
case float32:
v = float64(x)
case float64:
Expand Down
109 changes: 77 additions & 32 deletions stats/stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package stats
import (
"context"
"encoding/json"
"fmt"
"io/ioutil"
"strings"
"testing"
Expand Down Expand Up @@ -314,15 +313,22 @@ func TestJSON(t *testing.T) {
}
}

goodCases := []*TestCase{
&TestCase{
"an array of strings",
goodCases := []struct {
Description string
Format string
Schema string
Input string
Expect []byte
}{
{
"json: an array of strings",
"json",
`{"type":"array"}`,
`["a","a","bb","ccc","dddd"]`,
`[{"count":5,"frequencies":{"a":2},"maxLength":4,"minLength":1,"type":"string","unique":3}]`,
},
&TestCase{
"all types identity schema array of object entries",
[]byte(`[{"count":5,"frequencies":{"a":2},"maxLength":4,"minLength":1,"type":"string","unique":3}]`),
}, {
"json: all types identity schema array of object entries",
"json",
`{"type":"array"}`,
`[
{"int": 1, "float": 1.1, "nil": null, "bool": false, "string": "a"},
Expand All @@ -331,44 +337,84 @@ func TestJSON(t *testing.T) {
{"int": 4, "float": 4.4, "nil": null, "bool": true, "string": "aaa"},
{"int": 5, "float": 5.5, "nil": null, "bool": false, "string": "aaaaa"}
]`,
`[{"count":5,"falseCount":3,"key":"bool","trueCount":2,"type":"boolean"},{"count":5,"frequencies":{"1.1":2},"key":"float","max":5.5,"min":1.1,"type":"numeric","unique":3},{"count":5,"frequencies":{"1":2},"key":"int","max":5,"min":1,"type":"numeric","unique":3},{"count":5,"key":"nil","type":"null"},{"count":5,"frequencies":{"aaa":2},"key":"string","maxLength":5,"minLength":1,"type":"string","unique":3}]`,
},
&TestCase{
"all types identity schema object of array entries",
[]byte(`[{"count":5,"falseCount":3,"key":"bool","trueCount":2,"type":"boolean"},{"count":5,"frequencies":{"1.1":2},"key":"float","max":5.5,"min":1.1,"type":"numeric","unique":3},{"count":5,"frequencies":{"1":2},"key":"int","max":5,"min":1,"type":"numeric","unique":3},{"count":5,"key":"nil","type":"null"},{"count":5,"frequencies":{"aaa":2},"key":"string","maxLength":5,"minLength":1,"type":"string","unique":3}]`),
}, {
"csv: an array of strings",
"csv",
`{"type":"array"}`,
"a\na\nbb\nccc\ndddd",
[]byte(`[{"count":5,"frequencies":{"a":2},"maxLength":4,"minLength":1,"type":"string","unique":3}]`),
}, {
"csv: all types identity schema array of object entries",
"csv",
`{
"items": {
"items": [
{
"title": "int",
"type": "integer"
},
{
"title": "float",
"type": "number"
},
{
"title": "nil",
"type": "null"
},
{
"title": "bool",
"type": "boolean"
},
{
"title": "string",
"type": "string"
}
],
"type": "array"
},
"type": "array"
}`,
"1,1.1,,false,a\n1,1.1,,true,aa\n3,3.3,,false,aaa\n4,4.4,,true,aaa\n5,5.5,,false,aaaaa",
[]byte(`[{"count":5,"frequencies":{"1":2},"max":5,"min":1,"type":"numeric","unique":3},{"count":5,"frequencies":{"1.1":2},"max":5.5,"min":1.1,"type":"numeric","unique":3},{"count":5,"type":"null"},{"count":5,"falseCount":3,"trueCount":2,"type":"boolean"},{"count":5,"frequencies":{"aaa":2},"maxLength":5,"minLength":1,"type":"string","unique":3}]`),
}, {
"json: all types identity schema object of array entries",
"json",
`{"type":"object"}`,
`{
"a" : [1,1.1,null,false,"a"],
"b" : [1,2.2,null,true,"aa"],
"c" : [3,2.2,null,false,"aaa"],
"d" : [4,4.4,null,true,"aaa"],
"e" : [5,5.5,null,false,"aaaaa"]
}`,
`[{"count":5,"frequencies":{"1":2},"max":5,"min":1,"type":"numeric","unique":3},{"count":5,"frequencies":{"2.2":2},"max":5.5,"min":1.1,"type":"numeric","unique":3},{"count":5,"type":"null"},{"count":5,"falseCount":3,"trueCount":2,"type":"boolean"},{"count":5,"frequencies":{"aaa":2},"maxLength":5,"minLength":1,"type":"string","unique":3}]`,
},
&TestCase{
"array of object of array of strings",
"a" : [1,1.1,null,false,"a"],
"b" : [1,2.2,null,true,"aa"],
"c" : [3,2.2,null,false,"aaa"],
"d" : [4,4.4,null,true,"aaa"],
"e" : [5,5.5,null,false,"aaaaa"]
}`,
[]byte(`[{"count":5,"frequencies":{"1":2},"max":5,"min":1,"type":"numeric","unique":3},{"count":5,"frequencies":{"2.2":2},"max":5.5,"min":1.1,"type":"numeric","unique":3},{"count":5,"type":"null"},{"count":5,"falseCount":3,"trueCount":2,"type":"boolean"},{"count":5,"frequencies":{"aaa":2},"maxLength":5,"minLength":1,"type":"string","unique":3}]`),
}, {
"json: array of object of array of strings",
"json",
`{"type":"array"}`,
`[
{"ids": ["a","b","c"], "is_great": true },
{"ids": [1,2,3,4,5,6] },
{"ids": ["b",20,"c"] }
]`,
`[{"key":"ids","type":"array","values":[{"count":2,"maxLength":1,"minLength":1,"unique":2},{"count":1,"maxLength":1,"minLength":1,"unique":1},{"count":2,"frequencies":{"c":2},"maxLength":1,"minLength":1},{"count":1,"max":4,"min":4,"unique":1},{"count":1,"max":5,"min":5,"unique":1},{"count":1,"max":6,"min":6,"unique":1}]},{"count":1,"falseCount":0,"key":"is_great","trueCount":1,"type":"boolean"}]`,
{"ids": ["a","b","c"], "is_great": true },
{"ids": [1,2,3,4,5,6] },
{"ids": ["b",20,"c"] }
]`,
[]byte(`[{"key":"ids","type":"array","values":[{"count":2,"maxLength":1,"minLength":1,"unique":2},{"count":1,"maxLength":1,"minLength":1,"unique":1},{"count":2,"frequencies":{"c":2},"maxLength":1,"minLength":1},{"count":1,"max":4,"min":4,"unique":1},{"count":1,"max":5,"min":5,"unique":1},{"count":1,"max":6,"min":6,"unique":1}]},{"count":1,"falseCount":0,"key":"is_great","trueCount":1,"type":"boolean"}]`),
},
}
for i, c := range goodCases {
var sch map[string]interface{}
if err := json.Unmarshal([]byte(c.JSONSchema), &sch); err != nil {
if err := json.Unmarshal([]byte(c.Schema), &sch); err != nil {
t.Errorf("%d. %s error decoding schema: %s", i, c.Description, err)
continue
}
st := &dataset.Structure{
Format: "json",
Format: c.Format,
Schema: sch,
}
ds := &dataset.Dataset{Path: "path", Structure: st}
bodyFile := qfs.NewMemfileBytes("bodyfile", []byte(c.JSONInput))
bodyFile := qfs.NewMemfileBytes("bodyfile", []byte(c.Input))
ds.SetBodyFile(bodyFile)

s := New(nil)
r, err := s.JSON(ctx, ds)
if err != nil {
Expand All @@ -378,9 +424,8 @@ func TestJSON(t *testing.T) {
if err != nil {
t.Errorf("%d. %s unexpected read error: %s", i, c.Description, err)
}
if diff := cmp.Diff([]byte(fmt.Sprintf("%v", c.Expect)), got); diff != "" {
if diff := cmp.Diff(c.Expect, got); diff != "" {
t.Errorf("%d. '%s' result mismatch (-want +got):%s\n", i, c.Description, diff)
}

}
}

0 comments on commit e00791f

Please sign in to comment.