From 937d14583b9a4216064249cb40c3ecc4ed9601c0 Mon Sep 17 00:00:00 2001 From: Brendan O'Brien Date: Thu, 14 Nov 2019 14:38:26 -0500 Subject: [PATCH] feat(stats): add histogram, mean, meadian to numeric stat using a default of 10 bins for histogram calculation --- api/testdata/api.snapshot | Bin 204281 -> 205651 bytes go.mod | 1 + go.sum | 17 ++++++ lib/datasets_test.go | 4 +- stats/stats.go | 87 +++++++++++++++------------- stats/stats_test.go | 115 ++++++++++++++++++++++++-------------- 6 files changed, 141 insertions(+), 83 deletions(-) diff --git a/api/testdata/api.snapshot b/api/testdata/api.snapshot index 0dda7f11afb06d7c43f5b7f06ad4cff026135e5f..fddf53475f5866f2511b740e282f222192c4927d 100755 GIT binary patch delta 1440 zcmd6nze~eF6vy$G$Q@aZ!acu!bg)~i1p1P8 zmerC+-}I|>@YYhErkRG|1?<=sr%dr&ms1ma7Gs=Rif8kXQ(G}z!Ko6&W)Q@|=WH8- zxJL3QIoL&rSHxF1QKHSh4)t8V$8 zI=Coj)Y-oi>oGX=xo>9WUWMoV()6Hm0w5s!m0{t5s9 delta 149 zcmcb7jOXWWo`x32Elk1lrkgBa664IwQ?gPpGO#qAerF+*>~u{|CZ6fCirFof!rKupT(*+j+_1#^<#KoMOSTX(d3MON$ g$_~C}(w=_dB$McL0Y)Z)>FZZA@obM;$@H@l0BWHz1^@s6 diff --git a/go.mod b/go.mod index 490af93ad..e07fbcf6f 100644 --- a/go.mod +++ b/go.mod @@ -66,5 +66,6 @@ require ( go.starlark.net v0.0.0-20190528202925-30ae18b8564f golang.org/x/crypto v0.0.0-20190926180335-cea2066c6411 golang.org/x/sys v0.0.0-20190926180325-855e68c8590b + gonum.org/v1/gonum v0.6.0 gopkg.in/yaml.v2 v2.2.2 ) diff --git a/go.sum b/go.sum index 418b86d93..6532cc898 100644 --- a/go.sum +++ b/go.sum @@ -16,6 +16,7 @@ github.com/Stebalien/go-bitfield v0.0.0-20180330043415-076a62f9ce6e/go.mod h1:3o github.com/Stebalien/go-bitfield v0.0.1 h1:X3kbSSPUaJK60wV2hjOPZwmpljr6VGCqdq4cBLhbQBo= github.com/Stebalien/go-bitfield v0.0.1/go.mod h1:GNjFpasyUVkHMsfEOk8EFLJ9syQ6SI+XWrX9Wf2XH0s= github.com/aead/siphash v1.0.1/go.mod h1:Nywa3cDsYNNK3gaciGTWPwHt0wlpNV15vwmswBAUSII= +github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= @@ -89,6 +90,7 @@ github.com/facebookgo/atomicfile v0.0.0-20151019160806-2de1f203e7d5/go.mod h1:Jp github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/fd/go-nat v1.0.0/go.mod h1:BTBu/CKvMmOMUPkKVef1pngt2WFH/lg7E6yQnulfp6E= +github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= @@ -108,6 +110,7 @@ github.com/gogo/protobuf v1.2.1 h1:/s5zKNz0uPFCZ5hddgPdo2TK2TVrUNMn0OOX8/aZMTE= github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= github.com/gogo/protobuf v1.3.0 h1:G8O7TerXerS4F6sx9OV7/nRfJdnXgHZu/S/7F2SN+UE= github.com/gogo/protobuf v1.3.0/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= +github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6 h1:ZgQEtGgCBiWRM39fZuwSd1LwSqqSW0hOdXCYYDX0R3I= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= @@ -336,6 +339,7 @@ github.com/jtolds/gls v4.2.1+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVY github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= +github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= github.com/k0kubun/colorstring v0.0.0-20150214042306-9440f1994b88 h1:uC1QfSlInpQF+M0ao65imhwqKnz3Q2z/d8PWZRMQvDM= github.com/k0kubun/colorstring v0.0.0-20150214042306-9440f1994b88/go.mod h1:3w7q1U84EfirKl04SVQ/s7nPm1ZPhiXd34z40TNz36k= github.com/kami-zh/go-capturer v0.0.0-20171211120116-e492ea43421d/go.mod h1:P2viExyCEfeWGU259JnaQ34Inuec4R38JCyBx2edgD0= @@ -873,7 +877,12 @@ golang.org/x/crypto v0.0.0-20190618222545-ea8f1a30c443/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20190923035154-9ee001bba392/go.mod h1:/lpIB1dKB+9EgE3H3cr1v9wB50oz8l4C4h62xy7jSTY= golang.org/x/crypto v0.0.0-20190926180335-cea2066c6411 h1:kuW9k4QvBJpRjC3rxEytsfIYPs8oGY3Jw7iR36h0FIY= golang.org/x/crypto v0.0.0-20190926180335-cea2066c6411/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2 h1:y102fOLFqhV41b+4GPiJoa0k/x+pJcEi2/HB1Y5T6fU= +golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= @@ -929,15 +938,22 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181130052023-1c3d964395ce/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7 h1:9zdDQZ7Thm29KFXgAX/+yaf3eVbP7djjWp/dXAppNCc= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= +gonum.org/v1/gonum v0.6.0 h1:DJy6UzXbahnGUf1ujUNkh/NEtK14qMo2nvlBPs4U5yw= +gonum.org/v1/gonum v0.6.0/go.mod h1:9mxDZsDKxgMAuccQkewq682L+0eCu4dCN2yonUJTCLU= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= +gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= @@ -962,3 +978,4 @@ gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/lib/datasets_test.go b/lib/datasets_test.go index 4be19fb79..520c7ac9d 100644 --- a/lib/datasets_test.go +++ b/lib/datasets_test.go @@ -985,8 +985,8 @@ func TestDatasetRequestsStats(t *testing.T) { ref string expected []byte }{ - {"csv: me/cities", "me/cities", []byte(`[{"count":5,"maxLength":8,"minLength":7,"type":"string","unique":5},{"count":5,"max":40000000,"min":35000,"type":"numeric","unique":5},{"count":5,"frequencies":{"44.4":2},"max":65.25,"min":44.4,"type":"numeric","unique":3},{"count":5,"falseCount":1,"trueCount":4,"type":"boolean"}]`)}, - {"json: me/sitemap", "me/sitemap", []byte(`[{"count":10,"key":"contentLength","max":40079,"min":24515,"type":"numeric","unique":10},{"count":10,"frequencies":{"text/html; charset=utf-8":10},"key":"contentSniff","maxLength":24,"minLength":24,"type":"string"},{"count":10,"frequencies":{"text/html; charset=utf-8":10},"key":"contentType","maxLength":24,"minLength":24,"type":"string"},{"count":10,"key":"duration","max":4081577841,"min":74291866,"type":"numeric","unique":10},{"count":10,"key":"hash","maxLength":68,"minLength":68,"type":"string","unique":10},{"key":"links","type":"array","values":[{"count":10,"maxLength":58,"minLength":14,"unique":10},{"count":10,"maxLength":115,"minLength":19,"unique":10},{"count":10,"maxLength":68,"minLength":22,"unique":10},{"count":10,"maxLength":115,"minLength":14,"unique":10},{"count":9,"maxLength":70,"minLength":15,"unique":9},{"count":9,"maxLength":115,"minLength":37,"unique":9},{"count":9,"maxLength":52,"minLength":15,"unique":9},{"count":9,"maxLength":75,"minLength":19,"unique":9},{"count":9,"maxLength":66,"minLength":15,"unique":9},{"count":7,"maxLength":75,"minLength":19,"unique":7},{"count":7,"maxLength":66,"minLength":22,"unique":7},{"count":6,"maxLength":43,"minLength":19,"unique":6},{"count":6,"maxLength":77,"minLength":14,"unique":6},{"count":6,"maxLength":77,"minLength":21,"unique":6},{"count":4,"maxLength":43,"minLength":14,"unique":4},{"count":3,"maxLength":32,"minLength":21,"unique":3},{"count":3,"maxLength":42,"minLength":19,"unique":3},{"count":3,"maxLength":66,"minLength":32,"unique":3},{"count":3,"maxLength":46,"minLength":19,"unique":3},{"count":2,"maxLength":66,"minLength":22,"unique":2},{"count":2,"maxLength":32,"minLength":23,"unique":2},{"count":2,"maxLength":33,"minLength":22,"unique":2},{"count":2,"maxLength":32,"minLength":27,"unique":2},{"count":1,"maxLength":33,"minLength":33,"unique":1},{"count":1,"maxLength":27,"minLength":27,"unique":1}]},{"count":1,"key":"redirectTo","maxLength":18,"minLength":18,"type":"string","unique":1},{"count":11,"frequencies":{"200":10},"key":"status","max":301,"min":200,"type":"numeric","unique":1},{"count":11,"key":"timestamp","maxLength":35,"minLength":35,"type":"string","unique":11},{"count":10,"key":"title","maxLength":88,"minLength":53,"type":"string","unique":10},{"count":11,"key":"url","maxLength":78,"minLength":18,"type":"string","unique":11}]`)}, + {"csv: me/cities", "me/cities", []byte(`[{"count":5,"maxLength":8,"minLength":7,"type":"string","unique":5},{"count":5,"histogram":{"bins":[35000,4031500.1,8028000.2,12024500.3,16021000.4,20017500.5,24014000.6,28010500.7,32007000.8,36003500.9,40000001],"frequencies":[3,0,1,0,0,0,0,0,0,1]},"max":40000000,"mean":9817000,"median":300000,"min":35000,"type":"numeric"},{"count":5,"histogram":{"bins":[44.4,46.585,48.769999999999996,50.955,53.14,55.325,57.51,59.695,61.879999999999995,64.065,66.25],"frequencies":[2,0,1,0,0,1,0,0,0,1]},"max":65.25,"mean":52.04,"median":44.4,"min":44.4,"type":"numeric"},{"count":5,"falseCount":1,"trueCount":4,"type":"boolean"}]`)}, + {"json: me/sitemap", "me/sitemap", []byte(`[{"count":10,"histogram":{"bins":[24515,26071.5,27628,29184.5,30741,32297.5,33854,35410.5,36967,38523.5,40080],"frequencies":[4,0,3,1,0,0,1,0,0,1]},"key":"contentLength","max":40079,"mean":28825.8,"median":40079,"min":24515,"type":"numeric"},{"count":10,"frequencies":{"text/html; charset=utf-8":10},"key":"contentSniff","maxLength":24,"minLength":24,"type":"string"},{"count":10,"frequencies":{"text/html; charset=utf-8":10},"key":"contentType","maxLength":24,"minLength":24,"type":"string"},{"count":10,"histogram":{"bins":[74291866,475020463.6,875749061.2,1276477658.8000002,1677206256.4,2077934854,2478663451.6000004,2879392049.2000003,3280120646.8,3680849244.4,4081577842],"frequencies":[2,0,0,0,0,0,0,0,0,8]},"key":"duration","max":4081577841,"mean":3276899953.4,"median":89911449,"min":74291866,"type":"numeric"},{"count":10,"key":"hash","maxLength":68,"minLength":68,"type":"string","unique":10},{"key":"links","type":"array","values":[{"count":10,"maxLength":58,"minLength":14,"unique":10},{"count":10,"maxLength":115,"minLength":19,"unique":10},{"count":10,"maxLength":68,"minLength":22,"unique":10},{"count":10,"maxLength":115,"minLength":14,"unique":10},{"count":9,"maxLength":70,"minLength":15,"unique":9},{"count":9,"maxLength":115,"minLength":37,"unique":9},{"count":9,"maxLength":52,"minLength":15,"unique":9},{"count":9,"maxLength":75,"minLength":19,"unique":9},{"count":9,"maxLength":66,"minLength":15,"unique":9},{"count":7,"maxLength":75,"minLength":19,"unique":7},{"count":7,"maxLength":66,"minLength":22,"unique":7},{"count":6,"maxLength":43,"minLength":19,"unique":6},{"count":6,"maxLength":77,"minLength":14,"unique":6},{"count":6,"maxLength":77,"minLength":21,"unique":6},{"count":4,"maxLength":43,"minLength":14,"unique":4},{"count":3,"maxLength":32,"minLength":21,"unique":3},{"count":3,"maxLength":42,"minLength":19,"unique":3},{"count":3,"maxLength":66,"minLength":32,"unique":3},{"count":3,"maxLength":46,"minLength":19,"unique":3},{"count":2,"maxLength":66,"minLength":22,"unique":2},{"count":2,"maxLength":32,"minLength":23,"unique":2},{"count":2,"maxLength":33,"minLength":22,"unique":2},{"count":2,"maxLength":32,"minLength":27,"unique":2},{"count":1,"maxLength":33,"minLength":33,"unique":1},{"count":1,"maxLength":27,"minLength":27,"unique":1}]},{"count":1,"key":"redirectTo","maxLength":18,"minLength":18,"type":"string","unique":1},{"count":11,"histogram":{"bins":[200,210.2,220.4,230.6,240.8,251,261.2,271.4,281.6,291.8,302],"frequencies":[10,0,0,0,0,0,0,0,0,1]},"key":"status","max":301,"mean":209.1818181818182,"median":200,"min":200,"type":"numeric"},{"count":11,"key":"timestamp","maxLength":35,"minLength":35,"type":"string","unique":11},{"count":10,"key":"title","maxLength":88,"minLength":53,"type":"string","unique":10},{"count":11,"key":"url","maxLength":78,"minLength":18,"type":"string","unique":11}]`)}, } for i, c := range goodCases { res := &StatsResponse{} diff --git a/stats/stats.go b/stats/stats.go index b160ee7fe..72b321174 100644 --- a/stats/stats.go +++ b/stats/stats.go @@ -8,11 +8,12 @@ import ( "fmt" "io" "sort" - "strconv" logger "github.com/ipfs/go-log" "github.com/qri-io/dataset" "github.com/qri-io/dataset/dsio" + gonumfloats "gonum.org/v1/gonum/floats" + gonumstat "gonum.org/v1/gonum/stat" ) var ( @@ -314,27 +315,34 @@ func (acc *arrayAcc) Close() { } } -const maxUint = ^uint(0) -const maxInt = int(maxUint >> 1) -const minInt = -maxInt - 1 +const ( + maxUint = ^uint(0) + maxFloat = float64(maxUint >> 1) + maxInt = int(maxUint >> 1) + minInt = -maxInt - 1 +) type numericAcc struct { - typ string - count int - min float64 - max float64 - unique int - frequencies map[float64]int + typ string + count int + min float64 + max float64 + mean float64 + median float64 + dividers []float64 + histogram []float64 } var _ accumulator = (*numericAcc)(nil) func newNumericAcc(typ string) *numericAcc { return &numericAcc{ - typ: typ, - max: float64(minInt), - min: float64(maxInt), - frequencies: map[float64]int{}, + typ: typ, + max: float64(minInt), + min: float64(maxInt), + median: maxFloat, + // use histogram to accumulate values + histogram: make([]float64, 0, StopFreqCountThreshold*100), } } @@ -359,13 +367,14 @@ func (acc *numericAcc) Write(e dsio.Entry) { return } - if acc.frequencies != nil { - acc.frequencies[v]++ - if len(acc.frequencies) >= StopFreqCountThreshold { - acc.frequencies = nil + if acc.histogram != nil { + acc.histogram = append(acc.histogram, v) + if len(acc.histogram) == StopFreqCountThreshold*100 { + acc.histogram = nil } } + acc.mean += v acc.count++ if v > acc.max { acc.max = v @@ -383,23 +392,21 @@ func (acc *numericAcc) Map() map[string]interface{} { return map[string]interface{}{"count": 0} } m := map[string]interface{}{ + "mean": acc.mean, "count": acc.count, "min": acc.min, "max": acc.max, } - if acc.unique != 0 { - m["unique"] = acc.unique + if acc.median != maxFloat { + m["median"] = acc.median } - if acc.frequencies != nil { - // need to convert keys to strings b/c many serialization formats aren't - // down with numeric map keys - strFrq := map[string]int{} - for fl, freq := range acc.frequencies { - strFrq[strconv.FormatFloat(fl, 'f', -1, 64)] = freq + if acc.histogram != nil { + m["histogram"] = map[string][]float64{ + "bins": acc.dividers, + "frequencies": acc.histogram, } - m["frequencies"] = strFrq } return m @@ -407,17 +414,21 @@ func (acc *numericAcc) Map() map[string]interface{} { // Close finalizes the accumulator func (acc *numericAcc) Close() { - if acc.frequencies != nil { - // determine unique values - for key, freq := range acc.frequencies { - if freq == 1 { - acc.unique++ - delete(acc.frequencies, key) - } - } - if len(acc.frequencies) == 0 { - acc.frequencies = nil - } + // finalize avg + acc.mean = acc.mean / float64(acc.count) + + if len(acc.histogram) > 0 { + acc.median = acc.histogram[len(acc.histogram)/2] + + sort.Float64Slice(acc.histogram).Sort() + // turn values into a histogram + nBins := 10 + acc.dividers = make([]float64, nBins+1) + // Increase the maximum divider so that the maximum value of x is contained + // within the last bucket. + gonumfloats.Span(acc.dividers, acc.min, acc.max+1) + // Span includes the min and the max. Trim the dividers to create 10 buckets + acc.histogram = gonumstat.Histogram(nil, acc.dividers, acc.histogram, nil) } } diff --git a/stats/stats_test.go b/stats/stats_test.go index 6d5a67855..915f7688b 100644 --- a/stats/stats_test.go +++ b/stats/stats_test.go @@ -60,22 +60,30 @@ func TestAllTypesIdentitySchemaArray(t *testing.T) { "type": "boolean", }, { - "key": "float", - "count": 5, - "min": float64(1.1), - "max": float64(5.5), - "type": "numeric", - "unique": 3, - "frequencies": map[string]int{"1.1": 2}, + "key": "float", + "count": 5, + "min": float64(1.1), + "max": float64(5.5), + "mean": float64(3.08), + "median": float64(3.3), + "type": "numeric", + "histogram": map[string][]float64{ + "bins": {1.1, 1.6400000000000001, 2.18, 2.72, 3.2600000000000002, 3.8000000000000003, 4.34, 4.880000000000001, 5.42, 5.960000000000001, 6.5}, + "frequencies": {2, 0, 0, 0, 1, 0, 1, 0, 1, 0}, + }, }, { - "key": "int", - "count": 5, - "min": float64(1), - "max": float64(5), - "type": "numeric", - "unique": 3, - "frequencies": map[string]int{"1": 2}, + "key": "int", + "count": 5, + "min": float64(1), + "max": float64(5), + "mean": float64(2.8), + "median": float64(3), + "type": "numeric", + "histogram": map[string][]float64{ + "bins": {1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5, 6}, + "frequencies": {2, 0, 0, 0, 1, 0, 1, 0, 1, 0}, + }, }, { "key": "nil", @@ -102,28 +110,36 @@ func TestAllTypesIdentitySchemaObject(t *testing.T) { "all types identity schema object of array entries", `{"type":"object"}`, `{ - "a" : [1,1.1,null,false,"a"], - "b" : [1,2.2,null,true,"aa"], + "a" : [5,1.1,null,false,"a"], + "b" : [4,2.2,null,true,"aa"], "c" : [3,2.2,null,false,"aaa"], - "d" : [4,4.4,null,true,"aaa"], - "e" : [5,5.5,null,false,"aaaaa"] + "d" : [1,4.4,null,true,"aaa"], + "e" : [1,5.5,null,false,"aaaaa"] }`, []map[string]interface{}{ { - "count": 5, - "min": float64(1), - "max": float64(5), - "type": "numeric", - "unique": 3, - "frequencies": map[string]int{"1": 2}, + "count": 5, + "min": float64(1), + "max": float64(5), + "mean": float64(2.8), + "median": float64(3), + "type": "numeric", + "histogram": map[string][]float64{ + "bins": {1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5, 6}, + "frequencies": {2, 0, 0, 0, 1, 0, 1, 0, 1, 0}, + }, }, { - "count": 5, - "min": float64(1.1), - "max": float64(5.5), - "type": "numeric", - "unique": 3, - "frequencies": map[string]int{"2.2": 2}, + "count": 5, + "min": float64(1.1), + "max": float64(5.5), + "mean": float64(3.08), + "median": float64(2.2), + "type": "numeric", + "histogram": map[string][]float64{ + "bins": {1.1, 1.6400000000000001, 2.18, 2.72, 3.2600000000000002, 3.8000000000000003, 4.34, 4.880000000000001, 5.42, 5.960000000000001, 6.5}, + "frequencies": {1, 0, 2, 0, 0, 0, 1, 0, 1, 0}, + }, }, { "count": 5, @@ -173,11 +189,17 @@ func TestFreqThreshold(t *testing.T) { "frequencies": map[string]int{"abcdefghijk": 5}, }, { - "count": 5, - "min": float64(1), - "max": float64(1), - "type": "numeric", - "frequencies": map[string]int{"1": 5}, + "count": 5, + "min": float64(1), + "max": float64(1), + "mean": float64(1), + "median": float64(1), + // currently we're calculating historams at 100x the stop threshold, so this shows up + "histogram": map[string][]float64{ + "bins": {1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7000000000000002, 1.8, 1.9, 2}, + "frequencies": {5, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + }, + "type": "numeric", }, }, } @@ -200,10 +222,17 @@ func TestFreqThreshold(t *testing.T) { "type": "string", }, { - "count": 5, - "min": float64(1), - "max": float64(5), - "type": "numeric", + "count": 5, + "min": float64(1), + "max": float64(5), + "mean": float64(3), + "median": float64(3), + // currently we're calculating historams at 100x the stop threshold, so this shows up + "histogram": map[string][]float64{ + "bins": {1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5, 6}, + "frequencies": {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}, + }, + "type": "numeric", }, }, } @@ -336,7 +365,7 @@ func TestJSON(t *testing.T) { {"int": 4, "float": 4.4, "nil": null, "bool": true, "string": "aaa"}, {"int": 5, "float": 5.5, "nil": null, "bool": false, "string": "aaaaa"} ]`, - []byte(`[{"count":5,"falseCount":3,"key":"bool","trueCount":2,"type":"boolean"},{"count":5,"frequencies":{"1.1":2},"key":"float","max":5.5,"min":1.1,"type":"numeric","unique":3},{"count":5,"frequencies":{"1":2},"key":"int","max":5,"min":1,"type":"numeric","unique":3},{"count":5,"key":"nil","type":"null"},{"count":5,"frequencies":{"aaa":2},"key":"string","maxLength":5,"minLength":1,"type":"string","unique":3}]`), + []byte(`[{"count":5,"falseCount":3,"key":"bool","trueCount":2,"type":"boolean"},{"count":5,"histogram":{"bins":[1.1,1.6400000000000001,2.18,2.72,3.2600000000000002,3.8000000000000003,4.34,4.880000000000001,5.42,5.960000000000001,6.5],"frequencies":[2,0,0,0,1,0,1,0,1,0]},"key":"float","max":5.5,"mean":3.08,"median":3.3,"min":1.1,"type":"numeric"},{"count":5,"histogram":{"bins":[1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6],"frequencies":[2,0,0,0,1,0,1,0,1,0]},"key":"int","max":5,"mean":2.8,"median":3,"min":1,"type":"numeric"},{"count":5,"key":"nil","type":"null"},{"count":5,"frequencies":{"aaa":2},"key":"string","maxLength":5,"minLength":1,"type":"string","unique":3}]`), }, { "csv: an array of strings", "csv", @@ -375,7 +404,7 @@ func TestJSON(t *testing.T) { "type": "array" }`, "1,1.1,,false,a\n1,1.1,,true,aa\n3,3.3,,false,aaa\n4,4.4,,true,aaa\n5,5.5,,false,aaaaa", - []byte(`[{"count":5,"frequencies":{"1":2},"max":5,"min":1,"type":"numeric","unique":3},{"count":5,"frequencies":{"1.1":2},"max":5.5,"min":1.1,"type":"numeric","unique":3},{"count":5,"type":"null"},{"count":5,"falseCount":3,"trueCount":2,"type":"boolean"},{"count":5,"frequencies":{"aaa":2},"maxLength":5,"minLength":1,"type":"string","unique":3}]`), + []byte(`[{"count":5,"histogram":{"bins":[1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6],"frequencies":[2,0,0,0,1,0,1,0,1,0]},"max":5,"mean":2.8,"median":3,"min":1,"type":"numeric"},{"count":5,"histogram":{"bins":[1.1,1.6400000000000001,2.18,2.72,3.2600000000000002,3.8000000000000003,4.34,4.880000000000001,5.42,5.960000000000001,6.5],"frequencies":[2,0,0,0,1,0,1,0,1,0]},"max":5.5,"mean":3.08,"median":3.3,"min":1.1,"type":"numeric"},{"count":5,"type":"null"},{"count":5,"falseCount":3,"trueCount":2,"type":"boolean"},{"count":5,"frequencies":{"aaa":2},"maxLength":5,"minLength":1,"type":"string","unique":3}]`), }, { "json: all types identity schema object of array entries", "json", @@ -387,7 +416,7 @@ func TestJSON(t *testing.T) { "d" : [4,4.4,null,true,"aaa"], "e" : [5,5.5,null,false,"aaaaa"] }`, - []byte(`[{"count":5,"frequencies":{"1":2},"max":5,"min":1,"type":"numeric","unique":3},{"count":5,"frequencies":{"2.2":2},"max":5.5,"min":1.1,"type":"numeric","unique":3},{"count":5,"type":"null"},{"count":5,"falseCount":3,"trueCount":2,"type":"boolean"},{"count":5,"frequencies":{"aaa":2},"maxLength":5,"minLength":1,"type":"string","unique":3}]`), + []byte(`[{"count":5,"histogram":{"bins":[1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6],"frequencies":[2,0,0,0,1,0,1,0,1,0]},"max":5,"mean":2.8,"median":3,"min":1,"type":"numeric"},{"count":5,"histogram":{"bins":[1.1,1.6400000000000001,2.18,2.72,3.2600000000000002,3.8000000000000003,4.34,4.880000000000001,5.42,5.960000000000001,6.5],"frequencies":[1,0,2,0,0,0,1,0,1,0]},"max":5.5,"mean":3.08,"median":2.2,"min":1.1,"type":"numeric"},{"count":5,"type":"null"},{"count":5,"falseCount":3,"trueCount":2,"type":"boolean"},{"count":5,"frequencies":{"aaa":2},"maxLength":5,"minLength":1,"type":"string","unique":3}]`), }, { "json: array of object of array of strings", "json", @@ -397,7 +426,7 @@ func TestJSON(t *testing.T) { {"ids": [1,2,3,4,5,6] }, {"ids": ["b",20,"c"] } ]`, - []byte(`[{"key":"ids","type":"array","values":[{"count":2,"maxLength":1,"minLength":1,"unique":2},{"count":1,"maxLength":1,"minLength":1,"unique":1},{"count":2,"frequencies":{"c":2},"maxLength":1,"minLength":1},{"count":1,"max":4,"min":4,"unique":1},{"count":1,"max":5,"min":5,"unique":1},{"count":1,"max":6,"min":6,"unique":1}]},{"count":1,"falseCount":0,"key":"is_great","trueCount":1,"type":"boolean"}]`), + []byte(`[{"key":"ids","type":"array","values":[{"count":2,"maxLength":1,"minLength":1,"unique":2},{"count":1,"maxLength":1,"minLength":1,"unique":1},{"count":2,"frequencies":{"c":2},"maxLength":1,"minLength":1},{"count":1,"histogram":{"bins":[4,4.1,4.2,4.3,4.4,4.5,4.6,4.7,4.8,4.9,5],"frequencies":[1,0,0,0,0,0,0,0,0,0]},"max":4,"mean":4,"median":4,"min":4},{"count":1,"histogram":{"bins":[5,5.1,5.2,5.3,5.4,5.5,5.6,5.7,5.8,5.9,6],"frequencies":[1,0,0,0,0,0,0,0,0,0]},"max":5,"mean":5,"median":5,"min":5},{"count":1,"histogram":{"bins":[6,6.1,6.2,6.3,6.4,6.5,6.6,6.7,6.8,6.9,7],"frequencies":[1,0,0,0,0,0,0,0,0,0]},"max":6,"mean":6,"median":6,"min":6}]},{"count":1,"falseCount":0,"key":"is_great","trueCount":1,"type":"boolean"}]`), }, } for i, c := range goodCases {