-
Notifications
You must be signed in to change notification settings - Fork 599
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement bucket and histogram numeric expr transforms, category synthetic type

Per #33 and #34. Author: Wes McKinney <wes@cloudera.com>. Closes #242 from wesm/bucket-transform and squashes the following commits:
- ee037d6 [Wes McKinney] Slightly better label docs and move to analytics module
- abb8797 [Wes McKinney] Add error checking for number of buckets
- 2bf3b51 [Wes McKinney] Implement label method for CategoryValue
- c3609b3 [Wes McKinney] Casting bucket category to int32 is a noop
- 9b3b971 [Wes McKinney] Handle bucket edge cases and no-bucket under/over case
- 3abdb9b [Wes McKinney] Fix list repr interactive mode bug and tweak histogram base to avoid some FP error issues
- 8dac292 [Wes McKinney] Initial histogram implementation, but interactive mode repr problems
- dda0475 [Wes McKinney] Fix category type repr
- f0404e3 [Wes McKinney] More exhaustive bucket test cases, and move dimension creation to translate_expr code path
- cb90310 [Wes McKinney] Preliminary bucket implementation
- 013a5b9 [Wes McKinney] Implement basic category type and bucket and histogram APIs
- Loading branch information
Showing
14 changed files
with
755 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
# Copyright 2015 Cloudera Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
import ibis.expr.types as ir | ||
import ibis.expr.operations as ops | ||
|
||
|
||
class BucketLike(ir.ValueNode):
    """
    Shared base for value nodes that turn a numeric expression into
    categories.

    Subclasses report how many categories they produce through
    ``nbuckets``; ``output_type`` builds the result type from that count.
    """

    def _validate_closed(self, closed):
        """
        Normalize the ``closed`` option.

        Parameters
        ----------
        closed : string
            Compared case-insensitively against 'left' and 'right'.

        Returns
        -------
        side : string
            The lowercased value, either 'left' or 'right'.

        Raises
        ------
        ValueError
            If the lowercased value is neither 'left' nor 'right'.
        """
        side = closed.lower()
        if side in ('left', 'right'):
            return side
        raise ValueError("closed must be 'left' or 'right'")

    @property
    def nbuckets(self):
        # The base class does not know its category count; subclasses with
        # a known count override this property.
        return None

    def output_type(self):
        """
        Return the array constructor of a category type sized by
        ``nbuckets``.
        """
        return ir.CategoryType(self.nbuckets).array_ctor()
|
||
|
||
class Bucket(BucketLike):
    """
    Node assigning each value of ``arg`` to a bucket delimited by explicit
    edges.

    Parameters
    ----------
    arg : expression to bucket
    buckets : sized sequence of bucket edges
        Must contain at least one edge. With exactly one edge, both
        ``include_under`` and ``include_over`` must be true.
    closed : {'left', 'right'}, default 'left'
        Validated case-insensitively and stored lowercased.
    close_extreme : boolean, default True
        Coerced to bool and stored on the node.
    include_under : boolean, default False
        Coerced to bool; adds one to ``nbuckets`` when true.
    include_over : boolean, default False
        Coerced to bool; adds one to ``nbuckets`` when true.

    Raises
    ------
    ValueError
        On an invalid ``closed`` value, an empty ``buckets``, or a single
        edge without both ``include_under`` and ``include_over``.
    """

    def __init__(self, arg, buckets, closed='left', close_extreme=True,
                 include_under=False, include_over=False):
        self.arg = arg
        self.buckets = buckets
        # ``closed`` is checked first so that its error takes precedence
        # over the edge-count errors below.
        self.closed = self._validate_closed(closed)

        self.close_extreme = bool(close_extreme)
        self.include_over = bool(include_over)
        self.include_under = bool(include_under)

        n_edges = len(buckets)
        if n_edges == 0:
            raise ValueError('Must be at least one bucket edge')
        if n_edges == 1 and not (self.include_under and self.include_over):
            # A lone edge delimits no interval by itself, so categories can
            # only come from the under/over buckets.
            raise ValueError('If one bucket edge provided, must have'
                             ' include_under=True and include_over=True')

        node_args = [self.arg, self.buckets, self.closed,
                     self.close_extreme,
                     self.include_under,
                     self.include_over]
        ir.ValueNode.__init__(self, node_args)

    @property
    def nbuckets(self):
        """
        Number of categories: one per pair of adjacent edges, plus one for
        each of ``include_over`` / ``include_under`` that is set.
        """
        extra = int(self.include_over) + int(self.include_under)
        return len(self.buckets) - 1 + extra
|
||
|
||
class Histogram(BucketLike):
    """
    Node describing a histogram transform of ``arg``.

    Exactly one of ``nbins`` and ``binwidth`` must be given (not None).

    Parameters
    ----------
    arg : expression to histogram
    nbins : int or None
    binwidth : number or None
    base : number or None
        Stored on the node unchanged.
    closed : {'left', 'right'}, default 'left'
        Validated case-insensitively and stored lowercased.
    aux_hash : optional
        Stored on the node and passed along in its argument list.
        NOTE(review): purpose is not visible in this file.

    Raises
    ------
    ValueError
        If both or neither of ``nbins`` / ``binwidth`` are given, or if
        ``closed`` is invalid.
    """

    def __init__(self, arg, nbins, binwidth, base, closed='left',
                 aux_hash=None):
        self.arg = arg

        self.nbins = nbins
        self.binwidth = binwidth
        self.base = base

        has_nbins = self.nbins is not None
        has_binwidth = self.binwidth is not None
        if not (has_nbins or has_binwidth):
            raise ValueError('Must indicate nbins or binwidth')
        if has_nbins and has_binwidth:
            raise ValueError('nbins and binwidth are mutually exclusive')

        self.closed = self._validate_closed(closed)

        self.aux_hash = aux_hash
        node_args = [self.arg, self.nbins, self.binwidth,
                     self.base, self.closed, self.aux_hash]
        ir.ValueNode.__init__(self, node_args)

    def output_type(self):
        """
        Return the array constructor of a category type created without a
        cardinality.
        """
        # always undefined cardinality (for now)
        return ir.CategoryType().array_ctor()
|
||
|
||
class CategoryLabel(ir.ValueNode):
    """
    Node mapping the categories of ``arg`` to string labels.

    Parameters
    ----------
    arg : category expression (or a value coercible by
        ``ops.as_value_expr``)
    labels : sized sequence
        One label per category; its length must equal the cardinality of
        the type of ``arg``.
    nulls : object
        Stored on the node and passed along in its argument list.

    Raises
    ------
    ValueError
        If the cardinality of ``arg`` is undefined, or if the number of
        labels does not match it.
    """

    def __init__(self, arg, labels, nulls):
        self.arg = ops.as_value_expr(arg)
        self.labels = labels

        card = self.arg.type().cardinality
        if card is None:
            # Category types may be created without a cardinality (the
            # histogram output type is). Formatting None with '%d' below
            # would raise an unrelated TypeError, so report the real
            # problem instead.
            # NOTE(review): assumes undefined cardinality is None - confirm
            # against CategoryType.
            raise ValueError('Cannot label categories of undefined '
                             'cardinality')
        if len(self.labels) != card:
            raise ValueError('Number of labels must match number of '
                             'categories: %d' % card)

        self.nulls = nulls
        ir.ValueNode.__init__(self, [self.arg, self.labels, self.nulls])

    def output_type(self):
        """
        Return the result of ``ops._shape_like(self.arg, 'string')``.
        """
        return ops._shape_like(self.arg, 'string')
|
||
|
||
def bucket(arg, buckets, closed='left', close_extreme=True,
           include_under=False, include_over=False):
    """
    Build a ``Bucket`` node from the arguments and return its expression.

    Parameters
    ----------
    arg : numeric array expression
    buckets : list
        Bucket edges; at least one is required.
    closed : {'left', 'right'}, default 'left'
        Which side of each interval is closed. For example

        buckets = [0, 100, 200]
         closed = 'left': 100 falls in 2nd bucket
         closed = 'right': 100 falls in 1st bucket
    close_extreme : boolean, default True
        NOTE(review): semantics are not visible in this file; the flag is
        forwarded to ``Bucket`` unchanged.
    include_under : boolean, default False
        Adds one category to the result. Presumably for values below the
        first edge - confirm.
    include_over : boolean, default False
        Adds one category to the result. Presumably for values above the
        last edge - confirm.

    Returns
    -------
    bucketed : coded value expression
    """
    options = dict(closed=closed, close_extreme=close_extreme,
                   include_under=include_under, include_over=include_over)
    return Bucket(arg, buckets, **options).to_expr()
|
||
|
||
def histogram(arg, nbins=None, binwidth=None, base=None, closed='left',
              aux_hash=None):
    """
    Compute a histogram with fixed width bins

    Exactly one of ``nbins`` and ``binwidth`` must be supplied; giving
    both or neither raises ValueError.

    Parameters
    ----------
    arg : numeric array expression
    nbins : int, default None
        If supplied, will be used to compute the binwidth
    binwidth : number, default None
        If not supplied, computed from the data (actual max and min values)
    base : number, default None
    closed : {'left', 'right'}, default 'left'
        Which side of each interval is closed
    aux_hash : optional, default None
        Forwarded to the ``Histogram`` node unchanged.
        NOTE(review): purpose is not visible in this file.

    Returns
    -------
    histogrammed : coded value expression
    """
    node = Histogram(arg, nbins, binwidth, base,
                     closed=closed, aux_hash=aux_hash)
    return node.to_expr()
|
||
|
||
def category_label(arg, labels, nulls=None):
    """
    Format a known number of categories as strings

    Parameters
    ----------
    arg : category expression
        The number of labels must equal the cardinality of its type.
    labels : list of string
    nulls : string, optional
        How to label any null values among the categories

    Returns
    -------
    string_categories : string value expression
    """
    return CategoryLabel(arg, labels, nulls).to_expr()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# Copyright 2014 Cloudera Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import unittest | ||
|
||
from ibis.expr.tests.mocks import MockConnection | ||
import ibis.expr.types as ir | ||
|
||
|
||
class TestAnalytics(unittest.TestCase):
    # Expression-level checks for the bucket / histogram API; nothing is
    # executed against a backend, only the expression objects are built.

    def setUp(self):
        self.con = MockConnection()
        self.alltypes = self.con.table('functional_alltypes')

    def test_category_project(self):
        # A named bucket expression can be projected alongside the table
        # and keeps its category type.
        table = self.alltypes
        tier = table.double_col.bucket([0, 50, 100]).name('tier')

        projected = table[tier, table]

        assert isinstance(projected.tier, ir.CategoryArray)

    def test_bucket(self):
        col = self.alltypes.double_col
        edges = [0, 10, 50, 100]

        # Four edges delimit three buckets.
        plain = col.bucket(edges)
        assert isinstance(plain, ir.CategoryArray)
        assert plain.op().nbuckets == 3

        # Each of include_over / include_under contributes one more bucket.
        with_over = col.bucket(edges, include_over=True)
        assert with_over.op().nbuckets == 4

        with_both = col.bucket(edges, include_over=True, include_under=True)
        assert with_both.op().nbuckets == 5

    def test_bucket_error_cases(self):
        col = self.alltypes.double_col

        with self.assertRaises(ValueError):
            col.bucket([])
        with self.assertRaises(ValueError):
            col.bucket([1, 2], closed='foo')

        # it works!
        col.bucket([10], include_under=True, include_over=True)

        # A single edge needs both include flags; any other combination
        # is rejected.
        with self.assertRaises(ValueError):
            col.bucket([10])
        with self.assertRaises(ValueError):
            col.bucket([10], include_under=True)
        with self.assertRaises(ValueError):
            col.bucket([10], include_over=True)

    def test_histogram(self):
        col = self.alltypes.double_col

        # nbins and binwidth are mutually exclusive, and one is required.
        with self.assertRaises(ValueError):
            col.histogram(nbins=10, binwidth=5)
        with self.assertRaises(ValueError):
            col.histogram()
        with self.assertRaises(ValueError):
            col.histogram(10, closed='foo')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.