Skip to content

Commit

Permalink
Implement bucket and histogram numeric expr transforms, category synt…
Browse files Browse the repository at this point in the history
…hetic type

Per #33 and #34

Author: Wes McKinney <wes@cloudera.com>

Closes #242 from wesm/bucket-transform and squashes the following commits:

ee037d6 [Wes McKinney] Slightly better label docs and move to analytics module
abb8797 [Wes McKinney] Add error checking for number of buckets
2bf3b51 [Wes McKinney] Implement label method for CategoryValue
c3609b3 [Wes McKinney] Casting bucket category to int32 is a noop
9b3b971 [Wes McKinney] Handle bucket edge cases and no-bucket under/over case
3abdb9b [Wes McKinney] Fix list repr interactive mode bug and tweak histogram base to avoid some FP error issues
8dac292 [Wes McKinney] Initial histogram implementation, but interactive mode repr problems
dda0475 [Wes McKinney] Fix category type repr
f0404e3 [Wes McKinney] More exhaustive bucket test cases, and move dimension creation to translate_expr code path
cb90310 [Wes McKinney] Preliminary bucket implementation
013a5b9 [Wes McKinney] Implement basic category type and bucket and histogram APIs
  • Loading branch information
wesm committed May 31, 2015
1 parent 0138cee commit 3d4d2b4
Show file tree
Hide file tree
Showing 14 changed files with 755 additions and 32 deletions.
178 changes: 178 additions & 0 deletions ibis/expr/analytics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import ibis.expr.types as ir
import ibis.expr.operations as ops


class BucketLike(ir.ValueNode):
    """
    Common base for value nodes whose output is a category-typed array.
    """

    def _validate_closed(self, closed):
        """
        Lower-case ``closed`` and check that it names a valid interval side.

        Returns
        -------
        closed : str
          Either 'left' or 'right'

        Raises
        ------
        ValueError
          If the lower-cased value is neither 'left' nor 'right'
        """
        side = closed.lower()
        if side in ('left', 'right'):
            return side
        raise ValueError("closed must be 'left' or 'right'")

    @property
    def nbuckets(self):
        """
        Category count handed to CategoryType; None in this base class.
        """
        return None

    def output_type(self):
        """
        Return the array constructor of a CategoryType built from nbuckets.
        """
        return ir.CategoryType(self.nbuckets).array_ctor()


class Bucket(BucketLike):
    """
    Value node that codes ``arg`` into categories delimited by bucket edges.
    """

    def __init__(self, arg, buckets, closed='left', close_extreme=True,
                 include_under=False, include_over=False):
        """
        Parameters
        ----------
        arg : expression being bucketed
        buckets : sequence of bucket edges, at least one
        closed : {'left', 'right'}, default 'left'
          Case-insensitive; stored lower-cased
        close_extreme : coerced to bool
        include_under : coerced to bool
        include_over : coerced to bool

        Raises
        ------
        ValueError
          If ``closed`` is invalid, if no edges are given, or if a single
          edge is given without both include_under and include_over
        """
        self.arg = arg
        self.buckets = buckets
        self.closed = self._validate_closed(closed)

        self.close_extreme = bool(close_extreme)
        self.include_over = bool(include_over)
        self.include_under = bool(include_under)

        num_edges = len(buckets)
        if num_edges == 0:
            raise ValueError('Must be at least one bucket edge')
        # A lone edge delimits no interior bucket, so both open-ended
        # buckets are required for the result to have any category at all
        if num_edges == 1 and not (self.include_under and
                                   self.include_over):
            raise ValueError('If one bucket edge provided, must have'
                             ' include_under=True and include_over=True')

        node_args = [self.arg, self.buckets, self.closed,
                     self.close_extreme,
                     self.include_under,
                     self.include_over]
        ir.ValueNode.__init__(self, node_args)

    @property
    def nbuckets(self):
        """
        Number of categories: one per adjacent pair of edges, plus one for
        each of include_under / include_over that is set.
        """
        interior = len(self.buckets) - 1
        extras = int(self.include_over) + int(self.include_under)
        return interior + extras


class Histogram(BucketLike):
    """
    Value node for a fixed-width-bin histogram coding of ``arg``.
    """

    def __init__(self, arg, nbins, binwidth, base, closed='left',
                 aux_hash=None):
        """
        Exactly one of ``nbins`` and ``binwidth`` must be non-None.

        Raises
        ------
        ValueError
          If neither or both of nbins / binwidth are given, or if
          ``closed`` is not 'left' or 'right'
        """
        self.arg = arg

        self.nbins = nbins
        self.binwidth = binwidth
        self.base = base

        have_nbins = self.nbins is not None
        have_binwidth = self.binwidth is not None
        if not have_nbins and not have_binwidth:
            raise ValueError('Must indicate nbins or binwidth')
        if have_nbins and have_binwidth:
            raise ValueError('nbins and binwidth are mutually exclusive')

        # Validated only after the bin arguments, so a bad bin
        # specification is reported first
        self.closed = self._validate_closed(closed)

        self.aux_hash = aux_hash
        node_args = [self.arg, self.nbins, self.binwidth,
                     self.base, self.closed, self.aux_hash]
        ir.ValueNode.__init__(self, node_args)

    def output_type(self):
        """
        Return the array constructor of a CategoryType created without a
        cardinality argument.
        """
        # always undefined cardinality (for now)
        return ir.CategoryType().array_ctor()


class CategoryLabel(ir.ValueNode):
    """
    Value node that renders the categories of ``arg`` as string labels.
    """

    def __init__(self, arg, labels, nulls):
        """
        Parameters
        ----------
        arg : category expression, or a value ops.as_value_expr can convert
        labels : sequence of labels, one per category of ``arg``
        nulls : label for null values, stored as given

        Raises
        ------
        ValueError
          If the number of labels differs from the cardinality reported by
          the type of ``arg``
        """
        self.arg = ops.as_value_expr(arg)
        self.labels = labels

        card = self.arg.type().cardinality
        if len(self.labels) != card:
            # Format with %s rather than %d: a non-integer cardinality
            # (e.g. None, if the category type was built without one)
            # would make %d raise TypeError and mask this ValueError.
            # For integer cardinalities the message is unchanged.
            raise ValueError('Number of labels must match number of '
                             'categories: %s' % card)

        self.nulls = nulls
        ir.ValueNode.__init__(self, [self.arg, self.labels, self.nulls])

    def output_type(self):
        """
        Return a string output type shaped like ``arg``.
        """
        return ops._shape_like(self.arg, 'string')


def bucket(arg, buckets, closed='left', close_extreme=True,
           include_under=False, include_over=False):
    """
    Code a numeric array into categories delimited by the given edges

    Parameters
    ----------
    arg : numeric array expression
    buckets : list
      Bucket edges; at least one is required
    closed : {'left', 'right'}, default 'left'
      Which side of each interval is closed. For example
        buckets = [0, 100, 200]
        closed = 'left': 100 falls in 2nd bucket
        closed = 'right': 100 falls in 1st bucket
    close_extreme : boolean, default True
      Passed through to the Bucket node; exact semantics are not visible
      in this module -- TODO confirm
    include_under : boolean, default False
      Adds one category to the result (presumably for values below the
      first edge -- TODO confirm)
    include_over : boolean, default False
      Adds one category to the result (presumably for values above the
      last edge -- TODO confirm)

    Returns
    -------
    bucketed : coded value expression

    Raises
    ------
    ValueError
      If ``closed`` is invalid, ``buckets`` is empty, or a single edge is
      given without both include_under and include_over
    """
    options = dict(closed=closed, close_extreme=close_extreme,
                   include_under=include_under, include_over=include_over)
    return Bucket(arg, buckets, **options).to_expr()


def histogram(arg, nbins=None, binwidth=None, base=None, closed='left',
              aux_hash=None):
    """
    Compute a histogram with fixed width bins

    Exactly one of ``nbins`` and ``binwidth`` must be given.

    Parameters
    ----------
    arg : numeric array expression
    nbins : int, default None
      If supplied, will be used to compute the binwidth
    binwidth : number, default None
      If not supplied, computed from the data (actual max and min values)
    base : number, default None
    closed : {'left', 'right'}, default 'left'
      Which side of each interval is closed
    aux_hash : default None
      Passed through unchanged to the Histogram node

    Returns
    -------
    histogrammed : coded value expression

    Raises
    ------
    ValueError
      If neither or both of nbins / binwidth are supplied, or if
      ``closed`` is not 'left' or 'right'
    """
    node = Histogram(arg, nbins, binwidth, base,
                     closed=closed, aux_hash=aux_hash)
    return node.to_expr()


def category_label(arg, labels, nulls=None):
    """
    Format a known number of categories as strings

    Parameters
    ----------
    arg : category expression
    labels : list of string
      One label per category
    nulls : string, optional
      How to label any null values among the categories

    Returns
    -------
    string_categories : string value expression

    Raises
    ------
    ValueError
      If the number of labels does not match the number of categories
    """
    return CategoryLabel(arg, labels, nulls).to_expr()
33 changes: 25 additions & 8 deletions ibis/expr/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,17 @@
StringValue, StringScalar, StringArray,
DecimalValue, DecimalScalar, DecimalArray,
TimestampValue, TimestampScalar, TimestampArray,
unnamed)
CategoryValue, unnamed)

from ibis.expr.operations import (as_value_expr, table, literal, null,
value_list, desc)

from ibis.expr.temporal import *

import ibis.common as _com

from ibis.expr.analytics import bucket, histogram
import ibis.expr.analytics as _analytics
import ibis.expr.analysis as _L
import ibis.expr.operations as _ops
import ibis.expr.temporal as _T
Expand Down Expand Up @@ -266,7 +269,7 @@ def f(self):
return f


#----------------------------------------------------------------------
# ---------------------------------------------------------------------
# Generic value API


Expand Down Expand Up @@ -516,7 +519,7 @@ def nullif(value, null_if_expr):
_add_methods(ArrayExpr, _generic_array_methods)


#----------------------------------------------------------------------
# ---------------------------------------------------------------------
# Numeric API

def round(arg, digits=None):
Expand Down Expand Up @@ -588,13 +591,15 @@ def log(arg, base=None):
_numeric_array_methods = dict(
mean=mean,
sum=sum,
bucket=bucket,
histogram=histogram
)

_add_methods(NumericValue, _numeric_value_methods)
_add_methods(NumericArray, _numeric_array_methods)


#----------------------------------------------------------------------
# ----------------------------------------------------------------------
# Boolean API


Expand All @@ -620,7 +625,7 @@ def log(arg, base=None):
_add_methods(BooleanArray, _boolean_array_methods)


#----------------------------------------------------------------------
# ---------------------------------------------------------------------
# String API

def _string_substr(self, start, length=None):
Expand Down Expand Up @@ -731,7 +736,7 @@ def _string_dunder_contains(arg, substr):
_add_methods(StringValue, _string_value_methods)


#----------------------------------------------------------------------
# ---------------------------------------------------------------------
# Timestamp API

_timestamp_value_methods = dict(
Expand All @@ -748,7 +753,7 @@ def _string_dunder_contains(arg, substr):
_add_methods(TimestampValue, _timestamp_value_methods)


#----------------------------------------------------------------------
# ---------------------------------------------------------------------
# Decimal API

_decimal_value_methods = dict(
Expand All @@ -759,7 +764,19 @@ def _string_dunder_contains(arg, substr):

_add_methods(DecimalValue, _decimal_value_methods)

#----------------------------------------------------------------------

# ----------------------------------------------------------------------
# Category API


# Methods to expose on category value expressions; ``label`` is backed by
# category_label from ibis.expr.analytics.
_category_value_methods = dict(
label=_analytics.category_label
)

# NOTE(review): presumably installs each entry above as a method on
# CategoryValue -- confirm against the definition of _add_methods.
_add_methods(CategoryValue, _category_value_methods)


# ---------------------------------------------------------------------
# Table API

_join_classes = {
Expand Down
4 changes: 2 additions & 2 deletions ibis/expr/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ def __contains__(self, obj):
return self._key(obj) in self.formatted

def _key(self, obj):
return repr(obj)
return obj._repr()

def observe(self, obj, formatter=repr):
def observe(self, obj, formatter=lambda x: x._repr()):
key = self._key(obj)
if key not in self.formatted:
self.aliases[key] = 'ref_%d' % len(self.formatted)
Expand Down
67 changes: 67 additions & 0 deletions ibis/expr/tests/test_analytics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright 2014 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from ibis.expr.tests.mocks import MockConnection
import ibis.expr.types as ir


class TestAnalytics(unittest.TestCase):
    """
    Expression-level checks for the bucket and histogram APIs, run against
    a mock connection.
    """

    def setUp(self):
        self.con = MockConnection()
        self.alltypes = self.con.table('functional_alltypes')

    def test_category_project(self):
        # A named bucket expression keeps its category type when projected
        table = self.alltypes

        tier = table.double_col.bucket([0, 50, 100]).name('tier')
        projected = table[tier, table]

        assert isinstance(projected.tier, ir.CategoryArray)

    def test_bucket(self):
        # Four edges give three interior buckets; each include_* flag
        # adds one more
        col = self.alltypes.double_col
        edges = [0, 10, 50, 100]

        plain = col.bucket(edges)
        assert isinstance(plain, ir.CategoryArray)
        assert plain.op().nbuckets == 3

        with_over = col.bucket(edges, include_over=True)
        assert with_over.op().nbuckets == 4

        with_both = col.bucket(edges, include_over=True, include_under=True)
        assert with_both.op().nbuckets == 5

    def test_bucket_error_cases(self):
        col = self.alltypes.double_col

        self.assertRaises(ValueError, col.bucket, [])
        self.assertRaises(ValueError, col.bucket, [1, 2], closed='foo')

        # it works!
        col.bucket([10], include_under=True, include_over=True)

        # A single edge is rejected unless both flags are set
        for flags in ({}, {'include_under': True}, {'include_over': True}):
            self.assertRaises(ValueError, col.bucket, [10], **flags)

    def test_histogram(self):
        col = self.alltypes.double_col

        self.assertRaises(ValueError, col.histogram, nbins=10, binwidth=5)
        self.assertRaises(ValueError, col.histogram)
        self.assertRaises(ValueError, col.histogram, 10, closed='foo')
8 changes: 8 additions & 0 deletions ibis/expr/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1876,3 +1876,11 @@ def test_interactive_non_compilable_repr_not_fail(self):
# it works!
with config.option_context('interactive', True):
repr(expr)

def test_histogram_repr_no_query_execute(self):
# Building the repr of a size aggregation grouped by a histogram
# expression, with the 'interactive' option on, must leave the
# connection's last_executed_expr as None.
t = self.con.table('functional_alltypes')
tier = t.double_col.histogram(10).name('bucket')
expr = t.group_by(tier).size()
with config.option_context('interactive', True):
expr._repr()
# NOTE(review): indentation was lost in this listing -- confirm whether
# this assert sits inside or after the ``with`` block.
assert self.con.last_executed_expr is None
Loading

0 comments on commit 3d4d2b4

Please sign in to comment.