Fix/uniform discretisation (#65)

* Fix uniform discretiser * Fix uniform discretiser Co-authored-by: Philip Pilgerstorfer <34248114+qbphilip@users.noreply.github.com>
mckinsey · Sep 18, 2020 · 9f11c4b · 9f11c4b
1 parent 04bceb4
commit 9f11c4b
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 34 deletions.
diff --git a/RELEASE.md b/RELEASE.md
@@ -1,5 +1,7 @@
 # Upcoming release
 
+* Fixed the uniform discretiser (`Discretiser(method='uniform')`) so that all bins now have identical widths.
+
 # Release 0.8.0
 
 * Add DYNOTEARS (`from_numpy_dynamic`, an algorithm for structure learning on Dynamic Bayesian Networks).
@@ -52,6 +54,6 @@ The initial release of CausalNex.
 
 ## Thanks for supporting contributions
 CausalNex was originally designed by [Paul Beaumont](https://www.linkedin.com/in/pbeaumont/) and [Ben Horsburgh](https://www.linkedin.com/in/benhorsburgh/) to solve challenges they faced in inferencing causality in their project work. This work was later turned into a product thanks to the following contributors:
-[Yetunde Dada](https://github.com/yetudada), [Wesley Leong](https://www.linkedin.com/in/wesleyleong/), [Steve Ler](https://www.linkedin.com/in/song-lim-steve-ler-380366106/), [Viktoriia Oliinyk](https://www.linkedin.com/in/victoria-oleynik/), [Roxana Pamfil](https://www.linkedin.com/in/roxana-pamfil-1192053b/), [Nisara Sriwattanaworachai](https://www.linkedin.com/in/nisara-sriwattanaworachai-795b357/), [Nikolaos Tsaousis](https://www.linkedin.com/in/ntsaousis/), [Angel Droth](https://www.linkedin.com/in/angeldroth/), and [Zain Patel](https://www.linkedin.com/in/zain-patel/).
+[Yetunde Dada](https://github.com/yetudada), [Wesley Leong](https://www.linkedin.com/in/wesleyleong/), [Steve Ler](https://www.linkedin.com/in/song-lim-steve-ler-380366106/), [Viktoriia Oliinyk](https://www.linkedin.com/in/victoria-oleynik/), [Roxana Pamfil](https://www.linkedin.com/in/roxana-pamfil-1192053b/), [Nisara Sriwattanaworachai](https://www.linkedin.com/in/nisara-sriwattanaworachai-795b357/), [Nikolaos Tsaousis](https://www.linkedin.com/in/ntsaousis/), [Angel Droth](https://www.linkedin.com/in/angeldroth/), [Zain Patel](https://www.linkedin.com/in/zain-patel/), and [Shuhei Ishida](https://www.linkedin.com/in/shuhei-i/).
 
 CausalNex would also not be possible without the generous sharing from leading researchers in the field of causal inference, and we are grateful to everyone who advised and supported us, filed issues or helped resolve them, asked and answered questions, or simply took part in inspiring discussions.
diff --git a/causalnex/discretiser/discretiser.py b/causalnex/discretiser/discretiser.py
@@ -174,10 +174,9 @@ def fit(self, data: np.ndarray) -> "Discretiser":
  x.sort()
 
  if self.method == "uniform":
- bucket_width = len(x) / self.num_buckets
+ bucket_width = (np.max(x) - np.min(x)) / self.num_buckets
  self.numeric_split_points = [
- x[int(np.floor((n + 1) * bucket_width))]
- for n in range(self.num_buckets - 1)
+ np.min(x) + bucket_width * (n + 1) for n in range(self.num_buckets - 1)
  ]
 
  elif self.method == "quantile":

diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
@@ -36,26 +36,16 @@
 
 class TestUniform:
  def test_fit_creates_exactly_uniform_splits_when_possible(self):
- """splits should be exactly uniform if possible"""
+ """splits should be exactly uniform"""
 
- arr = np.array(range(20))
+ arr = np.array(range(21))
  np.random.shuffle(arr)
- d = Discretiser(method="uniform", num_buckets=4)
+ d = Discretiser(method="uniform", num_buckets=5)
  d.fit(arr)
  for n in range(2):
- assert 4 < (d.numeric_split_points[n + 1] - d.numeric_split_points[n]) <= 5
-
- def test_fit_creates_close_to_uniform_splits_when_uniform_not_possible(self):
- """splits should be close to uniform if uniform is not possible"""
-
- arr = np.array(range(9))
- np.random.shuffle(arr)
- d = Discretiser(method="uniform", num_buckets=4)
- d.fit(arr)
-
- assert len(d.numeric_split_points) == 3
- for n in range(2):
- assert 2 <= (d.numeric_split_points[n + 1] - d.numeric_split_points[n]) <= 3
+ assert (d.numeric_split_points[n + 1] - d.numeric_split_points[n]) == (
+ (d.numeric_split_points[n + 2] - d.numeric_split_points[n + 1])
+ )
 
  def test_fit_does_not_attempt_to_deal_with_identical_split_points(self):
  """if all data is identical, and num_buckets>1, then this is not possible.
@@ -70,20 +60,6 @@ def test_fit_does_not_attempt_to_deal_with_identical_split_points(self):
  d.numeric_split_points,
  )
 
- def test_transform_uneven_split(self):
- """Data that cannot be split evenly between buckets should be transformed
- into near-even buckets"""
-
- arr = np.array([n + 1 for n in range(10)])
- np.random.shuffle(arr)
- d = Discretiser(method="uniform", num_buckets=4)
- d.fit(arr)
- unique, counts = np.unique(d.transform(arr), return_counts=True)
- # check all 4 buckets are used
- assert np.array_equal([0, 1, 2, 3], unique)
- # check largest difference in distribution is 1 item
- assert (np.max(counts) - np.min(counts)) <= 1
-
  def test_transform_larger_than_fit_range_goes_into_last_bucket(self):
  """If a value larger than the input is transformed, then it
  should go into the maximum bucket"""