Skip to content

Commit 5212cbc

Browse files
authored
Clean-up and simplify median_grouped(). Vastly improve its docstring. (#92324)
1 parent b885b8f commit 5212cbc

File tree

2 files changed

+54
-96
lines changed

2 files changed

+54
-96
lines changed

Lib/statistics.py

+54-52
Original file line numberDiff line numberDiff line change
@@ -348,22 +348,6 @@ def _convert(value, T):
348348
raise
349349

350350

351-
def _find_lteq(a, x):
352-
'Locate the leftmost value exactly equal to x'
353-
i = bisect_left(a, x)
354-
if i != len(a) and a[i] == x:
355-
return i
356-
raise ValueError
357-
358-
359-
def _find_rteq(a, l, x):
360-
'Locate the rightmost value exactly equal to x'
361-
i = bisect_right(a, x, lo=l)
362-
if i != (len(a) + 1) and a[i - 1] == x:
363-
return i - 1
364-
raise ValueError
365-
366-
367351
def _fail_neg(values, errmsg='negative value'):
368352
"""Iterate over values, failing if any are less than zero."""
369353
for x in values:
@@ -628,57 +612,75 @@ def median_high(data):
628612

629613

630614
def median_grouped(data, interval=1):
631-
"""Return the 50th percentile (median) of grouped continuous data.
632-
633-
>>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
634-
3.7
635-
>>> median_grouped([52, 52, 53, 54])
636-
52.5
637-
638-
This calculates the median as the 50th percentile, and should be
639-
used when your data is continuous and grouped. In the above example,
640-
the values 1, 2, 3, etc. actually represent the midpoint of classes
641-
0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
642-
class 3.5-4.5, and interpolation is used to estimate it.
643-
644-
Optional argument ``interval`` represents the class interval, and
645-
defaults to 1. Changing the class interval naturally will change the
646-
interpolated 50th percentile value:
647-
648-
>>> median_grouped([1, 3, 3, 5, 7], interval=1)
649-
3.25
650-
>>> median_grouped([1, 3, 3, 5, 7], interval=2)
651-
3.5
652-
653-
This function does not check whether the data points are at least
654-
``interval`` apart.
615+
"""Estimates the median for numeric data binned around the midpoints
616+
of consecutive, fixed-width intervals.
617+
618+
The *data* can be any iterable of numeric data with each value being
619+
exactly the midpoint of a bin. At least one value must be present.
620+
621+
The *interval* is the width of each bin.
622+
623+
For example, demographic information may have been summarized into
624+
consecutive ten-year age groups with each group being represented
625+
by the 5-year midpoints of the intervals:
626+
627+
>>> demographics = Counter({
628+
... 25: 172, # 20 to 30 years old
629+
... 35: 484, # 30 to 40 years old
630+
... 45: 387, # 40 to 50 years old
631+
... 55: 22, # 50 to 60 years old
632+
... 65: 6, # 60 to 70 years old
633+
... })
634+
635+
The 50th percentile (median) is the 536th person out of the 1071
636+
member cohort. That person is in the 30 to 40 year old age group.
637+
638+
The regular median() function would assume that everyone in the
639+
tricenarian age group was exactly 35 years old. A more tenable
640+
assumption is that the 484 members of that age group are evenly
641+
distributed between 30 and 40. For that, we use median_grouped().
642+
643+
>>> data = list(demographics.elements())
644+
>>> median(data)
645+
35
646+
>>> round(median_grouped(data, interval=10), 1)
647+
37.5
648+
649+
The caller is responsible for making sure the data points are separated
650+
by exact multiples of *interval*. This is essential for getting a
651+
correct result. The function does not check this precondition.
652+
655653
"""
656654
data = sorted(data)
657655
n = len(data)
658656
if n == 0:
659657
raise StatisticsError("no median for empty data")
660658
elif n == 1:
661659
return data[0]
660+
662661
# Find the value at the midpoint. Remember this corresponds to the
663-
# centre of the class interval.
662+
# midpoint of the class interval.
664663
x = data[n // 2]
664+
665+
# Generate a clear error message for non-numeric data
665666
for obj in (x, interval):
666667
if isinstance(obj, (str, bytes)):
667-
raise TypeError('expected number but got %r' % obj)
668+
raise TypeError(f'expected a number but got {obj!r}')
669+
670+
# Using O(log n) bisection, find where all the x values occur in the data.
671+
# All x will lie within data[i:j].
672+
i = bisect_left(data, x)
673+
j = bisect_right(data, x, lo=i)
674+
675+
# Interpolate the median using the formula found at:
676+
# https://www.cuemath.com/data/median-of-grouped-data/
668677
try:
669678
L = x - interval / 2 # The lower limit of the median interval.
670679
except TypeError:
671-
# Mixed type. For now we just coerce to float.
680+
# Coerce mixed types to float.
672681
L = float(x) - float(interval) / 2
673-
674-
# Uses bisection search to search for x in data with log(n) time complexity
675-
# Find the position of leftmost occurrence of x in data
676-
l1 = _find_lteq(data, x)
677-
# Find the position of rightmost occurrence of x in data[l1...len(data)]
678-
# Assuming always l1 <= l2
679-
l2 = _find_rteq(data, l1, x)
680-
cf = l1
681-
f = l2 - l1 + 1
682+
cf = i # Cumulative frequency of the preceding interval
683+
f = j - i # Number of elements in the median interval
682684
return L + interval * (n / 2 - cf) / f
683685

684686

Lib/test/test_statistics.py

-44
Original file line numberDiff line numberDiff line change
@@ -1040,50 +1040,6 @@ def test_error_msg(self):
10401040
self.assertEqual(errmsg, msg)
10411041

10421042

1043-
class FindLteqTest(unittest.TestCase):
1044-
# Test _find_lteq private function.
1045-
1046-
def test_invalid_input_values(self):
1047-
for a, x in [
1048-
([], 1),
1049-
([1, 2], 3),
1050-
([1, 3], 2)
1051-
]:
1052-
with self.subTest(a=a, x=x):
1053-
with self.assertRaises(ValueError):
1054-
statistics._find_lteq(a, x)
1055-
1056-
def test_locate_successfully(self):
1057-
for a, x, expected_i in [
1058-
([1, 1, 1, 2, 3], 1, 0),
1059-
([0, 1, 1, 1, 2, 3], 1, 1),
1060-
([1, 2, 3, 3, 3], 3, 2)
1061-
]:
1062-
with self.subTest(a=a, x=x):
1063-
self.assertEqual(expected_i, statistics._find_lteq(a, x))
1064-
1065-
1066-
class FindRteqTest(unittest.TestCase):
1067-
# Test _find_rteq private function.
1068-
1069-
def test_invalid_input_values(self):
1070-
for a, l, x in [
1071-
([1], 2, 1),
1072-
([1, 3], 0, 2)
1073-
]:
1074-
with self.assertRaises(ValueError):
1075-
statistics._find_rteq(a, l, x)
1076-
1077-
def test_locate_successfully(self):
1078-
for a, l, x, expected_i in [
1079-
([1, 1, 1, 2, 3], 0, 1, 2),
1080-
([0, 1, 1, 1, 2, 3], 0, 1, 3),
1081-
([1, 2, 3, 3, 3], 0, 3, 4)
1082-
]:
1083-
with self.subTest(a=a, l=l, x=x):
1084-
self.assertEqual(expected_i, statistics._find_rteq(a, l, x))
1085-
1086-
10871043
# === Tests for public functions ===
10881044

10891045
class UnivariateCommonMixin:

0 commit comments

Comments
 (0)