@@ -348,22 +348,6 @@ def _convert(value, T):
348
348
raise
349
349
350
350
351
- def _find_lteq (a , x ):
352
- 'Locate the leftmost value exactly equal to x'
353
- i = bisect_left (a , x )
354
- if i != len (a ) and a [i ] == x :
355
- return i
356
- raise ValueError
357
-
358
-
359
- def _find_rteq (a , l , x ):
360
- 'Locate the rightmost value exactly equal to x'
361
- i = bisect_right (a , x , lo = l )
362
- if i != (len (a ) + 1 ) and a [i - 1 ] == x :
363
- return i - 1
364
- raise ValueError
365
-
366
-
367
351
def _fail_neg (values , errmsg = 'negative value' ):
368
352
"""Iterate over values, failing if any are less than zero."""
369
353
for x in values :
@@ -628,57 +612,75 @@ def median_high(data):
628
612
629
613
630
614
def median_grouped(data, interval=1):
    """Estimate the median for numeric data binned around the midpoints
    of consecutive, fixed-width intervals.

    The *data* can be any iterable of numeric data with each value being
    exactly the midpoint of a bin.  At least one value must be present.

    The *interval* is the width of each bin.

    For example, demographic information may have been summarized into
    consecutive ten-year age groups with each group being represented
    by the 5-year midpoints of the intervals:

        >>> demographics = Counter({
        ...    25: 172,   # 20 to 30 years old
        ...    35: 484,   # 30 to 40 years old
        ...    45: 387,   # 40 to 50 years old
        ...    55:  22,   # 50 to 60 years old
        ...    65:   6,   # 60 to 70 years old
        ... })

    The 50th percentile (median) is the 536th person out of the 1071
    member cohort.  That person is in the 30 to 40 year old age group.

    The regular median() function would assume that everyone in the
    tricenarian age group was exactly 35 years old.  A more tenable
    assumption is that the 484 members of that age group are evenly
    distributed between 30 and 40.  For that, we use median_grouped().

        >>> data = list(demographics.elements())
        >>> median(data)
        35
        >>> round(median_grouped(data, interval=10), 1)
        37.5

    The caller is responsible for making sure the data points are separated
    by exact multiples of *interval*.  This is essential for getting a
    correct result.  The function does not check this precondition.

    Raises StatisticsError if *data* is empty, and TypeError if the
    midpoint value or *interval* is a str or bytes object.

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]

    # Find the value at the midpoint. Remember this corresponds to the
    # midpoint of the class interval.
    x = data[n // 2]

    # Generate a clear error message for non-numeric data
    for obj in (x, interval):
        if isinstance(obj, (str, bytes)):
            raise TypeError(f'expected a number but got {obj!r}')

    # Using O(log n) bisection, find where all the x values occur in the data.
    # All x will lie within data[i:j].
    i = bisect_left(data, x)
    j = bisect_right(data, x, lo=i)

    cf = i        # Cumulative frequency of the preceding interval
    f = j - i     # Number of elements in the median interval
    offset = n / 2 - cf

    # Interpolate the median using the formula found at:
    # https://www.cuemath.com/data/median-of-grouped-data/
    try:
        L = x - interval / 2    # The lower limit of the median interval.
        return L + interval * offset / f
    except TypeError:
        # Coerce mixed types to float.  The whole interpolation is guarded,
        # not just the lower limit, because ``offset`` is a float and types
        # such as Decimal refuse to be multiplied by one.
        x = float(x)
        interval = float(interval)
        L = x - interval / 2
        return L + interval * offset / f
683
685
684
686
0 commit comments