Skip to content

Commit 048e372

Browse files
committed
fixed percentileBy and indices
1 parent 535852d commit 048e372

File tree

3 files changed

+351
-52
lines changed

3 files changed

+351
-52
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/math/percentile.kt

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
package org.jetbrains.kotlinx.dataframe.math
22

33
import io.github.oshai.kotlinlogging.KotlinLogging
4+
import org.jetbrains.kotlinx.dataframe.api.isNaN
45
import org.jetbrains.kotlinx.dataframe.impl.aggregation.aggregators.CalculateReturnType
56
import org.jetbrains.kotlinx.dataframe.impl.isIntraComparable
67
import org.jetbrains.kotlinx.dataframe.impl.isPrimitiveNumber
78
import org.jetbrains.kotlinx.dataframe.impl.nothingType
89
import org.jetbrains.kotlinx.dataframe.impl.renderType
9-
import org.jetbrains.kotlinx.dataframe.math.quantileOrNull
1010
import java.math.BigDecimal
1111
import java.math.BigInteger
12+
import kotlin.math.round
1213
import kotlin.reflect.KType
1314
import kotlin.reflect.full.withNullability
1415
import kotlin.reflect.typeOf
@@ -83,6 +84,11 @@ internal val percentileConversion: CalculateReturnType = { type, isEmpty ->
8384
}.withNullability(isEmpty)
8485
}
8586

87+
/**
88+
* Returns the index of the [percentile] in the unsorted sequence [this].
89+
* If [skipNaN] is `false` and the sequence [this] contains a NaN, the index of the first NaN is returned.
90+
* Returns -1 if the sequence is empty.
91+
*/
8692
internal fun <T : Comparable<T & Any>?> Sequence<T>.indexOfPercentile(
8793
percentile: Double,
8894
type: KType,
@@ -108,16 +114,36 @@ internal fun <T : Comparable<T & Any>?> Sequence<T>.indexOfPercentile(
108114
)
109115
}
110116

117+
val indexList = this.mapIndexedNotNull { i, it ->
118+
if (it == null) {
119+
null
120+
} else {
121+
IndexedComparable(i, it)
122+
}
123+
}
124+
111125
// TODO make configurable
112126
val method = QuantileEstimationMethod.R3
113127

114128
// percentile of 25.0 means the 25th 100-quantile, so 25 / 100 = 0.25
115129
val p = percentile / 100.0
116-
return this.indexOfQuantile(
130+
131+
// get the index where the percentile can be found in the sorted sequence
132+
val indexEstimation = indexList.quantileIndexEstimation(
117133
p = p,
118-
type = type,
134+
type = typeOf<IndexedComparable<Nothing>>(),
119135
skipNaN = skipNaN,
120-
method = method as QuantileEstimationMethod<T & Any, Int>,
136+
method = method,
121137
name = "percentile",
122138
)
139+
if (indexEstimation.isNaN()) return this.indexOfFirst { it.isNaN }
140+
if (indexEstimation < 0.0) return -1
141+
require(indexEstimation == round(indexEstimation)) {
142+
"percentile expected a whole number index from quantileIndexEstimation but was $indexEstimation"
143+
}
144+
145+
val percentileResult = indexList.toList().quickSelect(k = indexEstimation.toInt())
146+
147+
// return the original unsorted index of the found result
148+
return percentileResult.index
123149
}

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/math/quantile.kt

Lines changed: 47 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -92,20 +92,31 @@ internal fun <T : Comparable<T>> Sequence<Any>.quantileOrNull(
9292
}
9393

9494
/**
95-
* p-quantile: the k'th q-quantile, where p = k/q.
95+
* Returns the index `i` of the [p]-quantile: the k'th q-quantile, where p = k/q.
9696
*
97+
* The returned index `i` is either exactly or approximately the index of the quantile in the sequence [this]
98+
* (when it's sorted and NaNs are removed).
99+
* Returns -1.0 if the sequence [this] is empty.
100+
* Returns [Double.NaN] if [skipNaN] is `false` and a NaN is encountered.
97101
*/
98-
internal fun <T : Comparable<T & Any>?, Index : Number> Sequence<Any?>.indexOfQuantile(
102+
internal fun <T : Comparable<T>> Sequence<Any?>.quantileIndexEstimation(
99103
p: Double,
100104
type: KType,
101105
skipNaN: Boolean,
102-
method: QuantileEstimationMethod<T & Any, Index>,
106+
method: QuantileEstimationMethod<T, *>,
103107
name: String = "quantile",
104-
): Index {
108+
): Double {
105109
val nonNullType = type.withNullability(false)
110+
106111
when {
107112
p !in 0.0..1.0 -> error("Quantile must be in range [0, 1]")
108113

114+
type.isMarkedNullable ->
115+
error("Encountered nullable type ${renderType(type)} in $name function. This should not occur.")
116+
117+
// this means the sequence is empty
118+
type == nothingType -> return -1.0
119+
109120
!nonNullType.isIntraComparable() ->
110121
error(
111122
"Unable to compute the $name for ${
@@ -120,37 +131,20 @@ internal fun <T : Comparable<T & Any>?, Index : Number> Sequence<Any?>.indexOfQu
120131
)
121132
}
122133

123-
@Suppress("UNCHECKED_CAST")
124-
fun Number.toIndex(): Index =
125-
when (method) {
126-
is QuantileEstimationMethod.Selecting -> this.toInt()
127-
is QuantileEstimationMethod.Interpolating -> this.toDouble()
128-
} as Index
129-
130134
// propagate NaN to return if they are not to be skipped
131135
if (nonNullType.canBeNaN && !skipNaN) {
132-
for ((i, it) in this.withIndex()) {
133-
if (it.isNaN) return i.toIndex()
134-
}
135-
}
136-
137-
val indexedSequence = this.mapIndexedNotNull { i, it ->
138-
if (it == null) {
139-
null
140-
} else {
141-
IndexedComparable(i, it as Comparable<Any>)
142-
}
136+
if (any { it.isNaN }) return Double.NaN
143137
}
144138
val list = when {
145-
nonNullType.canBeNaN -> indexedSequence.filterNot { it.value.isNaN }
146-
else -> indexedSequence
139+
nonNullType.canBeNaN -> this.filterNot { it.isNaN }
140+
else -> this
147141
}.toList()
148142

149143
val size = list.size
150-
if (size == 0) return (-1).toIndex()
151-
if (size == 1) return 0.toIndex()
144+
if (size == 0) return -1.0
145+
if (size == 1) return 0.0
152146

153-
return method.indexOfQuantile(p, size)
147+
return method.indexOfQuantile(p, size).toDouble()
154148
}
155149

156150
/**
@@ -183,54 +177,56 @@ internal sealed interface QuantileEstimationMethod<Value : Comparable<Value>, In
183177

184178
/** Inverse of the empirical distribution function. */
185179
data object R1 : Selecting {
186-
override fun oneBasedIndexOfQuantile(p: Double, count: Int): Int = ceil(p * count).toInt()
180+
override fun oneBasedIndexOfQuantile(p: Double, count: Int): Int =
181+
ceil(p * count).toInt()
182+
.coerceIn(1..count)
187183

188184
@Suppress("UNCHECKED_CAST")
189185
override fun quantile(p: Double, values: List<Comparable<Any>>): Comparable<Any> {
190186
val h = indexOfQuantile(p, values.size).toInt()
191-
return values.quickSelect(h.coerceIn(0..<values.size))
187+
return values.quickSelect(h)
192188
}
193189
}
194190

195191
/** The observation closest to `count * p` */
196192
data object R3 : Selecting {
197193
// following apache commons + paper instead of wikipedia
198-
override fun oneBasedIndexOfQuantile(p: Double, count: Int): Int = round(count * p).toInt()
194+
override fun oneBasedIndexOfQuantile(p: Double, count: Int): Int =
195+
round(count * p).toInt()
196+
.coerceIn(1..count)
199197

200198
@Suppress("UNCHECKED_CAST")
201199
override fun quantile(p: Double, values: List<Comparable<Any>>): Comparable<Any> {
202200
val h = indexOfQuantile(p, values.size).toInt()
203-
return values.quickSelect(h.coerceIn(0..<values.size))
201+
return values.quickSelect(h)
204202
}
205203
}
206204

207-
// overload to get the right comparable type
208-
@JvmName("quantileTyped")
209-
@Suppress("EXTENSION_SHADOWED_BY_MEMBER", "UNCHECKED_CAST", "INAPPLICABLE_JVM_NAME")
210-
fun <T : Comparable<T>> quantile(p: Double, values: List<T>): T =
211-
quantile(p, values as List<Comparable<Any>>) as T
212205
}
213206

214207
// TODO add R2, R4, R5, R6, R9
215208
sealed interface Interpolating : QuantileEstimationMethod<Double, Double> {
216209

217210
/** Linear interpolation of the modes for the order statistics for the uniform distribution on [0, 1]. */
218211
data object R7 : Interpolating, PieceWiseLinear {
219-
override fun oneBasedIndexOfQuantile(p: Double, count: Int): Double = (count - 1.0) * p + 1.0
212+
override fun oneBasedIndexOfQuantile(p: Double, count: Int): Double =
213+
((count - 1.0) * p + 1.0)
214+
.coerceIn(1.0..count.toDouble())
220215
}
221216

222217
/** Linear interpolation of the approximate medians for order statistics. */
223218
data object R8 : Interpolating, PieceWiseLinear {
224-
override fun oneBasedIndexOfQuantile(p: Double, count: Int): Double = (count + 1.0 / 3.0) * p + 1.0 / 3.0
219+
override fun oneBasedIndexOfQuantile(p: Double, count: Int): Double =
220+
((count + 1.0 / 3.0) * p + 1.0 / 3.0)
221+
.coerceIn(1.0..count.toDouble())
225222
}
226223

227224
private interface PieceWiseLinear : Interpolating {
228225
override fun quantile(p: Double, values: List<Double>): Double {
229226
val h = oneBasedIndexOfQuantile(p, values.size)
230-
return values.quickSelect((floor(h).toInt() - 1).coerceIn(0..<values.size)) +
231-
(h - floor(h)) * (
232-
values.quickSelect((ceil(h).toInt() - 1).coerceIn(0..<values.size)) -
233-
values.quickSelect((floor(h).toInt() - 1).coerceIn(0..<values.size))
227+
return values.quickSelect(floor(h).toInt() - 1) + (h - floor(h)) * (
228+
values.quickSelect(ceil(h).toInt() - 1) -
229+
values.quickSelect(floor(h).toInt() - 1)
234230
)
235231
}
236232
}
@@ -246,6 +242,15 @@ internal sealed interface QuantileEstimationMethod<Value : Comparable<Value>, In
246242
}
247243
}
248244

245+
// overload to get the right comparable type
246+
@Suppress("UNCHECKED_CAST")
247+
internal fun <T : Comparable<T>> QuantileEstimationMethod.Selecting.quantile(p: Double, values: List<T>): T =
248+
quantile(p, values as List<Comparable<Any>>) as T
249+
250+
@Suppress("UNCHECKED_CAST")
251+
internal fun <T : Comparable<T>> QuantileEstimationMethod.Selecting.cast(): QuantileEstimationMethod<T, Int> =
252+
this as QuantileEstimationMethod<T, Int>
253+
249254
// corrects oneBasedIndexOfQuantile to zero-based index
250255
@Suppress("UNCHECKED_CAST")
251256
internal fun <IndexType : Number> QuantileEstimationMethod<*, IndexType>.indexOfQuantile(

0 commit comments

Comments
 (0)