|
1 | | -package org.jetbrains.kotlinx.dataframe.examples.kotlinSpark |
2 | | - |
3 | | -import org.apache.spark.api.java.JavaSparkContext |
4 | | -import org.apache.spark.sql.Dataset |
5 | | -import org.apache.spark.sql.Row |
6 | | -import org.apache.spark.sql.RowFactory |
7 | | -import org.apache.spark.sql.SparkSession |
8 | | -import org.apache.spark.sql.types.ArrayType |
9 | | -import org.apache.spark.sql.types.DataType |
10 | | -import org.apache.spark.sql.types.DataTypes |
11 | | -import org.apache.spark.sql.types.Decimal |
12 | | -import org.apache.spark.sql.types.DecimalType |
13 | | -import org.apache.spark.sql.types.MapType |
14 | | -import org.apache.spark.sql.types.StructType |
15 | | -import org.apache.spark.unsafe.types.CalendarInterval |
16 | | -import org.jetbrains.kotlinx.dataframe.AnyFrame |
17 | | -import org.jetbrains.kotlinx.dataframe.DataColumn |
18 | | -import org.jetbrains.kotlinx.dataframe.DataFrame |
19 | | -import org.jetbrains.kotlinx.dataframe.DataRow |
20 | | -import org.jetbrains.kotlinx.dataframe.api.rows |
21 | | -import org.jetbrains.kotlinx.dataframe.api.schema |
22 | | -import org.jetbrains.kotlinx.dataframe.api.toDataFrame |
23 | | -import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup |
24 | | -import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion |
25 | | -import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema |
26 | | -import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema |
27 | | -import org.jetbrains.kotlinx.spark.api.toRDD |
28 | | -import java.math.BigDecimal |
29 | | -import java.math.BigInteger |
30 | | -import java.sql.Date |
31 | | -import java.sql.Timestamp |
32 | | -import java.time.Instant |
33 | | -import java.time.LocalDate |
34 | | -import kotlin.reflect.KType |
35 | | -import kotlin.reflect.KTypeProjection |
36 | | -import kotlin.reflect.full.createType |
37 | | -import kotlin.reflect.full.isSubtypeOf |
38 | | -import kotlin.reflect.full.withNullability |
39 | | -import kotlin.reflect.typeOf |
40 | | - |
41 | | -// region Spark to DataFrame |
42 | | - |
43 | | -/** |
44 | | - * Converts an untyped Spark [Dataset] (Dataframe) to a Kotlin [DataFrame]. |
45 | | - * [StructTypes][StructType] are converted to [ColumnGroups][ColumnGroup]. |
46 | | - * |
47 | | - * DataFrame supports type inference to do the conversion automatically. |
48 | | - * This is usually fine for smaller data sets, but when working with larger datasets a type map might be a good idea. |
49 | | - * See [convertToDataFrame] for more information. |
50 | | - */ |
51 | | -fun Dataset<Row>.convertToDataFrameByInference( |
52 | | - schema: StructType = schema(), |
53 | | - prefix: List<String> = emptyList(), |
54 | | -): AnyFrame { |
55 | | - val columns = schema.fields().map { field -> |
56 | | - val name = field.name() |
57 | | - when (val dataType = field.dataType()) { |
58 | | - is StructType -> |
59 | | - DataColumn.createColumnGroup( |
60 | | - name = name, |
61 | | - df = convertToDataFrameByInference(dataType, prefix + name), |
62 | | - ) |
63 | | - |
64 | | - else -> |
65 | | - DataColumn.createByInference( |
66 | | - name = name, |
67 | | - values = select((prefix + name).joinToString(".")) |
68 | | - .collectAsList() |
69 | | - .map { it[0] }, |
70 | | - suggestedType = TypeSuggestion.Infer, |
71 | | - nullable = field.nullable(), |
72 | | - ) |
73 | | - } |
74 | | - } |
75 | | - return columns.toDataFrame() |
76 | | -} |
77 | | - |
78 | | -/** |
79 | | - * Converts an untyped Spark [Dataset] (Dataframe) to a Kotlin [DataFrame]. |
80 | | - * [StructTypes][StructType] are converted to [ColumnGroups][ColumnGroup]. |
81 | | - * |
82 | | - * This version uses a [type-map][DataType.convertToDataFrame] to convert the schemas with a fallback to inference. |
83 | | - * For smaller data sets, inference is usually fine too. |
84 | | - * See [convertToDataFrameByInference] for more information. |
85 | | - */ |
86 | | -fun Dataset<Row>.convertToDataFrame(schema: StructType = schema(), prefix: List<String> = emptyList()): AnyFrame { |
87 | | - val columns = schema.fields().map { field -> |
88 | | - val name = field.name() |
89 | | - when (val dataType = field.dataType()) { |
90 | | - is StructType -> |
91 | | - DataColumn.createColumnGroup( |
92 | | - name = name, |
93 | | - df = convertToDataFrame(dataType, prefix + name), |
94 | | - ) |
95 | | - |
96 | | - else -> |
97 | | - DataColumn.createByInference( |
98 | | - name = name, |
99 | | - values = select((prefix + name).joinToString(".")) |
100 | | - .collectAsList() |
101 | | - .map { it[0] }, |
102 | | - suggestedType = |
103 | | - dataType.convertToDataFrame() |
104 | | - ?.let(TypeSuggestion::Use) |
105 | | - ?: TypeSuggestion.Infer, // fallback to inference if needed |
106 | | - nullable = field.nullable(), |
107 | | - ) |
108 | | - } |
109 | | - } |
110 | | - return columns.toDataFrame() |
111 | | -} |
112 | | - |
113 | | -/** |
114 | | - * Returns the corresponding Kotlin type for a given Spark DataType. |
115 | | - * |
116 | | - * This list may be incomplete, but it can at least give you a good start. |
117 | | - * |
118 | | - * @return The KType that corresponds to the Spark DataType, or null if no matching KType is found. |
119 | | - */ |
120 | | -fun DataType.convertToDataFrame(): KType? = |
121 | | - when { |
122 | | - this == DataTypes.ByteType -> typeOf<Byte>() |
123 | | - |
124 | | - this == DataTypes.ShortType -> typeOf<Short>() |
125 | | - |
126 | | - this == DataTypes.IntegerType -> typeOf<Int>() |
127 | | - |
128 | | - this == DataTypes.LongType -> typeOf<Long>() |
129 | | - |
130 | | - this == DataTypes.BooleanType -> typeOf<Boolean>() |
131 | | - |
132 | | - this == DataTypes.FloatType -> typeOf<Float>() |
133 | | - |
134 | | - this == DataTypes.DoubleType -> typeOf<Double>() |
| 1 | +@file:Suppress("ktlint:standard:no-empty-file") |
135 | 2 |
|
136 | | - this == DataTypes.StringType -> typeOf<String>() |
137 | | - |
138 | | - this == DataTypes.DateType -> typeOf<Date>() |
139 | | - |
140 | | - this == DataTypes.TimestampType -> typeOf<Timestamp>() |
141 | | - |
142 | | - this is DecimalType -> typeOf<Decimal>() |
143 | | - |
144 | | - this == DataTypes.CalendarIntervalType -> typeOf<CalendarInterval>() |
145 | | - |
146 | | - this == DataTypes.NullType -> nullableNothingType |
147 | | - |
148 | | - this == DataTypes.BinaryType -> typeOf<ByteArray>() |
149 | | - |
150 | | - this is ArrayType -> { |
151 | | - when (elementType()) { |
152 | | - DataTypes.ShortType -> typeOf<ShortArray>() |
153 | | - DataTypes.IntegerType -> typeOf<IntArray>() |
154 | | - DataTypes.LongType -> typeOf<LongArray>() |
155 | | - DataTypes.FloatType -> typeOf<FloatArray>() |
156 | | - DataTypes.DoubleType -> typeOf<DoubleArray>() |
157 | | - DataTypes.BooleanType -> typeOf<BooleanArray>() |
158 | | - else -> null |
159 | | - } |
160 | | - } |
161 | | - |
162 | | - this is MapType -> { |
163 | | - val key = keyType().convertToDataFrame() ?: return null |
164 | | - val value = valueType().convertToDataFrame() ?: return null |
165 | | - Map::class.createType( |
166 | | - listOf( |
167 | | - KTypeProjection.invariant(key), |
168 | | - KTypeProjection.invariant(value.withNullability(valueContainsNull())), |
169 | | - ), |
170 | | - ) |
171 | | - } |
172 | | - |
173 | | - else -> null |
174 | | - } |
175 | | - |
176 | | -// endregion |
177 | | - |
178 | | -// region DataFrame to Spark |
179 | | - |
180 | | -/** |
181 | | - * Converts the DataFrame to a Spark Dataset of Rows using the provided SparkSession and JavaSparkContext. |
182 | | - * |
183 | | - * Spark needs both the data and the schema to be converted to create a correct [Dataset]. |
184 | | - * |
185 | | - * @param spark The SparkSession object to use for creating the DataFrame. |
186 | | - * @param sc The JavaSparkContext object to use for converting the DataFrame to RDD. |
187 | | - * @return A Dataset of Rows representing the converted DataFrame. |
188 | | - */ |
189 | | -fun DataFrame<*>.convertToSpark(spark: SparkSession, sc: JavaSparkContext): Dataset<Row> { |
190 | | - val rows = sc.toRDD(rows().map { it.convertToSpark() }) |
191 | | - return spark.createDataFrame(rows, schema().convertToSpark()) |
192 | | -} |
193 | | - |
194 | | -/** |
195 | | - * Converts a DataRow to a Spark Row object. |
196 | | - * |
197 | | - * @return The converted Spark Row. |
198 | | - */ |
199 | | -fun DataRow<*>.convertToSpark(): Row = |
200 | | - RowFactory.create( |
201 | | - *values().map { |
202 | | - when (it) { |
203 | | - is DataRow<*> -> it.convertToSpark() |
204 | | - else -> it |
205 | | - } |
206 | | - }.toTypedArray(), |
207 | | - ) |
208 | | - |
209 | | -/** |
210 | | - * Converts a DataFrameSchema to a Spark StructType. |
211 | | - * |
212 | | - * @return The converted Spark StructType. |
213 | | - */ |
214 | | -fun DataFrameSchema.convertToSpark(): StructType = |
215 | | - DataTypes.createStructType( |
216 | | - columns.map { (name, schema) -> |
217 | | - DataTypes.createStructField(name, schema.convertToSpark(), schema.nullable) |
218 | | - }, |
219 | | - ) |
220 | | - |
221 | | -/** |
222 | | - * Converts a ColumnSchema object to Spark DataType. |
223 | | - * |
224 | | - * @return The Spark DataType corresponding to the given ColumnSchema object. |
225 | | - * @throws IllegalArgumentException if the column type or kind is unknown. |
226 | | - */ |
227 | | -fun ColumnSchema.convertToSpark(): DataType = |
228 | | - when (this) { |
229 | | - is ColumnSchema.Value -> type.convertToSpark() ?: error("unknown data type: $type") |
230 | | - is ColumnSchema.Group -> schema.convertToSpark() |
231 | | - is ColumnSchema.Frame -> error("nested dataframes are not supported") |
232 | | - else -> error("unknown column kind: $this") |
233 | | - } |
| 3 | +package org.jetbrains.kotlinx.dataframe.examples.kotlinSpark |
234 | 4 |
|
235 | | -/** |
236 | | - * Returns the corresponding Spark DataType for a given Kotlin type. |
237 | | - * |
238 | | - * This list may be incomplete, but it can at least give you a good start. |
239 | | - * |
240 | | - * @return The Spark DataType that corresponds to the Kotlin type, or null if no matching DataType is found. |
| 5 | +/* |
| 6 | + * See ../spark/compatibilityLayer.kt for the implementation. |
| 7 | + * It's the same with- and without the Kotlin Spark API. |
241 | 8 | */ |
242 | | -fun KType.convertToSpark(): DataType? = |
243 | | - when { |
244 | | - isSubtypeOf(typeOf<Byte?>()) -> DataTypes.ByteType |
245 | | - |
246 | | - isSubtypeOf(typeOf<Short?>()) -> DataTypes.ShortType |
247 | | - |
248 | | - isSubtypeOf(typeOf<Int?>()) -> DataTypes.IntegerType |
249 | | - |
250 | | - isSubtypeOf(typeOf<Long?>()) -> DataTypes.LongType |
251 | | - |
252 | | - isSubtypeOf(typeOf<Boolean?>()) -> DataTypes.BooleanType |
253 | | - |
254 | | - isSubtypeOf(typeOf<Float?>()) -> DataTypes.FloatType |
255 | | - |
256 | | - isSubtypeOf(typeOf<Double?>()) -> DataTypes.DoubleType |
257 | | - |
258 | | - isSubtypeOf(typeOf<String?>()) -> DataTypes.StringType |
259 | | - |
260 | | - isSubtypeOf(typeOf<LocalDate?>()) -> DataTypes.DateType |
261 | | - |
262 | | - isSubtypeOf(typeOf<Date?>()) -> DataTypes.DateType |
263 | | - |
264 | | - isSubtypeOf(typeOf<Timestamp?>()) -> DataTypes.TimestampType |
265 | | - |
266 | | - isSubtypeOf(typeOf<Instant?>()) -> DataTypes.TimestampType |
267 | | - |
268 | | - isSubtypeOf(typeOf<Decimal?>()) -> DecimalType.SYSTEM_DEFAULT() |
269 | | - |
270 | | - isSubtypeOf(typeOf<BigDecimal?>()) -> DecimalType.SYSTEM_DEFAULT() |
271 | | - |
272 | | - isSubtypeOf(typeOf<BigInteger?>()) -> DecimalType.SYSTEM_DEFAULT() |
273 | | - |
274 | | - isSubtypeOf(typeOf<CalendarInterval?>()) -> DataTypes.CalendarIntervalType |
275 | | - |
276 | | - isSubtypeOf(nullableNothingType) -> DataTypes.NullType |
277 | | - |
278 | | - isSubtypeOf(typeOf<ByteArray?>()) -> DataTypes.BinaryType |
279 | | - |
280 | | - isSubtypeOf(typeOf<ShortArray?>()) -> DataTypes.createArrayType(DataTypes.ShortType, false) |
281 | | - |
282 | | - isSubtypeOf(typeOf<IntArray?>()) -> DataTypes.createArrayType(DataTypes.IntegerType, false) |
283 | | - |
284 | | - isSubtypeOf(typeOf<LongArray?>()) -> DataTypes.createArrayType(DataTypes.LongType, false) |
285 | | - |
286 | | - isSubtypeOf(typeOf<FloatArray?>()) -> DataTypes.createArrayType(DataTypes.FloatType, false) |
287 | | - |
288 | | - isSubtypeOf(typeOf<DoubleArray?>()) -> DataTypes.createArrayType(DataTypes.DoubleType, false) |
289 | | - |
290 | | - isSubtypeOf(typeOf<BooleanArray?>()) -> DataTypes.createArrayType(DataTypes.BooleanType, false) |
291 | | - |
292 | | - isSubtypeOf(typeOf<Array<*>>()) -> |
293 | | - error("non-primitive arrays are not supported for now, you can add it yourself") |
294 | | - |
295 | | - isSubtypeOf(typeOf<List<*>>()) -> error("lists are not supported for now, you can add it yourself") |
296 | | - |
297 | | - isSubtypeOf(typeOf<Set<*>>()) -> error("sets are not supported for now, you can add it yourself") |
298 | | - |
299 | | - classifier == Map::class -> { |
300 | | - val (key, value) = arguments |
301 | | - DataTypes.createMapType( |
302 | | - key.type?.convertToSpark(), |
303 | | - value.type?.convertToSpark(), |
304 | | - value.type?.isMarkedNullable ?: true, |
305 | | - ) |
306 | | - } |
307 | | - |
308 | | - else -> null |
309 | | - } |
310 | | - |
311 | | -private val nullableNothingType: KType = typeOf<List<Nothing?>>().arguments.first().type!! |
312 | | - |
313 | | -// endregion |
0 commit comments