@@ -58,9 +58,12 @@
 import org.apache.spark.broadcast.Broadcast;
 import org.apache.spark.sql.connector.read.InputPartition;
 import org.apache.spark.sql.connector.read.PartitionReaderFactory;
+import org.apache.spark.sql.connector.read.streaming.CompositeReadLimit;
 import org.apache.spark.sql.connector.read.streaming.MicroBatchStream;
 import org.apache.spark.sql.connector.read.streaming.Offset;
 import org.apache.spark.sql.connector.read.streaming.ReadLimit;
+import org.apache.spark.sql.connector.read.streaming.ReadMaxFiles;
+import org.apache.spark.sql.connector.read.streaming.ReadMaxRows;
 import org.apache.spark.sql.connector.read.streaming.SupportsAdmissionControl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -309,6 +312,47 @@ private static StreamingOffset determineStartingOffset(Table table, Long fromTim |
     }
   }

+  private static int getMaxFiles(ReadLimit readLimit) {
+    if (readLimit instanceof ReadMaxFiles) {
+      return ((ReadMaxFiles) readLimit).maxFiles();
+    }
+
+    if (readLimit instanceof CompositeReadLimit) {
+      // We do not expect a CompositeReadLimit to contain a nested CompositeReadLimit.
+      // In fact, it should only be a composite of two or more of ReadMinRows, ReadMaxRows and
+      // ReadMaxFiles, with no more than one of each.
+      ReadLimit[] limits = ((CompositeReadLimit) readLimit).getReadLimits();
+      for (ReadLimit limit : limits) {
+        if (limit instanceof ReadMaxFiles) {
+          return ((ReadMaxFiles) limit).maxFiles();
+        }
+      }
+    }
+
+    // there is no ReadMaxFiles, so return the default
+    return Integer.MAX_VALUE;
+  }
+
+  private static int getMaxRows(ReadLimit readLimit) {
+    if (readLimit instanceof ReadMaxRows) {
+      long maxRows = ((ReadMaxRows) readLimit).maxRows();
+      return Math.toIntExact(maxRows);
+    }
+
+    if (readLimit instanceof CompositeReadLimit) {
+      ReadLimit[] limits = ((CompositeReadLimit) readLimit).getReadLimits();
+      for (ReadLimit limit : limits) {
+        if (limit instanceof ReadMaxRows) {
+          long maxRows = ((ReadMaxRows) limit).maxRows();
+          return Math.toIntExact(maxRows);
+        }
+      }
+    }
+
+    // there is no ReadMaxRows, so return the default
+    return Integer.MAX_VALUE;
+  }
+
   @Override
   @SuppressWarnings("checkstyle:CyclomaticComplexity")
   public Offset latestOffset(Offset startOffset, ReadLimit limit) {
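
Reviewer note: the two helpers above unwrap whichever ReadLimit shape Spark passes to latestOffset, falling back to Integer.MAX_VALUE for any dimension that is absent. A minimal sketch of the shapes involved, assuming Spark 3.x's ReadLimit factory methods (the same ones this diff calls in getDefaultReadLimit below); the expected results appear as comments, since the helpers are private to this class:

import org.apache.spark.sql.connector.read.streaming.ReadLimit;

// Hypothetical standalone illustration, not part of the PR.
public class ReadLimitShapes {
  public static void main(String[] args) {
    ReadLimit files = ReadLimit.maxFiles(10);   // a ReadMaxFiles
    ReadLimit rows = ReadLimit.maxRows(1000L);  // a ReadMaxRows
    ReadLimit both = ReadLimit.compositeLimit(new ReadLimit[] {files, rows});

    // getMaxFiles(files) -> 10     getMaxFiles(rows)  -> Integer.MAX_VALUE
    // getMaxRows(rows)   -> 1000   getMaxRows(files)  -> Integer.MAX_VALUE
    // getMaxFiles(both)  -> 10     getMaxRows(both)   -> 1000 (members scanned)
  }
}

One caveat worth flagging: getMaxRows narrows Spark's long cap via Math.toIntExact, so a configured row limit above Integer.MAX_VALUE would throw ArithmeticException rather than truncate.
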
@@ -368,10 +412,8 @@ public Offset latestOffset(Offset startOffset, ReadLimit limit) { |
       while (taskIter.hasNext()) {
         FileScanTask task = taskIter.next();
         if (curPos >= startPosOfSnapOffset) {
-          // TODO : use readLimit provided in function param, the readLimits are derived from
-          // these 2 properties.
-          if ((curFilesAdded + 1) > maxFilesPerMicroBatch
-              || (curRecordCount + task.file().recordCount()) > maxRecordsPerMicroBatch) {
+          if ((curFilesAdded + 1) > getMaxFiles(limit)
+              || (curRecordCount + task.file().recordCount()) > getMaxRows(limit)) {
             shouldContinueReading = false;
             break;
           }
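
Reviewer note: with the TODO resolved, the per-batch caps now honor the ReadLimit argument instead of always re-reading the two table-property fields. A stripped-down sketch of the admission check (hypothetical helper and names, assuming per-file record counts are known up front):

public class AdmissionSketch {
  // Hypothetical illustration of the loop above, not the PR's code:
  // admit files in order until the next one would exceed either cap.
  static int admitFiles(long[] recordCounts, int maxFiles, long maxRows) {
    int filesAdded = 0;
    long rowsAdded = 0;
    for (long records : recordCounts) {
      if (filesAdded + 1 > maxFiles || rowsAdded + records > maxRows) {
        break; // mirrors shouldContinueReading = false in the diff
      }
      filesAdded++;
      rowsAdded += records;
    }
    return filesAdded;
  }
}
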
@@ -458,7 +500,7 @@ public ReadLimit getDefaultReadLimit() { |
         && maxRecordsPerMicroBatch != Integer.MAX_VALUE) {
       ReadLimit[] readLimits = new ReadLimit[2];
       readLimits[0] = ReadLimit.maxFiles(maxFilesPerMicroBatch);
-      readLimits[1] = ReadLimit.maxRows(maxFilesPerMicroBatch);
+      readLimits[1] = ReadLimit.maxRows(maxRecordsPerMicroBatch);
       return ReadLimit.compositeLimit(readLimits);
     } else if (maxFilesPerMicroBatch != Integer.MAX_VALUE) {
       return ReadLimit.maxFiles(maxFilesPerMicroBatch);
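
Reviewer note: the one-line change above is the actual bug fix. The composite branch previously passed maxFilesPerMicroBatch into ReadLimit.maxRows, so whenever both caps were configured the row cap silently mirrored the file cap. A sketch of the corrected construction, with hypothetical local values standing in for the property-derived fields; the branches after the hunk are assumptions, since the diff cuts off mid-method:

import org.apache.spark.sql.connector.read.streaming.ReadLimit;

// Hypothetical illustration of the corrected default-limit logic.
public class DefaultLimitSketch {
  public static void main(String[] args) {
    int maxFilesPerMicroBatch = 100;      // stand-in for the configured file cap
    int maxRecordsPerMicroBatch = 50_000; // stand-in for the configured row cap

    ReadLimit defaultLimit;
    if (maxFilesPerMicroBatch != Integer.MAX_VALUE
        && maxRecordsPerMicroBatch != Integer.MAX_VALUE) {
      defaultLimit =
          ReadLimit.compositeLimit(
              new ReadLimit[] {
                ReadLimit.maxFiles(maxFilesPerMicroBatch),
                // fixed line: was ReadLimit.maxRows(maxFilesPerMicroBatch)
                ReadLimit.maxRows(maxRecordsPerMicroBatch)
              });
    } else if (maxFilesPerMicroBatch != Integer.MAX_VALUE) {
      defaultLimit = ReadLimit.maxFiles(maxFilesPerMicroBatch);
    } else if (maxRecordsPerMicroBatch != Integer.MAX_VALUE) {
      defaultLimit = ReadLimit.maxRows(maxRecordsPerMicroBatch);
    } else {
      defaultLimit = ReadLimit.allAvailable(); // assumed fallback when neither cap is set
    }

    System.out.println(defaultLimit);
  }
}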