-
Notifications
You must be signed in to change notification settings - Fork 1.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[1105] Delta Lake Change Data Feed support - streaming reads #1154
Closed
Closed
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,7 @@ import scala.util.matching.Regex | |
|
||
import org.apache.spark.sql.delta.{ColumnWithDefaultExprUtils, DeltaErrors, DeltaLog, DeltaOptions, DeltaTimeTravelSpec, GeneratedColumn, StartingVersion, StartingVersionLatest} | ||
import org.apache.spark.sql.delta.actions._ | ||
import org.apache.spark.sql.delta.commands.cdc.CDCReader | ||
import org.apache.spark.sql.delta.files.DeltaSourceSnapshot | ||
import org.apache.spark.sql.delta.metering.DeltaLogging | ||
import org.apache.spark.sql.delta.schema.SchemaUtils | ||
|
@@ -93,8 +94,15 @@ trait DeltaSourceBase extends Source | |
with SupportsAdmissionControl | ||
with DeltaLogging { self: DeltaSource => | ||
|
||
// Schema reported by this source: the current snapshot's metadata schema after it has been
// passed through ColumnWithDefaultExprUtils.removeDefaultExpressions.
override val schema: StructType = | ||
ColumnWithDefaultExprUtils.removeDefaultExpressions(deltaLog.snapshot.metadata.schema) | ||
/**
 * Schema reported by this source. The snapshot's metadata schema is first passed through
 * `ColumnWithDefaultExprUtils.removeDefaultExpressions`; when the `readChangeFeed` option is
 * set, the result is additionally mapped through `CDCReader.cdcReadSchema`.
 */
override val schema: StructType = {
  val baseSchema =
    ColumnWithDefaultExprUtils.removeDefaultExpressions(deltaLog.snapshot.metadata.schema)
  // Plain reads expose the base schema as-is; change-feed reads use the CDC read schema.
  if (!options.readChangeFeed) baseSchema else CDCReader.cdcReadSchema(baseSchema)
}
|
||
protected var lastOffsetForTriggerAvailableNow: DeltaSourceOffset = _ | ||
|
||
|
@@ -104,20 +112,30 @@ trait DeltaSourceBase extends Source | |
isStartingVersion: Boolean, | ||
limits: Option[AdmissionLimits] = Some(new AdmissionLimits())): | ||
ClosableIterator[IndexedFile] = { | ||
val changes = getFileChanges(fromVersion, fromIndex, isStartingVersion) | ||
if (limits.isEmpty) return changes | ||
|
||
// Take each change until we've seen the configured number of addFiles. Some changes don't | ||
// represent file additions; we retain them for offset tracking, but they don't count towards | ||
// the maxFilesPerTrigger conf. | ||
var admissionControl = limits.get | ||
changes.withClose { it => | ||
it.takeWhile { index => | ||
admissionControl.admit(Option(index.add)) | ||
if (options.readChangeFeed) { | ||
// in this CDC use case, we need to consider RemoveFile and AddCDCFiles when getting the | ||
// offset. | ||
|
||
// This method is only used to get the offset so we need to return an iterator of IndexedFile. | ||
getFileChangesForCDC(fromVersion, fromIndex, isStartingVersion, limits, None).flatMap(_._2) | ||
.toClosable | ||
} else { | ||
val changes = getFileChanges(fromVersion, fromIndex, isStartingVersion) | ||
if (limits.isEmpty) return changes | ||
|
||
// Take each change until we've seen the configured number of addFiles. Some changes don't | ||
// represent file additions; we retain them for offset tracking, but they don't count towards | ||
// the maxFilesPerTrigger conf. | ||
var admissionControl = limits.get | ||
changes.withClose { it => | ||
it.takeWhile { index => | ||
admissionControl.admit(Option(index.add)) | ||
} | ||
} | ||
} | ||
} | ||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: delete |
||
/** | ||
* get the changes from startVersion, startIndex to the end | ||
* @param startVersion - calculated starting version | ||
|
@@ -131,21 +149,25 @@ trait DeltaSourceBase extends Source | |
startIndex: Long, | ||
isStartingVersion: Boolean, | ||
endOffset: DeltaSourceOffset): DataFrame = { | ||
val changes = getFileChanges(startVersion, startIndex, isStartingVersion) | ||
try { | ||
val fileActionsIter = changes.takeWhile { case IndexedFile(version, index, _, _, _, _) => | ||
version < endOffset.reservoirVersion || | ||
(version == endOffset.reservoirVersion && index <= endOffset.index) | ||
} | ||
if (options.readChangeFeed) { | ||
getCDCFileChangesAndCreateDataFrame(startVersion, startIndex, isStartingVersion, endOffset) | ||
} else { | ||
val changes = getFileChanges(startVersion, startIndex, isStartingVersion) | ||
try { | ||
val fileActionsIter = changes.takeWhile { case IndexedFile(version, index, _, _, _, _) => | ||
version < endOffset.reservoirVersion || | ||
(version == endOffset.reservoirVersion && index <= endOffset.index) | ||
} | ||
|
||
val filteredIndexedFiles = fileActionsIter.filter { indexedFile => | ||
indexedFile.getFileAction != null && | ||
excludeRegex.forall(_.findFirstIn(indexedFile.getFileAction.path).isEmpty) | ||
} | ||
val filteredIndexedFiles = fileActionsIter.filter { indexedFile => | ||
indexedFile.getFileAction != null && | ||
excludeRegex.forall(_.findFirstIn(indexedFile.getFileAction.path).isEmpty) | ||
} | ||
|
||
createDataFrame(filteredIndexedFiles) | ||
} finally { | ||
changes.close() | ||
createDataFrame(filteredIndexedFiles) | ||
} finally { | ||
changes.close() | ||
} | ||
} | ||
} | ||
|
||
|
@@ -183,7 +205,8 @@ case class DeltaSource( | |
deltaLog: DeltaLog, | ||
options: DeltaOptions, | ||
filters: Seq[Expression] = Nil) | ||
extends DeltaSourceBase { | ||
extends DeltaSourceBase | ||
with DeltaSourceCDCSupport { | ||
|
||
// Deprecated. Please use `ignoreDeletes` or `ignoreChanges` from now on. | ||
private val ignoreFileDeletion = { | ||
|
@@ -520,9 +543,36 @@ case class DeltaSource( | |
// Renders this source as "DeltaSource[<dataPath>]", using the data path of its DeltaLog.
override def toString(): String = s"DeltaSource[${deltaLog.dataPath}]"
|
||
trait DeltaSourceAdmissionBase { self: AdmissionLimits => | ||
// This variable indicates whether a commit has already been processed by a batch or not. | ||
var commitProcessedInBatch = false | ||
|
||
/**
 * Overload of `admit` that decides whether all AddCDCFiles of one commit fit within the
 * remaining rate limit.
 *
 * An empty sequence is always admitted and leaves the limits and `commitProcessedInBatch`
 * untouched. For a non-empty sequence the whole commit is charged against `filesToTake` and
 * `bytesToTake` and `commitProcessedInBatch` is set, whatever the verdict.
 *
 * @param fileActions the AddCDCFile actions belonging to a single commit
 * @return true if the commit should be admitted into the current batch
 */
def admit(fileActions: Seq[AddCDCFile]): Boolean = {
  if (fileActions.nonEmpty) {
    val numFiles = fileActions.size
    val totalBytes = fileActions.foldLeft(0L)(_ + _.size)

    // If nothing has been admitted yet, admit the commit unconditionally to avoid deadlock;
    // otherwise admit only when all of its files together still satisfy both limits.
    val fitsInLimits = filesToTake - numFiles >= 0 && bytesToTake - totalBytes >= 0
    val shouldAdmit = !commitProcessedInBatch || fitsInLimits

    commitProcessedInBatch = true
    filesToTake -= numFiles
    bytesToTake -= totalBytes
    shouldAdmit
  } else {
    true
  }
}
|
||
/** Whether to admit the next file */ | ||
def admit(fileAction: Option[FileAction]): Boolean = { | ||
commitProcessedInBatch = true | ||
|
||
def getSize(action: FileAction): Long = { | ||
action match { | ||
case a: AddFile => | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: add a comment on why we are removing some fields from the schema