[SPARK-23683][SQL] FileCommitProtocol.instantiate() hardening #20824
Closed: steveloughran wants to merge 3 commits into apache:master from steveloughran:stevel/SPARK-23683-protocol-instantiate
Commits (3):

- 529db08 (steveloughran): With SPARK-20236, FileCommitProtocol.instantiate() looks for a three …
- a18ed58 (steveloughran): SPARK-20236: fix style typo and improve failure reporting on an asser…
- 64602ae (steveloughran): SPARK-23683 document why the test is not using assert() to validate e…
core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala (148 additions, 0 deletions)
```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.internal.io

import org.apache.spark.SparkFunSuite

/**
 * Unit tests for instantiation of FileCommitProtocol implementations.
 */
class FileCommitProtocolInstantiationSuite extends SparkFunSuite {

  test("Dynamic partitions require appropriate constructor") {

    // You cannot instantiate a two-arg committer with dynamic partitions
    // enabled.
    val ex = intercept[IllegalArgumentException] {
      instantiateClassic(true)
    }
    // Check the contents of the message and rethrow if unexpected.
    // This preserves the stack trace of the unexpected exception.
    if (!ex.toString.contains("Dynamic Partition Overwrite")) {
      fail(s"Wrong text in caught exception $ex", ex)
    }
  }

  test("Standard partitions work with classic constructor") {
    instantiateClassic(false)
  }

  test("Three arg constructors have priority") {
    assert(3 == instantiateNew(false).argCount,
      "Wrong constructor argument count")
  }

  test("Three arg constructors have priority when dynamic") {
    assert(3 == instantiateNew(true).argCount,
      "Wrong constructor argument count")
  }

  test("The protocol must be of the correct class") {
    intercept[ClassCastException] {
      FileCommitProtocol.instantiate(
        classOf[Other].getCanonicalName,
        "job",
        "path",
        false)
    }
  }

  test("If there is no matching constructor, class hierarchy is irrelevant") {
    intercept[NoSuchMethodException] {
      FileCommitProtocol.instantiate(
        classOf[NoMatchingArgs].getCanonicalName,
        "job",
        "path",
        false)
    }
  }

  /**
   * Create a classic two-arg protocol instance.
   * @param dynamic dynamic partitioning mode
   * @return the instance
   */
  private def instantiateClassic(dynamic: Boolean): ClassicConstructorCommitProtocol = {
    FileCommitProtocol.instantiate(
      classOf[ClassicConstructorCommitProtocol].getCanonicalName,
      "job",
      "path",
      dynamic).asInstanceOf[ClassicConstructorCommitProtocol]
  }

  /**
   * Create a three-arg protocol instance.
   * @param dynamic dynamic partitioning mode
   * @return the instance
   */
  private def instantiateNew(
      dynamic: Boolean): FullConstructorCommitProtocol = {
    FileCommitProtocol.instantiate(
      classOf[FullConstructorCommitProtocol].getCanonicalName,
      "job",
      "path",
      dynamic).asInstanceOf[FullConstructorCommitProtocol]
  }

}

/**
 * This protocol implementation does not have the new three-arg
 * constructor.
 */
private class ClassicConstructorCommitProtocol(arg1: String, arg2: String)
  extends HadoopMapReduceCommitProtocol(arg1, arg2) {
}

/**
 * This protocol implementation has the new three-arg constructor
 * alongside the original, and a 4-arg one for completeness.
 * The final value of the real constructor is the number of arguments
 * used in the 2- and 3-arg constructors, for test assertions.
 */
private class FullConstructorCommitProtocol(
    arg1: String,
    arg2: String,
    b: Boolean,
    val argCount: Int)
  extends HadoopMapReduceCommitProtocol(arg1, arg2, b) {

  def this(arg1: String, arg2: String) = {
    this(arg1, arg2, false, 2)
  }

  def this(arg1: String, arg2: String, b: Boolean) = {
    // Forward the dynamic flag to the primary constructor.
    this(arg1, arg2, b, 3)
  }
}

/**
 * This has the two-arg constructor, but isn't the right class.
 */
private class Other(arg1: String, arg2: String) {
}

/**
 * This has no matching arguments as well as being the wrong class.
 */
private class NoMatchingArgs() {
}
```
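The suite pins down the constructor-resolution order in `FileCommitProtocol.instantiate()`. For context, here is a minimal sketch of that lookup as the tests imply it — reconstructed for illustration, not Spark's actual source: prefer the three-arg `(jobId, outputPath, dynamicPartitionOverwrite)` constructor, and accept the classic two-arg fallback only when dynamic partition overwrite was not requested.

```scala
import org.apache.spark.internal.io.FileCommitProtocol

object InstantiateSketch {
  // Sketch of the hardened lookup: the three-arg constructor wins; the
  // two-arg fallback is rejected when dynamic partition overwrite is on.
  def instantiate(
      className: String,
      jobId: String,
      outputPath: String,
      dynamicPartitionOverwrite: Boolean): FileCommitProtocol = {
    val clazz = Class.forName(className).asInstanceOf[Class[FileCommitProtocol]]
    try {
      // Preferred signature:
      // (jobId: String, outputPath: String, dynamicPartitionOverwrite: Boolean)
      val ctor = clazz.getDeclaredConstructor(
        classOf[String], classOf[String], classOf[Boolean])
      ctor.newInstance(jobId, outputPath, Boolean.box(dynamicPartitionOverwrite))
    } catch {
      case _: NoSuchMethodException =>
        // Classic committers only understand (jobId, outputPath). Fail fast
        // rather than silently ignoring the overwrite request; the message
        // must contain "Dynamic Partition Overwrite", which the first test
        // in the suite asserts on.
        require(!dynamicPartitionOverwrite,
          s"Dynamic Partition Overwrite is not supported by $className")
        val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String])
        ctor.newInstance(jobId, outputPath)
    }
  }
}
```

Note how the fallback path fails fast with an `IllegalArgumentException` whose text mentions "Dynamic Partition Overwrite"; the first test in the suite asserts on exactly that message fragment.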
Conversations
Reviewer: BTW, why don't we warn and continue? Just wanted to make sure that we took this case into account. For example, wouldn't this invalidate the case below?
Author: Problem is that the dynamic partition logic in `InsertIntoHadoopFsRelationCommand` assumes that rename() is a fast, reliable operation you can do with any implementation of FileCommitProtocol: it sets itself up for the feature when enabled, instantiates the inner committer, and carries on with the dynamic partitioning irrespective of whether or not the committer can deliver on that. rename() doesn't always work like that, which breaks the rest of the algorithm. If the committer doesn't have that 3-arg constructor, you can't be confident that you can do that. To silently log and continue is to run the risk that the underlying committer's commit algorithm isn't compatible with the dynamic partitioning algorithm.

A fail-fast ensures that when the outcome is going to be unknown, you aren't left trying to work out what's happened.

Regarding your example, yes, it's in trouble. The problem is how to differentiate that from subclasses which don't know anything at all about the new feature. You can't even look for an interface on the newly created object if the base class implements it; you are left with some dynamic probe of the instance.
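To make the fail-fast concrete, a short caller-side sketch (the `LegacyCommitter` class below is hypothetical, invented for illustration):

```scala
import org.apache.spark.internal.io.{FileCommitProtocol, HadoopMapReduceCommitProtocol}

// Hypothetical committer that predates the feature: it only has the
// classic two-arg (jobId, outputPath) constructor.
class LegacyCommitter(jobId: String, outputPath: String)
  extends HadoopMapReduceCommitProtocol(jobId, outputPath)

object FailFastExample {
  def main(args: Array[String]): Unit = {
    // With this patch, requesting dynamic partition overwrite fails at
    // instantiation time with an IllegalArgumentException, instead of
    // silently running an algorithm the committer may not support.
    FileCommitProtocol.instantiate(
      classOf[LegacyCommitter].getCanonicalName,
      "job-0001",
      "/tmp/output",
      dynamicPartitionOverwrite = true) // throws IllegalArgumentException
  }
}
```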
Reviewer: Hm .. actually we could do .. for example .. something like the sketch below:
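(A minimal sketch of the shape of this idea — the trait and method names below are illustrative assumptions, not Spark's actual API.)

```scala
import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol

// Illustrative only: a capability flag that committer implementations
// override to declare that they understand dynamic partition overwrite.
trait SupportsDynamicPartitionOverwrite {
  def dynamicPartitionOverwrite: Boolean = false
}

// A committer opting in would override the flag:
class MyCommitter(jobId: String, outputPath: String)
  extends HadoopMapReduceCommitProtocol(jobId, outputPath)
  with SupportsDynamicPartitionOverwrite {
  override def dynamicPartitionOverwrite: Boolean = true
}
```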
(^ it's not double checked closely, for example, whether the signature is safe or not. Was just an idea.)

Then use `committer.dynamicPartitionOverwrite` in `InsertIntoHadoopFsRelationCommand` to respect whether the commit protocol supports it or not, if I understood all correctly, and then produce a warning (or throw an error?) saying dynamic partition overwrite will be ignored (or not).

However, sure. I think this case is kind of a made-up case and should be a corner case, I guess. I don't want to suggest overkill (maybe), and I think we don't have to complicate this too much for now.

I am okay as-is. Just wanted to make sure that we considered and checked other possible stories.
Author: Something like that would work, though it'd be a bit more convoluted... InsertIntoFSRelation would have to check, and then handle the situation of missing support.

One thing to consider in any form is: all implementations of FileCommitProtocol should be aware of the new dynamic partition overwrite feature... adding a new 3-arg constructor is an implicit way of saying "I understand this". Where it's weak is that there's no way for it to say "I understand this and will handle it myself", because essentially that's what is being done in the [Netflix PartitionedStagingCommitter](https://github.com/apache/hadoop/blob/trunk/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/staging/PartitionedStagingCommitter.java#L142), which purges all partitions for which the new job has data. With that committer, if the insert asks for the feature then the FileCommitProtocol binding to it could (somehow) turn this on and so handle everything internally.

Like I said, a more complex model. It'd need changes a fair way through things, and then the usual complexity of getting commit logic right.