[SPARK-37173][SQL] SparkGetFunctionOperation return builtin function only once #34453

This patch passes all tests.
This patch merges cleanly.
This patch adds the following public classes (experimental):
class SparkConf(object):
class ProbabilisticClassifier(Classifier, _ProbabilisticClassifierParams, metaclass=ABCMeta):
class ProbabilisticClassificationModel(
class _JavaProbabilisticClassifier(ProbabilisticClassifier, _JavaClassifier, metaclass=ABCMeta):
class _JavaProbabilisticClassificationModel(
class _LinearSVCParams(
class LinearSVCModel(
class _LogisticRegressionParams(
class LogisticRegression(
class LogisticRegressionModel(
class BinaryLogisticRegressionSummary(_BinaryClassificationSummary, LogisticRegressionSummary):
class BinaryLogisticRegressionTrainingSummary(
class DecisionTreeClassifier(
class DecisionTreeClassificationModel(
class RandomForestClassifier(
class RandomForestClassificationModel(
class RandomForestClassificationTrainingSummary(
class BinaryRandomForestClassificationTrainingSummary(
class GBTClassifier(
class GBTClassificationModel(
class NaiveBayes(
class NaiveBayesModel(
class _MultilayerPerceptronParams(
class MultilayerPerceptronClassifier(
class MultilayerPerceptronClassificationModel(
class MultilayerPerceptronClassificationTrainingSummary(
class FMClassifier(
class FMClassificationModel(
class _GaussianMixtureParams(
class GaussianMixtureModel(
class _KMeansParams(
class KMeansModel(
class _BisectingKMeansParams(
class BisectingKMeansModel(
class PowerIterationClustering(
class BinaryClassificationEvaluator(
class RegressionEvaluator(
class MulticlassClassificationEvaluator(
class MultilabelClassificationEvaluator(
class ClusteringEvaluator(
class RankingEvaluator(
class Binarizer(
class BucketedRandomProjectionLSH(
class BucketedRandomProjectionLSHModel(
class Bucketizer(
class ElementwiseProduct(
class FeatureHasher(
class HashingTF(
class _OneHotEncoderParams(
class PolynomialExpansion(
class QuantileDiscretizer(
class _StringIndexerParams(
class StopWordsRemover(
class VectorAssembler(
class VectorSizeHint(
class VarianceThresholdSelector(
class VarianceThresholdSelectorModel(
class UnivariateFeatureSelector(
class UnivariateFeatureSelectorModel(
class _LinearRegressionParams(
class LinearRegressionModel(
class IsotonicRegression(
class IsotonicRegressionModel(JavaModel, _IsotonicRegressionParams, JavaMLWritable, JavaMLReadable):
class DecisionTreeRegressor(
class RandomForestRegressor(
class _AFTSurvivalRegressionParams(
class AFTSurvivalRegression(
class AFTSurvivalRegressionModel(
class _GeneralizedLinearRegressionParams(
class GeneralizedLinearRegression(
class GeneralizedLinearRegressionModel(
class _FactorizationMachinesParams(
class FMRegressionModel(
class CrossValidator(
class TrainValidationSplit(
+ \"class name
class MultivariateGaussian(NamedTuple):
class TimedeltaOps(DataTypeOps):
class TimedeltaIndex(Index):
class MissingPandasLikeTimedeltaIndex(MissingPandasLikeIndex):
class PandasSQLStringFormatter(string.Formatter):
class PandasAPIOnSparkAdviceWarning(Warning):
class UDFBasicProfiler(BasicProfiler):
class CloudPickleSerializer(FramedSerializer):
class ArrowStreamUDFSerializer(ArrowStreamSerializer):
class SQLStringFormatter(string.Formatter):
class DayTimeIntervalType(AtomicType):
class DayTimeIntervalTypeConverter(object):
class ExecutorPodsPollingSnapshotSource(
class ExecutorPodsWatchSnapshotSource(
class ExecutorRollPlugin extends SparkPlugin
class ExecutorRollDriverPlugin extends DriverPlugin with Logging
class AnsiCombinedTypeCoercionRule(rules: Seq[TypeCoercionRule]) extends
trait ExpressionBuilder
case class RelationTimeTravel(
case class AsOfTimestamp(timestamp: Long) extends TimeTravelSpec
case class AsOfVersion(version: String) extends TimeTravelSpec
class CombinedTypeCoercionRule(rules: Seq[TypeCoercionRule]) extends TypeCoercionRule
case class ExpressionStats(expr: Expression)(var useCount: Int)
case class PrettyPythonUDF(
case class MapContainsKey(
case class TryElementAt(left: Expression, right: Expression, child: Expression)
case class ConvertTimezone(
case class AesEncrypt(
case class AesDecrypt(
trait PadExpressionBuilderBase extends ExpressionBuilder
case class StringLPad(str: Expression, len: Expression, pad: Expression)
case class BinaryLPad(str: Expression, len: Expression, pad: Expression, child: Expression)
case class BinaryRPad(str: Expression, len: Expression, pad: Expression, child: Expression)
case class UnclosedCommentProcessor(
case class PythonMapInArrow(
case class CreateTable(
case class DropIndex(
case class TableSpec(
public class ColumnIOUtil
case class OptimizeSkewedJoin(ensureRequirements: EnsureRequirements)
case class ParquetColumn(
case class DropIndexExec(
case class PushedDownOperators(
case class TableSampleInfo(
trait MapInBatchExec extends UnaryExecNode
case class PythonMapInArrowExec(
class RatePerMicroBatchProvider extends SimpleTableProvider with DataSourceRegister
class RatePerMicroBatchTable(
class RatePerMicroBatchStream(
case class RatePerMicroBatchStreamOffset(offset: Long, timestamp: Long) extends Offset
case class RatePerMicroBatchStreamInputPartition(
class RatePerMicroBatchStreamPartitionReader(
// When this is enabled, this class does additional lookup on write operations (put/delete) to

SparkQA · 2021-12-15T09:10:27Z

Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/50698/

SparkQA · 2021-12-15T10:10:59Z

Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/50698/

AngersZhuuuu · 2021-12-15T10:28:12Z

legacy feature flag

For a legacy flag, must we use the spark.sql.legacy.xxx naming, with a default value of false?

SparkQA · 2021-12-15T11:14:50Z

Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/50703/

SparkQA · 2021-12-15T12:14:15Z

Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/50703/

...erver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala

SparkQA · 2021-12-16T08:23:25Z

Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/50742/

SparkQA · 2021-12-16T09:25:50Z

Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/50742/

docs/sql-migration-guide.md

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

bogdanghit · 2021-12-16T09:26:36Z

...rver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkMetadataOperationSuite.scala

        Seq("shiftleft", "shiftright", "shiftrightunsigned"))
      checkResult(metaData.getFunctions(null, "default", "upPer"), Seq("upper"))
+
+      statement.execute(s"SET ${SQLConf.THRIFTSERVER_SEPARATE_DISPLAY_SYSTEM_FUNCTION.key}=true")


Can we add a test with two schemas and run an unfiltered getFunctions call to show that previously we'd see duplicates, whereas now the functions are unique?

Great, thanks!

juliuszsompolski · 2021-12-16T10:00:32Z

It's our decision to make, but it's safe to keep the existing behavior first and switch it in the next version, because that gives users a chance to try the new behavior.

My 2 cents: working with various partners and vendors of BI tools, we found GetFunctions to be rarely used at all, and when we did find it used, it was in the context of the current behaviour causing trouble (running too slowly, or causing UI freezes from trying to render the humongous list of duplicated functions). I am not aware of any tool depending on the current behaviour.
Note that before GetTables / GetSchemas / GetColumns were implemented in Spark 3.0, these calls all returned wrong results or threw errors with Spark, and that was not escalated for a long time. Only recently have BI vendors taken a serious interest in developing connectors that take advantage of these functions, and this has been reported as unexpected and unwanted behaviour.

AngersZhuuuu · 2021-12-16T10:03:26Z

It's our decision to make, but it's safe to keep the existing behavior first and switch it in the next version, because that gives users a chance to try the new behavior.

My 2 cents: working with various partners and vendors of BI tools, we found GetFunctions to be rarely used at all, and when we did find it used, it was in the context of the current behaviour causing trouble (running too slowly, or causing UI freezes from trying to render the humongous list of duplicated functions). I am not aware of any tool depending on the current behaviour. Note that before GetTables / GetSchemas / GetColumns were implemented in Spark 3.0, these calls all returned wrong results or threw errors with Spark, and that was not escalated for a long time. Only recently have BI vendors taken a serious interest in developing connectors that take advantage of these functions, and this has been reported as unexpected and unwanted behaviour.

We used HUE for ad hoc queries before; to support using HUE with Spark's Thrift server, we also had to change a lot.

SparkQA · 2021-12-16T11:36:32Z

Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/50755/

bogdanghit

LGTM, thanks @AngersZhuuuu

SparkQA · 2021-12-16T12:17:20Z

Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/50758/

SparkQA · 2021-12-16T12:18:09Z

Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/50755/

SparkQA · 2021-12-16T12:41:42Z

Test build #146267 has finished for PR 34453 at commit 479d533.

This patch passes all tests.
This patch merges cleanly.
This patch adds no public classes.

SparkQA · 2021-12-16T13:17:48Z

Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/50758/

SparkQA · 2021-12-16T15:20:27Z

Test build #146281 has finished for PR 34453 at commit 366eed7.

This patch passes all tests.
This patch merges cleanly.
This patch adds no public classes.

SparkQA · 2021-12-16T16:23:56Z

Test build #146284 has finished for PR 34453 at commit 392c5ea.

This patch passes all tests.
This patch merges cleanly.
This patch adds no public classes.

AngersZhuuuu · 2021-12-17T02:05:27Z

gentle ping @dongjoon-hyun WDYT of current code?

AngersZhuuuu · 2021-12-20T03:30:37Z

Any more suggestions? Also cc @wangyum

AngersZhuuuu · 2022-01-04T02:41:29Z

Any more suggestions?

bogdanghit · 2022-01-12T07:53:47Z

Any more suggestions?

It looks good to me. @dongjoon-hyun WDYT?

bogdanghit · 2022-01-27T09:28:44Z

Gentle nudge @AngersZhuuuu @dongjoon-hyun @wangyum.

bogdanghit · 2022-02-15T13:52:23Z

@dongjoon-hyun what should still be done to push this through?

srowen · 2022-02-15T15:04:25Z

Would the existing behavior ever be desirable? sounds like more of a bug?

bogdanghit · 2022-02-15T17:33:56Z

@srowen that was my initial thought as well, but there are concerns it may be a breaking change because of the different result format

github-actions · 2022-05-29T00:19:53Z

We're closing this PR because it hasn't been updated in a while. This isn't a judgement on the merit of the PR in any way. It's just a way of keeping the PR queue manageable.
If you'd like to revive this PR, please reopen it and ask a committer to remove the Stale tag!

Update SparkGetFunctionsOperation.scala

118af14

AngersZhuuuu changed the title ~~[SPARK-37173][SQL] SparkGetFunctionOperation return builtin function only once~~ [WIP][SPARK-37173][SQL] SparkGetFunctionOperation return builtin function only once Nov 1, 2021

github-actions bot added the SQL label Nov 1, 2021

AngersZhuuuu added 2 commits November 1, 2021 13:11

Update SparkGetFunctionsOperation.scala

a1b21fe

Update SparkMetadataOperationSuite.scala

3362038

AngersZhuuuu marked this pull request as draft November 1, 2021 05:16

Merge branch 'master' into SPARK-37173

75131c8

AngersZhuuuu added 2 commits December 15, 2021 17:50

update

29da65a

update

4bbc8cc

AngersZhuuuu marked this pull request as ready for review December 15, 2021 10:07

github-actions bot added the DOCS label Dec 15, 2021

Update sql-migration-guide.md

89bb678

AngersZhuuuu changed the title ~~[WIP][SPARK-37173][SQL] SparkGetFunctionOperation return builtin function only once~~ [SPARK-37173][SQL] SparkGetFunctionOperation return builtin function only once Dec 15, 2021

dongjoon-hyun reviewed Dec 16, 2021

View reviewed changes

...erver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala Outdated Show resolved Hide resolved

dongjoon-hyun reviewed Dec 16, 2021

View reviewed changes

...erver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala Outdated Show resolved Hide resolved

Follow comment

479d533

bogdanghit reviewed Dec 16, 2021

View reviewed changes

follow comment

366eed7

Update SparkMetadataOperationSuite.scala

392c5ea

bogdanghit approved these changes Dec 16, 2021

View reviewed changes

Merge branch 'master' into SPARK-37173

2b74561

github-actions bot added the Stale label May 29, 2022

github-actions bot closed this May 30, 2022

[SPARK-37173][SQL] SparkGetFunctionOperation return builtin function only once #34453

[SPARK-37173][SQL] SparkGetFunctionOperation return builtin function only once #34453

Uh oh!

Conversation

AngersZhuuuu commented Nov 1, 2021

What changes were proposed in this pull request?

Why are the changes needed?

Does this PR introduce any user-facing change?

How was this patch tested?

Uh oh!

SparkQA commented Nov 1, 2021

Uh oh!

SparkQA commented Nov 1, 2021

Uh oh!

SparkQA commented Nov 1, 2021

Uh oh!

SparkQA commented Nov 1, 2021

Uh oh!

SparkQA commented Nov 1, 2021

Uh oh!

AngersZhuuuu commented Nov 1, 2021

Uh oh!

SparkQA commented Nov 1, 2021

Uh oh!

SparkQA commented Nov 10, 2021

Uh oh!

juliuszsompolski commented Dec 14, 2021

Uh oh!

dongjoon-hyun commented Dec 14, 2021

Uh oh!

AngersZhuuuu commented Dec 15, 2021

Uh oh!

SparkQA commented Dec 15, 2021

Uh oh!

SparkQA commented Dec 15, 2021

Uh oh!

SparkQA commented Dec 15, 2021

Uh oh!

AngersZhuuuu commented Dec 15, 2021

Uh oh!

SparkQA commented Dec 15, 2021

Uh oh!

SparkQA commented Dec 15, 2021

Uh oh!

Uh oh!

Uh oh!

SparkQA commented Dec 16, 2021

Uh oh!

SparkQA commented Dec 16, 2021

Uh oh!

Uh oh!

Uh oh!

bogdanghit Dec 16, 2021

Choose a reason for hiding this comment

Uh oh!

AngersZhuuuu Dec 16, 2021

Choose a reason for hiding this comment

Uh oh!

bogdanghit Dec 16, 2021

Choose a reason for hiding this comment

Uh oh!

juliuszsompolski commented Dec 16, 2021

Uh oh!

AngersZhuuuu commented Dec 16, 2021

Uh oh!

SparkQA commented Dec 16, 2021

Uh oh!

bogdanghit left a comment

Choose a reason for hiding this comment

Uh oh!

SparkQA commented Dec 16, 2021

Uh oh!

SparkQA commented Dec 16, 2021

Uh oh!

SparkQA commented Dec 16, 2021

Uh oh!

SparkQA commented Dec 16, 2021

Uh oh!

SparkQA commented Dec 16, 2021

Uh oh!