[SPARK-24423][SQL] Add a new option for JDBC sources #21590
Changes from all commits: 080ef69, 3375b94, 037f46f, cb447f0, c3feada, 765de0b, 57f9e3f, c083e13
```diff
@@ -27,7 +27,7 @@ import org.apache.spark.sql.types.StructType
  * Options for the JDBC data source.
  */
 class JDBCOptions(
-    @transient private val parameters: CaseInsensitiveMap[String])
+    @transient val parameters: CaseInsensitiveMap[String])
   extends Serializable {

   import JDBCOptions._
```
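One small but load-bearing change in this hunk: `parameters` loses its `private` modifier so that the new `JdbcOptionsInWrite` subclass, introduced later in this diff, can override it. A minimal sketch of why Scala requires this (hypothetical class names, not from this PR):

```scala
class Base(@transient val params: Map[String, String]) extends Serializable

// Legal only because `params` is not private in Base; a private
// constructor val cannot be overridden by a subclass.
class Sub(@transient override val params: Map[String, String]) extends Base(params)
```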
```diff
@@ -65,11 +65,31 @@ class JDBCOptions(
   // Required parameters
   // ------------------------------------------------------------
   require(parameters.isDefinedAt(JDBC_URL), s"Option '$JDBC_URL' is required.")
-  require(parameters.isDefinedAt(JDBC_TABLE_NAME), s"Option '$JDBC_TABLE_NAME' is required.")
   // a JDBC URL
   val url = parameters(JDBC_URL)
-  // name of table
-  val table = parameters(JDBC_TABLE_NAME)
+  // table name or a table subquery.
+  val tableOrQuery = (parameters.get(JDBC_TABLE_NAME), parameters.get(JDBC_QUERY_STRING)) match {
+    case (Some(name), Some(subquery)) =>
+      throw new IllegalArgumentException(
+        s"Both '$JDBC_TABLE_NAME' and '$JDBC_QUERY_STRING' can not be specified at the same time."
+      )
+    case (None, None) =>
+      throw new IllegalArgumentException(
+        s"Option '$JDBC_TABLE_NAME' or '$JDBC_QUERY_STRING' is required."
+      )
+    case (Some(name), None) =>
+      if (name.isEmpty) {
+        throw new IllegalArgumentException(s"Option '$JDBC_TABLE_NAME' can not be empty.")
+      } else {
+        name.trim
+      }
+    case (None, Some(subquery)) =>
+      if (subquery.isEmpty) {
+        throw new IllegalArgumentException(s"Option `$JDBC_QUERY_STRING` can not be empty.")
+      } else {
+        s"(${subquery}) __SPARK_GEN_JDBC_SUBQUERY_NAME_${curId.getAndIncrement()}"
+      }
+  }

   // ------------------------------------------------------------
   // Optional parameters
```
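For orientation, these are the two mutually exclusive spellings of the same read from the user side (a minimal sketch; the H2 URL and table name are illustrative, not from this PR):

```scala
// New `query` option: Spark parenthesizes the text and appends a
// generated alias of the form __SPARK_GEN_JDBC_SUBQUERY_NAME_<n>.
val byQuery = spark.read.format("jdbc")
  .option("url", "jdbc:h2:mem:testdb")  // illustrative URL
  .option("query", "SELECT id, name FROM people WHERE id > 10")
  .load()

// Existing `dbtable` option: the caller writes the subquery alias.
val byDbtable = spark.read.format("jdbc")
  .option("url", "jdbc:h2:mem:testdb")
  .option("dbtable", "(SELECT id, name FROM people WHERE id > 10) AS subq")
  .load()
```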
```diff
@@ -109,6 +129,20 @@ class JDBCOptions(
       s"When reading JDBC data sources, users need to specify all or none for the following " +
       s"options: '$JDBC_PARTITION_COLUMN', '$JDBC_LOWER_BOUND', '$JDBC_UPPER_BOUND', " +
       s"and '$JDBC_NUM_PARTITIONS'")

+  require(!(parameters.get(JDBC_QUERY_STRING).isDefined && partitionColumn.isDefined),
+    s"""
+       |Options '$JDBC_QUERY_STRING' and '$JDBC_PARTITION_COLUMN' can not be specified together.
+       |Please define the query using `$JDBC_TABLE_NAME` option instead and make sure to qualify
+       |the partition columns using the supplied subquery alias to resolve any ambiguity.
+       |Example :
+       |spark.read.format("jdbc")
+       |  .option("dbtable", "(select c1, c2 from t1) as subq")
+       |  .option("partitionColumn", "subq.c1")
+       |  .load()
+     """.stripMargin
+  )
```
**Member:** Can't we define these two parameters simultaneously? How about the case where the columns in the query output include a partition column?

**Contributor (author):** @maropu Since we auto-generate a subquery alias here for ease of use, we are disallowing the `query` option together with partition columns, as users wouldn't know how to qualify the partition columns given that the subquery alias is generated implicitly. In this case, we ask them to use the existing `dbtable` option to specify the query, where they are in control of the alias themselves. Another option I considered was to introduce a `queryAlias` option, but I avoided it for simplicity.

**Member:** @maropu The new option

**Member:** I was thinking of the case below (this can be accepted in

**Member:** @dilipbiswal Could you reply to @maropu with an example?

**Contributor (author):** @maropu Currently we disallow it to be on the safe side. Let's take your example. When using the `query` option to pass on the query, we basically expect the user to supply something like:

```sql
select c0 p0, c1 p1, c2 p2 from t where c0 > 1
```

In Spark, we parenthesize the query and add an alias to conform to the table-subquery syntax. Given the above query, the user might decide to qualify the partition column names with the table name, like this:

```scala
val df = spark.read
  .format("jdbc")
  .option("driver", "org.postgresql.Driver")
  .option("url", "jdbc:postgresql://localhost:5432/postgres?user=maropu")
  .option("query", "select c0 p0, c1 p1, c2 p2 from t where c0 > 1")
  .option("partitionColumn", "t.p2")  // user qualifies the column names
  .option("lowerBound", "1")
  .option("upperBound", "3")
  .option("numPartitions", "2")
  .load()
```

In this case we would end up generating a query of the following form:

```sql
select * from (select c0 p0, c1 p1, c2 p2 from t where c0 > 1) __SPARK_GEN_ALIAS
where t.p2 >= 1 and t.p2 <= 3
```

However, this is an invalid query: the generated alias hides the base table `t`, so `t.p2` no longer resolves. With the `query` option it is also possible to specify a complex query involving joins, which is why we disallow the combination, to be on the safe side. With the `dbtable` option, users are responsible for explicitly specifying the alias and would know how to qualify the partition columns. Let's see if we can improve this in the future; if you have some ideas, please let us know.

**Member:** Aha, OK. Thanks for the kind explanation.
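For completeness, the sanctioned workaround for the case above, via `dbtable` with an explicit alias (illustrative, adapted from the discussion rather than code in this PR):

```scala
// The user supplies the alias, so the partition column can be
// qualified unambiguously against it.
val df = spark.read
  .format("jdbc")
  .option("url", "jdbc:postgresql://localhost:5432/postgres?user=maropu")
  .option("dbtable", "(select c0 p0, c1 p1, c2 p2 from t where c0 > 1) subq")
  .option("partitionColumn", "subq.p2")  // qualified with the explicit alias
  .option("lowerBound", "1")
  .option("upperBound", "3")
  .option("numPartitions", "2")
  .load()
```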
```diff
   val fetchSize = {
     val size = parameters.getOrElse(JDBC_BATCH_FETCH_SIZE, "0").toInt
     require(size >= 0,
```
```diff
@@ -149,7 +183,30 @@ class JDBCOptions(
   val sessionInitStatement = parameters.get(JDBC_SESSION_INIT_STATEMENT)
 }

+class JdbcOptionsInWrite(
+    @transient override val parameters: CaseInsensitiveMap[String])
+  extends JDBCOptions(parameters) {
+
+  import JDBCOptions._
+
+  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))
+
+  def this(url: String, table: String, parameters: Map[String, String]) = {
+    this(CaseInsensitiveMap(parameters ++ Map(
+      JDBCOptions.JDBC_URL -> url,
+      JDBCOptions.JDBC_TABLE_NAME -> table)))
+  }
+
+  require(
+    parameters.get(JDBC_TABLE_NAME).isDefined,
+    s"Option '$JDBC_TABLE_NAME' is required. " +
+      s"Option '$JDBC_QUERY_STRING' is not applicable while writing.")
+
+  val table = parameters(JDBC_TABLE_NAME)
+}
+
 object JDBCOptions {
+  private val curId = new java.util.concurrent.atomic.AtomicLong(0L)
   private val jdbcOptionNames = collection.mutable.Set[String]()

   private def newOption(name: String): String = {
```
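A quick test-style sketch of what the new subclass enforces (assumes access to the internal class; the URL is illustrative):

```scala
import org.apache.spark.sql.execution.datasources.jdbc.JdbcOptionsInWrite

// Constructing write options with `query` but no `dbtable` fails fast:
// `require` throws IllegalArgumentException carrying the message above.
try {
  new JdbcOptionsInWrite(Map(
    "url" -> "jdbc:h2:mem:testdb",       // illustrative URL
    "query" -> "SELECT * FROM people"))
} catch {
  // prints: requirement failed: Option 'dbtable' is required.
  // Option 'query' is not applicable while writing.
  case e: IllegalArgumentException => println(e.getMessage)
}
```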
```diff
@@ -159,6 +216,7 @@ object JDBCOptions {
   val JDBC_URL = newOption("url")
   val JDBC_TABLE_NAME = newOption("dbtable")
+  val JDBC_QUERY_STRING = newOption("query")
   val JDBC_DRIVER_CLASS = newOption("driver")
   val JDBC_PARTITION_COLUMN = newOption("partitionColumn")
   val JDBC_LOWER_BOUND = newOption("lowerBound")
```
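The `curId` counter added to the companion object gives every query-option read a distinct generated alias. A standalone sketch of just that aliasing step (mirrors, but does not reuse, the Spark internals):

```scala
import java.util.concurrent.atomic.AtomicLong

val curId = new AtomicLong(0L)

// Parenthesize a user query and attach a unique generated alias,
// as the tableOrQuery resolution above does for the `query` option.
def wrapQuery(subquery: String): String =
  s"($subquery) __SPARK_GEN_JDBC_SUBQUERY_NAME_${curId.getAndIncrement()}"

println(wrapQuery("SELECT 1"))  // (SELECT 1) __SPARK_GEN_JDBC_SUBQUERY_NAME_0
println(wrapQuery("SELECT 2"))  // (SELECT 2) __SPARK_GEN_JDBC_SUBQUERY_NAME_1
```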
```diff
@@ -59,7 +59,7 @@ class JdbcRelationProvider extends CreatableRelationProvider
       mode: SaveMode,
       parameters: Map[String, String],
       df: DataFrame): BaseRelation = {
-    val options = new JDBCOptions(parameters)
+    val options = new JdbcOptionsInWrite(parameters)
     val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis

     val conn = JdbcUtils.createConnectionFactory(options)()
```
```diff
@@ -86,7 +86,8 @@ class JdbcRelationProvider extends CreatableRelationProvider
           case SaveMode.ErrorIfExists =>
             throw new AnalysisException(
-              s"Table or view '${options.table}' already exists. SaveMode: ErrorIfExists.")
+              s"Table or view '${options.table}' already exists. " +
+                s"SaveMode: ErrorIfExists.")
```
**Member:** No change, right?
```diff
           case SaveMode.Ignore =>
             // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
```
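For reference, a write that exercises the `ErrorIfExists` branch above (illustrative URL and table name, and `df` stands for any existing DataFrame):

```scala
import org.apache.spark.sql.SaveMode

// Given some DataFrame `df`, this write throws AnalysisException
// when `people_copy` already exists in the target database.
df.write
  .format("jdbc")
  .option("url", "jdbc:h2:mem:testdb")  // illustrative URL
  .option("dbtable", "people_copy")
  .mode(SaveMode.ErrorIfExists)
  .save()
```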
```diff
@@ -25,7 +25,7 @@ import org.h2.jdbc.JdbcSQLException
 import org.scalatest.{BeforeAndAfter, PrivateMethodTester}

 import org.apache.spark.{SparkException, SparkFunSuite}
-import org.apache.spark.sql.{AnalysisException, DataFrame, Row}
+import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row}
 import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
 import org.apache.spark.sql.execution.DataSourceScanExec
```
```diff
@@ -39,7 +39,7 @@ import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils

-class JDBCSuite extends SparkFunSuite
+class JDBCSuite extends QueryTest
   with BeforeAndAfter with PrivateMethodTester with SharedSQLContext {
   import testImplicits._
```
```diff
@@ -1099,7 +1099,7 @@ class JDBCSuite extends SparkFunSuite
   test("SPARK-19318: Connection properties keys should be case-sensitive.") {
     def testJdbcOptions(options: JDBCOptions): Unit = {
       // Spark JDBC data source options are case-insensitive
-      assert(options.table == "t1")
+      assert(options.tableOrQuery == "t1")
       // When we convert it to properties, it should be case-sensitive.
       assert(options.asProperties.size == 3)
       assert(options.asProperties.get("customkey") == null)
```
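Note the distinction this test pins down: data source option names are case-insensitive (which is why `.option("Url", ...)` works in the new tests below), while pass-through connection property keys stay case-sensitive. A toy stand-in for the lookup behavior (hypothetical class, not Spark's `CaseInsensitiveMap`):

```scala
// Keys compare case-insensitively; values are returned unchanged.
class ToyCIMap(underlying: Map[String, String]) {
  private val lowered = underlying.map { case (k, v) => k.toLowerCase -> v }
  def get(key: String): Option[String] = lowered.get(key.toLowerCase)
}

val opts = new ToyCIMap(Map("Url" -> "jdbc:h2:mem:testdb", "query" -> "SELECT 1"))
assert(opts.get("url").contains("jdbc:h2:mem:testdb"))  // "Url" resolves as "url"
assert(opts.get("QUERY").contains("SELECT 1"))
```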
```diff
@@ -1255,4 +1255,92 @@ class JDBCSuite extends SparkFunSuite
     testIncorrectJdbcPartitionColumn(testH2Dialect.quoteIdentifier("ThEiD"))
   }
+
+  test("query JDBC option - negative tests") {
+    val query = "SELECT * FROM test.people WHERE theid = 1"
+    // load path
+    val e1 = intercept[RuntimeException] {
+      val df = spark.read.format("jdbc")
+        .option("Url", urlWithUserAndPass)
+        .option("query", query)
```
**Contributor (author):** @HyukjinKwon Thanks. I will update the doc.
| .option("dbtable", "test.people") | ||
| .load() | ||
| }.getMessage | ||
| assert(e1.contains("Both 'dbtable' and 'query' can not be specified at the same time.")) | ||
|
|
||
| // jdbc api path | ||
| val properties = new Properties() | ||
| properties.setProperty(JDBCOptions.JDBC_QUERY_STRING, query) | ||
| val e2 = intercept[RuntimeException] { | ||
| spark.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", properties).collect() | ||
| }.getMessage | ||
| assert(e2.contains("Both 'dbtable' and 'query' can not be specified at the same time.")) | ||
|
|
||
| val e3 = intercept[RuntimeException] { | ||
| sql( | ||
| s""" | ||
| |CREATE OR REPLACE TEMPORARY VIEW queryOption | ||
| |USING org.apache.spark.sql.jdbc | ||
| |OPTIONS (url '$url', query '$query', dbtable 'TEST.PEOPLE', | ||
| | user 'testUser', password 'testPass') | ||
| """.stripMargin.replaceAll("\n", " ")) | ||
| }.getMessage | ||
| assert(e3.contains("Both 'dbtable' and 'query' can not be specified at the same time.")) | ||
|
|
||
| val e4 = intercept[RuntimeException] { | ||
| val df = spark.read.format("jdbc") | ||
| .option("Url", urlWithUserAndPass) | ||
| .option("query", "") | ||
| .load() | ||
| }.getMessage | ||
| assert(e4.contains("Option `query` can not be empty.")) | ||
|
|
||
| // Option query and partitioncolumn are not allowed together. | ||
| val expectedErrorMsg = | ||
| s""" | ||
| |Options 'query' and 'partitionColumn' can not be specified together. | ||
| |Please define the query using `dbtable` option instead and make sure to qualify | ||
| |the partition columns using the supplied subquery alias to resolve any ambiguity. | ||
| |Example : | ||
| |spark.read.format("jdbc") | ||
| | .option("dbtable", "(select c1, c2 from t1) as subq") | ||
| | .option("partitionColumn", "subq.c1" | ||
| | .load() | ||
| """.stripMargin | ||
| val e5 = intercept[RuntimeException] { | ||
| sql( | ||
| s""" | ||
| |CREATE OR REPLACE TEMPORARY VIEW queryOption | ||
| |USING org.apache.spark.sql.jdbc | ||
| |OPTIONS (url '$url', query '$query', user 'testUser', password 'testPass', | ||
| | partitionColumn 'THEID', lowerBound '1', upperBound '4', numPartitions '3') | ||
| """.stripMargin.replaceAll("\n", " ")) | ||
| }.getMessage | ||
| assert(e5.contains(expectedErrorMsg)) | ||
| } | ||
|
|
||
| test("query JDBC option") { | ||
| val query = "SELECT name, theid FROM test.people WHERE theid = 1" | ||
| // query option to pass on the query string. | ||
| val df = spark.read.format("jdbc") | ||
| .option("Url", urlWithUserAndPass) | ||
| .option("query", query) | ||
| .load() | ||
| checkAnswer( | ||
| df, | ||
| Row("fred", 1) :: Nil) | ||
|
|
||
| // query option in the create table path. | ||
| sql( | ||
| s""" | ||
| |CREATE OR REPLACE TEMPORARY VIEW queryOption | ||
| |USING org.apache.spark.sql.jdbc | ||
| |OPTIONS (url '$url', query '$query', user 'testUser', password 'testPass') | ||
| """.stripMargin.replaceAll("\n", " ")) | ||
|
|
||
| checkAnswer( | ||
| sql("select name, theid from queryOption"), | ||
| Row("fred", 1) :: Nil) | ||
|
|
||
| } | ||
| } | ||
**Member:** Great! Please add them to the doc as I mentioned above?