From 9d280ec792e7c969e25e7158b02b8593131ed265 Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Tue, 18 Aug 2015 13:02:20 -0700
Subject: [PATCH] Update build to test using official Amazon JDBC driver.

---
 README.md                                    | 17 +++++++++--------
 build.sbt                                    | 10 +++++++---
 .../spark/redshift/Parameters.scala          |  4 ++--
 .../spark/redshift/ParametersSuite.scala     |  8 ++++----
 .../spark/redshift/RedshiftSourceSuite.scala | 19 +++++++++++--------
 5 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index f14f919f..126d83e9 100644
--- a/README.md
+++ b/README.md
@@ -28,8 +28,9 @@ Further, as Redshift is an AWS product, some AWS libraries will be required. Thi
 your deployment environment will include `hadoop-aws`, or other things necessary to access S3, credentials, etc.
 Check the dependencies with "provided" scope in build.sbt if you're at all unclear.
 
-You're also going to need a JDBC driver that is compatible with Redshift. The one used for testing can be
-found in build.sbt, however Amazon recommend that you use [their driver](http://docs.aws.amazon.com/redshift/latest/mgmt/configure-jdbc-connection.html).
+You're also going to need a JDBC driver that is compatible with Redshift. Amazon recommend that you
+use [their driver](http://docs.aws.amazon.com/redshift/latest/mgmt/configure-jdbc-connection.html),
+although this library has also been successfully tested using the Postgres JDBC driver.
 
 ## Usage
 
@@ -49,7 +50,7 @@ val sqlContext = new SQLContext(sc)
 // Get some data from a Redshift table
 val df: DataFrame = sqlContext.read
     .format("com.databricks.spark.redshift")
-    .option("url", "jdbc:postgresql://redshifthost:5439/database?user=username&password=pass")
+    .option("url", "jdbc:redshift://redshifthost:5439/database?user=username&password=pass")
     .option("dbtable" -> "my_table")
     .option("tempdir" -> "s3://path/for/temp/data")
     .load()
@@ -59,7 +60,7 @@ val df: DataFrame = sqlContext.read
 
 df.write
   .format("com.databricks.spark.redshift")
-  .option("url", "jdbc:postgresql://redshifthost:5439/database?user=username&password=pass")
+  .option("url", "jdbc:redshift://redshifthost:5439/database?user=username&password=pass")
   .option("dbtable" -> "my_table_copy")
   .option("tempdir" -> "s3://path/for/temp/data")
   .mode("error")
@@ -77,7 +78,7 @@ sql_context = SQLContext(sc)
 # Read data from a table
 df = sql_context.read \
     .format("com.databricks.spark.redshift") \
-    .option("url", "jdbc:postgresql://redshifthost:5439/database?user=username&password=pass") \
+    .option("url", "jdbc:redshift://redshifthost:5439/database?user=username&password=pass") \
     .option("dbtable" -> "my_table") \
     .option("tempdir" -> "s3://path/for/temp/data") \
     .load()
@@ -85,7 +86,7 @@ df = sql_context.read \
 # Write back to a table
 df.write \
   .format("com.databricks.spark.redshift")
-  .option("url", "jdbc:postgresql://redshifthost:5439/database?user=username&password=pass") \
+  .option("url", "jdbc:redshift://redshifthost:5439/database?user=username&password=pass") \
   .option("dbtable" -> "my_table_copy") \
   .option("tempdir" -> "s3://path/for/temp/data") \
   .mode("error")
@@ -99,7 +100,7 @@ CREATE TABLE my_table
 USING com.databricks.spark.redshift
 OPTIONS (dbtable 'my_table',
          tempdir 's3://my_bucket/tmp',
-         url 'jdbc:postgresql://host:port/db?user=username&password=pass');
+         url 'jdbc:redshift://host:port/db?user=username&password=pass');
 ```
 
 ### Scala helper functions
@@ -204,7 +205,7 @@ and use that as a temp location for this data.
 jdbcdriver
 No
-org.postgresql.Driver
+com.amazon.redshift.jdbc4.Driver
 The class name of the JDBC driver to load before JDBC operations. Must be on classpath.
diff --git a/build.sbt b/build.sbt
index 918627ec..72aa96c8 100644
--- a/build.sbt
+++ b/build.sbt
@@ -33,9 +33,13 @@ libraryDependencies += "com.databricks" %% "spark-avro" % "1.0.0"
 libraryDependencies += "org.apache.avro" % "avro-mapred" % "1.7.6" % "provided" exclude("org.mortbay.jetty", "servlet-api")
 
 // A Redshift-compatible JDBC driver must be present on the classpath for spark-redshift to work.
-// For testing, we using a Postgres driver, but it is recommended that the Amazon driver is used
-// in production. See http://docs.aws.amazon.com/redshift/latest/mgmt/configure-jdbc-connection.html
-libraryDependencies += "postgresql" % "postgresql" % "8.3-606.jdbc4" % "provided"
+// The Amazon driver is recommended for production use; it can be obtained from
+// http://docs.aws.amazon.com/redshift/latest/mgmt/configure-jdbc-connection.html
+
+// A Redshift-compatible JDBC driver must be present on the classpath for spark-redshift to work.
+// For testing, we use an Amazon driver, which is available from
+// http://docs.aws.amazon.com/redshift/latest/mgmt/configure-jdbc-connection.html
+libraryDependencies += "com.amazon.redshift" % "jdbc4" % "1.1.7.1007" % "test" from "https://s3.amazonaws.com/redshift-downloads/drivers/RedshiftJDBC4-1.1.7.1007.jar"
 
 libraryDependencies += "com.google.guava" % "guava" % "14.0.1" % Test
diff --git a/src/main/scala/com/databricks/spark/redshift/Parameters.scala b/src/main/scala/com/databricks/spark/redshift/Parameters.scala
index 10d5a7e1..e8626e56 100644
--- a/src/main/scala/com/databricks/spark/redshift/Parameters.scala
+++ b/src/main/scala/com/databricks/spark/redshift/Parameters.scala
@@ -34,7 +34,7 @@ private [redshift] object Parameters extends Logging {
     // * sortkeyspec has no default, but is optional
     // * distkey has no default, but is optional unless using diststyle KEY
 
-    "jdbcdriver" -> "org.postgresql.Driver",
+    "jdbcdriver" -> "com.amazon.redshift.jdbc4.Driver",
     "overwrite" -> "false",
     "diststyle" -> "EVEN",
     "usestagingtable" -> "true",
@@ -101,7 +101,7 @@ private [redshift] object Parameters extends Logging {
 
   /**
    * The JDBC driver class name. This is used to make sure the driver is registered before connecting over
-   * JDBC. Default is "org.postgresql.Driver"
+   * JDBC. Default is "com.amazon.redshift.jdbc4.Driver"
    */
   def jdbcDriver = parameters("jdbcdriver")
 
diff --git a/src/test/scala/com/databricks/spark/redshift/ParametersSuite.scala b/src/test/scala/com/databricks/spark/redshift/ParametersSuite.scala
index edea5bbb..57c990c5 100644
--- a/src/test/scala/com/databricks/spark/redshift/ParametersSuite.scala
+++ b/src/test/scala/com/databricks/spark/redshift/ParametersSuite.scala
@@ -28,7 +28,7 @@ class ParametersSuite extends FunSuite with Matchers {
       Map(
         "tempdir" -> "s3://foo/bar",
         "dbtable" -> "test_table",
-        "url" -> "jdbc:postgresql://foo/bar")
+        "url" -> "jdbc:redshift://foo/bar")
 
     val mergedParams = Parameters.mergeParameters(params)
 
@@ -47,7 +47,7 @@ class ParametersSuite extends FunSuite with Matchers {
       Map(
         "tempdir" -> "s3://foo/bar",
         "dbtable" -> "test_table",
-        "url" -> "jdbc:postgresql://foo/bar")
+        "url" -> "jdbc:redshift://foo/bar")
 
     val mergedParams1 = Parameters.mergeParameters(params)
     val mergedParams2 = Parameters.mergeParameters(params)
@@ -63,8 +63,8 @@
       }
     }
 
-    checkMerge(Map("dbtable" -> "test_table", "url" -> "jdbc:postgresql://foo/bar"))
-    checkMerge(Map("tempdir" -> "s3://foo/bar", "url" -> "jdbc:postgresql://foo/bar"))
+    checkMerge(Map("dbtable" -> "test_table", "url" -> "jdbc:redshift://foo/bar"))
+    checkMerge(Map("tempdir" -> "s3://foo/bar", "url" -> "jdbc:redshift://foo/bar"))
     checkMerge(Map("dbtable" -> "test_table", "tempdir" -> "s3://foo/bar"))
   }
 }
diff --git a/src/test/scala/com/databricks/spark/redshift/RedshiftSourceSuite.scala b/src/test/scala/com/databricks/spark/redshift/RedshiftSourceSuite.scala
index 0cb2566f..cf79b019 100644
--- a/src/test/scala/com/databricks/spark/redshift/RedshiftSourceSuite.scala
+++ b/src/test/scala/com/databricks/spark/redshift/RedshiftSourceSuite.scala
@@ -151,7 +151,8 @@ class RedshiftSourceSuite
 
   test("DefaultSource can load Redshift UNLOAD output to a DataFrame") {
 
-    val params = Map("url" -> "jdbc:postgresql://foo/bar",
+    val params = Map(
+      "url" -> "jdbc:redshift://foo/bar",
       "tempdir" -> "tmp",
       "dbtable" -> "test_table",
       "aws_access_key_id" -> "test1",
@@ -173,7 +174,8 @@ class RedshiftSourceSuite
 
   test("DefaultSource supports simple column filtering") {
 
-    val params = Map("url" -> "jdbc:postgresql://foo/bar",
+    val params = Map(
+      "url" -> "jdbc:redshift://foo/bar",
       "tempdir" -> "tmp",
       "dbtable" -> "test_table",
       "aws_access_key_id" -> "test1",
@@ -201,7 +203,8 @@ class RedshiftSourceSuite
 
   test("DefaultSource supports user schema, pruned and filtered scans") {
 
-    val params = Map("url" -> "jdbc:postgresql://foo/bar",
+    val params = Map(
+      "url" -> "jdbc:redshift://foo/bar",
       "tempdir" -> "tmp",
       "dbtable" -> "test_table",
       "aws_access_key_id" -> "test1",
@@ -235,7 +238,7 @@ class RedshiftSourceSuite
 
     val testSqlContext = new SQLContext(sc)
 
-    val jdbcUrl = "jdbc:postgresql://foo/bar"
+    val jdbcUrl = "jdbc:redshift://foo/bar"
     val params = Map("url" -> jdbcUrl,
       "tempdir" -> tempDir,
@@ -284,7 +287,7 @@ class RedshiftSourceSuite
 
   test("Failed copies are handled gracefully when using a staging table") {
     val testSqlContext = new SQLContext(sc)
 
-    val jdbcUrl = "jdbc:postgresql://foo/bar"
+    val jdbcUrl = "jdbc:redshift://foo/bar"
     val params = Map("url" -> jdbcUrl,
       "tempdir" -> tempDir,
@@ -368,7 +371,7 @@ class RedshiftSourceSuite
 
   test("Append SaveMode doesn't destroy existing data") {
     val testSqlContext = new SQLContext(sc)
 
-    val jdbcUrl = "jdbc:postgresql://foo/bar"
+    val jdbcUrl = "jdbc:redshift://foo/bar"
     val params = Map("url" -> jdbcUrl,
       "tempdir" -> tempDir,
@@ -410,7 +413,7 @@ class RedshiftSourceSuite
 
   test("Respect SaveMode.ErrorIfExists when table exists") {
     val testSqlContext = new SQLContext(sc)
 
-    val jdbcUrl = "jdbc:postgresql://foo/bar"
+    val jdbcUrl = "jdbc:redshift://foo/bar"
     val params = Map("url" -> jdbcUrl,
       "tempdir" -> tempDir,
@@ -438,7 +441,7 @@ class RedshiftSourceSuite
 
   test("Do nothing when table exists if SaveMode = Ignore") {
     val testSqlContext = new SQLContext(sc)
 
-    val jdbcUrl = "jdbc:postgresql://foo/bar"
+    val jdbcUrl = "jdbc:redshift://foo/bar"
     val params = Map("url" -> jdbcUrl,
       "tempdir" -> tempDir,