From e7def7c491c8fb06a73aea2f2e072dbe0e59c1da Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Wed, 6 Jul 2016 16:45:21 -0700 Subject: [PATCH 01/11] [SPARK-16381] move example code to a separate R file --- docs/sql-programming-guide.md | 155 ++-------------------- examples/src/main/r/RSparkSQLExample.R | 175 +++++++++++++++++++++++++ 2 files changed, 188 insertions(+), 142 deletions(-) create mode 100644 examples/src/main/r/RSparkSQLExample.R diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 68419e133159..0fb04937071e 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -86,9 +86,7 @@ The entry point into all functionality in Spark is the [`SparkSession`](api/pyth The entry point into all functionality in Spark is the [`SparkSession`](api/R/sparkR.session.html) class. To initialize a basic `SparkSession`, just call `sparkR.session()`: -{% highlight r %} -sparkR.session() -{% endhighlight %} +{% include_example init_session r/RSparkSQLExample.R %} Note that when invoked for the first time, `sparkR.session()` initializes a global `SparkSession` singleton instance, and always returns a reference to this instance for successive invocations. In this way, users only need to initialize the `SparkSession` once, then SparkR functions like `read.df` will be able to access this global instance implicitly, and users don't need to pass the `SparkSession` instance around. @@ -155,12 +153,7 @@ from a Hive table, or from [Spark data sources](#data-sources). As an example, the following creates a DataFrame based on the content of a JSON file: -{% highlight r %} -df <- read.json("examples/src/main/resources/people.json") - -# Displays the content of the DataFrame -showDF(df) -{% endhighlight %} +{% include_example create_DataFrames r/RSparkSQLExample.R %} @@ -343,50 +336,8 @@ In addition to simple column references and expressions, DataFrames also have a
-{% highlight r %} -# Create the DataFrame -df <- read.json("examples/src/main/resources/people.json") - -# Show the content of the DataFrame -showDF(df) -## age name -## null Michael -## 30 Andy -## 19 Justin - -# Print the schema in a tree format -printSchema(df) -## root -## |-- age: long (nullable = true) -## |-- name: string (nullable = true) -# Select only the "name" column -showDF(select(df, "name")) -## name -## Michael -## Andy -## Justin - -# Select everybody, but increment the age by 1 -showDF(select(df, df$name, df$age + 1)) -## name (age + 1) -## Michael null -## Andy 31 -## Justin 20 - -# Select people older than 21 -showDF(where(df, df$age > 21)) -## age name -## 30 Andy - -# Count people by age -showDF(count(groupBy(df, "age"))) -## age count -## null 1 -## 19 1 -## 30 1 - -{% endhighlight %} +{% include_example untyped_transformations r/RSparkSQLExample.R %} For a complete list of the types of operations that can be performed on a DataFrame refer to the [API Documentation](api/R/index.html). @@ -429,12 +380,10 @@ df = spark.sql("SELECT * FROM table")
The `sql` function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`. -{% highlight r %} -df <- sql("SELECT * FROM table") -{% endhighlight %} -
+{% include_example sql_query r/RSparkSQLExample.R %}
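As a note on this snippet: `sql` resolves only table names that have already been registered, so a self-contained variant of the example would first create a temporary view (as a later patch in this series also does). A minimal sketch, assuming the `people.json` sample that ships with Spark:

{% highlight r %}
df <- read.json("examples/src/main/resources/people.json")
# Register the DataFrame under a name so SQL can refer to it
createOrReplaceTempView(df, "people")
teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
head(teenagers)
{% endhighlight %}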
+ ## Creating Datasets @@ -888,10 +837,7 @@ df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")
-{% highlight r %} -df <- read.df("examples/src/main/resources/users.parquet") -write.df(select(df, "name", "favorite_color"), "namesAndFavColors.parquet") -{% endhighlight %} +{% include_example source_parquet r/RSparkSQLExample.R %}
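A small aside for readers following along: `write.df` also accepts a save mode, which the example leaves at its default (`error`). A sketch with the same `users.parquet` input, using `overwrite` for illustration:

{% highlight r %}
df <- read.df("examples/src/main/resources/users.parquet")
# mode = "overwrite" replaces any existing output at the target path
write.df(select(df, "name", "favorite_color"), "namesAndFavColors.parquet", mode = "overwrite")
{% endhighlight %}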
@@ -937,12 +883,7 @@ df.select("name", "age").write.save("namesAndAges.parquet", format="parquet")
-{% highlight r %} - -df <- read.df("examples/src/main/resources/people.json", "json") -write.df(select(df, "name", "age"), "namesAndAges.parquet", "parquet") - -{% endhighlight %} +{% include_example source_json r/RSparkSQLExample.R %}
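The same pattern extends to the other built-in sources, with extra named arguments passed through as data source options. A sketch for CSV, where the input path is hypothetical:

{% highlight r %}
# header and inferSchema are forwarded to the csv source as options
csvDf <- read.df("path/to/people.csv", "csv", header = "true", inferSchema = "true")
write.df(select(csvDf, "name"), "names.parquet", "parquet")
{% endhighlight %}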
@@ -978,9 +919,7 @@ df = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet
-{% highlight r %} -df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") -{% endhighlight %} +{% include_example direct_query r/RSparkSQLExample.R %}
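The `parquet.` prefix in the query names the data source, so the same syntax works for other sources as well, for example JSON:

{% highlight r %}
df <- sql("SELECT * FROM json.`examples/src/main/resources/people.json`")
head(df)
{% endhighlight %}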
@@ -1133,26 +1072,7 @@ for teenName in teenNames.collect():
-{% highlight r %} - -schemaPeople # The SparkDataFrame from the previous example. - -# SparkDataFrame can be saved as Parquet files, maintaining the schema information. -write.parquet(schemaPeople, "people.parquet") - -# Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved. -# The result of loading a parquet file is also a DataFrame. -parquetFile <- read.parquet("people.parquet") - -# Parquet files can also be used to create a temporary view and then used in SQL statements. -createOrReplaceTempView(parquetFile, "parquetFile") -teenagers <- sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19") -schema <- structType(structField("name", "string")) -teenNames <- dapply(df, function(p) { cbind(paste("Name:", p$name)) }, schema) -for (teenName in collect(teenNames)$name) { - cat(teenName, "\n") -} -{% endhighlight %} +{% include_example load_programmatically r/RSparkSQLExample.R %}
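One possible simplification of the `dapply` portion of this example: when the result is small enough to bring back to the driver anyway, `dapplyCollect` avoids the explicit output schema. A sketch, assuming the `teenagers` SparkDataFrame from the example above:

{% highlight r %}
# Returns a local R data.frame directly; no structType is needed
teenNames <- dapplyCollect(teenagers, function(p) { cbind(paste("Name:", p$name)) })
{% endhighlight %}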
@@ -1315,27 +1235,7 @@ df3.printSchema()
-{% highlight r %} - -# Create a simple DataFrame, stored into a partition directory -write.df(df1, "data/test_table/key=1", "parquet", "overwrite") - -# Create another DataFrame in a new partition directory, -# adding a new column and dropping an existing column -write.df(df2, "data/test_table/key=2", "parquet", "overwrite") - -# Read the partitioned table -df3 <- read.df("data/test_table", "parquet", mergeSchema="true") -printSchema(df3) - -# The final schema consists of all 3 columns in the Parquet files together -# with the partitioning column appeared in the partition directory paths. -# root -# |-- single: int (nullable = true) -# |-- double: int (nullable = true) -# |-- triple: int (nullable = true) -# |-- key : int (nullable = true) -{% endhighlight %} +{% include_example schema_merging r/RSparkSQLExample.R %}
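As an aside, the per-read `mergeSchema` flag can likely be avoided by enabling merging session-wide through the SQL option `spark.sql.parquet.mergeSchema`. A sketch, assuming a fresh session so the option takes effect:

{% highlight r %}
sparkR.session(sparkConfig = list(spark.sql.parquet.mergeSchema = "true"))
df3 <- read.df("data/test_table", "parquet")
printSchema(df3)
{% endhighlight %}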
@@ -1601,25 +1501,8 @@ Note that the file that is offered as _a json file_ is not a typical JSON file. line must contain a separate, self-contained valid JSON object. As a consequence, a regular multi-line JSON file will most often fail. -{% highlight r %} -# A JSON dataset is pointed to by path. -# The path can be either a single text file or a directory storing text files. -path <- "examples/src/main/resources/people.json" -# Create a DataFrame from the file(s) pointed to by path -people <- read.json(path) +{% include_example load_json_file r/RSparkSQLExample.R %} -# The inferred schema can be visualized using the printSchema() method. -printSchema(people) -# root -# |-- age: long (nullable = true) -# |-- name: string (nullable = true) - -# Register this DataFrame as a table. -createOrReplaceTempView(people, "people") - -# SQL statements can be run by using the sql methods. -teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") -{% endhighlight %}
@@ -1734,16 +1617,8 @@ results = spark.sql("FROM src SELECT key, value").collect() When working with Hive one must instantiate `SparkSession` with Hive support. This adds support for finding tables in the MetaStore and writing queries using HiveQL. -{% highlight r %} -# enableHiveSupport defaults to TRUE -sparkR.session(enableHiveSupport = TRUE) -sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") -sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src") - -# Queries can be expressed in HiveQL. -results <- collect(sql("FROM src SELECT key, value")) -{% endhighlight %} +{% include_example hive_table r/RSparkSQLExample.R %}
@@ -1920,11 +1795,7 @@ df = spark.read.format('jdbc').options(url='jdbc:postgresql:dbserver', dbtable='
-{% highlight r %} - -df <- read.jdbc("jdbc:postgresql:dbserver", "schema.tablename", user = "username", password = "password") - -{% endhighlight %} +{% include_example jdbc r/RSparkSQLExample.R %}
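Two related calls worth knowing about here: reads can be partitioned across executors by a numeric column, and results can be written back with `write.jdbc`. A sketch with hypothetical connection details and column names:

{% highlight r %}
# Partitioned read: rows are split into 10 tasks by ranges of the "id" column
df <- read.jdbc("jdbc:postgresql:dbserver", "schema.tablename",
                partitionColumn = "id", lowerBound = 0, upperBound = 10000,
                numPartitions = 10, user = "username", password = "password")

# Write the result back out to a (hypothetical) target table
write.jdbc(df, "jdbc:postgresql:dbserver", "schema.tablename_copy", mode = "append",
           user = "username", password = "password")
{% endhighlight %}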
diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R new file mode 100644 index 000000000000..e8260f001b8f --- /dev/null +++ b/examples/src/main/r/RSparkSQLExample.R @@ -0,0 +1,175 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# $example on:init_session$ +sparkR.session() +# $example off:init_session$ + + +# $example on:create_DataFrames$ +df <- read.json("examples/src/main/resources/people.json") + +# Displays the content of the DataFrame +showDF(df) +# $example off:create_DataFrames$ + + +# $example on:untyped_transformations$ +# Create the DataFrame +df <- read.json("examples/src/main/resources/people.json") + +# Show the content of the DataFrame +showDF(df) +## age name +## null Michael +## 30 Andy +## 19 Justin + +# Print the schema in a tree format +printSchema(df) +## root +## |-- age: long (nullable = true) +## |-- name: string (nullable = true) + +# Select only the "name" column +showDF(select(df, "name")) +## name +## Michael +## Andy +## Justin + +# Select everybody, but increment the age by 1 +showDF(select(df, df$name, df$age + 1)) +## name (age + 1) +## Michael null +## Andy 31 +## Justin 20 + +# Select people older than 21 +showDF(where(df, df$age > 21)) +## age name +## 30 Andy + +# Count people by age +showDF(count(groupBy(df, "age"))) +## age count +## null 1 +## 19 1 +## 30 1 +# $example off:untyped_transformations$ + + +# $example on:sql_query$ +df <- sql("SELECT * FROM table") +# $example off:sql_query$ + + +# $example on:source_parquet$ +df <- read.df("examples/src/main/resources/users.parquet") +write.df(select(df, "name", "favorite_color"), "namesAndFavColors.parquet") +# $example off:source_parquet$ + + +# $example on:source_json$ +df <- read.df("examples/src/main/resources/people.json", "json") +write.df(select(df, "name", "age"), "namesAndAges.parquet", "parquet") +# $example off:source_json$ + + +# $example on:direct_query$ +df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") +# $example off:direct_query$ + + +# $example on:load_programmatically$ +schemaPeople # The SparkDataFrame from the previous example. + +# SparkDataFrame can be saved as Parquet files, maintaining the schema information. +write.parquet(schemaPeople, "people.parquet") + +# Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved. +# The result of loading a parquet file is also a DataFrame. +parquetFile <- read.parquet("people.parquet") + +# Parquet files can also be used to create a temporary view and then used in SQL statements. 
+createOrReplaceTempView(parquetFile, "parquetFile") +teenagers <- sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19") +schema <- structType(structField("name", "string")) +teenNames <- dapply(df, function(p) { cbind(paste("Name:", p$name)) }, schema) +for (teenName in collect(teenNames)$name) { + cat(teenName, "\n") +} +# $example off:load_programmatically$ + + +# $example on:schema_merging$ +# Create a simple DataFrame, stored into a partition directory +write.df(df1, "data/test_table/key=1", "parquet", "overwrite") + +# Create another DataFrame in a new partition directory, +# adding a new column and dropping an existing column +write.df(df2, "data/test_table/key=2", "parquet", "overwrite") + +# Read the partitioned table +df3 <- read.df("data/test_table", "parquet", mergeSchema="true") +printSchema(df3) + +# The final schema consists of all 3 columns in the Parquet files together +# with the partitioning column appeared in the partition directory paths. +# root +# |-- single: int (nullable = true) +# |-- double: int (nullable = true) +# |-- triple: int (nullable = true) +# |-- key : int (nullable = true) +# $example off:schema_merging$ + + +# $example on:load_json_file$ +# A JSON dataset is pointed to by path. +# The path can be either a single text file or a directory storing text files. +path <- "examples/src/main/resources/people.json" +# Create a DataFrame from the file(s) pointed to by path +people <- read.json(path) + +# The inferred schema can be visualized using the printSchema() method. +printSchema(people) +# root +# |-- age: long (nullable = true) +# |-- name: string (nullable = true) + +# Register this DataFrame as a table. +createOrReplaceTempView(people, "people") + +# SQL statements can be run by using the sql methods. +teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") +# $example off:load_json_file$ + + +# $example on:hive_table$ +# enableHiveSupport defaults to TRUE +sparkR.session(enableHiveSupport = TRUE) +sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") +sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src") + +# Queries can be expressed in HiveQL. +results <- collect(sql("FROM src SELECT key, value")) +# $example off:hive_table$ + + +# $example on:jdbc$ +df <- read.jdbc("jdbc:postgresql:dbserver", "schema.tablename", user = "username", password = "password") +# $example off:jdbc$ From 1af09f31fd506143e8fe45b530dd46e39df76d6b Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Thu, 7 Jul 2016 17:48:04 -0700 Subject: [PATCH 02/11] [SPARK-16381] some fixes, more to come --- docs/sql-programming-guide.md | 2 +- examples/src/main/r/RSparkSQLExample.R | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 0fb04937071e..448251cfdc69 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -337,7 +337,7 @@ In addition to simple column references and expressions, DataFrames also have a
-{% include_example untyped_transformations r/RSparkSQLExample.R %} +{% include_example dataframe_operations r/RSparkSQLExample.R %} For a complete list of the types of operations that can be performed on a DataFrame refer to the [API Documentation](api/R/index.html). diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R index e8260f001b8f..fd6b2d187b7a 100644 --- a/examples/src/main/r/RSparkSQLExample.R +++ b/examples/src/main/r/RSparkSQLExample.R @@ -24,16 +24,19 @@ sparkR.session() df <- read.json("examples/src/main/resources/people.json") # Displays the content of the DataFrame +head(df) + +# Another method to print the first few rows and optionally truncate the printing of long values showDF(df) # $example off:create_DataFrames$ -# $example on:untyped_transformations$ +# $example on:dataframe_operations$ # Create the DataFrame df <- read.json("examples/src/main/resources/people.json") # Show the content of the DataFrame -showDF(df) +head(df) ## age name ## null Michael ## 30 Andy @@ -46,31 +49,31 @@ printSchema(df) ## |-- name: string (nullable = true) # Select only the "name" column -showDF(select(df, "name")) +head(select(df, "name")) ## name ## Michael ## Andy ## Justin # Select everybody, but increment the age by 1 -showDF(select(df, df$name, df$age + 1)) +head(select(df, df$name, df$age + 1)) ## name (age + 1) ## Michael null ## Andy 31 ## Justin 20 # Select people older than 21 -showDF(where(df, df$age > 21)) +head(where(df, df$age > 21)) ## age name ## 30 Andy # Count people by age -showDF(count(groupBy(df, "age"))) +head(count(groupBy(df, "age"))) ## age count ## null 1 ## 19 1 ## 30 1 -# $example off:untyped_transformations$ +# $example off:dataframe_operations$ # $example on:sql_query$ From 9ac6a7049187d817a732193fad0e86993eddf197 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Thu, 7 Jul 2016 23:11:54 -0700 Subject: [PATCH 03/11] [SPARK-16381] make schema merge example runnable --- examples/src/main/r/RSparkSQLExample.R | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R index fd6b2d187b7a..357e2897b220 100644 --- a/examples/src/main/r/RSparkSQLExample.R +++ b/examples/src/main/r/RSparkSQLExample.R @@ -99,10 +99,10 @@ df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") # $example on:load_programmatically$ -schemaPeople # The SparkDataFrame from the previous example. +df <- read.df("examples/src/main/resources/people.json", "json") # SparkDataFrame can be saved as Parquet files, maintaining the schema information. -write.parquet(schemaPeople, "people.parquet") +write.parquet(df, "people.parquet") # Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved. # The result of loading a parquet file is also a DataFrame. @@ -120,6 +120,9 @@ for (teenName in collect(teenNames)$name) { # $example on:schema_merging$ +df1 <- createDataFrame(data.frame(single=c(12, 29), double=c(19, 23))) +df2 <- createDataFrame(data.frame(double=c(19, 23), triple=c(23, 18))) + # Create a simple DataFrame, stored into a partition directory write.df(df1, "data/test_table/key=1", "parquet", "overwrite") @@ -134,9 +137,9 @@ printSchema(df3) # The final schema consists of all 3 columns in the Parquet files together # with the partitioning column appeared in the partition directory paths. 
# root -# |-- single: int (nullable = true) -# |-- double: int (nullable = true) -# |-- triple: int (nullable = true) +# |-- single: double (nullable = true) +# |-- double: double (nullable = true) +# |-- triple: double (nullable = true) # |-- key : int (nullable = true) # $example off:schema_merging$ @@ -159,6 +162,9 @@ createOrReplaceTempView(people, "people") # SQL statements can be run by using the sql methods. teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") +head(teenagers) +## name +## 1 Justin # $example off:load_json_file$ From 05ee46bc46ddcb6855ab85ea79f256b1d6d27b90 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Thu, 7 Jul 2016 23:17:27 -0700 Subject: [PATCH 04/11] [SPARK-16381] make sql_query example runnable --- examples/src/main/r/RSparkSQLExample.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R index 357e2897b220..cf96e9adbf0c 100644 --- a/examples/src/main/r/RSparkSQLExample.R +++ b/examples/src/main/r/RSparkSQLExample.R @@ -76,6 +76,11 @@ head(count(groupBy(df, "age"))) # $example off:dataframe_operations$ +# Create a DataFrame from json file +path <- file.path(Sys.getenv("SPARK_HOME"), "examples/src/main/resources/people.json") +peopleDF <- read.json(path) +# Register this DataFrame as a table. +createOrReplaceTempView(peopleDF, "table") # $example on:sql_query$ df <- sql("SELECT * FROM table") # $example off:sql_query$ From 828b2cf7d37684b8cb05803b10e41264adc4c926 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Thu, 7 Jul 2016 23:45:12 -0700 Subject: [PATCH 05/11] [SPARK-16381] make load_programmatically example runnable --- examples/src/main/r/RSparkSQLExample.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R index cf96e9adbf0c..a931f0ab51ee 100644 --- a/examples/src/main/r/RSparkSQLExample.R +++ b/examples/src/main/r/RSparkSQLExample.R @@ -15,6 +15,8 @@ # limitations under the License. # +library(SparkR) + # $example on:init_session$ sparkR.session() # $example off:init_session$ @@ -116,11 +118,18 @@ parquetFile <- read.parquet("people.parquet") # Parquet files can also be used to create a temporary view and then used in SQL statements. 
createOrReplaceTempView(parquetFile, "parquetFile")
teenagers <- sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
+head(teenagers)
+## name
+## 1 Justin
+
schema <- structType(structField("name", "string"))
teenNames <- dapply(df, function(p) { cbind(paste("Name:", p$name)) }, schema)
for (teenName in collect(teenNames)$name) {
  cat(teenName, "\n")
}
+## Name: Michael
+## Name: Andy
+## Name: Justin
# $example off:load_programmatically$

From 7dca42dfd87597db8aa15cf1c32868baecbfd99e Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Thu, 7 Jul 2016 23:47:41 -0700
Subject: [PATCH 06/11] [SPARK-16381] replace last showDF()

---
 examples/src/main/r/RSparkSQLExample.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R
index a931f0ab51ee..802fa194fb94 100644
--- a/examples/src/main/r/RSparkSQLExample.R
+++ b/examples/src/main/r/RSparkSQLExample.R
@@ -29,7 +29,7 @@ df <- read.json("examples/src/main/resources/people.json")
head(df)

# Another method to print the first few rows and optionally truncate the printing of long values
-showDF(df)
+head(df)
# $example off:create_DataFrames$

From cd184b31d9fe7f57c6690e8a69f77b64f3ec9228 Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Fri, 8 Jul 2016 11:51:56 -0700
Subject: [PATCH 07/11] [SPARK-16381] minor fix

---
 examples/src/main/r/RSparkSQLExample.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R
index 802fa194fb94..868d5482c4cb 100644
--- a/examples/src/main/r/RSparkSQLExample.R
+++ b/examples/src/main/r/RSparkSQLExample.R
@@ -18,7 +18,7 @@ library(SparkR)

# $example on:init_session$
-sparkR.session()
+sparkR.session(appName='MyApp', sparkConfig=list(spark.executor.memory="1g"))
# $example off:init_session$

@@ -29,7 +29,7 @@ df <- read.json("examples/src/main/resources/people.json")
head(df)

# Another method to print the first few rows and optionally truncate the printing of long values
-head(df)
+showDF(df)
# $example off:create_DataFrames$

From 5e95fdd327efd7edbbecf30f3346b486ad86bbf8 Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Fri, 8 Jul 2016 12:01:05 -0700
Subject: [PATCH 08/11] [SPARK-16381] make it verbose

---
 examples/src/main/r/RSparkSQLExample.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R
index 868d5482c4cb..1fe6f219e1ca 100644
--- a/examples/src/main/r/RSparkSQLExample.R
+++ b/examples/src/main/r/RSparkSQLExample.R
@@ -96,7 +96,8 @@ write.df(select(df, "name", "favorite_color"), "namesAndFavColors.parquet")

# $example on:source_json$
df <- read.df("examples/src/main/resources/people.json", "json")
-write.df(select(df, "name", "age"), "namesAndAges.parquet", "parquet")
+namesAndAges <- select(df, "name", "age")
+write.df(namesAndAges, "namesAndAges.parquet", "parquet")
# $example off:source_json$

From d5b0b7f111a28c63ca6e501ff0017af64881f0b4 Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Fri, 8 Jul 2016 12:10:39 -0700
Subject: [PATCH 09/11] [SPARK-16381] remove code duplicate etc

---
 examples/src/main/r/RSparkSQLExample.R | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R
index 1fe6f219e1ca..c339273a6f93 100644
--- a/examples/src/main/r/RSparkSQLExample.R
+++ b/examples/src/main/r/RSparkSQLExample.R
@@ -78,11 +78,8 @@ head(count(groupBy(df, "age")))
# $example off:dataframe_operations$

-# Create a DataFrame from json file
-path <- file.path(Sys.getenv("SPARK_HOME"), "examples/src/main/resources/people.json")
-peopleDF <- read.json(path)
# Register this DataFrame as a table.
-createOrReplaceTempView(peopleDF, "table")
+createOrReplaceTempView(df, "table")
# $example on:sql_query$
df <- sql("SELECT * FROM table")
# $example off:sql_query$
@@ -123,6 +120,7 @@ head(teenagers)
## name
## 1 Justin

+# We can also run custom R-UDFs on Spark DataFrames. Here we prefix all the names with "Name:"
schema <- structType(structField("name", "string"))
teenNames <- dapply(df, function(p) { cbind(paste("Name:", p$name)) }, schema)
for (teenName in collect(teenNames)$name) {

From a1eca2bc5f038e3966de87fdbce35f42cee4dd32 Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Fri, 8 Jul 2016 16:06:28 -0700
Subject: [PATCH 10/11] [SPARK-16381] style fix

---
 examples/src/main/r/RSparkSQLExample.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R
index c339273a6f93..eba3f1b91e2d 100644
--- a/examples/src/main/r/RSparkSQLExample.R
+++ b/examples/src/main/r/RSparkSQLExample.R
@@ -18,7 +18,7 @@ library(SparkR)

# $example on:init_session$
-sparkR.session(appName='MyApp', sparkConfig=list(spark.executor.memory="1g"))
+sparkR.session(appName = "MyApp", sparkConfig = list(spark.executor.memory = "1g"))
# $example off:init_session$

From 7195750788d1526f77eea1345f7c0cf5431aca05 Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Fri, 8 Jul 2016 16:53:28 -0700
Subject: [PATCH 11/11] [SPARK-16381] fix space style in other r examples

---
 examples/src/main/r/dataframe.R | 2 +-
 examples/src/main/r/ml.R        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/src/main/r/dataframe.R b/examples/src/main/r/dataframe.R
index a377d6e864d2..295f9b427622 100644
--- a/examples/src/main/r/dataframe.R
+++ b/examples/src/main/r/dataframe.R
@@ -18,7 +18,7 @@ library(SparkR)

# Initialize SparkSession
-sc <- sparkR.session(appName="SparkR-DataFrame-example")
+sc <- sparkR.session(appName = "SparkR-DataFrame-example")

# Create a simple local data.frame
localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18))

diff --git a/examples/src/main/r/ml.R b/examples/src/main/r/ml.R
index 940c98dcb97a..65242e68b3c8 100644
--- a/examples/src/main/r/ml.R
+++ b/examples/src/main/r/ml.R
@@ -22,7 +22,7 @@ library(SparkR)

# Initialize SparkSession
-sparkR.session(appName="SparkR-ML-example")
+sparkR.session(appName = "SparkR-ML-example")

# $example on$
############################ spark.glm and glm ##############################################