-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-15509][ML][SparkR] R MLlib algorithms should support input columns "features" and "label" #13584
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-15509][ML][SparkR] R MLlib algorithms should support input columns "features" and "label" #13584
Changes from all commits
cfed884
77886fe
e112ac0
ef3702e
aab3a12
f68ac34
c8e30e9
43b2f8c
82069c7
1bc150f
caa4183
1701252
d9e3be5
8bb370e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,71 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.ml.r | ||
|
|
||
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.ml.feature.RFormula | ||
| import org.apache.spark.sql.Dataset | ||
|
|
||
| object RWrapperUtils extends Logging { | ||
|
|
||
| /** | ||
| * DataFrame column check. | ||
| * When loading data, default columns "features" and "label" will be added. And these two names | ||
| * would conflict with RFormula default feature and label column names. | ||
| * Here is to change the column name to avoid "column already exists" error. | ||
| * | ||
| * @param rFormula RFormula instance | ||
| * @param data Input dataset | ||
| * @return Unit | ||
| */ | ||
| def checkDataColumns(rFormula: RFormula, data: Dataset[_]): Unit = { | ||
| if (data.schema.fieldNames.contains(rFormula.getLabelCol)) { | ||
| val newLabelName = convertToUniqueName(rFormula.getLabelCol, data.schema.fieldNames) | ||
| logWarning( | ||
| s"data containing ${rFormula.getLabelCol} column, using new name $newLabelName instead") | ||
| rFormula.setLabelCol(newLabelName) | ||
| } | ||
|
|
||
| if (data.schema.fieldNames.contains(rFormula.getFeaturesCol)) { | ||
| val newFeaturesName = convertToUniqueName(rFormula.getFeaturesCol, data.schema.fieldNames) | ||
| logWarning(s"data containing ${rFormula.getFeaturesCol} column, " + | ||
| s"using new name $newFeaturesName instead") | ||
| rFormula.setFeaturesCol(newFeaturesName) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Convert conflicting name to be an unique name. | ||
| * Appending a sequence number, like originalName_output1 | ||
| * and incrementing until it is not already there | ||
| * | ||
| * @param originalName Original name | ||
| * @param fieldNames Array of field names in existing schema | ||
| * @return String | ||
| */ | ||
| def convertToUniqueName(originalName: String, fieldNames: Array[String]): String = { | ||
| var counter = 1 | ||
| var newName = originalName + "_output" | ||
|
|
||
| while (fieldNames.contains(newName)) { | ||
| newName = originalName + "_output" + counter | ||
| counter += 1 | ||
| } | ||
| newName | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -54,9 +54,6 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul | |
| intercept[IllegalArgumentException] { | ||
| formula.fit(original) | ||
| } | ||
| intercept[IllegalArgumentException] { | ||
| formula.fit(original) | ||
| } | ||
| } | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. here is just a duplication of above |
||
|
|
||
| test("label column already exists") { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.ml.r | ||
|
|
||
| import org.apache.spark.SparkFunSuite | ||
| import org.apache.spark.ml.feature.{RFormula, RFormulaModel} | ||
| import org.apache.spark.mllib.util.MLlibTestSparkContext | ||
|
|
||
| class RWrapperUtilsSuite extends SparkFunSuite with MLlibTestSparkContext { | ||
|
|
||
| test("avoid libsvm data column name conflicting") { | ||
| val rFormula = new RFormula().setFormula("label ~ features") | ||
| val data = spark.read.format("libsvm").load("../data/mllib/sample_libsvm_data.txt") | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here I used All I found is like this |
||
|
|
||
| // if not checking column name, then IllegalArgumentException | ||
| intercept[IllegalArgumentException] { | ||
| rFormula.fit(data) | ||
| } | ||
|
|
||
| // after checking, model build is ok | ||
| RWrapperUtils.checkDataColumns(rFormula, data) | ||
|
|
||
| assert(rFormula.getLabelCol == "label_output") | ||
| assert(rFormula.getFeaturesCol == "features_output") | ||
|
|
||
| val model = rFormula.fit(data) | ||
| assert(model.isInstanceOf[RFormulaModel]) | ||
|
|
||
| assert(model.getLabelCol == "label_output") | ||
| assert(model.getFeaturesCol == "features_output") | ||
| } | ||
|
|
||
| test("generate unique name by appending a sequence number") { | ||
| val originalName = "label" | ||
| val fieldNames = Array("label_output", "label_output1", "label_output2") | ||
| val newName = RWrapperUtils.convertToUniqueName(originalName, fieldNames) | ||
|
|
||
| assert(newName === "label_output3") | ||
| } | ||
|
|
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: i think we end up checking for
label_outputtwice, once inif (data.schema.fieldNames.contains(rFormula.getFeaturesCol))and second time withinconvertToUniqueName? Perhaps we merge them?Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think in
if (data.schema.fieldNames.contains(rFormula.getFeaturesCol)), it's checkinglabelonlyand in
convertToUniqueName (),_outputwill be appended resulting inlabel_output:var newName = originalName + "_output", and thenlabel_outputis checked atwhile (fieldNames.contains(newName))am I missing something? @felix
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fair enough. that makes sense, thanks