-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-45088][PYTHON][CONNECT] Make getitem work with duplicated columns
#42828
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,7 +26,6 @@ | |
| import io | ||
| from contextlib import redirect_stdout | ||
|
|
||
| from pyspark import StorageLevel | ||
|
||
| from pyspark.sql import SparkSession, Row, functions | ||
| from pyspark.sql.functions import col, lit, count, sum, mean, struct | ||
| from pyspark.sql.pandas.utils import pyarrow_version_less_than_minimum | ||
|
|
@@ -63,6 +62,51 @@ | |
|
|
||
|
|
||
| class DataFrameTestsMixin: | ||
| def test_getitem_invalid_indices(self): | ||
| df = self.spark.sql( | ||
| "SELECT * FROM VALUES " | ||
| "(1, 1.1, 'a'), " | ||
| "(2, 2.2, 'b'), " | ||
| "(4, 4.4, 'c') " | ||
| "AS TAB(a, b, c)" | ||
| ) | ||
|
|
||
| # accepted type and values | ||
| for index in [False, True, 0, 1, 2, -1, -2, -3]: | ||
| df[index] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is really a bad API. Why do we add such an API? To support order by ordinal, we can just order by integer literals. The SQL parser also parses
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
To support
Do you mean users should directly
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Had an offline discussion with Wenchen; will fix it by switching to
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems [incomplete]. Let me revert this PR first |
||
|
|
||
| # negative cases: ordinal out of range | ||
| for index in [-10, -4, 3, 10, 100]: | ||
| with self.assertRaises(IndexError): | ||
| df[index] | ||
|
|
||
| # negative cases: unsupported types | ||
| for index in [None, 1.0, Decimal(1)]: | ||
| with self.assertRaises(PySparkTypeError): | ||
| df[index] | ||
|
|
||
| def test_getitem_duplicated_column(self): | ||
| df = self.spark.sql( | ||
| "SELECT * FROM VALUES " | ||
| "(1, 1.1, 'a'), " | ||
| "(2, 2.2, 'b'), " | ||
| "(4, 4.4, 'c') " | ||
| "AS TAB(a, a, a)" | ||
| ) | ||
|
|
||
| self.assertEqual( | ||
| df.select(df[0]).schema.simpleString(), | ||
| "struct<a:int>", | ||
| ) | ||
| self.assertEqual( | ||
| df.select(df[1]).schema.simpleString(), | ||
| "struct<a:decimal(2,1)>", | ||
| ) | ||
| self.assertEqual( | ||
| df.select(df[2]).schema.simpleString(), | ||
| "struct<a:string>", | ||
| ) | ||
|
|
||
| def test_range(self): | ||
| self.assertEqual(self.spark.range(1, 1).count(), 0) | ||
| self.assertEqual(self.spark.range(1, 0, -1).count(), 1) | ||
|
|
@@ -77,7 +121,6 @@ def test_duplicated_column_names(self): | |
| self.assertEqual(2, row[1]) | ||
| self.assertEqual("Row(c=1, c=2)", str(row)) | ||
| # Cannot access columns | ||
| self.assertRaises(AnalysisException, lambda: df.select(df[0]).first()) | ||
| self.assertRaises(AnalysisException, lambda: df.select(df.c).first()) | ||
| self.assertRaises(AnalysisException, lambda: df.select(df["c"]).first()) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
also cc @cloud-fan for the usage of
GetColumnByOrdinal