Skip to content

Commit f01a8db

Browse files
amaliujiaHyukjinKwon
authored andcommitted
[SPARK-41164][CONNECT] Update relations.proto to follow Connect proto development guide
### What changes were proposed in this pull request? As we have a guidance for Connect proto ([adding proto messages](https://github.com/apache/spark/blob/master/connector/connect/docs/adding-proto-messages.md)), this PR updates `relations.proto` to follow the development guide. This PR also adds some missing documentation for the proto. ### Why are the changes needed? 1. Follow development guide. 2. Improve proto Documentation. ### Does this PR introduce _any_ user-facing change? NO ### How was this patch tested? Existing UT Closes #38678 from amaliujia/improve_relation_proto_to_follow_proto_rules. Authored-by: Rui Wang <rui.wang@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
1 parent a428e44 commit f01a8db

File tree

3 files changed

+372
-172
lines changed

3 files changed

+372
-172
lines changed

connector/connect/src/main/protobuf/spark/connect/relations.proto

Lines changed: 101 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,13 @@ message Unknown {}
6767

6868
// Common metadata of all relations.
6969
message RelationCommon {
70+
// (Required) Shared relation metadata.
7071
string source_info = 1;
7172
}
7273

7374
// Relation that uses a SQL query to generate the output.
7475
message SQL {
76+
// (Required) The SQL query.
7577
string query = 1;
7678
}
7779

@@ -84,15 +86,20 @@ message Read {
8486
}
8587

8688
message NamedTable {
89+
// (Required) Unparsed identifier for the table.
8790
string unparsed_identifier = 1;
8891
}
8992

9093
message DataSource {
91-
// Required. Supported formats include: parquet, orc, text, json, parquet, csv, avro.
94+
// (Required) Supported formats include: parquet, orc, text, json, parquet, csv, avro.
9295
string format = 1;
93-
// Optional. If not set, Spark will infer the schema.
94-
string schema = 2;
95-
// The key is case insensitive.
96+
97+
// (Optional) If not set, Spark will infer the schema.
98+
optional string schema = 2;
99+
100+
// Options for the data source. The context of this map varies based on the
101+
// data source format. This options could be empty for valid data source format.
102+
// The map key is case insensitive.
96103
map<string, string> options = 3;
97104
}
98105
}
@@ -106,24 +113,39 @@ message Project {
106113
//
107114
// For example, `SELECT ABS(-1)` is valid plan without an input plan.
108115
Relation input = 1;
116+
117+
// (Required) A Project requires at least one expression.
109118
repeated Expression expressions = 3;
110119
}
111120

112121
// Relation that applies a boolean expression `condition` on each row of `input` to produce
113122
// the output result.
114123
message Filter {
124+
// (Required) Input relation for a Filter.
115125
Relation input = 1;
126+
127+
// (Required) A Filter must have a condition expression.
116128
Expression condition = 2;
117129
}
118130

119131
// Relation of type [[Join]].
120132
//
121133
// `left` and `right` must be present.
122134
message Join {
135+
// (Required) Left input relation for a Join.
123136
Relation left = 1;
137+
138+
// (Required) Right input relation for a Join.
124139
Relation right = 2;
140+
141+
// (Optional) The join condition. Could be unset when `using_columns` is utilized.
142+
//
143+
// This field does not co-exist with using_columns.
125144
Expression join_condition = 3;
145+
146+
// (Required) The join type.
126147
JoinType join_type = 4;
148+
127149
// Optional. using_columns provides a list of columns that should present on both sides of
128150
// the join inputs that this Join will join on. For example A JOIN B USING col_name is
129151
// equivalent to A JOIN B on A.col_name = B.col_name.
@@ -144,11 +166,25 @@ message Join {
144166

145167
// Relation of type [[SetOperation]]
146168
message SetOperation {
169+
// (Required) Left input relation for a Set operation.
147170
Relation left_input = 1;
171+
172+
// (Required) Right input relation for a Set operation.
148173
Relation right_input = 2;
174+
175+
// (Required) The Set operation type.
149176
SetOpType set_op_type = 3;
150-
bool is_all = 4;
151-
bool by_name = 5;
177+
178+
// (Optional) If to remove duplicate rows.
179+
//
180+
// True to preserve all results.
181+
// False to remove duplicate rows.
182+
optional bool is_all = 4;
183+
184+
// (Optional) If to perform the Set operation based on name resolution.
185+
//
186+
// Only UNION supports this option.
187+
optional bool by_name = 5;
152188

153189
enum SetOpType {
154190
SET_OP_TYPE_UNSPECIFIED = 0;
@@ -160,29 +196,42 @@ message SetOperation {
160196

161197
// Relation of type [[Limit]] that is used to `limit` rows from the input relation.
162198
message Limit {
199+
// (Required) Input relation for a Limit.
163200
Relation input = 1;
201+
202+
// (Required) the limit.
164203
int32 limit = 2;
165204
}
166205

167206
// Relation of type [[Offset]] that is used to read rows staring from the `offset` on
168207
// the input relation.
169208
message Offset {
209+
// (Required) Input relation for an Offset.
170210
Relation input = 1;
211+
212+
// (Required) the limit.
171213
int32 offset = 2;
172214
}
173215

174216
// Relation of type [[Aggregate]].
175217
message Aggregate {
218+
// (Required) Input relation for a Aggregate.
176219
Relation input = 1;
220+
177221
repeated Expression grouping_expressions = 2;
178222
repeated Expression result_expressions = 3;
179223
}
180224

181225
// Relation of type [[Sort]].
182226
message Sort {
227+
// (Required) Input relation for a Sort.
183228
Relation input = 1;
229+
230+
// (Required) Sort fields.
184231
repeated SortField sort_fields = 2;
185-
bool is_global = 3;
232+
233+
// (Optional) if this is a global sort.
234+
optional bool is_global = 3;
186235

187236
message SortField {
188237
Expression expression = 1;
@@ -206,58 +255,83 @@ message Sort {
206255
// Relation of type [[Deduplicate]] which have duplicate rows removed, could consider either only
207256
// the subset of columns or all the columns.
208257
message Deduplicate {
258+
// (Required) Input relation for a Deduplicate.
209259
Relation input = 1;
260+
261+
// (Optional) Deduplicate based on a list of column names.
262+
//
263+
// This field does not co-use with `all_columns_as_keys`.
210264
repeated string column_names = 2;
211-
bool all_columns_as_keys = 3;
265+
266+
// (Optional) Deduplicate based on all the columns of the input relation.
267+
//
268+
// This field does not co-use with `column_names`.
269+
optional bool all_columns_as_keys = 3;
212270
}
213271

272+
// A relation that does not need to be qualified by name.
214273
message LocalRelation {
274+
// (Optional) A list qualified attributes.
215275
repeated Expression.QualifiedAttribute attributes = 1;
216276
// TODO: support local data.
217277
}
218278

219279
// Relation of type [[Sample]] that samples a fraction of the dataset.
220280
message Sample {
281+
// (Required) Input relation for a Sample.
221282
Relation input = 1;
283+
284+
// (Required) lower bound.
222285
double lower_bound = 2;
286+
287+
// (Required) upper bound.
223288
double upper_bound = 3;
224-
bool with_replacement = 4;
289+
290+
// (Optional) Whether to sample with replacement.
291+
optional bool with_replacement = 4;
292+
293+
// (Optional) The random seed.
225294
optional int64 seed = 5;
226295
}
227296

228297
// Relation of type [[Range]] that generates a sequence of integers.
229298
message Range {
230-
// Optional. Default value = 0
231-
int64 start = 1;
232-
// Required.
299+
// (Optional) Default value = 0
300+
optional int64 start = 1;
301+
302+
// (Required)
233303
int64 end = 2;
234-
// Required.
304+
305+
// (Required)
235306
int64 step = 3;
307+
236308
// Optional. Default value is assigned by 1) SQL conf "spark.sql.leafNodeDefaultParallelism" if
237309
// it is set, or 2) spark default parallelism.
238310
optional int32 num_partitions = 4;
239311
}
240312

241313
// Relation alias.
242314
message SubqueryAlias {
243-
// Required. The input relation.
315+
// (Required) The input relation of SubqueryAlias.
244316
Relation input = 1;
245-
// Required. The alias.
317+
318+
// (Required) The alias.
246319
string alias = 2;
247-
// Optional. Qualifier of the alias.
320+
321+
// (Optional) Qualifier of the alias.
248322
repeated string qualifier = 3;
249323
}
250324

251325
// Relation repartition.
252326
message Repartition {
253-
// Required. The input relation.
327+
// (Required) The input relation of Repartition.
254328
Relation input = 1;
255329

256-
// Required. Must be positive.
330+
// (Required) Must be positive.
257331
int32 num_partitions = 2;
258332

259-
// Optional. Default value is false.
260-
bool shuffle = 3;
333+
// (Optional) Default value is false.
334+
optional bool shuffle = 3;
261335
}
262336

263337
// Compose the string representing rows for output.
@@ -267,14 +341,14 @@ message ShowString {
267341
Relation input = 1;
268342

269343
// (Required) Number of rows to show.
270-
optional int32 numRows = 2;
344+
int32 numRows = 2;
271345

272346
// (Required) If set to more than 0, truncates strings to
273347
// `truncate` characters and all cells will be aligned right.
274-
optional int32 truncate = 3;
348+
int32 truncate = 3;
275349

276350
// (Required) If set to true, prints output rows vertically (one line per column value).
277-
optional bool vertical = 4;
351+
bool vertical = 4;
278352
}
279353

280354
// Computes specified statistics for numeric and string columns.
@@ -344,10 +418,10 @@ message NAFill {
344418

345419
// Rename columns on the input relation by the same length of names.
346420
message RenameColumnsBySameLengthNames {
347-
// Required. The input relation.
421+
// (Required) The input relation of RenameColumnsBySameLengthNames.
348422
Relation input = 1;
349423

350-
// Required.
424+
// (Required)
351425
//
352426
// The number of columns of the input relation must be equal to the length
353427
// of this field. If this is not true, an exception will be returned.
@@ -357,11 +431,11 @@ message RenameColumnsBySameLengthNames {
357431

358432
// Rename columns on the input relation by a map with name to name mapping.
359433
message RenameColumnsByNameToNameMap {
360-
// Required. The input relation.
434+
// (Required) The input relation.
361435
Relation input = 1;
362436

363437

364-
// Required.
438+
// (Required)
365439
//
366440
// Renaming column names of input relation from A to B where A is the map key
367441
// and B is the map value. This is a no-op if schema doesn't contain any A. It

0 commit comments

Comments
 (0)