@@ -109,7 +109,7 @@ object FileFormatWriter extends Logging {
       outputSpec: OutputSpec,
       hadoopConf: Configuration,
       partitionColumns: Seq[Attribute],
-      bucketSpec: Option[BucketSpec],
+      bucketIdExpression: Option[Expression],
       statsTrackers: Seq[WriteJobStatsTracker],
       options: Map[String, String])
     : Set[String] = {
@@ -122,17 +122,6 @@ object FileFormatWriter extends Logging {
     val partitionSet = AttributeSet(partitionColumns)
     val dataColumns = outputSpec.outputColumns.filterNot(partitionSet.contains)
 
-    val bucketIdExpression = bucketSpec.map { spec =>
-      val bucketColumns = spec.bucketColumnNames.map(c => dataColumns.find(_.name == c).get)
-      // Use `HashPartitioning.partitionIdExpression` as our bucket id expression, so that we can
-      // guarantee the data distribution is same between shuffle and bucketed data source, which
-      // enables us to only shuffle one side when join a bucketed table and a normal one.
-      HashPartitioning(bucketColumns, spec.numBuckets).partitionIdExpression
-    }
-    val sortColumns = bucketSpec.toSeq.flatMap {
-      spec => spec.sortColumnNames.map(c => dataColumns.find(_.name == c).get)
-    }
-
     val caseInsensitiveOptions = CaseInsensitiveMap(options)
 
     // Note: prepareWrite has side effect. It sets "job".
@@ -156,40 +145,14 @@ object FileFormatWriter extends Logging {
       statsTrackers = statsTrackers
     )
 
-    // We should first sort by partition columns, then bucket id, and finally sorting columns.
-    val requiredOrdering = partitionColumns ++ bucketIdExpression ++ sortColumns
-    // the sort order doesn't matter
-    val actualOrdering = plan.outputOrdering.map(_.child)
-    val orderingMatched = if (requiredOrdering.length > actualOrdering.length) {
-      false
-    } else {
-      requiredOrdering.zip(actualOrdering).forall {
-        case (requiredOrder, childOutputOrder) =>
-          requiredOrder.semanticEquals(childOutputOrder)
-      }
-    }
-
     SQLExecution.checkSQLExecutionId(sparkSession)
 
     // This call shouldn't be put into the `try` block below because it only initializes and
     // prepares the job, any exception thrown from here shouldn't cause abortJob() to be called.
     committer.setupJob(job)
 
     try {
-      val rdd = if (orderingMatched) {
-        plan.execute()
-      } else {
-        // SPARK-21165: the `requiredOrdering` is based on the attributes from analyzed plan, and
-        // the physical plan may have different attribute ids due to optimizer removing some
-        // aliases. Here we bind the expression ahead to avoid potential attribute ids mismatch.
-        val orderingExpr = requiredOrdering
-          .map(SortOrder(_, Ascending))
-          .map(BindReferences.bindReference(_, outputSpec.outputColumns))
-        SortExec(
-          orderingExpr,
-          global = false,
-          child = plan).execute()
-      }
+      val rdd = plan.execute()
       val ret = new Array[WriteTaskResult](rdd.partitions.length)
       sparkSession.sparkContext.runJob(
         rdd,
@@ -202,7 +165,7 @@ object FileFormatWriter extends Logging {
             committer,
             iterator = iter)
         },
-        0 until rdd.partitions.length,
+        rdd.partitions.indices,
         (index, res: WriteTaskResult) => {
           committer.onTaskCommit(res.commitMsg)
           ret(index) = res
@@ -521,18 +484,18 @@ object FileFormatWriter extends Logging {
       var recordsInFile: Long = 0L
       var fileCounter = 0
       val updatedPartitions = mutable.Set[String]()
-      var currentPartionValues: Option[UnsafeRow] = None
+      var currentPartitionValues: Option[UnsafeRow] = None
       var currentBucketId: Option[Int] = None
 
       for (row <- iter) {
         val nextPartitionValues = if (isPartitioned) Some(getPartitionValues(row)) else None
         val nextBucketId = if (isBucketed) Some(getBucketId(row)) else None
 
-        if (currentPartionValues != nextPartitionValues || currentBucketId != nextBucketId) {
+        if (currentPartitionValues != nextPartitionValues || currentBucketId != nextBucketId) {
           // See a new partition or bucket - write to a new partition dir (or a new bucket file).
-          if (isPartitioned && currentPartionValues != nextPartitionValues) {
-            currentPartionValues = Some(nextPartitionValues.get.copy())
-            statsTrackers.foreach(_.newPartition(currentPartionValues.get))
+          if (isPartitioned && currentPartitionValues != nextPartitionValues) {
+            currentPartitionValues = Some(nextPartitionValues.get.copy())
+            statsTrackers.foreach(_.newPartition(currentPartitionValues.get))
           }
           if (isBucketed) {
             currentBucketId = nextBucketId
@@ -543,7 +506,7 @@ object FileFormatWriter extends Logging {
           fileCounter = 0
 
           releaseResources()
-          newOutputWriter(currentPartionValues, currentBucketId, fileCounter, updatedPartitions)
+          newOutputWriter(currentPartitionValues, currentBucketId, fileCounter, updatedPartitions)
         } else if (desc.maxRecordsPerFile > 0 &&
             recordsInFile >= desc.maxRecordsPerFile) {
           // Exceeded the threshold in terms of the number of records per file.
@@ -554,7 +517,7 @@ object FileFormatWriter extends Logging {
             s"File counter $fileCounter is beyond max value $MAX_FILE_COUNTER")
 
           releaseResources()
-          newOutputWriter(currentPartionValues, currentBucketId, fileCounter, updatedPartitions)
+          newOutputWriter(currentPartitionValues, currentBucketId, fileCounter, updatedPartitions)
         }
         val outputRow = getOutputRow(row)
         currentWriter.write(outputRow)
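
Note on the refactor: `FileFormatWriter.write` no longer derives the bucket id expression or enforces the partition/bucket/sort ordering itself; both become the caller's responsibility before the plan is handed over. Below is a minimal caller-side sketch that mirrors the block removed above; the helper name `buildBucketIdExpression` and its placement are illustrative only, not part of this patch.

```scala
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning

// Hypothetical caller-side helper, mirroring the logic removed from write():
// derive the bucket id via HashPartitioning.partitionIdExpression so the file
// layout keeps matching the shuffle distribution used for bucketed joins.
def buildBucketIdExpression(
    bucketSpec: Option[BucketSpec],
    dataColumns: Seq[Attribute]): Option[Expression] = {
  bucketSpec.map { spec =>
    val bucketColumns = spec.bucketColumnNames.map(c => dataColumns.find(_.name == c).get)
    HashPartitioning(bucketColumns, spec.numBuckets).partitionIdExpression
  }
}
```

The ordering that used to be checked here (partition columns, then bucket id, then the bucket's sort columns) likewise has to be satisfied by the caller, for example by planning a `SortExec` over that required ordering when the child's output ordering does not already match.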