Skip to content

Commit 3e4e868

Browse files
maropucloud-fan
authored andcommitted
[SPARK-16135][SQL] Remove hashCode and euqals in ArrayBasedMapData
## What changes were proposed in this pull request? This pr is to remove `hashCode` and `equals` in `ArrayBasedMapData` because the type cannot be used as join keys, grouping keys, or in equality tests. ## How was this patch tested? Add a new test suite `MapDataSuite` for comparison tests. Author: Takeshi YAMAMURO <linguin.m.s@gmail.com> Closes #13847 from maropu/UnsafeMapTest.
1 parent 11f420b commit 3e4e868

File tree

6 files changed

+76
-23
lines changed

6 files changed

+76
-23
lines changed

sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,10 @@ public UnsafeMapData getMap(int ordinal) {
298298
return map;
299299
}
300300

301+
// This `hashCode` computation could consume much processor time for large data.
302+
// If the computation becomes a bottleneck, we can use a light-weight logic; the first fixed bytes
303+
// are used to compute `hashCode` (See `Vector.hashCode`).
304+
// The same issue exists in `UnsafeRow.hashCode`.
301305
@Override
302306
public int hashCode() {
303307
return Murmur3_x86_32.hashUnsafeBytes(baseObject, baseOffset, sizeInBytes, 42);

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -24,23 +24,6 @@ class ArrayBasedMapData(val keyArray: ArrayData, val valueArray: ArrayData) exte
2424

2525
override def copy(): MapData = new ArrayBasedMapData(keyArray.copy(), valueArray.copy())
2626

27-
override def equals(o: Any): Boolean = {
28-
if (!o.isInstanceOf[ArrayBasedMapData]) {
29-
return false
30-
}
31-
32-
val other = o.asInstanceOf[ArrayBasedMapData]
33-
if (other eq null) {
34-
return false
35-
}
36-
37-
this.keyArray == other.keyArray && this.valueArray == other.valueArray
38-
}
39-
40-
override def hashCode: Int = {
41-
keyArray.hashCode() * 37 + valueArray.hashCode()
42-
}
43-
4427
override def toString: String = {
4528
s"keys: $keyArray, values: $valueArray"
4629
}

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@ package org.apache.spark.sql.catalyst.util
1919

2020
import org.apache.spark.sql.types.DataType
2121

22+
/**
23+
* This is an internal data representation for map type in Spark SQL. This should not implement
24+
* `equals` and `hashCode` because the type cannot be used as join keys, grouping keys, or
25+
* in equality tests. See SPARK-9415 and PR#13847 for the discussions.
26+
*/
2227
abstract class MapData extends Serializable {
2328

2429
def numElements(): Int

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,10 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper {
110110
case (expr, i) => Seq(Literal(i), expr)
111111
}))
112112
val plan = GenerateMutableProjection.generate(expressions)
113-
val actual = plan(new GenericMutableRow(length)).toSeq(expressions.map(_.dataType))
114-
val expected = Seq(new ArrayBasedMapData(
115-
new GenericArrayData(0 until length),
116-
new GenericArrayData(Seq.fill(length)(true))))
113+
val actual = plan(new GenericMutableRow(length)).toSeq(expressions.map(_.dataType)).map {
114+
case m: ArrayBasedMapData => ArrayBasedMapData.toScalaMap(m)
115+
}
116+
val expected = (0 until length).map((_, true)).toMap :: Nil
117117

118118
if (!checkResult(actual, expected)) {
119119
fail(s"Incorrect Evaluation: expressions: $expressions, actual: $actual, expected: $expected")

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
2626
import org.apache.spark.sql.catalyst.expressions.codegen._
2727
import org.apache.spark.sql.catalyst.optimizer.SimpleTestOptimizer
2828
import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project}
29+
import org.apache.spark.sql.catalyst.util.MapData
2930
import org.apache.spark.sql.types.DataType
3031
import org.apache.spark.util.Utils
3132

@@ -52,15 +53,18 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks {
5253

5354
/**
5455
* Check the equality between result of expression and expected value, it will handle
55-
* Array[Byte] and Spread[Double].
56+
* Array[Byte], Spread[Double], and MapData.
5657
*/
5758
protected def checkResult(result: Any, expected: Any): Boolean = {
5859
(result, expected) match {
5960
case (result: Array[Byte], expected: Array[Byte]) =>
6061
java.util.Arrays.equals(result, expected)
6162
case (result: Double, expected: Spread[Double @unchecked]) =>
6263
expected.asInstanceOf[Spread[Double]].isWithin(result)
63-
case _ => result == expected
64+
case (result: MapData, expected: MapData) =>
65+
result.keyArray() == expected.keyArray() && result.valueArray() == expected.valueArray()
66+
case _ =>
67+
result == expected
6468
}
6569
}
6670

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.sql.catalyst.expressions
19+
20+
import scala.collection._
21+
22+
import org.apache.spark.SparkFunSuite
23+
import org.apache.spark.sql.catalyst.util.ArrayBasedMapData
24+
import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType}
25+
import org.apache.spark.unsafe.types.UTF8String
26+
27+
class MapDataSuite extends SparkFunSuite {
28+
29+
test("inequality tests") {
30+
def u(str: String): UTF8String = UTF8String.fromString(str)
31+
32+
// test data
33+
val testMap1 = Map(u("key1") -> 1)
34+
val testMap2 = Map(u("key1") -> 1, u("key2") -> 2)
35+
val testMap3 = Map(u("key1") -> 1)
36+
val testMap4 = Map(u("key1") -> 1, u("key2") -> 2)
37+
38+
// ArrayBasedMapData
39+
val testArrayMap1 = ArrayBasedMapData(testMap1.toMap)
40+
val testArrayMap2 = ArrayBasedMapData(testMap2.toMap)
41+
val testArrayMap3 = ArrayBasedMapData(testMap3.toMap)
42+
val testArrayMap4 = ArrayBasedMapData(testMap4.toMap)
43+
assert(testArrayMap1 !== testArrayMap3)
44+
assert(testArrayMap2 !== testArrayMap4)
45+
46+
// UnsafeMapData
47+
val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType)))
48+
val row = new GenericMutableRow(1)
49+
def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = {
50+
row.update(0, map)
51+
val unsafeRow = unsafeConverter.apply(row)
52+
unsafeRow.getMap(0).copy
53+
}
54+
assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3))
55+
assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4))
56+
}
57+
}

0 commit comments

Comments
 (0)