Skip to content
This repository was archived by the owner on May 10, 2024. It is now read-only.

Commit 2594141

Browse files
author
Artem Tarasov
committed
switched to signed binary comparison, added tests
1 parent 9f7a20b commit 2594141

File tree

3 files changed

+98
-2
lines changed

3 files changed

+98
-2
lines changed

src/parquet/util/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ endif()
7171

7272
ADD_PARQUET_TEST(bit-util-test)
7373
ADD_PARQUET_TEST(buffer-test)
74+
ADD_PARQUET_TEST(comparison-test)
7475
ADD_PARQUET_TEST(input-output-test)
7576
ADD_PARQUET_TEST(mem-allocator-test)
7677
ADD_PARQUET_TEST(mem-pool-test)
src/parquet/util/comparison-test.cc

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include <gtest/gtest.h>
19+
20+
#include "parquet/schema/descriptor.h"
21+
#include "parquet/types.h"
22+
#include "parquet/util/comparison.h"
23+
24+
#include <cstdint>
25+
#include <iostream>
26+
#include <vector>
27+
28+
namespace parquet {
29+
30+
namespace test {
31+
32+
// Builds a ByteArray from the contents of `s`: the length is s.size() and the
// data pointer is s.data() viewed as unsigned bytes. Nothing is copied here,
// so presumably the result is only usable while `s` is alive and unmodified.
static ByteArray ByteArrayFromString(const std::string& s) {
  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(s.data());
  return ByteArray(s.size(), bytes);
}
36+
37+
// Builds an FLBA from the data pointer of `s`, viewed as unsigned bytes.
// Only the pointer is handed over; the length of `s` is not passed along, and
// nothing is copied here, so presumably `s` must outlive the returned value.
static FLBA FLBAFromString(const std::string& s) {
  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(s.data());
  return FLBA(bytes);
}
41+
42+
// Checks the ordering that Compare<ByteArray> imposes on variable-length
// byte arrays: prefix-before-extension, and bytewise comparison of non-ASCII
// (UTF-8 encoded) data.
TEST(Comparison, ByteArray) {
  using namespace parquet::schema;
  NodePtr node = PrimitiveNode::Make("bytearray", Repetition::REQUIRED, Type::BYTE_ARRAY);
  ColumnDescriptor descr(node, 0, 0);
  Compare<parquet::ByteArray> less(&descr);

  // Every ByteArray below gets its own backing string that is never
  // reassigned, so no array is left pointing at overwritten storage.
  const std::string prefix = "arrange";
  const std::string extended = "arrangement";
  auto arr1 = ByteArrayFromString(prefix);
  auto arr2 = ByteArrayFromString(extended);
  // A proper prefix sorts strictly before the longer string.
  ASSERT_TRUE(less(arr1, arr2));
  ASSERT_FALSE(less(arr2, arr1));

  // "bügeln" written as explicit UTF-8 bytes (U+00FC is 0xC3 0xBC). This
  // keeps the test independent of the source-file encoding and avoids u8""
  // literals, which are char8_t arrays from C++20 on and would no longer
  // convert to std::string.
  const std::string ascii = "braten";
  const std::string non_ascii = "b\xc3\xbcgeln";
  auto arr3 = ByteArrayFromString(ascii);
  auto arr4 = ByteArrayFromString(non_ascii);
  // see PARQUET-686 discussion about binary comparison
  // The second bytes are 'r' (0x72) and 0xC3; the comparison is expected to
  // place the 0xC3 string first, i.e. to treat the bytes as signed.
  ASSERT_FALSE(less(arr3, arr4));
  ASSERT_TRUE(less(arr4, arr3));
}
61+
62+
// Checks that Compare<FLBA> orders two fixed-length values of the same
// length by their bytes.
TEST(Comparison, FLBA) {
  using namespace parquet::schema;

  std::string first = "Antidisestablishmentarianism";
  std::string second = "Bundesgesundheitsministerium";

  // The column's type length is taken from the first string.
  NodePtr flba_node = PrimitiveNode::Make("FLBA", Repetition::REQUIRED,
      Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE, first.size());
  ColumnDescriptor column(flba_node, 0, 0);
  Compare<parquet::FixedLenByteArray> less(&column);

  auto lhs = FLBAFromString(first);
  auto rhs = FLBAFromString(second);
  ASSERT_TRUE(less(lhs, rhs));
}
76+
77+
// Checks Compare<Int96> on two values that first differ only in value[2],
// and then on two equal values.
TEST(Comparison, Int96) {
  using namespace parquet::schema;

  parquet::Int96 a{1, 41, 14};
  parquet::Int96 b{1, 41, 42};

  NodePtr int96_node = PrimitiveNode::Make("int96", Repetition::REQUIRED, Type::INT96);
  ColumnDescriptor column(int96_node, 0, 0);
  Compare<parquet::Int96> less(&column);

  // The initializers differ only in the third entry (14 vs 42).
  ASSERT_TRUE(less(a, b));

  // Once the third entry matches, neither value may sort before the other.
  b.value[2] = 14;
  ASSERT_TRUE(!less(a, b));
  ASSERT_TRUE(!less(b, a));
}
88+
89+
} // namespace test
90+
91+
} // namespace parquet

src/parquet/util/comparison.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,17 @@ inline bool Compare<Int96>::operator()(const Int96& a, const Int96& b) {
4242

4343
template <>
4444
inline bool Compare<ByteArray>::operator()(const ByteArray& a, const ByteArray& b) {
45-
return std::lexicographical_compare(a.ptr, a.ptr + a.len, b.ptr, b.ptr + b.len);
45+
auto aptr = reinterpret_cast<const int8_t*>(a.ptr);
46+
auto bptr = reinterpret_cast<const int8_t*>(b.ptr);
47+
return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + b.len);
4648
}
4749

4850
template <>
4951
inline bool Compare<FLBA>::operator()(const FLBA& a, const FLBA& b) {
52+
auto aptr = reinterpret_cast<const int8_t*>(a.ptr);
53+
auto bptr = reinterpret_cast<const int8_t*>(b.ptr);
5054
return std::lexicographical_compare(
51-
a.ptr, a.ptr + type_length_, b.ptr, b.ptr + type_length_);
55+
aptr, aptr + type_length_, bptr, bptr + type_length_);
5256
}
5357

5458
} // namespace parquet

0 commit comments

Comments
 (0)