Skip to content

Commit 27edd25

Browse files
emkornfieldwesm
authored andcommitted
ARROW-210: Cleanup of the string related types in C++ code base
One thing that is worth discussing is if char types should also be removed (if they aren't i'll add the missing unit tests). I also moved CharType to type.h which seems more consistent with existing code. I can clean it up either way in a follow-up review if we decide with want to push types into their corresponding Array headers. Author: Micah Kornfield <emkornfield@gmail.com> Closes #85 from emkornfield/emk_string_types_wip and squashes the following commits: 4414816 [Micah Kornfield] remove CHAR from parquet 6f0634c [Micah Kornfield] remove char type and add dcheck 58bfcc9 [Micah Kornfield] fix style of char_type_ 1e0152d [Micah Kornfield] wip
1 parent 790d541 commit 27edd25

File tree

10 files changed

+307
-115
lines changed

10 files changed

+307
-115
lines changed

cpp/src/arrow/parquet/schema.cc

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -250,11 +250,6 @@ Status FieldToNode(const std::shared_ptr<Field>& field, NodePtr* out) {
250250
case Type::DOUBLE:
251251
type = ParquetType::DOUBLE;
252252
break;
253-
case Type::CHAR:
254-
type = ParquetType::FIXED_LEN_BYTE_ARRAY;
255-
logical_type = LogicalType::UTF8;
256-
length = static_cast<CharType*>(field->type.get())->size;
257-
break;
258253
case Type::STRING:
259254
type = ParquetType::BYTE_ARRAY;
260255
logical_type = LogicalType::UTF8;

cpp/src/arrow/type.cc

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,18 @@ std::string Field::ToString() const {
3131

3232
DataType::~DataType() {}
3333

34-
StringType::StringType() : DataType(Type::STRING) {}
34+
bool DataType::Equals(const DataType* other) const {
35+
bool equals = other && ((this == other) ||
36+
((this->type == other->type) &&
37+
((this->num_children() == other->num_children()))));
38+
if (equals) {
39+
for (int i = 0; i < num_children(); ++i) {
40+
// TODO(emkornfield) limit recursion
41+
if (!children_[i]->Equals(other->children_[i])) { return false; }
42+
}
43+
}
44+
return equals;
45+
}
3546

3647
std::string StringType::ToString() const {
3748
std::string result(name());
@@ -44,6 +55,10 @@ std::string ListType::ToString() const {
4455
return s.str();
4556
}
4657

58+
std::string BinaryType::ToString() const {
59+
return std::string(name());
60+
}
61+
4762
std::string StructType::ToString() const {
4863
std::stringstream s;
4964
s << "struct<";

cpp/src/arrow/type.h

Lines changed: 38 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
#include <string>
2424
#include <vector>
2525

26+
#include "arrow/util/macros.h"
27+
2628
namespace arrow {
2729

2830
// Data types in this library are all *logical*. They can be expressed as
@@ -53,15 +55,9 @@ struct Type {
5355
// 8-byte floating point value
5456
DOUBLE = 11,
5557

56-
// CHAR(N): fixed-length UTF8 string with length N
57-
CHAR = 12,
58-
5958
// UTF8 variable-length string as List<Char>
6059
STRING = 13,
6160

62-
// VARCHAR(N): Null-terminated string type embedded in a CHAR(N + 1)
63-
VARCHAR = 14,
64-
6561
// Variable-length bytes (no guarantee of UTF8-ness)
6662
BINARY = 15,
6763

@@ -114,12 +110,15 @@ struct DataType {
114110

115111
virtual ~DataType();
116112

117-
bool Equals(const DataType* other) {
118-
// Call with a pointer so more friendly to subclasses
119-
return other && ((this == other) || (this->type == other->type));
120-
}
113+
// Return whether the types are equal
114+
//
115+
// Types that are logically convertable from one to another e.g. List<UInt8>
116+
// and Binary are NOT equal).
117+
virtual bool Equals(const DataType* other) const;
121118

122-
bool Equals(const std::shared_ptr<DataType>& other) { return Equals(other.get()); }
119+
bool Equals(const std::shared_ptr<DataType>& other) const {
120+
return Equals(other.get());
121+
}
123122

124123
const std::shared_ptr<Field>& child(int i) const { return children_[i]; }
125124

@@ -236,9 +235,8 @@ struct DoubleType : public PrimitiveType<DoubleType> {
236235

237236
struct ListType : public DataType {
238237
// List can contain any other logical value type
239-
explicit ListType(const std::shared_ptr<DataType>& value_type) : DataType(Type::LIST) {
240-
children_ = {std::make_shared<Field>("item", value_type)};
241-
}
238+
explicit ListType(const std::shared_ptr<DataType>& value_type)
239+
: ListType(value_type, Type::LIST) {}
242240

243241
explicit ListType(const std::shared_ptr<Field>& value_field) : DataType(Type::LIST) {
244242
children_ = {value_field};
@@ -251,15 +249,38 @@ struct ListType : public DataType {
251249
static char const* name() { return "list"; }
252250

253251
std::string ToString() const override;
252+
253+
protected:
254+
// Constructor for classes that are implemented as List Arrays.
255+
ListType(const std::shared_ptr<DataType>& value_type, Type::type logical_type)
256+
: DataType(logical_type) {
257+
// TODO ARROW-187 this can technically fail, make a constructor method ?
258+
children_ = {std::make_shared<Field>("item", value_type)};
259+
}
254260
};
255261

256-
// String is a logical type consisting of a physical list of 1-byte values
257-
struct StringType : public DataType {
258-
StringType();
262+
// BinaryType type is reprsents lists of 1-byte values.
263+
struct BinaryType : public ListType {
264+
BinaryType() : BinaryType(Type::BINARY) {}
265+
static char const* name() { return "binary"; }
266+
std::string ToString() const override;
267+
268+
protected:
269+
// Allow subclasses to change the logical type.
270+
explicit BinaryType(Type::type logical_type)
271+
: ListType(std::shared_ptr<DataType>(new UInt8Type()), logical_type) {}
272+
};
273+
274+
// UTF encoded strings
275+
struct StringType : public BinaryType {
276+
StringType() : BinaryType(Type::STRING) {}
259277

260278
static char const* name() { return "string"; }
261279

262280
std::string ToString() const override;
281+
282+
protected:
283+
explicit StringType(Type::type logical_type) : BinaryType(logical_type) {}
263284
};
264285

265286
struct StructType : public DataType {

cpp/src/arrow/types/construct.cc

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,10 +127,8 @@ Status MakeListArray(const TypePtr& type, int32_t length,
127127
case Type::LIST:
128128
out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap));
129129
break;
130-
case Type::CHAR:
131130
case Type::DECIMAL_TEXT:
132131
case Type::STRING:
133-
case Type::VARCHAR:
134132
out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap));
135133
break;
136134
default:

cpp/src/arrow/types/decimal.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ struct DecimalType : public DataType {
2929
: DataType(Type::DECIMAL), precision(precision_), scale(scale_) {}
3030
int precision;
3131
int scale;
32-
3332
static char const* name() { return "decimal"; }
3433

3534
std::string ToString() const override;

cpp/src/arrow/types/list.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,8 @@ class ListArray : public Array {
6666
int32_t offset(int i) const { return offsets_[i]; }
6767

6868
// Neither of these functions will perform boundschecking
69-
int32_t value_offset(int i) { return offsets_[i]; }
70-
int32_t value_length(int i) { return offsets_[i + 1] - offsets_[i]; }
69+
int32_t value_offset(int i) const { return offsets_[i]; }
70+
int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; }
7171

7272
bool EqualsExact(const ListArray& other) const;
7373
bool Equals(const std::shared_ptr<Array>& arr) const override;
@@ -92,9 +92,9 @@ class ListArray : public Array {
9292
// a sequence of offests and null values.
9393
//
9494
// A note on types. Per arrow/type.h all types in the c++ implementation are
95-
// logical so even though this class always builds an Array of lists, this can
95+
// logical so even though this class always builds list array, this can
9696
// represent multiple different logical types. If no logical type is provided
97-
// at construction time, the class defaults to List<T> where t is take from the
97+
// at construction time, the class defaults to List<T> where t is taken from the
9898
// value_builder/values that the object is constructed with.
9999
class ListBuilder : public ArrayBuilder {
100100
public:

cpp/src/arrow/types/string-test.cc

Lines changed: 161 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -34,32 +34,14 @@ namespace arrow {
3434

3535
class Buffer;
3636

37-
TEST(TypesTest, TestCharType) {
38-
CharType t1(5);
39-
40-
ASSERT_EQ(t1.type, Type::CHAR);
41-
ASSERT_EQ(t1.size, 5);
42-
43-
ASSERT_EQ(t1.ToString(), std::string("char(5)"));
44-
45-
// Test copy constructor
46-
CharType t2 = t1;
47-
ASSERT_EQ(t2.type, Type::CHAR);
48-
ASSERT_EQ(t2.size, 5);
49-
}
50-
51-
TEST(TypesTest, TestVarcharType) {
52-
VarcharType t1(5);
53-
54-
ASSERT_EQ(t1.type, Type::VARCHAR);
55-
ASSERT_EQ(t1.size, 5);
56-
57-
ASSERT_EQ(t1.ToString(), std::string("varchar(5)"));
58-
59-
// Test copy constructor
60-
VarcharType t2 = t1;
61-
ASSERT_EQ(t2.type, Type::VARCHAR);
62-
ASSERT_EQ(t2.size, 5);
37+
TEST(TypesTest, BinaryType) {
38+
BinaryType t1;
39+
BinaryType e1;
40+
StringType t2;
41+
EXPECT_TRUE(t1.Equals(&e1));
42+
EXPECT_FALSE(t1.Equals(&t2));
43+
ASSERT_EQ(t1.type, Type::BINARY);
44+
ASSERT_EQ(t1.ToString(), std::string("binary"));
6345
}
6446

6547
TEST(TypesTest, TestStringType) {
@@ -119,6 +101,7 @@ class TestStringContainer : public ::testing::Test {
119101
TEST_F(TestStringContainer, TestArrayBasics) {
120102
ASSERT_EQ(length_, strings_->length());
121103
ASSERT_EQ(1, strings_->null_count());
104+
ASSERT_OK(strings_->Validate());
122105
}
123106

124107
TEST_F(TestStringContainer, TestType) {
@@ -163,7 +146,10 @@ class TestStringBuilder : public TestBuilder {
163146
builder_.reset(new StringBuilder(pool_, type_));
164147
}
165148

166-
void Done() { result_ = std::dynamic_pointer_cast<StringArray>(builder_->Finish()); }
149+
void Done() {
150+
result_ = std::dynamic_pointer_cast<StringArray>(builder_->Finish());
151+
result_->Validate();
152+
}
167153

168154
protected:
169155
TypePtr type_;
@@ -216,4 +202,152 @@ TEST_F(TestStringBuilder, TestZeroLength) {
216202
Done();
217203
}
218204

205+
// Binary container type
206+
// TODO(emkornfield) there should be some way to refactor these to avoid code duplicating
207+
// with String
208+
class TestBinaryContainer : public ::testing::Test {
209+
public:
210+
void SetUp() {
211+
chars_ = {'a', 'b', 'b', 'c', 'c', 'c'};
212+
offsets_ = {0, 1, 1, 1, 3, 6};
213+
valid_bytes_ = {1, 1, 0, 1, 1};
214+
expected_ = {"a", "", "", "bb", "ccc"};
215+
216+
MakeArray();
217+
}
218+
219+
void MakeArray() {
220+
length_ = offsets_.size() - 1;
221+
int nchars = chars_.size();
222+
223+
value_buf_ = test::to_buffer(chars_);
224+
values_ = ArrayPtr(new UInt8Array(nchars, value_buf_));
225+
226+
offsets_buf_ = test::to_buffer(offsets_);
227+
228+
null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_);
229+
null_count_ = test::null_count(valid_bytes_);
230+
231+
strings_ = std::make_shared<BinaryArray>(
232+
length_, offsets_buf_, values_, null_count_, null_bitmap_);
233+
}
234+
235+
protected:
236+
std::vector<int32_t> offsets_;
237+
std::vector<char> chars_;
238+
std::vector<uint8_t> valid_bytes_;
239+
240+
std::vector<std::string> expected_;
241+
242+
std::shared_ptr<Buffer> value_buf_;
243+
std::shared_ptr<Buffer> offsets_buf_;
244+
std::shared_ptr<Buffer> null_bitmap_;
245+
246+
int null_count_;
247+
int length_;
248+
249+
ArrayPtr values_;
250+
std::shared_ptr<BinaryArray> strings_;
251+
};
252+
253+
TEST_F(TestBinaryContainer, TestArrayBasics) {
254+
ASSERT_EQ(length_, strings_->length());
255+
ASSERT_EQ(1, strings_->null_count());
256+
ASSERT_OK(strings_->Validate());
257+
}
258+
259+
TEST_F(TestBinaryContainer, TestType) {
260+
TypePtr type = strings_->type();
261+
262+
ASSERT_EQ(Type::BINARY, type->type);
263+
ASSERT_EQ(Type::BINARY, strings_->type_enum());
264+
}
265+
266+
TEST_F(TestBinaryContainer, TestListFunctions) {
267+
int pos = 0;
268+
for (size_t i = 0; i < expected_.size(); ++i) {
269+
ASSERT_EQ(pos, strings_->value_offset(i));
270+
ASSERT_EQ(expected_[i].size(), strings_->value_length(i));
271+
pos += expected_[i].size();
272+
}
273+
}
274+
275+
TEST_F(TestBinaryContainer, TestDestructor) {
276+
auto arr = std::make_shared<BinaryArray>(
277+
length_, offsets_buf_, values_, null_count_, null_bitmap_);
278+
}
279+
280+
TEST_F(TestBinaryContainer, TestGetValue) {
281+
for (size_t i = 0; i < expected_.size(); ++i) {
282+
if (valid_bytes_[i] == 0) {
283+
ASSERT_TRUE(strings_->IsNull(i));
284+
} else {
285+
int32_t len = -1;
286+
const uint8_t* bytes = strings_->GetValue(i, &len);
287+
ASSERT_EQ(0, std::memcmp(expected_[i].data(), bytes, len));
288+
}
289+
}
290+
}
291+
292+
class TestBinaryBuilder : public TestBuilder {
293+
public:
294+
void SetUp() {
295+
TestBuilder::SetUp();
296+
type_ = TypePtr(new BinaryType());
297+
builder_.reset(new BinaryBuilder(pool_, type_));
298+
}
299+
300+
void Done() {
301+
result_ = std::dynamic_pointer_cast<BinaryArray>(builder_->Finish());
302+
result_->Validate();
303+
}
304+
305+
protected:
306+
TypePtr type_;
307+
308+
std::unique_ptr<BinaryBuilder> builder_;
309+
std::shared_ptr<BinaryArray> result_;
310+
};
311+
312+
TEST_F(TestBinaryBuilder, TestScalarAppend) {
313+
std::vector<std::string> strings = {"", "bb", "a", "", "ccc"};
314+
std::vector<uint8_t> is_null = {0, 0, 0, 1, 0};
315+
316+
int N = strings.size();
317+
int reps = 1000;
318+
319+
for (int j = 0; j < reps; ++j) {
320+
for (int i = 0; i < N; ++i) {
321+
if (is_null[i]) {
322+
builder_->AppendNull();
323+
} else {
324+
builder_->Append(
325+
reinterpret_cast<const uint8_t*>(strings[i].data()), strings[i].size());
326+
}
327+
}
328+
}
329+
Done();
330+
ASSERT_OK(result_->Validate());
331+
ASSERT_EQ(reps * N, result_->length());
332+
ASSERT_EQ(reps, result_->null_count());
333+
ASSERT_EQ(reps * 6, result_->values()->length());
334+
335+
int32_t length;
336+
for (int i = 0; i < N * reps; ++i) {
337+
if (is_null[i % N]) {
338+
ASSERT_TRUE(result_->IsNull(i));
339+
} else {
340+
ASSERT_FALSE(result_->IsNull(i));
341+
const uint8_t* vals = result_->GetValue(i, &length);
342+
ASSERT_EQ(strings[i % N].size(), length);
343+
ASSERT_EQ(0, std::memcmp(vals, strings[i % N].data(), length));
344+
}
345+
}
346+
}
347+
348+
TEST_F(TestBinaryBuilder, TestZeroLength) {
349+
// All buffers are null
350+
Done();
351+
}
352+
219353
} // namespace arrow

0 commit comments

Comments
 (0)