Skip to content
Closed
4 changes: 4 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,10 @@ include(ThirdpartyToolchain)
# Append the contents of CXX_COMMON_FLAGS and ARROW_CXXFLAGS to the C++ compiler flags.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_COMMON_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARROW_CXXFLAGS}")

# MSVC only: /bigobj raises the number of sections an object file may contain.
# NOTE(review): presumably required because heavily templated translation units
# (e.g. the cast kernels) exceed the default section limit -- confirm.
if (MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")
endif()

if ("${COMPILER_FAMILY}" STREQUAL "clang")
# Using Clang with ccache causes a bunch of spurious warnings that are
# purportedly fixed in the next version of ccache. See the following for details:
Expand Down
59 changes: 59 additions & 0 deletions cpp/src/arrow/compute/compute-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -769,6 +769,65 @@ TEST_F(TestCast, OffsetOutputBuffer) {
int16(), e3);
}

// Casting utf8 strings to boolean: both the textual spellings ("true"/"false"
// in mixed case) and the numeric spellings ("0"/"1") must produce the same
// boolean column. The slot at index 1 is flagged as null.
TEST_F(TestCast, StringToBoolean) {
  CastOptions options;

  vector<bool> validity = {true, false, true, true, true};
  vector<bool> expected = {false, true, true, true, false};

  // Textual spellings, including capitalised variants.
  vector<std::string> words = {"False", "true", "true", "True", "false"};
  CheckCase<StringType, std::string, BooleanType, bool>(utf8(), words, validity,
                                                        boolean(), expected, options);

  // Numeric spellings.
  vector<std::string> digits = {"0", "1", "1", "1", "0"};
  CheckCase<StringType, std::string, BooleanType, bool>(utf8(), digits, validity,
                                                        boolean(), expected, options);
}

// Checks casting utf8 strings to every signed and unsigned integer width and
// to float/double. Each section reuses one vector of input strings for all
// target widths of that family.
// NOTE(review): per the review discussion, CheckCase reportedly also exercises
// a sliced (non-zero offset) input -- confirm against the helper.
TEST_F(TestCast, StringToNumber) {
CastOptions options;

// The slot at index 1 is flagged as null in every case below.
vector<bool> is_valid = {true, false, true, true, true};

// String to signed integers: all values fit in int8_t, so the same inputs are
// valid for every signed width.
vector<std::string> v_int = {"0", "1", "127", "-1", "0"};
vector<int8_t> e_int8 = {0, 1, 127, -1, 0};
vector<int16_t> e_int16 = {0, 1, 127, -1, 0};
vector<int32_t> e_int32 = {0, 1, 127, -1, 0};
vector<int64_t> e_int64 = {0, 1, 127, -1, 0};
CheckCase<StringType, std::string, Int8Type, int8_t>(utf8(), v_int, is_valid, int8(),
e_int8, options);
CheckCase<StringType, std::string, Int16Type, int16_t>(utf8(), v_int, is_valid, int16(),
e_int16, options);
CheckCase<StringType, std::string, Int32Type, int32_t>(utf8(), v_int, is_valid, int32(),
e_int32, options);
CheckCase<StringType, std::string, Int64Type, int64_t>(utf8(), v_int, is_valid, int64(),
e_int64, options);

// String to unsigned integers: all values fit in uint8_t (255 is its maximum),
// so the same inputs are valid for every unsigned width.
vector<std::string> v_uint = {"0", "1", "127", "255", "0"};
vector<uint8_t> e_uint8 = {0, 1, 127, 255, 0};
vector<uint16_t> e_uint16 = {0, 1, 127, 255, 0};
vector<uint32_t> e_uint32 = {0, 1, 127, 255, 0};
vector<uint64_t> e_uint64 = {0, 1, 127, 255, 0};
CheckCase<StringType, std::string, UInt8Type, uint8_t>(utf8(), v_uint, is_valid,
uint8(), e_uint8, options);
CheckCase<StringType, std::string, UInt16Type, uint16_t>(utf8(), v_uint, is_valid,
uint16(), e_uint16, options);
CheckCase<StringType, std::string, UInt32Type, uint32_t>(utf8(), v_uint, is_valid,
uint32(), e_uint32, options);
CheckCase<StringType, std::string, UInt64Type, uint64_t>(utf8(), v_uint, is_valid,
uint64(), e_uint64, options);

// String to floating point: expected values are the float/double literals
// spelled with the same decimal text as the input strings.
vector<std::string> v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"};
vector<float> e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f};
vector<double> e_double = {0.1, 1.2, 127.3, 200.4, 0.5};
CheckCase<StringType, std::string, FloatType, float>(utf8(), v_float, is_valid,
float32(), e_float, options);
CheckCase<StringType, std::string, DoubleType, double>(utf8(), v_float, is_valid,
float64(), e_double, options);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you test with a non-zero offset (e.g. foo->Slice(2))?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@wesm It seems that the sliced case is already tested in the CheckCase method.
https://github.com/Licht-T/arrow/blob/master/cpp/src/arrow/compute/compute-test.cc#L123

}

template <typename TestType>
class TestDictionaryCast : public TestCast {};

Expand Down
117 changes: 117 additions & 0 deletions cpp/src/arrow/compute/kernels/cast.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@

#include "arrow/compute/kernels/cast.h"

#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/numeric/conversion/cast.hpp>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to not rely on Boost for this, e.g. are there some alternatives in the STL or that we can access otherwise? I will review the rest in more detail later

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that boost::numeric_cast and boost::lexical_cast cannot be replaced by the STL.
The STL has std::to_string, but it does not support small-size ints.
http://en.cppreference.com/w/cpp/string/basic_string/to_string

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be OK, in the case of small-size ints, just to upcast them? This should not affect performance, as it's a small temporary.

#include <cstdint>
#include <cstring>
#include <functional>
Expand Down Expand Up @@ -735,6 +738,104 @@ struct CastFunctor<T, DictionaryType,
}
};

// ----------------------------------------------------------------------
// String to Number

// Parses `s` as a value of arithmetic type T (every arithmetic type except the
// 8-bit integers, which have a dedicated overload).
//
// \param s the complete text to parse
// \return the parsed value
// \throws boost::bad_lexical_cast if `s` is not a valid representation of T,
//         or if T is unsigned and `s` denotes a non-zero negative number
//
// boost::lexical_cast follows scanf rules for unsigned targets: a leading '-'
// is accepted and the result wraps around (e.g. "-1" becomes the maximum
// value). That would silently corrupt data, and it disagrees with the 8-bit
// overload, whose range check rejects negative input for uint8_t. Such input
// is therefore rejected here as well; "-0" is still accepted as zero.
template <typename T>
typename std::enable_if<std::is_arithmetic<T>::value && !std::is_same<T, int8_t>::value &&
                            !std::is_same<T, uint8_t>::value,
                        T>::type
CastStringToNumeric(const std::string& s) {
  const T value = boost::lexical_cast<T>(s);

  // Held in a local so compilers do not flag a constant conditional expression.
  const bool target_is_unsigned = std::is_unsigned<T>::value;
  if (target_is_unsigned && !s.empty() && s[0] == '-' && value != T(0)) {
    throw boost::bad_lexical_cast();
  }
  return value;
}

// Parses `s` as an 8-bit integer (int8_t or uint8_t).
//
// boost::lexical_cast does not support 8-bit int/uint targets, so the text is
// first parsed as a plain int and then narrowed with boost::numeric_cast.
// Either step throws when the text is not a number or the value does not fit
// in T.
template <typename T>
typename std::enable_if<std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
                        T>::type
CastStringToNumeric(const std::string& s) {
  const int widened = boost::lexical_cast<int>(s);
  return boost::numeric_cast<T>(widened);
}

template <typename O>
struct CastFunctor<O, StringType,
typename std::enable_if<std::is_base_of<Number, O>::value>::type> {
void operator()(FunctionContext* ctx, const CastOptions& options,
const ArrayData& input, ArrayData* output) {
using out_type = typename O::c_type;
StringArray input_array(input.Copy());

auto out_data = GetMutableValues<out_type>(output, 1);

for (int64_t i = 0; i < input.length; ++i) {
if (input_array.IsNull(i)) {
out_data++;
continue;
}

std::string s = input_array.GetString(i);

try {
*out_data++ = CastStringToNumeric<out_type>(s);
} catch (...) {
std::stringstream ss;
ss << "Failed to cast String '" << s << "' into " << output->type->ToString();
ctx->SetStatus(Status(StatusCode::SerializationError, ss.str()));
return;
}
}
}
};

// ----------------------------------------------------------------------
// String to Boolean

template <typename O>
struct CastFunctor<O, StringType,
typename std::enable_if<std::is_same<BooleanType, O>::value>::type> {
void operator()(FunctionContext* ctx, const CastOptions& options,
const ArrayData& input, ArrayData* output) {
StringArray input_array(input.Copy());
internal::BitmapWriter writer(output->buffers[1]->mutable_data(), output->offset,
input.length);

for (int64_t i = 0; i < input.length; ++i) {
if (input_array.IsNull(i)) {
writer.Next();
continue;
}

auto s = input_array.GetString(i);
auto s_lower = boost::algorithm::to_lower_copy(s);
bool flag;

if (s_lower == "true") {
flag = true;
} else if (s_lower == "false") {
flag = false;
} else {
try {
flag = boost::lexical_cast<bool>(s);
} catch (...) {
std::stringstream ss;
ss << "Failed to cast String '" << s << "' into " << output->type->ToString();
ctx->SetStatus(Status(StatusCode::SerializationError, ss.str()));
return;
}
}

if (flag) {
writer.Set();
} else {
writer.Clear();
}
writer.Next();
}
writer.Finish();
}
};

// ----------------------------------------------------------------------

typedef std::function<void(FunctionContext*, const CastOptions& options, const ArrayData&,
Expand Down Expand Up @@ -913,6 +1014,20 @@ class CastKernel : public UnaryKernel {
FN(TimestampType, Date64Type); \
FN(TimestampType, Int64Type);

// Enumerates the supported casts from StringType: FN is invoked once per
// (input type, output type) pair, covering string, boolean, all signed and
// unsigned integer widths, float and double.
// Note: the IN_TYPE parameter is not referenced in the expansion; every pair
// hard-codes StringType as the input type. The macro is instantiated in this
// file as GET_CAST_FUNCTION(STRING_CASES, StringType).
#define STRING_CASES(FN, IN_TYPE) \
FN(StringType, StringType); \
FN(StringType, BooleanType); \
FN(StringType, UInt8Type); \
FN(StringType, Int8Type); \
FN(StringType, UInt16Type); \
FN(StringType, Int16Type); \
FN(StringType, UInt32Type); \
FN(StringType, Int32Type); \
FN(StringType, UInt64Type); \
FN(StringType, Int64Type); \
FN(StringType, FloatType); \
FN(StringType, DoubleType);

#define DICTIONARY_CASES(FN, IN_TYPE) \
FN(IN_TYPE, NullType); \
FN(IN_TYPE, Time32Type); \
Expand Down Expand Up @@ -970,6 +1085,7 @@ GET_CAST_FUNCTION(DATE64_CASES, Date64Type);
GET_CAST_FUNCTION(TIME32_CASES, Time32Type);
GET_CAST_FUNCTION(TIME64_CASES, Time64Type);
GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType);
GET_CAST_FUNCTION(STRING_CASES, StringType);
GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType);

#define CAST_FUNCTION_CASE(InType) \
Expand Down Expand Up @@ -1017,6 +1133,7 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr<DataType>&
CAST_FUNCTION_CASE(Time32Type);
CAST_FUNCTION_CASE(Time64Type);
CAST_FUNCTION_CASE(TimestampType);
CAST_FUNCTION_CASE(StringType);
CAST_FUNCTION_CASE(DictionaryType);
case Type::LIST:
RETURN_NOT_OK(GetListCastFunc(in_type, out_type, options, kernel));
Expand Down