-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-1491: [C++] Add casting implementations from strings to numbers or boolean #1387
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d614dd4
8a6470c
32a9170
6252cb0
5d3871d
24dec32
fbd5907
4d0d781
6020f3d
fdbd3e0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,9 @@ | |
|
|
||
| #include "arrow/compute/kernels/cast.h" | ||
|
|
||
| #include <boost/algorithm/string.hpp> | ||
| #include <boost/lexical_cast.hpp> | ||
| #include <boost/numeric/conversion/cast.hpp> | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it possible to not rely on Boost for this, e.g. are there some alternatives in the STL or that we can access otherwise? I will review the rest in more detail later
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems that
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wouldn't it be ok in the of small size ints just to upcast them? This should not affect performance as it's a small temporary. |
||
| #include <cstdint> | ||
| #include <cstring> | ||
| #include <functional> | ||
|
|
@@ -735,6 +738,104 @@ struct CastFunctor<T, DictionaryType, | |
| } | ||
| }; | ||
|
|
||
| // ---------------------------------------------------------------------- | ||
| // String to Number | ||
|
|
||
| template <typename T> | ||
| typename std::enable_if<std::is_arithmetic<T>::value && !std::is_same<T, int8_t>::value && | ||
| !std::is_same<T, uint8_t>::value, | ||
| T>::type | ||
| CastStringToNumeric(const std::string& s) { | ||
| return boost::lexical_cast<T>(s); | ||
| } | ||
|
|
||
| template <typename T> | ||
| typename std::enable_if<std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value, | ||
| T>::type | ||
| CastStringToNumeric(const std::string& s) { | ||
| // Convert to int before casting to T | ||
| // because boost::lexical_cast does not support 8bit int/uint. | ||
| return boost::numeric_cast<T>(boost::lexical_cast<int>(s)); | ||
| } | ||
|
|
||
| template <typename O> | ||
| struct CastFunctor<O, StringType, | ||
| typename std::enable_if<std::is_base_of<Number, O>::value>::type> { | ||
| void operator()(FunctionContext* ctx, const CastOptions& options, | ||
| const ArrayData& input, ArrayData* output) { | ||
| using out_type = typename O::c_type; | ||
| StringArray input_array(input.Copy()); | ||
|
|
||
| auto out_data = GetMutableValues<out_type>(output, 1); | ||
|
|
||
| for (int64_t i = 0; i < input.length; ++i) { | ||
| if (input_array.IsNull(i)) { | ||
| out_data++; | ||
| continue; | ||
| } | ||
|
|
||
| std::string s = input_array.GetString(i); | ||
|
|
||
| try { | ||
| *out_data++ = CastStringToNumeric<out_type>(s); | ||
| } catch (...) { | ||
| std::stringstream ss; | ||
| ss << "Failed to cast String '" << s << "' into " << output->type->ToString(); | ||
| ctx->SetStatus(Status(StatusCode::SerializationError, ss.str())); | ||
| return; | ||
| } | ||
| } | ||
| } | ||
| }; | ||
|
|
||
| // ---------------------------------------------------------------------- | ||
| // String to Boolean | ||
|
|
||
| template <typename O> | ||
| struct CastFunctor<O, StringType, | ||
| typename std::enable_if<std::is_same<BooleanType, O>::value>::type> { | ||
| void operator()(FunctionContext* ctx, const CastOptions& options, | ||
| const ArrayData& input, ArrayData* output) { | ||
| StringArray input_array(input.Copy()); | ||
| internal::BitmapWriter writer(output->buffers[1]->mutable_data(), output->offset, | ||
| input.length); | ||
|
|
||
| for (int64_t i = 0; i < input.length; ++i) { | ||
| if (input_array.IsNull(i)) { | ||
| writer.Next(); | ||
| continue; | ||
| } | ||
|
|
||
| auto s = input_array.GetString(i); | ||
| auto s_lower = boost::algorithm::to_lower_copy(s); | ||
| bool flag; | ||
|
|
||
| if (s_lower == "true") { | ||
| flag = true; | ||
| } else if (s_lower == "false") { | ||
| flag = false; | ||
| } else { | ||
| try { | ||
| flag = boost::lexical_cast<bool>(s); | ||
| } catch (...) { | ||
| std::stringstream ss; | ||
| ss << "Failed to cast String '" << s << "' into " << output->type->ToString(); | ||
| ctx->SetStatus(Status(StatusCode::SerializationError, ss.str())); | ||
| return; | ||
| } | ||
| } | ||
|
|
||
| if (flag) { | ||
| writer.Set(); | ||
| } else { | ||
| writer.Clear(); | ||
| } | ||
| writer.Next(); | ||
| } | ||
| writer.Finish(); | ||
| } | ||
| }; | ||
|
|
||
| // ---------------------------------------------------------------------- | ||
|
|
||
| typedef std::function<void(FunctionContext*, const CastOptions& options, const ArrayData&, | ||
|
|
@@ -913,6 +1014,20 @@ class CastKernel : public UnaryKernel { | |
| FN(TimestampType, Date64Type); \ | ||
| FN(TimestampType, Int64Type); | ||
|
|
||
| #define STRING_CASES(FN, IN_TYPE) \ | ||
| FN(StringType, StringType); \ | ||
| FN(StringType, BooleanType); \ | ||
| FN(StringType, UInt8Type); \ | ||
| FN(StringType, Int8Type); \ | ||
| FN(StringType, UInt16Type); \ | ||
| FN(StringType, Int16Type); \ | ||
| FN(StringType, UInt32Type); \ | ||
| FN(StringType, Int32Type); \ | ||
| FN(StringType, UInt64Type); \ | ||
| FN(StringType, Int64Type); \ | ||
| FN(StringType, FloatType); \ | ||
| FN(StringType, DoubleType); | ||
|
|
||
| #define DICTIONARY_CASES(FN, IN_TYPE) \ | ||
| FN(IN_TYPE, NullType); \ | ||
| FN(IN_TYPE, Time32Type); \ | ||
|
|
@@ -970,6 +1085,7 @@ GET_CAST_FUNCTION(DATE64_CASES, Date64Type); | |
| GET_CAST_FUNCTION(TIME32_CASES, Time32Type); | ||
| GET_CAST_FUNCTION(TIME64_CASES, Time64Type); | ||
| GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType); | ||
| GET_CAST_FUNCTION(STRING_CASES, StringType); | ||
| GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType); | ||
|
|
||
| #define CAST_FUNCTION_CASE(InType) \ | ||
|
|
@@ -1017,6 +1133,7 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr<DataType>& | |
| CAST_FUNCTION_CASE(Time32Type); | ||
| CAST_FUNCTION_CASE(Time64Type); | ||
| CAST_FUNCTION_CASE(TimestampType); | ||
| CAST_FUNCTION_CASE(StringType); | ||
| CAST_FUNCTION_CASE(DictionaryType); | ||
| case Type::LIST: | ||
| RETURN_NOT_OK(GetListCastFunc(in_type, out_type, options, kernel)); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you test with a non-zero offset (e.g.
foo->Slice(2))?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@wesm It seems that the sliced pattern is already tested in
CheckCasemethod.https://github.com/Licht-T/arrow/blob/master/cpp/src/arrow/compute/compute-test.cc#L123