Skip to content

Commit

Permalink
feat: add arrow_cast function to support supports arbitrary arrow t…
Browse files Browse the repository at this point in the history
…ypes (#5166)

* Add `arrow_cast` function

* prettier

* Update datafusion/sql/src/expr/arrow_cast.rs

Co-authored-by: Wei-Ting Kuo <waitingkuo0527@gmail.com>

* Apply suggestions from code review

Co-authored-by: Wei-Ting Kuo <waitingkuo0527@gmail.com>

* Clarify intent of tests

* Add more error tests

* More tests

* fix test

* reuse buffer to avoid an allocation per word

* add ticket link

* allow trailing whitespace, add tests for whitespace

---------

Co-authored-by: Wei-Ting Kuo <waitingkuo0527@gmail.com>
  • Loading branch information
alamb and waitingkuo authored Mar 8, 2023
1 parent 84530a2 commit e46924d
Show file tree
Hide file tree
Showing 8 changed files with 1,056 additions and 53 deletions.
267 changes: 239 additions & 28 deletions datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt
Original file line number Diff line number Diff line change
Expand Up @@ -52,31 +52,242 @@ SELECT arrow_typeof(1.0::float)
Float32

# arrow_typeof_decimal
# query T
# SELECT arrow_typeof(1::Decimal)
# ----
# Decimal128(38, 10)

# # arrow_typeof_timestamp
# query T
# SELECT arrow_typeof(now()::timestamp)
# ----
# Timestamp(Nanosecond, None)

# # arrow_typeof_timestamp_utc
# query T
# SELECT arrow_typeof(now())
# ----
# Timestamp(Nanosecond, Some(\"+00:00\"))

# # arrow_typeof_timestamp_date32(
# query T
# SELECT arrow_typeof(now()::date)
# ----
# Date32

# # arrow_typeof_utf8
# query T
# SELECT arrow_typeof('1')
# ----
# Utf8
query T
SELECT arrow_typeof(1::Decimal)
----
Decimal128(38, 10)

# arrow_typeof_timestamp
query T
SELECT arrow_typeof(now()::timestamp)
----
Timestamp(Nanosecond, None)

# arrow_typeof_timestamp_utc
query T
SELECT arrow_typeof(now())
----
Timestamp(Nanosecond, Some("+00:00"))

# arrow_typeof_timestamp_date32(
query T
SELECT arrow_typeof(now()::date)
----
Date32

# arrow_typeof_utf8
query T
SELECT arrow_typeof('1')
----
Utf8


#### arrow_cast (in some ways opposite of arrow_typeof)

# Basic tests

query I
SELECT arrow_cast('1', 'Int16')
----
1

# Basic error test
query error Error during planning: arrow_cast needs 2 arguments, 1 provided
SELECT arrow_cast('1')

query error Error during planning: arrow_cast requires its second argument to be a constant string, got Int64\(43\)
SELECT arrow_cast('1', 43)

query error Error unrecognized word: unknown
SELECT arrow_cast('1', 'unknown')

# Round Trip tests:
query TTTTTTTTTTTTTTTTTTT
SELECT
arrow_typeof(arrow_cast(1, 'Int8')) as col_i8,
arrow_typeof(arrow_cast(1, 'Int16')) as col_i16,
arrow_typeof(arrow_cast(1, 'Int32')) as col_i32,
arrow_typeof(arrow_cast(1, 'Int64')) as col_i64,
arrow_typeof(arrow_cast(1, 'UInt8')) as col_u8,
arrow_typeof(arrow_cast(1, 'UInt16')) as col_u16,
arrow_typeof(arrow_cast(1, 'UInt32')) as col_u32,
arrow_typeof(arrow_cast(1, 'UInt64')) as col_u64,
-- can't seem to cast to Float16 for some reason
-- arrow_typeof(arrow_cast(1, 'Float16')) as col_f16,
arrow_typeof(arrow_cast(1, 'Float32')) as col_f32,
arrow_typeof(arrow_cast(1, 'Float64')) as col_f64,
arrow_typeof(arrow_cast('foo', 'Utf8')) as col_utf8,
arrow_typeof(arrow_cast('foo', 'LargeUtf8')) as col_large_utf8,
arrow_typeof(arrow_cast('foo', 'Binary')) as col_binary,
arrow_typeof(arrow_cast('foo', 'LargeBinary')) as col_large_binary,
arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Second, None)')) as col_ts_s,
arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Millisecond, None)')) as col_ts_ms,
arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Microsecond, None)')) as col_ts_us,
arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Nanosecond, None)')) as col_ts_ns,
arrow_typeof(arrow_cast('foo', 'Dictionary(Int32, Utf8)')) as col_dict
----
Int8 Int16 Int32 Int64 UInt8 UInt16 UInt32 UInt64 Float32 Float64 Utf8 LargeUtf8 Binary LargeBinary Timestamp(Second, None) Timestamp(Millisecond, None) Timestamp(Microsecond, None) Timestamp(Nanosecond, None) Dictionary(Int32, Utf8)




## Basic Types: Create a table

statement ok
create table foo as select
arrow_cast(1, 'Int8') as col_i8,
arrow_cast(1, 'Int16') as col_i16,
arrow_cast(1, 'Int32') as col_i32,
arrow_cast(1, 'Int64') as col_i64,
arrow_cast(1, 'UInt8') as col_u8,
arrow_cast(1, 'UInt16') as col_u16,
arrow_cast(1, 'UInt32') as col_u32,
arrow_cast(1, 'UInt64') as col_u64,
-- can't seem to cast to Float16 for some reason
-- arrow_cast(1.0, 'Float16') as col_f16,
arrow_cast(1.0, 'Float32') as col_f32,
arrow_cast(1.0, 'Float64') as col_f64
;

## Ensure each column in the table has the expected type

query TTTTTTTTTT
SELECT
arrow_typeof(col_i8),
arrow_typeof(col_i16),
arrow_typeof(col_i32),
arrow_typeof(col_i64),
arrow_typeof(col_u8),
arrow_typeof(col_u16),
arrow_typeof(col_u32),
arrow_typeof(col_u64),
-- arrow_typeof(col_f16),
arrow_typeof(col_f32),
arrow_typeof(col_f64)
FROM foo;
----
Int8 Int16 Int32 Int64 UInt8 UInt16 UInt32 UInt64 Float32 Float64


statement ok
drop table foo

## Decimals: Create a table

statement ok
create table foo as select
arrow_cast(100, 'Decimal128(3,2)') as col_d128
-- Can't make a decimal 156:
-- This feature is not implemented: Can't create a scalar from array of type "Decimal256(3, 2)"
--arrow_cast(100, 'Decimal256(3,2)') as col_d256
;


## Ensure each column in the table has the expected type

query T
SELECT
arrow_typeof(col_d128)
-- arrow_typeof(col_d256),
FROM foo;
----
Decimal128(3, 2)


statement ok
drop table foo

## Strings, Binary: Create a table

statement ok
create table foo as select
arrow_cast('foo', 'Utf8') as col_utf8,
arrow_cast('foo', 'LargeUtf8') as col_large_utf8,
arrow_cast('foo', 'Binary') as col_binary,
arrow_cast('foo', 'LargeBinary') as col_large_binary
;

## Ensure each column in the table has the expected type

query TTTT
SELECT
arrow_typeof(col_utf8),
arrow_typeof(col_large_utf8),
arrow_typeof(col_binary),
arrow_typeof(col_large_binary)
FROM foo;
----
Utf8 LargeUtf8 Binary LargeBinary


statement ok
drop table foo


## Timestamps: Create a table

statement ok
create table foo as select
arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Second, None)') as col_ts_s,
arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Millisecond, None)') as col_ts_ms,
arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Microsecond, None)') as col_ts_us,
arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Nanosecond, None)') as col_ts_ns
;

## Ensure each column in the table has the expected type

query TTTT
SELECT
arrow_typeof(col_ts_s),
arrow_typeof(col_ts_ms),
arrow_typeof(col_ts_us),
arrow_typeof(col_ts_ns)
FROM foo;
----
Timestamp(Second, None) Timestamp(Millisecond, None) Timestamp(Microsecond, None) Timestamp(Nanosecond, None)


statement ok
drop table foo

## Dictionaries

statement ok
create table foo as select
arrow_cast('foo', 'Dictionary(Int32, Utf8)') as col_dict_int32_utf8,
arrow_cast('foo', 'Dictionary(Int8, LargeUtf8)') as col_dict_int8_largeutf8
;

## Ensure each column in the table has the expected type

query TT
SELECT
arrow_typeof(col_dict_int32_utf8),
arrow_typeof(col_dict_int8_largeutf8)
FROM foo;
----
Dictionary(Int32, Utf8) Dictionary(Int8, LargeUtf8)


statement ok
drop table foo


## Intervals:

query error Cannot automatically convert Interval\(DayTime\) to Interval\(MonthDayNano\)
---
select arrow_cast(interval '30 minutes', 'Interval(MonthDayNano)');

query error DataFusion error: Error during planning: Cannot automatically convert Utf8 to Interval\(MonthDayNano\)
select arrow_cast('30 minutes', 'Interval(MonthDayNano)');


## Duration

query error Cannot automatically convert Interval\(DayTime\) to Duration\(Second\)
---
select arrow_cast(interval '30 minutes', 'Duration(Second)');

query error DataFusion error: Error during planning: Cannot automatically convert Utf8 to Duration\(Second\)
select arrow_cast('30 minutes', 'Duration(Second)');
5 changes: 4 additions & 1 deletion datafusion/proto/src/logical_plan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2120,8 +2120,11 @@ mod roundtrip_tests {
DataType::Float16,
DataType::Float32,
DataType::Float64,
// Add more timestamp tests
DataType::Timestamp(TimeUnit::Second, None),
DataType::Timestamp(TimeUnit::Millisecond, None),
DataType::Timestamp(TimeUnit::Microsecond, None),
DataType::Timestamp(TimeUnit::Nanosecond, None),
DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())),
DataType::Date32,
DataType::Date64,
DataType::Time32(TimeUnit::Second),
Expand Down
Loading

0 comments on commit e46924d

Please sign in to comment.