From 3ca41f50d0e8b6da95d83e5bf0b09fd518e2110f Mon Sep 17 00:00:00 2001
From: Jefffrey <22608443+Jefffrey@users.noreply.github.com>
Date: Sun, 13 Nov 2022 21:40:17 +1100
Subject: [PATCH] Parse Time32/Time64 from formatted string

---
Reviewer notes (this section is ignored when the patch is applied):

* Replaces the `parser_primitive!` impls for the four Time32/Time64 types
  with hand-written `Parser` impls. Each impl tries a fixed list of chrono
  format strings in order (fractional 12h/24h forms first, then `H:M:S`,
  then `H:M`) and uses the first one that `NaiveTime::parse_from_str`
  accepts.
* The parsed time is converted to the type's unit counted from midnight
  (seconds, milliseconds, microseconds or nanoseconds).
* If no format matches, the string is parsed as a plain integer via
  `str::parse`, so raw numeric input such as "1" is still accepted.
* `Time32SecondType` has no fractional-second formats in its list.
* The CSV reader gains `DataType::Time32` / `DataType::Time64` arms that
  call `build_primitive_array` with `None` as the last argument.

 arrow-cast/src/parse.rs | 309 +++++++++++++++++++++++++++++++++++++++-
 arrow-csv/src/reader.rs |  35 +++++
 2 files changed, 340 insertions(+), 4 deletions(-)

diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs
index b93d6c800240..9392b5d272a4 100644
--- a/arrow-cast/src/parse.rs
+++ b/arrow-cast/src/parse.rs
@@ -199,10 +199,123 @@ impl Parser for TimestampSecondType {
     }
 }
 
-parser_primitive!(Time64NanosecondType);
-parser_primitive!(Time64MicrosecondType);
-parser_primitive!(Time32MillisecondType);
-parser_primitive!(Time32SecondType);
+impl Parser for Time64NanosecondType {
+    fn parse(string: &str) -> Option<Self::Native> {
+        [
+            "%I:%M:%S%.9f %P",
+            "%I:%M:%S%.9f %p",
+            "%l:%M:%S%.9f %P",
+            "%l:%M:%S%.9f %p",
+            "%H:%M:%S%.9f",
+            "%k:%M:%S%.9f",
+            "%I:%M:%S %P",
+            "%I:%M:%S %p",
+            "%l:%M:%S %P",
+            "%l:%M:%S %p",
+            "%H:%M:%S",
+            "%k:%M:%S",
+            "%I:%M %P",
+            "%I:%M %p",
+            "%l:%M %P",
+            "%l:%M %p",
+            "%H:%M",
+            "%k:%M",
+        ]
+        .iter()
+        .find_map(|f| NaiveTime::parse_from_str(string, f).ok())
+        .map(|nt| {
+            nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64
+        })
+        .or_else(|| string.parse::<Self::Native>().ok())
+    }
+}
+
+impl Parser for Time64MicrosecondType {
+    fn parse(string: &str) -> Option<Self::Native> {
+        [
+            "%I:%M:%S%.6f %P",
+            "%I:%M:%S%.6f %p",
+            "%l:%M:%S%.6f %P",
+            "%l:%M:%S%.6f %p",
+            "%H:%M:%S%.6f",
+            "%k:%M:%S%.6f",
+            "%I:%M:%S %P",
+            "%I:%M:%S %p",
+            "%l:%M:%S %P",
+            "%l:%M:%S %p",
+            "%H:%M:%S",
+            "%k:%M:%S",
+            "%I:%M %P",
+            "%I:%M %p",
+            "%l:%M %P",
+            "%l:%M %p",
+            "%H:%M",
+            "%k:%M",
+        ]
+        .iter()
+        .find_map(|f| NaiveTime::parse_from_str(string, f).ok())
+        .map(|nt| {
+            nt.num_seconds_from_midnight() as i64 * 1_000_000
+                + (nt.nanosecond() as i64) / 1_000
+        })
+        .or_else(|| string.parse::<Self::Native>().ok())
+    }
+}
+
+impl Parser for Time32MillisecondType {
+    fn parse(string: &str) -> Option<Self::Native> {
+        [
+            "%I:%M:%S%.3f %P",
+            "%I:%M:%S%.3f %p",
+            "%l:%M:%S%.3f %P",
+            "%l:%M:%S%.3f %p",
+            "%H:%M:%S%.3f",
+            "%k:%M:%S%.3f",
+            "%I:%M:%S %P",
+            "%I:%M:%S %p",
+            "%l:%M:%S %P",
+            "%l:%M:%S %p",
+            "%H:%M:%S",
+            "%k:%M:%S",
+            "%I:%M %P",
+            "%I:%M %p",
+            "%l:%M %P",
+            "%l:%M %p",
+            "%H:%M",
+            "%k:%M",
+        ]
+        .iter()
+        .find_map(|f| NaiveTime::parse_from_str(string, f).ok())
+        .map(|nt| {
+            nt.num_seconds_from_midnight() as i32 * 1_000
+                + nt.nanosecond() as i32 / 1_000_000
+        })
+        .or_else(|| string.parse::<Self::Native>().ok())
+    }
+}
+
+impl Parser for Time32SecondType {
+    fn parse(string: &str) -> Option<Self::Native> {
+        [
+            "%I:%M:%S %P",
+            "%I:%M:%S %p",
+            "%l:%M:%S %P",
+            "%l:%M:%S %p",
+            "%H:%M:%S",
+            "%k:%M:%S",
+            "%I:%M %P",
+            "%I:%M %p",
+            "%l:%M %P",
+            "%l:%M %p",
+            "%H:%M",
+            "%k:%M",
+        ]
+        .iter()
+        .find_map(|f| NaiveTime::parse_from_str(string, f).ok())
+        .map(|nt| nt.num_seconds_from_midnight() as i32)
+        .or_else(|| string.parse::<Self::Native>().ok())
+    }
+}
 
 /// Number of days between 0001-01-01 and 1970-01-01
 const EPOCH_DAYS_FROM_CE: i32 = 719_163;
@@ -411,4 +524,192 @@ mod tests {
             parse_timestamp("2020-09-08 13:42:29").unwrap()
         );
     }
+
+    #[test]
+    fn parse_time64_nanos() {
+        assert_eq!(
+            Time64NanosecondType::parse("12:10:01.123456789 AM"),
+            Some(601_123_456_789)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("12:10:01.123456789 am"),
+            Some(601_123_456_789)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("2:10:01.12345678 PM"),
+            Some(51_001_123_456_780)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("2:10:01.12345678 pm"),
+            Some(51_001_123_456_780)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("02:10:01.1234567"),
+            Some(7_801_123_456_700)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("2:10:01.1234567"),
+            Some(7_801_123_456_700)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("12:10:01 AM"),
+            Some(601_000_000_000)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("12:10:01 am"),
+            Some(601_000_000_000)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("2:10:01 PM"),
+            Some(51_001_000_000_000)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("2:10:01 pm"),
+            Some(51_001_000_000_000)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("02:10:01"),
+            Some(7_801_000_000_000)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("2:10:01"),
+            Some(7_801_000_000_000)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("12:10 AM"),
+            Some(600_000_000_000)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("12:10 am"),
+            Some(600_000_000_000)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("2:10 PM"),
+            Some(51_000_000_000_000)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("2:10 pm"),
+            Some(51_000_000_000_000)
+        );
+        assert_eq!(
+            Time64NanosecondType::parse("02:10"),
+            Some(7_800_000_000_000)
+        );
+        assert_eq!(Time64NanosecondType::parse("2:10"), Some(7_800_000_000_000));
+        assert_eq!(Time64NanosecondType::parse("1"), Some(1));
+    }
+
+    #[test]
+    fn parse_time64_micros() {
+        assert_eq!(
+            Time64MicrosecondType::parse("12:10:01.123456 AM"),
+            Some(601_123_456)
+        );
+        assert_eq!(
+            Time64MicrosecondType::parse("12:10:01.123456 am"),
+            Some(601_123_456)
+        );
+        assert_eq!(
+            Time64MicrosecondType::parse("2:10:01.12345 PM"),
+            Some(51_001_123_450)
+        );
+        assert_eq!(
+            Time64MicrosecondType::parse("2:10:01.12345 pm"),
+            Some(51_001_123_450)
+        );
+        assert_eq!(
+            Time64MicrosecondType::parse("02:10:01.1234"),
+            Some(7_801_123_400)
+        );
+        assert_eq!(
+            Time64MicrosecondType::parse("2:10:01.1234"),
+            Some(7_801_123_400)
+        );
+        assert_eq!(
+            Time64MicrosecondType::parse("12:10:01 AM"),
+            Some(601_000_000)
+        );
+        assert_eq!(
+            Time64MicrosecondType::parse("12:10:01 am"),
+            Some(601_000_000)
+        );
+        assert_eq!(
+            Time64MicrosecondType::parse("2:10:01 PM"),
+            Some(51_001_000_000)
+        );
+        assert_eq!(
+            Time64MicrosecondType::parse("2:10:01 pm"),
+            Some(51_001_000_000)
+        );
+        assert_eq!(
+            Time64MicrosecondType::parse("02:10:01"),
+            Some(7_801_000_000)
+        );
+        assert_eq!(Time64MicrosecondType::parse("2:10:01"), Some(7_801_000_000));
+        assert_eq!(Time64MicrosecondType::parse("12:10 AM"), Some(600_000_000));
+        assert_eq!(Time64MicrosecondType::parse("12:10 am"), Some(600_000_000));
+        assert_eq!(
+            Time64MicrosecondType::parse("2:10 PM"),
+            Some(51_000_000_000)
+        );
+        assert_eq!(
+            Time64MicrosecondType::parse("2:10 pm"),
+            Some(51_000_000_000)
+        );
+        assert_eq!(Time64MicrosecondType::parse("02:10"), Some(7_800_000_000));
+        assert_eq!(Time64MicrosecondType::parse("2:10"), Some(7_800_000_000));
+        assert_eq!(Time64MicrosecondType::parse("1"), Some(1));
+    }
+
+    #[test]
+    fn parse_time32_millis() {
+        assert_eq!(
+            Time32MillisecondType::parse("12:10:01.123 AM"),
+            Some(601_123)
+        );
+        assert_eq!(
+            Time32MillisecondType::parse("12:10:01.123 am"),
+            Some(601_123)
+        );
+        assert_eq!(
+            Time32MillisecondType::parse("2:10:01.12 PM"),
+            Some(51_001_120)
+        );
+        assert_eq!(
+            Time32MillisecondType::parse("2:10:01.12 pm"),
+            Some(51_001_120)
+        );
+        assert_eq!(Time32MillisecondType::parse("02:10:01.1"), Some(7_801_100));
+        assert_eq!(Time32MillisecondType::parse("2:10:01.1"), Some(7_801_100));
+        assert_eq!(Time32MillisecondType::parse("12:10:01 AM"), Some(601_000));
+        assert_eq!(Time32MillisecondType::parse("12:10:01 am"), Some(601_000));
+        assert_eq!(Time32MillisecondType::parse("2:10:01 PM"), Some(51_001_000));
+        assert_eq!(Time32MillisecondType::parse("2:10:01 pm"), Some(51_001_000));
+        assert_eq!(Time32MillisecondType::parse("02:10:01"), Some(7_801_000));
+        assert_eq!(Time32MillisecondType::parse("2:10:01"), Some(7_801_000));
+        assert_eq!(Time32MillisecondType::parse("12:10 AM"), Some(600_000));
+        assert_eq!(Time32MillisecondType::parse("12:10 am"), Some(600_000));
+        assert_eq!(Time32MillisecondType::parse("2:10 PM"), Some(51_000_000));
+        assert_eq!(Time32MillisecondType::parse("2:10 pm"), Some(51_000_000));
+        assert_eq!(Time32MillisecondType::parse("02:10"), Some(7_800_000));
+        assert_eq!(Time32MillisecondType::parse("2:10"), Some(7_800_000));
+        assert_eq!(Time32MillisecondType::parse("1"), Some(1));
+    }
+
+    #[test]
+    fn parse_time32_secs() {
+        assert_eq!(Time32SecondType::parse("12:10:01 AM"), Some(601));
+        assert_eq!(Time32SecondType::parse("12:10:01 am"), Some(601));
+        assert_eq!(Time32SecondType::parse("2:10:01 PM"), Some(51_001));
+        assert_eq!(Time32SecondType::parse("2:10:01 pm"), Some(51_001));
+        assert_eq!(Time32SecondType::parse("02:10:01"), Some(7_801));
+        assert_eq!(Time32SecondType::parse("2:10:01"), Some(7_801));
+        assert_eq!(Time32SecondType::parse("12:10 AM"), Some(600));
+        assert_eq!(Time32SecondType::parse("12:10 am"), Some(600));
+        assert_eq!(Time32SecondType::parse("2:10 PM"), Some(51_000));
+        assert_eq!(Time32SecondType::parse("2:10 pm"), Some(51_000));
+        assert_eq!(Time32SecondType::parse("02:10"), Some(7_800));
+        assert_eq!(Time32SecondType::parse("2:10"), Some(7_800));
+        assert_eq!(Time32SecondType::parse("1"), Some(1));
+    }
 }
diff --git a/arrow-csv/src/reader.rs b/arrow-csv/src/reader.rs
index 0bf05960a37d..4200e9329c54 100644
--- a/arrow-csv/src/reader.rs
+++ b/arrow-csv/src/reader.rs
@@ -584,6 +584,24 @@ fn parse(
                     i,
                     datetime_format,
                 ),
+                DataType::Time32(TimeUnit::Second) => {
+                    build_primitive_array::<Time32SecondType>(line_number, rows, i, None)
+                }
+                DataType::Time32(TimeUnit::Millisecond) => build_primitive_array::<
+                    Time32MillisecondType,
+                >(
+                    line_number, rows, i, None
+                ),
+                DataType::Time64(TimeUnit::Microsecond) => build_primitive_array::<
+                    Time64MicrosecondType,
+                >(
+                    line_number, rows, i, None
+                ),
+                DataType::Time64(TimeUnit::Nanosecond) => build_primitive_array::<
+                    Time64NanosecondType,
+                >(
+                    line_number, rows, i, None
+                ),
                 DataType::Timestamp(TimeUnit::Microsecond, _) => {
                     build_primitive_array::<TimestampMicrosecondType>(
                         line_number,
@@ -1593,6 +1611,23 @@ mod tests {
         assert_eq!(parse_item::<Date32Type>("1945-05-08").unwrap(), -9004);
     }
 
+    #[test]
+    fn parse_time() {
+        assert_eq!(
+            parse_item::<Time64NanosecondType>("12:10:01.123456789 AM"),
+            Some(601_123_456_789)
+        );
+        assert_eq!(
+            parse_item::<Time64MicrosecondType>("12:10:01.123456 am"),
+            Some(601_123_456)
+        );
+        assert_eq!(
+            parse_item::<Time32MillisecondType>("2:10:01.12 PM"),
+            Some(51_001_120)
+        );
+        assert_eq!(parse_item::<Time32SecondType>("2:10:01 pm"), Some(51_001));
+    }
+
     #[test]
     fn parse_date64() {
         assert_eq!(parse_item::<Date64Type>("1970-01-01T00:00:00").unwrap(), 0);