Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for timestamps #70

Merged
merged 18 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,19 @@ defaults:
jobs:
build:
runs-on: ubuntu-22.04
name: ${{ matrix.sys.compiler }}-${{ matrix.sys.version }}-${{ matrix.sys.stdlib }}-${{ matrix.config.name }}
name: ${{ matrix.sys.compiler }} / ${{ matrix.sys.version }} / ${{ matrix.sys.stdlib }} / ${{ matrix.config.name }} / date-polyfill ${{ matrix.sys.date-polyfill}}
strategy:
fail-fast: false
matrix:
sys:
- {compiler: clang, version: '16', config-flags: '', stdlib: 'libstdc++-12' }
# - {compiler: clang, version: '16', config-flags: '-DCMAKE_CXX_FLAGS=-stdlib=libc++', stdlib: 'libc++-17' }
- {compiler: clang, version: '17', config-flags: '', stdlib: 'libstdc++-12' }
# - {compiler: clang, version: '17', config-flags: '-DCMAKE_CXX_FLAGS=-stdlib=libc++', stdlib: 'libc++-17' }
- {compiler: gcc, version: '12', config-flags: '' }
- {compiler: gcc, version: '13', config-flags: '' }
- {compiler: clang, version: '16', config-flags: '', stdlib: 'libstdc++-12', date-polyfill: 'ON' }
# - {compiler: clang, version: '16', config-flags: '-DCMAKE_CXX_FLAGS=-stdlib=libc++', stdlib: 'libc++-17', date-polyfill: 'ON' }
- {compiler: clang, version: '17', config-flags: '', stdlib: 'libstdc++-12', date-polyfill: 'ON' }
# - {compiler: clang, version: '17', config-flags: '-DCMAKE_CXX_FLAGS=-stdlib=libc++', stdlib: 'libc++-17', date-polyfill: 'ON' }

- {compiler: gcc, version: '12', config-flags: '', date-polyfill: 'ON' }
- {compiler: gcc, version: '13', config-flags: '', date-polyfill: 'ON' }
- {compiler: gcc, version: '13', config-flags: '', date-polyfill: 'OFF' }

config:
- { name: Debug }
Expand Down Expand Up @@ -61,7 +63,7 @@ jobs:
cache-downloads: true

- name: Configure using CMake
run: cmake -G Ninja -Bbuild ${{matrix.sys.config-flags}} -DCMAKE_BUILD_TYPE:STRING=${{matrix.config.name}} -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DBUILD_TESTS=ON
run: cmake -G Ninja -Bbuild ${{matrix.sys.config-flags}} -DCMAKE_BUILD_TYPE:STRING=${{matrix.config.name}} -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DUSE_DATE_POLYFILL=${{matrix.sys.date-polyfill}} -DBUILD_TESTS=ON

- name: Install
working-directory: build
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/osx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ defaults:
jobs:
build:
runs-on: macos-${{ matrix.os }}
name: ${{ matrix.os }}-${{ matrix.config.name }}-${{ matrix.compiler }}
name: ${{ matrix.os }} / ${{ matrix.config.name }} / ${{ matrix.compiler }}
strategy:
fail-fast: false
matrix:
Expand Down
18 changes: 13 additions & 5 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,16 @@ defaults:
jobs:
build:
runs-on: ${{ matrix.runs-on }}
name: ${{ matrix.sys.compiler }}-${{ matrix.build-system }}-${{ matrix.config.name }}
name: ${{ matrix.sys.compiler }} / ${{ matrix.build-system }} / ${{ matrix.config.name }} / date-polyfill ${{ matrix.sys.date-polyfill}}
strategy:
fail-fast: false
matrix:
runs-on: [windows-latest]
sys:
- {compiler: default}
- {compiler: msvc}
- {compiler: clang}
- { compiler: msvc, date-polyfill: 'ON' }
- { compiler: msvc, date-polyfill: 'OFF' }
- { compiler: clang, date-polyfill: 'ON', version: 17 }
- { compiler: clang, date-polyfill: 'OFF', version: 17 }
config:
- { name: Debug }
- { name: Release }
Expand All @@ -35,6 +36,13 @@ jobs:
if: matrix.sys.compiler == 'msvc' && matrix.build-system != 'Visual Studio 17 2022'
uses: ilammy/msvc-dev-cmd@v1

- name: Install LLVM and Clang
if: matrix.sys.compiler == 'clang'
uses: egor-tensin/setup-clang@v1
with:
version: ${{matrix.sys.version}}
platform: x64

- name: Setup clang
if: matrix.sys.compiler == 'clang'
run: |
Expand All @@ -55,7 +63,7 @@ jobs:
ninja

- name: Configure using CMake
run: cmake -Bbuild -DCMAKE_BUILD_TYPE:STRING=${{matrix.config.name}} -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DBUILD_TESTS=ON -G "${{matrix.build-system}}"
run: cmake -Bbuild -DCMAKE_BUILD_TYPE:STRING=${{matrix.config.name}} -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DBUILD_TESTS=ON -DUSE_DATE_POLYFILL=${{matrix.sys.date-polyfill}} -G "${{matrix.build-system}}"

- name: Install
working-directory: build
Expand Down
31 changes: 29 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,24 @@ message(STATUS "Building sparrow v${${PROJECT_NAME}_VERSION}")
# Build options
# =============

OPTION(BUILD_TESTS "sparrow test suite" OFF)
OPTION(BUILD_TESTS "Build sparrow test suite" OFF)
OPTION(USE_DATE_POLYFILL "Use date polyfill implementation" ON)

include(CheckCXXSymbolExists)

if(cxx_std_20 IN_LIST CMAKE_CXX_COMPILE_FEATURES)
set(header version)
else()
set(header ciso646)
endif()

check_cxx_symbol_exists(_LIBCPP_VERSION ${header} LIBCPP)
if(LIBCPP)
message(STATUS "Using libc++")
# Allow the use of not visible yet availabile features, such
# as some formatter for new types.
add_compile_definitions(_LIBCPP_DISABLE_AVAILABILITY)
endif()

# Linter options
# =============
Expand All @@ -55,6 +72,14 @@ endif()
# Dependencies
# ============

set(SPARROW_INTERFACE_DEPENDENCIES "" CACHE STRING "List of dependencies to be linked to the sparrow target")

if (USE_DATE_POLYFILL)
find_package(date CONFIG REQUIRED)
list(APPEND SPARROW_INTERFACE_DEPENDENCIES date::date date::date-tz)
add_compile_definitions(SPARROW_USE_DATE_POLYFILL)
endif()

# Build
# =====

Expand All @@ -65,7 +90,7 @@ set(SPARROW_HEADERS
${SPARROW_INCLUDE_DIR}/sparrow/buffer.hpp
${SPARROW_INCLUDE_DIR}/sparrow/buffer_view.hpp
${SPARROW_INCLUDE_DIR}/sparrow/config.hpp
${SPARROW_INCLUDE_DIR}/sparrow/contracts.hpp
${SPARROW_INCLUDE_DIR}/sparrow/contracts.hpp
${SPARROW_INCLUDE_DIR}/sparrow/data_traits.hpp
${SPARROW_INCLUDE_DIR}/sparrow/data_type.hpp
${SPARROW_INCLUDE_DIR}/sparrow/dynamic_bitset.hpp
Expand All @@ -86,6 +111,8 @@ target_include_directories(sparrow INTERFACE
$<BUILD_INTERFACE:${SPARROW_INCLUDE_DIR}>
$<INSTALL_INTERFACE:include>)

target_link_libraries(sparrow INTERFACE ${SPARROW_INTERFACE_DEPENDENCIES})

# We do not use non-standard C++
set_target_properties(sparrow PROPERTIES CMAKE_CXX_EXTENSIONS OFF)
target_compile_features(sparrow INTERFACE cxx_std_20)
Expand Down
5 changes: 5 additions & 0 deletions environment-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,8 @@ dependencies:
- ninja
# Tests
- doctest
# P0355R7 (Extending chrono to Calendars and Time Zones) has not been entirely implemented in libc++ yet.
# See: https://libcxx.llvm.org/Status/Cxx20.html#note-p0355
# For now, we use HowardHinnant/date as a replacement if we are compiling with libc++.
# TODO: remove this once libc++ has full support for P0355R7.
- howardhinnant_date
2 changes: 1 addition & 1 deletion include/sparrow/buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -722,7 +722,7 @@ namespace sparrow
{
if (new_size > size())
{
const size_t nb_init = new_size - size();
const std::size_t nb_init = new_size - size();
if (new_size <= capacity())
{
initializer(nb_init);
Expand Down
8 changes: 7 additions & 1 deletion include/sparrow/data_traits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@ namespace sparrow
using default_layout = variable_size_binary_layout<value_type, std::span<byte_t>, const std::span<byte_t>>; // FIXME: this is incorrect, change when we have the right types
};

template <>
struct arrow_traits<timestamp> : common_native_types_traits<timestamp>
{
static constexpr data_type type_id = data_type::TIMESTAMP;
};

namespace predicate
{

Expand All @@ -147,4 +153,4 @@ namespace sparrow
} constexpr has_arrow_traits;
}

}
}
56 changes: 37 additions & 19 deletions include/sparrow/data_type.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2024 Man Group Operations Limited

Check notice on line 1 in include/sparrow/data_type.hpp

View workflow job for this annotation

GitHub Actions / build

Run clang-format on include/sparrow/data_type.hpp

File include/sparrow/data_type.hpp does not conform to Custom style guidelines. (lines 17, 21, 23, 36)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -14,6 +14,15 @@

#pragma once

#include <version>
#include <chrono>

#if defined(SPARROW_USE_DATE_POLYFILL)
#include <date/tz.h>
#else
namespace date = std::chrono;
#endif

#include <climits>
#include <cstdint>
#include <optional>
Expand Down Expand Up @@ -45,6 +54,13 @@
using float64_t = std::float64_t;
#endif

// P0355R7 (Extending chrono to Calendars and Time Zones) has not been entirely implemented in libc++ yet.
// See: https://libcxx.llvm.org/Status/Cxx20.html#note-p0355
// For now, we use HowardHinnant/date as a replacement if we are compiling with libc++.
// TODO: use the following once libc++ has full support for P0355R7.
// using timestamp = std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
using timestamp = date::zoned_time<std::chrono::nanoseconds>;

// We need to be sure the current target platform is setup to support correctly these types.
static_assert(sizeof(float16_t) == 2);
static_assert(sizeof(float32_t) == 4);
Expand All @@ -64,32 +80,33 @@
enum class data_type
{
NA = 0,
BOOL,
UINT8,
INT8,
UINT16,
INT16,
UINT32,
INT32,
UINT64,
INT64,
HALF_FLOAT,
FLOAT,
DOUBLE,
BOOL = 1,
UINT8 = 2,
INT8 = 3,
UINT16 = 4,
INT16 = 5,
UINT32 = 6,
INT32 = 7,
UINT64 = 8,
INT64 = 9,
HALF_FLOAT = 10,
FLOAT = 11,
DOUBLE = 12,
// UTF8 variable-length string
STRING,
STRING = 13,
// Variable-length bytes (no guarantee of UTF8-ness)
BINARY,
BINARY = 14,
// Fixed-size binary. Each value occupies the same number of bytes
FIXED_SIZE_BINARY,
FIXED_SIZE_BINARY = 15,
// Number of nanoseconds since the UNIX epoch with an optional timezone.
// See: https://arrow.apache.org/docs/python/timestamps.html#timestamps
TIMESTAMP = 18,
};

/// C++ types value representation types matching Arrow types.
// NOTE: this needs to be in sync-order with `data_type`
using all_base_types_t = mpl::typelist<
std::nullopt_t // REVIEW: not sure about if we need to have this one? for representing NA? is this
// the right type?
,
std::nullopt_t,
bool,
std::uint8_t,
std::int8_t,
Expand All @@ -103,7 +120,8 @@
float32_t,
float64_t,
std::string,
std::vector<byte_t>
std::vector<byte_t>,
sparrow::timestamp
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there is one type missing before that one, see compare with the order of the enum, BINARY matches std::vector<byte_t> but we dont have something for FIXED_SIZE_BINARY. Not sure which type is missing though. @JohanMabille any idea what it would match?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think FIXED_SIZE_BINARY is meant to be used for a fixed size binary type which is not a primitive type, in which case we could use a struct as a type to test FIXED_SIZE_BINARY.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I took the values from Arrow implementation, not sure which type this value should match.

Copy link
Collaborator Author

@jjerphan jjerphan May 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, must we work on supporting DATE32 and DATE64 which come before TIMESTAMP?

Edit: I guess we could support TIMESTAMP first if we were to have explicit integer identifiers for those types as introduced in arrow-cpp with apache/arrow#37149 as proposed by 2f961ef

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The support for DATE32 and DATE64 can be added later. Explicit integer identifiers would be required later for backward compatibility, but it doe snot matter for now, nor the enum "stability".

// TODO: add missing fundamental types here
>;

Expand Down
2 changes: 2 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ endif()

set(SPARROW_TESTS_SOURCES
main.cpp
array_data_creation.cpp
test_algorithm.cpp
test_allocator.cpp
test_array_data_creation.cpp
Expand All @@ -47,6 +48,7 @@ set(SPARROW_TESTS_SOURCES
test_mpl.cpp
test_traits.cpp
test_typed_array.cpp
test_typed_array_timestamp.cpp
test_variable_size_binary_layout.cpp
)
set(test_target "test_sparrow_lib")
Expand Down
47 changes: 47 additions & 0 deletions test/array_data_creation.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#include "array_data_creation.hpp"

Check notice on line 1 in test/array_data_creation.cpp

View workflow job for this annotation

GitHub Actions / build

Run clang-format on test/array_data_creation.cpp

File test/array_data_creation.cpp does not conform to Custom style guidelines. (lines 8, 9, 19, 20, 38)

#include <chrono>
#include <stdexcept>
#include <vector>

#if defined(SPARROW_USE_DATE_POLYFILL)
#include <date/date.h>
#include <date/tz.h>
#else
namespace date = std::chrono;
#endif

namespace sparrow::test
{
using sys_time = std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;

template <>
sparrow::array_data
make_test_array_data<sparrow::timestamp>(std::size_t n, std::size_t offset, const std::vector<std::size_t>& false_bitmap)
{
sparrow::array_data ad;
ad.type = sparrow::data_descriptor(sparrow::arrow_traits<sparrow::timestamp>::type_id);
ad.bitmap = sparrow::dynamic_bitset<uint8_t>(n, true);
for (const auto i : false_bitmap)
{
if (i >= n)
{
throw std::invalid_argument("Index out of range");
}
ad.bitmap.set(i, false);
}
const std::size_t buffer_size = (n * sizeof(sparrow::timestamp)) / sizeof(uint8_t);
sparrow::buffer<uint8_t> b(buffer_size);

for (uint8_t i = 0; i < n; ++i)
{
b.data<sparrow::timestamp>()[i] = sparrow::timestamp(date::sys_days(date::year(1970)/date::January/date::day(1)) + date::days(i));
}

ad.buffers.push_back(b);
ad.length = static_cast<std::int64_t>(n);
ad.offset = static_cast<std::int64_t>(offset);
ad.child_data.emplace_back();
return ad;
}
} // namespace sparrow::test
4 changes: 4 additions & 0 deletions test/array_data_creation.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2024 Man Group Operations Limited

Check notice on line 1 in test/array_data_creation.hpp

View workflow job for this annotation

GitHub Actions / build

Run clang-format on test/array_data_creation.hpp

File test/array_data_creation.hpp does not conform to Custom style guidelines. (lines 24, 27, 30, 106)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -61,6 +61,10 @@
return ad;
}

template <>
sparrow::array_data
make_test_array_data<sparrow::timestamp>(size_t n, size_t offset, const std::vector<size_t>& false_bitmap);

// Creates an array_data object for testing with std::string elements.
//
// param n The number of elements in the array.
Expand Down
Loading
Loading