Skip to content

Commit

Permalink
dict encoded
Browse files Browse the repository at this point in the history
  • Loading branch information
DerThorsten committed Nov 17, 2024
1 parent c692392 commit 14cd756
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 23 deletions.
44 changes: 30 additions & 14 deletions include/sparrow/builder/builder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ auto build_impl(T&& t, [[maybe_unused]] sparrow::mpl::typelist<OPTION_FLAGS...>

template <class T>
concept translates_to_dict_encoded =
is_lazy_dict_encoded_vector<T>
is_lazy_dict_encoded_vector<T> ||
is_lazy_dict_tagged_type<T>
;

template<class T>
Expand Down Expand Up @@ -170,14 +171,21 @@ struct builder<T, OPTION_FLAGS>
template<class U>
static type create(U && t)
{
auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
using raw_value_type = std::ranges::range_value_t<T>;

auto flat_list_view = tag<raw_value_type>(
std::ranges::views::join(ensure_value_range(t))
);

auto sizes = t | std::views::transform([](const auto& l){
return get_size_save(l);
});

auto typed_array = build_impl(flat_list_view, OPTION_FLAGS{});
auto detyped_array = array(std::move(typed_array));

return type(
array(build_impl(flat_list_view, OPTION_FLAGS{})),
std::move(detyped_array),
type::offset_from_sizes(sizes),
where_null(t)
);
Expand All @@ -189,11 +197,14 @@ struct builder<T, OPTION_FLAGS>
{
using type = sparrow::fixed_sized_list_array;
constexpr static std::size_t list_size = std::tuple_size_v<mnv_t<std::ranges::range_value_t<T>>>;
using raw_value_type = std::ranges::range_value_t<T>;

template<class U>
static type create(U && t)
{
auto flat_list_view = std::ranges::views::join(ensure_value_range(t));
auto flat_list_view = tag<raw_value_type>(
std::ranges::views::join(ensure_value_range(t))
);

return type(
static_cast<std::uint64_t>(list_size),
Expand All @@ -208,18 +219,23 @@ struct builder<T, OPTION_FLAGS>
{
using type = sparrow::struct_array;
static constexpr std::size_t n_children = std::tuple_size_v<mnv_t<std::ranges::range_value_t<T>>>;
using tuple_type = ensured_range_value_t<T>;

template<class U>
static type create(U&& t)
{
std::vector<array> detyped_children(n_children);
for_each_index<n_children>([&](auto i)
{
auto tuple_i_col = t | std::views::transform([](const auto& maybe_nullable_tuple)
{
auto tuple_i_col_raw = t | std::views::transform([](const auto& maybe_nullable_tuple)
{
const auto & tuple_val = ensure_value(maybe_nullable_tuple);
return std::get<decltype(i)::value>(tuple_val);
});


using tuple_element_type= std::tuple_element_t<decltype(i)::value, tuple_type>;
auto tuple_i_col = tag<tuple_element_type>(std::move(tuple_i_col_raw));
detyped_children[decltype(i)::value] = array(build_impl(tuple_i_col, OPTION_FLAGS{}));
});

Expand Down Expand Up @@ -269,11 +285,12 @@ struct builder<T, OPTION_FLAGS>
for_each_index<variant_size>([&](auto i)
{
using type_at_index = std::variant_alternative_t<decltype(i)::value, variant_type>;
auto type_i_col = t | std::views::transform([](const auto& variant)
auto type_i_col_raw= t | std::views::transform([](const auto& variant)
{
return variant.index() == decltype(i)::value ?
std::get<type_at_index>(variant) : type_at_index{};
});
auto type_i_col = tag<type_at_index>(std::move(type_i_col_raw));
detyped_children[decltype(i)::value] = array(build_impl(type_i_col, OPTION_FLAGS{}));
});

Expand All @@ -295,16 +312,17 @@ struct builder<T, OPTION_FLAGS>
template< translates_to_dict_encoded T, class OPTION_FLAGS>
struct builder<T, OPTION_FLAGS>
{
using key_type = dict_dict_encoded_key_t<std::decay_t<T>>;
using untagged_type = untagged_type_t<std::decay_t<T>>;
using key_type = dict_dict_encoded_key_t<untagged_type>;
using type = sparrow::dictionary_encoded_array<key_type>;

// keep the nulls
using raw_range_value_type = std::ranges::range_value_t<T>;
using raw_range_value_type = std::ranges::range_value_t<untagged_type>;

template<class U>
static type create(U && t)
static type create(U && tagged)
{

auto t = untag(std::forward<U>(tagged));
std::cout<<"dict encoded!"<<std::endl;
key_type key = 0;
std::map<raw_range_value_type, key_type, nested_less<raw_range_value_type>> value_map;
std::vector<raw_range_value_type> values;
Expand All @@ -325,8 +343,6 @@ struct builder<T, OPTION_FLAGS>
}

}

//auto keys = t | std::views::transform([&](const auto& v){ return value_to_key.find(v)->second; });
auto keys_buffer = u8_buffer<key_type>(keys);

auto values_array = build_impl(values, OPTION_FLAGS{});
Expand Down
94 changes: 94 additions & 0 deletions include/sparrow/builder/builder_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ namespace sparrow
{






template<class T, class KEY_TYPE = std::uint64_t>
class lazy_dict_encoded_vector : public std::vector<T>
{
Expand Down Expand Up @@ -64,6 +68,96 @@ namespace detail



/// Wrapper used to tag a value so that it is treated as dictionary encoded
/// (the builder's `translates_to_dict_encoded` concept accepts this type).
///
/// @tparam T           Type of the wrapped value. When `T` is a reference type
///                     the wrapper only refers to the value instead of owning it.
/// @tparam INDEX_TYPE  Type exposed as `index_type`.
template<class T, class INDEX_TYPE>
struct lazy_dict_tagged_type
{
    using index_type = INDEX_TYPE;
    using value_type = T;

    /// Builds the wrapper by perfectly forwarding `wrapped` into `value`.
    ///
    /// The overload is disabled when `U` is this wrapper type itself: an
    /// unconstrained forwarding constructor is a better match than the copy
    /// constructor for non-const lvalues, which made `wrapper b(a);` try to
    /// initialise `T` from a wrapper and fail to compile.
    template<
        class U,
        class = std::enable_if_t<!std::is_same_v<std::decay_t<U>, lazy_dict_tagged_type>>
    >
    lazy_dict_tagged_type(U&& wrapped)
        : value(std::forward<U>(wrapped))
    {
    }

    /// The tagged value (or a reference to it when `T` is a reference type).
    T value;
};

/// Satisfied when `mpl::is_type_instance_of_v` reports that the decayed `T`
/// is an instance of the `lazy_dict_tagged_type` template.
template<class T>
concept is_lazy_dict_tagged_type = sparrow::mpl::is_type_instance_of_v<
    std::decay_t<T>,
    lazy_dict_tagged_type
>;



/// Dispatcher used by `tag`: `PRED` is the type the tagging decision is based
/// on. Only the constrained specializations are defined.
template<class PRED>
struct tag_type;

/// `PRED` is not a lazy dict encoded vector: the value is passed through as is.
template<class PRED>
    requires(!is_lazy_dict_encoded_vector<PRED>)
struct tag_type<PRED>
{
    /// Hands `value` back untouched, preserving its value category.
    template<class U>
    static U&& tag(U&& value)
    {
        return static_cast<U&&>(value);
    }
};

/// `PRED` is a lazy dict encoded vector: the value gets wrapped in a
/// `lazy_dict_tagged_type`.
template<class PRED>
    requires(is_lazy_dict_encoded_vector<PRED>)
struct tag_type<PRED>
{
    /// Wraps `value` in a `lazy_dict_tagged_type<U, key_type>` and returns the
    /// wrapper by value. For an lvalue argument `U` is an lvalue reference, so
    /// the wrapper then refers to the argument instead of copying it.
    template<class U>
    static auto tag(U&& value)
    {
        // NOTE(review): the key type is looked up from the decayed argument
        // type `U`, not from `PRED` -- confirm this is intended.
        using key_type = typename dict_encoded_key_type<std::decay_t<U>>::type;
        using tagged_type = lazy_dict_tagged_type<U, key_type>;
        return tagged_type(std::forward<U>(value));
    }
};

template<class PRED, class T>
decltype(auto) tag(T&& value)
{
return tag_type<PRED>::tag(std::forward<T>(value));
}


/// Maps a possibly tagged type to the type it wraps and provides the matching
/// `untag` operation. Only the constrained specializations are defined.
template<class T>
struct untagged_type;

/// `T` is a `lazy_dict_tagged_type`: expose the wrapped type and value.
template<class T>
    requires(is_lazy_dict_tagged_type<T>)
struct untagged_type<T>
{
    /// The `value_type` of the (decayed) wrapper.
    using type = typename std::decay_t<T>::value_type;

    /// Returns the wrapped `value` member of `value`.
    ///
    /// The return type is the declared type of the member (decltype of an
    /// unparenthesized member access), i.e. a reference when the wrapper holds
    /// a reference and an object otherwise. The wrapper is forwarded before
    /// the member access so that an rvalue wrapper has its payload moved out
    /// instead of copied; lvalue wrappers are still copied from.
    template<class U>
    static decltype(auto) untag(U&& value)
    {
        return std::forward<U>(value).value;
    }
};

/// `T` is not a `lazy_dict_tagged_type`: the type maps to itself.
template<class T>
    requires(!is_lazy_dict_tagged_type<T>)
struct untagged_type<T>
{
    using type = T;

    /// Nothing to unwrap: hands `value` back, preserving its value category.
    template<class U>
    static U&& untag(U&& value)
    {
        return static_cast<U&&>(value);
    }
};

template<class T>
using untagged_type_t = typename untagged_type<T>::type;

template<class T>
auto untag(T&& value)
{
return untagged_type<T>::untag(std::forward<T>(value));
}


// Only for side effects (i.e. a lambda that is called for each index without
// returning anything, but which may have side effects)
template <class F, std::size_t... Is>
Expand Down
4 changes: 2 additions & 2 deletions include/sparrow/layout/array_wrapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ namespace sparrow
t.end();
t.cbegin();
t.cend();
t.bitmap();
t.values();
//t.bitmap(); // NOPE
//t.values(); // NOPE
};

namespace detail
Expand Down
31 changes: 24 additions & 7 deletions test/test_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ namespace sparrow
{
SUBCASE("simple")
{
lazy_dict_encoded_vector<std::string,unsigned int> v{"he", "world","w","world","world!", " he","he"};
lazy_dict_encoded_vector<std::string> v{"he", "world","w","world","world!", " he","he"};
auto arr = sparrow::build(v);
using key_type = typename std::decay_t<decltype(v)>::key_type;
static_assert(std::is_same_v<decltype(arr), sparrow::dictionary_encoded_array<key_type>>);
Expand All @@ -250,12 +250,29 @@ namespace sparrow
CHECK_NULLABLE_VARIANT_EQ(arr[6], std::string_view("he"));

}
// SUBCASE("simple-explicit-index")
// {
// lazy_dict_encoded_vector<std::string, std::uint32_t> v{"he", "world","w","world","world!", " he","he"};
// auto arr = sparrow::build(v);
// static_assert(std::is_same_v<decltype(arr), sparrow::dictionary_encoded_array<std::uint32_t>>);
// }
SUBCASE("dict-endcoded-as-child")
{
// list[dict-encoded[string]]
std::vector<lazy_dict_encoded_vector<std::string>> v{
{"hello", "the", "world"},
{"hello", "world"},
{"world", "!"}
};

auto arr = sparrow::build(v);
using array_type = std::decay_t<decltype(arr)>;
static_assert(std::is_same_v<array_type, sparrow::list_array>);
sanity_check(arr);



REQUIRE_EQ(arr.size(), 3);

CHECK_EQ(arr[0].value().size(), 3);
CHECK_EQ(arr[1].value().size(), 2);
CHECK_EQ(arr[2].value().size(), 2);

}
}

}
Expand Down

0 comments on commit 14cd756

Please sign in to comment.