diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index 3345b32a5b..f41dc73708 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -157,6 +157,14 @@ void ManagedQuery::setup_read() { for (int i = 0; i < attribute_num; i++) { columns_.push_back(schema.attribute(i).name()); } + + auto is_internal = [](std::string name) { + return name.rfind(SOMA_GEOMETRY_DIMENSION_PREFIX, 0) == 0; + }; + + auto internal_end = std::remove_if( + columns_.begin(), columns_.end(), is_internal); + columns_.erase(internal_end, columns_.end()); } // Allocate and attach buffers diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index a15a2ef111..ce6b13c190 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -33,9 +33,8 @@ #ifndef SOMA_ARRAY #define SOMA_ARRAY -#include // for windows: error C2039: 'runtime_error': is not a member of 'std' - #include +#include // for windows: error C2039: 'runtime_error': is not a member of 'std' #include #include @@ -997,7 +996,7 @@ class SOMAArray : public SOMAObject { * @tparam T Domain datatype * @return Pair of [lower, upper] inclusive bounds. */ - ArrowTable get_soma_domain() { + virtual ArrowTable get_soma_domain() { if (has_current_domain()) { return _get_core_current_domain(); } else { @@ -1020,7 +1019,7 @@ class SOMAArray : public SOMAObject { * @tparam T Domain datatype * @return Pair of [lower, upper] inclusive bounds. */ - ArrowTable get_soma_maxdomain() { + virtual ArrowTable get_soma_maxdomain() { return _get_core_domain(); } @@ -1028,7 +1027,7 @@ class SOMAArray : public SOMAObject { * Returns the core non-empty domain in its entirety, as an Arrow * table for return to Python/R. */ - ArrowTable get_non_empty_domain() { + virtual ArrowTable get_non_empty_domain() { return _get_core_domainish(Domainish::kind_non_empty_domain); } diff --git a/libtiledbsoma/src/soma/soma_geometry_dataframe.cc b/libtiledbsoma/src/soma/soma_geometry_dataframe.cc index 0e647df3ca..99e0aa17c0 100644 --- a/libtiledbsoma/src/soma/soma_geometry_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_geometry_dataframe.cc @@ -54,7 +54,6 @@ void SOMAGeometryDataFrame::create( std::shared_ptr ctx, PlatformConfig platform_config, std::optional timestamp) { - std::vector spatial_axes; auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( ctx->tiledb_ctx(), schema, @@ -98,7 +97,20 @@ std::unique_ptr SOMAGeometryDataFrame::schema() const { const std::vector SOMAGeometryDataFrame::index_column_names() const { - return this->dimension_names(); + std::vector dim_names = this->dimension_names(); + + auto is_internal = [](std::string name) { + return name.rfind(SOMA_GEOMETRY_DIMENSION_PREFIX, 0) == 0; + }; + + auto first_dim = std::find_if( + begin(dim_names), end(dim_names), is_internal); + dim_names.insert(first_dim, SOMA_GEOMETRY_COLUMN_NAME); + auto internal_end = std::remove_if( + begin(dim_names), end(dim_names), is_internal); + dim_names.erase(internal_end, dim_names.end()); + + return dim_names; } const std::vector SOMAGeometryDataFrame::spatial_column_names() @@ -172,6 +184,18 @@ void SOMAGeometryDataFrame::set_array_data( SOMAArray::set_array_data(std::move(arrow_schema), std::move(arrow_array)); } +ArrowTable SOMAGeometryDataFrame::get_soma_domain() { + return _reconstruct_geometry_domain(SOMAArray::get_soma_domain()); +} + +ArrowTable SOMAGeometryDataFrame::get_soma_maxdomain() { + return _reconstruct_geometry_domain(SOMAArray::get_soma_maxdomain()); +} + +ArrowTable SOMAGeometryDataFrame::get_non_empty_domain() { + return _reconstruct_geometry_domain(SOMAArray::get_non_empty_domain()); +} + //=================================================================== //= private non-static //=================================================================== @@ -352,4 +376,51 @@ ArrowTable SOMAGeometryDataFrame::_reconstruct_geometry_data_table( return ArrowTable(std::move(arrow_array), std::move(arrow_schema)); } +ArrowTable SOMAGeometryDataFrame::_reconstruct_geometry_domain( + const ArrowTable& domain) { + std::unique_ptr schema = std::make_unique( + ArrowSchema{}); + std::unique_ptr array = std::make_unique( + ArrowArray{}); + + int64_t internal_axes = 2 * spatial_column_names().size(); + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema.get(), NANOARROW_TYPE_STRUCT)); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateChildren( + schema.get(), domain.second->n_children - internal_axes + 1)); + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromType(array.get(), NANOARROW_TYPE_STRUCT)); + NANOARROW_THROW_NOT_OK(ArrowArrayAllocateChildren( + array.get(), domain.first->n_children - internal_axes + 1)); + + std::vector dim_names = this->dimension_names(); + auto is_internal = [](std::string name) { + return name.rfind(SOMA_GEOMETRY_DIMENSION_PREFIX, 0) == 0; + }; + auto first_dim = static_cast( + std::find_if(begin(dim_names), end(dim_names), is_internal) - + dim_names.begin()); + + for (int64_t i = 0, orig_i = 0; i < schema->n_children; ++i, ++orig_i) { + ArrowSchemaMove(domain.second->children[orig_i], schema->children[i]); + ArrowArrayMove(domain.first->children[orig_i], array->children[i]); + + if (i == first_dim) { + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName( + schema->children[i], SOMA_GEOMETRY_COLUMN_NAME.c_str())); + std::vector data; + for (; orig_i < i + internal_axes; orig_i += 2) { + data.push_back( + ((double_t*)domain.first->children[orig_i]->buffers[1])[0]); + data.push_back( + ((double_t*)domain.first->children[orig_i]->buffers[1])[1]); + } + array->children[i] = ArrowAdapter::make_arrow_array_child(data); + } + } + + return ArrowTable(std::move(array), std::move(schema)); +} + } // namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_geometry_dataframe.h b/libtiledbsoma/src/soma/soma_geometry_dataframe.h index 1551eb8105..133716f8d6 100644 --- a/libtiledbsoma/src/soma/soma_geometry_dataframe.h +++ b/libtiledbsoma/src/soma/soma_geometry_dataframe.h @@ -33,6 +33,7 @@ #ifndef SOMA_GEOMETRY_DATAFRAME #define SOMA_GEOMETRY_DATAFRAME +#include #include #include @@ -175,10 +176,75 @@ class SOMAGeometryDataFrame : virtual public SOMAArray { */ uint64_t count(); + /** + * @brief Set the spatial axis slice using multiple ranges + * + * @note Partitioning is not supported + * + * @tparam T + * @param axis + * @param ranges + */ + template + void set_spatial_dim_ranges( + const std::string& axis, const std::vector>& ranges) { + std::vector> min_range; + std::vector> max_range; + + if (ranges.size() != 1) { + throw TileDBSOMAError( + "Multi ranges are not supported for axis dimensions"); + } + + T min_domain, max_domain; + + // Both min and max dimension share the same domain + if (ArraySchemaExperimental::current_domain( + *this->ctx()->tiledb_ctx(), *this->tiledb_schema()) + .is_empty()) { + std::pair domain = this->tiledb_schema() + ->domain() + .dimension( + SOMA_GEOMETRY_DIMENSION_PREFIX + + axis + "__min") + .domain(); + min_domain = domain.first; + max_domain = domain.second; + } else { + auto current_domain = ArraySchemaExperimental::current_domain( + *this->ctx()->tiledb_ctx(), + *this->tiledb_schema().get()) + .ndrectangle() + .range( + SOMA_GEOMETRY_DIMENSION_PREFIX + + axis + "__min"); + min_domain = current_domain[0]; + max_domain = current_domain[1]; + } + + for (const std::pair& range : ranges) { + min_range.push_back( + std::make_pair(min_domain, std::min(range.second, max_domain))); + max_range.push_back( + std::make_pair(std::max(range.first, min_domain), max_domain)); + } + + this->set_dim_ranges( + SOMA_GEOMETRY_DIMENSION_PREFIX + axis + "__min", min_range); + this->set_dim_ranges( + SOMA_GEOMETRY_DIMENSION_PREFIX + axis + "__max", max_range); + } + void set_array_data( std::unique_ptr arrow_schema, std::unique_ptr arrow_array) override; + ArrowTable get_soma_domain() override; + + ArrowTable get_soma_maxdomain() override; + + ArrowTable get_non_empty_domain() override; + private: //=================================================================== //= private non-static @@ -200,6 +266,13 @@ class SOMAGeometryDataFrame : virtual public SOMAArray { */ ArrowTable _reconstruct_geometry_data_table( ArrowTable original_data, const std::vector& wkb_data); + + /** + * @brief Create a new ArrowTable by merging the internal spatial dimensions + * and setting the ``soma_geometry`` domain as the stacked domain of each + * spatial axis. + */ + ArrowTable _reconstruct_geometry_domain(const ArrowTable& domain); }; } // namespace tiledbsoma diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 4353451dd2..52990c23d7 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -30,8 +30,11 @@ * This file defines the ArrowAdapter class. */ -#include "arrow_adapter.h" +#include +#include + #include "../soma/column_buffer.h" +#include "arrow_adapter.h" #include "logger.h" namespace tiledbsoma { @@ -329,16 +332,44 @@ json ArrowAdapter::_get_filter_list_json(FilterList filter_list) { std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( std::shared_ptr ctx, std::shared_ptr tiledb_array) { + auto is_internal = [](const Dimension& dim) { + return dim.name().rfind(SOMA_GEOMETRY_DIMENSION_PREFIX, 0) == 0; + }; + auto tiledb_schema = tiledb_array->schema(); - auto ndim = tiledb_schema.domain().ndim(); - auto nattr = tiledb_schema.attribute_num(); + auto dimensions = tiledb_schema.domain().dimensions(); + + // For geometry dataframe replace the internal dim with the geometry column + int internal_dim_idx = std::find_if( + dimensions.begin(), + dimensions.end(), + is_internal) - + dimensions.begin(); + auto internal_dim_iter = std::remove_if( + dimensions.begin(), dimensions.end(), is_internal); + dimensions.erase(internal_dim_iter, dimensions.end()); + + std::vector> columns; + for (size_t i = 0; i < dimensions.size(); ++i) { + columns.push_back(dimensions[i]); + } + + for (size_t i = 0; i < tiledb_schema.attribute_num(); ++i) { + auto attr = tiledb_schema.attribute(i); + if (strcmp(attr.name().c_str(), SOMA_GEOMETRY_COLUMN_NAME.c_str()) == + 0) { + columns.insert(columns.begin() + internal_dim_idx, attr); + } else { + columns.push_back(attr); + } + } std::unique_ptr arrow_schema = std::make_unique(); arrow_schema->format = strdup("+s"); arrow_schema->name = strdup("parent"); arrow_schema->metadata = nullptr; arrow_schema->flags = 0; - arrow_schema->n_children = ndim + nattr; + arrow_schema->n_children = columns.size(); arrow_schema->dictionary = nullptr; arrow_schema->release = &ArrowAdapter::release_schema; arrow_schema->private_data = nullptr; @@ -351,87 +382,104 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( ArrowSchema* child = nullptr; - for (uint32_t i = 0; i < ndim; ++i) { - auto dim = tiledb_schema.domain().dimension(i); - child = arrow_schema->children[i] = (ArrowSchema*)malloc( - sizeof(ArrowSchema)); - child->format = strdup( - ArrowAdapter::to_arrow_format(dim.type()).data()); - child->name = strdup(dim.name().c_str()); - child->metadata = nullptr; - child->flags = 0; - child->n_children = 0; - child->children = nullptr; - child->dictionary = nullptr; - child->release = &ArrowAdapter::release_schema; - child->private_data = nullptr; - LOG_TRACE(fmt::format( - "[ArrowAdapter] arrow_schema_from_tiledb_array dim {} format {} " - "name {}", - i, - child->format, - child->name)); + for (size_t i = 0; i < columns.size(); ++i) { + std::visit( + [&](auto&& arg) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + child = arrow_schema->children[i] = + arrow_schema_from_tiledb_dimension(arg).release(); + } else if constexpr (std::is_same_v) { + child = arrow_schema->children[i] = + arrow_schema_from_tiledb_attribute( + arg, *ctx, *tiledb_array) + .release(); + } + }, + columns[i]); } - for (uint32_t i = 0; i < nattr; ++i) { - auto attr = tiledb_schema.attribute(i); - child = arrow_schema->children[ndim + i] = (ArrowSchema*)malloc( - sizeof(ArrowSchema)); - child->format = strdup( - ArrowAdapter::to_arrow_format(attr.type()).data()); - child->name = strdup(attr.name().c_str()); - child->metadata = nullptr; - child->flags = 0; - if (attr.nullable()) { - child->flags |= ARROW_FLAG_NULLABLE; - } else { - child->flags &= ~ARROW_FLAG_NULLABLE; - } - child->n_children = 0; - child->children = nullptr; - child->dictionary = nullptr; - child->release = &ArrowAdapter::release_schema; - child->private_data = nullptr; + return arrow_schema; +} - LOG_TRACE(fmt::format( - "[ArrowAdapter] arrow_schema_from_tiledb_array attr {} format {} " - "name {}", - i, - child->format, - child->name)); - - auto enmr_name = AttributeExperimental::get_enumeration_name( - *ctx, attr); - if (enmr_name.has_value()) { - auto enmr = ArrayExperimental::get_enumeration( - *ctx, *tiledb_array, attr.name()); - auto dict = (ArrowSchema*)malloc(sizeof(ArrowSchema)); +std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_dimension( + const Dimension& dimension) { + std::unique_ptr arrow_schema = std::make_unique(); + arrow_schema->format = strdup( + ArrowAdapter::to_arrow_format(dimension.type()).data()); + arrow_schema->name = strdup(dimension.name().c_str()); + arrow_schema->metadata = nullptr; + arrow_schema->flags = 0; + arrow_schema->n_children = 0; + arrow_schema->children = nullptr; + arrow_schema->dictionary = nullptr; + arrow_schema->release = &ArrowAdapter::release_schema; + arrow_schema->private_data = nullptr; + LOG_TRACE(fmt::format( + "[ArrowAdapter] arrow_schema_from_tiledb_dimension format {} " + "name {}", + arrow_schema->format, + arrow_schema->name)); + + return arrow_schema; +} + +std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_attribute( + Attribute& attribute, const Context& ctx, const Array& tiledb_array) { + std::unique_ptr arrow_schema = std::make_unique(); + arrow_schema->format = strdup( + ArrowAdapter::to_arrow_format(attribute.type()).data()); + arrow_schema->name = strdup(attribute.name().c_str()); + arrow_schema->metadata = nullptr; + arrow_schema->flags = 0; + if (attribute.nullable() && + strcmp(attribute.name().c_str(), SOMA_GEOMETRY_COLUMN_NAME.c_str()) != + 0) { + arrow_schema->flags |= ARROW_FLAG_NULLABLE; + } else { + arrow_schema->flags &= ~ARROW_FLAG_NULLABLE; + } + arrow_schema->n_children = 0; + arrow_schema->children = nullptr; + arrow_schema->dictionary = nullptr; + arrow_schema->release = &ArrowAdapter::release_schema; + arrow_schema->private_data = nullptr; + + LOG_TRACE(fmt::format( + "[ArrowAdapter] arrow_schema_from_tiledb_array format {} " + "name {}", + arrow_schema->format, + arrow_schema->name)); + + auto enmr_name = AttributeExperimental::get_enumeration_name( + ctx, attribute); + if (enmr_name.has_value()) { + auto enmr = ArrayExperimental::get_enumeration( + ctx, tiledb_array, attribute.name()); + auto dict = (ArrowSchema*)malloc(sizeof(ArrowSchema)); + dict->format = strdup( + ArrowAdapter::to_arrow_format(enmr.type(), false).data()); + if (enmr.type() == TILEDB_STRING_ASCII || enmr.type() == TILEDB_CHAR) { + dict->format = strdup("z"); + } else { dict->format = strdup( ArrowAdapter::to_arrow_format(enmr.type(), false).data()); - if (enmr.type() == TILEDB_STRING_ASCII || - enmr.type() == TILEDB_CHAR) { - dict->format = strdup("z"); - } else { - dict->format = strdup( - ArrowAdapter::to_arrow_format(enmr.type(), false).data()); - } - dict->name = strdup(enmr.name().c_str()); - dict->metadata = nullptr; - if (enmr.ordered()) { - child->flags |= ARROW_FLAG_DICTIONARY_ORDERED; - } else { - child->flags &= ~ARROW_FLAG_DICTIONARY_ORDERED; - } - dict->n_children = 0; - dict->children = nullptr; - dict->dictionary = nullptr; - dict->release = &ArrowAdapter::release_schema; - dict->private_data = nullptr; - child->dictionary = dict; } - child->release = &ArrowAdapter::release_schema; + dict->name = strdup(enmr.name().c_str()); + dict->metadata = nullptr; + if (enmr.ordered()) { + arrow_schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED; + } else { + arrow_schema->flags &= ~ARROW_FLAG_DICTIONARY_ORDERED; + } + dict->n_children = 0; + dict->children = nullptr; + dict->dictionary = nullptr; + dict->release = &ArrowAdapter::release_schema; + dict->private_data = nullptr; + arrow_schema->dictionary = dict; } - + arrow_schema->release = &ArrowAdapter::release_schema; return arrow_schema; } diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 4674768566..d20a66a6df 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -229,6 +229,22 @@ class ArrowAdapter { static std::unique_ptr arrow_schema_from_tiledb_array( std::shared_ptr ctx, std::shared_ptr tiledb_array); + /** + * @brief Create an ArrowSchema from TileDB Dimension + * + * @return ArrowSchema + */ + static std::unique_ptr arrow_schema_from_tiledb_dimension( + const Dimension& dimension); + + /** + * @brief Create an ArrowSchema from TileDB Attribute + * + * @return ArrowSchema + */ + static std::unique_ptr arrow_schema_from_tiledb_attribute( + Attribute& attribute, const Context& ctx, const Array& tiledb_array); + /** * @brief Get members of the TileDB Schema in the form of a PlatformConfig * diff --git a/libtiledbsoma/test/common.cc b/libtiledbsoma/test/common.cc index 3416a2e9e2..fc2c3784d1 100644 --- a/libtiledbsoma/test/common.cc +++ b/libtiledbsoma/test/common.cc @@ -153,7 +153,7 @@ std::unique_ptr create_index_cols_info_schema( auto schema = ArrowAdapter::make_arrow_schema(names, tiledb_datatypes); - for (size_t i = 0; i < static_cast(schema->n_children); ++i) { + for (int64_t i = 0; i < schema->n_children; ++i) { if (strcmp(schema->children[i]->name, "soma_geometry")) { nanoarrow::UniqueBuffer buffer; ArrowMetadataBuilderInit(buffer.get(), nullptr); diff --git a/libtiledbsoma/test/unit_soma_geometry_dataframe.cc b/libtiledbsoma/test/unit_soma_geometry_dataframe.cc index f8766e63c4..5602040500 100644 --- a/libtiledbsoma/test/unit_soma_geometry_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_geometry_dataframe.cc @@ -126,15 +126,7 @@ TEST_CASE("SOMAGeometryDataFrame: basic", "[SOMAGeometryDataFrame]") { REQUIRE(soma_geometry->ctx() == ctx); REQUIRE(soma_geometry->type() == "SOMAGeometryDataFrame"); std::vector expected_index_column_names = { - dim_infos[0].name, - SOMA_GEOMETRY_DIMENSION_PREFIX + spatial_dim_infos[0].name + - "__min", - SOMA_GEOMETRY_DIMENSION_PREFIX + spatial_dim_infos[1].name + - "__min", - SOMA_GEOMETRY_DIMENSION_PREFIX + spatial_dim_infos[0].name + - "__max", - SOMA_GEOMETRY_DIMENSION_PREFIX + spatial_dim_infos[1].name + - "__max"}; + dim_infos[0].name, dim_infos[1].name}; std::vector expected_spatial_column_names = { spatial_dim_infos[0].name, spatial_dim_infos[1].name}; @@ -321,43 +313,11 @@ TEST_CASE("SOMAGeometryDataFrame: Roundtrip", "[SOMAGeometryDataFrame]") { while (auto batch = soma_geometry->read_next()) { auto arrbuf = batch.value(); auto d0span = arrbuf->at(dim_infos[0].name)->data(); - auto d1span = arrbuf - ->at( - SOMA_GEOMETRY_DIMENSION_PREFIX + - spatial_dim_infos[0].name + "__min") - ->data(); - auto d2span = arrbuf - ->at( - SOMA_GEOMETRY_DIMENSION_PREFIX + - spatial_dim_infos[0].name + "__max") - ->data(); - auto d3span = arrbuf - ->at( - SOMA_GEOMETRY_DIMENSION_PREFIX + - spatial_dim_infos[1].name + "__min") - ->data(); - auto d4span = arrbuf - ->at( - SOMA_GEOMETRY_DIMENSION_PREFIX + - spatial_dim_infos[1].name + "__max") - ->data(); auto wkbs = arrbuf->at(dim_infos[1].name)->binaries(); auto a0span = arrbuf->at(attr_infos[0].name)->data(); CHECK( std::vector({1}) == std::vector(d0span.begin(), d0span.end())); - CHECK( - std::vector({0}) == - std::vector(d1span.begin(), d1span.end())); - CHECK( - std::vector({1}) == - std::vector(d2span.begin(), d2span.end())); - CHECK( - std::vector({0}) == - std::vector(d3span.begin(), d3span.end())); - CHECK( - std::vector({1}) == - std::vector(d4span.begin(), d4span.end())); CHECK(geometry::to_wkb(polygon) == wkbs[0]); CHECK( std::vector({63}) ==