diff --git a/libtiledbsoma/src/CMakeLists.txt b/libtiledbsoma/src/CMakeLists.txt index 831dc58228..4abde4f438 100644 --- a/libtiledbsoma/src/CMakeLists.txt +++ b/libtiledbsoma/src/CMakeLists.txt @@ -36,6 +36,7 @@ add_library(TILEDB_SOMA_OBJECTS OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_column.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_attribute.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dimension.cc + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_geometry_column.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_collection.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_experiment.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_measurement.cc @@ -205,6 +206,7 @@ install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_column.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_attribute.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dimension.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_geometry_column.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_collection.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dataframe.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dense_ndarray.h diff --git a/libtiledbsoma/src/soma/soma_geometry_column.cc b/libtiledbsoma/src/soma/soma_geometry_column.cc new file mode 100644 index 0000000000..51e5fefecf --- /dev/null +++ b/libtiledbsoma/src/soma/soma_geometry_column.cc @@ -0,0 +1,418 @@ +/** + * @file soma_geometry_column.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines the SOMAGeometryColumn class. + */ + +#include "soma_geometry_column.h" + +namespace tiledbsoma { + +/** + * Build a SOMAGeometryColumn from Arrow descriptions of the geometry attribute + * and of its spatial axes. + * + * One TileDB dimension is created per child of `spatial_schema` in a first + * pass (created with the "__min" argument) and one more per child in a second + * pass (created with the "__max" argument), so the resulting dimension list is + * laid out as [all min dimensions..., all max dimensions...]. The attribute is + * created from `schema`. + * + * @throws TileDBSOMAError if `type_metadata` is not "WKB" or if + * `spatial_array` has fewer children than `spatial_schema`. + */ +std::shared_ptr SOMAGeometryColumn::create( + std::shared_ptr ctx, + ArrowSchema* schema, + ArrowSchema* spatial_schema, + ArrowArray* spatial_array, + const std::string& soma_type, + std::string_view type_metadata, + PlatformConfig platform_config) { + std::vector dims; + if (type_metadata != "WKB") { + throw TileDBSOMAError(std::format( + "[SOMAGeometryColumn] " + "Unknown type metadata for `{}`: " + "Expected 'WKB', got {}", + SOMA_GEOMETRY_COLUMN_NAME, + type_metadata)); + } + + // Both loops below index `spatial_array->children` with the child count of + // `spatial_schema`, so reject an array that has fewer children. + if (spatial_array->n_children < spatial_schema->n_children) { + throw TileDBSOMAError(std::format( + "[SOMAGeometryColumn] Spatial schema has {} children but spatial " + "array has {}", + spatial_schema->n_children, + spatial_array->n_children)); + } + + // First pass: one "__min" dimension per spatial axis. + for (int64_t j = 0; j < spatial_schema->n_children; ++j) { + dims.push_back(ArrowAdapter::tiledb_dimension_from_arrow_schema( + ctx, + spatial_schema->children[j], + spatial_array->children[j], + soma_type, + type_metadata, + SOMA_GEOMETRY_DIMENSION_PREFIX, + "__min", + platform_config)); + } + + // Second pass: one "__max" dimension per spatial axis, appended after all + // of the "__min" dimensions. + for (int64_t j = 0; j < spatial_schema->n_children; ++j) { + dims.push_back(ArrowAdapter::tiledb_dimension_from_arrow_schema( + ctx, + spatial_schema->children[j], + spatial_array->children[j], + soma_type, + type_metadata, + SOMA_GEOMETRY_DIMENSION_PREFIX, + "__max", + platform_config)); + } + + auto attribute = ArrowAdapter::tiledb_attribute_from_arrow_schema( + ctx, schema, type_metadata, platform_config); + + return std::make_shared( + SOMAGeometryColumn(dims, attribute.first)); +} + +void SOMAGeometryColumn::_set_dim_points( + const
std::unique_ptr& query, + const SOMAContext& ctx, + const std::any& points) const { + std::vector> + transformed_points = _transform_points( + std::any_cast>>(points)); + + // The limits of the current domain if it exists or the core domain + // otherwise. + auto limits = _limits(ctx, *query->schema()); + + // Create a range object and reuse it for all dimensions + std::vector> range(1); + size_t dimensionality = dimensions.size() / TDB_DIM_PER_SPATIAL_AXIS; + + for (size_t i = 0; i < transformed_points.size(); ++i) { + // First half of `dimensions` (index i): select from the lower limit + // up to the smaller of the query upper bound and the upper limit. + range[0] = std::make_pair( + limits[i].first, + std::min(transformed_points[i].second, limits[i].second)); + query->select_ranges(dimensions[i].name(), range); + + // Second half of `dimensions` (index i + dimensionality): select from + // the larger of the query lower bound and the lower limit up to the + // upper limit. + range[0] = std::make_pair( + std::max(transformed_points[i].first, limits[i].first), + limits[i].second); + query->select_ranges(dimensions[i + dimensionality].name(), range); + } +} + +void SOMAGeometryColumn::_set_dim_ranges( + const std::unique_ptr& query, + const SOMAContext& ctx, + const std::any& ranges) const { + std::vector> + transformed_ranges = _transform_ranges( + std::any_cast, std::vector>>>( + ranges)); + + // The limits of the current domain if it exists or the core domain + // otherwise. 
+ auto limits = _limits(ctx, *query->schema()); + + // Create a range object and reuse it for all dimensions + std::vector> range(1); + size_t dimensionality = dimensions.size() / TDB_DIM_PER_SPATIAL_AXIS; + + for (size_t i = 0; i < transformed_ranges.size(); ++i) { + range[0] = std::make_pair( + limits[i].first, + std::min(transformed_ranges[i].second, limits[i].second)); + query->select_ranges(dimensions[i].name(), range); + + range[0] = std::make_pair( + std::max(transformed_ranges[i].first, limits[i].first), + limits[i].second); + query->select_ranges(dimensions[i + dimensionality].name(), range); + } +} + +void SOMAGeometryColumn::_set_current_domain_slot( + NDRectangle& rectangle, + std::span new_current_domain) const { + if (TDB_DIM_PER_SPATIAL_AXIS * new_current_domain.size() != + dimensions.size()) { + throw TileDBSOMAError(std::format( + "[SOMAGeometryColumn] Dimension - Current Domain mismatch. " + "Expected current domain of size {}, found {}", + dimensions.size() / TDB_DIM_PER_SPATIAL_AXIS, + new_current_domain.size())); + } + + // The same per-axis range is applied to the first half of `dimensions`... + for (size_t i = 0; i < new_current_domain.size(); ++i) { + auto range = std::any_cast>( + new_current_domain[i]); + rectangle.set_range(dimensions[i].name(), range[0], range[1]); + } + + // ...and to the matching entry in the second half. + for (size_t i = 0; i < new_current_domain.size(); ++i) { + auto range = std::any_cast>( + new_current_domain[i]); + rectangle.set_range( + dimensions[i + new_current_domain.size()].name(), + range[0], + range[1]); + } +} + +/** + * Check whether `new_current_domain` may be applied. + * + * For every spatial axis the new lower bound must not exceed the new upper + * bound. When `rectangle` holds a value, the new range must contain the + * ranges stored in it for both TileDB dimensions of that axis. Otherwise the + * new range must lie inside the core domain reported by + * `_core_domain_slot()`. + * + * @return (true, "") when acceptable, otherwise (false, reason). + * @throws TileDBSOMAError if the number of supplied ranges differs from the + * number of spatial axes. + */ +std::pair SOMAGeometryColumn::_can_set_current_domain_slot( + std::optional& rectangle, + std::span new_current_domain) const { + if (new_current_domain.size() != + dimensions.size() / TDB_DIM_PER_SPATIAL_AXIS) { + throw TileDBSOMAError(std::format( + "[SOMAGeometryColumn][_can_set_current_domain_slot] Expected " + "current domain " + "size is {}, found {}", + dimensions.size() / TDB_DIM_PER_SPATIAL_AXIS, + new_current_domain.size())); + } + + for (size_t i = 0; i < new_current_domain.size(); ++i) { + auto range = std::any_cast>( + new_current_domain[i]); + + if (range[0] > 
range[1]) { + return std::pair( + false, + std::format( + "index-column name {}: new lower > new upper", + dimensions[i].name())); + } + + // Bind by reference: only the names are needed, no copy required. + const auto& dimension_min = dimensions[i]; + const auto& dimension_max = + dimensions[i + dimensions.size() / TDB_DIM_PER_SPATIAL_AXIS]; + + if (rectangle.has_value()) { + // An existing rectangle may only grow: the new range has to cover + // what is already stored for both dimensions of this axis. + auto range_min = rectangle.value().range( + dimension_min.name()); + auto range_max = rectangle.value().range( + dimension_max.name()); + + if (range[0] > range_min[0]) { + return std::pair( + false, + std::format( + "index-column name {}: new lower > old lower (downsize " + "is unsupported)", + dimension_min.name())); + } + if (range[0] > range_max[0]) { + return std::pair( + false, + std::format( + "index-column name {}: new lower > old lower (downsize " + "is unsupported)", + dimension_max.name())); + } + if (range[1] < range_min[1]) { + return std::pair( + false, + std::format( + "index-column name {}: new upper < old upper (downsize " + "is unsupported)", + dimension_min.name())); + } + if (range[1] < range_max[1]) { + return std::pair( + false, + std::format( + "index-column name {}: new upper < old upper (downsize " + "is unsupported)", + dimension_max.name())); + } + } else { + // No existing rectangle: the new range must stay inside the core + // domain, so reject a lower bound below the core lower limit or an + // upper bound above the core upper limit. + auto core_domain = std::any_cast< + std::pair, std::vector>>( + _core_domain_slot()); + + if (range[0] < core_domain.first[i]) { + return std::pair( + false, + std::format( + "index-column name {}: new lower < limit lower", + dimension_min.name())); + } + if (range[1] > core_domain.second[i]) { + return std::pair( + false, + std::format( + "index-column name {}: new upper > limit upper", + dimension_min.name())); + } + } + } + + return std::pair(true, ""); +} + +std::vector> SOMAGeometryColumn::_limits( + const SOMAContext& ctx, const ArraySchema& schema) const { + std::vector> limits; + + if (ArraySchemaExperimental::current_domain(*ctx.tiledb_ctx(), schema) + .is_empty()) { + for (size_t i = 0; i < dimensions.size() / TDB_DIM_PER_SPATIAL_AXIS; + ++i) { + std::pair core_domain = 
dimensions[i] + .domain(); + + limits.push_back( + std::make_pair(core_domain.first, core_domain.second)); + } + } else { + NDRectangle ndrect = ArraySchemaExperimental::current_domain( + *ctx.tiledb_ctx(), schema) + .ndrectangle(); + for (size_t i = 0; i < dimensions.size() / TDB_DIM_PER_SPATIAL_AXIS; + ++i) { + std::array range = ndrect.range( + dimensions.at(i).name()); + + limits.push_back(std::make_pair(range[0], range[1])); + } + } + + return limits; +} + +/** + * Convert a single (lower corner, upper corner) pair into one (lower, upper) + * pair per spatial axis. + * + * @throws TileDBSOMAError if `ranges` does not hold exactly one entry or if + * either corner has fewer coordinates than there are spatial axes. + */ +std::vector> +SOMAGeometryColumn::_transform_ranges( + const std::vector, std::vector>>& + ranges) const { + if (ranges.size() != 1) { + throw TileDBSOMAError( + "Multiranges are not supported for geometry dimension"); + } + + // Bind by reference instead of copying both coordinate vectors. + const auto& min_ranges = ranges.front().first; + const auto& max_ranges = ranges.front().second; + const size_t dimensionality = dimensions.size() / TDB_DIM_PER_SPATIAL_AXIS; + + // The loop below reads `dimensionality` entries from each corner. + if (min_ranges.size() < dimensionality || + max_ranges.size() < dimensionality) { + throw TileDBSOMAError(std::format( + "[SOMAGeometryColumn] Expected at least {} coordinates per range " + "corner, found {} and {}", + dimensionality, + min_ranges.size(), + max_ranges.size())); + } + + std::vector> transformed_ranges; + transformed_ranges.reserve(dimensionality); + for (size_t i = 0; i < dimensionality; ++i) { + transformed_ranges.push_back( + std::make_pair(min_ranges[i], max_ranges[i])); + } + + return transformed_ranges; +} + +/** + * Convert a single point into one degenerate (value, value) pair per spatial + * axis. + * + * @throws TileDBSOMAError if `points` does not hold exactly one entry or if + * the point has fewer coordinates than there are spatial axes. + */ +std::vector> +SOMAGeometryColumn::_transform_points( + const std::span>& points) const { + if (points.size() != 1) { + throw TileDBSOMAError( + "Multipoints are not supported for geometry dimension"); + } + + const auto& point = points.front(); + const size_t dimensionality = dimensions.size() / TDB_DIM_PER_SPATIAL_AXIS; + + // The loop below reads `dimensionality` coordinates from the point. + if (point.size() < dimensionality) { + throw TileDBSOMAError(std::format( + "[SOMAGeometryColumn] Expected a point with at least {} " + "coordinates, found {}", + dimensionality, + point.size())); + } + + std::vector> transformed_ranges; + transformed_ranges.reserve(dimensionality); + for (size_t i = 0; i < dimensionality; ++i) { + transformed_ranges.push_back(std::make_pair(point[i], point[i])); + } + + return transformed_ranges; +} + +// Collects the core domain of the first half of `dimensions` as a pair of +// (lower bounds, upper bounds) vectors. +std::any SOMAGeometryColumn::_core_domain_slot() const { + std::vector min, max; + for (size_t i = 0; i < dimensions.size() / TDB_DIM_PER_SPATIAL_AXIS; ++i) { + std::pair core_domain = dimensions[i] + .domain(); + + min.push_back(core_domain.first); + max.push_back(core_domain.second); + } + + return std::make_any< + std::pair, std::vector>>( + std::make_pair(min, max)); +} + +// Per axis, takes the lower bound of the non-empty domain of the first-half +// dimension and the upper bound of the non-empty domain of the second-half +// dimension. +std::any SOMAGeometryColumn::_non_empty_domain_slot(Array& array) const { + std::vector min, max; + size_t dimensionality = dimensions.size() 
/ TDB_DIM_PER_SPATIAL_AXIS; + for (size_t i = 0; i < dimensionality; ++i) { + std::pair + min_non_empty_dom = array.non_empty_domain( + dimensions[i].name()); + std::pair + max_non_empty_dom = array.non_empty_domain( + dimensions[i + dimensionality].name()); + + min.push_back(min_non_empty_dom.first); + max.push_back(max_non_empty_dom.second); + } + + return std::make_any< + std::pair, std::vector>>( + std::make_pair(min, max)); +} + +// Reads the current domain rectangle from the array schema and delegates to +// the NDRectangle overload. +std::any SOMAGeometryColumn::_core_current_domain_slot( + const SOMAContext& ctx, Array& array) const { + CurrentDomain + current_domain = tiledb::ArraySchemaExperimental::current_domain( + *ctx.tiledb_ctx(), array.schema()); + NDRectangle ndrect = current_domain.ndrectangle(); + + return _core_current_domain_slot(ndrect); +} + +// Collects the rectangle ranges of the first half of `dimensions` as a pair of +// (lower bounds, upper bounds) vectors. +std::any SOMAGeometryColumn::_core_current_domain_slot( + NDRectangle& ndrect) const { + std::vector min, max; + + for (size_t i = 0; i < dimensions.size() / TDB_DIM_PER_SPATIAL_AXIS; ++i) { + std::array range = ndrect.range( + dimensions[i].name()); + + min.push_back(range[0]); + max.push_back(range[1]); + } + + return std::make_any< + std::pair, std::vector>>( + std::make_pair(min, max)); +} + +// Only TILEDB_FLOAT64 domains are handled; any other domain type throws. +ArrowArray* SOMAGeometryColumn::arrow_domain_slot( + const SOMAContext& ctx, Array& array, enum Domainish kind) const { + switch (domain_type().value()) { + case TILEDB_FLOAT64: + return ArrowAdapter::make_arrow_array_child_var( + domain_slot>(ctx, array, kind)); + default: + throw TileDBSOMAError(std::format( + "[SOMAGeometryColumn][arrow_domain_slot] dim {} has unhandled " + "extended type " + "{}", + name(), + tiledb::impl::type_to_str(domain_type().value()))); + } +} + +ArrowSchema* SOMAGeometryColumn::arrow_schema_slot( + const SOMAContext& ctx, Array& array) { + return ArrowAdapter::arrow_schema_from_tiledb_attribute( + attribute, *ctx.tiledb_ctx(), array) + .release(); +} + +} // namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_geometry_column.h 
b/libtiledbsoma/src/soma/soma_geometry_column.h new file mode 100644 index 0000000000..db1995bb5d --- /dev/null +++ b/libtiledbsoma/src/soma/soma_geometry_column.h @@ -0,0 +1,175 @@ +/** + * @file soma_geometry_column.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines the SOMAGeometryColumn class. SOMAGeometryColumn wraps a + * TileDB Attribute storing the WKB (Well-Known Binary) encoded geometry and + * adds a collection of internal TileDB dimensions to provide spatial indexing. + * It implements functions to perform queries as well as core domain and current + * domain operations. The purpose of this class is to provide a common interface + * identical to TileDB dimensions, attributes and other composite columns. 
+ * + * The current indexing mechanism adapts the idea of Priority R-tree on top of a + * TileDB Array using a set of TileDB Dimensions to store the MBR corners of + * each geometry. + */ + +#ifndef SOMA_GEOMETRY_COLUMN_H +#define SOMA_GEOMETRY_COLUMN_H + +#include +#include + +#include +#include "soma_column.h" + +namespace tiledbsoma { + +class ArrayBuffers; + +// NOTE(review): header-scope using-directive; it exposes the `tiledb` names to +// every includer. Kept because the declarations below are written against it. +using namespace tiledb; + +/** + * Composite column made of one geometry attribute plus the TileDB dimensions + * that index it. `dimensions` holds the per-axis min dimensions first and the + * per-axis max dimensions second (see the layout note on + * TDB_DIM_PER_SPATIAL_AXIS below). The column reports itself as an index + * column and selects only the attribute when columns are selected. + */ +class SOMAGeometryColumn : public SOMAColumn { + public: + static std::shared_ptr create( + std::shared_ptr ctx, + ArrowSchema* schema, + ArrowSchema* spatial_schema, + ArrowArray* spatial_array, + const std::string& soma_type, + std::string_view type_metadata, + PlatformConfig platform_config); + + SOMAGeometryColumn(std::vector dimensions, Attribute attribute) + : dimensions(dimensions) + , attribute(attribute){} + + inline std::string name() const override { + return SOMA_GEOMETRY_COLUMN_NAME; + } + + inline bool isIndexColumn() const override { + return true; + } + + inline void select_columns( + const std::unique_ptr& query, + bool if_not_empty = false) const override { + query->select_columns(std::vector({attribute.name()}), if_not_empty); + } + + inline soma_column_datatype_t type() const override { + return soma_column_datatype_t::SOMA_COLUMN_GEOMETRY; + } + + // Type of the first dimension; reads `dimensions.front()`, so the column + // must have been built with at least one dimension. + inline std::optional domain_type() const override { + return dimensions.front().type(); + } + + inline std::optional data_type() const override { + return attribute.type(); + } + + inline std::optional> tiledb_dimensions() override { + return dimensions; + } + + inline std::optional> tiledb_attributes() override { + return std::vector({attribute}); + } + + inline std::optional> tiledb_enumerations() + override { + return std::nullopt; + } + + ArrowArray* arrow_domain_slot( + const SOMAContext& ctx, + Array& array, + enum Domainish kind) const override; + + ArrowSchema* arrow_schema_slot( + const SOMAContext& ctx, Array& array) override; + + protected: + void _set_dim_points( + const std::unique_ptr& query, + const 
SOMAContext& ctx, + const std::any& points) const override; + + void _set_dim_ranges( + const std::unique_ptr& query, + const SOMAContext& ctx, + const std::any& ranges) const override; + + void _set_current_domain_slot( + NDRectangle& rectangle, + std::span new_current_domain) const override; + + std::pair _can_set_current_domain_slot( + std::optional& rectangle, + std::span new_current_domain) const override; + + std::any _core_domain_slot() const override; + + std::any _non_empty_domain_slot(Array& array) const override; + + std::any _core_current_domain_slot( + const SOMAContext& ctx, Array& array) const override; + + std::any _core_current_domain_slot(NDRectangle& ndrect) const override; + + private: + /** + * The current implementation of SOMAGeometryColumn uses a pair of TileDB + * dimensions to store the min and max point of the bounding box per + * dimension. E.g. a 2D geometry will have 4 TileDB dimensions (2 * + * num_spatial_axes) to provide spatial indexing. + */ + static constexpr size_t TDB_DIM_PER_SPATIAL_AXIS = 2; + std::vector dimensions; + Attribute attribute; + + /** + * Compute the usable domain limits. If the array has a current domain then + * it is used to compute the limits, otherwise the core domain is used. 
+ */ + std::vector> _limits( + const SOMAContext& ctx, const ArraySchema& schema) const; + + std::vector> _transform_ranges( + const std::vector< + std::pair, std::vector>>& ranges) + const; + + std::vector> _transform_points( + const std::span>& points) const; +}; + +} // namespace tiledbsoma +#endif \ No newline at end of file diff --git a/libtiledbsoma/src/tiledbsoma/tiledbsoma b/libtiledbsoma/src/tiledbsoma/tiledbsoma index 8dda098f30..3f71c123a4 100644 --- a/libtiledbsoma/src/tiledbsoma/tiledbsoma +++ b/libtiledbsoma/src/tiledbsoma/tiledbsoma @@ -49,9 +49,6 @@ #include "soma/column_buffer.h" #include "soma/soma_array.h" #include "soma/soma_collection.h" -#include "soma/soma_column.h" -#include "soma/soma_attribute.h" -#include "soma/soma_dimension.h" #include "soma/soma_dataframe.h" #include "soma/soma_group.h" #include "soma/soma_experiment.h" diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 8dc3084912..ce7e50d1bf 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -976,8 +976,8 @@ void ArrowAdapter::_set_current_domain_slot( LOG_DEBUG(std::format( "[ArrowAdapter] {} current_domain float {} to {}", name, - lo, - hi)); + std::to_string(lo), + std::to_string(hi))); } break; case TILEDB_FLOAT64: { double lo = ((double*)buff)[3]; @@ -986,8 +986,8 @@ void ArrowAdapter::_set_current_domain_slot( LOG_DEBUG(std::format( "[ArrowAdapter] {} current_domain double {} to {}", name, - lo, - hi)); + std::to_string(lo), + std::to_string(hi))); } break; default: throw TileDBSOMAError(std::format( diff --git a/libtiledbsoma/test/unit_soma_column.cc b/libtiledbsoma/test/unit_soma_column.cc index 9680b58d9f..19c76c1e53 100644 --- a/libtiledbsoma/test/unit_soma_column.cc +++ b/libtiledbsoma/test/unit_soma_column.cc @@ -27,13 +27,16 @@ * * @section DESCRIPTION * - * This file manages unit tests for implementation of SOMAColumn class. 
This is - * temparary and to be removed once SOMAColumn is fully integrated. + * This file manages unit tests for implementation of SOMAColumn class */ #include #include #include +#include "../src/soma/soma_attribute.h" +#include "../src/soma/soma_column.h" +#include "../src/soma/soma_dimension.h" +#include "../src/soma/soma_geometry_column.h" #include "common.h" const int64_t SOMA_JOINID_DIM_MAX = 99; @@ -285,7 +288,7 @@ TEST_CASE("SOMAColumn: SOMADimension") { TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, "SOMAColumn: query variant-indexed dataframe dim-str-u32 attr-sjid", - "[SOMADataFrame]") { + "[SOMAColumn]") { auto specify_domain = GENERATE(false, true); SECTION(std::format("- specify_domain={}", specify_domain)) { std::string suffix1 = specify_domain ? "true" : "false"; @@ -314,11 +317,6 @@ TEST_CASE_METHOD( std::make_shared(SOMADimension(dimension))); } - for (size_t i = 0; i < sdf->tiledb_schema()->attribute_num(); ++i) { - columns.push_back(std::make_shared( - SOMAAttribute(sdf->tiledb_schema()->attribute(i)))); - } - CurrentDomain current_domain = sdf->get_current_domain_for_test(); REQUIRE(!current_domain.is_empty());