Skip to content

Commit

Permalink
temp/WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Jan 30, 2025
1 parent 2f12828 commit 2c8fa01
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 7 deletions.
2 changes: 2 additions & 0 deletions apis/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,8 @@ def run(self):
str(tiledb_dir / "lib"),
]

# Debug (ineffectual on MacOS):
# CXX_FLAGS = ["-g", "-O0"]
CXX_FLAGS = ["-O3"]

if platform.machine() == "x86_64":
Expand Down
23 changes: 19 additions & 4 deletions apis/python/src/tiledbsoma/common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

namespace tiledbsoma {

using namespace pybind11::literals; // to bring in the `_a` literal

std::unordered_map<tiledb_datatype_t, std::string> _tdb_to_np_name_dtype = {
{TILEDB_INT32, "int32"},
{TILEDB_INT64, "int64"},
Expand Down Expand Up @@ -185,22 +187,35 @@ bool is_tdb_str(tiledb_datatype_t type) {
py::object _buffer_to_table(std::shared_ptr<ArrayBuffers> buffers) {
auto pa = py::module::import("pyarrow");
auto pa_table_from_arrays = pa.attr("Table").attr("from_arrays");
auto pa_array_import = pa.attr("Array").attr("_import_from_c");
auto pa_schema_import = pa.attr("Schema").attr("_import_from_c");
auto py_array_importer = pa.attr("Array").attr("_import_from_c");
auto pa_schema_importer = pa.attr("Schema").attr("_import_from_c");

py::list array_list;
py::list names;

for (auto& name : buffers->names()) {
auto column = buffers->at(name);
auto [pa_array, pa_schema] = ArrowAdapter::to_arrow(column);
auto array = pa_array_import(
auto array = py_array_importer(
py::capsule(pa_array.get()), py::capsule(pa_schema.get()));
array_list.append(array);
names.append(name);
}

return pa_table_from_arrays(array_list, names);
auto py_arrow_table = pa_table_from_arrays(array_list, names);

// Set Arrow-table metadata so that when someone does .to_pandas()
// on our data, they'll more likely get the intended column nullabilities.
// See also https://github.com/single-cell-data/TileDB-SOMA/issues/3642.
ArrowSchema c_arrow_schema;
uintptr_t c_arrow_schema_ptr = (uintptr_t)(&c_arrow_schema);
py_arrow_table.attr("schema").attr("_export_to_c")(c_arrow_schema_ptr);
ArrowAdapter::set_metadata_for_pandas(&c_arrow_schema);
auto py_arrow_schema = pa_schema_importer(py::capsule(&c_arrow_schema));
py_arrow_table = pa_table_from_arrays(
array_list, "schema"_a = py_arrow_schema);

return py_arrow_table;
}

std::optional<py::object> to_table(
Expand Down
135 changes: 132 additions & 3 deletions libtiledbsoma/src/utils/arrow_adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
#include "../soma/column_buffer.h"
#include "arrow_adapter.h"
#include "logger.h"
#include "nlohmann/json.hpp"
#include "util.h"
#include "version.h"

#include "../soma/soma_attribute.h"
#include "../soma/soma_dimension.h"
Expand Down Expand Up @@ -348,6 +350,127 @@ json ArrowAdapter::_get_filter_list_json(FilterList filter_list) {
return filter_list_as_json;
}

/**
 * Sets a "pandas" entry in the schema-level metadata of `arrow_schema` so that
 * when someone does `.to_pandas()` on the corresponding Arrow table they will
 * more likely get the desired column nullabilities.
 *
 * The entry is a JSON document with one record per non-null child of
 * `arrow_schema` ("name", "field_name", "pandas_type", "numpy_type",
 * "metadata"), plus "creator", empty "index_columns" / "column_indices", and
 * "pandas_version".
 *
 * Any metadata pointer already present on `arrow_schema` is overwritten: it is
 * reset to nullptr (not freed here) before the new metadata is attached.
 *
 * @param arrow_schema Parent schema whose children describe the columns.
 */
void ArrowAdapter::set_metadata_for_pandas(ArrowSchema* arrow_schema) {
    // Lookup tables, keyed by Arrow C-data-interface format string.
    // NOTE(review): the large-string "U" and large-binary "Z" formats, and the
    // timestamp formats, are not listed and therefore map to "object" below --
    // confirm that this is intended.
    // clang-format off
    static const std::map<std::string, std::string> arrow_format_to_pandas_type = {
        { "c", "int8"},
        { "s", "int16"},
        { "i", "int32"},
        { "l", "int64"},
        { "C", "uint8"},
        { "S", "uint16"},
        { "I", "uint32"},
        { "L", "uint64"},
        { "f", "float32"},
        { "g", "float64"},
        { "u", "string"},
        { "z", "binary"},
        { "b", "bool"},
        // { "tss:", xxx}, // TILEDB_DATETIME_SEC,
        // { "tsm:", xxx}, // TILEDB_DATETIME_MS,
        // { "tsu:", xxx}, // TILEDB_DATETIME_US,
        // { "tsn:", xxx}, // TILEDB_DATETIME_NS,
    };
    static const std::map<std::string, std::string> arrow_format_to_numpy_type = {
        { "c", "Int8"},
        { "s", "Int16"},
        { "i", "Int32"},
        { "l", "Int64"},
        { "C", "UInt8"},
        { "S", "UInt16"},
        { "I", "UInt32"},
        { "L", "UInt64"},
        { "f", "Float32"},
        { "g", "Float64"},
        { "u", "string"},
        { "z", "binary"},
        { "b", "bool"},
    };
    // clang-format on

    // Returns the table entry for `key`, or "object" when there is none.
    const auto type_or_object =
        [](const std::map<std::string, std::string>& table,
           const std::string& key) -> std::string {
        const auto found = table.find(key);
        return found == table.end() ? std::string("object") : found->second;
    };

    // Drop whatever metadata pointer was there so that ArrowSchemaSetMetadata
    // below does not try to free memory it did not allocate.
    arrow_schema->metadata = nullptr;

    nlohmann::json creator = {
        {"library", "tiledbsoma"},
        // This gets us "2.28.0", not "1.15.5" ... where that latter is
        // not available directly in C++ (unless we take it as a function
        // argument).
        {"version", tiledbsoma::version::as_string().c_str()}};

    std::vector<nlohmann::json> columns;
    // n_children is int64_t in the Arrow C data interface; index with the
    // same type rather than a deduced int.
    for (int64_t i = 0; i < arrow_schema->n_children; ++i) {
        const ArrowSchema* child = arrow_schema->children[i];
        if (child == nullptr) {
            continue;
        }

        // `name` is optional in the Arrow C data interface and may be NULL;
        // building a string from a null pointer is undefined behaviour, so
        // substitute the empty string. `format` is guarded the same way.
        const std::string column_name = child->name != nullptr ? child->name :
                                                                 "";
        const std::string arrow_format = child->format != nullptr ?
                                             child->format :
                                             "";

        std::string numpy_type = type_or_object(
            arrow_format_to_numpy_type, arrow_format);
        std::string pandas_type = "object";
        nlohmann::json metadata_info;  // JSON null

        if (child->dictionary == nullptr) {
            pandas_type = type_or_object(
                arrow_format_to_pandas_type, arrow_format);
        } else {
            // Dictionary-encoded column: report it as categorical.
            numpy_type = "object";  // empirically determined
            pandas_type = "categorical";
            const bool is_ordered = (child->flags &
                                     ARROW_FLAG_DICTIONARY_ORDERED) != 0;
            metadata_info = {
                // There is also a "num_categories" key but we don't have access
                // to that information here
                {"ordered", is_ordered},
            };
        }

        // clang-format off
        nlohmann::json column = {
            {"name", column_name},
            {"field_name", column_name},
            {"pandas_type", pandas_type},
            {"numpy_type", numpy_type},
            {"metadata", metadata_info},
        };
        // clang-format on
        columns.push_back(column);
    }

    // clang-format off
    nlohmann::json pandas_info = {
        {"columns", columns},
        {"creator", creator},
        // Always emitted as an empty `[]`:
        {"index_columns", nlohmann::json::array()},
        // Always emitted as an empty `[]`:
        {"column_indices", nlohmann::json::array()},
        // Announce that this is the API version we're conforming to:
        {"pandas_version", "2.2.3"}
    };
    // clang-format on

    // NOTE(review): the nanoarrow calls below return an ArrowErrorCode which is
    // not inspected here -- consider surfacing failures to the caller.
    nanoarrow::UniqueBuffer buffer;
    ArrowMetadataBuilderInit(buffer.get(), nullptr);

    // Keep the serialized JSON alive in a named local for the duration of the
    // append call, which only borrows the bytes.
    const std::string pandas_json = pandas_info.dump(2);
    ArrowMetadataBuilderAppend(
        buffer.get(),
        ArrowCharView("pandas"),
        ArrowCharView(pandas_json.c_str()));

    // ArrowSchemaSetMetadata measures and copies the encoded metadata blob
    // itself, so the builder's buffer can be handed over directly; no
    // intermediate std::string copy is needed.
    ArrowSchemaSetMetadata(
        arrow_schema, reinterpret_cast<const char*>(buffer->data));
}

std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
std::shared_ptr<Context> ctx, std::shared_ptr<Array> tiledb_array) {
auto tiledb_schema = tiledb_array->schema();
Expand All @@ -357,7 +480,7 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
std::unique_ptr<ArrowSchema> arrow_schema = std::make_unique<ArrowSchema>();
arrow_schema->format = strdup("+s");
arrow_schema->name = strdup("parent");
arrow_schema->metadata = nullptr;

arrow_schema->flags = 0;
arrow_schema->n_children = ndim + nattr;
arrow_schema->dictionary = nullptr;
Expand Down Expand Up @@ -453,6 +576,8 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
child->release = &ArrowAdapter::release_schema;
}

set_metadata_for_pandas(arrow_schema.get());

return arrow_schema;
}

Expand Down Expand Up @@ -1559,7 +1684,7 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema(
auto arrow_schema = std::make_unique<ArrowSchema>();
arrow_schema->format = "+s"; // structure, i.e. non-leaf node
arrow_schema->name = strdup("parent");
arrow_schema->metadata = nullptr;

arrow_schema->flags = 0;
arrow_schema->n_children = num_names; // non-leaf node
arrow_schema->children = (ArrowSchema**)malloc(
Expand Down Expand Up @@ -1607,6 +1732,8 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema(
}
}

set_metadata_for_pandas(arrow_schema.get());

return arrow_schema;
}

Expand All @@ -1615,7 +1742,7 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema_parent(
auto arrow_schema = std::make_unique<ArrowSchema>();
arrow_schema->format = "+s"; // structure, i.e. non-leaf node
arrow_schema->name = strdup("parent");
arrow_schema->metadata = nullptr;

arrow_schema->flags = 0;
arrow_schema->n_children = num_columns; // non-leaf node
arrow_schema->children = (ArrowSchema**)malloc(
Expand All @@ -1632,6 +1759,8 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema_parent(
"[ArrowAdapter] make_arrow_schema n_children {}",
arrow_schema->n_children));

set_metadata_for_pandas(arrow_schema.get());

return arrow_schema;
}

Expand Down
9 changes: 9 additions & 0 deletions libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,15 @@ class ArrowAdapter {
}
}

/**
* This sets the Arrow table's metadata in a way that helps users
* get the Arrow table's column nullabilities preserved in Pandas
* when they do a `.to_pandas()` on Arrow tables we produce.
* For context, see
* https://github.com/single-cell-data/TileDB-SOMA/issues/3642
*/
static void set_metadata_for_pandas(ArrowSchema* arrow_schema);

private:
static std::pair<const void*, std::size_t> _get_data_and_length(
Enumeration& enmr, const void* dst);
Expand Down

0 comments on commit 2c8fa01

Please sign in to comment.