Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python] Enhance Arrow-to-Pandas nullability conveyance for .to_pandas() output [WIP] #3645

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions apis/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,8 @@ def run(self):
str(tiledb_dir / "lib"),
]

# Debug (ineffectual on MacOS):
# CXX_FLAGS = ["-g", "-O0"]
CXX_FLAGS = ["-O3"]

if platform.machine() == "x86_64":
Expand Down
23 changes: 19 additions & 4 deletions apis/python/src/tiledbsoma/common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

namespace tiledbsoma {

using namespace pybind11::literals; // to bring in the `_a` literal

std::unordered_map<tiledb_datatype_t, std::string> _tdb_to_np_name_dtype = {
{TILEDB_INT32, "int32"},
{TILEDB_INT64, "int64"},
Expand Down Expand Up @@ -185,22 +187,35 @@ bool is_tdb_str(tiledb_datatype_t type) {
// Builds a pyarrow Table from the columns held in `buffers`, then rebuilds it
// with a schema that carries pandas-oriented metadata.
//
// Steps, as visible below:
//   1. Import `pyarrow` and look up `Table.from_arrays` plus the
//      `_import_from_c` entry points of `Array` and `Schema`.
//   2. For every name in `buffers->names()`, convert the column with
//      `ArrowAdapter::to_arrow` and import the resulting array/schema pair
//      into a pyarrow Array.
//   3. Assemble a table, export its schema into a C `ArrowSchema`, let
//      `ArrowAdapter::set_metadata_for_pandas` fill in the metadata, import
//      the schema back, and rebuild the table with it.
//
// NOTE(review): the near-duplicate declarations below (`pa_array_import` /
// `py_array_importer`, `pa_schema_import` / `pa_schema_importer`, the two
// `auto array = ...` lines, and the `return` that precedes further
// statements) look like removed and added lines of a diff rendered without
// +/- markers — confirm which lines are live before relying on this text.
py::object _buffer_to_table(std::shared_ptr<ArrayBuffers> buffers) {
auto pa = py::module::import("pyarrow");
auto pa_table_from_arrays = pa.attr("Table").attr("from_arrays");
auto pa_array_import = pa.attr("Array").attr("_import_from_c");
auto pa_schema_import = pa.attr("Schema").attr("_import_from_c");
auto py_array_importer = pa.attr("Array").attr("_import_from_c");
auto pa_schema_importer = pa.attr("Schema").attr("_import_from_c");

// Parallel lists: one pyarrow Array and one column name per buffer.
py::list array_list;
py::list names;

for (auto& name : buffers->names()) {
auto column = buffers->at(name);
auto [pa_array, pa_schema] = ArrowAdapter::to_arrow(column);
auto array = pa_array_import(
auto array = py_array_importer(
py::capsule(pa_array.get()), py::capsule(pa_schema.get()));
array_list.append(array);
names.append(name);
}

return pa_table_from_arrays(array_list, names);
auto py_arrow_table = pa_table_from_arrays(array_list, names);

// Set Arrow-table metadata so that when someone does .to_pandas()
// on our data, they'll more likely get the intended column nullabilities.
// See also https://github.com/single-cell-data/TileDB-SOMA/issues/3642.
ArrowSchema c_arrow_schema;
// The address of the stack-allocated struct is handed to pyarrow as an
// integer so that `_export_to_c` can populate it.
uintptr_t c_arrow_schema_ptr = (uintptr_t)(&c_arrow_schema);
py_arrow_table.attr("schema").attr("_export_to_c")(c_arrow_schema_ptr);
ArrowAdapter::set_metadata_for_pandas(&c_arrow_schema);
auto py_arrow_schema = pa_schema_importer(py::capsule(&c_arrow_schema));
// Rebuild the table from the same arrays, this time with the
// metadata-bearing schema instead of a bare list of names.
py_arrow_table = pa_table_from_arrays(
array_list, "schema"_a = py_arrow_schema);

return py_arrow_table;
}

std::optional<py::object> to_table(
Expand Down
135 changes: 132 additions & 3 deletions libtiledbsoma/src/utils/arrow_adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
#include "../soma/column_buffer.h"
#include "arrow_adapter.h"
#include "logger.h"
#include "nlohmann/json.hpp"
#include "util.h"
#include "version.h"

#include "../soma/soma_attribute.h"
#include "../soma/soma_dimension.h"
Expand Down Expand Up @@ -348,6 +350,127 @@ json ArrowAdapter::_get_filter_list_json(FilterList filter_list) {
return filter_list_as_json;
}

// Attaches a "pandas" key/value entry to the schema's metadata so that a
// later `.to_pandas()` on an Arrow table using this schema is more likely to
// produce the intended column nullabilities.
//
// The value is a JSON document with one entry per non-null child of
// `arrow_schema` (name, field_name, pandas_type, numpy_type, metadata) plus
// "creator", "index_columns", "column_indices" and "pandas_version" keys.
//
// @param arrow_schema Parent (struct) schema. Its children are read; its
//     `metadata` pointer is reset and then replaced.
//
// Setting the metadata is best-effort: if the nanoarrow metadata builder
// reports a failure, the function returns with `metadata` left null rather
// than raising, because the metadata is only an advisory hint.
//
// NOTE(review): only the Arrow format strings listed in the tables below are
// mapped; every other format (e.g. the large-string/large-binary and
// timestamp formats) falls back to "object" — confirm that is intended.
void ArrowAdapter::set_metadata_for_pandas(ArrowSchema* arrow_schema) {
    // Lookup tables, keyed by Arrow C-data-interface format string.
    // clang-format off
    static const std::map<std::string, std::string> arrow_format_to_pandas_type = {
        {"c", "int8"},
        {"s", "int16"},
        {"i", "int32"},
        {"l", "int64"},
        {"C", "uint8"},
        {"S", "uint16"},
        {"I", "uint32"},
        {"L", "uint64"},
        {"f", "float32"},
        {"g", "float64"},
        {"u", "string"},
        {"z", "binary"},
        {"b", "bool"},
        // Not yet mapped:
        // "tss:" (TILEDB_DATETIME_SEC), "tsm:" (TILEDB_DATETIME_MS),
        // "tsu:" (TILEDB_DATETIME_US),  "tsn:" (TILEDB_DATETIME_NS)
    };
    static const std::map<std::string, std::string> arrow_format_to_numpy_type = {
        {"c", "Int8"},
        {"s", "Int16"},
        {"i", "Int32"},
        {"l", "Int64"},
        {"C", "UInt8"},
        {"S", "UInt16"},
        {"I", "UInt32"},
        {"L", "UInt64"},
        {"f", "Float32"},
        {"g", "Float64"},
        {"u", "string"},
        {"z", "binary"},
        {"b", "bool"},
    };
    // clang-format on

    // Formats missing from a table are reported as "object".
    const auto type_or_object = [](const std::map<std::string, std::string>& table,
                                   const std::string& format) -> std::string {
        const auto found = table.find(format);
        return found != table.end() ? found->second : std::string("object");
    };

    // Drop (without freeing) whatever metadata pointer is currently stored:
    // some callers hand in a schema whose `metadata` field was never
    // initialized by them, and ArrowSchemaSetMetadata below must not act on
    // such a value.
    // NOTE(review): if a caller passes a schema that already owns metadata,
    // that allocation is no longer reachable from here — confirm ownership.
    arrow_schema->metadata = nullptr;

    const nlohmann::json creator = {
        {"library", "tiledbsoma"},
        // This reports the version string known to the C++ library
        // (e.g. "2.28.0"), not the Python package version (e.g. "1.15.5"),
        // which is not available here unless passed in as an argument.
        {"version", tiledbsoma::version::as_string().c_str()}};

    std::vector<nlohmann::json> columns;
    // n_children is an int64_t in the Arrow C data interface; use a matching
    // index type to avoid a narrowing/sign-mismatched comparison.
    for (int64_t i = 0;
         arrow_schema->children != nullptr && i < arrow_schema->n_children;
         ++i) {
        const ArrowSchema* child = arrow_schema->children[i];
        if (child == nullptr) {
            continue;
        }

        // Guard against null C strings before building std::string / JSON
        // values from them; a null format simply maps to "object".
        const std::string arrow_format = (child->format != nullptr) ?
                                             child->format :
                                             "";
        const std::string column_name = (child->name != nullptr) ?
                                            child->name :
                                            "";

        std::string numpy_type;
        std::string pandas_type;
        nlohmann::json metadata_info;  // JSON null unless dictionary-encoded
        if (child->dictionary == nullptr) {
            numpy_type = type_or_object(
                arrow_format_to_numpy_type, arrow_format);
            pandas_type = type_or_object(
                arrow_format_to_pandas_type, arrow_format);
        } else {
            // Dictionary-encoded columns are announced as categoricals; the
            // "object" numpy_type was determined empirically.
            numpy_type = "object";
            pandas_type = "categorical";
            // There is also a "num_categories" key but we don't have access
            // to that information here.
            const bool is_ordered = (child->flags &
                                     ARROW_FLAG_DICTIONARY_ORDERED) != 0;
            metadata_info = {{"ordered", is_ordered}};
        }

        // clang-format off
        nlohmann::json column = {
            {"name", column_name},
            {"field_name", column_name},
            {"pandas_type", pandas_type},
            {"numpy_type", numpy_type},
            {"metadata", metadata_info},
        };
        // clang-format on
        columns.push_back(column);
    }

    // clang-format off
    const nlohmann::json pandas_info = {
        {"columns", columns},
        {"creator", creator},
        // Any element type will do, as all we want to produce is empty `[]`:
        {"index_columns", std::vector<int>({})},
        // Any element type will do, as all we want to produce is empty `[]`:
        {"column_indices", std::vector<int>({})},
        // Announce that this is the API version we're conforming to:
        {"pandas_version", "2.2.3"}
    };
    // clang-format on

    // Keep the serialized JSON alive in a named variable for the duration of
    // the append call, which receives only a view of it.
    const std::string pandas_json = pandas_info.dump(2);

    // nanoarrow reports success as 0 (NANOARROW_OK). On failure, give up and
    // leave the schema without metadata.
    nanoarrow::UniqueBuffer buffer;
    if (ArrowMetadataBuilderInit(buffer.get(), nullptr) != 0) {
        return;
    }
    if (ArrowMetadataBuilderAppend(
            buffer.get(),
            ArrowCharView("pandas"),
            ArrowCharView(pandas_json.c_str())) != 0) {
        return;
    }

    // The builder's buffer already holds the encoded Arrow metadata blob, so
    // hand it over directly instead of copying it into a temporary string.
    // The result is intentionally not checked (best-effort, see above).
    ArrowSchemaSetMetadata(
        arrow_schema, reinterpret_cast<const char*>(buffer->data));
}

std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
std::shared_ptr<Context> ctx, std::shared_ptr<Array> tiledb_array) {
auto tiledb_schema = tiledb_array->schema();
Expand All @@ -357,7 +480,7 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
std::unique_ptr<ArrowSchema> arrow_schema = std::make_unique<ArrowSchema>();
arrow_schema->format = strdup("+s");
arrow_schema->name = strdup("parent");
arrow_schema->metadata = nullptr;

arrow_schema->flags = 0;
arrow_schema->n_children = ndim + nattr;
arrow_schema->dictionary = nullptr;
Expand Down Expand Up @@ -453,6 +576,8 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
child->release = &ArrowAdapter::release_schema;
}

set_metadata_for_pandas(arrow_schema.get());

return arrow_schema;
}

Expand Down Expand Up @@ -1559,7 +1684,7 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema(
auto arrow_schema = std::make_unique<ArrowSchema>();
arrow_schema->format = "+s"; // structure, i.e. non-leaf node
arrow_schema->name = strdup("parent");
arrow_schema->metadata = nullptr;

arrow_schema->flags = 0;
arrow_schema->n_children = num_names; // non-leaf node
arrow_schema->children = (ArrowSchema**)malloc(
Expand Down Expand Up @@ -1607,6 +1732,8 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema(
}
}

set_metadata_for_pandas(arrow_schema.get());

return arrow_schema;
}

Expand All @@ -1615,7 +1742,7 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema_parent(
auto arrow_schema = std::make_unique<ArrowSchema>();
arrow_schema->format = "+s"; // structure, i.e. non-leaf node
arrow_schema->name = strdup("parent");
arrow_schema->metadata = nullptr;

arrow_schema->flags = 0;
arrow_schema->n_children = num_columns; // non-leaf node
arrow_schema->children = (ArrowSchema**)malloc(
Expand All @@ -1632,6 +1759,8 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema_parent(
"[ArrowAdapter] make_arrow_schema n_children {}",
arrow_schema->n_children));

set_metadata_for_pandas(arrow_schema.get());

return arrow_schema;
}

Expand Down
9 changes: 9 additions & 0 deletions libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,15 @@ class ArrowAdapter {
}
}

/**
 * Sets the metadata of an Arrow schema in a way that helps users get the
 * Arrow table's column nullabilities preserved in Pandas when they do a
 * `.to_pandas()` on Arrow tables we produce.
 *
 * The schema's existing `metadata` pointer is reset and replaced by a
 * single "pandas" entry describing each child column.
 *
 * See for context
 * https://github.com/single-cell-data/TileDB-SOMA/issues/3642
 *
 * @param arrow_schema Parent schema whose children are inspected and
 * whose `metadata` field is overwritten.
 */
static void set_metadata_for_pandas(ArrowSchema* arrow_schema);

private:
static std::pair<const void*, std::size_t> _get_data_and_length(
Enumeration& enmr, const void* dst);
Expand Down
Loading