single-cell-data · johnkerl · Jan 30, 2025
diff --git a/apis/python/setup.py b/apis/python/setup.py
@@ -244,6 +244,8 @@ def run(self):
     str(tiledb_dir / "lib"),
 ]
 
+# For a debug build, use this instead (ineffectual on macOS):
+# CXX_FLAGS = ["-g", "-O0"]
 CXX_FLAGS = ["-O3"]
 
 if platform.machine() == "x86_64":

diff --git a/apis/python/src/tiledbsoma/common.cc b/apis/python/src/tiledbsoma/common.cc
@@ -15,6 +15,8 @@
 
 namespace tiledbsoma {
 
+using namespace pybind11::literals;  // to bring in the `_a` literal
+
 std::unordered_map<tiledb_datatype_t, std::string> _tdb_to_np_name_dtype = {
     {TILEDB_INT32, "int32"},
     {TILEDB_INT64, "int64"},
@@ -185,22 +187,35 @@ bool is_tdb_str(tiledb_datatype_t type) {
 py::object _buffer_to_table(std::shared_ptr<ArrayBuffers> buffers) {
     auto pa = py::module::import("pyarrow");
     auto pa_table_from_arrays = pa.attr("Table").attr("from_arrays");
-    auto pa_array_import = pa.attr("Array").attr("_import_from_c");
-    auto pa_schema_import = pa.attr("Schema").attr("_import_from_c");
+    auto pa_array_importer = pa.attr("Array").attr("_import_from_c");
+    auto pa_schema_importer = pa.attr("Schema").attr("_import_from_c");
 
     py::list array_list;
     py::list names;
 
     for (auto& name : buffers->names()) {
         auto column = buffers->at(name);
         auto [pa_array, pa_schema] = ArrowAdapter::to_arrow(column);
-        auto array = pa_array_import(
+        auto array = pa_array_importer(
             py::capsule(pa_array.get()), py::capsule(pa_schema.get()));
         array_list.append(array);
         names.append(name);
     }
 
-    return pa_table_from_arrays(array_list, names);
+    // First pass: build the table so that pyarrow derives its schema.
+    auto py_arrow_table = pa_table_from_arrays(array_list, names);
+
+    // Set Arrow-table metadata so that when someone does .to_pandas()
+    // on our data, they'll more likely get the intended column nullabilities.
+    // See also https://github.com/single-cell-data/TileDB-SOMA/issues/3642.
+    // The schema is exported to a C struct, annotated, then imported back.
+    ArrowSchema c_arrow_schema;
+    auto c_arrow_schema_ptr = reinterpret_cast<uintptr_t>(&c_arrow_schema);
+    py_arrow_table.attr("schema").attr("_export_to_c")(c_arrow_schema_ptr);
+    ArrowAdapter::set_metadata_for_pandas(&c_arrow_schema);
+    auto py_arrow_schema = pa_schema_importer(py::capsule(&c_arrow_schema));
+    // Second pass: rebuild from the same arrays with the annotated schema.
+    return pa_table_from_arrays(array_list, "schema"_a = py_arrow_schema);
 }
 
 std::optional<py::object> to_table(

diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc
@@ -16,7 +16,9 @@
 #include "../soma/column_buffer.h"
 #include "arrow_adapter.h"
 #include "logger.h"
+#include "nlohmann/json.hpp"
 #include "util.h"
+#include "version.h"
 
 #include "../soma/soma_attribute.h"
 #include "../soma/soma_dimension.h"
@@ -348,6 +350,127 @@ json ArrowAdapter::_get_filter_list_json(FilterList filter_list) {
     return filter_list_as_json;
 }
 
+// Attach a "pandas" entry to the schema's Arrow metadata so that a later
+// .to_pandas() is more likely to produce the desired column nullabilities.
+// Any metadata pointer already on `arrow_schema` is reset (not freed) and
+// replaced. Best effort: if building fails, the schema keeps no metadata.
+void ArrowAdapter::set_metadata_for_pandas(ArrowSchema* arrow_schema) {
+    // Lookup tables, keyed by Arrow format string
+    // clang-format off
+    static const std::map<std::string, std::string> arrow_format_to_pandas_type = {
+      { "c", "int8"},
+      { "s", "int16"},
+      { "i", "int32"},
+      { "l", "int64"},
+      { "C", "uint8"},
+      { "S", "uint16"},
+      { "I", "uint32"},
+      { "L", "uint64"},
+      { "f", "float32"},
+      { "g", "float64"},
+      { "u", "string"},
+      { "z", "binary"},
+      { "b", "bool"},
+      // Timestamp formats ("tss:", "tsm:", "tsu:", "tsn:") are not mapped
+      // yet; like any other unlisted format they fall back to "object".
+    };
+    static const std::map<std::string, std::string> arrow_format_to_numpy_type = {
+      { "c", "Int8"},
+      { "s", "Int16"},
+      { "i", "Int32"},
+      { "l", "Int64"},
+      { "C", "UInt8"},
+      { "S", "UInt16"},
+      { "I", "UInt32"},
+      { "L", "UInt64"},
+      { "f", "Float32"},
+      { "g", "Float64"},
+      { "u", "string"},
+      { "z", "binary"},
+      { "b", "bool"},
+    };
+    // clang-format on
+
+    arrow_schema->metadata = nullptr;
+
+    nlohmann::json creator = {
+        {"library", "tiledbsoma"},
+        // This yields e.g. "2.28.0" rather than "1.15.5"; the latter is not
+        // available in C++ here unless it is taken as a function argument.
+        {"version", tiledbsoma::version::as_string().c_str()}};
+
+    std::vector<nlohmann::json> columns;
+    for (int64_t i = 0; i < arrow_schema->n_children; i++) {
+        const ArrowSchema* child = arrow_schema->children[i];
+        if (child == nullptr || child->format == nullptr) {
+            continue;
+        }
+        const std::string arrow_format(child->format);
+        // Arrow's C data interface permits a null name; emit it as empty.
+        const char* field_name = child->name == nullptr ? "" : child->name;
+
+        std::string numpy_type = "object";
+        auto it = arrow_format_to_numpy_type.find(arrow_format);
+        if (it != arrow_format_to_numpy_type.end()) {
+            numpy_type = it->second;
+        }
+
+        std::string pandas_type = "object";
+        nlohmann::json metadata_info;  // JSON null
+        if (child->dictionary == nullptr) {
+            it = arrow_format_to_pandas_type.find(arrow_format);
+            if (it != arrow_format_to_pandas_type.end()) {
+                pandas_type = it->second;
+            }
+        } else {
+            numpy_type = pandas_type;  // i.e. "object"; empirically determined
+            pandas_type = "categorical";
+            metadata_info = {
+                // There is also a "num_categories" key but we don't have access
+                // to that information here
+                {"ordered",
+                 (child->flags & ARROW_FLAG_DICTIONARY_ORDERED) != 0},
+            };
+        }
+
+        // clang-format off
+        nlohmann::json column = {
+          {"name",        field_name},
+          {"field_name",  field_name},
+          {"pandas_type", pandas_type.c_str()},
+          {"numpy_type",  numpy_type.c_str()},
+          {"metadata",    metadata_info},
+        };
+        // clang-format on
+        columns.push_back(column);
+    }
+
+    // clang-format off
+    nlohmann::json pandas_info = {
+      {"columns", columns},
+      {"creator", creator},
+      // Any element type will do below: all we want to emit is an empty `[]`.
+      {"index_columns", std::vector<int>({})},
+      {"column_indices", std::vector<int>({})},
+      // Announce that this is the API version we're conforming to:
+      {"pandas_version", "2.2.3"}
+    };
+    // clang-format on
+
+    // Serialize first so the string outlives the ArrowCharView taken on it.
+    const std::string pandas_json = pandas_info.dump(2);
+    nanoarrow::UniqueBuffer buffer;
+    if (ArrowMetadataBuilderInit(buffer.get(), nullptr) != NANOARROW_OK ||
+        ArrowMetadataBuilderAppend(
+            buffer.get(),
+            ArrowCharView("pandas"),
+            ArrowCharView(pandas_json.c_str())) != NANOARROW_OK) {
+        return;  // best effort: leave the schema without metadata
+    }
+    ArrowSchemaSetMetadata(
+        arrow_schema, reinterpret_cast<const char*>(buffer->data));
+}
+
 std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
     std::shared_ptr<Context> ctx, std::shared_ptr<Array> tiledb_array) {
     auto tiledb_schema = tiledb_array->schema();
@@ -357,7 +480,7 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
     std::unique_ptr<ArrowSchema> arrow_schema = std::make_unique<ArrowSchema>();
     arrow_schema->format = strdup("+s");
     arrow_schema->name = strdup("parent");
-    arrow_schema->metadata = nullptr;
+
     arrow_schema->flags = 0;
     arrow_schema->n_children = ndim + nattr;
     arrow_schema->dictionary = nullptr;
@@ -453,6 +576,8 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::arrow_schema_from_tiledb_array(
         child->release = &ArrowAdapter::release_schema;
     }
 
+    set_metadata_for_pandas(arrow_schema.get());
+
     return arrow_schema;
 }
 
@@ -1559,7 +1684,7 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema(
     auto arrow_schema = std::make_unique<ArrowSchema>();
     arrow_schema->format = "+s";  // structure, i.e. non-leaf node
     arrow_schema->name = strdup("parent");
-    arrow_schema->metadata = nullptr;
+
     arrow_schema->flags = 0;
     arrow_schema->n_children = num_names;  // non-leaf node
     arrow_schema->children = (ArrowSchema**)malloc(
@@ -1607,6 +1732,8 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema(
         }
     }
 
+    set_metadata_for_pandas(arrow_schema.get());
+
     return arrow_schema;
 }
 
@@ -1615,7 +1742,7 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema_parent(
     auto arrow_schema = std::make_unique<ArrowSchema>();
     arrow_schema->format = "+s";  // structure, i.e. non-leaf node
     arrow_schema->name = strdup("parent");
-    arrow_schema->metadata = nullptr;
+
     arrow_schema->flags = 0;
     arrow_schema->n_children = num_columns;  // non-leaf node
     arrow_schema->children = (ArrowSchema**)malloc(
@@ -1632,6 +1759,8 @@ std::unique_ptr<ArrowSchema> ArrowAdapter::make_arrow_schema_parent(
         "[ArrowAdapter] make_arrow_schema n_children {}",
         arrow_schema->n_children));
 
+    set_metadata_for_pandas(arrow_schema.get());
+
     return arrow_schema;
 }
 

diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h
@@ -1191,6 +1191,15 @@ class ArrowAdapter {
         }
     }
 
+    /**
+     * Sets the schema's Arrow metadata (under the "pandas" key) to help users
+     * get the Arrow table's column nullabilities preserved in Pandas when
+     * they do a `.to_pandas()` on Arrow tables we produce. See for context
+     * https://github.com/single-cell-data/TileDB-SOMA/issues/3642
+     * @param arrow_schema Parent schema to annotate; its metadata is replaced.
+     */
+    static void set_metadata_for_pandas(ArrowSchema* arrow_schema);
+
    private:
     static std::pair<const void*, std::size_t> _get_data_and_length(
         Enumeration& enmr, const void* dst);