in tfx_bsl/cc/arrow/arrow_submodule.cc [222:290]
// Registers the `table_util` submodule on `arrow_module` and binds three
// functions to it: MergeRecordBatches, TotalByteSize and RecordBatchTake.
//
// Every binding follows the same pattern:
//   * it forwards to the same-named C++ function, which reports failure
//     through an absl::Status and writes its result to an out-parameter;
//   * a non-OK status is rethrown as std::runtime_error carrying
//     `Status::ToString()`;
//   * the call runs under py::call_guard<py::gil_scoped_release>, i.e. with
//     the GIL released, so the lambdas must not touch Python objects.
void DefineTableUtilSubmodule(pybind11::module arrow_module) {
  auto m = arrow_module.def_submodule("table_util");
  m.doc() = "Arrow Table utilities.";

  m.def(
      "MergeRecordBatches",
      [](const std::vector<std::shared_ptr<arrow::RecordBatch>>&
             record_batches) {
        std::shared_ptr<arrow::RecordBatch> result;
        absl::Status s = MergeRecordBatches(record_batches, &result);
        if (!s.ok()) {
          throw std::runtime_error(s.ToString());
        }
        return result;
      },
      // Each numbered rule / bullet ends with "\n" so that it is rendered on
      // its own line in the Python docstring.
      py::doc(
          "Merges a list of record batches into one. "
          "The columns are concatenated. "
          "Columns of the same name must be of compatible types.\n"
          "Two types are compatible if:\n"
          " - they are equal, or\n"
          " - one of them is Null, or\n"
          " - both are list<> or large_list<>, and their child types are "
          "compatible (note that large_list<> and list<> are not compatible), "
          "or\n"
          " - both are struct<>, and their children of the same name are "
          "compatible. They don't need to have the same set of children.\n"
          "Rules for concatenating two compatible but not equal arrays:\n"
          " 1. if one of them is Null, then the result is of the other type, "
          "with nulls representing elements from the NullArray.\n"
          " 2. two compatible list<> arrays are concatenated recursively.\n"
          " 3. two compatible struct<> arrays will result in a struct<> that "
          "contains children from both arrays. If one array is missing a "
          "child, it is considered as if it had that child as a NullArray. "
          "Child arrays are concatenated recursively.\n"
          "Returns an error if there's any incompatibility."),
      py::call_guard<py::gil_scoped_release>());

  m.def(
      "TotalByteSize",
      [](const std::shared_ptr<arrow::RecordBatch>& record_batch,
         const bool ignore_unsupported) {
        // The batch is dereferenced below; dereferencing an empty shared_ptr
        // is undefined behavior, so reject it with the same exception type
        // used for every other failure of this binding.
        if (record_batch == nullptr) {
          throw std::runtime_error(
              "TotalByteSize: record_batch must not be null.");
        }
        size_t result = 0;
        absl::Status s =
            TotalByteSize(*record_batch, ignore_unsupported, &result);
        if (!s.ok()) {
          throw std::runtime_error(s.ToString());
        }
        return result;
      },
      py::arg("record_batch"), py::arg("ignore_unsupported") = false,
      py::doc("Returns the total byte size of all the buffers a record batch "
              "consists of. This value might be larger than the actual memory "
              "occupied by those buffers because buffers might share the "
              "underlying memory."),
      py::call_guard<py::gil_scoped_release>());

  // TODO(zhuo): pa.RecordBatch.take is available starting from arrow 0.17.
  m.def(
      "RecordBatchTake",
      [](const std::shared_ptr<arrow::RecordBatch>& record_batch,
         const std::shared_ptr<arrow::Array>& indices) {
        std::shared_ptr<arrow::RecordBatch> result;
        absl::Status s = RecordBatchTake(record_batch, indices, &result);
        if (!s.ok()) {
          throw std::runtime_error(s.ToString());
        }
        return result;
      },
      py::doc("Returns a RecordBatch that contains rows in `indices`."),
      py::call_guard<py::gil_scoped_release>());
}