void DefineArrayUtilSubmodule()

in tfx_bsl/cc/arrow/arrow_submodule.cc [47:220]


// Registers the `array_util` submodule under `arrow_module` and binds the
// Arrow array utility functions into it.
//
// Conventions shared by every binding below:
//   * The GIL is released for the duration of the C++ call
//     (py::call_guard<py::gil_scoped_release>), so the bound lambdas must not
//     touch Python state.
//   * A non-OK absl::Status is reported by throwing std::runtime_error, which
//     pybind11 surfaces to the Python caller as an exception.
//   * Adjacent docstring literals are concatenated verbatim by the compiler,
//     so every fragment that continues a sentence must end with a space.
void DefineArrayUtilSubmodule(py::module arrow_module) {
  auto m = arrow_module.def_submodule("array_util");
  m.doc() = "Arrow Array utilities.";

  // Deprecated alias kept for backward compatibility; binds the same C++
  // function as "GetElementLengths".
  m.def(
      "ListLengthsFromListArray",
      WrapUnaryArrayFunction(&GetElementLengths),
      py::doc("DEPRECATED. Use GetElementLengths instead."),
      py::call_guard<py::gil_scoped_release>());

  m.def(
      "GetElementLengths",
      WrapUnaryArrayFunction(&GetElementLengths),
      py::doc(
          "Get lengths of elements from a list-alike `array` (including binary "
          "and string arrays) in an int64 array. \n"
          "Note that null and empty elements both are of length 0 and the "
          "returned array does not have any null.\n"
          "For example [[1,2,3], [], None, [4,5]] => [3, 0, 0, 2]."),
      py::call_guard<py::gil_scoped_release>());

  m.def("GetFlattenedArrayParentIndices",
        WrapUnaryArrayFunction(&GetFlattenedArrayParentIndices),
        py::doc("Makes a int32 or int64 array of the same length as flattened "
                "`list_array`. returned_array[i] == j means i-th element in "
                "flattened `list_array` came from j-th list in `list_array`.\n"
                "Returns an Int32Array if the input is a ListArray, or "
                "Int64Array if the input is a LargeListArray. \n"
                "For example [[1,2,3], [], None, [4,5]] => [0, 0, 0, 3, 3]."),
        py::call_guard<py::gil_scoped_release>());

  m.def(
      "GetArrayNullBitmapAsByteArray",
      WrapUnaryArrayFunction(&GetArrayNullBitmapAsByteArray),
      py::doc(
          "Makes a uint8 array of the same length as `array`. "
          "returned_array[i] == True iff array[i] is null.\n"
          "Note that this returned array can be converted to a numpy bool "
          "array copy-free."),
      py::call_guard<py::gil_scoped_release>());

  m.def(
      "GetBinaryArrayTotalByteSize",
      [](const std::shared_ptr<arrow::Array>& array) {
        // Initialized so that no indeterminate value can ever be observed,
        // regardless of how the callee treats its out-parameter.
        size_t result = 0;
        absl::Status s = GetBinaryArrayTotalByteSize(*array, &result);
        if (!s.ok()) {
          throw std::runtime_error(s.ToString());
        }
        return result;
      },
      py::doc(
          "Returns the total byte size of a BinaryArray (note that StringArray "
          "is a subclass of that so is also accepted here), i.e. the length of "
          "the concatenation of all the binary strings in the list, in a "
          "Python Long."),
      py::call_guard<py::gil_scoped_release>());

  m.def(
      "IndexIn",
      [](const std::shared_ptr<arrow::Array>& values,
         const std::shared_ptr<arrow::Array>& value_set) {
        std::shared_ptr<arrow::Array> result;
        absl::Status s = IndexIn(values, value_set, &result);
        if (!s.ok()) {
          throw std::runtime_error(s.ToString());
        }
        return result;
      },
      py::doc(
          "IndexIn examines each slot in values against a value_set array.\n\n"
          "If the value is not found in value_set, null will be output. "
          "If found, the index of occurrence within value_set "
          "(ignoring duplicates) will be output.\n\n"
          "For example given values = [99, 42, 3, null] and "
          "value_set = [3, 99], the output will be = [1, null, 0, null]\n\n"
          "Note: Null in the values is considered to match "
          "a null in the value_set array. For example given "
          "values = [99, 42, 3, null] and value_set = [3, 99, null], "
          "the output will be = [1, null, 0, 2]\n\n"
          "TODO(b/203116559): IndexIn links against Apache Arrow code "
          "which has different behavior for duplicate elements in value_set. "
          "Version 2.0 would silently remove duplicates, version 4.0 "
          "disallowed them and from version 5.0 the index of the first element "
          "is used."),
      py::call_guard<py::gil_scoped_release>());

  // NOTE(review): the docstring below says `parent_indices` must be a numpy
  // array, but this binding accepts an arrow::Array -- confirm which is
  // intended and update the docstring accordingly.
  m.def(
      "MakeListArrayFromParentIndicesAndValues",
      [](size_t num_parents,
         const std::shared_ptr<arrow::Array>& parent_indices,
         const std::shared_ptr<arrow::Array>& values_array,
         const bool empty_list_as_null) {
        std::shared_ptr<arrow::Array> result;
        absl::Status s = MakeListArrayFromParentIndicesAndValues(
            num_parents, parent_indices, values_array, empty_list_as_null,
            &result);
        if (!s.ok()) {
          throw std::runtime_error(s.ToString());
        }
        return result;
      },
      py::doc(
          "Makes an Arrow LargeListArray from parent indices and values. "
          "For example, if num_parents = 6, parent_indices = [0, 1, 1, 3, 3] "
          "and `values_array` is (an arrow Array of) [0, 1, 2, 3, 4], then "
          "the result will be a LargeListArray of integers: "
          "[[0], [1, 2], <empty_list>, [3, 4], <empty_list>, <empty_list>] "
          "where <empty_list> is `null` if `empty_list_as_null` is true, or "
          "`[]` if false. `num_parents` must be a Python integer (int or long) "
          "and it must be greater than or equal to max(parent_indices) + 1. "
          "`parent_indices` must be a int64 1-D numpy array and the indices "
          "must be sorted in increasing order. "
          "`values_array` must be an arrow Array and its length must equal to "
          "the length of `parent_indices`."),
      py::call_guard<py::gil_scoped_release>());

  m.def(
      "CooFromListArray",
      [](const std::shared_ptr<arrow::Array>& list_array) {
        std::shared_ptr<arrow::Array> coo;
        std::shared_ptr<arrow::Array> dense_shape;
        absl::Status s = CooFromListArray(list_array, &coo, &dense_shape);
        if (!s.ok()) {
          throw std::runtime_error(s.ToString());
        }
        // Returned to Python as a 2-tuple: (coo_array, dense_shape_array).
        return std::make_pair(coo, dense_shape);
      },
      py::doc(
          "Converts a ListArray to a COO (coordinate list) represented sparse "
          "tensor.\n"
          "`list_array` should be a (Large)ListArray<InnerArray> where "
          "InnerArray is a (Large)ListArray<InnerArray> or any primitive array "
          "or binary array (i.e. nested lists are supported). \n"
          "Two arrays are produced: `coo_array` is an Int64Array that contains "
          "the coordinates of flattened values of `list_array`. If "
          "`list_array` is N-nested (ListArray<primitive> is 1-nested), each "
          "coordinate will contain N + 1 numbers. The coordinates are "
          "concatenated together. `dense_shape_array` is an Int64Array that "
          "contains the size of the bounding-box of `list_array`. Note that "
          "nulls and empty lists are not distinguished in the COO form."),
      py::call_guard<py::gil_scoped_release>());

  // Calls FillNullLists(list_array, fill_with, &result) and returns `result`.
  // The GIL is released here for consistency with the other bindings that
  // take the same argument types.
  // NOTE(review): presumably replaces null lists in `list_array` with
  // `fill_with`; confirm against the implementation before adding a py::doc.
  m.def(
      "FillNullLists",
      [](const std::shared_ptr<arrow::Array>& list_array,
         const std::shared_ptr<arrow::Array>& fill_with) {
        std::shared_ptr<arrow::Array> result;
        absl::Status s = FillNullLists(list_array, fill_with, &result);
        if (!s.ok()) {
          throw std::runtime_error(s.ToString());
        }
        return result;
      },
      py::call_guard<py::gil_scoped_release>());

  // Calls GetByteSize(*array, &result) and returns `result` as a Python int.
  // The GIL is released here for consistency with the other bindings that
  // take the same argument types.
  m.def(
      "GetByteSize",
      [](const std::shared_ptr<arrow::Array>& array) {
        // Initialized so that no indeterminate value can ever be observed,
        // regardless of how the callee treats its out-parameter.
        size_t result = 0;
        absl::Status s = GetByteSize(*array, &result);
        if (!s.ok()) {
          throw std::runtime_error(s.ToString());
        }
        return result;
      },
      py::call_guard<py::gil_scoped_release>());

  m.def(
      "CountInvalidUTF8",
      [](const std::shared_ptr<arrow::Array>& array) {
        absl::StatusOr<size_t> count_or = CountInvalidUtf8(*array);
        if (!count_or.ok()) {
          throw std::runtime_error(count_or.status().ToString());
        }
        return *count_or;
      },
      py::doc("Returns the count of invalid utf8 strings from a (large) string "
              "or binary array."),
      py::call_guard<py::gil_scoped_release>());
}