core/maxframe/dataframe/misc/apply.py

# Copyright 1999-2025 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
from typing import Any, Union

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

from ... import opcodes
from ...core import OutputType
from ...core.operator import OperatorLogicKeyGeneratorMixin
from ...serialization.serializables import (
    AnyField,
    BoolField,
    DictField,
    FunctionField,
    StringField,
    TupleField,
)
from ...utils import get_func_token, quiet_stdio, tokenize
from ..operators import DataFrameOperator, DataFrameOperatorMixin
from ..utils import (
    build_df,
    build_series,
    copy_func_scheduling_hints,
    make_dtype,
    make_dtypes,
    pack_func_args,
    parse_index,
    validate_axis,
    validate_output_types,
)


class ApplyOperandLogicKeyGeneratorMixin(OperatorLogicKeyGeneratorMixin):
    def _get_logic_key_token_values(self):
        token_values = super()._get_logic_key_token_values() + [
            self.axis,
            self.convert_dtype,
            self.raw,
            self.result_type,
            self.elementwise,
        ]
        if self.func:
            return token_values + [get_func_token(self.func)]
        else:  # pragma: no cover
            return token_values


class ApplyOperator(
    DataFrameOperator, DataFrameOperatorMixin, ApplyOperandLogicKeyGeneratorMixin
):
    _op_type_ = opcodes.APPLY

    func = FunctionField("func")
    axis = AnyField("axis", default=0)
    convert_dtype = BoolField("convert_dtype", default=True)
    raw = BoolField("raw", default=False)
    result_type = StringField("result_type", default=None)
    elementwise = BoolField("elementwise")
    logic_key = StringField("logic_key")
    need_clean_up_func = BoolField("need_clean_up_func")
    args = TupleField("args", default=())
    kwds = DictField("kwds", default={})

    def __init__(self, output_type=None, **kw):
        if output_type:
            kw["_output_types"] = [output_type]
        super().__init__(**kw)
        if hasattr(self, "func"):
            copy_func_scheduling_hints(self.func, self)

    def _update_key(self):
        values = [v for v in self._values_ if v is not self.func] + [
            get_func_token(self.func)
        ]
        self._obj_set("_key", tokenize(type(self).__name__, *values))
        return self

    def _infer_df_func_returns(self, df, dtypes, dtype=None, name=None, index=None):
        if isinstance(self.func, np.ufunc):
            output_type = OutputType.dataframe
            new_dtypes = None
            index_value = "inherit"
            new_elementwise = True
        else:
            if self.output_types is not None and (
                dtypes is not None or dtype is not None
            ):
                ret_dtypes = dtypes if dtypes is not None else (name, dtype)
                ret_index_value = parse_index(index) if index is not None else None
                self.elementwise = False
                return ret_dtypes, ret_index_value
            output_type = new_dtypes = index_value = None
            new_elementwise = False

        try:
            empty_df = build_df(df, size=2)
            with np.errstate(all="ignore"), quiet_stdio():
                infer_df = empty_df.apply(
                    self.func,
                    axis=self.axis,
                    raw=self.raw,
                    result_type=self.result_type,
                    args=self.args,
                    **self.kwds,
                )
            if index_value is None:
                if infer_df.index is empty_df.index:
                    index_value = "inherit"
                else:
                    index_value = parse_index(pd.RangeIndex(-1))
            if isinstance(infer_df, pd.DataFrame):
                output_type = output_type or OutputType.dataframe
                new_dtypes = new_dtypes or infer_df.dtypes
            else:
                output_type = output_type or OutputType.series
                new_dtypes = (name or infer_df.name, dtype or infer_df.dtype)
            new_elementwise = False if new_elementwise is None else new_elementwise
        except:  # noqa: E722  # nosec
            pass

        self.output_types = (
            [output_type]
            if not self.output_types and output_type
            else self.output_types
        )
        dtypes = new_dtypes if dtypes is None else dtypes
        index_value = index_value if index is None else parse_index(index)
        self.elementwise = (
            new_elementwise if self.elementwise is None else self.elementwise
        )
        return dtypes, index_value
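
    # A minimal sketch of the inference trick used above, in plain pandas and
    # independent of MaxFrame internals: apply ``func`` to a tiny stub frame
    # that shares the input's schema and read the result's dtypes. Functions
    # that branch on real values may raise on the stub, which is why the block
    # above swallows all exceptions and leaves callers to pass explicit
    # ``output_type``/``dtypes``:
    #
    #   >>> import pandas as pd
    #   >>> stub = pd.DataFrame({"a": [1, 2], "b": [3, 4]})  # like build_df(df, size=2)
    #   >>> stub.apply(lambda col: col.sum())
    #   a    3
    #   b    7
    #   dtype: int64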

    def _call_df_or_series(self, df):
        return self.new_df_or_series([df])

    def _call_dataframe(self, df, dtypes=None, dtype=None, name=None, index=None):
        # for backward compatibility
        dtype = dtype if dtype is not None else dtypes
        dtypes, index_value = self._infer_df_func_returns(
            df, dtypes, dtype=dtype, name=name, index=index
        )
        if index_value is None:
            index_value = parse_index(None, (df.key, df.index_value.key))
        for arg, desc in zip((self.output_types, dtypes), ("output_types", "dtypes")):
            if arg is None:
                raise TypeError(
                    f"Cannot determine {desc} by calculating with enumerate data, "
                    "please specify it as arguments"
                )
        if index_value == "inherit":
            index_value = df.index_value

        if self.elementwise:
            shape = df.shape
        elif self.output_types[0] == OutputType.dataframe:
            shape = [np.nan, np.nan]
            shape[1 - self.axis] = df.shape[1 - self.axis]
            if self.axis == 1:
                shape[1] = len(dtypes)
            shape = tuple(shape)
        else:
            shape = (df.shape[1 - self.axis],)

        if self.output_types[0] == OutputType.dataframe:
            if self.axis == 0:
                return self.new_dataframe(
                    [df],
                    shape=shape,
                    dtypes=dtypes,
                    index_value=index_value,
                    columns_value=parse_index(dtypes.index, store_data=True),
                )
            else:
                return self.new_dataframe(
                    [df],
                    shape=shape,
                    dtypes=dtypes,
                    index_value=df.index_value,
                    columns_value=parse_index(dtypes.index, store_data=True),
                )
        else:
            name, dtype = dtypes
            return self.new_series(
                [df], shape=shape, name=name, dtype=dtype, index_value=index_value
            )

    def _call_series(self, series, dtypes=None, dtype=None, name=None, index=None):
        # for backward compatibility
        dtype = dtype if dtype is not None else dtypes

        if self.convert_dtype:
            if self.output_types is not None and (
                dtypes is not None or dtype is not None
            ):
                infer_series = test_series = None
            else:
                test_series = build_series(series, size=2, name=series.name)
                try:
                    with np.errstate(all="ignore"), quiet_stdio():
                        infer_series = test_series.apply(
                            self.func, args=self.args, **self.kwds
                        )
                except:  # noqa: E722  # nosec  # pylint: disable=bare-except
                    infer_series = None

            output_type = self._output_types[0]

            if index is not None:
                index_value = parse_index(index)
            elif infer_series is not None:
                if infer_series.index is test_series.index:
                    index_value = series.index_value
                else:  # pragma: no cover
                    index_value = parse_index(infer_series.index)
            else:
                index_value = parse_index(series.index_value)

            if output_type == OutputType.dataframe:
                if dtypes is None:
                    if infer_series is not None and infer_series.ndim == 2:
                        dtypes = infer_series.dtypes
                    else:
                        raise TypeError(
                            "Cannot determine dtypes, "
                            "please specify `dtypes` as argument"
                        )
                columns_value = parse_index(dtypes.index, store_data=True)
                return self.new_dataframe(
                    [series],
                    shape=(series.shape[0], len(dtypes)),
                    index_value=index_value,
                    columns_value=columns_value,
                    dtypes=dtypes,
                )
            else:
                if (
                    dtype is None
                    and infer_series is not None
                    and infer_series.ndim == 1
                ):
                    dtype = infer_series.dtype
                else:
                    dtype = dtype if dtype is not None else np.dtype(object)
                if infer_series is not None and infer_series.ndim == 1:
                    name = name or infer_series.name
                return self.new_series(
                    [series],
                    dtype=dtype,
                    shape=series.shape,
                    index_value=index_value,
                    name=name,
                )
        else:
            dtype = dtype if dtype is not None else np.dtype("object")
            return self.new_series(
                [series],
                dtype=dtype,
                shape=series.shape,
                index_value=series.index_value,
                name=name,
            )
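
    # Note on the two call paths above: for series output,
    # ``_infer_df_func_returns`` overloads its ``dtypes`` return value to carry
    # a ``(name, dtype)`` tuple rather than a pandas Series of dtypes, which is
    # what the ``name, dtype = dtypes`` unpacking at the end of
    # ``_call_dataframe`` relies on. Illustrative values only:
    #
    #   dtypes = ("row_sum", np.dtype("int64"))                # series output
    #   dtypes = pd.Series([np.dtype("int64")], index=["a"])   # dataframe output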

    def __call__(self, df_or_series, dtypes=None, dtype=None, name=None, index=None):
        axis = getattr(self, "axis", None) or 0
        dtypes = make_dtypes(dtypes)
        dtype = make_dtype(dtype)
        self.axis = validate_axis(axis, df_or_series)
        if self.output_types and self.output_types[0] == OutputType.df_or_series:
            return self._call_df_or_series(df_or_series)
        if df_or_series.op.output_types[0] == OutputType.dataframe:
            return self._call_dataframe(
                df_or_series, dtypes=dtypes, dtype=dtype, name=name, index=index
            )
        else:
            return self._call_series(
                df_or_series, dtypes=dtypes, dtype=dtype, name=name, index=index
            )

    def _build_stub_pandas_obj(self, df_or_series) -> Union[DataFrame, Series]:
        if self.output_types[0] == OutputType.dataframe:
            return build_df(df_or_series, size=2)
        return build_series(df_or_series, size=2, name=df_or_series.name)

    def get_packed_funcs(self, df=None) -> Any:
        stub_df = self._build_stub_pandas_obj(df or self.inputs[0])
        return pack_func_args(stub_df, self.func, *self.args, **self.kwds)
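
# ``get_packed_funcs`` above binds ``args``/``kwds`` to ``func`` against a stub
# pandas object so that downstream execution receives a single ready-to-call
# callable. A loose sketch of the plain-Python equivalent, assuming
# ``pack_func_args`` acts like ``functools.partial`` in the simple case (a
# simplification; the real helper also consults the stub object):
#
#   >>> import functools
#   >>> packed = functools.partial(lambda x, offset: x + offset, offset=5)
#   >>> packed(10)
#   15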


def df_apply(
    df,
    func,
    axis=0,
    raw=False,
    result_type=None,
    args=(),
    dtypes=None,
    dtype=None,
    name=None,
    output_type=None,
    index=None,
    elementwise=None,
    skip_infer=False,
    **kwds,
):
    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/50
    """
    Apply a function along an axis of the DataFrame.

    Objects passed to the function are Series objects whose index is
    either the DataFrame's index (``axis=0``) or the DataFrame's columns
    (``axis=1``).

    By default (``result_type=None``), the final return type is inferred
    from the return type of the applied function. Otherwise, it depends
    on the `result_type` argument.

    Parameters
    ----------
    func : function
        Function to apply to each column or row.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis along which the function is applied:

        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.

    raw : bool, default False
        Determines if row or column is passed as a Series or ndarray object:

        * ``False`` : passes each row or column as a Series to the function.
        * ``True`` : the passed function will receive ndarray objects instead.
          If you are just applying a NumPy reduction function this will
          achieve much better performance.

    result_type : {'expand', 'reduce', 'broadcast', None}, default None
        These only act when ``axis=1`` (columns):

        * 'expand' : list-like results will be turned into columns.
        * 'reduce' : returns a Series if possible rather than expanding
          list-like results. This is the opposite of 'expand'.
        * 'broadcast' : results will be broadcast to the original shape
          of the DataFrame, the original index and columns will be retained.

        The default behaviour (None) depends on the return value of the
        applied function: list-like results will be returned as a Series
        of those. However if the apply function returns a Series these
        are expanded to columns.
    output_type : {'dataframe', 'series'}, default None
        Specify type of returned object. See `Notes` for more details.
    dtypes : Series, default None
        Specify dtypes of returned DataFrames. See `Notes` for more details.
    dtype : numpy.dtype, default None
        Specify dtype of returned Series. See `Notes` for more details.
    name : str, default None
        Specify name of returned Series. See `Notes` for more details.
    index : Index, default None
        Specify index of returned object. See `Notes` for more details.
    elementwise : bool, default False
        Specify whether ``func`` is an elementwise function:

        * ``False`` : The function is not elementwise. MaxFrame will try
          concatenating chunks in rows (when ``axis=0``) or in columns
          (when ``axis=1``) and then apply ``func`` onto the concatenated
          chunk. The concatenation step can cause extra latency.
        * ``True`` : The function is elementwise. MaxFrame will apply
          ``func`` to original chunks. This will not introduce an extra
          concatenation step and reduces overhead.

    skip_infer : bool, default False
        Whether to skip inferring dtypes when ``dtypes`` or ``output_type``
        is not specified.
    args : tuple
        Positional arguments to pass to `func` in addition to the
        array/series.
    **kwds
        Additional keyword arguments to pass as keyword arguments to `func`.

    Returns
    -------
    Series or DataFrame
        Result of applying ``func`` along the given axis of the DataFrame.

    See Also
    --------
    DataFrame.applymap : For elementwise operations.
    DataFrame.aggregate : Only perform aggregating type operations.
    DataFrame.transform : Only perform transforming type operations.

    Notes
    -----
    When deciding output dtypes and shape of the return value, MaxFrame will
    try applying ``func`` onto a mock DataFrame, and the apply call may fail.
    When this happens, you need to specify the type of apply call
    (DataFrame or Series) in output_type.

    * For DataFrame output, you need to specify a list or a pandas Series
      as ``dtypes`` of output DataFrame. ``index`` of output can also be
      specified.
    * For Series output, you need to specify ``dtype`` and ``name`` of
      output Series.
    * For any input with data type ``pandas.ArrowDtype(pyarrow.MapType)``,
      it will always be converted to a Python dict. And for any output with
      this data type, it must be returned as a Python dict as well.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> import maxframe.tensor as mt
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
    >>> df.execute()
       A  B
    0  4  9
    1  4  9
    2  4  9

    Using a reducing function on either axis

    >>> df.apply(np.sum, axis=0).execute()
    A    12
    B    27
    dtype: int64

    >>> df.apply(lambda row: int(np.sum(row)), axis=1).execute()
    0    13
    1    13
    2    13
    dtype: int64

    Passing ``result_type='expand'`` will expand list-like results to
    columns of a DataFrame

    >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand').execute()
       0  1
    0  1  2
    1  1  2
    2  1  2

    Returning a Series inside the function is similar to passing
    ``result_type='expand'``. The resulting column names will be the
    Series index.

    >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1).execute()
       foo  bar
    0    1    2
    1    1    2
    2    1    2

    Passing ``result_type='broadcast'`` will ensure the same shape result,
    whether list-like or scalar is returned by the function, and broadcast
    it along the axis. The resulting column names will be the originals.

    >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast').execute()
       A  B
    0  1  2
    1  1  2
    2  1  2

    Create a dataframe with a map type.

    >>> import pyarrow as pa
    >>> from maxframe.lib.dtypes_extension import dict_
    >>> col_a = pd.Series(
    ...     data=[[("k1", 1), ("k2", 2)], [("k1", 3)], None],
    ...     index=[1, 2, 3],
    ...     dtype=dict_(pa.string(), pa.int64()),
    ... )
    >>> col_b = pd.Series(
    ...     data=["A", "B", "C"],
    ...     index=[1, 2, 3],
    ... )
    >>> df = md.DataFrame({"A": col_a, "B": col_b})
    >>> df.execute()
                            A  B
    1  [('k1', 1), ('k2', 2)]  A
    2             [('k1', 3)]  B
    3                    <NA>  C

    Define a function that updates the map type with a new key-value pair.

    >>> def custom_set_item(x):
    ...     if x["A"] is not None:
    ...         x["A"]["k2"] = 10
    ...     return x
    >>> df.apply(
    ...     custom_set_item,
    ...     axis=1,
    ...     output_type="dataframe",
    ...     dtypes=df.dtypes.copy(),
    ... ).execute()
                             A  B
    1  [('k1', 1), ('k2', 10)]  A
    2  [('k1', 3), ('k2', 10)]  B
    3                     <NA>  C
    """
    if isinstance(func, (list, dict)):
        return df.aggregate(func, axis)

    output_types = kwds.pop("output_types", None)
    object_type = kwds.pop("object_type", None)
    output_types = validate_output_types(
        output_type=output_type, output_types=output_types, object_type=object_type
    )
    output_type = output_types[0] if output_types else None
    if skip_infer and output_type is None:
        output_type = OutputType.df_or_series

    # calling member function
    if isinstance(func, str):
        func = getattr(df, func)
        sig = inspect.getfullargspec(func)
        if "axis" in sig.args:
            kwds["axis"] = axis
        return func(*args, **kwds)

    op = ApplyOperator(
        func=func,
        axis=axis,
        raw=raw,
        result_type=result_type,
        args=args,
        kwds=kwds,
        output_type=output_type,
        elementwise=elementwise,
    )
    return op(df, dtypes=dtypes, dtype=dtype, name=name, index=index)
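
# When schema inference fails (``func`` needs real values, performs I/O,
# etc.), the docstring above directs callers to pass ``output_type`` together
# with ``dtypes`` (or ``dtype``/``name``). A minimal sketch, assuming a
# working MaxFrame session; the frame, function and dtypes below are
# illustrative only:
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> import maxframe.dataframe as md
#   >>> df = md.DataFrame({"a": [1, 2], "b": [3, 4]})
#   >>> df.apply(lambda row: row * 2, axis=1, output_type="dataframe",
#   ...          dtypes=pd.Series([np.dtype("int64")] * 2, index=["a", "b"]))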


def series_apply(
    series,
    func,
    convert_dtype=True,
    output_type=None,
    args=(),
    dtypes=None,
    dtype=None,
    name=None,
    index=None,
    skip_infer=False,
    **kwds,
):
    """
    Invoke function on values of Series.

    Can be ufunc (a NumPy function that applies to the entire Series)
    or a Python function that only works on single values.

    Parameters
    ----------
    func : function
        Python function or NumPy ufunc to apply.
    convert_dtype : bool, default True
        Try to find better dtype for elementwise function results. If
        False, leave as dtype=object.
    output_type : {'dataframe', 'series'}, default None
        Specify type of returned object. See `Notes` for more details.
    dtypes : Series, default None
        Specify dtypes of returned DataFrames. See `Notes` for more details.
    dtype : numpy.dtype, default None
        Specify dtype of returned Series. See `Notes` for more details.
    name : str, default None
        Specify name of returned Series. See `Notes` for more details.
    index : Index, default None
        Specify index of returned object. See `Notes` for more details.
    args : tuple
        Positional arguments passed to func after the series value.
    skip_infer : bool, default False
        Whether to skip inferring dtypes when ``dtypes`` or ``output_type``
        is not specified.
    **kwds
        Additional keyword arguments passed to func.

    Returns
    -------
    Series or DataFrame
        If func returns a Series object the result will be a DataFrame.

    See Also
    --------
    Series.map : For element-wise operations.
    Series.agg : Only perform aggregating type operations.
    Series.transform : Only perform transforming type operations.

    Notes
    -----
    When deciding output dtypes and shape of the return value, MaxFrame will
    try applying ``func`` onto a mock Series, and the apply call may fail.
    When this happens, you need to specify the type of apply call
    (DataFrame or Series) in output_type.

    * For DataFrame output, you need to specify a list or a pandas Series
      as ``dtypes`` of output DataFrame. ``index`` of output can also be
      specified.
    * For Series output, you need to specify ``dtype`` and ``name`` of
      output Series.
    * For any input with data type ``pandas.ArrowDtype(pyarrow.MapType)``,
      it will always be converted to a Python dict. And for any output with
      this data type, it must be returned as a Python dict as well.

    Examples
    --------
    Create a series with typical summer temperatures for each city.

    >>> import maxframe.tensor as mt
    >>> import maxframe.dataframe as md
    >>> s = md.Series([20, 21, 12],
    ...               index=['London', 'New York', 'Helsinki'])
    >>> s.execute()
    London      20
    New York    21
    Helsinki    12
    dtype: int64

    Square the values by defining a function and passing it as an
    argument to ``apply()``.

    >>> def square(x):
    ...     return x ** 2
    >>> s.apply(square).execute()
    London      400
    New York    441
    Helsinki    144
    dtype: int64

    Square the values by passing an anonymous function as an
    argument to ``apply()``.

    >>> s.apply(lambda x: x ** 2).execute()
    London      400
    New York    441
    Helsinki    144
    dtype: int64

    Define a custom function that needs additional positional
    arguments and pass these additional arguments using the
    ``args`` keyword.

    >>> def subtract_custom_value(x, custom_value):
    ...     return x - custom_value
    >>> s.apply(subtract_custom_value, args=(5,)).execute()
    London      15
    New York    16
    Helsinki     7
    dtype: int64

    Define a custom function that takes keyword arguments
    and pass these arguments to ``apply``.

    >>> def add_custom_values(x, **kwargs):
    ...     for month in kwargs:
    ...         x += kwargs[month]
    ...     return x
    >>> s.apply(add_custom_values, june=30, july=20, august=25).execute()
    London      95
    New York    96
    Helsinki    87
    dtype: int64

    Create a series with a map type.

    >>> import pyarrow as pa
    >>> from maxframe.lib.dtypes_extension import dict_
    >>> s = md.Series(
    ...     data=[[("k1", 1), ("k2", 2)], [("k1", 3)], None],
    ...     index=[1, 2, 3],
    ...     dtype=dict_(pa.string(), pa.int64()),
    ... )
    >>> s.execute()
    1    [('k1', 1), ('k2', 2)]
    2               [('k1', 3)]
    3                      <NA>
    dtype: map<string, int64>[pyarrow]

    Define a function that updates the map type with a new key-value pair.

    >>> def custom_set_item(x):
    ...     if x is not None:
    ...         x["k2"] = 10
    ...     return x
    >>> s.apply(
    ...     custom_set_item,
    ...     output_type="series",
    ...     dtype=dict_(pa.string(), pa.int64()),
    ... ).execute()
    1    [('k1', 1), ('k2', 10)]
    2    [('k1', 3), ('k2', 10)]
    3                       <NA>
    dtype: map<string, int64>[pyarrow]
    """
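    # Dispatch summary for the branches below: list/dict funcs are routed to
    # ``Series.aggregate``; string funcs are resolved first as a member of the
    # series object, then as a NumPy function (so, presumably, ``s.apply("sum")``
    # ends up calling ``s.sum()`` while a name the series lacks falls back to
    # ``numpy``); any other callable is wrapped in an ApplyOperator.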
    if isinstance(func, (list, dict)):
        return series.aggregate(func)

    # calling member function
    if isinstance(func, str):
        func_body = getattr(series, func, None)
        if func_body is not None:
            return func_body(*args, **kwds)
        func_str = func
        func = getattr(np, func_str, None)
        if func is None:
            raise AttributeError(
                f"{func_str!r} is not a valid function "
                f"for '{type(series).__name__}' object"
            )

    if skip_infer and output_type is None:
        output_type = OutputType.df_or_series

    output_types = kwds.pop("output_types", None)
    object_type = kwds.pop("object_type", None)
    output_types = validate_output_types(
        output_type=output_type, output_types=output_types, object_type=object_type
    )
    output_type = output_types[0] if output_types else OutputType.series

    op = ApplyOperator(
        func=func,
        convert_dtype=convert_dtype,
        args=args,
        kwds=kwds,
        output_type=output_type,
    )
    return op(series, dtypes=dtypes, dtype=dtype, name=name, index=index)