core/maxframe/dataframe/indexing/iloc.py

# Copyright 1999-2025 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from numbers import Integral

import numpy as np
import pandas as pd
from pandas.core.dtypes.cast import find_common_type
from pandas.core.indexing import IndexingError

from ... import opcodes
from ...config import options
from ...core import ENTITY_TYPE, OutputType
from ...serialization.serializables import AnyField, KeyField, ListField
from ...tensor import asarray
from ...tensor.indexing.core import calc_shape
from ..operators import DATAFRAME_TYPE, DataFrameOperator, DataFrameOperatorMixin
from ..utils import indexing_index_value

_ILOC_ERROR_MSG = (
    "Location based indexing can only have [integer, "
    "integer slice (START point is INCLUDED, END point is EXCLUDED), "
    "listlike of integers, boolean array] types"
)


def process_iloc_indexes(inp, indexes):
    ndim = inp.ndim

    if not isinstance(indexes, tuple):
        indexes = (indexes,)
    if len(indexes) < ndim:
        indexes += (slice(None),) * (ndim - len(indexes))
    if len(indexes) > ndim:
        raise IndexingError("Too many indexers")

    new_indexes = []
    # check each index
    for ax, index in enumerate(indexes):
        if isinstance(index, tuple):
            # a tuple should already have been caught by this point
            # so don't treat a tuple as a valid indexer
            raise IndexingError("Too many indexers")
        elif isinstance(index, slice):
            if any(v is not None for v in [index.start, index.stop, index.step]):
                pd_index = (
                    inp.index_value if ax == 0 else inp.columns_value
                ).to_pandas()
                for val in [index.start, index.stop, index.step]:
                    if val is not None:
                        try:
                            pd_index[val]  # check against the pandas index
                        except IndexError:
                            pass
                        except TypeError:
                            raise TypeError(
                                f"cannot do slice indexing on {type(pd_index)} "
                                f"with these indexers [{val}] of {type(val)}"
                            )
            new_indexes.append(index)
        elif isinstance(index, (list, np.ndarray, pd.Series, ENTITY_TYPE)):
            if not isinstance(index, ENTITY_TYPE):
                index = np.asarray(index)
            else:
                index = asarray(index)
                if ax == 1:
                    # do not support tensor index on axis 1,
                    # because the dtypes and columns_value would be unknown
                    try:
                        index = index.fetch()
                    except (RuntimeError, ValueError):
                        raise NotImplementedError(
                            "indexer on axis columns cannot be non-executed tensor"
                        )
            if index.dtype != np.bool_:
                index = index.astype(np.int64)
            if index.ndim != 1:
                raise ValueError(
                    "Buffer has wrong number of dimensions "
                    f"(expected 1, got {index.ndim})"
                )
            new_indexes.append(index)
        elif isinstance(index, Integral):
            shape = inp.shape[ax]
            if not np.isnan(shape):
                if index < -shape or index >= shape:
                    raise IndexError("single positional indexer is out-of-bounds")
            new_indexes.append(index)
        else:
            raise ValueError(_ILOC_ERROR_MSG)

    return new_indexes


class DataFrameIloc:
    def __init__(self, obj):
        self._obj = obj

    def __getitem__(self, indexes):
        if isinstance(self._obj, DATAFRAME_TYPE):
            op = DataFrameIlocGetItem(indexes=process_iloc_indexes(self._obj, indexes))
        else:
            op = SeriesIlocGetItem(indexes=process_iloc_indexes(self._obj, indexes))
        return op(self._obj)

    def __setitem__(self, indexes, value):
        if not np.isscalar(value):
            raise NotImplementedError("Only scalar value is supported to set by iloc")

        if isinstance(self._obj, DATAFRAME_TYPE):
            op = DataFrameIlocSetItem(
                indexes=process_iloc_indexes(self._obj, indexes), value=value
            )
        else:
            op = SeriesIlocSetItem(
                indexes=process_iloc_indexes(self._obj, indexes), value=value
            )

        ret = op(self._obj)
        self._obj.data = ret.data
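
# --- Illustrative sketch (not part of the original module) --------------------
# How ``process_iloc_indexes`` normalizes indexers for a 2-D DataFrame; the
# inputs below are hypothetical:
#
#   df.iloc[1]            # -> [1, slice(None, None, None)]: missing axes padded with ":"
#   df.iloc[1, [0, 2]]    # -> [1, array([0, 2])]: list-likes become 1-D int64/bool arrays
#   df.iloc[1, 2, 3]      # -> IndexingError("Too many indexers")
#   df.iloc["a"]          # -> ValueError(_ILOC_ERROR_MSG): labels are rejected
#   df.iloc[0, 0] = [1]   # -> NotImplementedError: only scalar values can be assigned
# -------------------------------------------------------------------------------
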
class HeadTailOptimizedOperatorMixin(DataFrameOperatorMixin):
    __slots__ = ()

    @classmethod
    def _is_head(cls, index0):
        return (
            (index0.start is None or index0.start == 0)
            and index0.stop is not None
            and index0.stop > 0
        )

    @classmethod
    def _is_tail(cls, index0):
        return index0.start is not None and index0.start < 0 and index0.stop is None

    @classmethod
    def _is_indexes_head_or_tail(cls, indexes):
        index0 = indexes[0]
        if not isinstance(index0, slice):
            # has to be a slice
            return False
        if index0.step is not None and index0.step != 1:
            return False
        if len(indexes) == 2:
            if not isinstance(indexes[1], slice):
                return False
            if indexes[1] != slice(None):
                return False
        if cls._is_tail(index0):
            # tail
            return True
        if cls._is_head(index0):
            # head
            return True
        return False

    def can_be_optimized(self):
        return (
            self._is_indexes_head_or_tail(self.indexes)
            and self._is_head(self.indexes[0])
            and self.indexes[0].stop <= options.optimize.head_optimize_threshold
        )
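
# --- Illustrative sketch (not part of the original module) --------------------
# Which positional slices the mixin above classifies as head/tail, assuming any
# second indexer is a bare ":"; the threshold option comes from
# ``can_be_optimized`` above:
#
#   df.iloc[:5]    # slice(None, 5)    -> head; optimizable if 5 <= options.optimize.head_optimize_threshold
#   df.iloc[0:5]   # slice(0, 5)       -> head
#   df.iloc[-5:]   # slice(-5, None)   -> tail (recognized, but can_be_optimized() only covers head)
#   df.iloc[:5:2]  # slice(None, 5, 2) -> neither; step must be 1 or None
# -------------------------------------------------------------------------------
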
class DataFrameIlocGetItem(DataFrameOperator, HeadTailOptimizedOperatorMixin):
    _op_type_ = opcodes.DATAFRAME_ILOC_GETITEM

    _input = KeyField("input")
    indexes = ListField("indexes", default=None)

    def __init__(self, gpu=None, sparse=False, output_types=None, **kw):
        super().__init__(gpu=gpu, sparse=sparse, _output_types=output_types, **kw)
        if not self.output_types:
            self.output_types = [OutputType.dataframe]

    @property
    def input(self):
        return self._input

    def _set_inputs(self, inputs):
        super()._set_inputs(inputs)
        inputs_iter = iter(self._inputs)
        self._input = next(inputs_iter)
        indexes = []
        for index in self.indexes:
            if isinstance(index, ENTITY_TYPE):
                indexes.append(next(inputs_iter))
            else:
                indexes.append(index)
        self.indexes = indexes

    def __call__(self, df):
        # Note [Fancy Index of Numpy and Pandas]
        #
        # NumPy and pandas ``.iloc`` have different semantics when processing a fancy index:
        #
        # >>> np.ones((3,3))[[1,2],[1,2]]
        # array([1., 1.])
        #
        # >>> pd.DataFrame(np.ones((3,3))).iloc[[1,2],[1,2]]
        #      1    2
        # 1  1.0  1.0
        # 2  1.0  1.0
        #
        # Thus, we process the indexes along the two axes of the DataFrame separately.
        shape0 = tuple(calc_shape((df.shape[0],), (self.indexes[0],)))
        shape1 = tuple(calc_shape((df.shape[1],), (self.indexes[1],)))

        inputs = [df] + [
            index for index in self.indexes if isinstance(index, ENTITY_TYPE)
        ]

        # NB: pandas only compresses the result to a Series when the index on one of the axes is integral
        if isinstance(self.indexes[1], Integral):
            shape = shape0
            dtype = df.dtypes.iloc[self.indexes[1]]
            index_value = indexing_index_value(df.index_value, self.indexes[0])
            if isinstance(self.indexes[0], Integral):
                # scalar
                return self.new_scalar(inputs, dtype=dtype)
            else:
                return self.new_series(
                    inputs,
                    shape=shape,
                    dtype=dtype,
                    index_value=index_value,
                    name=df.dtypes.index[self.indexes[1]],
                )
        elif isinstance(self.indexes[0], Integral):
            shape = shape1
            dtype = find_common_type(list(df.dtypes.iloc[self.indexes[1]].values))
            index_value = indexing_index_value(df.columns_value, self.indexes[1])
            return self.new_series(
                inputs, shape=shape, dtype=dtype, index_value=index_value
            )
        else:
            return self.new_dataframe(
                inputs,
                shape=shape0 + shape1,
                dtypes=df.dtypes.iloc[self.indexes[1]],
                index_value=indexing_index_value(df.index_value, self.indexes[0]),
                columns_value=indexing_index_value(
                    df.columns_value, self.indexes[1], store_data=True
                ),
            )
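
# --- Illustrative sketch (not part of the original module) --------------------
# Output types produced by ``DataFrameIlocGetItem.__call__`` above, mirroring the
# pandas compression rules it implements:
#
#   df.iloc[1, 2]       # -> scalar     (both indexers integral)
#   df.iloc[:, 2]       # -> Series     (integral column indexer; named after that column)
#   df.iloc[1, :]       # -> Series     (integral row indexer; dtype is the common dtype of the selected columns)
#   df.iloc[[1, 2], :]  # -> DataFrame  (no integral indexer)
# -------------------------------------------------------------------------------
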
class SeriesIlocGetItem(DataFrameOperator, HeadTailOptimizedOperatorMixin):
    _op_module_ = "series"
    _op_type_ = opcodes.DATAFRAME_ILOC_GETITEM

    _input = KeyField("input")
    indexes = ListField("indexes", default=None)

    def __init__(self, gpu=None, sparse=False, output_types=None, **kw):
        super().__init__(gpu=gpu, sparse=sparse, _output_types=output_types, **kw)
        if not self.output_types:
            self.output_types = [OutputType.series]

    @property
    def input(self):
        return self._input

    def _set_inputs(self, inputs):
        super()._set_inputs(inputs)
        inputs_iter = iter(self._inputs)
        self._input = next(inputs_iter)
        indexes = []
        for index in self.indexes:
            if isinstance(index, ENTITY_TYPE):
                indexes.append(next(inputs_iter))
            else:
                indexes.append(index)
        self.indexes = indexes

    def __call__(self, series):
        if isinstance(self.indexes[0], Integral):
            return self.new_scalar([series], dtype=series.dtype)
        else:
            shape = tuple(calc_shape(series.shape, self.indexes))
            index_value = indexing_index_value(series.index_value, self.indexes[0])
            inputs = [series] + [
                index for index in self.indexes if isinstance(index, ENTITY_TYPE)
            ]
            return self.new_series(
                inputs,
                shape=shape,
                dtype=series.dtype,
                index_value=index_value,
                name=series.name,
            )


class IndexIlocGetItem(DataFrameOperator, DataFrameOperatorMixin):
    _op_module_ = "index"
    _op_type_ = opcodes.DATAFRAME_ILOC_GETITEM

    _input = KeyField("input")
    indexes = ListField("indexes", default=None)

    def __init__(self, gpu=None, sparse=False, output_types=None, **kw):
        super().__init__(gpu=gpu, sparse=sparse, _output_types=output_types, **kw)
        if not self.output_types:
            self.output_types = [OutputType.index]

    @property
    def input(self):
        return self._input

    def _set_inputs(self, inputs):
        super()._set_inputs(inputs)
        inputs_iter = iter(self._inputs)
        self._input = next(inputs_iter)
        indexes = []
        for index in self.indexes:
            if isinstance(index, ENTITY_TYPE):
                indexes.append(next(inputs_iter))
            else:
                indexes.append(index)
        self.indexes = indexes

    def __call__(self, idx):
        if isinstance(self.indexes[0], Integral):
            return self.new_scalar([idx], dtype=idx.dtype)
        else:
            shape = tuple(calc_shape(idx.shape, self.indexes))
            index_value = indexing_index_value(idx.index_value, self.indexes[0])
            inputs = [idx] + [
                index for index in self.indexes if isinstance(index, ENTITY_TYPE)
            ]
            return self.new_index(
                inputs,
                shape=shape,
                dtype=idx.dtype,
                index_value=index_value,
                name=idx.name,
            )


class DataFrameIlocSetItem(DataFrameOperator, DataFrameOperatorMixin):
    _op_type_ = opcodes.DATAFRAME_ILOC_SETITEM

    indexes = ListField("indexes", default=None)
    value = AnyField("value", default=None)

    def __init__(self, gpu=None, sparse=False, output_types=None, **kw):
        super().__init__(
            gpu=gpu,
            sparse=sparse,
            _output_types=output_types,
            **kw,
        )
        if not self.output_types:
            self.output_types = [OutputType.dataframe]

    def __call__(self, df):
        return self.new_dataframe(
            [df],
            shape=df.shape,
            dtypes=df.dtypes,
            index_value=df.index_value,
            columns_value=df.columns_value,
        )


class SeriesIlocSetItem(DataFrameOperator, DataFrameOperatorMixin):
    _op_module_ = "series"
    _op_type_ = opcodes.DATAFRAME_ILOC_SETITEM

    indexes = ListField("indexes", default=None)
    value = AnyField("value", default=None)

    def __init__(self, gpu=None, sparse=False, **kw):
        super().__init__(
            gpu=gpu,
            sparse=sparse,
            _output_types=[OutputType.series],
            **kw,
        )

    def __call__(self, series):
        return self.new_series(
            [series],
            shape=series.shape,
            dtype=series.dtype,
            index_value=series.index_value,
            name=series.name,
        )


def index_getitem(idx, indexes):
    op = IndexIlocGetItem(indexes=process_iloc_indexes(idx, indexes))
    return op(idx)


def index_setitem(_idx, *_):
    raise TypeError("Index does not support mutable operations")


def iloc(a):
    return DataFrameIloc(a)
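
# --- Illustrative sketch (not part of the original module) --------------------
# Setting values through iloc builds a *SetItem op whose output keeps the owner's
# shape and metadata, and ``DataFrameIloc.__setitem__`` rebinds the owner's data
# to that output; Index objects stay immutable. ``df``, ``s`` and ``idx`` below
# are hypothetical:
#
#   df.iloc[0, 1] = 3.14   # DataFrameIlocSetItem; df.data is replaced with the op result
#   s.iloc[2:5] = 0        # SeriesIlocSetItem
#   index_setitem(idx)     # -> TypeError("Index does not support mutable operations")
# -------------------------------------------------------------------------------
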
def head(a, n=5):
    """
    Return the first `n` rows.

    This function returns the first `n` rows for the object based
    on position. It is useful for quickly testing if your object
    has the right type of data in it.

    For negative values of `n`, this function returns all rows except
    the last `n` rows, equivalent to ``df[:-n]``.

    Parameters
    ----------
    n : int, default 5
        Number of rows to select.

    Returns
    -------
    same type as caller
        The first `n` rows of the caller object.

    See Also
    --------
    DataFrame.tail: Returns the last `n` rows.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
    ...                               'monkey', 'parrot', 'shark', 'whale', 'zebra']})
    >>> df.execute()
          animal
    0  alligator
    1        bee
    2     falcon
    3       lion
    4     monkey
    5     parrot
    6      shark
    7      whale
    8      zebra

    Viewing the first 5 lines

    >>> df.head().execute()
          animal
    0  alligator
    1        bee
    2     falcon
    3       lion
    4     monkey

    Viewing the first `n` lines (three in this case)

    >>> df.head(3).execute()
          animal
    0  alligator
    1        bee
    2     falcon

    For negative values of `n`

    >>> df.head(-3).execute()
          animal
    0  alligator
    1        bee
    2     falcon
    3       lion
    4     monkey
    5     parrot
    """
    return DataFrameIloc(a)[0:n]


def tail(a, n=5):
    """
    Return the last `n` rows.

    This function returns the last `n` rows from the object based on
    position. It is useful for quickly verifying data, for example,
    after sorting or appending rows.

    For negative values of `n`, this function returns all rows except
    the first `n` rows, equivalent to ``df[n:]``.

    Parameters
    ----------
    n : int, default 5
        Number of rows to select.

    Returns
    -------
    type of caller
        The last `n` rows of the caller object.

    See Also
    --------
    DataFrame.head : The first `n` rows of the caller object.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
    ...                               'monkey', 'parrot', 'shark', 'whale', 'zebra']})
    >>> df.execute()
          animal
    0  alligator
    1        bee
    2     falcon
    3       lion
    4     monkey
    5     parrot
    6      shark
    7      whale
    8      zebra

    Viewing the last 5 lines

    >>> df.tail().execute()
       animal
    4  monkey
    5  parrot
    6   shark
    7   whale
    8   zebra

    Viewing the last `n` lines (three in this case)

    >>> df.tail(3).execute()
       animal
    6   shark
    7   whale
    8   zebra

    For negative values of `n`

    >>> df.tail(-3).execute()
       animal
    3    lion
    4  monkey
    5  parrot
    6   shark
    7   whale
    8   zebra
    """
    return DataFrameIloc(a)[-n:]
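
# --- Illustrative usage sketch (not part of the original module) --------------
# ``head`` and ``tail`` are thin wrappers over positional slicing, so the calls
# below are equivalent pairs (``df`` is a hypothetical maxframe DataFrame):
#
#   df.head(3)  ==  DataFrameIloc(df)[0:3]   # i.e. df.iloc[0:3]
#   df.tail(3)  ==  DataFrameIloc(df)[-3:]   # i.e. df.iloc[-3:]
# -------------------------------------------------------------------------------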