# Copyright 1999-2025 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
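"""drop_duplicates implementations for MaxFrame DataFrame, Series and Index."""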

import numpy as np
import pandas as pd

from ... import opcodes
from ...serialization.serializables import BoolField
from ..operators import OutputType
from ..utils import gen_unknown_index_value, parse_index
from ._duplicate import DuplicateOperand, validate_subset


class DataFrameDropDuplicates(DuplicateOperand):
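    """Operand behind drop_duplicates for DataFrame, Series and Index inputs."""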
    _op_type_ = opcodes.DROP_DUPLICATES

    # Align the field default with the documented default of the public APIs below.
    ignore_index = BoolField("ignore_index", default=False)

    def __init__(self, output_types=None, **kw):
        super().__init__(_output_types=output_types, **kw)

    @classmethod
    def _get_shape(cls, input_shape, op: "DataFrameDropDuplicates"):
        # The number of rows kept after deduplication is unknown until execution.
        shape = (np.nan,) + input_shape[1:]
        if op.output_types[0] == OutputType.dataframe and len(shape) == 1:
            shape += (3,)
        return shape

    def _gen_tileable_params(self, op: "DataFrameDropDuplicates", input_params):
        params = input_params.copy()
        if op.ignore_index and self._output_types[0] != OutputType.index:
            # The output is relabeled 0, 1, ..., n - 1, but its length is unknown.
            params["index_value"] = parse_index(pd.RangeIndex(-1))
        else:
            params["index_value"] = gen_unknown_index_value(
                input_params["index_value"],
                op.keep,
                op.subset,
                type(op).__name__,
                normalize_range_index=True,
            )
        params["shape"] = self._get_shape(input_params["shape"], op)
        return params

    def __call__(self, inp, inplace=False):
        self._output_types = inp.op.output_types
        params = self._gen_tileable_params(self, inp.params)
        ret = self.new_tileable([inp], kws=[params])
        if inplace:
            inp.data = ret.data
        return ret


def df_drop_duplicates(
    df, subset=None, keep="first", inplace=False, ignore_index=False, method="auto"
):
"""
    Return DataFrame with duplicate rows removed.

    Considering certain columns is optional. Indexes, including time indexes
    are ignored.

    Parameters
    ----------
    subset : column label or sequence of labels, optional
        Only consider certain columns for identifying duplicates, by
        default use all of the columns.
    keep : {'first', 'last', 'any', False}, default 'first'
        Determines which duplicates (if any) to keep.

        - ``first`` : Drop duplicates except for the first occurrence.
        - ``last`` : Drop duplicates except for the last occurrence.
        - ``any`` : Drop duplicates except for a random occurrence.
        - False : Drop all duplicates.
    inplace : bool, default False
        Whether to drop duplicates in place or to return a copy.
    ignore_index : bool, default False
        If True, the resulting axis will be labeled 0, 1, …, n - 1.
    method : {'auto', 'tree', 'subset_tree', 'shuffle', None}, default 'auto'
        Algorithm used to drop the duplicates.

    Returns
    -------
    DataFrame
        DataFrame with duplicates removed or None if ``inplace=True``.
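
    Examples
    --------
    A minimal usage sketch; it assumes the same pandas-style rendering
    shown in the Series examples later in this module.

    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({'a': [1, 1, 2], 'b': [3, 3, 3]})
    >>> df.execute()
       a  b
    0  1  3
    1  1  3
    2  2  3

    By default all columns are considered when identifying duplicates.

    >>> df.drop_duplicates().execute()
       a  b
    0  1  3
    2  2  3

    Use ``subset`` to restrict the comparison to specific columns, and
    ``keep`` to choose which occurrence survives.

    >>> df.drop_duplicates(subset='b', keep='last').execute()
       a  b
    2  2  3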
"""
if keep not in ("first", "last", "any", False):
raise ValueError("keep could only be one of 'first', 'last' or False")
if method not in ("auto", "tree", "subset_tree", "shuffle", None):
raise ValueError(
"method could only be one of "
"'auto', 'tree', 'subset_tree', 'shuffle' or None"
)
subset = validate_subset(df, subset)
op = DataFrameDropDuplicates(
subset=subset, keep=keep, ignore_index=ignore_index, method=method
)
return op(df, inplace=inplace)


def series_drop_duplicates(
    series, keep="first", inplace=False, ignore_index=False, method="auto"
):
"""
Return Series with duplicate values removed.

    Parameters
    ----------
    keep : {'first', 'last', 'any', ``False``}, default 'first'
        Method to handle dropping duplicates:

        - 'first' : Drop duplicates except for the first occurrence.
        - 'last' : Drop duplicates except for the last occurrence.
        - 'any' : Drop duplicates except for a random occurrence.
        - ``False`` : Drop all duplicates.
    inplace : bool, default ``False``
        If ``True``, performs operation inplace and returns None.
    ignore_index : bool, default ``False``
        If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
    method : {'auto', 'tree', 'shuffle', None}, default 'auto'
        Algorithm used to drop the duplicates.

    Returns
    -------
    Series
        Series with duplicates dropped.

    See Also
    --------
    Index.drop_duplicates : Equivalent method on Index.
    DataFrame.drop_duplicates : Equivalent method on DataFrame.
    Series.duplicated : Related method on Series, indicating duplicate
        Series values.

    Examples
    --------
    Generate a Series with duplicated entries.

    >>> import maxframe.dataframe as md
    >>> s = md.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'],
    ...               name='animal')
    >>> s.execute()
    0      lama
    1       cow
    2      lama
    3    beetle
    4      lama
    5     hippo
    Name: animal, dtype: object

    With the 'keep' parameter, the selection behaviour of duplicated values
    can be changed. The value 'first' keeps the first occurrence for each
    set of duplicated entries. The default value of keep is 'first'.

    >>> s.drop_duplicates().execute()
    0      lama
    1       cow
    3    beetle
    5     hippo
    Name: animal, dtype: object

    The value 'last' for parameter 'keep' keeps the last occurrence for
    each set of duplicated entries.

    >>> s.drop_duplicates(keep='last').execute()
    1       cow
    3    beetle
    4      lama
    5     hippo
    Name: animal, dtype: object

    The value ``False`` for parameter 'keep' discards all sets of
    duplicated entries. Setting the value of 'inplace' to ``True`` performs
    the operation inplace and returns ``None``.

    >>> s.drop_duplicates(keep=False, inplace=True)
    >>> s.execute()
    1       cow
    3    beetle
    5     hippo
    Name: animal, dtype: object
"""
if keep not in ("first", "last", "any", False):
raise ValueError("keep could only be one of 'first', 'last' or False")
if method not in ("auto", "tree", "shuffle", None):
raise ValueError(
"method could only be one of 'auto', 'tree', 'shuffle' or None"
)
op = DataFrameDropDuplicates(keep=keep, ignore_index=ignore_index, method=method)
return op(series, inplace=inplace)


def index_drop_duplicates(index, keep="first", method="auto"):
"""
Return Index with duplicate values removed.

    Parameters
    ----------
    keep : {'first', 'last', 'any', ``False``}, default 'first'
        - 'first' : Drop duplicates except for the first occurrence.
        - 'last' : Drop duplicates except for the last occurrence.
        - 'any' : Drop duplicates except for a random occurrence.
        - ``False`` : Drop all duplicates.
    method : {'auto', 'tree', 'shuffle', None}, default 'auto'
        Algorithm used to drop the duplicates.

    Returns
    -------
    deduplicated : Index

    See Also
    --------
    Series.drop_duplicates : Equivalent method on Series.
    DataFrame.drop_duplicates : Equivalent method on DataFrame.
    Index.duplicated : Related method on Index, indicating duplicate
        Index values.

    Examples
    --------
    Generate an Index with duplicate values.

    >>> import maxframe.dataframe as md
    >>> idx = md.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])

    The `keep` parameter controls which duplicate values are removed.
    The value 'first' keeps the first occurrence for each
    set of duplicated entries. The default value of keep is 'first'.

    >>> idx.drop_duplicates(keep='first').execute()
    Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object')

    The value 'last' keeps the last occurrence for each set of duplicated
    entries.

    >>> idx.drop_duplicates(keep='last').execute()
    Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object')

    The value ``False`` discards all sets of duplicated entries.

    >>> idx.drop_duplicates(keep=False).execute()
    Index(['cow', 'beetle', 'hippo'], dtype='object')
"""
if keep not in ("first", "last", "any", False):
raise ValueError("keep could only be one of 'first', 'last' or False")
if method not in ("auto", "tree", "shuffle", None):
raise ValueError(
"method could only be one of 'auto', 'tree', 'shuffle' or None"
)
op = DataFrameDropDuplicates(keep=keep, method=method)
return op(index)