core/maxframe/dataframe/misc/drop_duplicates.py

# Copyright 1999-2025 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd

from ... import opcodes
from ...serialization.serializables import BoolField
from ..operators import OutputType
from ..utils import gen_unknown_index_value, parse_index
from ._duplicate import DuplicateOperand, validate_subset


class DataFrameDropDuplicates(DuplicateOperand):
    _op_type_ = opcodes.DROP_DUPLICATES

    ignore_index = BoolField("ignore_index", default=True)

    def __init__(self, output_types=None, **kw):
        super().__init__(_output_types=output_types, **kw)

    @classmethod
    def _get_shape(cls, input_shape, op: "DataFrameDropDuplicates"):
        shape = (np.nan,) + input_shape[1:]
        if op.output_types[0] == OutputType.dataframe and len(shape) == 1:
            shape += (3,)
        return shape

    def _gen_tileable_params(self, op: "DataFrameDropDuplicates", input_params):
        params = input_params.copy()
        if op.ignore_index and self._output_types[0] != OutputType.index:
            params["index_value"] = parse_index(pd.RangeIndex(-1))
        else:
            params["index_value"] = gen_unknown_index_value(
                input_params["index_value"],
                op.keep,
                op.subset,
                type(op).__name__,
                normalize_range_index=True,
            )
        params["shape"] = self._get_shape(input_params["shape"], op)
        return params

    def __call__(self, inp, inplace=False):
        self._output_types = inp.op.output_types
        params = self._gen_tileable_params(self, inp.params)
        ret = self.new_tileable([inp], kws=[params])
        if inplace:
            inp.data = ret.data
        return ret


def df_drop_duplicates(
    df, subset=None, keep="first", inplace=False, ignore_index=False, method="auto"
):
    """
    Return DataFrame with duplicate rows removed.

    Considering certain columns is optional. Indexes, including time
    indexes, are ignored.

    Parameters
    ----------
    subset : column label or sequence of labels, optional
        Only consider certain columns for identifying duplicates, by
        default use all of the columns.
    keep : {'first', 'last', 'any', False}, default 'first'
        Determines which duplicates (if any) to keep.

        - ``first`` : Drop duplicates except for the first occurrence.
        - ``last`` : Drop duplicates except for the last occurrence.
        - ``any`` : Drop duplicates except for a random occurrence.
        - False : Drop all duplicates.
    inplace : bool, default False
        Whether to drop duplicates in place or to return a copy.
    ignore_index : bool, default False
        If True, the resulting axis will be labeled 0, 1, …, n - 1.

    Returns
    -------
    DataFrame
        DataFrame with duplicates removed or None if ``inplace=True``.
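
    Examples
    --------
    The examples below follow the Series examples in this module and assume
    ``execute()`` renders the usual pandas repr; the small ``col1``/``col2``
    frame is only illustrative.

    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({'col1': ['A', 'A', 'B', 'B'],
    ...                    'col2': [1, 1, 2, 3]})
    >>> df.execute()
      col1  col2
    0    A     1
    1    A     1
    2    B     2
    3    B     3

    By default, rows that duplicate an earlier row across all columns are
    removed and the first occurrence is kept.

    >>> df.drop_duplicates().execute()
      col1  col2
    0    A     1
    2    B     2
    3    B     3

    Use ``subset`` to consider only some columns and ``keep`` to control
    which occurrence survives.

    >>> df.drop_duplicates(subset=['col1'], keep='last').execute()
      col1  col2
    1    A     1
    3    B     3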
""" if keep not in ("first", "last", "any", False): raise ValueError("keep could only be one of 'first', 'last' or False") if method not in ("auto", "tree", "subset_tree", "shuffle", None): raise ValueError( "method could only be one of " "'auto', 'tree', 'subset_tree', 'shuffle' or None" ) subset = validate_subset(df, subset) op = DataFrameDropDuplicates( subset=subset, keep=keep, ignore_index=ignore_index, method=method ) return op(df, inplace=inplace) def series_drop_duplicates( series, keep="first", inplace=False, ignore_index=False, method="auto" ): """ Return Series with duplicate values removed. Parameters ---------- keep : {'first', 'last', ``False``}, default 'first' Method to handle dropping duplicates: - 'first' : Drop duplicates except for the first occurrence. - 'last' : Drop duplicates except for the last occurrence. - 'any' : Drop duplicates except for a random occurrence. - ``False`` : Drop all duplicates. inplace : bool, default ``False`` If ``True``, performs operation inplace and returns None. Returns ------- Series Series with duplicates dropped. See Also -------- Index.drop_duplicates : Equivalent method on Index. DataFrame.drop_duplicates : Equivalent method on DataFrame. Series.duplicated : Related method on Series, indicating duplicate Series values. Examples -------- Generate a Series with duplicated entries. >>> import maxframe.dataframe as md >>> s = md.Series(['lame', 'cow', 'lame', 'beetle', 'lame', 'hippo'], ... name='animal') >>> s.execute() 0 lame 1 cow 2 lame 3 beetle 4 lame 5 hippo Name: animal, dtype: object With the 'keep' parameter, the selection behaviour of duplicated values can be changed. The value 'first' keeps the first occurrence for each set of duplicated entries. The default value of keep is 'first'. >>> s.drop_duplicates().execute() 0 lame 1 cow 3 beetle 5 hippo Name: animal, dtype: object The value 'last' for parameter 'keep' keeps the last occurrence for each set of duplicated entries. >>> s.drop_duplicates(keep='last').execute() 1 cow 3 beetle 4 lame 5 hippo Name: animal, dtype: object The value ``False`` for parameter 'keep' discards all sets of duplicated entries. Setting the value of 'inplace' to ``True`` performs the operation inplace and returns ``None``. >>> s.drop_duplicates(keep=False, inplace=True) >>> s.execute() 1 cow 3 beetle 5 hippo Name: animal, dtype: object """ if keep not in ("first", "last", "any", False): raise ValueError("keep could only be one of 'first', 'last' or False") if method not in ("auto", "tree", "shuffle", None): raise ValueError( "method could only be one of 'auto', 'tree', 'shuffle' or None" ) op = DataFrameDropDuplicates(keep=keep, ignore_index=ignore_index, method=method) return op(series, inplace=inplace) def index_drop_duplicates(index, keep="first", method="auto"): """ Return Index with duplicate values removed. Parameters ---------- keep : {'first', 'last', ``False``}, default 'first' - 'first' : Drop duplicates except for the first occurrence. - 'last' : Drop duplicates except for the last occurrence. - 'any' : Drop duplicates except for a random occurrence. - ``False`` : Drop all duplicates. Returns ------- deduplicated : Index See Also -------- Series.drop_duplicates : Equivalent method on Series. DataFrame.drop_duplicates : Equivalent method on DataFrame. Index.duplicated : Related method on Index, indicating duplicate Index values. Examples -------- Generate a pandas.Index with duplicate values. 
    >>> import maxframe.dataframe as md
    >>> idx = md.Index(['lame', 'cow', 'lame', 'beetle', 'lame', 'hippo'])

    The `keep` parameter controls which duplicate values are removed.
    The value 'first' keeps the first occurrence for each set of duplicated
    entries. The default value of keep is 'first'.

    >>> idx.drop_duplicates(keep='first').execute()
    Index(['lame', 'cow', 'beetle', 'hippo'], dtype='object')

    The value 'last' keeps the last occurrence for each set of duplicated
    entries.

    >>> idx.drop_duplicates(keep='last').execute()
    Index(['cow', 'beetle', 'lame', 'hippo'], dtype='object')

    The value ``False`` discards all sets of duplicated entries.

    >>> idx.drop_duplicates(keep=False).execute()
    Index(['cow', 'beetle', 'hippo'], dtype='object')
    """
    if keep not in ("first", "last", "any", False):
        raise ValueError("keep could only be one of 'first', 'last', 'any' or False")
    if method not in ("auto", "tree", "shuffle", None):
        raise ValueError(
            "method could only be one of 'auto', 'tree', 'shuffle' or None"
        )

    op = DataFrameDropDuplicates(keep=keep, method=method)
    return op(index)