def sanitize_dataframe_columns_names()

in awswrangler/catalog/_utils.py [0:0]


def sanitize_dataframe_columns_names(df: pd.DataFrame, handle_duplicate_columns: str | None = "warn") -> pd.DataFrame:
    """Sanitize every column name and index name of a DataFrame for Amazon Athena.

    https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html

    Each column name, and each index name that is not ``None``, is passed through
    ``sanitize_column_name``. Possible transformations:
    - Strip accents
    - Remove non alphanumeric characters

    Note
    ----
    The column and index names are assigned back onto the DataFrame that was
    passed in, so the caller's object is modified.

    Sanitized names are not guaranteed to stay unique.
    Example: the columns ["A", "a"] will be sanitized to ["a", "a"]

    Parameters
    ----------
    df
        Original Pandas DataFrame.
    handle_duplicate_columns
        Strategy applied when the sanitized column names contain duplicates.
        Can be "warn" or "drop" or "rename".
        "warn" emits a ``UserWarning`` and keeps the duplicated columns.
        "drop" will drop all but the first duplicated column.
        "rename" will rename all duplicated columns with an incremental number.
        Defaults to "warn". The value is only inspected when duplicates exist.

    Returns
    -------
        Pandas DataFrame with columns names normalized.

    Raises
    ------
    ValueError
        If duplicated column names are found and ``handle_duplicate_columns``
        is not one of "warn", "drop" or "rename".

    Examples
    --------
    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> df_normalized = wr.catalog.sanitize_dataframe_columns_names(df=pd.DataFrame({"A": [1, 2]}))
    >>> df_normalized_drop = wr.catalog.sanitize_dataframe_columns_names(
    ...     df=pd.DataFrame({"A": [1, 2], "a": [3, 4]}), handle_duplicate_columns="drop"
    ... )
    >>> df_normalized_rename = wr.catalog.sanitize_dataframe_columns_names(
    ...     df=pd.DataFrame({"A": [1, 2], "a": [3, 4], "a_1": [4, 6]}), handle_duplicate_columns="rename"
    ... )

    """
    df.columns = [sanitize_column_name(column) for column in df.columns]
    # Unnamed index levels (None) are kept as they are; only real names are sanitized.
    df.index.names = [name if name is None else sanitize_column_name(name) for name in df.index.names]

    has_duplicates = df.columns.duplicated().any()  # type: ignore[attr-defined]
    if not has_duplicates:
        # Nothing to resolve, so the strategy argument is never looked at.
        return df

    if handle_duplicate_columns == "drop":
        return drop_duplicated_columns(df)
    if handle_duplicate_columns == "rename":
        return rename_duplicated_columns(df)
    if handle_duplicate_columns != "warn":
        raise ValueError("handle_duplicate_columns must be one of ['warn', 'drop', 'rename']")

    # "warn": report the clash and hand the frame back with its duplicates intact.
    warnings.warn(
        "Duplicate columns were detected, consider using `handle_duplicate_columns='[drop|rename]'`",
        UserWarning,
    )
    return df