def convert_data_types()

in cost-optimization/hpa-config-recommender/src/hpaconfigrecommender/plan_workload_simulation.py [0:0]


def convert_data_types(workload_df: pd.DataFrame) -> pd.DataFrame:
    """
    Converts data types of specific columns in a workload DataFrame for
    memory efficiency and optimized performance.

    - Converts 'window_begin' to timezone-naive datetime64[s].
    - Converts 'num_replicas_at_usage_window' to Int16 (nullable).
    - Converts CPU-related columns to float16 for reduced memory usage.
    - Converts memory-related columns to float32.

    Values that cannot be parsed are coerced to NaT / NaN / <NA> instead
    of raising. The conversion is applied in place: the DataFrame passed
    in is modified and the same object is returned.

    Parameters:
        workload_df (pd.DataFrame): The DataFrame containing workload data
                                    with specific columns for conversion.

    Returns:
        pd.DataFrame: The input DataFrame, with columns converted to
                      optimized data types.

    Raises:
        KeyError: If any required column is missing from the input
                  DataFrame. The check runs before any conversion, so the
                  input is left untouched in that case.
    """
    # NOTE: float16 has a maximum finite value of 65504 and roughly three
    # significant decimal digits; larger values become inf after the cast.
    float16_columns = (
        "avg_container_cpu_usage",
        "stddev_containers_cpu_usage",
        "sum_containers_cpu_request",
        "sum_containers_cpu_usage",
    )
    float32_columns = (
        "sum_containers_mem_request_mi",
        "sum_containers_mem_usage_mi",
    )

    # Validate everything up front: the conversions below assign into the
    # caller's frame, so failing midway would leave it half-converted.
    required_columns = (
        "window_begin",
        "num_replicas_at_usage_window",
        *float16_columns,
        *float32_columns,
    )
    missing = [
        col for col in required_columns if col not in workload_df.columns
    ]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Convert to datetime[s] and remove timezone awareness
    workload_df["window_begin"] = (
        pd.to_datetime(workload_df["window_begin"], errors="coerce")
        .dt.tz_localize(None)
        .astype("datetime64[s]")
    )

    # Nullable Int16 so that coerced/missing replica counts become <NA>
    workload_df["num_replicas_at_usage_window"] = pd.to_numeric(
        workload_df["num_replicas_at_usage_window"], errors="coerce"
    ).astype("Int16")

    # Convert metrics to appropriate numeric types
    for columns, dtype in (
        (float16_columns, "float16"),
        (float32_columns, "float32"),
    ):
        for col in columns:
            workload_df[col] = pd.to_numeric(
                workload_df[col], errors="coerce"
            ).astype(dtype)

    return workload_df