in cost-optimization/hpa-config-recommender/src/hpaconfigrecommender/plan_workload_simulation.py [0:0]
def convert_data_types(workload_df: pd.DataFrame) -> pd.DataFrame:
    """
    Cast the workload columns to compact dtypes, in place.

    Conversions applied:
    - 'window_begin' is parsed as a datetime, stripped of any timezone
      information and cast to datetime64[s].
    - 'num_replicas_at_usage_window' becomes Int16 (nullable).
    - CPU-related columns become float16 to reduce memory usage. Note
      that float16 keeps only about three significant decimal digits
      and overflows to inf above 65504.
    - Memory-related columns become float32.

    Every conversion uses errors="coerce", so values that cannot be
    parsed turn into NaT / NaN / <NA> instead of raising.

    All required columns are checked before anything is modified, so a
    failed call leaves the caller's DataFrame untouched.

    Parameters:
        workload_df (pd.DataFrame): The DataFrame containing workload
            data. It is modified in place.

    Returns:
        pd.DataFrame: The same DataFrame object that was passed in, with
            its columns converted.

    Raises:
        KeyError: If any required column is missing from the input
            DataFrame.
    """
    # Target dtype per metric column. Insertion order is the order in
    # which the columns are converted (CPU metrics, then memory metrics).
    float_dtypes = {
        "avg_container_cpu_usage": "float16",
        "stddev_containers_cpu_usage": "float16",
        "sum_containers_cpu_request": "float16",
        "sum_containers_cpu_usage": "float16",
        "sum_containers_mem_request_mi": "float32",
        "sum_containers_mem_usage_mi": "float32",
    }
    required_columns = [
        "window_begin",
        "num_replicas_at_usage_window",
        *float_dtypes,
    ]

    # Fail before touching the frame: the conversions below assign back
    # into the caller's object, so a late KeyError would otherwise leave
    # it half-converted.
    missing = [
        col for col in required_columns if col not in workload_df.columns
    ]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Convert to datetime[s] and remove timezone awareness
    workload_df["window_begin"] = (
        pd.to_datetime(workload_df["window_begin"], errors="coerce")
        .dt.tz_localize(None)
        .astype("datetime64[s]")
    )

    # Nullable Int16 so coerced / missing replica counts stay as <NA>
    workload_df["num_replicas_at_usage_window"] = pd.to_numeric(
        workload_df["num_replicas_at_usage_window"], errors="coerce"
    ).astype("Int16")

    # Convert metrics to their compact float types
    for col, dtype in float_dtypes.items():
        workload_df[col] = pd.to_numeric(
            workload_df[col], errors="coerce"
        ).astype(dtype)

    return workload_df