def _process_dataset()

in src/utils.py [0:0]

18 lines of code
4 McCabe index (conditional complexity)


def _process_dataset(x: pd.DataFrame, y: np.ndarray, normalize: bool):
    """Turn a feature frame and target array into model-ready arrays.

    Every column of ``x`` gets its own pipeline inside a single
    ``ColumnTransformer``, so the output columns keep the order of
    ``x.columns``:

    * text columns (object or pandas string dtype) are ordinal-encoded;
    * all other columns are passed through unchanged.

    When ``normalize`` is true a ``StandardScaler`` step is appended to both
    kinds of pipeline, i.e. the ordinal codes are standardized as well.

    Args:
        x: Feature table. It is not modified; column labels are converted to
            ``str`` on a shallow copy only.
        y: Target values, handed to ``_encode_target`` as-is.
        normalize: Whether to standardize every transformed column.

    Returns:
        A ``(features, target)`` pair: ``features`` is the transformer output
        cast to ``float``; ``target`` is whatever ``_encode_target(y)``
        returns.
    """
    # Relabel on a shallow copy so the caller's DataFrame keeps its original
    # column labels (the underlying data is not duplicated).
    x = x.copy(deep=False)
    x.columns = x.columns.astype(str)

    str_transform = [("ordinalEncoder", OrdinalEncoder())]
    numeric_transform = [("passthrough", "passthrough")]
    if normalize:
        str_transform.append(("StandardScaler", StandardScaler()))
        numeric_transform.append(("StandardScaler", StandardScaler()))

    transformers = []
    for col in x.columns:
        # Inspect the dtype (not the values) so object columns are always
        # treated as text, and dedicated pandas string dtypes are encoded too
        # instead of falling through to the float cast below.
        dtype = x[col].dtype
        is_text = pd.api.types.is_object_dtype(
            dtype
        ) or pd.api.types.is_string_dtype(dtype)
        steps = str_transform if is_text else numeric_transform
        transformers.append((col, Pipeline(steps), [col]))

    # sparse_threshold=0 asks ColumnTransformer for a dense result.
    x = (
        ColumnTransformer(transformers, sparse_threshold=0)
        .fit_transform(x)
        .astype(float)
    )
    y = _encode_target(y)
    return x, y