def coalesce_columns()

in sourcecode/scoring/mf_base_scorer.py [0:0]


def coalesce_columns(df: pd.DataFrame, columnPrefix: str) -> pd.DataFrame:
  """Condense all columns named "<columnPrefix>_*" into a single column.

  A column takes part when its label is a string starting with columnPrefix
  followed by an underscore.  Within each row at most one of those columns may
  hold a non-NaN value.  That value (or NaN when every such column is NaN)
  becomes the row's value in the condensed column.  After the values are condensed
  the original (prefixed) columns are dropped.

  Args:
    df: DataFrame containing columns to condense.
    columnPrefix: Prefix used to detect columns to coalesce, and the name for
      the output column.  A pre-existing column with exactly this name is
      overwritten by the condensed values.

  Returns:
    DataFrame with all "<columnPrefix>_*" columns dropped and replaced by a
    single column named columnPrefix.  If no column matches, the input df is
    returned as-is (same object, no copy).

  Raises:
    AssertionError if multiple columns prefixed by columnPrefix have non-NaN values
    for any row.
  """
  # Identify columns to coalesce.  Labels that are not strings cannot carry the
  # prefix, so skip them instead of failing on a missing str.startswith.
  prefix = f"{columnPrefix}_"
  columns = [col for col in df.columns if isinstance(col, str) and col.startswith(prefix)]
  if not columns:
    return df
  candidates = df[columns]

  # Validate that at most one column is set per row.  This is raised explicitly
  # (rather than via an assert statement) so the check still runs under `python -O`.
  setCounts = candidates.notna().sum(axis=1)
  if not (setCounts <= 1).all():
    raise AssertionError("each row should only be in one modeling group")

  # Coalesce results: pick the single non-NaN value in each row, or NaN if none.
  def _get_value(row):
    idx = row.first_valid_index()
    return row[idx] if idx is not None else np.nan

  coalesced = candidates.apply(_get_value, axis=1)
  # Drop old columns and replace with new.  drop() returns a new DataFrame, so the
  # assignment below does not add a column to the caller's frame.
  df = df.drop(columns=columns)
  df[columnPrefix] = coalesced
  return df