def _transform()

in src/beanmachine/tutorials/utils/nba.py [0:0]


    def _transform(self) -> pd.DataFrame:
        """Clean the extracted foul-call data and derive the modeling columns.

        Works on a copy of ``self.extracted_data``, so the stored frame is left
        untouched. The steps are:

        1. Parse ``date`` using the ``%Y%m%d`` format and call
           ``self._season_name`` on the working frame. That helper is expected
           to add a ``season`` column in place; its return value is ignored.
        2. Correct known misspellings of team codes in the ``away``, ``home``,
           ``committing_team`` and ``disadvantaged_team`` columns.
        3. Treat a missing ``review_decision`` as ``"INC"``.
        4. Keep only the listed two-player foul ``call_type`` values that
           happened in period ``"Q4"``, and drop rows without a season or
           without either player name.
        5. Add dense integer ids (rank in sorted order of the values that
           remain) for players, call type, season, trailing possessions and
           remaining possessions.
        6. Derive the per-team scores, score difference, trailing flag and
           possession counts.

        Returns:
            pd.DataFrame: De-duplicated frame with a fresh ``RangeIndex`` and
            only the columns listed in ``output_columns`` below.
        """

        def ordinal_ids(values: pd.Series) -> dict:
            # Map every distinct value to its position in sorted order.
            return {value: i for i, value in enumerate(sorted(values.unique()))}

        # Copy the data so we can manipulate it.
        df = self.extracted_data.copy()

        # Ensure the date column is a date object.
        df["date"] = pd.to_datetime(df["date"].values, format="%Y%m%d")

        # Append the season name.
        self._season_name(df)

        # Fix spelling errors. ``Series.replace`` substitutes *values*;
        # ``Series.rename`` with a dict would only relabel the index and leave
        # the misspelled team codes in place.
        team_fixes = {
            "NKY": "NYK",
            "COS": "BOS",
            "SAT": "SAS",
            "CHi": "CHI",
            "LA)": "LAC",
            "AT)": "ATL",
            "ARL": "ATL",
        }
        for column in ("away", "home", "committing_team", "disadvantaged_team"):
            df[column] = df[column].replace(team_fixes)

        # Fill in NaN review_decision values with INC.
        df["review_decision"] = df["review_decision"].fillna("INC")

        # Filter the data for specific foul call_types. These types of fouls
        # generally involve two players. See
        # https://austinrochford.com/posts/2018-02-04-nba-irt-2.html for more
        # info.
        two_player_fouls = [
            "Foul: Personal",
            "Foul: Shooting",
            "Foul: Offensive",
            "Foul: Loose Ball",
            "Foul: Away from Play",
        ]
        # Keep fourth-quarter rows only; the period column is not needed after.
        keep = df["call_type"].isin(two_player_fouls) & (df["period"] == "Q4")
        df = df[keep].drop(columns="period")

        # Only keep records that have a named season value and both players.
        # The explicit copy makes the column assignments below operate on an
        # independent frame rather than on a slice of the filtered one.
        df = df.dropna(
            subset=["season", "committing_player", "disadvantaged_player"]
        ).copy()

        # Keep only the foul descriptor (the word(s) after the ": ").
        df["call_type"] = df["call_type"].str.split(": ").str[1]

        # Create IDs for the players. Both roles share one id space.
        player_ids = ordinal_ids(
            pd.concat([df["committing_player"], df["disadvantaged_player"]])
        )
        df["committing_player_id"] = df["committing_player"].map(player_ids)
        df["disadvantaged_player_id"] = df["disadvantaged_player"].map(player_ids)

        # Create IDs for the foul type and the season.
        df["call_type_id"] = df["call_type"].map(ordinal_ids(df["call_type"]))
        df["season_id"] = df["season"].map(ordinal_ids(df["season"]))

        # New score columns: take the home score when the team is the home
        # team, otherwise the away score.
        df["score_committing"] = (
            df["score_home"]
            .where(df["committing_team"] == df["home"], df["score_away"])
            .astype(int)
        )
        df["score_disadvantaged"] = (
            df["score_home"]
            .where(df["disadvantaged_team"] == df["home"], df["score_away"])
            .astype(int)
        )

        # Round the seconds left in the game.
        df["seconds_left"] = df["seconds_left"].round(0).astype(int)

        # Foul called flag: 1 when the review decision is CC or INC, else 0.
        df["foul_called"] = 1 * df["review_decision"].isin(["CC", "INC"])

        # Trailing flag: 1 when the committing team has the lower score.
        df["trailing_committing"] = (
            df["score_committing"] < df["score_disadvantaged"]
        ).astype(int)

        # Calculate the difference between the teams scores.
        df["score_diff"] = df["score_disadvantaged"] - df["score_committing"]

        # Calculate the trailing possessions needed (score difference in
        # units of three points, rounded up).
        df["trailing_poss"] = np.ceil(df["score_diff"].values / 3).astype(int)
        df["trailing_poss_id"] = df["trailing_poss"].map(
            ordinal_ids(df["trailing_poss"])
        )

        # Remaining possessions: one per started 25-second block.
        df["remaining_poss"] = df["seconds_left"].floordiv(25).add(1).astype(int)
        df["remaining_poss_id"] = df["remaining_poss"].map(
            ordinal_ids(df["remaining_poss"])
        )

        # Keep only a few columns.
        output_columns = [
            "seconds_left",
            "call_type",
            "call_type_id",
            "foul_called",
            "committing_player",
            "committing_player_id",
            "disadvantaged_player",
            "disadvantaged_player_id",
            "score_committing",
            "score_disadvantaged",
            "season",
            "season_id",
            "trailing_committing",
            "score_diff",
            "trailing_poss",
            "trailing_poss_id",
            "remaining_poss",
            "remaining_poss_id",
        ]

        # Drop any duplicates.
        return df[output_columns].drop_duplicates().reset_index(drop=True)