in src/beanmachine/tutorials/utils/nba.py [0:0]
def _transform(self) -> pd.DataFrame:
    """Clean the extracted records and derive the modeling columns.

    Works on a copy of ``self.extracted_data``; the stored frame is not
    modified. The copy has its dates parsed, team abbreviations corrected,
    and missing ``review_decision`` values filled with ``"INC"``. It is then
    restricted to five foul call types in period ``"Q4"`` with non-missing
    season and player names. Integer IDs, per-side scores, and
    possession-based features are added before the column selection.

    Returns:
        A data frame holding only the columns listed in ``keep_columns``
        below, with duplicate rows dropped and a fresh 0..n-1 index.
    """

    def ids_for(values):
        # Map each distinct value to its rank in sorted order (0-based).
        return {value: i for i, value in enumerate(sorted(set(values)))}

    # Copy the data so we can manipulate it.
    df = self.extracted_data.copy()
    # Ensure the date column is a date object.
    df["date"] = pd.to_datetime(df["date"].values, format="%Y%m%d")
    # Append the season name. The helper's return value is ignored, so it is
    # expected to add the "season" column to df in place (it is read below).
    self._season_name(df)
    # Fix spelling errors.
    teams = {
        "NKY": "NYK",
        "COS": "BOS",
        "SAT": "SAS",
        "CHi": "CHI",
        "LA)": "LAC",
        "AT)": "ATL",
        "ARL": "ATL",
    }
    team_columns = ["away", "home", "committing_team", "disadvantaged_team"]
    for column in team_columns:
        # Series.replace substitutes *values*. Series.rename with a mapping
        # only relabels the index, which would leave the misspellings in
        # place and break the team comparisons used for the scores below.
        df[column] = df[column].replace(teams)
    # Fill in NaN review_decision values with INC.
    df["review_decision"] = df["review_decision"].fillna("INC")
    # Filter the data for specific foul call_types and keep only the
    # descriptors (word after the :). These types of fouls generally
    # involve two players. See
    # https://austinrochford.com/posts/2018-02-04-nba-irt-2.html for more
    # info.
    foul_types = [
        "Foul: Personal",
        "Foul: Shooting",
        "Foul: Offensive",
        "Foul: Loose Ball",
        "Foul: Away from Play",
    ]
    # Take an explicit copy so the column assignment below writes to an
    # independent frame rather than to a slice of the previous one.
    df = df[df["call_type"].isin(foul_types)].copy()
    # Element-wise `.str[1]` (instead of expand=True plus a column lookup)
    # also works when no row survived the filter.
    df["call_type"] = df["call_type"].str.split(": ").str[1]
    # Filter the data on fourth quarters only. Then remove that column.
    df = df[df["period"] == "Q4"]
    df = df.drop("period", axis=1)
    # Only keep records that have a named season value.
    df = df.dropna(subset=["season"])
    # Remove any NaN values that may be in the players columns.
    df = df.dropna(subset=["committing_player", "disadvantaged_player"])
    # Create IDs for the players. Both roles share one ID space.
    players = ids_for(
        df["committing_player"].tolist() + df["disadvantaged_player"].tolist()
    )
    df["committing_player_id"] = df["committing_player"].map(players)
    df["disadvantaged_player_id"] = df["disadvantaged_player"].map(players)
    # Create IDs for the foul type.
    df["call_type_id"] = df["call_type"].map(ids_for(df["call_type"]))
    # Create IDs for the season.
    df["season_id"] = df["season"].map(ids_for(df["season"]))
    # New score columns: take the home score when the team in question is
    # the home team, otherwise the away score.
    df["score_committing"] = (
        df["score_home"]
        .where(df["committing_team"] == df["home"], df["score_away"])
        .astype(int)
    )
    df["score_disadvantaged"] = (
        df["score_home"]
        .where(df["disadvantaged_team"] == df["home"], df["score_away"])
        .astype(int)
    )
    # Round the seconds left in the game.
    df["seconds_left"] = df["seconds_left"].round(0).astype(int)
    # Foul called ID: 1 when review_decision is "CC" or "INC", else 0.
    df["foul_called"] = 1 * df["review_decision"].isin(["CC", "INC"])
    # Trailing flag: 1 when the committing team has the lower score.
    df["trailing_committing"] = (
        df["score_committing"] < df["score_disadvantaged"]
    ).astype(int)
    # Calculate the difference between the teams scores.
    df["score_diff"] = df["score_disadvantaged"] - df["score_committing"]
    # Calculate the trailing possessions needed.
    # NOTE(review): 3 is presumably the maximum points per possession -
    # confirm.
    df["trailing_poss"] = np.ceil(df["score_diff"].values / 3).astype(int)
    # Possessions needed ID.
    df["trailing_poss_id"] = df["trailing_poss"].map(ids_for(df["trailing_poss"]))
    # Remaining possessions.
    # NOTE(review): 25 is presumably the assumed seconds per possession -
    # confirm.
    df["remaining_poss"] = df["seconds_left"].floordiv(25).add(1).astype(int)
    # Remaining possessions ID.
    df["remaining_poss_id"] = df["remaining_poss"].map(
        ids_for(df["remaining_poss"])
    )
    # Keep only a few columns.
    keep_columns = [
        "seconds_left",
        "call_type",
        "call_type_id",
        "foul_called",
        "committing_player",
        "committing_player_id",
        "disadvantaged_player",
        "disadvantaged_player_id",
        "score_committing",
        "score_disadvantaged",
        "season",
        "season_id",
        "trailing_committing",
        "score_diff",
        "trailing_poss",
        "trailing_poss_id",
        "remaining_poss",
        "remaining_poss_id",
    ]
    df = df[keep_columns]
    # Drop any duplicates.
    df = df.drop_duplicates().reset_index(drop=True)
    return df