in ludwig/data/preprocessing.py [0:0]
def handle_missing_values(dataset_df, feature, preprocessing_parameters):
    """Fill or drop the missing values of one feature column, in place.

    The column ``feature['name']`` of ``dataset_df`` is rewritten (or, for
    the drop-row strategy, rows are removed from ``dataset_df`` itself), so
    the caller's object is modified and nothing is returned.

    Args:
        dataset_df: pandas-style DataFrame holding the column to clean.
        feature: Feature definition dict; ``feature['name']`` selects the
            column and ``feature[TYPE]`` is checked for the mean strategy.
        preprocessing_parameters: Dict with a ``'missing_value_strategy'``
            entry and, for the constant strategy, a ``'fill_value'`` entry.

    Raises:
        ValueError: If the strategy is not recognised, if the mean strategy
            is requested for a non-numerical feature, or if the mode
            strategy is requested for a column with no non-missing values.
    """
    missing_value_strategy = preprocessing_parameters['missing_value_strategy']
    column = feature['name']

    if missing_value_strategy == FILL_WITH_CONST:
        dataset_df[column] = dataset_df[column].fillna(
            preprocessing_parameters['fill_value'],
        )
    elif missing_value_strategy == FILL_WITH_MODE:
        # value_counts() ignores missing values and sorts by frequency, so
        # the first index entry is the most common observed value.
        counts = dataset_df[column].value_counts()
        if counts.empty:
            # Without this guard, index[0] fails with an opaque IndexError.
            raise ValueError(
                'Cannot fill missing values with mode: column '
                '{} has no non-missing values'.format(column),
            )
        dataset_df[column] = dataset_df[column].fillna(counts.index[0])
    elif missing_value_strategy == FILL_WITH_MEAN:
        if feature[TYPE] != NUMERICAL:
            raise ValueError(
                'Filling missing values with mean is supported '
                'only for numerical types',
            )
        dataset_df[column] = dataset_df[column].fillna(
            dataset_df[column].mean(),
        )
    # fillna(method=...) is deprecated in recent pandas; the dedicated
    # bfill()/ffill() methods perform the same propagation.
    elif missing_value_strategy in ('backfill', 'bfill'):
        dataset_df[column] = dataset_df[column].bfill()
    elif missing_value_strategy in ('pad', 'ffill'):
        dataset_df[column] = dataset_df[column].ffill()
    elif missing_value_strategy == DROP_ROW:
        dataset_df.dropna(subset=[column], inplace=True)
    else:
        raise ValueError('Invalid missing value strategy')