in sagemaker/source/preprocessing/preprocessing.py
import pandas as pd
from sklearn.utils import resample

def sample_dataset(config, train_ratio=0.8):
    dataset_df = pd.read_csv(config.fleet_dataset_fn, chunksize=config.chunksize)
    train_writer = DataFrameWriter(filename=config.train_dataset_fn, chunksize=config.processing_chunksize)
    test_writer = DataFrameWriter(filename=config.test_dataset_fn, chunksize=config.processing_chunksize)
    for chunk_idx, chunk_df in enumerate(dataset_df):
        print("Processing Fleet Dataset Chunk {}".format(chunk_idx + 1))
        # Balance each chunk to a 50/50 class mix, then randomly split it
        # into train and test sets at train_ratio.
        # An assumption is being made here that failures occur with lower
        # frequency than non-failures. This may not be true and should be fixed.
        df_neg = chunk_df[chunk_df[config.target_column] == 0]
        df_pos = chunk_df[chunk_df[config.target_column] == 1]
        if len(df_neg) == 0 or len(df_pos) == 0:
            print("Dropping chunk because data is all one class.")
            continue
        # Down-sample negative cases to match the number of positives.
        df_neg = resample(df_neg,
                          replace=False,
                          n_samples=len(df_pos))
        train_pos, train_neg = df_pos.sample(frac=train_ratio), df_neg.sample(frac=train_ratio)
        # sample() preserves the original index, so drop() removes exactly the sampled rows.
        test_pos, test_neg = df_pos.drop(train_pos.index), df_neg.drop(train_neg.index)
        train_writer.append(train_pos)
        train_writer.append(train_neg)
        test_writer.append(test_pos)
        test_writer.append(test_neg)
    # Flush any rows still buffered once all chunks have been processed.
    train_writer.flush_buffer()
    test_writer.flush_buffer()
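
# The function above depends on DataFrameWriter, which is not shown in this
# excerpt. The sketch below is a hypothetical minimal implementation: only the
# constructor arguments and the append()/flush_buffer() names are taken from
# the calling code; the buffering and CSV-writing behavior is an assumption.
# It relies on the pandas import at the top of the module.

class DataFrameWriter:
    def __init__(self, filename, chunksize):
        self.filename = filename
        self.chunksize = chunksize          # rows to buffer before writing
        self.buffer = []                    # DataFrames awaiting a write
        self.rows_buffered = 0
        self.header_written = False

    def append(self, df):
        # Accumulate frames; write out once the buffer reaches chunksize rows.
        self.buffer.append(df)
        self.rows_buffered += len(df)
        if self.rows_buffered >= self.chunksize:
            self.flush_buffer()

    def flush_buffer(self):
        # Write whatever is buffered; emit the CSV header only on the first write.
        if not self.buffer:
            return
        out = pd.concat(self.buffer)
        out.to_csv(self.filename,
                   mode="a" if self.header_written else "w",
                   header=not self.header_written,
                   index=False)
        self.header_written = True
        self.buffer = []
        self.rows_buffered = 0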