def sample_dataset()

in sagemaker/source/preprocessing/preprocessing.py

Reads the fleet dataset from disk in chunks, balances each chunk by down-sampling the negative class to match the positives, and streams the resulting train/test split out through buffered writers.

import pandas as pd
from sklearn.utils import resample

# NOTE: DataFrameWriter is a project-local helper (assumed to be defined in
# this module or imported alongside it); a sketch of the interface this
# function relies on follows the listing.


def sample_dataset(config, train_ratio=0.8):
    # Stream the dataset in chunks so the full fleet CSV never has to fit in memory.
    dataset_df = pd.read_csv(config.fleet_dataset_fn, chunksize=config.chunksize)
    train_writer = DataFrameWriter(filename=config.train_dataset_fn, chunksize=config.processing_chunksize)
    test_writer = DataFrameWriter(filename=config.test_dataset_fn, chunksize=config.processing_chunksize)

    for chunk_idx, chunk_df in enumerate(dataset_df):
        print("Processing Fleet Dataset Chunk {}".format(chunk_idx + 1))
        # Balance the classes within the chunk, then split each class into
        # train/test portions according to train_ratio.

        # Assumption: failures (target == 1) occur with lower frequency, so only
        # the negatives are down-sampled. This may not hold for every chunk and
        # should be fixed.
        df_neg = chunk_df[chunk_df[config.target_column] == 0]
        df_pos = chunk_df[chunk_df[config.target_column] == 1]

        if len(df_neg) == 0 or len(df_pos) == 0:
            print("Dropping chunk because data is all one class.")
            continue

        # Down-sample the negatives to match the number of positives. Capping
        # n_samples at len(df_neg) guards against chunks where positives
        # outnumber negatives; with replace=False, resample cannot draw more
        # samples than the population and would raise a ValueError.
        df_neg = resample(df_neg,
                          replace=False,
                          n_samples=min(len(df_pos), len(df_neg)))

        train_pos, train_neg = df_pos.sample(frac=train_ratio), df_neg.sample(frac=train_ratio)
        test_pos, test_neg = df_pos.drop(train_pos.index), df_neg.drop(train_neg.index)

        train_writer.append(train_pos)
        train_writer.append(train_neg)
        test_writer.append(test_pos)
        test_writer.append(test_neg)

    # Write out whatever is still sitting in the writer buffers.
    train_writer.flush_buffer()
    test_writer.flush_buffer()
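
DataFrameWriter is not defined in this listing. Below is a minimal sketch of the interface sample_dataset relies on (append and flush_buffer), assuming the helper buffers frames and appends them to a CSV once the buffer reaches chunksize rows; the real project implementation may differ.

import pandas as pd


class DataFrameWriter:
    """Buffers DataFrames and appends them to a CSV file in chunks.

    Hypothetical reconstruction: only append() and flush_buffer() are
    exercised by sample_dataset; the actual helper may behave differently.
    """

    def __init__(self, filename, chunksize):
        self.filename = filename
        self.chunksize = chunksize
        self._buffer = []
        self._rows = 0
        self._header_written = False

    def append(self, df):
        # Accumulate frames and spill to disk once enough rows are buffered.
        self._buffer.append(df)
        self._rows += len(df)
        if self._rows >= self.chunksize:
            self.flush_buffer()

    def flush_buffer(self):
        # Write any buffered rows, emitting the CSV header only once.
        if not self._buffer:
            return
        pd.concat(self._buffer).to_csv(self.filename, mode="a",
                                       header=not self._header_written,
                                       index=False)
        self._header_written = True
        self._buffer = []
        self._rows = 0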
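
For reference, a hedged usage sketch. The config attribute names match those the function reads; the file names, chunk sizes, and the "failure" label column are illustrative placeholders, not values taken from the project.

from types import SimpleNamespace

config = SimpleNamespace(
    fleet_dataset_fn="fleet_dataset.csv",   # input CSV, read in chunks
    train_dataset_fn="train_dataset.csv",   # output: balanced training split
    test_dataset_fn="test_dataset.csv",     # output: balanced test split
    chunksize=100_000,                      # rows per read chunk
    processing_chunksize=10_000,            # writer buffer size
    target_column="failure",                # hypothetical binary label column
)

sample_dataset(config, train_ratio=0.8)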