in simulation/decai/simulation/data/featuremapping/feature_index_mapper.py [0:0]
def map(self, training_data, testing_data) -> Tuple[np.ndarray, np.ndarray, Optional[FeatureIndexMapping]]:
    """
    Convert sparse training and testing data into compact dense arrays.

    Only the feature (last-axis) indices that hold at least one non-zero value in the training data are kept.
    They are renumbered `0..k-1` in ascending order of their original index.
    Features that appear only in the testing data are dropped.

    :param training_data: Either a dense `np.ndarray` or a 2D SciPy sparse matrix.
    :param testing_data: Data with the same feature indexing as `training_data`.
        Must be an `np.ndarray` if `training_data` is one.
    :return: `(result_train, result_test, mapping)`.
        If `training_data` is an `np.ndarray`, both inputs are returned unchanged and `mapping` is `None`.
        Otherwise both results are dense arrays with `len(mapping)` columns
        and `mapping[i]` is the original feature index stored in compact column `i`.
    """
    if isinstance(training_data, np.ndarray):
        assert isinstance(testing_data, np.ndarray), \
            f"Testing data must also be an ndarray if the training data is an ndarray. Got: {type(testing_data)}."
        return training_data, testing_data, None

    def nonzero_entries(data):
        """Return aligned `(rows, cols, values)` arrays for the non-zero entries of 2D `data`."""
        if isinstance(data, np.ndarray):
            rows, cols = data.nonzero()
            return rows, cols, data[rows, cols]
        # Read the stored entries directly instead of fancy-indexing the sparse matrix:
        # it is a single pass and works for every SciPy sparse format.
        coo = data.tocoo()
        # Explicitly stored zeros must not count as used features.
        keep = coo.data != 0
        return coo.row[keep], coo.col[keep], coo.data[keep]

    train_rows, train_cols, train_values = nonzero_entries(training_data)
    # `used_features` is sorted and unique; `compact_train_cols` holds, for each entry,
    # the position of its original column within `used_features`.
    used_features, compact_train_cols = np.unique(train_cols, return_inverse=True)
    mapping = used_features.tolist()

    # We want: `result_train = training_data[:, mapping].todense()` but that allocated a large matrix
    # even before calling `todense()`, so the dense result is filled entry by entry instead.
    result_train = np.zeros(training_data.shape[:-1] + (len(mapping),), dtype=training_data.dtype)
    # Accumulate rather than assign so that duplicate stored entries for one cell add up.
    np.add.at(result_train, (train_rows, compact_train_cols), train_values)

    test_rows, test_cols, test_values = nonzero_entries(testing_data)
    result_test = np.zeros(testing_data.shape[:-1] + (len(mapping),), dtype=testing_data.dtype)
    # Ignore test entries whose feature never occurred in the training data.
    known = np.isin(test_cols, used_features)
    # `used_features` is sorted, so a binary search gives each known column its compact index.
    compact_test_cols = np.searchsorted(used_features, test_cols[known])
    np.add.at(result_test, (test_rows[known], compact_test_cols), test_values[known])

    return result_train, result_test, mapping