in dataloading.py [0:0]
def pca(data, num_dims=None, mapping=None):
    """
    Applies PCA on the specified `data` to reduce its dimensionality to
    `num_dims` dimensions, and returns the reduced data and `mapping`.

    If a `mapping` is specified as input, `num_dims` is ignored and that mapping
    is applied on the input `data`.

    Args:
        data: Either a two-dimensional tensor whose rows are the samples to
            project, or a dict holding such a tensor under the key
            `"features"`.
        num_dims: Number of principal components to keep. Required (and must
            be positive) when `mapping` is None; values larger than the number
            of columns in `data` are clipped to that number.
        mapping: Optional dict with keys `"mean"` and `"projection"`, as
            returned by a previous call, to be re-applied to `data`.

    Returns:
        A tuple `(reduced_data, mapping)`. When `data` is a tensor,
        `reduced_data` is the projected tensor. When `data` is a dict, the
        *same* dict object is returned with its `"features"` entry replaced
        by the projected tensor (i.e., the input dict is modified in place).
        `mapping` is the dict `{"mean": ..., "projection": ...}` that was
        computed or supplied.

    Raises:
        AssertionError: If the inputs fail any of the validation checks below.
    """
    # work on both data tensor and data dict:
    data_dict = False
    if isinstance(data, dict):
        assert "features" in data, "data dict does not have features field"
        data_dict = True
        original_data = data
        data = original_data["features"]
    assert data.dim() == 2, "data tensor must be two-dimensional matrix"

    # compute PCA mapping:
    if mapping is None:
        assert num_dims is not None, "must specify num_dims or mapping"
        # a non-positive value would silently select the wrong columns below
        assert num_dims > 0, "num_dims must be a positive integer"
        mean = torch.mean(data, 0, keepdim=True)
        zero_mean_data = data.sub(mean)
        # unnormalized covariance (scatter) matrix; the missing 1/N scaling
        # changes the eigenvalues but not the eigenvectors used here
        covariance = torch.matmul(zero_mean_data.t(), zero_mean_data)

        # `torch.symeig` has been removed from recent PyTorch releases; prefer
        # `torch.linalg.eigh` and only fall back on versions that lack it.
        linalg = getattr(torch, "linalg", None)
        if linalg is not None and hasattr(linalg, "eigh"):
            _, projection = linalg.eigh(covariance)
        else:
            _, projection = torch.symeig(covariance, eigenvectors=True)

        # eigenvalues come back in ascending order, so the leading principal
        # components are the trailing columns; an explicit start index avoids
        # the `-0:` pitfall of negative slicing
        total_dims = projection.size(1)
        kept_dims = min(num_dims, total_dims)
        projection = projection[:, total_dims - kept_dims:]
        mapping = {"mean": mean, "projection": projection}
    else:
        assert isinstance(mapping, dict), "mapping must be a dict"
        assert "mean" in mapping and "projection" in mapping, "mapping missing keys"
        if num_dims is not None:
            logging.warning("Value of num_dims is ignored when mapping is specified.")

    # apply PCA mapping:
    reduced_data = data.sub(mapping["mean"]).matmul(mapping["projection"])

    # return results:
    if data_dict:
        # the caller's dict is updated in place and handed back
        original_data["features"] = reduced_data
        reduced_data = original_data
    return reduced_data, mapping