in model_inversion.py [0:0]
def fredrikson14_inverter(
        train_data, target_attribute, model, weights=None, one_hot=False,
        **kwargs):
    """
    Implements the model inversion attack of:
        Fredrikson, 2014, Privacy in Pharmacogenetics: An End-to-End Case Study
        of Personalized Warfarin Dosing

    Each candidate value of the target attribute is written into a copy of
    the features, scored with a model dependent measure plus the log marginal
    of that candidate, and the best-scoring candidate is returned.

    Args:
        train_data: Mapping with a "features" tensor, indexed as
            [sample, column], and a "targets" tensor.
        target_attribute: Arguments for `range` selecting the feature columns
            that encode the attacked attribute. Its first element is
            subtracted from the column index to look up the log marginal.
        model: A `models.LeastSquares` or `models.Logistic` instance. The
            type must match exactly; subclasses are rejected.
        weights: Optional weights, one per row of the features. Defaults to
            all ones.
        one_hot: If False, an extra candidate with all target columns set to
            zero is scored using the last entry of the log marginal.
        **kwargs: Accepted and ignored.

    Returns:
        Tensor of candidate indices obtained by an argmax over the stacked
        candidate scores. Index `i` refers to the i-th column of
        `range(*target_attribute)`; when `one_hot` is False the last index
        refers to the all-zeros candidate.

    Raises:
        ValueError: If `model` is not one of the supported model types.
    """
    features = train_data["features"]
    targets = train_data["targets"]
    if weights is None:
        weights = torch.ones(features.size(0))
    log_marginal = compute_log_marginal(
        train_data, target_attribute, weights=weights, one_hot=one_hot)

    # Identity comparison keeps the exact-type dispatch: a subclass of a
    # supported model still ends up in the ValueError branch.
    model_type = type(model)
    if model_type is models.LeastSquares:
        n, d = features.shape
        # Weighted training loss normalised by the degrees of freedom.
        std_var = (weights * model.loss(train_data)).sum().true_divide(n - d)

        def score_fn(data):
            return -0.5 * (weights * model.loss(data)) / std_var
    elif model_type is models.Logistic:
        train_preds = model.predict(features)
        matched = train_preds == targets
        is_zero = targets == 0
        is_one = targets == 1
        # Rows are indexed by the true label, columns by the prediction.
        confusions = torch.tensor([
            [matched[is_zero].sum(), (~matched)[is_zero].sum()],
            [(~matched)[is_one].sum(), matched[is_one].sum()],
        ])
        # Normalise every column, i.e. condition on the predicted label.
        # NOTE(review): a label that is never predicted on train_data gives a
        # zero column sum and therefore NaN entries here - confirm whether
        # that case needs smoothing.
        pi = confusions.true_divide(confusions.sum(dim=0, keepdim=True))

        def score_fn(data):
            preds = model.predict(data["features"])
            # Cast the labels as well as the predictions: float labels are
            # not valid indices and bool labels would be read as a mask.
            return pi[data["targets"].long(), preds.long()].log()
    else:
        raise ValueError("Unknown model type.")

    # For each possible value of the target attribute compute score for the
    # attribute which should be proportional to log pi(y, y') + log p(x), where
    # pi(y, y') is a model dependent performance measure.
    columns = range(*target_attribute)
    tgt_features = features.clone().detach()
    tgt_features[:, columns] = 0.

    def candidate_score(marginal_index):
        # Score whatever candidate is currently written into tgt_features.
        score = score_fn({"features": tgt_features, "targets": targets})
        score += log_marginal[marginal_index]
        return score

    scores = []
    for c in columns:
        tgt_features[:, c] = 1.
        scores.append(candidate_score(c - target_attribute[0]))
        # Reset so that only one target column is active per candidate.
        tgt_features[:, c] = 0.
    # Try all 0s
    if not one_hot:
        scores.append(candidate_score(-1))
    # Make the prediction:
    return torch.argmax(torch.stack(scores, dim=1), dim=1)