# dataloading.py
# Assumes module-level `import os` and `import torch`.
def __init__(self, root, train=True, download=False, mehnaz20=True, drop=True):
"""
mehnaz20: If True, pre-process the data as described in:
Black-box Model Inversion Attribute Inference Attacks on
Classification Models, Mehnaz et al. 2020, https://arxiv.org/abs/2012.03404.
Namely, collapse the marital status attribute (column 5) into a single
binary feature, {Married-civ-spouse, Married-spouse-absent,
Married-AF-spouse} vs {Divorced, Never-married, Separated, Widowed},
and remove the relationship attribute (column 7).
drop: If True, drop the last column of each one-hot encoded
attribute. This helps alleviate perfect collinearity among
the features.
"""
self.root = root
if download:
self.download()
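# UCI Adult column layout: 0 age, 1 workclass, 2 fnlwgt, 3 education,
# 4 education-num, 5 marital-status, 6 occupation, 7 relationship, 8 race,
# 9 sex, 10 capital-gain, 11 capital-loss, 12 hours-per-week,
# 13 native-country, 14 income (target). Columns 0, 2, 4, 10, 11 and 12
# are continuous; the rest are categorical.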
continuous_ids = set([0, 2, 4, 10, 11, 12])
feature_keys = [set() for _ in range(15)]
def load(dataset):
with open(os.path.join(root, dataset)) as fid:
# load data ignoring rows with missing values
lines = (l.strip() for l in fid)
lines = (l.split(",") for l in lines if "?" not in l)
lines = [l for l in lines if len(l) == 15]
return lines
for line in load("adult.data"):
for e, k in enumerate(line):
if e in continuous_ids:
continue
k = k.strip()
feature_keys[e].add(k)
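# Map each observed categorical value to an integer index; sorting keeps the
# mapping deterministic across runs.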
feature_keys = [{k: i for i, k in enumerate(sorted(fk))}
for fk in feature_keys]
self.feature_keys = feature_keys
if mehnaz20:
# Remap marital status to binary feature:
marital_status = feature_keys[5]
for ms in ["Divorced", "Never-married", "Separated", "Widowed"]:
marital_status[ms] = 0
for ms in ["Married-AF-spouse", "Married-civ-spouse", "Married-spouse-absent"]:
marital_status[ms] = 1
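# Convert one split into tensors: integer-encode every row, normalize the
# continuous columns (reusing `mean_stds` when provided), and one-hot encode
# the categorical columns.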
def process(dataset, mean_stds=None):
features = []
targets = []
for line in load(dataset):
example = []
for e, k in enumerate(line):
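# Labels in adult.test carry a trailing period (e.g. ">50K."); stripping it
# keeps the vocabulary lookup consistent across splits.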
k = k.strip().strip(".")
example.append(int(feature_keys[e].get(k, k)))
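# The last column (income) is the prediction target.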
features.append(example[:-1])
targets.append(example[-1])
features = torch.tensor(features, dtype=torch.float)
features = list(features.split(1, dim=1))
targets = torch.tensor(targets)
if mean_stds is None:
mean_stds = {}
for e, feat in enumerate(features):
keys = feature_keys[e]
# Normalize continuous features:
if len(keys) == 0:
if e not in mean_stds:
mean_stds[e] = (torch.mean(feat), torch.std(feat))
mean, std = mean_stds[e]
features[e] = (feat - mean) / std
# One-hot encode non-continuous features:
else:
num_feats = max(keys.values()) + 1
features[e] = torch.nn.functional.one_hot(
feat.squeeze().to(torch.long), num_feats)
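# With drop=True the last one-hot column is removed below; it is redundant
# because it equals 1 minus the sum of the remaining columns of this attribute.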
if drop:
features[e] = features[e][:, :-1]
features[e] = features[e].to(torch.float)
if mehnaz20:
# Remove relationship status:
features.pop(7)
features = torch.cat(features, dim=1)
return features, targets, mean_stds
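# Always process the training split first so that the test split reuses the
# training normalization statistics.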
features, targets, mean_stds = process("adult.data")
if not train:
features, targets, _ = process("adult.test", mean_stds)
self.data = features
self.targets = targets
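# Example usage (a sketch: the enclosing Dataset class is not named in this
# excerpt, so `Adult` below is a placeholder):
#
#     train_set = Adult(root="data/adult", train=True, mehnaz20=True, drop=True)
#     test_set = Adult(root="data/adult", train=False, mehnaz20=True, drop=True)
#     x, y = train_set.data, train_set.targets  # float features, integer labels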