# dataloading.py
def __init__(self, root, train=True, download=False, drop=True):
    """
    Load the IWPC warfarin dataset and expose a fixed 80:20 train/val split.

    root: directory containing (or to receive) the ml-privacy-csf18 repo
        clone; its "code" subdir is added to sys.path and its "data"
        subdir is read by load_iwpc.
    train: if True, expose the 80% training split; otherwise the 20%
        validation split.
    download: if True, clone the dataset repository into `root` first
        (a clone error is treated as "already downloaded").
    drop: If True, remove one feature for each one-hot encoded
        attribute listed below. This helps alleviate perfect colinearity
        amongst the features. (NOTE: `list.index` removes the *first*
        matching column of each group, not the last.)
    """
    if download:
        # download dataloading code and data from repo
        try:
            git.Repo.clone_from("git@github.com:samuel-yeom/ml-privacy-csf18.git", root)
        except git.GitCommandError:
            print("Directory exists and is non-empty, skipping download")
    sys.path.append(os.path.join(root, "code"))
    from main import load_iwpc
    X, y, featnames = load_iwpc(os.path.join(root, "data"))
    # .todense() returns the deprecated np.matrix type, which
    # torch.from_numpy does not reliably accept; np.asarray converts it
    # to a plain 2-D ndarray without copying semantics callers can see.
    X = np.asarray(X.todense())
    if drop:
        attrs = [f.split("=")[0] for f in featnames]
        drop_keys = ["cyp2c9", "race", "vkorc1"]
        # Drop the first column of each listed one-hot group to break
        # perfect colinearity among the encoded features.
        drop_idx = [attrs.index(k) for k in drop_keys]
        X = np.delete(X, drop_idx, axis=1)
        featnames = np.delete(featnames, drop_idx)
    print("Attributes: " + str(featnames))
    X = torch.from_numpy(X).float()
    y = torch.from_numpy(y).float()
    # Fix a random 80:20 train-val split. Save the caller's RNG state and
    # restore it afterwards instead of re-seeding with time.time(): the
    # original manual_seed(time.time()) passed a float and irreversibly
    # clobbered any seed the caller had set.
    rng_state = torch.get_rng_state()
    torch.manual_seed(0)
    perm = torch.randperm(X.size(0))
    n_train = int(0.8 * X.size(0))
    if train:
        self.data = X[perm[:n_train], :]
        self.targets = y[perm[:n_train]]
    else:
        self.data = X[perm[n_train:], :]
        self.targets = y[perm[n_train:]]
    torch.set_rng_state(rng_state)