in utils.py [0:0]
def __init__(self, uri_path, balance=True):
    """Load a labelled URI list and optionally balance the two classes.

    Each non-blank line of ``uri_path`` is expected to look like
    ``<uri>,<label>``, where ``<label>`` parses as a float. Rows labelled
    0 are treated as benign and rows labelled 1 as malicious; rows with
    any other label value are dropped.

    Args:
        uri_path: Path of the text file listing ``uri,label`` rows.
        balance: When true, randomly subsample the larger class down to
            the size of the smaller one (uses ``np.random``, so results
            depend on the global NumPy seed). If one class is empty the
            balanced dataset is empty as well.

    Sets:
        self.s3_client: Initialised to ``None``.
            NOTE(review): presumably created lazily elsewhere - confirm.
        self.uris: Array of URIs, malicious entries first, then benign.
        self.labels: ``float32`` array aligned with ``self.uris``
            (1 for malicious, 0 for benign).

    Raises:
        ValueError: If a non-blank line has no comma or its label is not
            a valid float.
    """
    self.s3_client = None

    # get uris and labels to train on
    uris, labels = [], []
    # `with` guarantees the file handle is closed, even on a parse error
    with open(uri_path) as uri_file:
        for line in uri_file:
            # Strip only the line terminator; blindly dropping the last
            # character corrupts a final line that has no trailing newline.
            line = line.rstrip("\r\n")
            if not line.strip():
                continue  # tolerate blank lines (e.g. at end of file)
            # Split on the LAST comma so URIs containing commas still parse.
            uri, label = line.rsplit(",", 1)
            uris.append(uri)
            labels.append(float(label))

    # convert to np arrays for easier indexing
    uris = np.asarray(uris)
    labels = np.asarray(labels)

    # split into benign and malicious uris
    benign_uris = uris[labels == 0]
    malicious_uris = uris[labels == 1]

    # balance the dataset by throwing away samples in the majority class
    if balance:
        if len(benign_uris) > len(malicious_uris):
            benign_idxs = np.random.permutation(len(benign_uris))
            benign_uris = benign_uris[benign_idxs[:len(malicious_uris)]]
        else:
            malicious_idxs = np.random.permutation(len(malicious_uris))
            # Truncate to the size of the *benign* (minority) class.
            malicious_uris = malicious_uris[malicious_idxs[:len(benign_uris)]]

    # finally, stitch everything together
    self.uris = np.concatenate([malicious_uris, benign_uris])
    self.labels = np.concatenate(
        [np.ones(len(malicious_uris)), np.zeros(len(benign_uris))]
    ).astype(np.float32)