in codegen_sources/model/src/trainer.py [0:0]
def deobfuscate_by_variable(self, x, y, p, roberta_mode, rng=None):
    """
    Partially deobfuscate class, function and variable names, blocked by
    identifier: each identifier CLASS_N / FUNC_N / VAR_N is selected
    independently with probability ``p``, and when selected *all* of its
    occurrences in the sentence are deobfuscated (e.g. if VAR_0 is picked,
    every occurrence of VAR_0 is replaced by the original name).

    Args:
        x: (slen x bs) batch of obfuscated sentences, i.e. identifier names
            have been replaced by the special tokens CLASS_X, FUNC_X, VAR_X.
            NOTE(review): mutated in place below (obf token ids are negated)
            before being rebuilt — callers should not reuse this tensor.
        y: (ylen x bs) batch of obfuscation dictionaries, e.g.
            'CLASS_0 class_name | VAR_0 variable_name ...'.
        p: probability with which each identifier is deobfuscated.
        roberta_mode: if True, dictionary entries are separated by the BPE
            tokens 'Ġ|' / 'Ġ', and the double 'Ġ Ġ' introduced by the string
            substitution is collapsed back to a single 'Ġ' afterwards.
        rng: optional numpy-style random source (must expose ``.rand`` and
            ``.randint``); falls back to the global ``np.random`` when None.

    Returns:
        (x, len_x, y, len_y): the partially deobfuscated sentences and the
        dictionaries of the identifiers that remain obfuscated, re-batched
        via ``batch_sentences`` — or ``(None, None, None, None)`` when every
        resulting sentence reaches ``params.max_len`` and nothing survives
        the length filter.
    """
    slen, bs = x.size()
    # Negate the ids of all obfuscation special tokens; they occupy the
    # contiguous id range [obf_index["CLASS"], obf_index["CLASS"] + n_obf_tokens).
    # In the space-joined string form built below, a negated id appears as
    # "-<id>", which gives an unambiguous marker for str.replace restoration.
    obf_tokens = (x >= self.data["dico"].obf_index["CLASS"]) * (
        x < (self.data["dico"].obf_index["CLASS"] + self.data["dico"].n_obf_tokens)
    )
    x[obf_tokens] = -x[obf_tokens]
    # Convert each sentence to a string of space-separated token ids,
    # dropping padding and EOS tokens.
    x_ = [
        " ".join(
            [
                str(w)
                for w in s
                if w not in [self.params.pad_index, self.params.eos_index]
            ]
        )
        for s in x.transpose(0, 1).tolist()
    ]
    # Same string form for the dictionaries (sequence of 'TOKEN name' entries
    # joined by the separator token(s)).
    y_ = [
        " ".join(
            [
                str(w)
                for w in s
                if w not in [self.params.pad_index, self.params.eos_index]
            ]
        )
        for s in y.transpose(0, 1).tolist()
    ]
    # Entry separator, expressed as token ids so it can be matched inside the
    # id strings above. In roberta mode the '|' carries a leading 'Ġ' and is
    # followed by a standalone 'Ġ' token.
    if roberta_mode:
        sep = (
            f" {self.data['dico'].word2id['Ġ|']} {self.data['dico'].word2id['Ġ']} "
        )
    else:
        sep = f" {self.data['dico'].word2id['|']} "
    # Parse each dictionary into (obf_token_id, original_name_ids) pairs.
    # Entries are reversed so later (higher-id) obfuscation tokens are
    # replaced first — presumably so that e.g. "-605" is substituted before
    # its string prefix "-60" can corrupt it via str.replace; TODO confirm
    # ids are assigned in increasing CLASS/FUNC/VAR order.
    d = [
        list(
            reversed(
                [
                    (
                        mapping.strip().split()[0],
                        " ".join(mapping.strip().split()[1:]),
                    )
                    for mapping in pred.split(sep)
                ]
            )
        )
        for pred in y_
    ]
    # Restore x: pick each identifier with probability p and deobfuscate all
    # of its occurrences; keep only the still-obfuscated identifiers in d_.
    x = []
    y = []
    for i in range(bs):
        d_ = []
        # dobf_mask[m] == True means identifier m gets deobfuscated.
        if rng:
            dobf_mask = rng.rand(len(d[i])) <= p
        else:
            dobf_mask = np.random.rand(len(d[i])) <= p
        # Make sure at least one identifier stays obfuscated, so the output
        # dictionary for this sentence is never empty.
        if sum(dobf_mask) == len(d[i]):
            if rng:
                dobf_mask[rng.randint(0, len(d[i]))] = False
            else:
                dobf_mask[np.random.randint(0, len(d[i]))] = False
        for m, (k, v) in enumerate(d[i]):
            if dobf_mask[m]:
                # Deobfuscate: replace the "-<id>" marker with the original
                # name's token ids.
                x_[i] = x_[i].replace(f"-{k}", f"{v}")
            else:
                # Keep obfuscated: remember the mapping and strip the minus
                # sign to restore the original obf token id.
                d_.append((k, v))
                x_[i] = x_[i].replace(f"-{k}", f"{k}")
        if roberta_mode:
            # We need to remove the double space introduced during
            # deobfuscation, i.e. the "Ġ Ġ": round-trip ids -> words,
            # collapse "Ġ Ġ" to "Ġ", then map words back to ids.
            sent_ids = np.array(
                [
                    self.data["dico"].word2id[index]
                    for index in (
                        " ".join(
                            [
                                self.data["dico"].id2word[int(w)]
                                for w in x_[i].split()
                            ]
                        ).replace("Ġ Ġ", "Ġ")
                    ).split()
                ]
            )
        else:
            sent_ids = np.array([int(id) for id in x_[i].split()])
        # Drop sentences that grew past max_len after deobfuscation; the
        # matching dictionary is skipped with them so x and y stay aligned.
        if len(sent_ids) < self.params.max_len:
            x.append(sent_ids)
            # Re-serialize the surviving mappings in their original
            # (un-reversed) order.
            d_ids = sep.join([" ".join([k, v]) for k, v in reversed(d_)])
            d_ids = np.array([int(id) for id in d_ids.split()])
            y.append(d_ids)
    if len(x) == 0:
        return None, None, None, None
    x, len_x = batch_sentences(x, self.params.pad_index, self.params.eos_index)
    y, len_y = batch_sentences(y, self.params.pad_index, self.params.eos_index)
    # Sanity check: every negated marker was replaced — no negative ids left.
    assert sum(sum((x < 0).float())) == 0
    return (x, len_x, y, len_y)