In optimum/graphcore/models/mt5/modeling_mt5.py:
def deparallelize(self):
    """
    Undo the changes to the model done by `parallelize`.
    You should call this before doing `save_pretrained` so that the `model.state_dict` is
    fully compatible with `transformers.MT5ForConditionalGeneration`.
    """
    # MT5ForConditionalGeneration has a deparallelize method, so make sure that the PipelineMixin one is used here.
    PipelineMixin.deparallelize(self)
    self.encoder_and_decoder_embeddings_computation(False)

    # Collapse the serialized embedding back into a single nn.Embedding shared
    # by the encoder and decoder.
    if self.shared.__class__ == SerializedEmbedding:
        self.shared = self.shared.to_model()
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared
    self.change_lm_head_to_indexed_input_linear(restore=True)

    # Restore the LM head to a plain nn.Linear, re-tying it to the shared
    # embedding when the config requires tied word embeddings.
    if self.lm_head.__class__ == SerializedLinear:
        self.lm_head = self.lm_head.to_model()
        if self.config.tie_word_embeddings:
            self.tie_weights()
    elif self.lm_head.__class__ == SplitProjection:
        self.lm_head = self.lm_head.to_model()
    # Cast the stacks and blocks back to their stock transformers classes.
    self.encoder.__class__ = MT5Stack
    self.decoder.__class__ = MT5Stack

    for block in self.encoder.block:
        block.__class__ = MT5Block
        # In encoder blocks, layer[0] is self-attention and layer[1] is the
        # feed-forward sub-layer. Unwrap the dropout wrappers installed by
        # `parallelize` and fold their scale back into the output projection.
        block.layer[0].dropout = block.layer[0].dropout.module
        with torch.no_grad():
            block.layer[1].DenseReluDense.wo.weight *= block.layer[1].dropout.scale
        block.layer[1].dropout = block.layer[1].dropout.module
        if self.config.dense_act_fn == "gelu_new":
            block.layer[1].DenseReluDense.act = NewGELUActivation()
    for block in self.decoder.block:
        block.__class__ = MT5Block
        # In decoder blocks, layer[2] is the feed-forward sub-layer (after
        # self-attention and cross-attention).
        if self.config.dense_act_fn == "gelu_new":
            block.layer[2].DenseReluDense.act = NewGELUActivation()

    return self
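
Why the body opens with `PipelineMixin.deparallelize(self)` rather than `super().deparallelize()`: transformers' `MT5ForConditionalGeneration` defines its own (deprecated, model-parallel) `deparallelize`, and it comes first in the method resolution order, so an explicit class reference is needed to reach the mixin's version. A minimal sketch with stand-in classes (not the real ones):

class PipelineMixin:
    def deparallelize(self):
        return "PipelineMixin.deparallelize"

class MT5ForConditionalGeneration:
    def deparallelize(self):
        return "MT5ForConditionalGeneration.deparallelize"

class PipelinedMT5(MT5ForConditionalGeneration, PipelineMixin):
    def deparallelize(self):
        # `super().deparallelize()` would resolve to MT5ForConditionalGeneration's
        # implementation first; calling through the class picks PipelineMixin's.
        return PipelineMixin.deparallelize(self)

assert PipelinedMT5().deparallelize() == "PipelineMixin.deparallelize"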
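
The `__class__` assignments on the stacks and blocks swap each module's behaviour back to the stock transformers implementation without copying any parameters or buffers, since the object and its state are untouched. A minimal illustration of the pattern:

class Block:
    def forward(self):
        return "stock transformers behaviour"

class PipelinedBlock(Block):
    def forward(self):
        return "IPU-specific behaviour"

m = PipelinedBlock()
m.value = 42                      # instance state (think: parameters)
assert m.forward() == "IPU-specific behaviour"
m.__class__ = Block               # same object and state, different methods
assert m.forward() == "stock transformers behaviour"
assert m.value == 42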
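
The `wo.weight *= dropout.scale` line undoes a rescaling that `parallelize` folded out of the feed-forward projection, presumably to keep intermediate activations small on the IPU. Below is a self-contained round-trip sketch of that fold/unfold; `ScaledDropout` is a hypothetical stand-in for whatever wrapper optimum-graphcore installs (anything exposing `.module` and `.scale`):

import torch
import torch.nn as nn

class ScaledDropout(nn.Module):
    """Hypothetical wrapper: dropout followed by a fixed rescale."""
    def __init__(self, module: nn.Dropout, scale: float):
        super().__init__()
        self.module = module
        self.scale = scale

    def forward(self, x):
        return self.module(x) * self.scale

wo = nn.Linear(8, 8, bias=False)
original = wo.weight.detach().clone()

# "parallelize": shrink the projection weight and compensate after dropout.
scale = 4.0
with torch.no_grad():
    wo.weight /= scale
dropout = ScaledDropout(nn.Dropout(0.1), scale)

# "deparallelize": fold the scale back into the weight and unwrap the dropout,
# mirroring the two lines in the encoder loop above.
with torch.no_grad():
    wo.weight *= dropout.scale
dropout = dropout.module

assert torch.allclose(wo.weight, original)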
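
For context, a minimal sketch of the call order the docstring implies. The class name and import path are assumptions inferred from this file's location, and `parallelize` may need additional setup (e.g. an attached IPUConfig) not shown here:

from transformers import MT5ForConditionalGeneration

from optimum.graphcore.models.mt5.modeling_mt5 import PipelinedMT5ForConditionalGeneration

model = PipelinedMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
model.parallelize()        # rewrite modules for IPU pipelining
# ... train or run inference on IPU ...
model.deparallelize()      # the method above: undo every rewrite in place
model.save_pretrained("mt5-checkpoint")

# The saved state_dict now loads into stock transformers with no IPU dependencies.
restored = MT5ForConditionalGeneration.from_pretrained("mt5-checkpoint")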