in optimum/graphcore/models/bert/modeling_bert.py
def parallelize(self):
    """
    Transform the model to run in an IPU pipeline.
    - Adds pipeline stages to the model
    - Replaces self-attention layers with fused-qkv self-attention layers
    - (If enabled) Replaces the word embedding projection with a SerializedLinear layer
    - Adds recomputation checkpoints
    """
    super().parallelize()

    # Use faster fused-qkv self-attention
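    # (reassigning __class__ swaps in the fused implementation while keeping
    # each layer's existing weights untouched)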
    for layer in self.bert.encoder.layer:
        layer.attention.self.__class__ = BertFusedSelfAttention
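
    # Split the large vocab projection of the MLM head into
    # `embedding_serialization_factor` smaller matmuls to lower peak memory;
    # the decoder module is replaced, so its weight is re-tied to the word
    # embedding afterwards.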
    if self.ipu_config.embedding_serialization_factor > 1:
        self.cls.predictions.decoder = SerializedLinear.from_model(
            self.cls.predictions.decoder, self.ipu_config.embedding_serialization_factor
        )
        self.tie_weights()

    logger.info("-------------------- Device Allocation --------------------")
    logger.info("Embedding  --> IPU 0")
    self.bert.embeddings = poptorch.BeginBlock(self.bert.embeddings, "Embedding", ipu_id=0)
    # Preventing the embeddings.LayerNorm from being outlined with the encoder.layer.LayerNorm
    # improves the tile mapping of the pipeline stashes.
    hs = outline_attribute(self.bert.embeddings.LayerNorm, "embeddings")
    self._hooks.extend(hs)
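
    # Assign encoder layers to IPUs following the ipu_config (e.g. layers_per_ipu).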
    layer_ipu = get_layer_ipu(self.ipu_config, self.bert.encoder.layer)
    for index, layer in enumerate(self.bert.encoder.layer):
        ipu = layer_ipu[index]
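        # Checkpoint each layer's output so activations are recomputed during
        # the backward pass instead of being stored, trading compute for memory.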
        if self.ipu_config.recompute_checkpoint_every_layer:
            h = recomputation_checkpoint(layer)
            self._hooks.append(h)
        self.bert.encoder.layer[index] = poptorch.BeginBlock(layer, f"Encoder{index}", ipu_id=ipu)
        logger.info(f"Encoder {index:<2} --> IPU {ipu}")
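
    # The pooler and classification heads return to IPU 0, so the MLM decoder
    # can reuse the tied embedding weight without copying it to another IPU.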
    logger.info("Pooler     --> IPU 0")
    self.bert.pooler = poptorch.BeginBlock(self.bert.pooler, "Pooler", ipu_id=0)
    logger.info("Classifier --> IPU 0")
    self.cls = poptorch.BeginBlock(self.cls, "Classifier", ipu_id=0)
    logger.info("-----------------------------------------------------------")

    return self
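
# Illustrative usage sketch, not part of this file: a minimal example assuming
# optimum-graphcore's public API (`IPUConfig`, `to_pipelined`); the config
# values below are invented for illustration.
#
#     from transformers import BertForPreTraining
#     from optimum.graphcore import IPUConfig
#     from optimum.graphcore.modeling_utils import to_pipelined
#
#     model = BertForPreTraining.from_pretrained("bert-base-uncased")
#     ipu_config = IPUConfig(
#         ipus_per_replica=4,
#         layers_per_ipu=[3, 3, 3, 3],
#         embedding_serialization_factor=2,
#         recompute_checkpoint_every_layer=True,
#     )
#     pipelined = to_pipelined(model, ipu_config).parallelize()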