in optimum/graphcore/ipu_configuration.py [0:0]
def _to_options(self, for_inference: bool = False, compile_only: bool = False) -> poptorch.Options:
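    # ipuHardwareVersion() reports 2 on classic Mk2 systems and 21 on Bow systems.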
    if not compile_only and poptorch.ipuHardwareVersion() not in (2, 21):
        raise RuntimeError("This requires an IPU Mk2 or Bow system to run.")
if self.execute_encoder_on_cpu_for_generation:
raise NotImplementedError("execute_encoder_on_cpu_for_generation is not supported yet.")
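    # Temporarily switch the config into eval/train mode; the original mode is
    # restored just before returning.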
old_mode = self.mode
self.eval() if for_inference else self.train()
opts = Options()
opts.autoRoundNumIPUs(True)
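    # Samples consumed per host step = micro batch size * device iterations
    # * replication factor (* gradient accumulation steps when training).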
opts.replicationFactor(self.inference_replication_factor if for_inference else self.replication_factor)
opts.deviceIterations(self.inference_device_iterations if for_inference else self.device_iterations)
if not for_inference:
# Set gradient accumulation factor
opts.Training.gradientAccumulation(self.gradient_accumulation_steps)
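        # Use mean reduction so gradient magnitudes stay independent of the
        # accumulation and replication factors.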
opts.Training.accumulationAndReplicationReductionType(poptorch.ReductionType.Mean)
# Enable automatic loss scaling
# Note that this is an experimental feature. Note also that it expects
# accumulationAndReplicationReductionType to be set to Mean as above,
# and for accumulation by the optimizer to be done in half precision
# using accum_type=torch.float16 during optimizer instantiation.
        if self.auto_loss_scaling:
opts.Training.setAutomaticLossScaling(True)
    # Select how results are returned from the IPU to the host
    # (inference always returns all results)
output_mode_mapping = {
"all": OutputMode.All,
"sum": OutputMode.Sum,
"final": OutputMode.Final,
"default": OutputMode.Default,
}
training_output_mode = output_mode_mapping.get(self.output_mode, None)
if training_output_mode is None:
supported_output_modes = ", ".join(output_mode_mapping.keys())
raise KeyError(
f"{self.output_mode} is not a valid poptorch.OutputMode, supported output modes: {supported_output_modes}"
)
opts.outputMode(OutputMode.All if for_inference else training_output_mode)
    if self.seed is not None:
        opts.randomSeed(self.seed)
# Enable replicated tensor sharding of optimizer state
# with optimizer state residing either on-chip or in DRAM.
# RTS is only enabled if replication factor is also greater than 1
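    # Each replica then stores 1/replication_factor of the optimizer state
    # and gathers the shards on demand during the optimizer step.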
opts.TensorLocations.setOptimizerLocation(
poptorch.TensorLocationSettings()
# Optimizer state lives on- or off-chip
.useOnChipStorage(not self.optimizer_state_offchip)
# Shard optimizer state between replicas with zero-redundancy
.useReplicatedTensorSharding(self.replicated_tensor_sharding and opts.replication_factor > 1)
)
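    # AutoStage.AutoIncrement assigns a new pipeline stage to each new block
    # encountered, rather than reusing the block's IPU id.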
if for_inference:
opts.setExecutionStrategy(poptorch.ShardedExecution(poptorch.AutoStage.AutoIncrement))
else:
# Use Pipelined Execution
opts.setExecutionStrategy(poptorch.PipelinedExecution(poptorch.AutoStage.AutoIncrement))
# Compile offline (no IPUs required)
if compile_only:
opts.useOfflineIpuTarget()
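    # availableMemoryProportion sets the fraction of each IPU's tile memory
    # that matmuls/convolutions may use for temporaries; lower values trade
    # throughput for memory.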
matmul_proportion = copy.deepcopy(self._matmul_proportion)
if isinstance(matmul_proportion, float):
matmul_proportion = [matmul_proportion] * self._ipus_per_replica
mem_prop = {f"IPU{i}": matmul_proportion[i] for i in range(self._ipus_per_replica)}
opts.setAvailableMemoryProportion(mem_prop)
# Enable caching the compiled executable to disk
if self.executable_cache_dir and self.executable_cache_dir != "disabled":
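        # Later runs that build an identical graph load the cached executable
        # instead of recompiling.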
opts.enableExecutableCaching(self.executable_cache_dir)
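    # Writing initializers to a separate temporary file keeps the serialized
    # ONNX proto under protobuf's 2 GB limit for large models.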
opts._Popart.set("saveInitializersToFile", NamedTemporaryFile().name)
# Enable stochastic rounding (recommended for training with FP16)
opts.Precision.enableStochasticRounding(not for_inference)
# Half precision partials for matmuls and convolutions
if self.enable_half_partials:
opts.Precision.setPartialsType(torch.float16)
    # PopART performance options
# Only stream needed tensors back to host
opts._Popart.set("disableGradAccumulationTensorStreams", True)
# Parallelize optimizer step update across IPUs
opts._Popart.set(
"accumulateOuterFragmentSettings.schedule",
int(popart.AccumulateOuterFragmentSchedule.OverlapMemoryOptimized),
)
opts._Popart.set("accumulateOuterFragmentSettings.excludedVirtualGraphs", ["0"])
# Enable patterns for better throughput and memory reduction
opts._Popart.set("outlineThreshold", 10.0)
opts._Popart.set("subgraphCopyingStrategy", int(popart.SubgraphCopyingStrategy.JustInTime))
opts._Popart.set("scheduleNonWeightUpdateGradientConsumersEarly", True)
opts._Popart.setPatterns(
{"TiedGather": True, "TiedGatherAccumulate": True, "UpdateInplacePrioritiesForIpu": True}
)
# Options for profiling with Popvision
engine_options = {
"opt.useAutoloader": "true",
"target.syncReplicasIndependently": "true",
}
    if for_inference and self.explicit_ir_inference:
        opts._Popart.set("enableExplicitIR", True)
opts._Popart.set("engineOptions", engine_options)
self.mode = old_mode
return opts
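
# Example (illustrative sketch, not part of the module): obtaining
# poptorch.Options from an IPUConfig. The checkpoint name and the direct use
# of the private _to_options method are assumptions for illustration; `model`
# and `optimizer` stand in for a torch.nn.Module and a poptorch optimizer.
#
#     import poptorch
#     from optimum.graphcore import IPUConfig
#
#     ipu_config = IPUConfig.from_pretrained("Graphcore/bert-base-ipu")
#     train_opts = ipu_config._to_options()                    # training options
#     infer_opts = ipu_config._to_options(for_inference=True)  # inference options
#     training_model = poptorch.trainingModel(model, options=train_opts, optimizer=optimizer)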