in optimum/exporters/onnx/model_configs.py [0:0]
def inputs(self) -> Dict[str, Dict[int, str]]:
    """Return the ONNX input spec for the current export behavior.

    Maps each input name to its dynamic axes ({axis_index: axis_name}).
    The spec depends on which sub-model is being exported: the encoder,
    the decoder, or the postnet+vocoder head.

    Raises:
        ValueError: if the behavior is neither encoder nor decoder and
            ``is_postnet_and_vocoder`` is False.
    """
    # Batched inference is not supported in Transformers.
    if self._behavior is ConfigBehavior.ENCODER:
        return {"input_ids": {1: "encoder_sequence_length"}}

    if self._behavior is ConfigBehavior.DECODER:
        # NOTE: even when past is used, the decoder takes the full sequence as input as the prenet seem to require it:
        # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/speecht5/modeling_speecht5.py#L2573
        decoder_inputs = {
            "output_sequence": {1: "decoder_sequence_length"},
            "speaker_embeddings": {},  # No dynamic shape here.
            "encoder_outputs": {1: "encoder_sequence_length"},
            "encoder_attention_mask": {1: "encoder_sequence_length"},
        }
        # Past key/values are only fed to the decoder in the with-past variant.
        if self.variant == "with-past" and self.use_past_in_inputs:
            self.add_past_key_values(decoder_inputs, direction="inputs")
        return decoder_inputs

    if self.is_postnet_and_vocoder:
        return {"spectrogram": {0: "n_spectrums x reduction_factor"}}

    raise ValueError(
        "self._behavior is neither encoder or decoder, and is_postnet_and_vocoder=False. This should not happen."
    )