in intermediate_source/forced_alignment_with_torchaudio_tutorial.py [0:0]
def backtrack(trellis, emission, tokens, blank_id=0):
# Note:
# j and t are indices for trellis, which has extra dimensions
# for time and tokens at the beginning.
# When refering to time frame index `T` in trellis,
# the corresponding index in emission is `T-1`.
# Similarly, when refering to token index `J` in trellis,
# the corresponding index in transcript is `J-1`.
j = trellis.size(1) - 1
t_start = torch.argmax(trellis[:, j]).item()
path = []
for t in range(t_start, 0, -1):
# 1. Figure out if the current position was stay or change
# Note (again):
# `emission[J-1]` is the emission at time frame `J` of trellis dimension.
# Score for token staying the same from time frame J-1 to T.
stayed = trellis[t-1, j] + emission[t-1, blank_id]
# Score for token changing from C-1 at T-1 to J at T.
changed = trellis[t-1, j-1] + emission[t-1, tokens[j-1]]
# 2. Store the path with frame-wise probability.
prob = emission[t-1, tokens[j-1] if changed > stayed else 0].exp().item()
# Return token index and time index in non-trellis coordinate.
path.append(Point(j-1, t-1, prob))
# 3. Update the token
if changed > stayed:
j -= 1
if j == 0:
break
else:
raise ValueError('Failed to align')
return path[::-1]