in hardware/xilinx/src/vta.cc [115:130]
/*!
 * \brief Pack a 2-D tile of narrow elements into wide words and store them
 *        in row `idx` of `dst`.
 *
 * The tile is walked in row-major order (the second index of `src` varies
 * fastest). Consecutive elements are placed in consecutive NARROW_W-bit
 * lanes of a wide word, lowest lane first; each finished word is written
 * to dst[idx][word].
 *
 * \param idx Row of `dst` that receives the packed words.
 * \param src Source tile, indexed as src[row][col] with col < X_DIM.
 * \param dst Destination buffer; only dst[idx][0 .. words-1] is written.
 *
 * NOTE(review): WIDE_T is assumed to be an ap_uint-like type exposing
 * range(hi, lo) -- confirm against its declaration.
 * NOTE(review): presumably WIDE_W is a multiple of NARROW_W and
 * NARROW_W * Y_DIM * X_DIM is a multiple of WIDE_W; otherwise the
 * truncating divisions below would leave bits or elements unpacked -- verify.
 */
void write_tensor(
    IDX_T idx,
    NARROW_T src[Y_DIM][X_DIM],
    WIDE_T dst[][NARROW_W * Y_DIM * X_DIM / WIDE_W]) {
#pragma HLS INLINE
  // How many narrow elements share a single wide word.
  const int lanes_per_word = WIDE_W / NARROW_W;
  // How many wide words are produced for the whole tile.
  const int word_count = NARROW_W * Y_DIM * X_DIM / WIDE_W;

  for (int word = 0; word < word_count; ++word) {
    // Start from zero so every bit of the stored word is defined.
    WIDE_T packed = 0;
    for (int lane = 0; lane < lanes_per_word; ++lane) {
      // Position of this element in the flattened, row-major tile.
      const int flat = word * lanes_per_word + lane;
      const int row = flat / X_DIM;
      const int col = flat % X_DIM;
      // Lane `lane` occupies bits [lo_bit, lo_bit + NARROW_W - 1].
      const int lo_bit = lane * NARROW_W;
      packed.range(lo_bit + NARROW_W - 1, lo_bit) = src[row][col];
    }
    dst[idx][word] = packed;
  }
}