in hardware/xilinx/src/vta.cc [327:413]
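// ALU module: executes a micro-coded vector ALU instruction (min, max, add,
// shift-right) over tensors held in the accumulator buffer, writing results
// back to acc_mem and mirroring them into out_mem.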
void alu(
  insn_T insn_raw,
  uop_T uop_mem[VTA_UOP_BUFF_DEPTH],
  bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO],
  bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
  bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
  bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
#pragma HLS INLINE
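  // Reinterpret the raw instruction bits as an ALU instruction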
  VTAAluInsn insn = *((VTAAluInsn *) &insn_raw);
  // Loop offset
  acc_idx_T dst_offset_out = 0;
  inp_idx_T src_offset_out = 0;
  // Outer Loop
  EXE_OUT_LOOP: for (int it_out = 0; it_out < insn.iter_out; it_out++) {
    acc_idx_T dst_offset_in = dst_offset_out;
    inp_idx_T src_offset_in = src_offset_out;
    // Inner Loop
    EXE_IN_LOOP: for (int it_in = 0; it_in < insn.iter_in; it_in++) {
      // Iterate over micro op
      READ_ALU_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) {
#pragma HLS PIPELINE II = 2
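        // An initiation interval of 2 leaves a cycle for the acc_mem
        // read-modify-write that each micro-op performs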
        // Read micro-op fields
        uop_T uop = uop_mem[upc];
        // Decode
        acc_idx_T dst_idx =
            uop.range(VTA_UOP_ALU_0_1, VTA_UOP_ALU_0_0) + dst_offset_in;
        acc_idx_T src_idx =
            uop.range(VTA_UOP_ALU_1_1, VTA_UOP_ALU_1_0) + src_offset_in;
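        // Both indices address tensors in the accumulator buffer: the ALU
        // reads src and dst from acc_mem and overwrites dst in place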
        // Read in src tensor
        acc_T src_tensor[VTA_BATCH][VTA_BLOCK_OUT];
        read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(src_idx, acc_mem, src_tensor);
        // Read in dst tensor
        acc_T dst_tensor[VTA_BATCH][VTA_BLOCK_OUT];
        read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, acc_mem, dst_tensor);
        // Output tensor
        out_T o_tensor[VTA_BATCH][VTA_BLOCK_OUT];
        // Perform ALU op over matrix elements
        for (int i = 0; i < VTA_BATCH; i++) {
          for (int b = 0; b < VTA_BLOCK_OUT; b++) {
            // Read in operands
            acc_T src_0 = dst_tensor[i][b];
            acc_T src_1 = insn.use_imm ? (acc_T) insn.imm : src_tensor[i][b];
            aluop_shr_arg_T shft_by = src_1.range(VTA_SHR_ARG_BIT_WIDTH - 1, 0);
            aluop_mul_arg_T mul_by = src_1.range(VTA_MUL_ARG_BIT_WIDTH - 1, 0);
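            // The low-order bits of src_1 double as the shift/multiply
            // operands; mul_by is not consumed by the opcodes handled below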
            if (insn.alu_opcode == VTA_ALU_OPCODE_MIN || insn.alu_opcode == VTA_ALU_OPCODE_MAX) {
              // Compute Min/Max
              acc_T mix_val = src_0 < src_1 ?
                  (insn.alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) :
                  (insn.alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0);
              dst_tensor[i][b] = mix_val;
              o_tensor[i][b] = (out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0);
            } else if (insn.alu_opcode == VTA_ALU_OPCODE_ADD) {
              // Compute Sum
              acc_T add_val =
                  src_0.range(VTA_ACC_WIDTH - 1, 0) + src_1.range(VTA_ACC_WIDTH - 1, 0);
              dst_tensor[i][b] = add_val;
              o_tensor[i][b] = (out_T) add_val.range(VTA_OUT_WIDTH - 1, 0);
            } else if (insn.alu_opcode == VTA_ALU_OPCODE_SHR) {
              // Compute Shift Right
              acc_T shr_val = src_0 >> shft_by;
              dst_tensor[i][b] = shr_val;
              o_tensor[i][b] = (out_T) shr_val.range(VTA_OUT_WIDTH - 1, 0);
            }
          }
        }
        // Write the results back into accumulator
        write_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, dst_tensor, acc_mem);
        // Write the results back in the output buffer
        write_tensor<bus_T, out_T, acc_idx_T, VTA_BUS_WIDTH, VTA_OUT_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, o_tensor, out_mem);
      }
      // Update offsets
      dst_offset_in += insn.dst_factor_in;
      src_offset_in += insn.src_factor_in;
    }
    // Update offsets
    dst_offset_out += insn.dst_factor_out;
    src_offset_out += insn.src_factor_out;
  }
}