in hardware/xilinx/src/vta.cc [415:515]
// Compute module (HLS top-level): pops exactly ONE instruction from
// gemm_queue per invocation and executes it (FINISH / LOAD / GEMM / ALU),
// exchanging dependence tokens with the load and store modules so the three
// pipeline stages stay correctly ordered.
//
// done        - AXI-Lite status word; cleared to 0 each call, set to 1 only
//               when a FINISH instruction is decoded.
// uops        - DRAM source pointer for micro-op loads (m_axi uop_port).
// biases      - DRAM source pointer for accumulator loads (m_axi data_port).
// gemm_queue  - instruction stream feeding this module.
// l2g_dep_queue / s2g_dep_queue - incoming dependence tokens from the
//               load and store modules, consumed when the instruction's
//               pop_prev_dep / pop_next_dep bits are set.
// g2l_dep_queue / g2s_dep_queue - outgoing dependence tokens to the load
//               and store modules, produced when push_prev_dep /
//               push_next_dep are set.
// inp_mem / wgt_mem - on-chip input and weight scratchpads (read by the
//               gemm()/alu() cores).
// out_mem     - on-chip output scratchpad (written by the gemm()/alu() cores).
void compute(
volatile uint32_t &done,
volatile uop_T *uops,
volatile bus_T *biases,
hls::stream<insn_T> &gemm_queue,
hls::stream<bool> &l2g_dep_queue,
hls::stream<bool> &s2g_dep_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &g2s_dep_queue,
bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
PRAGMA_HLS(HLS INTERFACE s_axilite port = done bundle = CONTROL_BUS offset = VTA_COMPUTE_DONE_WR_OFFSET)
#pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port
#pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port
#pragma HLS INTERFACE axis port = gemm_queue
#pragma HLS INTERFACE axis port = l2g_dep_queue
#pragma HLS INTERFACE axis port = s2g_dep_queue
#pragma HLS INTERFACE axis port = g2l_dep_queue
#pragma HLS INTERFACE axis port = g2s_dep_queue
#pragma HLS INTERFACE bram port = inp_mem
#pragma HLS INTERFACE bram port = wgt_mem
#pragma HLS INTERFACE bram port = out_mem
#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
#pragma HLS RESOURCE variable = inp_mem core = RAM_1P
#pragma HLS RESOURCE variable = wgt_mem core = RAM_1P
#pragma HLS RESOURCE variable = out_mem core = RAM_1P
// Micro-op storage
// (static: contents persist across invocations, so a LOAD of uops in one
// call remains visible to GEMM/ALU instructions executed in later calls)
static uop_T uop_mem[VTA_UOP_BUFF_DEPTH];
// Accumulator storage (static for the same cross-invocation persistence)
static bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO];
#pragma HLS ARRAY_RESHAPE variable = acc_mem complete dim=2
// This is necessary to obtain II=1
#pragma HLS DEPENDENCE variable = acc_mem inter false
// Pop GEMM instruction
insn_T raw_insn = gemm_queue.read();
// Cast to GenericInsn
VTAInsn insn;
// Local copy gives the reinterpret-cast a plain addressable lvalue whose
// raw bits are viewed as the generic instruction's bit-fields.
insn_T raw_copy = raw_insn;
insn.generic = *((VTAGenericInsn *) &raw_copy);
// Pop dependence token if instructed
// (tokens are consumed BEFORE execution; the matching pushes happen after)
if (insn.generic.pop_prev_dep) {
l2g_dep_queue.read();
}
if (insn.generic.pop_next_dep) {
s2g_dep_queue.read();
}
// Set done value
done = 0;
// Perform action based on opcode
if (insn.generic.opcode == VTA_OPCODE_FINISH) {
// Set done flag if we reach a FINISH instruction
done = 1;
} else if (insn.generic.opcode == VTA_OPCODE_LOAD) {
// Initialize indices
memop_sram_T sram_idx = insn.mem.sram_base;
memop_dram_T dram_idx = insn.mem.dram_base;
// Padded row width and the SRAM offsets contributed by the top (y_pad_0)
// and bottom (y_pad_1) padding rows of the 2D transfer.
memop_sram_T x_width =
(insn.mem.x_pad_0 + insn.mem.x_size + insn.mem.x_pad_1);
memop_sram_T y_offset_0 = x_width * insn.mem.y_pad_0;
memop_sram_T y_offset_1 = x_width * insn.mem.y_pad_1;
if (insn.mem.memory_type == VTA_MEM_ID_UOP) {
// Perform data transfer
// (micro-ops: simple 1-D burst copy, no padding applied)
memcpy(&uop_mem[sram_idx],
(const uop_T*) &uops[dram_idx],
insn.mem.x_size * sizeof(uop_T));
} else if (insn.mem.memory_type == VTA_MEM_ID_ACC) {
// Perform data transfer from DRAM
// (accumulator data: strided 2-D copy with zero-padding on all sides)
load_pad_2d<bus_T, ACC_MAT_AXI_RATIO, VTA_ACC_ELEM_BYTES>(
biases,
acc_mem,
sram_idx,
dram_idx,
insn.mem.y_size,
insn.mem.x_size,
insn.mem.x_stride,
insn.mem.x_pad_0,
insn.mem.x_pad_1,
y_offset_0,
y_offset_1);
}
// NOTE(review): other memory_type values are silently ignored here —
// presumably unreachable for compute-side LOADs; confirm against the ISA.
} else if (insn.generic.opcode == VTA_OPCODE_GEMM) {
// Hand the raw instruction to the GEMM core along with all scratchpads.
gemm(raw_copy, uop_mem, acc_mem, inp_mem, wgt_mem, out_mem);
} else if (insn.generic.opcode == VTA_OPCODE_ALU) {
// Hand the raw instruction to the ALU (tensor op) core.
alu(raw_copy, uop_mem, acc_mem, inp_mem, wgt_mem, out_mem);
}
// Push dependence token if instructed
// (signals downstream/upstream modules that this stage's work is complete)
if (insn.generic.push_prev_dep) {
g2l_dep_queue.write(1);
}
if (insn.generic.push_next_dep) {
g2s_dep_queue.write(1);
}
}