in hardware/xilinx/src/vta.cc [565:751]
void vta(
uint32_t insn_count,
volatile insn_T *insns,
volatile uop_T *uops,
volatile bus_T *inputs,
volatile bus_T *weights,
volatile bus_T *biases,
volatile bus_T *outputs) {
#pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS
#pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
#pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port
#pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port
#pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
#pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port
#pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port
#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
// Instantiate temporary instruction queues (used for peeking)
hls::stream<insn_T> tmp_load_queue;
PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=tmp_load_queue)
hls::stream<insn_T> tmp_gemm_queue;
PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=tmp_gemm_queue)
hls::stream<insn_T> tmp_store_queue;
PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=tmp_store_queue)
// Instatiate physical instruction queues
hls::stream<insn_T> load_queue;
PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=load_queue)
hls::stream<insn_T> gemm_queue;
PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=gemm_queue)
hls::stream<insn_T> store_queue;
PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=store_queue)
// Dependence queues
hls::stream<bool> l2g_dep_queue;
PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=l2g_dep_queue)
hls::stream<bool> s2g_dep_queue;
PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=s2g_dep_queue)
hls::stream<bool> g2l_dep_queue;
PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2l_dep_queue)
hls::stream<bool> g2s_dep_queue;
PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2s_dep_queue)
// Instantiate memories
bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO];
bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO];
bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO];
// Push all instructions into the queues
fetch(insn_count, insns, tmp_load_queue, tmp_gemm_queue, tmp_store_queue);
// Global done indicator
uint32_t done = 0;
// Temporary instructions
insn_T tmp_load;
insn_T tmp_gemv;
insn_T tmp_store;
// Peeking status
bool tmp_load_popped = false;
bool tmp_gemm_popped = false;
bool tmp_store_popped = false;
int exit_counter = 0;
// Main control loop
while (true) {
// First execute as many load instructions as possible
while (!tmp_load_queue.empty() || tmp_load_popped == true) {
// Pop the load instruction
if (!tmp_load_popped) {
tmp_load_queue.read(tmp_load);
tmp_load_popped = true;
}
// Check dependences and invoke the load stage
VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_load);
if ((insn.pop_next_dep && !g2l_dep_queue.empty()) ||
!insn.pop_next_dep) {
// Push the instruction in the load queue
load_queue.write(tmp_load);
tmp_load_popped = false;
load(inputs, weights, load_queue, g2l_dep_queue, l2g_dep_queue, inp_mem, wgt_mem);
} else {
// Execution of load stage pending on completion of other stages, so break here...
break;
}
}
// Next execute as many gemm instructions as possible
while (!tmp_gemm_queue.empty() || tmp_gemm_popped == true) {
// Pop the gemm instruction
if (!tmp_gemm_popped) {
tmp_gemm_queue.read(tmp_gemv);
tmp_gemm_popped = true;
}
// Check dependences and invoke the load stage
VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_gemv);
if (
(insn.pop_prev_dep && !l2g_dep_queue.empty() &&
insn.pop_next_dep && !s2g_dep_queue.empty()) ||
(!insn.pop_prev_dep && insn.pop_next_dep &&
!s2g_dep_queue.empty()) ||
(insn.pop_prev_dep && !l2g_dep_queue.empty() &&
!insn.pop_next_dep) ||
(!insn.pop_prev_dep && !insn.pop_next_dep)
) {
// Push the instruction in the load queue
gemm_queue.write(tmp_gemv);
tmp_gemm_popped = false;
compute(done, uops, biases, gemm_queue, l2g_dep_queue, s2g_dep_queue,
g2l_dep_queue, g2s_dep_queue, inp_mem, wgt_mem, out_mem);
} else {
// Execution of load stage pending on completion of other stages,
// so break here...
break;
}
}
// Finally execute as many store instructions as possible
while (!tmp_store_queue.empty() || tmp_store_popped == true) {
// Pop the load instruction
if (!tmp_store_popped) {
tmp_store_queue.read(tmp_store);
tmp_store_popped = true;
}
// Check dependences and invoke the load stage
VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_store);
if ((insn.pop_prev_dep && !g2s_dep_queue.empty()) ||
!insn.pop_prev_dep) {
// Push the instruction in the load queue
store_queue.write(tmp_store);
tmp_store_popped = false;
store(outputs, store_queue, g2s_dep_queue, s2g_dep_queue, out_mem);
} else {
// Execution of load stage pending on completion of other stages, so break here...
break;
}
}
// Check if we get a signal that we are done
if (done) {
break;
}
exit_counter++;
if (exit_counter > 1000) {
if (tmp_load_popped) {
if (g2l_dep_queue.empty()) {
printf("waiting on g2l\n");
}
}
if (tmp_gemm_popped) {
VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_gemv);
if (l2g_dep_queue.empty() && insn.pop_prev_dep) {
printf("waiting on l2g\n");
}
if (s2g_dep_queue.empty() && insn.pop_next_dep) {
printf("waiting on s2g\n");
}
}
if (tmp_store_popped) {
if (g2s_dep_queue.empty()) {
printf("waiting on g2s\n");
}
}
break;
}
}
// Ensure that the tokens are empty
bool tmp_tok;
int l2g_count = 0;
int s2g_count = 0;
int g2l_count = 0;
int g2s_count = 0;
while (l2g_dep_queue.read_nb(tmp_tok)) {
l2g_count++;
}
while (s2g_dep_queue.read_nb(tmp_tok)) {
s2g_count++;
}
while (g2l_dep_queue.read_nb(tmp_tok)) {
g2l_count++;
}
while (g2s_dep_queue.read_nb(tmp_tok)) {
g2s_count++;
}
assert(l2g_count == 0 && s2g_count == 0 && g2l_count == 0 && g2s_count == 0);
}