in apps/gemm/src/driver.cc [67:88]
// Runs one launch end to end: allocates three buffers with MemAlloc, copies
// both inputs in with MemCopyFromHost, calls Init/Launch, waits for
// completion, copies the result into out->data and frees the buffers.
//
// All buffer sizes derive from length = inp2->shape[0]:
//   inp1: (inp1->dtype.bits / 8) * length * length bytes are read from inp1->data
//   inp2: (inp2->dtype.bits / 8) * length bytes are read from inp2->data
//   out : 4 * length bytes are written to out->data (32-bit elements)
//
// shiftVal and reset are forwarded unchanged to Launch().
// Returns the value produced by WaitForCompletion() (presumably a cycle
// count, going by the variable name -- confirm against its definition).
uint32_t Run(DLTensor* inp1, DLTensor* inp2, uint32_t shiftVal, DLTensor* out, uint32_t reset) {
  uint32_t length = inp2->shape[0];
  // Size arithmetic is done in size_t so that length * length does not wrap
  // at 32 bits before being widened (e.g. length == 65536 with 1-byte
  // elements would otherwise yield a size of 0).
  const size_t count = length;
  // 1 matrix 1 vector input
  const size_t size1 = static_cast<size_t>(inp1->dtype.bits >> 3) * count * count;
  const size_t size2 = static_cast<size_t>(inp2->dtype.bits >> 3) * count;
  // 1 vector output, fixed at 32 bits per element
  const size_t size3 = static_cast<size_t>(32 >> 3) * count;

  inp1_ = this->MemAlloc(size1);
  inp2_ = this->MemAlloc(size2);
  out_ = this->MemAlloc(size3);

  this->MemCopyFromHost(inp1_, inp1->data, size1);
  this->MemCopyFromHost(inp2_, inp2->data, size2);

  this->Init();
  this->Launch(length, shiftVal, reset);
  uint32_t cycles = this->WaitForCompletion();

  this->MemCopyToHost(out->data, out_, size3);

  this->MemFree(inp1_);
  this->MemFree(inp2_);
  this->MemFree(out_);
  return cycles;
}