in reInvent19_Developer_Workshop/modules/module_01/idct/src/idct.cpp [525:742]
int main(int argc, char* argv[]) {
char *xcl_mode = getenv("XCL_EMULATION_MODE");
int xclbin_argc = -1;
for(int i=0; i<argc; i++) {
std::string arg = argv[i];
std::string xclbinStr = "xclbin";
if(arg.find(xclbinStr) != std::string::npos) {
xclbin_argc = i;
}
}
// *********** Allocate and initialize test vectors **********
// Blocks of 64 of int16_t
size_t blocks = 1024*1024*4;
// Limit blocks for emulation modes
if (xcl_mode != NULL) {
blocks = 1024;
}
bool ignore_dc = true;
// Create input
std::vector<int16_t, aligned_allocator<int16_t>> source_block(64*blocks);
std::vector<uint16_t, aligned_allocator<uint16_t>> source_q(64);
std::vector<int16_t, aligned_allocator<int16_t>> golden_vpout(64*blocks);
std::vector<int16_t, aligned_allocator<int16_t>> result_vpout(64*blocks);
for(size_t i = 0; i < blocks; i++){
for(size_t j = 0; j < 64; j++) {
source_block[i*64 + j] = j;
}
}
for(size_t j = 0; j < 64; j++) {
source_q[j] = j;
}
// *********** Communication Parameters **********
int banks = 1;
const size_t cus = banks;
const size_t threads = cus;
size_t numBlocks64 = 16384;
if (xcl_mode != NULL) {
numBlocks64 = 256;
}
std::cout << "FPGA number of 64*int16_t blocks per transfer: " << numBlocks64 << std::endl;
if(blocks%(threads*numBlocks64) != 0) {
std::cout << "Error: The current implementation supports only full banks to be transfered"
<< " per thread" << std::endl;
exit(1);
}
// *********** OpenCL Host Code Setup **********
// Connect to first platform
int err;
char cl_platform_vendor[1001];
char cl_platform_name[1001];
char cl_device_name[1001];
cl_platform_id platform_id; // platform id
cl_device_id device_id; // compute device id
cl_context context; // compute context
// Get number of platforms
cl_uint platform_count;
clGetPlatformIDs(0, nullptr, &platform_count);
// get all platforms
std::vector<cl_platform_id> platforms(platform_count);
clGetPlatformIDs(platform_count, platforms.data(), nullptr);
bool found = false;
for (int p = 0; p < (int)platform_count; ++p) {
platform_id = platforms[p];
clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
if(!strcmp(cl_platform_vendor,"Xilinx")) {
found = true;
break;
}
}
if (!found){
std::cout << "Platform Not Found\n";
return err;
}
err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ACCELERATOR, 1, &device_id, NULL);
if (err != CL_SUCCESS) {
std::cout << "FAILED TEST - Device\n";
return err;
}
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context || (err != CL_SUCCESS)) {
std::cout << "FAILED TEST - Context \n";
return err;
}
clGetDeviceInfo(device_id, CL_DEVICE_NAME, 1000, (void*)cl_device_name, NULL);
std::cout << "DEVICE: " << cl_device_name << std::endl;
std::string binaryName;
if(xclbin_argc != -1) {
binaryName = argv[xclbin_argc];
} else {
getBinaryName(binaryName, cl_device_name);
}
std::cout << "Loading Bitstream: " << binaryName << std::endl;
char *krnl_bin;
size_t krnl_size;
krnl_size = load_file_to_memory(binaryName.c_str(), &krnl_bin);
printf("INFO: Loaded file\n");
printf("NUM_SCHED: %d\n", NUM_SCHED);
cl_program program = clCreateProgramWithBinary(context, 1,
(const cl_device_id* ) &device_id, &krnl_size,
(const unsigned char**) &krnl_bin,
NULL, &err);
// Create Kernel
std::cout << "Create Kernel: krnl_idct" << std::endl;
cl_kernel krnl = clCreateKernel(program, "krnl_idct", &err);
// Create Command Queue
cl_command_queue q = clCreateCommandQueue(context, device_id,
CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
// Create compute units
std::cout << "Create Compute Unit" << std::endl;
oclDct cu;
cu.init(context, device_id, krnl, q, numBlocks64);
std::cout << "Setup complete" << std::endl;
// *********** Host (CPU) execution **********
std::cout << "Running CPU version" << std::endl;
auto cpu_begin = std::chrono::high_resolution_clock::now();
runCPU(blocks, source_block, source_q, golden_vpout, ignore_dc);
auto cpu_end = std::chrono::high_resolution_clock::now();
// *********** Accelerator execution **********
std::cout << "Running FPGA version" << std::endl;
auto fpga_begin = std::chrono::high_resolution_clock::now();
runFPGA(blocks,
source_block,
source_q,
result_vpout,
q,
ignore_dc,
cu,
numBlocks64);
auto fpga_end = std::chrono::high_resolution_clock::now();
// *********** OpenCL Host Code cleanup **********
clReleaseCommandQueue(q);
clReleaseKernel(krnl);
clReleaseProgram(program);
clReleaseContext(context);
// *********** Comparison (Host to Acceleration) **********
std::cout << "Runs complete validating results" << std::endl;
int krnl_match = 0;
for(size_t i = 0; i < 64*blocks; i++){
if(result_vpout[i] != golden_vpout[i]){
printf("Error: Result mismatch\n");
printf("i = %d CPU result = %d Krnl Result = %d\n",
(int) i, golden_vpout[i], result_vpout[i]);
krnl_match = 1;
break;
}
}
std::cout << "TEST " << (krnl_match ? "FAILED" : "PASSED") << std::endl;
// *********** Computational Statistics **********
//
// Only reported in the HW execution mode as wall clock time is meaningless in
// emulation.
//
if (xcl_mode == NULL) {
std::chrono::duration<double> cpu_duration = cpu_end - cpu_begin;
std::chrono::duration<double> fpga_duration = fpga_end - fpga_begin;
std::cout << "CPU Time: " << cpu_duration.count() << " s" << std::endl;
std::cout << "CPU Throughput: "
<< (double) blocks*128 / cpu_duration.count() / (1024.0*1024.0)
<< " MB/s" << std::endl;
std::cout << "FPGA Time: " << fpga_duration.count() << " s" << std::endl;
std::cout << "FPGA Throughput: "
<< (double) blocks*128 / fpga_duration.count() / (1024.0*1024.0)
<< " MB/s" << std::endl;
} else {
std::cout << "RUN COMPLETE" << std::endl;
}
return (krnl_match ? EXIT_FAILURE : EXIT_SUCCESS);
}