int main()

in NYC_Summit18_Developer_Workshop/workspace/IDCT/src/idct.cpp [462:673]


int main(int argc, char* argv[]) {

  char *xcl_mode = getenv("XCL_EMULATION_MODE");

  if (argc != 2) {
    printf("Usage: %s "
	   "./xclbin/krnl_idct.<emulation_mode>.<dsa>.xclbin\n",
	   argv[0]);
    return EXIT_FAILURE;
  }

  char* binaryName = argv[1];


  // *********** Allocate and initialize test vectors **********

  // Blocks of 64 of int16_t
  size_t blocks = 1024*1024*4;

  // Limit blocks for emulation modes
  if (xcl_mode != NULL) {
    blocks = 1024;
  }

  bool ignore_dc = true;
  
  // Create input
  std::vector<int16_t, aligned_allocator<int16_t>>  source_block(64*blocks);
  std::vector<uint16_t, aligned_allocator<uint16_t>> source_q(64);
  std::vector<int16_t, aligned_allocator<int16_t>>  golden_vpout(64*blocks);
  std::vector<int16_t, aligned_allocator<int16_t>>  result_vpout(64*blocks);

  for(size_t i = 0; i < blocks; i++){
    for(size_t j = 0; j < 64; j++) {
      source_block[i*64 + j] = j;
    }
  }
	
  for(size_t j = 0; j < 64; j++) {
    source_q[j] = j;
  }


  // *********** Communication Parameters **********
  int banks = 1;
  const size_t cus = banks;
  const size_t threads = cus;
  size_t numBlocks64 = 512; 

  if (xcl_mode != NULL) {
    numBlocks64 = 256;
  }

  std::cout << "FPGA number of 64*int16_t blocks per transfer: " << numBlocks64 << std::endl;
  if(blocks%(threads*numBlocks64) != 0) {
    std::cout << "Error: The current implementation supports only full banks to be transfered"
	      << " per thread" << std::endl;
    exit(1);
  }

  // *********** OpenCL Host Code Setup **********

  // Connect to first platform
  int err;
  char cl_platform_vendor[1001];
  char cl_platform_name[1001];
  char cl_device_name[1001];

  cl_platform_id platform_id;         // platform id
  cl_device_id device_id;             // compute device id
  cl_context context;                 // compute context

  // Get number of platforms
  cl_uint platform_count;
  clGetPlatformIDs(0, nullptr, &platform_count);

  // get all platforms
  std::vector<cl_platform_id> platforms(platform_count);
  clGetPlatformIDs(platform_count, platforms.data(), nullptr);

  bool found = false;
  for (int p = 0; p < (int)platform_count; ++p) {  
    platform_id = platforms[p];
    clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
    clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
    if(!strcmp(cl_platform_vendor,"Xilinx")) {
      found = true;
      break;
    }
  }
  if (!found){
    std::cout << "Platform Not Found\n";
    return err;
  }

  err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ACCELERATOR, 1, &device_id, NULL);
  if (err != CL_SUCCESS) {
    std::cout << "FAILED TEST - Device\n";
    return err;
  }
  
  context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  if (!context || (err != CL_SUCCESS)) {
    std::cout << "FAILED TEST - Context \n";
    return err;
  }
  
  clGetDeviceInfo(device_id, CL_DEVICE_NAME, 1000, (void*)cl_device_name, NULL);

  std::cout << "DEVICE: " << cl_device_name << std::endl;

  std::cout << "Loading Bitstream: " << binaryName << std::endl; 
  char *krnl_bin;
  size_t krnl_size;
  krnl_size = load_file_to_memory(binaryName, &krnl_bin);

  printf("INFO: Loaded file\n");

  cl_program program = clCreateProgramWithBinary(context, 1,
						 (const cl_device_id* ) &device_id, &krnl_size,
						 (const unsigned char**) &krnl_bin,
						 NULL, &err);


  // Create Kernel
  std::cout << "Create Kernel: krnl_idct" << std::endl;
  cl_kernel krnl = clCreateKernel(program, "krnl_idct", &err);

  // Create Command Queue
  cl_command_queue q = clCreateCommandQueue(context, device_id, 
					    CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);

  // Create compute units
  std::cout << "Create Compute Unit" << std::endl;
  oclDct cu;
  cu.init(context, device_id, krnl, q, numBlocks64);

  std::cout << "Setup complete" << std::endl;


  // *********** Host (CPU) execution **********
  std::cout << "Running CPU version" << std::endl;
  auto cpu_begin = std::chrono::high_resolution_clock::now();
  runCPU(blocks, source_block, source_q, golden_vpout, ignore_dc);
  auto cpu_end = std::chrono::high_resolution_clock::now();
  

  // *********** Accelerator execution **********
  std::cout << "Running FPGA version" << std::endl;
  auto fpga_begin = std::chrono::high_resolution_clock::now();
  runFPGA(blocks, 
	  source_block, 
	  source_q, 
	  result_vpout, 
	  q,
	  ignore_dc, 
 	  cu, 
	  numBlocks64);
  auto fpga_end = std::chrono::high_resolution_clock::now();


  // *********** OpenCL Host Code cleanup **********

  clReleaseCommandQueue(q);
  clReleaseKernel(krnl);
  clReleaseProgram(program);
  clReleaseContext(context);


  // *********** Comparison (Host to Acceleration)  **********

  std::cout << "Runs complete validating results" << std::endl;

  int krnl_match = 0;
  for(size_t i = 0; i < 64*blocks; i++){
    if(result_vpout[i] != golden_vpout[i]){
      printf("Error: Result mismatch\n");
      printf("i = %d CPU result = %d Krnl Result = %d\n", 
	     (int) i, golden_vpout[i], result_vpout[i]);
      krnl_match = 1;
      break;
    } 
  }

  std::cout << "TEST " << (krnl_match ? "FAILED" : "PASSED") << std::endl;

  // *********** Computational Statistics  **********
  //
  // Only reported in the HW execution mode as wall clock time is meaningless in
  // emulation.
  //
  if (xcl_mode == NULL) {
    std::chrono::duration<double> cpu_duration = cpu_end - cpu_begin;
    std::chrono::duration<double> fpga_duration = fpga_end - fpga_begin;

    std::cout << "CPU Time:        " << cpu_duration.count() << " s" << std::endl;
    std::cout << "CPU Throughput:  " 
	      << (double) blocks*128 / cpu_duration.count() / (1024.0*1024.0)
	      << " MB/s" << std::endl;
    std::cout << "FPGA Time:       " << fpga_duration.count() << " s" << std::endl;
    std::cout << "FPGA Throughput: " 
	      << (double) blocks*128 / fpga_duration.count() / (1024.0*1024.0)
	      << " MB/s" << std::endl;
    std::cout << "FPGA PCIe Throughput: " 
	      << (2*(double) blocks*128 + 128) / fpga_duration.count() / (1024.0*1024.0)
	      << " MB/s" << std::endl;
  } else {
    std::cout << "RUN COMPLETE" << std::endl;
  }

  return (krnl_match ? EXIT_FAILURE :  EXIT_SUCCESS);
}