int main()

in reInvent18_Developer_Workshop/filter2D/src/host/host.cpp [167:382]


int main(int argc, char** argv)
{
  std::cout << std::endl;    
  std::cout << "Xilinx 2D Filter Example Application\n";
        
  // ---------------------------------------------------------------------------------
  // Parse command line
  // ---------------------------------------------------------------------------------

  CmdLineParser parser;
  parser.addSwitch("--nruns", "-n", "Number of times to image is processed", "1");
  parser.addSwitch("--fpga", "-x", "FPGA binary (xclbin) file to use", "xclbin/fpga.hw.xilinx_aws-vu9p-f1_4ddr-xpr-2pr_4_0.awsxclbin");
  parser.addSwitch("--input", "-i", "Input image file");
  parser.addSwitch("--filter", "-f", "Filter type (0-3)", "0");

  //parse all command line options
  parser.parse(argc, argv);
  string inputImage = parser.value("input");
  string fpgaBinary = parser.value("fpga");
  int    numRuns    = parser.value_to_int("nruns");
  int    coeffs     = parser.value_to_int("filter");

  if (inputImage.size() == 0) {
    std::cout << std::endl;    
    std::cout << "ERROR: input image file must be specified using -i command line switch" << std::endl;
    exit(1);
  }
  if ((coeffs<0) || (coeffs>3)) {
    std::cout << std::endl;    
    std::cout << "ERROR: Supported filter type values are [0:3]" << std::endl;
    exit(1);
  }

  std::cout << std::endl;    
  std::cout << "FPGA binary    : " << fpgaBinary << std::endl;
  std::cout << "Input image    : " << inputImage << std::endl;
  std::cout << "Number of runs : " << numRuns    << std::endl;
  std::cout << "Filter type    : " << coeffs     << std::endl;
  std::cout << std::endl;    
  
  // ---------------------------------------------------------------------------------
  // Load XCLBIN file, create OpenCL context, device and program
  // ---------------------------------------------------------------------------------

  std::cout << "Programming FPGA" << std::endl;
  cl_context        context;
  cl_program        program;
  cl_device_id    device;
  load_xclbin_file(fpgaBinary.c_str(), context, device, program);

  // ---------------------------------------------------------------------------------
  // Read input image and format inputs
  // ---------------------------------------------------------------------------------
  
  // Create filenames for input and ouput images
  std::string srcFileName  = inputImage;
  std::string dstFileName  = inputImage.substr(0, inputImage.size()-4)+"_out.bmp";

  // Read Input image
  IplImage *src, *dst;
  src = cvLoadImage(srcFileName.c_str()); //format is BGR
  if(!src) {
    std::cout << "ERROR: Loading image " << srcFileName << " failed" << std::endl;
    exit(1);
  }
  unsigned width  = src->width;
  unsigned height = src->height;
  unsigned stride = ceil(width/64.0)*64;
  unsigned nbytes = (stride*height);

  // 4k aligned buffers for efficient data transfer to the kernel
  std::vector<uchar, aligned_allocator<uchar>> y_src(nbytes);
  std::vector<uchar, aligned_allocator<uchar>> u_src(nbytes);
  std::vector<uchar, aligned_allocator<uchar>> v_src(nbytes);
  std::vector<uchar, aligned_allocator<uchar>> y_dst(nbytes);
  std::vector<uchar, aligned_allocator<uchar>> u_dst(nbytes);
  std::vector<uchar, aligned_allocator<uchar>> v_dst(nbytes);
  std::vector<short, aligned_allocator<short>> coeff(FILTER2D_KERNEL_V_SIZE*FILTER2D_KERNEL_V_SIZE);

  // Create destination image
  dst = cvCreateImage(cvSize(width, height), src->depth, src->nChannels);

  // Convert CV Image to AXI video data
  IplImage2Raw(src, y_src.data(), stride, u_src.data(), stride, v_src.data(), stride);

  // Copy coefficients to 4k aligned vector
  memcpy(coeff.data() , &filterCoeffs[coeffs][0][0], coeff.size()*sizeof(short) );

  // ---------------------------------------------------------------------------------
  // Make requests to kernel(s) 
  // ---------------------------------------------------------------------------------
  // Note: change the number of kernels in the device, or reorder the sync() methods
  // to see the impact on performance and how requests are scheduled.
  // ---------------------------------------------------------------------------------

  std::cout << std::endl;    
  std::cout << "Running FPGA version" << std::endl;    
  // std::cout << "Image width    : " << width << std::endl;
  // std::cout << "Image height   : " << height << std::endl;
  // std::cout << "Image stride   : " << stride << std::endl;

  // Create a dispatcher of requests to the Blur kernel(s) 
  Filter2DDispatcher Filter(device, context, program);

auto fpga_begin = std::chrono::high_resolution_clock::now();

  Filter2DRequest* request[numRuns*3];
  for(int xx=0; xx<numRuns; xx++) 
  {
    // Make independent requests to Blur Y, U and V planes
    // Requests will run sequentially if there is a single kernel
    // Requests will run in parallel is there are two or more kernels
    request[xx*3+0] = Filter(coeff.data(), y_src.data(), width, height, stride, y_dst.data());
    request[xx*3+1] = Filter(coeff.data(), u_src.data(), width, height, stride, u_dst.data());
    request[xx*3+2] = Filter(coeff.data(), v_src.data(), width, height, stride, v_dst.data());
	//}
	//for(int xx=0; xx<numRuns; xx++) 
	//{
    // Wait for completion of the outstanding requests
    request[xx*3+0]->finish();
    request[xx*3+1]->finish();
    request[xx*3+2]->finish();
  }

auto fpga_end = std::chrono::high_resolution_clock::now();

  // ---------------------------------------------------------------------------------
  // Format output and write image out 
  // ---------------------------------------------------------------------------------

  // Convert processed image back to CV Image
  Raw2IplImage(y_dst.data(), stride, u_dst.data(), stride, v_dst.data(), stride, dst);

  // Convert image to cvMat and write it to disk
  cvConvert( dst, cvCreateMat(height, width, CV_32FC3 ) );
  cvSaveImage(dstFileName.c_str(), dst);

  // ---------------------------------------------------------------------------------
  // Compute reference results and compare 
  // ---------------------------------------------------------------------------------

  std::cout << std::endl;
  std::cout << "Running Software version" << std::endl;

  // Create output buffers for reference results
  std::vector<uchar, aligned_allocator<uchar>> y_ref(nbytes);
  std::vector<uchar, aligned_allocator<uchar>> u_ref(nbytes);
  std::vector<uchar, aligned_allocator<uchar>> v_ref(nbytes);

auto cpu_begin = std::chrono::high_resolution_clock::now();

  #pragma omp parallel for
  for(int xx=0; xx<numRuns; xx++) 
  {
    // Compute reference results
    Filter2D(filterCoeffs[coeffs], y_src.data(), width, height, stride, y_ref.data());
    Filter2D(filterCoeffs[coeffs], u_src.data(), width, height, stride, u_ref.data());
    Filter2D(filterCoeffs[coeffs], v_src.data(), width, height, stride, v_ref.data());
  }

auto cpu_end = std::chrono::high_resolution_clock::now();

  std::string refFileName  = inputImage.substr(0, inputImage.size()-4)+"_ref.bmp";
  Raw2IplImage(y_ref.data(), stride, u_ref.data(), stride, v_ref.data(), stride, dst);
  cvConvert( dst, cvCreateMat(height, width, CV_32FC3 ) );
  cvSaveImage(refFileName.c_str(), dst);

  // Compare results
  bool diff = false;
  for (int y = 0; y < height; y++) {
      for (int x = 0; x < width; x++) {
          if ( y_dst[y*stride+x] != y_ref[y*stride+x] ) diff = true;
          if ( u_dst[y*stride+x] != u_ref[y*stride+x] ) diff = true;
          if ( v_dst[y*stride+x] != v_ref[y*stride+x] ) diff = true;
      }
  }

  std::cout << std::endl;    
  if(diff) {
      std::cout << RED;
      std::cout << "*******************************************************" << std::endl;    
      std::cout << "MATCH FAILED: Output has mismatches with reference"      << std::endl;
      std::cout << "*******************************************************" << std::endl;    
      std::cout << RESET;    
  } else {
      std::cout << GREEN;
      std::cout << "*******************************************************" << std::endl;    
      std::cout << "MATCH PASS: Output matches reference"                    << std::endl;
      std::cout << "*******************************************************" << std::endl;
      std::cout << RESET;    
  }

  // Report performance (if not running in emulation mode)
  if (getenv("XCL_EMULATION_MODE") == NULL) {
      std::chrono::duration<double> fpga_duration = fpga_end - fpga_begin;
      std::cout << "FPGA Time:       " << fpga_duration.count() << " s" << std::endl;
      std::cout << "FPGA Throughput: " 
                << (double) numRuns*3*nbytes / fpga_duration.count() / (1024.0*1024.0)
                << " MB/s" << std::endl;
      std::chrono::duration<double> cpu_duration = cpu_end - cpu_begin;
      std::cout << "CPU Time:        " << cpu_duration.count() << " s" << std::endl;
      std::cout << "CPU Throughput:  " 
                << (double) numRuns*3*nbytes / cpu_duration.count() / (1024.0*1024.0)
                << " MB/s" << std::endl;
      std::cout << "FPGA Speedup:    " << cpu_duration.count() / fpga_duration.count() << " x" << std::endl;
  }

  // Release allocated memory
  cvReleaseImage(&src);
  cvReleaseImage(&dst);
  clReleaseProgram(program);
  clReleaseContext(context);    
  clReleaseDevice(device);

  return (diff?1:0);
}