in reInvent18_Developer_Workshop/filter2D/src/host/host.cpp [167:382]
int main(int argc, char** argv)
{
std::cout << std::endl;
std::cout << "Xilinx 2D Filter Example Application\n";
// ---------------------------------------------------------------------------------
// Parse command line
// ---------------------------------------------------------------------------------
CmdLineParser parser;
parser.addSwitch("--nruns", "-n", "Number of times to image is processed", "1");
parser.addSwitch("--fpga", "-x", "FPGA binary (xclbin) file to use", "xclbin/fpga.hw.xilinx_aws-vu9p-f1_4ddr-xpr-2pr_4_0.awsxclbin");
parser.addSwitch("--input", "-i", "Input image file");
parser.addSwitch("--filter", "-f", "Filter type (0-3)", "0");
//parse all command line options
parser.parse(argc, argv);
string inputImage = parser.value("input");
string fpgaBinary = parser.value("fpga");
int numRuns = parser.value_to_int("nruns");
int coeffs = parser.value_to_int("filter");
if (inputImage.size() == 0) {
std::cout << std::endl;
std::cout << "ERROR: input image file must be specified using -i command line switch" << std::endl;
exit(1);
}
if ((coeffs<0) || (coeffs>3)) {
std::cout << std::endl;
std::cout << "ERROR: Supported filter type values are [0:3]" << std::endl;
exit(1);
}
std::cout << std::endl;
std::cout << "FPGA binary : " << fpgaBinary << std::endl;
std::cout << "Input image : " << inputImage << std::endl;
std::cout << "Number of runs : " << numRuns << std::endl;
std::cout << "Filter type : " << coeffs << std::endl;
std::cout << std::endl;
// ---------------------------------------------------------------------------------
// Load XCLBIN file, create OpenCL context, device and program
// ---------------------------------------------------------------------------------
std::cout << "Programming FPGA" << std::endl;
cl_context context;
cl_program program;
cl_device_id device;
load_xclbin_file(fpgaBinary.c_str(), context, device, program);
// ---------------------------------------------------------------------------------
// Read input image and format inputs
// ---------------------------------------------------------------------------------
// Create filenames for input and ouput images
std::string srcFileName = inputImage;
std::string dstFileName = inputImage.substr(0, inputImage.size()-4)+"_out.bmp";
// Read Input image
IplImage *src, *dst;
src = cvLoadImage(srcFileName.c_str()); //format is BGR
if(!src) {
std::cout << "ERROR: Loading image " << srcFileName << " failed" << std::endl;
exit(1);
}
unsigned width = src->width;
unsigned height = src->height;
unsigned stride = ceil(width/64.0)*64;
unsigned nbytes = (stride*height);
// 4k aligned buffers for efficient data transfer to the kernel
std::vector<uchar, aligned_allocator<uchar>> y_src(nbytes);
std::vector<uchar, aligned_allocator<uchar>> u_src(nbytes);
std::vector<uchar, aligned_allocator<uchar>> v_src(nbytes);
std::vector<uchar, aligned_allocator<uchar>> y_dst(nbytes);
std::vector<uchar, aligned_allocator<uchar>> u_dst(nbytes);
std::vector<uchar, aligned_allocator<uchar>> v_dst(nbytes);
std::vector<short, aligned_allocator<short>> coeff(FILTER2D_KERNEL_V_SIZE*FILTER2D_KERNEL_V_SIZE);
// Create destination image
dst = cvCreateImage(cvSize(width, height), src->depth, src->nChannels);
// Convert CV Image to AXI video data
IplImage2Raw(src, y_src.data(), stride, u_src.data(), stride, v_src.data(), stride);
// Copy coefficients to 4k aligned vector
memcpy(coeff.data() , &filterCoeffs[coeffs][0][0], coeff.size()*sizeof(short) );
// ---------------------------------------------------------------------------------
// Make requests to kernel(s)
// ---------------------------------------------------------------------------------
// Note: change the number of kernels in the device, or reorder the sync() methods
// to see the impact on performance and how requests are scheduled.
// ---------------------------------------------------------------------------------
std::cout << std::endl;
std::cout << "Running FPGA version" << std::endl;
// std::cout << "Image width : " << width << std::endl;
// std::cout << "Image height : " << height << std::endl;
// std::cout << "Image stride : " << stride << std::endl;
// Create a dispatcher of requests to the Blur kernel(s)
Filter2DDispatcher Filter(device, context, program);
auto fpga_begin = std::chrono::high_resolution_clock::now();
Filter2DRequest* request[numRuns*3];
for(int xx=0; xx<numRuns; xx++)
{
// Make independent requests to Blur Y, U and V planes
// Requests will run sequentially if there is a single kernel
// Requests will run in parallel is there are two or more kernels
request[xx*3+0] = Filter(coeff.data(), y_src.data(), width, height, stride, y_dst.data());
request[xx*3+1] = Filter(coeff.data(), u_src.data(), width, height, stride, u_dst.data());
request[xx*3+2] = Filter(coeff.data(), v_src.data(), width, height, stride, v_dst.data());
//}
//for(int xx=0; xx<numRuns; xx++)
//{
// Wait for completion of the outstanding requests
request[xx*3+0]->finish();
request[xx*3+1]->finish();
request[xx*3+2]->finish();
}
auto fpga_end = std::chrono::high_resolution_clock::now();
// ---------------------------------------------------------------------------------
// Format output and write image out
// ---------------------------------------------------------------------------------
// Convert processed image back to CV Image
Raw2IplImage(y_dst.data(), stride, u_dst.data(), stride, v_dst.data(), stride, dst);
// Convert image to cvMat and write it to disk
cvConvert( dst, cvCreateMat(height, width, CV_32FC3 ) );
cvSaveImage(dstFileName.c_str(), dst);
// ---------------------------------------------------------------------------------
// Compute reference results and compare
// ---------------------------------------------------------------------------------
std::cout << std::endl;
std::cout << "Running Software version" << std::endl;
// Create output buffers for reference results
std::vector<uchar, aligned_allocator<uchar>> y_ref(nbytes);
std::vector<uchar, aligned_allocator<uchar>> u_ref(nbytes);
std::vector<uchar, aligned_allocator<uchar>> v_ref(nbytes);
auto cpu_begin = std::chrono::high_resolution_clock::now();
#pragma omp parallel for
for(int xx=0; xx<numRuns; xx++)
{
// Compute reference results
Filter2D(filterCoeffs[coeffs], y_src.data(), width, height, stride, y_ref.data());
Filter2D(filterCoeffs[coeffs], u_src.data(), width, height, stride, u_ref.data());
Filter2D(filterCoeffs[coeffs], v_src.data(), width, height, stride, v_ref.data());
}
auto cpu_end = std::chrono::high_resolution_clock::now();
std::string refFileName = inputImage.substr(0, inputImage.size()-4)+"_ref.bmp";
Raw2IplImage(y_ref.data(), stride, u_ref.data(), stride, v_ref.data(), stride, dst);
cvConvert( dst, cvCreateMat(height, width, CV_32FC3 ) );
cvSaveImage(refFileName.c_str(), dst);
// Compare results
bool diff = false;
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
if ( y_dst[y*stride+x] != y_ref[y*stride+x] ) diff = true;
if ( u_dst[y*stride+x] != u_ref[y*stride+x] ) diff = true;
if ( v_dst[y*stride+x] != v_ref[y*stride+x] ) diff = true;
}
}
std::cout << std::endl;
if(diff) {
std::cout << RED;
std::cout << "*******************************************************" << std::endl;
std::cout << "MATCH FAILED: Output has mismatches with reference" << std::endl;
std::cout << "*******************************************************" << std::endl;
std::cout << RESET;
} else {
std::cout << GREEN;
std::cout << "*******************************************************" << std::endl;
std::cout << "MATCH PASS: Output matches reference" << std::endl;
std::cout << "*******************************************************" << std::endl;
std::cout << RESET;
}
// Report performance (if not running in emulation mode)
if (getenv("XCL_EMULATION_MODE") == NULL) {
std::chrono::duration<double> fpga_duration = fpga_end - fpga_begin;
std::cout << "FPGA Time: " << fpga_duration.count() << " s" << std::endl;
std::cout << "FPGA Throughput: "
<< (double) numRuns*3*nbytes / fpga_duration.count() / (1024.0*1024.0)
<< " MB/s" << std::endl;
std::chrono::duration<double> cpu_duration = cpu_end - cpu_begin;
std::cout << "CPU Time: " << cpu_duration.count() << " s" << std::endl;
std::cout << "CPU Throughput: "
<< (double) numRuns*3*nbytes / cpu_duration.count() / (1024.0*1024.0)
<< " MB/s" << std::endl;
std::cout << "FPGA Speedup: " << cpu_duration.count() / fpga_duration.count() << " x" << std::endl;
}
// Release allocated memory
cvReleaseImage(&src);
cvReleaseImage(&dst);
clReleaseProgram(program);
clReleaseContext(context);
clReleaseDevice(device);
return (diff?1:0);
}