// void convolve()
//
// Excerpt from reInvent19_Developer_Workshop/modules/module_03/design/reference-files/multicu/convolve.cpp [34:153]


void convolve(FILE* streamIn, FILE* streamOut,
              float* coefficients, int coefficient_size,
              arguments args) {
    size_t frame_bytes = args.width * args.height * sizeof(RGBPixel);
    size_t gray_frame_bytes = args.width * args.height * sizeof(GrayPixel);
    vector<RGBPixel> inFrame(args.width * args.height);
    vector<RGBPixel> outFrame(args.width * args.height);
    vector<GrayPixel> grayFrame(args.width * args.height);

    size_t bytes_read = 0;
    size_t bytes_written = 0;


    size_t total_coefficient_size = coefficient_size * coefficient_size;
    vector<float, aligned_allocator<float>> filter_coeff(coefficients, coefficients + total_coefficient_size);
    size_t coefficient_size_bytes = sizeof(float) * total_coefficient_size;


    vector<cl::Device> devices = xcl::get_xil_devices();
    cl::Device device = devices[0];


    cl::Context context(device);
    cl::CommandQueue q(context, device,
    cl::QueueProperties::Profiling | cl::QueueProperties::OutOfOrder);

    cl::Program::Binaries bins = xcl::import_binary_file(args.binary_file);
    devices.resize(1);
    cl::Program program(context, devices, bins);
    cl::Kernel convolve_kernel(program, args.kernel_name);


    cl::Buffer buffer_input(context, CL_MEM_READ_ONLY, frame_bytes, NULL);
    cl::Buffer buffer_output(context, CL_MEM_WRITE_ONLY, frame_bytes, NULL);
    cl::Buffer buffer_coefficient(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, coefficient_size_bytes, filter_coeff.data());


    convolve_kernel.setArg(0, buffer_input);
    convolve_kernel.setArg(1, buffer_output);
    convolve_kernel.setArg(2, buffer_coefficient);
    convolve_kernel.setArg(3, coefficient_size);
    convolve_kernel.setArg(4, args.width);
    convolve_kernel.setArg(5, args.height);

    q.enqueueMigrateMemObjects({buffer_coefficient}, 0);
	
    int compute_units = 4;
    int lines_per_compute_unit = args.height / compute_units;
	
	auto fpga_begin = std::chrono::high_resolution_clock::now();

    for(int frame_count = 0; frame_count < args.nframes; frame_count++) {
        // Read frame
        bytes_read = fread(inFrame.data(), 1, frame_bytes, streamIn);
        if(bytes_read != frame_bytes) {
        	printf("\nError: partial frame.\nExpected %zu\nActual %zu\n", frame_bytes, bytes_read);
        	break;
        }

        /*
        convolve_cpu(inFrame.data(), outFrame.data(),
                     coefficients, coefficient_size,
                     args.width, args.height);
        */

        cl::Event write_event;
        q.enqueueWriteBuffer(buffer_input, CL_FALSE, 0, frame_bytes, inFrame.data(), nullptr, &write_event);
				  
	vector<cl::Event> iteration_events{write_event};
	vector<cl::Event> task_events;
        for(int cu = 0; cu < compute_units; cu++) {
            cl::Event task_event;
            convolve_kernel.setArg(6, cu * lines_per_compute_unit);
            convolve_kernel.setArg(7, lines_per_compute_unit);
            q.enqueueTask(convolve_kernel, &iteration_events, &task_event);
            task_events.push_back(task_event);
        }
        copy(begin(task_events), end(task_events), std::back_inserter(iteration_events));
		
        cl::Event read_event;
        q.enqueueReadBuffer(buffer_output, CL_FALSE, 0, frame_bytes, outFrame.data(), &iteration_events, &read_event);
        iteration_events.push_back(read_event);
	read_event.wait();
		
        if(args.gray) {
          grayscale_cpu(outFrame.data(), grayFrame.data(), args.width, args.height);
          bytes_written = fwrite(outFrame.data(), 1, gray_frame_bytes, streamOut);
          fflush(streamOut);
          if (bytes_written != gray_frame_bytes) {
            printf("\nError: partial frame.\nExpected %zu\nActual %zu\n",
                   gray_frame_bytes, bytes_written);
            break;
          }
        } else {
          bytes_written = fwrite(outFrame.data(), 1, frame_bytes, streamOut);
          fflush(streamOut);
          if (bytes_written != frame_bytes) {
            printf("\nError: partial frame.\nExpected %zu\nActual %zu\n",
                   frame_bytes, bytes_written);
            break;
          }
          // test(inFrame, outFrame, coefficients, coefficient_size, width, height);
        }

        print_progress(frame_count, args.nframes);
    }
    q.finish();

    auto fpga_end = std::chrono::high_resolution_clock::now();

    // Report performance (if not running in emulation mode)
    if (getenv("XCL_EMULATION_MODE") == NULL) {
        std::chrono::duration<double> fpga_duration = fpga_end - fpga_begin;
        std::cout << "                 " << std::endl;
        std::cout << "FPGA Time:       " << fpga_duration.count() << " s" << std::endl;
        std::cout << "FPGA Throughput: "
                  << (1920*1080*4*132) / fpga_duration.count() / (1024.0*1024.0)
                  << " MB/s" << std::endl;
     }
}