in reInvent19_Developer_Workshop/modules/module_03/design/reference-files/addi_k/convolve.cpp [34:174]
void convolve(FILE* streamIn, FILE* streamOut,
float* coefficients, int coefficient_size,
arguments args) {
size_t frame_bytes = args.width * args.height * sizeof(RGBPixel);
size_t gray_frame_bytes = args.width * args.height * sizeof(GrayPixel);
vector<RGBPixel> inFrame(args.width * args.height);
vector<RGBPixel> outFrame(args.width * args.height);
vector<GrayPixel, aligned_allocator<GrayPixel>> grayFrame(args.width * args.height);
size_t bytes_read = 0;
size_t bytes_written = 0;
size_t total_coefficient_size = coefficient_size * coefficient_size;
vector<float, aligned_allocator<float>> filter_coeff(coefficients, coefficients + total_coefficient_size);
size_t coefficient_size_bytes = sizeof(float) * total_coefficient_size;
vector<cl::Device> devices = xcl::get_xil_devices();
cl::Device device = devices[0];
cl::Context context(device);
//cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE);
cl::CommandQueue q(context, device,
cl::QueueProperties::Profiling | cl::QueueProperties::OutOfOrder);
cl::Program::Binaries bins = xcl::import_binary_file(args.binary_file);
devices.resize(1);
cl::Program program(context, devices, bins);
cl::Kernel convolve_kernel(program, args.kernel_name);
cl::Buffer buffer_input(context, CL_MEM_READ_ONLY, frame_bytes, NULL);
cl::Buffer buffer_output(context, CL_MEM_READ_WRITE, frame_bytes, NULL);
cl::Buffer buffer_coefficient(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, coefficient_size_bytes, filter_coeff.data());
cl::Buffer gray_output(context, CL_MEM_WRITE_ONLY, gray_frame_bytes, NULL);
convolve_kernel.setArg(0, buffer_input);
convolve_kernel.setArg(1, buffer_output);
convolve_kernel.setArg(2, buffer_coefficient);
convolve_kernel.setArg(3, coefficient_size);
convolve_kernel.setArg(4, args.width);
convolve_kernel.setArg(5, args.height);
q.enqueueMigrateMemObjects({buffer_coefficient}, 0);
cl::Kernel grayscale_kernel(program, "grayscale_fpga");
grayscale_kernel.setArg(0, buffer_output);
grayscale_kernel.setArg(1, gray_output);
grayscale_kernel.setArg(2, args.width);
grayscale_kernel.setArg(3, args.height);
int compute_units = 4;
int lines_per_compute_unit = args.height / compute_units;
for(int frame_count = 0; frame_count < args.nframes; frame_count++) {
// Read frame
bytes_read = fread(inFrame.data(), 1, frame_bytes, streamIn);
if(bytes_read != frame_bytes) {
printf("\nError: partial frame.\nExpected %zu\nActual %zu\n", frame_bytes, bytes_read);
break;
}
/*
convolve_cpu(inFrame.data(), outFrame.data(),
coefficients, coefficient_size,
args.width, args.height);
*/
// q.enqueueWriteBuffer(buffer_input, CL_FALSE, 0, frame_bytes, inFrame.data());
cl::Event write_event;
q.enqueueWriteBuffer(buffer_input, CL_FALSE, 0, frame_bytes, inFrame.data(), nullptr, &write_event);
//q.enqueueTask(convolve_kernel);
vector<cl::Event> iteration_events{write_event};
//cl::Event task_event;
//q.enqueueTask(convolve_kernel, &iteration_events, &task_event);
vector<cl::Event> task_events;
for(int cu = 0; cu < compute_units; cu++) {
cl::Event task_event;
convolve_kernel.setArg(6, cu * lines_per_compute_unit);
convolve_kernel.setArg(7, lines_per_compute_unit);
q.enqueueTask(convolve_kernel, &iteration_events, &task_event);
task_events.push_back(task_event);
}
copy(begin(task_events), end(task_events), std::back_inserter(iteration_events));
//q.enqueueReadBuffer(buffer_output, CL_TRUE, 0, frame_bytes, outFrame.data());
//iteration_events.push_back(task_event);
cl::Event read_event;
q.enqueueReadBuffer(buffer_output, CL_FALSE, 0, frame_bytes, outFrame.data(), &iteration_events, &read_event);
iteration_events.push_back(read_event);
read_event.wait();
if (args.gray) {
cl::Event gray_event;
q.enqueueTask(grayscale_kernel, &task_events, &gray_event);
iteration_events.push_back(gray_event);
cl::Event read_event;
q.enqueueReadBuffer(gray_output, CL_FALSE, 0, gray_frame_bytes,
grayFrame.data(), &iteration_events, &read_event);
iteration_events.push_back(read_event);
iteration_events.back().wait();
bytes_written = fwrite(grayFrame.data(), 1, gray_frame_bytes, streamOut);
fflush(streamOut);
if(bytes_written != gray_frame_bytes) {
printf("\nError: partial frame.\nExpected %zu\nActual %zu\n", gray_frame_bytes, bytes_written);
break;
}
} else {
cl::Event read_event;
q.enqueueReadBuffer(buffer_output, CL_FALSE, 0, frame_bytes,
outFrame.data(), &iteration_events, &read_event);
iteration_events.push_back(read_event);
iteration_events.back().wait();
bytes_written = fwrite(outFrame.data(), 1, frame_bytes, streamOut);
fflush(streamOut);
if(bytes_written != frame_bytes) {
printf("\nError: partial frame.\nExpected %zu\nActual %zu\n", frame_bytes, bytes_written);
break;
}
print_progress(frame_count, args.nframes);
}
}
q.finish();
}