void convolve()

in reInvent19_Developer_Workshop/modules/module_03/design/reference-files/addi_k/convolve.cpp [34:174]


void convolve(FILE* streamIn, FILE* streamOut,
              float* coefficients, int coefficient_size,
              arguments args) {
    size_t frame_bytes = args.width * args.height * sizeof(RGBPixel);
    size_t gray_frame_bytes = args.width * args.height * sizeof(GrayPixel);
    vector<RGBPixel> inFrame(args.width * args.height);
    vector<RGBPixel> outFrame(args.width * args.height);
    
    vector<GrayPixel, aligned_allocator<GrayPixel>> grayFrame(args.width * args.height);
	
    size_t bytes_read = 0;
    size_t bytes_written = 0;


    size_t total_coefficient_size = coefficient_size * coefficient_size;
    vector<float, aligned_allocator<float>> filter_coeff(coefficients, coefficients + total_coefficient_size);
    size_t coefficient_size_bytes = sizeof(float) * total_coefficient_size;


    vector<cl::Device> devices = xcl::get_xil_devices();
    cl::Device device = devices[0];


    cl::Context context(device);
    //cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE);
    cl::CommandQueue q(context, device,
    cl::QueueProperties::Profiling | cl::QueueProperties::OutOfOrder);

    cl::Program::Binaries bins = xcl::import_binary_file(args.binary_file);
    devices.resize(1);
    cl::Program program(context, devices, bins);
    cl::Kernel convolve_kernel(program, args.kernel_name);


    cl::Buffer buffer_input(context, CL_MEM_READ_ONLY, frame_bytes, NULL);
    
	cl::Buffer buffer_output(context, CL_MEM_READ_WRITE, frame_bytes, NULL);

    cl::Buffer buffer_coefficient(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, coefficient_size_bytes, filter_coeff.data());

    cl::Buffer gray_output(context, CL_MEM_WRITE_ONLY, gray_frame_bytes, NULL);

    convolve_kernel.setArg(0, buffer_input);
    convolve_kernel.setArg(1, buffer_output);
    convolve_kernel.setArg(2, buffer_coefficient);
    convolve_kernel.setArg(3, coefficient_size);
    convolve_kernel.setArg(4, args.width);
    convolve_kernel.setArg(5, args.height);

    q.enqueueMigrateMemObjects({buffer_coefficient}, 0);

    cl::Kernel grayscale_kernel(program, "grayscale_fpga");
	 
    grayscale_kernel.setArg(0, buffer_output);
    grayscale_kernel.setArg(1, gray_output);
    grayscale_kernel.setArg(2, args.width);
    grayscale_kernel.setArg(3, args.height);
	
	
    int compute_units = 4;
    int lines_per_compute_unit = args.height / compute_units;


    for(int frame_count = 0; frame_count < args.nframes; frame_count++) {
        // Read frame
        bytes_read = fread(inFrame.data(), 1, frame_bytes, streamIn);
        if(bytes_read != frame_bytes) {
        	printf("\nError: partial frame.\nExpected %zu\nActual %zu\n", frame_bytes, bytes_read);
        	break;
        }

        /*
        convolve_cpu(inFrame.data(), outFrame.data(),
                     coefficients, coefficient_size,
                     args.width, args.height);
        */

        // q.enqueueWriteBuffer(buffer_input, CL_FALSE, 0, frame_bytes, inFrame.data());
        cl::Event write_event;
        q.enqueueWriteBuffer(buffer_input, CL_FALSE, 0, frame_bytes, inFrame.data(), nullptr, &write_event);
				  
		//q.enqueueTask(convolve_kernel);
		vector<cl::Event> iteration_events{write_event};
        //cl::Event task_event;
        //q.enqueueTask(convolve_kernel, &iteration_events, &task_event);	
		vector<cl::Event> task_events;
        for(int cu = 0; cu < compute_units; cu++) {
            cl::Event task_event;
            convolve_kernel.setArg(6, cu * lines_per_compute_unit);
            convolve_kernel.setArg(7, lines_per_compute_unit);
            q.enqueueTask(convolve_kernel, &iteration_events, &task_event);
            task_events.push_back(task_event);
        }
        copy(begin(task_events), end(task_events), std::back_inserter(iteration_events));
		
        //q.enqueueReadBuffer(buffer_output, CL_TRUE, 0, frame_bytes, outFrame.data());
        //iteration_events.push_back(task_event);
        cl::Event read_event;
        q.enqueueReadBuffer(buffer_output, CL_FALSE, 0, frame_bytes, outFrame.data(), &iteration_events, &read_event);
        iteration_events.push_back(read_event);
		read_event.wait();
		
 
 if (args.gray) {
   cl::Event gray_event;
   q.enqueueTask(grayscale_kernel, &task_events, &gray_event);
   iteration_events.push_back(gray_event);
   cl::Event read_event;
   q.enqueueReadBuffer(gray_output, CL_FALSE, 0, gray_frame_bytes,
                       grayFrame.data(), &iteration_events, &read_event);
   iteration_events.push_back(read_event);
 
   iteration_events.back().wait();
   bytes_written = fwrite(grayFrame.data(), 1, gray_frame_bytes, streamOut);
   fflush(streamOut);
 
   if(bytes_written != gray_frame_bytes) {
       printf("\nError: partial frame.\nExpected %zu\nActual %zu\n", gray_frame_bytes, bytes_written);
       break;
   }
 
 } else {
   cl::Event read_event;
   q.enqueueReadBuffer(buffer_output, CL_FALSE, 0, frame_bytes,
                       outFrame.data(), &iteration_events, &read_event);
   iteration_events.push_back(read_event);
 
   iteration_events.back().wait();
   bytes_written = fwrite(outFrame.data(), 1, frame_bytes, streamOut);
   fflush(streamOut);
 
   if(bytes_written != frame_bytes) {
       printf("\nError: partial frame.\nExpected %zu\nActual %zu\n", frame_bytes, bytes_written);
       break;
   }

        print_progress(frame_count, args.nframes);
    }
  }
    q.finish();
 }