void runOnFPGA()

in reInvent19_Developer_Workshop/modules/module_02/reference_files/run_split_buffer.cpp [20:194]
129 lines of code
7 McCabe index (conditional complexity)

/**
 * Runs the bloom-filter kernel on the FPGA with the input split into two
 * sub-buffers, then computes the per-document profile score on the CPU.
 *
 * Flow: load the FPGA binary, create one buffer each for the bloom filter,
 * the document words and the output flags, split the word/flag buffers into
 * two halves, enqueue "transfer -> kernel -> read back" once per half, wait
 * for both read-backs, and finally accumulate the scores on the host.
 *
 * @param doc_sizes        Number of words in each document (total_num_docs entries).
 * @param input_doc_words  Words of all documents, back to back (total_doc_size
 *                         entries). Each entry is decoded below as
 *                         (word_id << 8) | frequency. Used directly as the host
 *                         pointer of an OpenCL buffer.
 * @param bloom_filter     Bloom filter coefficients (bloom_filter_size entries).
 * @param profile_weights  Weight table indexed by word_id.
 * @param profile_score    Output: one score per document (total_num_docs entries).
 * @param total_num_docs   Number of documents.
 * @param total_doc_size   Total number of words; must be even and each half
 *                         must be a multiple of 64, otherwise the process exits.
 * @param num_iter         Unused here: this version always splits the data in
 *                         exactly 2 iterations. Kept for interface compatibility.
 */
void runOnFPGA(
	unsigned int*  doc_sizes,
	unsigned int*  input_doc_words,
	unsigned int*  bloom_filter,
	unsigned long* profile_weights,
	unsigned long* profile_score,
	unsigned int   total_num_docs,
	unsigned int   total_doc_size,
        int   num_iter)
{
	(void)num_iter; // intentionally unused, see function comment

	// The data is processed as two equally sized halves. An odd total cannot be
	// split evenly: integer division would silently drop the last word, which
	// would then never be written by the kernel but still be read by the CPU
	// post-processing loop. Reject it together with the multiple-of-64 check.
	const bool odd_total = (total_doc_size % 2) != 0;
	if (odd_total || (total_doc_size/2)%64!=0) {
		printf("--------------------------------------------------------------------\n");
		if (odd_total) {
			printf("ERROR: The total number of words must be even to be split in 2 iterations\n");
		}
		printf("ERROR: The number of word per iterations must be a multiple of 64\n");
		printf("       Total words = %u, Number of iterations = 2, Word per iterations = %u\n", total_doc_size,total_doc_size/2);
		printf("       Skipping FPGA kernel execution\n");
		exit(-1);
	}

	// Host-side storage for the flags produced by the kernel (one byte per word).
	// It is allocated BEFORE any OpenCL object so that the guard below is
	// destroyed AFTER all of them: the buffer is registered with
	// CL_MEM_USE_HOST_PTR and must outlive every cl::Buffer that refers to it.
	unsigned int total_size = total_doc_size;
	unsigned char* output_inh_flags = (unsigned char*)aligned_alloc(4096, total_size*sizeof(char));
	if (output_inh_flags == NULL) {
		printf("--------------------------------------------------------------------\n");
		printf("ERROR: Failed to allocate %u bytes for the output flags\n", total_size);
		exit(-1);
	}
	// Releases the host buffer on function exit (it used to be leaked).
	struct HostBufferGuard {
		void* ptr;
		~HostBufferGuard() { free(ptr); }
	} output_inh_flags_guard{output_inh_flags};

	// Boilerplate code to load the FPGA binary, create the kernel and command queue
	vector<cl::Device> devices = xcl::get_xil_devices();
	if (devices.empty()) {
		printf("--------------------------------------------------------------------\n");
		printf("ERROR: No Xilinx device found\n");
		exit(-1);
	}
	cl::Device device = devices[0];
	cl::Context context(device);
	// Out-of-order queue: ordering between commands is expressed only through
	// the event wait lists built below.
	cl::CommandQueue q(context,device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE );

	string run_type = xcl::is_emulation()?(xcl::is_hw_emulation()?"hw_emu":"sw_emu"):"hw";
	string binary_file = kernel_name + "_" + run_type + ".awsxclbin";
	cl::Program::Binaries bins = xcl::import_binary_file(binary_file);
	cl::Program program(context, devices, bins);
	cl::Kernel kernel(program,kernel_name_charptr,NULL);

	bool load_filter = true;

	// Create buffers (all of them wrap existing host memory)
	cl::Buffer buffer_bloom_filter(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, bloom_filter_size*sizeof(uint),bloom_filter);
	cl::Buffer buffer_input_doc_words(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, total_size*sizeof(uint),input_doc_words);
	cl::Buffer buffer_output_inh_flags(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, total_size*sizeof(char),output_inh_flags);

	// Set buffer kernel arguments (needed to migrate the buffers in the correct memory)
	kernel.setArg(0, buffer_output_inh_flags);
	kernel.setArg(1, buffer_input_doc_words);
	kernel.setArg(2, buffer_bloom_filter);

	// Make buffers resident in the device (contents are not transferred here)
	q.enqueueMigrateMemObjects({buffer_bloom_filter, buffer_input_doc_words, buffer_output_inh_flags}, CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED);

	// Specify size of sub-buffers (in elements), one for each transaction
	unsigned subbuf_doc_sz = total_doc_size/2;
	unsigned subbuf_inh_sz = total_doc_size/2;

	// Declare sub-buffer regions to specify offset and size of sub-buffer
	cl_buffer_region subbuf_inh_info[2];
	cl_buffer_region subbuf_doc_info[2];

	// Declare sub-buffers
	cl::Buffer subbuf_inh_flags[2];
	cl::Buffer subbuf_doc_words[2];

	// Specify offset and size (both in bytes) of sub-buffers:
	// region 0 is the first half, region 1 the second half.
	subbuf_inh_info[0]={0, subbuf_inh_sz*sizeof(char)};
	subbuf_inh_info[1]={subbuf_inh_sz*sizeof(char), subbuf_inh_sz*sizeof(char)};
	subbuf_doc_info[0]={0, subbuf_doc_sz*sizeof(uint)};
	subbuf_doc_info[1]={subbuf_doc_sz*sizeof(uint), subbuf_doc_sz*sizeof(uint)};

	// Create sub-buffers from buffers based on sub-buffer regions
	subbuf_inh_flags[0] = buffer_output_inh_flags.createSubBuffer(CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &subbuf_inh_info[0]);
	subbuf_inh_flags[1] = buffer_output_inh_flags.createSubBuffer(CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &subbuf_inh_info[1]);
	subbuf_doc_words[0] = buffer_input_doc_words.createSubBuffer (CL_MEM_READ_ONLY,  CL_BUFFER_CREATE_TYPE_REGION, &subbuf_doc_info[0]);
	subbuf_doc_words[1] = buffer_input_doc_words.createSubBuffer (CL_MEM_READ_ONLY,  CL_BUFFER_CREATE_TYPE_REGION, &subbuf_doc_info[1]);

	printf("\n");
	double mbytes_total  = (double)(total_doc_size * sizeof(int)) / (double)(1000*1000);
	double mbytes_block  = mbytes_total / 2;
	printf(" Processing %.3f MBytes of data\n", mbytes_total);
	printf(" Splitting data in 2 sub-buffers of %.3f MBytes for FPGA processing\n", mbytes_block);

	// Create Events to co-ordinate read,compute and write for each iteration.
	// Each vector accumulates every event of its kind, so a later command that
	// waits on a vector waits on all earlier commands recorded in it.
	vector<cl::Event> wordWait;   // host -> device transfers
	vector<cl::Event> krnlWait;   // kernel executions
	vector<cl::Event> flagWait;   // device -> host transfers of the flags

	printf("--------------------------------------------------------------------\n");

	chrono::high_resolution_clock::time_point t1, t2;
	t1 = chrono::high_resolution_clock::now();

	// Set kernel arguments. Load bloom filter coefficients:
	// the kernel is launched once with size 0 and load_filter = true.
	cl::Event buffDone,krnlDone,flagDone;
	total_size = 0;
	load_filter = true;
	kernel.setArg(3, total_size);
	kernel.setArg(4, load_filter);
	q.enqueueMigrateMemObjects({buffer_bloom_filter}, 0, NULL, &buffDone);
	wordWait.push_back(buffDone);
	q.enqueueTask(kernel, &wordWait, &krnlDone);
	krnlWait.push_back(krnlDone);

	//  Set Kernel Arguments, Read, Enqueue Kernel and Write for first iteration
	total_size = total_doc_size/2;
	load_filter=false;
	kernel.setArg(3, total_size);
	kernel.setArg(4, load_filter);
	kernel.setArg(0, subbuf_inh_flags[0]);
	kernel.setArg(1, subbuf_doc_words[0]);
	q.enqueueMigrateMemObjects({subbuf_doc_words[0]}, 0, &wordWait, &buffDone);
	wordWait.push_back(buffDone);
	q.enqueueTask(kernel, &wordWait, &krnlDone);
	krnlWait.push_back(krnlDone);
	q.enqueueMigrateMemObjects({subbuf_inh_flags[0]}, CL_MIGRATE_MEM_OBJECT_HOST, &krnlWait, &flagDone);
	flagWait.push_back(flagDone);

	//  Set Kernel Arguments, Read, Enqueue Kernel and Write for second iteration
	//  (arguments 3 and 4 keep the values set for the first iteration)
	kernel.setArg(0, subbuf_inh_flags[1]);
	kernel.setArg(1, subbuf_doc_words[1]);
	q.enqueueMigrateMemObjects({subbuf_doc_words[1]}, 0, &wordWait, &buffDone);
	wordWait.push_back(buffDone);
	q.enqueueTask(kernel, &wordWait, &krnlDone);
	krnlWait.push_back(krnlDone);
	q.enqueueMigrateMemObjects({subbuf_inh_flags[1]}, CL_MIGRATE_MEM_OBJECT_HOST, &krnlWait, &flagDone);
	flagWait.push_back(flagDone);

	// Wait until all results are copied back to the host before doing the post-processing
	flagWait[0].wait();
	flagWait[1].wait();

	// Compute the profile score in CPU using the in-hash flags computed on the FPGA
	unsigned      curr_entry;
	unsigned char inh_flags;

	// n is the running index into the flat word/flag arrays across all documents
	for(unsigned int doc=0, n=0; doc<total_num_docs;doc++)
	{
		unsigned long ans = 0;
		unsigned int size = doc_sizes[doc];

		for (unsigned i = 0; i < size ; i++, n++)
		{
			curr_entry = input_doc_words[n];
			inh_flags  = output_inh_flags[n];

			// Only words flagged by the FPGA contribute to the score
			if (inh_flags)
			{
				unsigned frequency = curr_entry & 0x00ff;   // low 8 bits
				unsigned word_id = curr_entry >> 8;         // remaining upper bits

				ans += profile_weights[word_id] * (unsigned long)frequency;
			}
		}
		profile_score[doc] = ans;
	}

	t2 = chrono::high_resolution_clock::now();
	chrono::duration<double> perf_all_sec  = chrono::duration_cast<duration<double>>(t2-t1);

	// FPGA time: from the first transfer being queued to the last read-back ending
	// (profiling counters are in nanoseconds, converted to milliseconds here)
	cl_ulong f1 = 0;
	cl_ulong f2 = 0;
	wordWait.front().getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &f1);
	flagWait.back().getProfilingInfo(CL_PROFILING_COMMAND_END, &f2);
	double perf_hw_ms = (f2 - f1)/1000000.0;

	if (xcl::is_emulation()) {
		if (xcl::is_hw_emulation()) {
			printf(" Emulated FPGA accelerated version  | run 'vitis_analyzer xclbin.run_summary' for performance estimates");
		} else {
			printf(" Emulated FPGA accelerated version  | (performance not relevant in SW emulation)");
		}
	} else {
		printf(" Executed FPGA accelerated version  | %10.4f ms   ( FPGA %.3f ms )", 1000*perf_all_sec.count(), perf_hw_ms);
	}
	printf("\n");
}