in reInvent19_Developer_Workshop/modules/module_02/src/single_buffer/run_fpga.cpp [20:143]
void runOnFPGA(
unsigned int* doc_sizes,
unsigned int* input_doc_words,
unsigned int* bloom_filter,
unsigned long* profile_weights,
unsigned long* profile_score,
unsigned int total_num_docs,
unsigned int total_doc_size,
int num_iter)
{
if ((total_doc_size)%64!=0) {
printf("--------------------------------------------------------------------\n");
printf("ERROR: The number of word per iterations must be a multiple of 64\n");
printf(" Total words = %d, Number of iterations = 1, Word per iterations = %d\n", total_doc_size, total_doc_size);
printf(" Skipping FPGA kernel execution\n");
exit(-1);
}
// Boilerplate code to load the FPGA binary, create the kernel and command queue
vector<cl::Device> devices = xcl::get_xil_devices();
cl::Device device = devices[0];
cl::Context context(device);
cl::CommandQueue q(context,device, CL_QUEUE_PROFILING_ENABLE|CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
string run_type = xcl::is_emulation()?(xcl::is_hw_emulation()?"hw_emu":"sw_emu"):"hw";
string binary_file = kernel_name + "_" + run_type + ".awsxclbin";
cl::Program::Binaries bins = xcl::import_binary_file(binary_file);
cl::Program program(context, devices, bins);
cl::Kernel kernel(program,kernel_name_charptr,NULL);
unsigned int total_size = total_doc_size;
unsigned char* output_inh_flags = (unsigned char*)aligned_alloc(4096, total_size*sizeof(char));
bool load_filter = true;
// Create buffers
cl::Buffer buffer_bloom_filter(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, bloom_filter_size*sizeof(uint),bloom_filter);
cl::Buffer buffer_input_doc_words(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, total_size*sizeof(uint),input_doc_words);
cl::Buffer buffer_output_inh_flags(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, total_size*sizeof(char),output_inh_flags);
// Set buffer kernel arguments (needed to migrate the buffers in the correct memory)
kernel.setArg(0, buffer_output_inh_flags);
kernel.setArg(1, buffer_input_doc_words);
kernel.setArg(2, buffer_bloom_filter);
double mbytes_total = (double)(total_doc_size * sizeof(int)) / (double)(1000*1000);
printf(" Processing %.3f MBytes of data\n", mbytes_total);
printf("Running with a single buffer of %.3f MBytes for FPGA processing\n",mbytes_total);
// Create events for read,compute and write
vector<cl::Event> wordWait;
vector<cl::Event> krnlWait;
vector<cl::Event> flagWait;
cl::Event buffDone, krnlDone, flagDone;
printf("--------------------------------------------------------------------\n");
chrono::high_resolution_clock::time_point t1, t2;
t1 = chrono::high_resolution_clock::now();
// Load the bloom filter and input document words buffers
q.enqueueMigrateMemObjects({buffer_bloom_filter, buffer_input_doc_words}, 0,NULL,&buffDone);
wordWait.push_back(buffDone);
// Start the FPGA compute
load_filter = true;
kernel.setArg(3, total_size);
kernel.setArg(4, load_filter);
q.enqueueTask(kernel,&wordWait,&krnlDone);
krnlWait.push_back(krnlDone);
// Read back the results from FPGA to host
q.enqueueMigrateMemObjects({buffer_output_inh_flags}, CL_MIGRATE_MEM_OBJECT_HOST,&krnlWait,&flagDone);
flagWait.push_back(flagDone);
q.finish();
// Compute the profile score the CPU using the in-hash flags computed on the FPGA
unsigned curr_entry;
unsigned char inh_flags;
for(unsigned int doc=0, n=0; doc<total_num_docs;doc++)
{
unsigned long ans = 0;
unsigned int size = doc_sizes[doc];
for (unsigned i = 0; i < size ; i++, n++)
{
curr_entry = input_doc_words[n];
inh_flags = output_inh_flags[n];
if (inh_flags)
{
unsigned frequency = curr_entry & 0x00ff;
unsigned word_id = curr_entry >> 8;
ans += profile_weights[word_id] * (unsigned long)frequency;
}
}
profile_score[doc] = ans;
}
t2 = chrono::high_resolution_clock::now();
chrono::duration<double> perf_all_sec = chrono::duration_cast<duration<double>>(t2-t1);
cl_ulong f1 = 0;
cl_ulong f2 = 0;
wordWait.front().getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &f1);
flagWait.back().getProfilingInfo(CL_PROFILING_COMMAND_END, &f2);
double perf_hw_ms = (f2 - f1)/1000000.0;
if (xcl::is_emulation()) {
if (xcl::is_hw_emulation()) {
printf(" Emulated FPGA accelerated version | run 'vitis_analyzer xclbin.run_summary' for performance estimates");
} else {
printf(" Emulated FPGA accelerated version | (performance not relevant in SW emulation)");
}
} else {
printf(" Executed FPGA accelerated version | %10.4f ms ( FPGA %.3f ms )", 1000*perf_all_sec.count(), perf_hw_ms);
}
printf("\n");
}