in MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/mesh.cpp [6424:7957]
void Mesh::gpu_calc_neighbors_local(void)
{
if (! gpu_do_rezone) return;
ulong gpu_hash_table_size = 0;
struct timeval tstart_cpu;
cpu_timer_start(&tstart_cpu);
struct timeval tstart_lev2;
if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
cl_command_queue command_queue = ezcl_get_command_queue();
gpu_counters[MESH_COUNTER_CALC_NEIGH]++;
ncells_ghost = ncells;
assert(dev_levtable);
assert(dev_level);
assert(dev_i);
assert(dev_j);
size_t one = 1;
cl_mem dev_check = ezcl_malloc(NULL, const_cast<char *>("dev_check"), &one, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
size_t mem_request = (int)((float)ncells*mem_factor);
dev_nlft = ezcl_malloc(NULL, const_cast<char *>("dev_nlft"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_nrht = ezcl_malloc(NULL, const_cast<char *>("dev_nrht"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_nbot = ezcl_malloc(NULL, const_cast<char *>("dev_nbot"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_ntop = ezcl_malloc(NULL, const_cast<char *>("dev_ntop"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
size_t local_work_size = 64;
size_t global_work_size = ((ncells + local_work_size - 1) /local_work_size) * local_work_size;
size_t block_size = global_work_size/local_work_size;
//printf("DEBUG file %s line %d lws = %d gws %d bs %d ncells %d\n",__FILE__,__LINE__,
// local_work_size, global_work_size, block_size, ncells);
cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_int4), CL_MEM_READ_WRITE, 0);
cl_mem dev_sizes = ezcl_malloc(NULL, const_cast<char *>("dev_sizes"), &one, sizeof(cl_int4), CL_MEM_READ_WRITE, 0);
#ifdef BOUNDS_CHECK
if (ezcl_get_device_mem_nelements(dev_i) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_j) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_level) < (int)ncells ){
printf("%d: Warning ncells %ld size dev_i %d dev_j %d dev_level %d\n",mype,ncells,ezcl_get_device_mem_nelements(dev_i),ezcl_get_device_mem_nelements(dev_j),ezcl_get_device_mem_nelements(dev_level));
}
#endif
/*
__kernel void calc_hash_size_cl(
const int ncells, // 0
const int levmx, // 1
__global int *levtable, // 2
__global int *level, // 3
__global int *i, // 4
__global int *j, // 5
__global int4 *redscratch, // 6
__global int4 *sizes, // 7
__local int4 *tile) // 8
*/
ezcl_set_kernel_arg(kernel_hash_size, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_hash_size, 1, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_hash_size, 2, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_hash_size, 3, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_hash_size, 4, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_hash_size, 5, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_hash_size, 6, sizeof(cl_mem), (void *)&dev_redscratch);
ezcl_set_kernel_arg(kernel_hash_size, 7, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_hash_size, 8, local_work_size*sizeof(cl_int4), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_hash_size, 1, NULL, &global_work_size, &local_work_size, NULL);
if (block_size > 1) {
/*
__kernel void finish_reduction_minmax4_cl(
const int isize, // 0
__global int4 *redscratch, // 1
__global int4 *sizes, // 2
__local int4 *tile) // 3
*/
ezcl_set_kernel_arg(kernel_finish_hash_size, 0, sizeof(cl_int), (void *)&block_size);
ezcl_set_kernel_arg(kernel_finish_hash_size, 1, sizeof(cl_mem), (void *)&dev_redscratch);
ezcl_set_kernel_arg(kernel_finish_hash_size, 2, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_finish_hash_size, 3, local_work_size*sizeof(cl_int4), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_finish_hash_size, 1, NULL, &local_work_size, &local_work_size, NULL);
}
ezcl_device_memory_delete(dev_redscratch);
cl_int sizes[4];
ezcl_enqueue_read_buffer(command_queue, dev_sizes, CL_TRUE, 0, 1*sizeof(cl_int4), &sizes, NULL);
int imintile = sizes[0];
int imaxtile = sizes[1];
int jmintile = sizes[2];
int jmaxtile = sizes[3];
// Expand size by 2*coarse_cells for ghost cells
// TODO: May want to get fancier here and calc based on cell level
int jminsize = max(jmintile-2*IPOW2(levmx),0);
int jmaxsize = min(jmaxtile+2*IPOW2(levmx),(jmax+1)*IPOW2(levmx));
int iminsize = max(imintile-2*IPOW2(levmx),0);
int imaxsize = min(imaxtile+2*IPOW2(levmx),(imax+1)*IPOW2(levmx));
//fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize,imaxsize,jminsize,jmaxsize);
//ezcl_enqueue_write_buffer(command_queue, dev_sizes, CL_TRUE, 0, 1*sizeof(cl_int4), &sizes, NULL);
int gpu_hash_method = METHOD_UNSET;
// allow imput.c to control hash types and methods
if (choose_hash_method != METHOD_UNSET) gpu_hash_method = choose_hash_method;
//=========
size_t hashsize;
uint hash_report_level = 1;
cl_mem dev_hash_header = NULL;
cl_mem dev_hash = gpu_compact_hash_init(ncells, imaxsize-iminsize, jmaxsize-jminsize, gpu_hash_method, hash_report_level, &gpu_hash_table_size, &hashsize, &dev_hash_header);
int csize = corners_i.size();
#ifdef BOUNDS_CHECK
for (int ic=0; ic<csize; ic++){
if (corners_i[ic] >= iminsize) continue;
if (corners_j[ic] >= jminsize) continue;
if (corners_i[ic] < imaxsize) continue;
if (corners_j[ic] < jmaxsize) continue;
if ( (corners_j[ic]-jminsize)*(imaxsize-iminsize)+(corners_i[ic]-iminsize) < 0 ||
(corners_j[ic]-jminsize)*(imaxsize-iminsize)+(corners_i[ic]-iminsize) > (int)hashsize){
printf("%d: Warning corners i %d j %d hash %d\n",mype,corners_i[ic],corners_j[ic],
(corners_j[ic]-jminsize)*(imaxsize-iminsize)+(corners_i[ic]-iminsize));
}
}
#endif
size_t corners_local_work_size = MIN(csize, TILE_SIZE);
size_t corners_global_work_size = ((csize+corners_local_work_size - 1) /corners_local_work_size) * corners_local_work_size;
ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 0, sizeof(cl_int), (void *)&csize);
ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 1, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 2, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 3, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 4, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 5, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_hash_adjust_sizes, 1, NULL, &corners_global_work_size, &corners_local_work_size, NULL);
if (DEBUG){
vector<int> sizes_tmp(4);
ezcl_enqueue_read_buffer(command_queue, dev_sizes, CL_TRUE, 0, 1*sizeof(cl_int4), &sizes_tmp[0], NULL);
int iminsize_tmp = sizes_tmp[0];
int imaxsize_tmp = sizes_tmp[1];
int jminsize_tmp = sizes_tmp[2];
int jmaxsize_tmp = sizes_tmp[3];
fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize_tmp,imaxsize_tmp,jminsize_tmp,jmaxsize_tmp);
}
local_work_size = 128;
global_work_size = ((ncells + local_work_size - 1) /local_work_size) * local_work_size;
#ifdef BOUNDS_CHECK
{
vector<int> i_tmp(ncells);
vector<int> j_tmp(ncells);
vector<int> level_tmp(ncells);
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells*sizeof(cl_int), &i_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells*sizeof(cl_int), &j_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, 0, ncells*sizeof(cl_int), &level_tmp[0], NULL);
for (int ic=0; ic<(int)ncells; ic++){
int lev = level_tmp[ic];
for ( int jj = j_tmp[ic]*IPOW2(levmx-lev)-jminsize; jj < (j_tmp[ic]+1)*IPOW2(levmx-lev)-jminsize; jj++) {
for (int ii = i_tmp[ic]*IPOW2(levmx-lev)-iminsize; ii < (i_tmp[ic]+1)*IPOW2(levmx-lev)-iminsize; ii++) {
if (jj < 0 || jj >= (jmaxsize-jminsize) || ii < 0 || ii >= (imaxsize-iminsize) ) {
printf("%d: Warning ncell %d writes to hash out-of-bounds at line %d ii %d jj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,ic,__LINE__,ii,jj,iminsize,imaxsize,jminsize,jmaxsize);
}
}
}
}
}
#endif
//printf("%d: lws %d gws %d \n",mype,local_work_size,global_work_size);
cl_event hash_setup_local_event;
/*
const int isize, // 0
const int levmx, // 1
const int imax, // 2
const int jmax, // 3
const int noffset, // 4
__global int *sizes, // 5
__global int *levtable, // 6
__global int *level, // 7
__global int *i, // 8
__global int *j, // 9
__global const ulong *hash_heaer, // 10
__global int *hash) // 11
*/
ezcl_set_kernel_arg(kernel_hash_setup_local, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_hash_setup_local, 1, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_hash_setup_local, 2, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_hash_setup_local, 3, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_hash_setup_local, 4, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_hash_setup_local, 5, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_hash_setup_local, 6, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_hash_setup_local, 7, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_hash_setup_local, 8, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_hash_setup_local, 9, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_hash_setup_local, 10, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_hash_setup_local, 11, sizeof(cl_mem), (void *)&dev_hash);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_hash_setup_local, 1, NULL, &global_work_size, &local_work_size, &hash_setup_local_event);
ezcl_wait_for_events(1, &hash_setup_local_event);
ezcl_event_release(hash_setup_local_event);
if (DEBUG){
vector<int> sizes_tmp(4);
ezcl_enqueue_read_buffer(command_queue, dev_sizes, CL_TRUE, 0, 1*sizeof(cl_int4), &sizes_tmp[0], NULL);
int iminsize_tmp = sizes_tmp[0];
int imaxsize_tmp = sizes_tmp[1];
int jminsize_tmp = sizes_tmp[2];
int jmaxsize_tmp = sizes_tmp[3];
fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize_tmp,imaxsize_tmp,jminsize_tmp,jmaxsize_tmp);
}
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_HASH_SETUP] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
#ifdef BOUNDS_CHECK
{
if (ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_i) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_j) < (int)ncells ||
ezcl_get_device_mem_nelements(dev_level) < (int)ncells ) {
printf("%d: Warning -- sizes for dev_neigh too small ncells %ld neigh %d %d %d %d %d %d %d\n",mype,ncells,ezcl_get_device_mem_nelements(dev_nlft),ezcl_get_device_mem_nelements(dev_nrht),ezcl_get_device_mem_nelements(dev_nbot),ezcl_get_device_mem_nelements(dev_ntop),ezcl_get_device_mem_nelements(dev_i),ezcl_get_device_mem_nelements(dev_j),ezcl_get_device_mem_nelements(dev_level));
}
vector<int> level_tmp(ncells);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, 0, ncells*sizeof(cl_int), &level_tmp[0], NULL);
int iflag = 0;
for (int ic=0; ic<ncells; ic++){
if (levmx-level_tmp[ic] < 0 || levmx-level_tmp[ic] > levmx) {
printf("%d: Warning level value bad ic %d level %d ncells %d\n",mype,ic,level_tmp[ic],ncells);
iflag++;
}
}
if (ezcl_get_device_mem_nelements(dev_levtable) < levmx+1) printf("%d Warning levtable too small levmx is %d devtable size is %d\n",mype,levmx,ezcl_get_device_mem_nelements(dev_levtable));
#ifdef HAVE_MPI
if (iflag > 20) {fflush(stdout); L7_Terminate(); exit(0);}
#endif
}
#endif
#ifdef BOUNDS_CHECK
{
int jmaxcalc = (jmax+1)*IPOW2(levmx);
int imaxcalc = (imax+1)*IPOW2(levmx);
vector<int> i_tmp(ncells);
vector<int> j_tmp(ncells);
vector<int> level_tmp(ncells);
vector<int> hash_tmp(hashsize);
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells*sizeof(cl_int), &i_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells*sizeof(cl_int), &j_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, 0, ncells*sizeof(cl_int), &level_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
for (int ic=0; ic<(int)ncells; ic++){
int ii = i_tmp[ic];
int jj = j_tmp[ic];
int lev = level_tmp[ic];
int levmult = IPOW2(levmx-lev);
int jjj=jj *levmult-jminsize;
int iii=max( ii *levmult-1, 0 )-iminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
jjj=jj *levmult-jminsize;
iii=min( (ii+1)*levmult, imaxcalc-1)-iminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
jjj=max( jj *levmult-1, 0) -jminsize;
iii=ii *levmult -iminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
jjj=min( (jj+1)*levmult, jmaxcalc-1)-jminsize;
iii=ii *levmult -iminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
int nlftval = hash_tmp[(( jj *levmult )-jminsize)*(imaxsize-iminsize)+((max( ii *levmult-1, 0 ))-iminsize)];
int nrhtval = hash_tmp[(( jj *levmult )-jminsize)*(imaxsize-iminsize)+((min( (ii+1)*levmult, imaxcalc-1))-iminsize)];
int nbotval = hash_tmp[((max( jj *levmult-1, 0) )-jminsize)*(imaxsize-iminsize)+(( ii *levmult )-iminsize)];
int ntopval = hash_tmp[((min( (jj+1)*levmult, jmaxcalc-1))-jminsize)*(imaxsize-iminsize)+(( ii *levmult )-iminsize)];
if (nlftval == INT_MIN){
jjj = jj*levmult-jminsize;
iii = ii*levmult-iminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
}
if (nrhtval == INT_MIN){
jjj = jj*levmult-jminsize;
iii = ii*levmult-iminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
}
if (nbotval == INT_MIN) {
iii = ii*levmult-iminsize;
jjj = jj*levmult-jminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
}
if (ntopval == INT_MIN) {
iii = ii*levmult-iminsize;
jjj = jj*levmult-jminsize;
if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
}
}
}
#endif
cl_event calc_neighbors_local_event;
/*
const int isize, // 0
const int levmx, // 1
const int imaxsize, // 2
const int jmaxsize, // 3
const int noffset, // 4
__global int *sizes, // 5
__global int *levtable, // 6
__global int *level, // 7
__global int *i, // 8
__global int *j, // 9
__global int *nlft, // 10
__global int *nrht, // 11
__global int *nbot, // 12
__global int *ntop, // 13
__global const ulong *hash_heaer, // 14
__global int *hash) // 15
*/
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 1, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 2, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 3, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 4, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 5, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 6, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 7, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 8, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 9, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 10, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 11, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 12, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 13, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 14, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_calc_neighbors_local, 15, sizeof(cl_mem), (void *)&dev_hash);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_neighbors_local, 1, NULL, &global_work_size, &local_work_size, &calc_neighbors_local_event);
ezcl_wait_for_events(1, &calc_neighbors_local_event);
ezcl_event_release(calc_neighbors_local_event);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_HASH_QUERY] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG) {
print_dev_local();
vector<int> hash_tmp(hashsize);
ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_FALSE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
cl_mem dev_hash_header_check = gpu_get_hash_header();
vector<ulong> hash_header_check(hash_header_size);
ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
int gpu_hash_method = (int)hash_header_check[0];
ulong gpu_hash_table_size = hash_header_check[1];
ulong gpu_AA = hash_header_check[2];
ulong gpu_BB = hash_header_check[3];
vector<int> nlft_tmp(ncells_ghost);
vector<int> nrht_tmp(ncells_ghost);
vector<int> nbot_tmp(ncells_ghost);
vector<int> ntop_tmp(ncells_ghost);
ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
fprintf(fp,"\n HASH 0 numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nlft numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
if (hashval >= 0 && hashval < (int)ncells) {
fprintf(fp,"%5d",nlft_tmp[hashval]);
} else {
fprintf(fp," ");
}
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nrht numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0])-noffset;
if (hashval >= 0 && hashval < (int)ncells) {
fprintf(fp,"%5d",nrht_tmp[hashval]);
} else {
fprintf(fp," ");
}
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nbot numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0])-noffset;
if (hashval >= 0 && hashval < (int)ncells) {
fprintf(fp,"%5d",nbot_tmp[hashval]);
} else {
fprintf(fp," ");
}
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n ntop numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0])-noffset;
if (hashval >= 0 && hashval < (int)ncells) {
fprintf(fp,"%5d",ntop_tmp[hashval]);
} else {
fprintf(fp," ");
}
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
}
#ifdef HAVE_MPI
if (numpe > 1) {
vector<int> iminsize_global(numpe);
vector<int> imaxsize_global(numpe);
vector<int> jminsize_global(numpe);
vector<int> jmaxsize_global(numpe);
vector<int> comm_partner(numpe,-1);
MPI_Allgather(&iminsize, 1, MPI_INT, &iminsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
MPI_Allgather(&imaxsize, 1, MPI_INT, &imaxsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
MPI_Allgather(&jminsize, 1, MPI_INT, &jminsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
MPI_Allgather(&jmaxsize, 1, MPI_INT, &jmaxsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
int num_comm_partners = 0;
for (int ip = 0; ip < numpe; ip++){
if (ip == mype) continue;
if (iminsize_global[ip] > imaxtile) continue;
if (imaxsize_global[ip] < imintile) continue;
if (jminsize_global[ip] > jmaxtile) continue;
if (jmaxsize_global[ip] < jmintile) continue;
comm_partner[num_comm_partners] = ip;
num_comm_partners++;
//if (DEBUG) fprintf(fp,"%d: overlap with processor %d bounding box is %d %d %d %d\n",mype,ip,iminsize_global[ip],imaxsize_global[ip],jminsize_global[ip],jmaxsize_global[ip]);
}
#ifdef BOUNDS_CHECK
{
vector<int> nlft_tmp(ncells_ghost);
vector<int> nrht_tmp(ncells_ghost);
vector<int> nbot_tmp(ncells_ghost);
vector<int> ntop_tmp(ncells_ghost);
ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells*sizeof(cl_int), &nlft_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells*sizeof(cl_int), &nrht_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells*sizeof(cl_int), &nbot_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells*sizeof(cl_int), &ntop_tmp[0], NULL);
for (uint ic=0; ic<ncells; ic++){
int nl = nlft_tmp[ic];
if (nl != -1){
nl -= noffset;
if (nl<0 || nl>= ncells) printf("%d: Warning at line %d cell %d nlft %d\n",mype,__LINE__,ic,nl);
}
int nr = nrht_tmp[ic];
if (nr != -1){
nr -= noffset;
if (nr<0 || nr>= ncells) printf("%d: Warning at line %d cell %d nrht %d\n",mype,__LINE__,ic,nr);
}
int nb = nbot_tmp[ic];
if (nb != -1){
nb -= noffset;
if (nb<0 || nb>= ncells) printf("%d: Warning at line %d cell %d nbot %d\n",mype,__LINE__,ic,nb);
}
int nt = ntop_tmp[ic];
if (nt != -1){
nt -= noffset;
if (nt<0 || nt>= ncells) printf("%d: Warning at line %d cell %d ntop %d\n",mype,__LINE__,ic,nt);
}
}
}
#endif
cl_mem dev_border_cell = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell1"), &ncells, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
ezcl_set_kernel_arg(kernel_calc_border_cells, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_border_cells, 1, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_calc_border_cells, 2, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_calc_border_cells, 3, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_calc_border_cells, 4, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_calc_border_cells, 5, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_calc_border_cells, 6, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_calc_border_cells, 7, sizeof(cl_mem), (void *)&dev_border_cell);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_border_cells, 1, NULL, &global_work_size, &local_work_size, NULL);
cl_mem dev_border_cell_new = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell2"), &ncells, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
size_t one = 1;
cl_mem dev_nbsize = ezcl_malloc(NULL, const_cast<char *>("dev_nbsize"), &one, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_ioffset = ezcl_malloc(NULL, const_cast<char *>("dev_ioffset"), &block_size, sizeof(cl_uint), CL_MEM_READ_WRITE, 0);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 1, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 2, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 3, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 4, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 5, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 6, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 7, sizeof(cl_mem), (void *)&dev_border_cell);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 8, sizeof(cl_mem), (void *)&dev_border_cell_new);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 9, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 10, sizeof(cl_mem), (void *)&dev_nbsize);
ezcl_set_kernel_arg(kernel_calc_border_cells2, 11, local_work_size*sizeof(cl_int), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_border_cells2, 1, NULL, &global_work_size, &local_work_size, NULL);
ezcl_device_memory_swap(&dev_border_cell, &dev_border_cell_new);
ezcl_device_memory_delete(dev_border_cell_new);
int group_size = (int)(global_work_size/local_work_size);
ezcl_set_kernel_arg(kernel_finish_scan, 0, sizeof(cl_int), (void *)&group_size);
ezcl_set_kernel_arg(kernel_finish_scan, 1, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_finish_scan, 2, sizeof(cl_mem), (void *)&dev_nbsize);
ezcl_set_kernel_arg(kernel_finish_scan, 3, local_work_size*sizeof(cl_int), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_finish_scan, 1, NULL, &local_work_size, &local_work_size, NULL);
int nbsize_local;
ezcl_enqueue_read_buffer(command_queue, dev_nbsize, CL_TRUE, 0, 1*sizeof(cl_int), &nbsize_local, NULL);
ezcl_device_memory_delete(dev_nbsize);
//printf("%d: border cell size is %d global is %ld\n",mype,nbsize_local,nbsize_global);
vector<int> border_cell_num(nbsize_local);
vector<int> border_cell_i(nbsize_local);
vector<int> border_cell_j(nbsize_local);
vector<int> border_cell_level(nbsize_local);
// allocate new border memory
size_t nbsize_long = nbsize_local;
cl_mem dev_border_cell_i = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_i"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_border_cell_j = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_j"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_border_cell_level = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_level"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_border_cell_num = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_num"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
ezcl_set_kernel_arg(kernel_get_border_data, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_get_border_data, 1, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_get_border_data, 2, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_get_border_data, 3, sizeof(cl_mem), (void *)&dev_border_cell);
ezcl_set_kernel_arg(kernel_get_border_data, 4, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_get_border_data, 5, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_get_border_data, 6, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_get_border_data, 7, sizeof(cl_mem), (void *)&dev_border_cell_i);
ezcl_set_kernel_arg(kernel_get_border_data, 8, sizeof(cl_mem), (void *)&dev_border_cell_j);
ezcl_set_kernel_arg(kernel_get_border_data, 9, sizeof(cl_mem), (void *)&dev_border_cell_level);
ezcl_set_kernel_arg(kernel_get_border_data, 10, sizeof(cl_mem), (void *)&dev_border_cell_num);
ezcl_set_kernel_arg(kernel_get_border_data, 11, local_work_size*sizeof(cl_uint), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_get_border_data, 1, NULL, &global_work_size, &local_work_size, NULL);
ezcl_device_memory_delete(dev_ioffset);
ezcl_device_memory_delete(dev_border_cell);
// read gpu border cell data
ezcl_enqueue_read_buffer(command_queue, dev_border_cell_i, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_i[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_border_cell_j, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_j[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_border_cell_level, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_level[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_border_cell_num, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_num[0], NULL);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_FIND_BOUNDARY] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
// Allocate push database
int **send_database = (int**)malloc(num_comm_partners*sizeof(int *));
for (int ip = 0; ip < num_comm_partners; ip++){
send_database[ip] = (int *)malloc(nbsize_local*sizeof(int));
}
// Compute the overlap between processor bounding boxes and set up push database
vector<int> send_buffer_count(num_comm_partners);
for (int ip = 0; ip < num_comm_partners; ip++){
int icount = 0;
for (int ib = 0; ib <nbsize_local; ib++){
int lev = border_cell_level[ib];
int levmult = IPOW2(levmx-lev);
if (border_cell_i[ib]*levmult >= iminsize_global[comm_partner[ip]] &&
border_cell_i[ib]*levmult <= imaxsize_global[comm_partner[ip]] &&
border_cell_j[ib]*levmult >= jminsize_global[comm_partner[ip]] &&
border_cell_j[ib]*levmult <= jmaxsize_global[comm_partner[ip]] ) {
send_database[ip][icount] = ib;
icount++;
}
}
send_buffer_count[ip]=icount;
}
// Initialize L7_Push_Setup with num_comm_partners, comm_partner, send_database and
// send_buffer_count. L7_Push_Setup will copy data and determine recv_buffer_counts.
// It will return receive_count_total for use in allocations
int receive_count_total;
int i_push_handle = 0;
L7_Push_Setup(num_comm_partners, &comm_partner[0], &send_buffer_count[0],
send_database, &receive_count_total, &i_push_handle);
if (DEBUG) {
fprintf(fp,"DEBUG num_comm_partners %d\n",num_comm_partners);
for (int ip = 0; ip < num_comm_partners; ip++){
fprintf(fp,"DEBUG comm partner is %d data count is %d\n",comm_partner[ip],send_buffer_count[ip]);
for (int ic = 0; ic < send_buffer_count[ip]; ic++){
int ib = send_database[ip][ic];
fprintf(fp,"DEBUG \t index %d cell number %d i %d j %d level %d\n",ib,border_cell_num[ib],
border_cell_i[ib],border_cell_j[ib],border_cell_level[ib]);
}
}
}
// Can now free the send database. Other arrays are vectors and will automatically
// deallocate
for (int ip = 0; ip < num_comm_partners; ip++){
free(send_database[ip]);
}
free(send_database);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_PUSH_SETUP] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
// Push the data needed to the adjacent processors
int *border_cell_num_local = (int *)malloc(receive_count_total*sizeof(int));
int *border_cell_i_local = (int *)malloc(receive_count_total*sizeof(int));
int *border_cell_j_local = (int *)malloc(receive_count_total*sizeof(int));
int *border_cell_level_local = (int *)malloc(receive_count_total*sizeof(int));
L7_Push_Update(&border_cell_num[0], border_cell_num_local, i_push_handle);
L7_Push_Update(&border_cell_i[0], border_cell_i_local, i_push_handle);
L7_Push_Update(&border_cell_j[0], border_cell_j_local, i_push_handle);
L7_Push_Update(&border_cell_level[0], border_cell_level_local, i_push_handle);
L7_Push_Free(&i_push_handle);
ezcl_device_memory_delete(dev_border_cell_i);
ezcl_device_memory_delete(dev_border_cell_j);
ezcl_device_memory_delete(dev_border_cell_level);
ezcl_device_memory_delete(dev_border_cell_num);
nbsize_local = receive_count_total;
if (DEBUG) {
for (int ic = 0; ic < nbsize_local; ic++) {
fprintf(fp,"%d: Local Border cell %d is %d i %d j %d level %d\n",mype,ic,border_cell_num_local[ic],
border_cell_i_local[ic],border_cell_j_local[ic],border_cell_level_local[ic]);
}
}
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_PUSH_BOUNDARY] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
nbsize_long = nbsize_local;
dev_border_cell_num = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_num"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_border_cell_i = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_i"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_border_cell_j = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_j"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
dev_border_cell_level = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_level"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_border_cell_needed = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_needed"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_border_cell_needed_out = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_needed_out"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
ezcl_enqueue_write_buffer(command_queue, dev_border_cell_num, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_num_local[0], NULL);
ezcl_enqueue_write_buffer(command_queue, dev_border_cell_i, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_i_local[0], NULL);
ezcl_enqueue_write_buffer(command_queue, dev_border_cell_j, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_j_local[0], NULL);
ezcl_enqueue_write_buffer(command_queue, dev_border_cell_level, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_level_local[0], NULL);
//ezcl_enqueue_write_buffer(command_queue, dev_border_cell_needed, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_needed_local[0], NULL);
free(border_cell_i_local);
free(border_cell_j_local);
free(border_cell_level_local);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_LOCAL_LIST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG) {
vector<int> hash_tmp(hashsize);
ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
cl_mem dev_hash_header_check = gpu_get_hash_header();
vector<ulong> hash_header_check(hash_header_size);
ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
int gpu_hash_method = (int)hash_header_check[0];
ulong gpu_hash_table_size = hash_header_check[1];
ulong gpu_AA = hash_header_check[2];
ulong gpu_BB = hash_header_check[3];
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
fprintf(fp,"\n HASH numbering before layer 1\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
}
size_t nb_local_work_size = 128;
size_t nb_global_work_size = ((nbsize_local + nb_local_work_size - 1) /nb_local_work_size) * nb_local_work_size;
ezcl_set_kernel_arg(kernel_calc_layer1, 0, sizeof(cl_int), (void *)&nbsize_local);
ezcl_set_kernel_arg(kernel_calc_layer1, 1, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_layer1, 2, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_calc_layer1, 3, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_calc_layer1, 4, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_calc_layer1, 5, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_calc_layer1, 6, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_calc_layer1, 7, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_calc_layer1, 8, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_calc_layer1, 9, sizeof(cl_mem), (void *)&dev_border_cell_i);
ezcl_set_kernel_arg(kernel_calc_layer1, 10, sizeof(cl_mem), (void *)&dev_border_cell_j);
ezcl_set_kernel_arg(kernel_calc_layer1, 11, sizeof(cl_mem), (void *)&dev_border_cell_level);
ezcl_set_kernel_arg(kernel_calc_layer1, 12, sizeof(cl_mem), (void *)&dev_border_cell_needed);
ezcl_set_kernel_arg(kernel_calc_layer1, 13, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_calc_layer1, 14, sizeof(cl_mem), (void *)&dev_hash);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer1, 1, NULL, &nb_global_work_size, &nb_local_work_size, NULL);
if (DEBUG){
vector<int> border_cell_needed_local(nbsize_local);
ezcl_enqueue_read_buffer(command_queue, dev_border_cell_needed, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_needed_local[0], NULL);
for(int ic=0; ic<nbsize_local; ic++){
if (border_cell_needed_local[ic] == 0) continue;
fprintf(fp,"%d: First set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
}
}
cl_event calc_layer1_sethash_event;
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 0, sizeof(cl_int), (void *)&nbsize_local);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 1, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 2, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 3, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 4, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 5, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 6, sizeof(cl_mem), (void *)&dev_border_cell_i);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 7, sizeof(cl_mem), (void *)&dev_border_cell_j);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 8, sizeof(cl_mem), (void *)&dev_border_cell_level);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 9, sizeof(cl_mem), (void *)&dev_border_cell_needed);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 10, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 11, sizeof(cl_mem), (void *)&dev_hash);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer1_sethash, 1, NULL, &nb_global_work_size, &nb_local_work_size, &calc_layer1_sethash_event);
ezcl_wait_for_events(1, &calc_layer1_sethash_event);
ezcl_event_release(calc_layer1_sethash_event);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_LAYER1] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG) {
print_dev_local();
vector<int> hash_tmp(hashsize);
ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
cl_mem dev_hash_header_check = gpu_get_hash_header();
vector<ulong> hash_header_check(hash_header_size);
ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
int gpu_hash_method = (int)hash_header_check[0];
ulong gpu_hash_table_size = hash_header_check[1];
ulong gpu_AA = hash_header_check[2];
ulong gpu_BB = hash_header_check[3];
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
fprintf(fp,"\n HASH numbering for 1 layer\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
}
group_size = (int)(nb_global_work_size/nb_local_work_size);
cl_mem dev_nbpacked = ezcl_malloc(NULL, const_cast<char *>("dev_nbpacked"), &one, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
size_t group_size_long = group_size;
dev_ioffset = ezcl_malloc(NULL, const_cast<char *>("dev_ioffset"), &group_size_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
ezcl_set_kernel_arg(kernel_calc_layer2, 0, sizeof(cl_int), (void *)&nbsize_local);
ezcl_set_kernel_arg(kernel_calc_layer2, 1, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_layer2, 2, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_calc_layer2, 3, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_calc_layer2, 4, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_calc_layer2, 5, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_calc_layer2, 6, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_calc_layer2, 7, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_calc_layer2, 8, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_calc_layer2, 9, sizeof(cl_mem), (void *)&dev_border_cell_i);
ezcl_set_kernel_arg(kernel_calc_layer2, 10, sizeof(cl_mem), (void *)&dev_border_cell_j);
ezcl_set_kernel_arg(kernel_calc_layer2, 11, sizeof(cl_mem), (void *)&dev_border_cell_level);
ezcl_set_kernel_arg(kernel_calc_layer2, 12, sizeof(cl_mem), (void *)&dev_border_cell_needed);
ezcl_set_kernel_arg(kernel_calc_layer2, 13, sizeof(cl_mem), (void *)&dev_border_cell_needed_out);
ezcl_set_kernel_arg(kernel_calc_layer2, 14, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_calc_layer2, 15, sizeof(cl_mem), (void *)&dev_hash);
ezcl_set_kernel_arg(kernel_calc_layer2, 16, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_calc_layer2, 17, sizeof(cl_mem), (void *)&dev_nbpacked);
ezcl_set_kernel_arg(kernel_calc_layer2, 18, nb_local_work_size*sizeof(cl_mem), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer2, 1, NULL, &nb_global_work_size, &nb_local_work_size, NULL);
if (DEBUG){
vector<int> border_cell_needed_local(nbsize_local);
ezcl_enqueue_read_buffer(command_queue, dev_border_cell_needed_out, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_needed_local[0], NULL);
for(int ic=0; ic<nbsize_local; ic++){
if (border_cell_needed_local[ic] <= 0) continue;
if (border_cell_needed_local[ic] < 0x0016) fprintf(fp,"%d: First set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
if (border_cell_needed_local[ic] >= 0x0016) fprintf(fp,"%d: Second set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
}
}
free(border_cell_num_local);
ezcl_device_memory_delete(dev_border_cell_needed);
ezcl_set_kernel_arg(kernel_finish_scan, 0, sizeof(cl_int), (void *)&group_size);
ezcl_set_kernel_arg(kernel_finish_scan, 1, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_finish_scan, 2, sizeof(cl_mem), (void *)&dev_nbpacked);
ezcl_set_kernel_arg(kernel_finish_scan, 3, nb_local_work_size*sizeof(cl_int), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_finish_scan, 1, NULL, &nb_local_work_size, &nb_local_work_size, NULL);
int nbpacked;
ezcl_enqueue_read_buffer(command_queue, dev_nbpacked, CL_TRUE, 0, 1*sizeof(cl_int), &nbpacked, NULL);
ezcl_device_memory_delete(dev_nbpacked);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_LAYER2] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
nbsize_long = nbsize_local;
cl_mem dev_border_cell_i_new = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_i_new"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_border_cell_j_new = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_j_new"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_border_cell_level_new = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_level_new"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_indices_needed = ezcl_malloc(NULL, const_cast<char *>("dev_indices_needed"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_event get_border_data2_event;
ezcl_set_kernel_arg(kernel_get_border_data2, 0, sizeof(cl_int), (void *)&nbsize_local);
ezcl_set_kernel_arg(kernel_get_border_data2, 1, sizeof(cl_mem), (void *)&dev_ioffset);
ezcl_set_kernel_arg(kernel_get_border_data2, 2, sizeof(cl_mem), (void *)&dev_border_cell_needed_out);
ezcl_set_kernel_arg(kernel_get_border_data2, 3, sizeof(cl_mem), (void *)&dev_border_cell_i);
ezcl_set_kernel_arg(kernel_get_border_data2, 4, sizeof(cl_mem), (void *)&dev_border_cell_j);
ezcl_set_kernel_arg(kernel_get_border_data2, 5, sizeof(cl_mem), (void *)&dev_border_cell_level);
ezcl_set_kernel_arg(kernel_get_border_data2, 6, sizeof(cl_mem), (void *)&dev_border_cell_num);
ezcl_set_kernel_arg(kernel_get_border_data2, 7, sizeof(cl_mem), (void *)&dev_border_cell_i_new);
ezcl_set_kernel_arg(kernel_get_border_data2, 8, sizeof(cl_mem), (void *)&dev_border_cell_j_new);
ezcl_set_kernel_arg(kernel_get_border_data2, 9, sizeof(cl_mem), (void *)&dev_border_cell_level_new);
ezcl_set_kernel_arg(kernel_get_border_data2, 10, sizeof(cl_mem), (void *)&dev_indices_needed);
ezcl_set_kernel_arg(kernel_get_border_data2, 11, local_work_size*sizeof(cl_uint), NULL);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_get_border_data2, 1, NULL, &nb_global_work_size, &nb_local_work_size, &get_border_data2_event);
ezcl_device_memory_delete(dev_border_cell_num);
ezcl_device_memory_swap(&dev_border_cell_i, &dev_border_cell_i_new);
ezcl_device_memory_swap(&dev_border_cell_j, &dev_border_cell_j_new);
ezcl_device_memory_swap(&dev_border_cell_level, &dev_border_cell_level_new);
size_t nbp_local_work_size = 128;
size_t nbp_global_work_size = ((nbpacked + nbp_local_work_size - 1) /nbp_local_work_size) * nbp_local_work_size;
cl_event calc_layer2_sethash_event;
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 0, sizeof(cl_int), (void *)&nbpacked);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 1, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 2, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 3, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 4, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 5, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 6, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 7, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 8, sizeof(cl_mem), (void *)&dev_levibeg);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 9, sizeof(cl_mem), (void *)&dev_leviend);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 10, sizeof(cl_mem), (void *)&dev_levjbeg);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 11, sizeof(cl_mem), (void *)&dev_levjend);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 12, sizeof(cl_mem), (void *)&dev_border_cell_i);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 13, sizeof(cl_mem), (void *)&dev_border_cell_j);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 14, sizeof(cl_mem), (void *)&dev_border_cell_level);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 15, sizeof(cl_mem), (void *)&dev_indices_needed);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 16, sizeof(cl_mem), (void *)&dev_border_cell_needed_out);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 17, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 18, sizeof(cl_mem), (void *)&dev_hash);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer2_sethash, 1, NULL, &nbp_global_work_size, &nbp_local_work_size, &calc_layer2_sethash_event);
ezcl_wait_for_events(1, &calc_layer2_sethash_event);
ezcl_event_release(calc_layer2_sethash_event);
ezcl_device_memory_delete(dev_ioffset);
ezcl_wait_for_events(1, &get_border_data2_event);
ezcl_event_release(get_border_data2_event);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_LAYER_LIST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
vector<int> indices_needed(nbpacked);
// read gpu border cell data
ezcl_enqueue_read_buffer(command_queue, dev_indices_needed, CL_TRUE, 0, nbpacked*sizeof(cl_int), &indices_needed[0], NULL);
ezcl_device_memory_delete(dev_border_cell_i_new);
ezcl_device_memory_delete(dev_border_cell_j_new);
ezcl_device_memory_delete(dev_border_cell_level_new);
if (DEBUG) {
print_dev_local();
vector<int> hash_tmp(hashsize);
ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
cl_mem dev_hash_header_check = gpu_get_hash_header();
vector<ulong> hash_header_check(hash_header_size);
ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
int gpu_hash_method = (int)hash_header_check[0];
ulong gpu_hash_table_size = hash_header_check[1];
ulong gpu_AA = hash_header_check[2];
ulong gpu_BB = hash_header_check[3];
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
fprintf(fp,"\n HASH numbering for 2 layer\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fflush(fp);
}
ezcl_device_memory_delete(dev_border_cell_needed_out);
int nghost = nbpacked;
ncells_ghost = ncells + nghost;
//if (mype == 1) printf("%d: DEBUG before expanding memory ncells %ld ncells_ghost %ld capacity %ld\n",mype,ncells,ncells_ghost,ezcl_get_device_mem_capacity(dev_i));
if (ezcl_get_device_mem_capacity(dev_celltype) < ncells_ghost ||
ezcl_get_device_mem_capacity(dev_i) < ncells_ghost ||
ezcl_get_device_mem_capacity(dev_j) < ncells_ghost ||
ezcl_get_device_mem_capacity(dev_level) < ncells_ghost ||
ezcl_get_device_mem_capacity(dev_nlft) < ncells_ghost ||
ezcl_get_device_mem_capacity(dev_nrht) < ncells_ghost ||
ezcl_get_device_mem_capacity(dev_nbot) < ncells_ghost ||
ezcl_get_device_mem_capacity(dev_ntop) < ncells_ghost ) {
//if (mype == 0) printf("%d: DEBUG expanding memory ncells %ld ncells_ghost %ld capacity %ld\n",mype,ncells,ncells_ghost,ezcl_get_device_mem_capacity(dev_i));
//printf("%d: DEBUG expanding memory ncells %ld ncells_ghost %ld capacity %ld\n",mype,ncells,ncells_ghost,ezcl_get_device_mem_capacity(dev_i));
mem_factor = (float)(ncells_ghost/ncells);
cl_mem dev_celltype_old = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_i_old = ezcl_malloc(NULL, const_cast<char *>("dev_i_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_j_old = ezcl_malloc(NULL, const_cast<char *>("dev_j_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_level_old = ezcl_malloc(NULL, const_cast<char *>("dev_level_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_nlft_old = ezcl_malloc(NULL, const_cast<char *>("dev_nlft_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_nrht_old = ezcl_malloc(NULL, const_cast<char *>("dev_nrht_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_nbot_old = ezcl_malloc(NULL, const_cast<char *>("dev_nbot_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
cl_mem dev_ntop_old = ezcl_malloc(NULL, const_cast<char *>("dev_ntop_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
ezcl_device_memory_swap(&dev_celltype_old, &dev_celltype);
ezcl_device_memory_swap(&dev_i_old, &dev_i );
ezcl_device_memory_swap(&dev_j_old, &dev_j );
ezcl_device_memory_swap(&dev_level_old, &dev_level );
ezcl_device_memory_swap(&dev_nlft_old, &dev_nlft );
ezcl_device_memory_swap(&dev_nrht_old, &dev_nrht );
ezcl_device_memory_swap(&dev_nbot_old, &dev_nbot );
ezcl_device_memory_swap(&dev_ntop_old, &dev_ntop );
cl_event copy_mesh_data_event;
ezcl_set_kernel_arg(kernel_copy_mesh_data, 0, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 1, sizeof(cl_mem), (void *)&dev_celltype_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 2, sizeof(cl_mem), (void *)&dev_celltype);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 3, sizeof(cl_mem), (void *)&dev_i_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 4, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 5, sizeof(cl_mem), (void *)&dev_j_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 6, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 7, sizeof(cl_mem), (void *)&dev_level_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 8, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 9, sizeof(cl_mem), (void *)&dev_nlft_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 10, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 11, sizeof(cl_mem), (void *)&dev_nrht_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 12, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 13, sizeof(cl_mem), (void *)&dev_nbot_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 14, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 15, sizeof(cl_mem), (void *)&dev_ntop_old);
ezcl_set_kernel_arg(kernel_copy_mesh_data, 16, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_copy_mesh_data, 1, NULL, &global_work_size, &local_work_size, ©_mesh_data_event);
ezcl_device_memory_delete(dev_celltype_old);
ezcl_device_memory_delete(dev_i_old);
ezcl_device_memory_delete(dev_j_old);
ezcl_device_memory_delete(dev_level_old);
ezcl_device_memory_delete(dev_nlft_old);
ezcl_device_memory_delete(dev_nrht_old);
ezcl_device_memory_delete(dev_nbot_old);
ezcl_device_memory_delete(dev_ntop_old);
ezcl_wait_for_events(1, ©_mesh_data_event);
ezcl_event_release(copy_mesh_data_event);
}
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_COPY_MESH_DATA] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
nb_global_work_size = ((nbpacked + nb_local_work_size - 1) /nb_local_work_size) * nb_local_work_size;
#ifdef BOUNDS_CHECK
if (ezcl_get_device_mem_nelements(dev_i) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_j) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_level) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_celltype) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells_ghost ){
printf("DEBUG size issue at %d\n",__LINE__);
}
if (ezcl_get_device_mem_nelements(dev_border_cell_i) < nbpacked ||
ezcl_get_device_mem_nelements(dev_border_cell_j) < nbpacked ||
ezcl_get_device_mem_nelements(dev_border_cell_level) < nbpacked ){
printf("DEBUG size issue at %d\n",__LINE__);
}
#endif
cl_event fill_mesh_ghost_event;
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 0, sizeof(cl_int), (void *)&nbpacked);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 1, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 2, sizeof(cl_mem), (void *)&dev_levibeg);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 3, sizeof(cl_mem), (void *)&dev_leviend);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 4, sizeof(cl_mem), (void *)&dev_levjbeg);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 5, sizeof(cl_mem), (void *)&dev_levjend);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 6, sizeof(cl_mem), (void *)&dev_border_cell_i);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 7, sizeof(cl_mem), (void *)&dev_border_cell_j);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 8, sizeof(cl_mem), (void *)&dev_border_cell_level);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 9, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 10, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 11, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 12, sizeof(cl_mem), (void *)&dev_celltype);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 13, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 14, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 15, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 16, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_fill_mesh_ghost, 1, NULL, &nb_global_work_size, &nb_local_work_size, &fill_mesh_ghost_event);
ezcl_wait_for_events(1, &fill_mesh_ghost_event);
ezcl_event_release(fill_mesh_ghost_event);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_FILL_MESH_GHOST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG){
fprintf(fp,"After copying i,j, level to ghost cells\n");
print_dev_local();
}
ezcl_device_memory_delete(dev_border_cell_i);
ezcl_device_memory_delete(dev_border_cell_j);
ezcl_device_memory_delete(dev_border_cell_level);
size_t ghost_local_work_size = 128;
size_t ghost_global_work_size = ((ncells_ghost + ghost_local_work_size - 1) /ghost_local_work_size) * ghost_local_work_size;
cl_event fill_neighbor_ghost_event;
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 0, sizeof(cl_int), (void *)&ncells_ghost);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 1, sizeof(cl_int), (void *)&levmx);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 2, sizeof(cl_int), (void *)&imax);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 3, sizeof(cl_int), (void *)&jmax);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 4, sizeof(cl_mem), (void *)&dev_sizes);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 5, sizeof(cl_mem), (void *)&dev_levtable);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 6, sizeof(cl_mem), (void *)&dev_i);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 7, sizeof(cl_mem), (void *)&dev_j);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 8, sizeof(cl_mem), (void *)&dev_level);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 9, sizeof(cl_mem), (void *)&dev_hash_header);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 10, sizeof(cl_mem), (void *)&dev_hash);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 11, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 12, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 13, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 14, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_fill_neighbor_ghost, 1, NULL, &ghost_global_work_size, &ghost_local_work_size, &fill_neighbor_ghost_event);
ezcl_wait_for_events(1, &fill_neighbor_ghost_event);
ezcl_event_release(fill_neighbor_ghost_event);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_FILL_NEIGH_GHOST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG){
fprintf(fp,"After setting neighbors through ghost cells\n");
print_dev_local();
}
#ifdef BOUNDS_CHECK
if (ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells_ghost ){
printf("%d: Warning sizes for set_corner_neighbor not right ncells ghost %d nlft size %d\n",mype,ncells_ghost,ezcl_get_device_mem_nelements(dev_nlft));
}
#endif
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_SET_CORNER_NEIGH] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
if (DEBUG){
fprintf(fp,"After setting corner neighbors\n");
print_dev_local();
}
#ifdef BOUNDS_CHECK
if (ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells_ghost ||
ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells_ghost ){
printf("%d: Warning sizes for adjust neighbors not right ncells ghost %d nlft size %d\n",mype,ncells_ghost,ezcl_get_device_mem_nelements(dev_nlft));
}
if (ezcl_get_device_mem_nelements(dev_indices_needed) < (int)(ncells_ghost-ncells) ){
printf("%d: Warning indices size wrong nghost %d size indices_needed\n",mype,ncells_ghost-ncells,ezcl_get_device_mem_nelements(dev_indices_needed));
}
#endif
cl_event adjust_neighbors_local_event;
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 0, sizeof(cl_int), (void *)&ncells_ghost);
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 1, sizeof(cl_int), (void *)&ncells);
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 2, sizeof(cl_int), (void *)&noffset);
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 3, sizeof(cl_mem), (void *)&dev_indices_needed);
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 4, sizeof(cl_mem), (void *)&dev_nlft);
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 5, sizeof(cl_mem), (void *)&dev_nrht);
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 6, sizeof(cl_mem), (void *)&dev_nbot);
ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 7, sizeof(cl_mem), (void *)&dev_ntop);
ezcl_enqueue_ndrange_kernel(command_queue, kernel_adjust_neighbors_local, 1, NULL, &ghost_global_work_size, &ghost_local_work_size, &adjust_neighbors_local_event);
ezcl_device_memory_delete(dev_indices_needed);
if (DEBUG){
fprintf(fp,"After adjusting neighbors to local indices\n");
print_dev_local();
}
ezcl_wait_for_events(1, &adjust_neighbors_local_event);
ezcl_event_release(adjust_neighbors_local_event);
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_NEIGH_ADJUST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
cpu_timer_start(&tstart_lev2);
}
offtile_ratio_local = (offtile_ratio_local*(double)offtile_local_count) + ((double)nghost / (double)ncells);
offtile_local_count++;
offtile_ratio_local /= offtile_local_count;
if (cell_handle) L7_Free(&cell_handle);
cell_handle=0;
if (DEBUG){
fprintf(fp,"%d: SETUP ncells %ld noffset %d nghost %d\n",mype,ncells,noffset,nghost);
for (int ic=0; ic<nghost; ic++){
fprintf(fp,"%d: indices needed ic %d index %d\n",mype,ic,indices_needed[ic]);
}
}
L7_Dev_Setup(0, noffset, ncells, &indices_needed[0], nghost, &cell_handle);
#ifdef BOUNDS_CHECK
{
vector<int> nlft_tmp(ncells_ghost);
vector<int> nrht_tmp(ncells_ghost);
vector<int> nbot_tmp(ncells_ghost);
vector<int> ntop_tmp(ncells_ghost);
vector<int> level_tmp(ncells_ghost);
vector<real_t> H_tmp(ncells_ghost);
ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &level_tmp[0], NULL);
for (uint ic=0; ic<ncells; ic++){
int nl = nlft_tmp[ic];
if (nl<0 || nl>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nlft %d\n",mype,__LINE__,ic,nl);
if (level_tmp[nl] > level_tmp[ic]){
int ntl = ntop_tmp[nl];
if (ntl<0 || ntl>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d global %d nlft %d ntop of nlft %d\n",mype,__LINE__,ic,ic+noffset,nl,ntl);
}
int nr = nrht_tmp[ic];
if (nr<0 || nr>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht %d\n",mype,__LINE__,ic,nr);
if (level_tmp[nr] > level_tmp[ic]){
int ntr = ntop_tmp[nr];
if (ntr<0 || ntr>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d ntop of nrht %d\n",mype,__LINE__,ic,ntr);
}
int nb = nbot_tmp[ic];
if (nb<0 || nb>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nbot %d\n",mype,__LINE__,ic,nb);
if (level_tmp[nb] > level_tmp[ic]){
int nrb = nrht_tmp[nb];
if (nrb<0 || nrb>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht of nbot %d\n",mype,__LINE__,ic,nrb);
}
int nt = ntop_tmp[ic];
if (nt<0 || nt>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d global %d ntop %d ncells %ld ncells_ghost %ld\n",mype,__LINE__,ic,ic+noffset,nt,ncells,ncells_ghost);
if (level_tmp[nt] > level_tmp[ic]){
int nrt = nrht_tmp[nt];
if (nrt<0 || nrt>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht of ntop %d\n",mype,__LINE__,ic,nrt);
}
}
}
#endif
if (TIMING_LEVEL >= 2) {
gpu_timers[MESH_TIMER_SETUP_COMM] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
}
if (DEBUG) {
print_dev_local();
vector<int> hash_tmp(hashsize);
ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_FALSE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
cl_mem dev_hash_header_check = gpu_get_hash_header();
vector<ulong> hash_header_check(hash_header_size);
ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
int gpu_hash_method = (int)hash_header_check[0];
ulong gpu_hash_table_size = hash_header_check[1];
ulong gpu_AA = hash_header_check[2];
ulong gpu_BB = hash_header_check[3];
vector<int> nlft_tmp(ncells_ghost);
vector<int> nrht_tmp(ncells_ghost);
vector<int> nbot_tmp(ncells_ghost);
vector<int> ntop_tmp(ncells_ghost);
ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
int jmaxglobal = (jmax+1)*IPOW2(levmx);
int imaxglobal = (imax+1)*IPOW2(levmx);
fprintf(fp,"\n HASH numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
if (ii >= iminsize && ii < imaxsize) {
fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nlft numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
fprintf(fp,"%5d",nlft_tmp[hashval]);
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nrht numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
fprintf(fp,"%5d",nrht_tmp[hashval]);
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n nbot numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
fprintf(fp,"%5d",nbot_tmp[hashval]);
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
fprintf(fp,"\n ntop numbering\n");
for (int jj = jmaxglobal-1; jj>=0; jj--){
fprintf(fp,"%2d: %4d:",mype,jj);
if (jj >= jminsize && jj < jmaxsize) {
for (int ii = 0; ii<imaxglobal; ii++){
int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
fprintf(fp,"%5d",ntop_tmp[hashval]);
} else {
fprintf(fp," ");
}
}
}
fprintf(fp,"\n");
}
fprintf(fp,"%2d: ",mype);
for (int ii = 0; ii<imaxglobal; ii++){
fprintf(fp,"%4d:",ii);
}
fprintf(fp,"\n");
}
if (DEBUG) {
print_dev_local();
vector<int> i_tmp(ncells_ghost);
vector<int> j_tmp(ncells_ghost);
vector<int> level_tmp(ncells_ghost);
vector<int> nlft_tmp(ncells_ghost);
vector<int> nrht_tmp(ncells_ghost);
vector<int> nbot_tmp(ncells_ghost);
vector<int> ntop_tmp(ncells_ghost);
ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &i_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &j_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &level_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
for (uint ic=0; ic<ncells; ic++){
fprintf(fp,"%d: before update ic %d i %d j %d lev %d nlft %d nrht %d nbot %d ntop %d\n",
mype,ic,i_tmp[ic],j_tmp[ic],level_tmp[ic],nlft_tmp[ic],nrht_tmp[ic],nbot_tmp[ic],ntop_tmp[ic]);
}
int ig=0;
for (uint ic=ncells; ic<ncells_ghost; ic++, ig++){
fprintf(fp,"%d: after update ic %d off %d i %d j %d lev %d nlft %d nrht %d nbot %d ntop %d\n",
mype,ic,indices_needed[ig],i_tmp[ic],j_tmp[ic],level_tmp[ic],nlft_tmp[ic],nrht_tmp[ic],nbot_tmp[ic],ntop_tmp[ic]);
}
}
}
#endif
ezcl_device_memory_delete(dev_sizes);
ezcl_device_memory_delete(dev_check);
gpu_compact_hash_delete(dev_hash, dev_hash_header);
gpu_timers[MESH_TIMER_CALC_NEIGHBORS] += (long)(cpu_timer_stop(tstart_cpu) * 1.0e9);
}