in src/main.rs [529:543]
/// Moves the two input vectors onto `gpu` and allocates the output buffers.
///
/// `a` and `b` are handed to `htod_copy` in that order. Afterwards
/// `num_out_slices` separate buffers are requested through `alloc_zeros`,
/// each holding `SIZE * SIZE` elements of `T`.
///
/// Returns the tuple `(a_gpu, b_gpu, out_slices)`, where `out_slices` has
/// exactly `num_out_slices` entries (an empty vector when that count is zero).
///
/// # Errors
///
/// Returns the first error reported by `htod_copy` or `alloc_zeros`; no
/// further allocations are attempted after a failure.
fn alloc_buffers<T: VariablePrecisionFloat>(
    gpu: Arc<CudaDevice>,
    a: Vec<T>,
    b: Vec<T>,
    num_out_slices: usize,
) -> anyhow::Result<AllocBufferTuple<T>> {
    let lhs_dev = gpu.htod_copy(a)?;
    let rhs_dev = gpu.htod_copy(b)?;

    // Collecting into `Result` stops at the first failed allocation, which
    // mirrors an early `?` return from inside an explicit loop.
    let outputs = (0..num_out_slices)
        .map(|_| gpu.alloc_zeros::<T>(SIZE * SIZE))
        .collect::<Result<Vec<_>, _>>()?;

    Ok((lhs_dev, rhs_dev, outputs))
}