CUDA_support/cuda_sample.cu (32 lines of code) (raw):

// Check on [Windows: MSVC, Linux: gcc-7]
// completion: <<<>>> kernel calls, __global__ and other attributes
// highlighting: attributes, cuda***() calls, blockIdx/threadIdx

#include <cmath>
#include <cstdio>
#include <cstdlib>

// Evaluates a CUDA runtime call; if it does not return cudaSuccess, prints
// the source location and the runtime's error string to stderr and exits
// with EXIT_FAILURE. Without this, a failed call is silently ignored and the
// program goes on to print a meaningless result.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(status_));                 \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Single-precision "a * x + y": each thread updates one element,
// y[i] = a * x[i] + y[i].
//
// The element index is derived from blockIdx.x, blockDim.x and threadIdx.x
// only, so a 1D launch is expected. Threads whose index is >= n do nothing,
// which lets the grid be rounded up past n.
//
//   n - number of elements to process
//   a - scalar multiplier
//   x - input array, read by the kernel (at least n elements)
//   y - input/output array, updated in place (at least n elements)
__global__ void saxpy(int n, float a, float *x, float *y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];
}

// Fills x with 1.0f and y with 2.0f on the host, runs saxpy with a = 2.0f on
// the device, copies y back and prints the largest absolute deviation from
// the expected value 4.0f.
int main() {
    const int N = 1 << 20;
    const size_t bytes = N * sizeof(float);
    const int threadsPerBlock = 256;
    // Ceil-divide so the grid covers all N elements even when N is not a
    // multiple of the block size; the kernel's bounds check handles the tail.
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    float *x, *y, *d_x, *d_y;
    x = (float *) malloc(bytes);
    y = (float *) malloc(bytes);
    if (x == NULL || y == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(x);  // free(NULL) is a no-op, so this is safe for either pointer
        free(y);
        return EXIT_FAILURE;
    }

    CUDA_CHECK(cudaMalloc(&d_x, bytes));
    CUDA_CHECK(cudaMalloc(&d_y, bytes));

    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    CUDA_CHECK(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice));

    saxpy<<<blocks, threadsPerBlock>>>(N, 2.0f, d_x, d_y);
    // A kernel launch returns nothing; launch failures are only visible
    // through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // The copy back to the host is also where a fault raised while the kernel
    // was running would be reported.
    CUDA_CHECK(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost));

    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        // Explicit single-precision math functions instead of max()/abs().
        maxError = fmaxf(maxError, fabsf(y[i] - 4.0f)); // clang doesn't like math CPP-19160
    printf("Max error: %f\n", maxError);

    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));
    free(x);
    free(y);
    return 0;
}