#include <stdio.h>
#include <stdlib.h>

#define p_Nthreads 128

// version 1: interleaved addressing using the % operator (slow)
__global__ void reduce(int N, float *x, float *xout){

  __shared__ float s_x[p_Nthreads];

  const int tid = threadIdx.x;
  const int i = blockIdx.x*blockDim.x + tid;

  // load smem
  s_x[tid] = 0;
  if (i < N){
    s_x[tid] = x[i];
  }
  __syncthreads();

  for (unsigned int s = 1; s < blockDim.x; s *= 2){
    // strided access: only tids divisible by 2*s do work
    // s = 1 -> tids [0, 2, 4, 6, ...]
    // s = 2 -> tids [0, 4, 8, 12, ...]
    if (tid % (2*s)==0){
      s_x[tid] += s_x[tid + s];
    }
    __syncthreads();
  }

  if (tid==0){
    xout[blockIdx.x] = s_x[0];
  }
}

// version 2: replace the expensive % operator with a computed index
__global__ void reduce1(int N, float *x, float *xout){

  __shared__ float s_x[p_Nthreads];

  const int tid = threadIdx.x;
  const int i = blockIdx.x*blockDim.x + tid;

  // load smem
  s_x[tid] = 0;
  if (i < N){
    s_x[tid] = x[i];
  }
  __syncthreads();

  for (unsigned int s = 1; s < blockDim.x; s *= 2){
    int index = 2*s*tid;
    if (index < blockDim.x){
      s_x[index] += s_x[index+s]; // strided indexing causes bank conflicts
    }
    __syncthreads();
  }

  if (tid==0){
    xout[blockIdx.x] = s_x[0];
  }
}

// version 3: sequential addressing
__global__ void reduce2(int N, float *x, float *xout){

  __shared__ float s_x[p_Nthreads];

  const int tid = threadIdx.x;
  const int i = blockIdx.x*blockDim.x + tid;

  // load smem
  s_x[tid] = 0;
  if (i < N){
    s_x[tid] = x[i];
  }
  __syncthreads();

  for (unsigned int s = blockDim.x/2; s > 0; s /= 2){
    if (tid < s){
      s_x[tid] += s_x[tid+s]; // contiguous indexing: fewer bank conflicts
    }
    __syncthreads();
  }

  if (tid==0){
    xout[blockIdx.x] = s_x[0];
  }
}

// version 4: use all threads by doing the first add during the global load;
// each block now covers 2*blockDim.x entries, so launch half as many blocks
__global__ void reduce3(int N, float *x, float *xout){

  __shared__ float s_x[p_Nthreads];

  const int tid = threadIdx.x;
  const int i = blockIdx.x*(2*blockDim.x) + tid;

  s_x[tid] = 0;
  if (i < N){
    // note: assumes N is a multiple of 2*blockDim.x; otherwise
    // x[i + blockDim.x] can read past the end of the array
    s_x[tid] = x[i] + x[i + blockDim.x];
  }
  __syncthreads();

  for (unsigned int s = blockDim.x/2; s > 0; s /= 2){
    if (tid < s){
      s_x[tid] += s_x[tid+s]; // no wasted threads on the first iteration
    }
    __syncthreads();
  }

  if (tid==0){
    xout[blockIdx.x] = s_x[0];
  }
}

// version 5: additionally unroll the last warp by hand
__global__ void reduce4(int N, float *x, float *xout){

  __shared__ volatile float s_x[p_Nthreads]; // volatile for in-warp smem mods

  const int tid = threadIdx.x;
  const int i = blockIdx.x*(2*blockDim.x) + tid;

  s_x[tid] = 0;
  if (i < N){
    s_x[tid] = x[i] + x[i + blockDim.x];
  }
  __syncthreads();

  // loop down to s = 64; the final warp (s <= 32) is unrolled below
  for (unsigned int s = blockDim.x/2; s > 32; s /= 2){
    if (tid < s){
      s_x[tid] += s_x[tid+s];
    }
    __syncthreads();
  }

  // manually reduce within a warp, with no __syncthreads() between steps
  // (warp-synchronous: relies on pre-Volta lockstep execution; on Volta
  // and newer this pattern needs __syncwarp() between the steps)
  if (tid < 32){
    s_x[tid] += s_x[tid + 32];
    s_x[tid] += s_x[tid + 16];
    s_x[tid] += s_x[tid + 8];
    s_x[tid] += s_x[tid + 4];
    s_x[tid] += s_x[tid + 2];
    s_x[tid] += s_x[tid + 1];
  }

  if (tid==0){
    xout[blockIdx.x] = s_x[0];
  }
}
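
// Aside (not part of the original progression): on CUDA 9 and newer, the
// volatile warp-synchronous trick in reduce4 is typically replaced by warp
// shuffles, which finish the reduction in registers with no shared memory
// or volatile at all. A minimal sketch; the helper name warpReduceSum is
// our own, not something used by the kernels above:
__inline__ __device__ float warpReduceSum(float val){
  // each step adds the value held by the lane `offset` lanes above
  for (int offset = 16; offset > 0; offset /= 2){
    val += __shfl_down_sync(0xffffffffu, val, offset);
  }
  return val; // lane 0 ends up holding the warp's sum
}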
int main(void){

  int N = 1048576;
  float *x = (float*)malloc(N*sizeof(float));
  for (int i = 0; i < N; i++){
    x[i] = 1;
  }

  // alloc GPU mem and copy over
  float *x_c, *xout_c, *xouthalf_c;
  cudaMalloc(&x_c, N*sizeof(float));
  cudaMemcpy(x_c, x, N*sizeof(float), cudaMemcpyHostToDevice);

  // run kernels, copy results back to the CPU
  int Nthreads = p_Nthreads; // should be a multiple of 32; at most 1024
  int Nblocks = (N+Nthreads-1)/Nthreads;
  dim3 threadsPerBlock(Nthreads,1,1);
  dim3 blocks(Nblocks,1,1);

  float *xout = (float*)malloc(Nblocks*sizeof(float));
  cudaMalloc(&xout_c, Nblocks*sizeof(float));

  // version 1: slow
  reduce <<< blocks, threadsPerBlock >>> (N, x_c, xout_c);

  // version 2: remove the % operator
  reduce1 <<< blocks, threadsPerBlock >>> (N, x_c, xout_c);

  // version 3: reduce bank conflicts
  reduce2 <<< blocks, threadsPerBlock >>> (N, x_c, xout_c);

  cudaMemcpy(xout, xout_c, Nblocks*sizeof(float), cudaMemcpyDeviceToHost);

  // check result (each launch overwrites xout_c, so this checks reduce2)
  int reduction = 0;
  for (int i = 0; i < Nblocks; i++){
    reduction += xout[i];
  }
  printf("error = %d\n", reduction-N);

  // --- the following versions use only half the number of blocks
  dim3 halfblocks(Nblocks/2,1,1);
  float *xouthalf = (float*)malloc(Nblocks/2*sizeof(float));
  cudaMalloc(&xouthalf_c, Nblocks/2*sizeof(float));

  // version 4: fewer idle threads
  reduce3 <<< halfblocks, threadsPerBlock >>> (N, x_c, xouthalf_c);
  cudaMemcpy(xouthalf, xouthalf_c, Nblocks/2*sizeof(float), cudaMemcpyDeviceToHost);

  // version 5: manually unrolled last warp
  reduce4 <<< halfblocks, threadsPerBlock >>> (N, x_c, xouthalf_c);
  cudaMemcpy(xouthalf, xouthalf_c, Nblocks/2*sizeof(float), cudaMemcpyDeviceToHost);

  // check result (this checks reduce4, which overwrote reduce3's output)
  reduction = 0;
  for (int i = 0; i < Nblocks/2; i++){
    reduction += xouthalf[i];
  }
  printf("error = %d\n", reduction-N);

  // clean up
  free(x); free(xout); free(xouthalf);
  cudaFree(x_c); cudaFree(xout_c); cudaFree(xouthalf_c);

  return 0;
}
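
// Build and run (assuming this file is saved as reduce.cu and nvcc is on
// the path):
//   nvcc reduce.cu -o reduce
//   ./reduce
// Both checks should print "error = 0".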