#include #include #include "cuda.h" // to compile: // nvcc -O0 -o transpose transpose.cu -lm // // to run: // ./transpose 1024 // assume going forward 32x32 threads in each thread-block #define BDIM 32 // reference "copy" kernel __global__ void copy(int N, const float * __restrict__ A, float * __restrict__ AT){ int idx = threadIdx.x + blockDim.x*blockIdx.x; int idy = threadIdx.y + blockDim.y*blockIdx.y; // output if(idx N dim3 threadsPerBlock(BDIM,BDIM,1); dim3 blocks(Nblocks,Nblocks,1); copy <<< blocks,threadsPerBlock >>> (N,c_A,c_AT); transposeV1 <<< blocks, threadsPerBlock >>> (N, c_A, c_AT); transposeV2 <<< blocks, threadsPerBlock >>> (N, c_A, c_AT); transposeV3 <<< blocks, threadsPerBlock >>> (N, c_A, c_AT); cudaMemcpy(AT, c_AT, sz, cudaMemcpyDeviceToHost); // -------------------------------------------------------------------------------- cudaError_t err = cudaGetLastError(); if(err != cudaSuccess){ fprintf(stderr, "CUDA ERROR: %s\n", cudaGetErrorString(err)); } }