// ================================================================================
// NOTE(review): this file reached review mangled by markup stripping — every
// "<...>" span (the system #include names, mxmV1's loop body, all of mxmV2 and
// mxmV3, and the first half of main) had been deleted and all newlines collapsed.
// The fragments that survived are kept verbatim below; the eaten regions are
// reconstructed to be consistent with them (column-major storage, BDIM x BDIM
// blocks, C verified against the all-ones matrix). Reconstructed sections are
// marked NOTE(review) — confirm them against version control.
// ================================================================================

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "cuda.h"

// to compile for a 3.5 capable device (like the titan in bodge):
// nvcc -arch=sm_35 -O3 -o mxm mxm.cu -lm
//
// to run a partial reduction on a vector of length 8192 :
// ./mxm 8192

// assume going forward 32x32 threads in each thread-block
#define BDIM 32

// Abort on any CUDA runtime error so failures are not silent; kernel-launch
// errors are caught by the cudaGetLastError() call after the launches.
#define CUDA_CHECK(call)                                                       \
  do {                                                                         \
    cudaError_t e_ = (call);                                                   \
    if (e_ != cudaSuccess) {                                                   \
      fprintf(stderr, "CUDA ERROR %s:%d: %s\n", __FILE__, __LINE__,            \
              cudaGetErrorString(e_));                                         \
      exit(1);                                                                 \
    }                                                                          \
  } while (0)

// naive CUDA mxm kernel
//
// Computes C = A*B for N x N column-major matrices: one thread per C entry.
// Expects a 2D launch of BDIM x BDIM blocks covering all of C (grid may
// overhang when BDIM does not divide N — hence the bounds guard).
__global__ void mxmV1(int N, const float * __restrict__ A ,
                             const float * __restrict__ B ,
                             float * __restrict__ C){

  const int idx = threadIdx.x + blockDim.x*blockIdx.x;   // row of C
  const int idy = threadIdx.y + blockDim.y*blockIdx.y;   // column of C

  if(idx < N && idy < N){
    float axb = 0.f;
    // NOTE(review): loop body reconstructed — column-major dot product
    // A(idx,n) * B(n,idy), matching the C[i+j*N] check in main().
    for(int n=0;n<N;++n){
      axb += A[idx + n*N]*B[n + idy*N];
    }
    C[idx + idy*N] = axb;
  }
}

// shared-memory tiled mxm kernel
//
// NOTE(review): body reconstructed. Each block computes one BDIM x BDIM tile
// of C, staging matching tiles of A and B through shared memory so every
// global value is loaded once per block instead of once per thread.
// Requires blockDim == (BDIM, BDIM).
__global__ void mxmV2(int N, const float * __restrict__ A ,
                             const float * __restrict__ B ,
                             float * __restrict__ C){

  __shared__ float s_A[BDIM][BDIM];
  __shared__ float s_B[BDIM][BDIM];

  const int tx  = threadIdx.x;
  const int ty  = threadIdx.y;
  const int idx = tx + BDIM*blockIdx.x;   // row of C
  const int idy = ty + BDIM*blockIdx.y;   // column of C

  float axb = 0.f;

  // march a BDIM-wide window along the shared (inner) dimension
  for(int t=0; t<N; t+=BDIM){

    // coalesced loads (tx walks consecutive addresses); zero-pad the ragged edge
    s_A[ty][tx] = (idx  < N && t+ty < N) ? A[idx    + (t+ty)*N] : 0.f;
    s_B[ty][tx] = (t+tx < N && idy  < N) ? B[(t+tx) +  idy *N] : 0.f;

    __syncthreads();   // tiles fully staged before anyone reads them

    for(int n=0; n<BDIM; ++n)
      axb += s_A[n][tx]*s_B[ty][n];

    __syncthreads();   // everyone done reading before the next load overwrites
  }

  if(idx < N && idy < N) C[idx + idy*N] = axb;
}

// tuned tile mxm kernel
//
// NOTE(review): body reconstructed. Same tiling as mxmV2 with a padded inner
// dimension (keeps strided shared accesses spread across banks) and an
// unrolled inner product. Requires blockDim == (BDIM, BDIM).
__global__ void mxmV3(int N, const float * __restrict__ A ,
                             const float * __restrict__ B ,
                             float * __restrict__ C){

  __shared__ float s_A[BDIM][BDIM+1];
  __shared__ float s_B[BDIM][BDIM+1];

  const int tx  = threadIdx.x;
  const int ty  = threadIdx.y;
  const int idx = tx + BDIM*blockIdx.x;
  const int idy = ty + BDIM*blockIdx.y;

  float axb = 0.f;

  for(int t=0; t<N; t+=BDIM){

    s_A[ty][tx] = (idx  < N && t+ty < N) ? A[idx    + (t+ty)*N] : 0.f;
    s_B[ty][tx] = (t+tx < N && idy  < N) ? B[(t+tx) +  idy *N] : 0.f;

    __syncthreads();

#pragma unroll
    for(int n=0; n<BDIM; ++n)
      axb += s_A[n][tx]*s_B[ty][n];

    __syncthreads();
  }

  if(idx < N && idy < N) C[idx + idy*N] = axb;
}

// usage: ./mxm N
//
// Multiplies two N x N column-major matrices with the three kernels above and
// prints the summed squared deviation of the (last) result from the all-ones
// matrix. A is filled with 1 and B with 1/N so the exact product is all ones.
int main(int argc, char **argv){

  // NOTE(review): everything from here to the kernel launches is reconstructed.
  if(argc < 2){
    fprintf(stderr, "usage: %s N\n", argv[0]);
    return 1;
  }

  const int    N  = atoi(argv[1]);
  const size_t sz = (size_t)N*(size_t)N*sizeof(float);   // size_t: N*N can overflow int

  float *A = (float*) malloc(sz);
  float *B = (float*) malloc(sz);
  float *C = (float*) malloc(sz);
  if(!A || !B || !C){
    fprintf(stderr, "host allocation of %zu bytes failed\n", sz);
    return 1;
  }

  // A(i,j) = 1, B(i,j) = 1/N  =>  (A*B)(i,j) = 1 exactly (up to float rounding)
  for(long long m = 0; m < (long long)N*(long long)N; ++m){
    A[m] = 1.f;
    B[m] = 1.f/(float)N;
  }

  float *c_A, *c_B, *c_C;
  CUDA_CHECK(cudaMalloc(&c_A, sz));
  CUDA_CHECK(cudaMalloc(&c_B, sz));
  CUDA_CHECK(cudaMalloc(&c_C, sz));
  CUDA_CHECK(cudaMemcpy(c_A, A, sz, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(c_B, B, sz, cudaMemcpyHostToDevice));

  // 32x32 threads per block; ceil-divide so the grid covers C even when
  // BDIM does not divide N (kernels guard the overhang).
  dim3 threadsPerBlock(BDIM, BDIM);
  dim3 blocks((N+BDIM-1)/BDIM, (N+BDIM-1)/BDIM);

  mxmV1 <<< blocks, threadsPerBlock >>> (N, c_A, c_B, c_C);
  mxmV2 <<< blocks, threadsPerBlock >>> (N, c_A, c_B, c_C);
  mxmV3 <<< blocks, threadsPerBlock >>> (N, c_A, c_B, c_C);
  CUDA_CHECK(cudaGetLastError());   // catch bad launch configurations

  // blocking copy — also synchronizes, so C reflects the last kernel (mxmV3)
  CUDA_CHECK(cudaMemcpy(C, c_C, sz, cudaMemcpyDeviceToHost));

  float maxerr = 0.f;
  for (int i = 0; i < N; ++i){
    for (int j = 0; j < N; ++j){
      float errij = C[i+j*N]-1.0f;   // expected exact value is 1 everywhere
      maxerr += errij*errij;         // summed squared error (despite the name)
    }
  }

  printf("err = %f\n",maxerr);

  // --------------------------------------------------------------------------------
  cudaError_t err = cudaGetLastError();
  if(err != cudaSuccess){
    fprintf(stderr, "CUDA ERROR: %s\n", cudaGetErrorString(err));
  }

  CUDA_CHECK(cudaFree(c_A));
  CUDA_CHECK(cudaFree(c_B));
  CUDA_CHECK(cudaFree(c_C));
  free(A); free(B); free(C);

  return 0;
}