#include <stdio.h>
#include <stdlib.h>

/*
 * Evaluate a CUDA runtime call and terminate the program if it fails.
 *
 * On failure the runtime's error string is written to stderr, followed by the
 * source location and the text of the failing call, and the process exits
 * with EXIT_FAILURE. Exiting (rather than continuing) matters here: the
 * buffers allocated below are dereferenced right after allocation, and the
 * bandwidth figures are meaningless if any copy or event call failed.
 */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA ERROR: %s\n", cudaGetErrorString(err_));  \
            fprintf(stderr, "  at %s:%d: %s\n", __FILE__, __LINE__, #call); \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * Perform one blocking cudaMemcpy bracketed by two events and return the
 * estimated bandwidth in GB/s.
 *
 * dst, src    destination and source buffers for the copy
 * bytes       number of bytes to copy
 * kind        copy direction passed straight to cudaMemcpy
 * startEvent  already-created event recorded before the copy
 * stopEvent   already-created event recorded after the copy
 *
 * cudaEventElapsedTime reports milliseconds, so bytes / 1e6 / ms yields
 * (decimal) gigabytes per second.
 */
static double timedCopyGBps(void *dst, const void *src, size_t bytes,
                            cudaMemcpyKind kind,
                            cudaEvent_t startEvent, cudaEvent_t stopEvent)
{
    float elapsedMs = 0.0f;

    CUDA_CHECK(cudaEventRecord(startEvent, 0));
    CUDA_CHECK(cudaMemcpy(dst, src, bytes, kind));
    CUDA_CHECK(cudaEventRecord(stopEvent, 0));
    CUDA_CHECK(cudaEventSynchronize(stopEvent));  // wait until stopEvent has completed
    CUDA_CHECK(cudaEventElapsedTime(&elapsedMs, startEvent, stopEvent));

    return (double)bytes / 1e6 / elapsedMs;
}

/*
 * Print a section title, then time a host-to-device copy (h_src -> d_buf)
 * followed by a device-to-host copy (d_buf -> h_dst) and print both
 * bandwidth estimates, ending with a blank line.
 */
static void reportTransfers(const char *title, float *d_buf,
                            const float *h_src, float *h_dst, size_t bytes,
                            cudaEvent_t startEvent, cudaEvent_t stopEvent)
{
    fputs(title, stdout);

    printf(" Host to Device bandwidth (GB/s): %f\n",
           timedCopyGBps(d_buf, h_src, bytes, cudaMemcpyHostToDevice,
                         startEvent, stopEvent));
    printf(" Device to Host bandwidth (GB/s): %f\n",
           timedCopyGBps(h_dst, d_buf, bytes, cudaMemcpyDeviceToHost,
                         startEvent, stopEvent));
    printf("\n");
}

/*
 * Compare host<->device copy bandwidth for host buffers obtained from
 * calloc against host buffers obtained from cudaMallocHost.
 *
 * Returns 0 on success. Any allocation or CUDA failure is reported on stderr
 * and ends the program with EXIT_FAILURE.
 */
int main()
{
    const int nElements = 4194304;  // 2^22
    const size_t bytes = (size_t)nElements * sizeof(float);

    // Host buffers allocated with calloc.
    float *h_a = (float *)calloc(nElements, sizeof(float));
    float *h_b = (float *)calloc(nElements, sizeof(float));
    if (h_a == NULL || h_b == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(h_a);  // free(NULL) is a no-op, so this is safe for either case
        free(h_b);
        return EXIT_FAILURE;
    }

    // Host buffers allocated with cudaMallocHost.
    float *h_aPinned = NULL;
    float *h_bPinned = NULL;
    CUDA_CHECK(cudaMallocHost(&h_aPinned, bytes));
    CUDA_CHECK(cudaMallocHost(&h_bPinned, bytes));

    // Source buffers hold 0..nElements-1; destination buffers start at zero.
    for (int i = 0; i < nElements; ++i) {
        h_a[i] = (float)i;
        h_aPinned[i] = (float)i;
        h_b[i] = 0.0f;
        h_bPinned[i] = 0.0f;
    }

    // Device buffer shared by all four copies.
    float *d_a = NULL;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));

    // Report the properties of device 0 and the transfer size.
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
    printf("\nDevice: %s\n", prop.name);
    // 1024^2 bytes per MB; the quotient is small, so narrowing to int is safe.
    printf("Transfer size (MB): %d\n", (int)(bytes / (1024 * 1024)));

    // Events used to time each copy.
    cudaEvent_t startEvent, stopEvent;
    CUDA_CHECK(cudaEventCreate(&startEvent));
    CUDA_CHECK(cudaEventCreate(&stopEvent));

    // Copy there and back with each kind of host buffer and report bandwidth.
    reportTransfers("\n Standard memory transfer\n",
                    d_a, h_a, h_b, bytes, startEvent, stopEvent);
    reportTransfers("\n Pinned memory transfer\n",
                    d_a, h_aPinned, h_bPinned, bytes, startEvent, stopEvent);

    // Surface any error that was recorded but not yet returned by a call above.
    CUDA_CHECK(cudaGetLastError());

    // Cleanup.
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFreeHost(h_aPinned));
    CUDA_CHECK(cudaFreeHost(h_bPinned));
    free(h_a);
    free(h_b);
    CUDA_CHECK(cudaEventDestroy(startEvent));
    CUDA_CHECK(cudaEventDestroy(stopEvent));

    return 0;
}