/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */
- 15
- 16
- 17#include"../common/book.h"
- 18
// Smaller of two values; arguments parenthesized so the macro is safe
// with compound expressions. Used to cap the grid size below.
#define imin(a,b) ((a)<(b)?(a):(b))

const int N = 33 * 1024;            // number of elements in each input vector
const int threadsPerBlock = 256;    // must be a power of 2 (reduction in dot())
// Enough blocks to cover N, but never more than 32 — the kernel's
// grid-stride loop picks up any leftover elements.
const int blocksPerGrid =
            imin( 32, (N + threadsPerBlock - 1) / threadsPerBlock );
- 25
- 26
// Computes partial dot products of the N-element vectors a and b.
// Expected launch: dot<<<blocksPerGrid, threadsPerBlock>>>(a, b, c).
// Each block writes one partial sum to c[blockIdx.x]; the host sums those.
// Uses threadsPerBlock floats of static shared memory; threadsPerBlock
// must be a power of 2 because of the halving reduction below.
__global__ void dot( float *a, float *b, float *c ) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    // Grid-stride loop: each thread accumulates products for every
    // element it owns, so any grid size covers all N elements.
    float temp = 0;
    while (tid < N) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block before the reduction reads cache
    __syncthreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        // Barrier is outside the divergent if so every thread reaches it.
        __syncthreads();
        i /= 2;
    }

    // Thread 0 publishes this block's partial sum.
    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}
- 57
- 58
// Host driver: fills two N-element vectors, launches the dot-product
// kernel, sums the per-block partial results on the CPU, and checks the
// answer against the closed-form sum of squares.
int main( void ) {
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the CPU side
    a = (float*)malloc( N * sizeof(float) );
    b = (float*)malloc( N * sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid * sizeof(float) );

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              N * sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              N * sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid * sizeof(float) ) );

    // fill in the host memory with data
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * 2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N * sizeof(float),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, N * sizeof(float),
                              cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid, threadsPerBlock>>>( dev_a, dev_b,
                                             dev_partial_c );
    // catch launch-configuration errors; execution errors surface at the
    // blocking cudaMemcpy below
    HANDLE_ERROR( cudaGetLastError() );

    // copy the per-block partial sums back from the GPU to the CPU
    HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid * sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    // finish up on the CPU side: total the per-block partial sums
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++) {
        c += partial_c[i];
    }

    // dot(a,b) = sum 2*i*i for i in [0,N), i.e. 2 * sum of squares;
    // closed form: 0^2 + 1^2 + ... + x^2 = x(x+1)(2x+1)/6
    #define sum_squares(x)  (x*(x+1)*(2*x+1)/6)
    printf( "Does GPU value %.6g = %.6g?\n", c,
            2 * sum_squares( (float)(N - 1) ) );

    // free memory on the gpu side
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_partial_c ) );

    // free memory on the cpu side
    free( a );
    free( b );
    free( partial_c );

    return 0;
}
// Source: http://www.cnblogs.com/dama116/p/6925205.html