/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */
- 15
- 16
- 17#include"../common/book.h"
- 18
// Smaller of two values; arguments parenthesized so the macro is safe
// with compound expressions. Used to cap the grid size below.
#define imin(a,b) ((a)<(b)?(a):(b))

const int N = 33 * 1024;            // number of elements in each input vector
const int threadsPerBlock = 256;    // must be a power of 2 (reduction in dot())
// Enough blocks to cover N, but never more than 32 — the kernel's
// grid-stride loop picks up any leftover elements.
const int blocksPerGrid =
            imin( 32, (N + threadsPerBlock - 1) / threadsPerBlock );
- 25
- 26
// Computes partial dot products of the N-element vectors a and b.
// Expected launch: dot<<<blocksPerGrid, threadsPerBlock>>>(a, b, c).
// Each block writes one partial sum to c[blockIdx.x]; the host sums those.
// Uses threadsPerBlock floats of static shared memory; threadsPerBlock
// must be a power of 2 because of the halving reduction below.
__global__ void dot( float *a, float *b, float *c ) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    // Grid-stride loop: each thread accumulates products for every
    // element it owns, so any grid size covers all N elements.
    float temp = 0;
    while (tid < N) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block before the reduction reads cache
    __syncthreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        // Barrier is outside the divergent if so every thread reaches it.
        __syncthreads();
        i /= 2;
    }

    // Thread 0 publishes this block's partial sum.
    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}
- 57
- 58
// Host driver: fills two N-element vectors, launches the dot-product
// kernel, sums the per-block partial results on the CPU, and checks the
// answer against the closed-form sum of squares.
int main( void ) {
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the CPU side
    a = (float*)malloc( N * sizeof(float) );
    b = (float*)malloc( N * sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid * sizeof(float) );

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              N * sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              N * sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid * sizeof(float) ) );

    // fill in the host memory with data
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * 2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N * sizeof(float),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, N * sizeof(float),
                              cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid, threadsPerBlock>>>( dev_a, dev_b,
                                             dev_partial_c );
    // catch launch-configuration errors; execution errors surface at the
    // blocking cudaMemcpy below
    HANDLE_ERROR( cudaGetLastError() );

    // copy the per-block partial sums back from the GPU to the CPU
    HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid * sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    // finish up on the CPU side: total the per-block partial sums
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++) {
        c += partial_c[i];
    }

    // dot(a,b) = sum 2*i*i for i in [0,N), i.e. 2 * sum of squares;
    // closed form: 0^2 + 1^2 + ... + x^2 = x(x+1)(2x+1)/6
    #define sum_squares(x)  (x*(x+1)*(2*x+1)/6)
    printf( "Does GPU value %.6g = %.6g?\n", c,
            2 * sum_squares( (float)(N - 1) ) );

    // free memory on the gpu side
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_partial_c ) );

    // free memory on the cpu side
    free( a );
    free( b );
    free( partial_c );

    return 0;
}
// Source: http://www.cnblogs.com/dama116/p/6925205.html