测试了下性能。计算平方和1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91#include <stdio.h>
#define N 1048576
#define THREAD_PRE_BLOCK 1024
#define BLOCK_PRE_GRID 1
__global__ void squaresSum(int *data, int *sum)
{
__shared__ int cached[THREAD_PRE_BLOCK];
int tid = threadIdx.x + blockIdx.x*blockDim.x;
int cacheIdx = threadIdx.x;
int tmp = 0;
while(tid < N)
{
tmp += data[tid]*data[tid];
tid += blockDim.x * gridDim.x;
}
cached[cacheIdx] = tmp;
__syncthreads();
int i = blockDim.x/2;
while(i!=0)
{
if(cacheIdx<i)
{
cached[cacheIdx] += cached[cacheIdx+i];
}
__syncthreads();
i /= 2;
}
if(cacheIdx == 0)
{
sum[blockIdx.x] = cached[0];
}
}
void generateData(int * data , int s)
{
int i;
for(i=0;i<s;i++)
{
data[i] = rand()%5;
}
}
int cusumCpu(int * data)
{
int tmp = 0;
int i;
for(i=0;i<N;i++)
{
tmp += data[i]*data[i];
}
return tmp;
}
int main()
{
int *gpu_data, *gpu_sum;
int * data,*sum;
time_t t1,t2,t3,t4;
data = (int *) malloc(N*sizeof(int));
cudaMalloc((void**) &gpu_data, sizeof(int) * N);
sum = (int *) malloc(BLOCK_PRE_GRID*sizeof(int));
cudaMalloc((void**) &gpu_sum, sizeof(int) * BLOCK_PRE_GRID);
generateData(data,N);
cudaMemcpy(gpu_data, data, sizeof(int) * N, cudaMemcpyHostToDevice);
t1 = clock();
squaresSum<<<BLOCK_PRE_GRID,THREAD_PRE_BLOCK>>>(gpu_data,gpu_sum);
t2 = clock();
cudaMemcpy(sum,gpu_sum,BLOCK_PRE_GRID*sizeof(int),cudaMemcpyDeviceToHost);
int i,gpu_res = 0,cpu_res;
for(i=0;i<BLOCK_PRE_GRID;i++)
{
gpu_res += sum[i];
}
t3 = clock();
cpu_res = cusumCpu(data);
t4 = clock();
printf("gpu res : %d, time : %ld\ncpu res : %d, time : %ld\n",gpu_res,t2-t1,cpu_res,t4-t3);
cudaFree(gpu_data);
cudaFree(gpu_sum);
free(data);
}
如果t2放在内存拷贝后面时间大幅增加:
为啥这么费时。。