% Compare a GPU kernel's element-wise sum against the host result.
% NOTE(review): k1 is not defined in this excerpt; it must already exist in
% the workspace (presumably a parallel.gpu.CUDAKernel built from add2 - confirm).

% Host-side inputs: two N-by-1 int32 vectors of random values.
N = 100;
a = int32(rand(N,1)*100);
b = int32(rand(N,1)*100);

% Copy the inputs to GPU memory.
ina = gpuArray(a);
inb = gpuArray(b);
out1 = gpuArray(int32(rand(N,1)*100));
inN = gpuArray(N);

% Run the kernel and copy the results back to the host.
ores = feval(k1,ina,inb,inN,out1);
res = gather(ores);
out = gather(out1);

% Host reference result. This statement used to be swallowed by the
% preceding comment, which left mlres undefined on the next line.
mlres = a+b;

% Displays 0 when the GPU result matches the host result element-wise.
sum(res-mlres)
此时的add2函数为:
1 2 3 4 5 6 7 8 9
/**
 * Element-wise integer addition: c[i] = a[i] + b[i] for 0 <= i < N.
 *
 * Each thread starts at its flat global index
 * (threadIdx.x + blockIdx.x * blockDim.x) and advances by the total number of
 * launched threads (blockDim.x * gridDim.x), so all N elements are covered
 * for any 1-D launch configuration.
 *
 * @param a  first input array; elements [0, N) are read
 * @param b  second input array; elements [0, N) are read
 * @param N  number of elements to process; nothing is written when N <= 0
 * @param c  output array; elements [0, N) are overwritten
 */
__global__ void add2(const int * a, const int * b, const int N, int * c)
{
    // Loop-invariant step: total number of threads in the launch.
    const int stride = blockDim.x * gridDim.x;

    for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N; tid += stride) {
        c[tid] = a[tid] + b[tid];
    }
}
/**
 * Element-wise integer addition: c[i] = a[i] + b[i] for 0 <= i < N.
 *
 * Each thread starts at its flat global index
 * (threadIdx.x + blockIdx.x * blockDim.x) and advances by the total number of
 * launched threads (blockDim.x * gridDim.x).
 *
 * NOTE(review): the previous body stored the sum into a[tid], but `a` is
 * declared `const int *`, so that assignment does not compile, and `c` was
 * never written. The sum is stored into `c`, the only writable parameter.
 * If an in-place update of `a` was intended, `const` has to be removed from
 * `a` in the signature, which changes the kernel's interface for callers -
 * confirm the intent.
 *
 * NOTE(review): this file contains another definition of add2 with the same
 * signature; only one of them can remain in a single translation unit.
 *
 * @param a  first input array; elements [0, N) are read
 * @param b  second input array; elements [0, N) are read
 * @param N  number of elements to process; nothing is written when N <= 0
 * @param c  output array; elements [0, N) are overwritten
 */
__global__ void add2(const int * a, const int * b, const int N, int * c)
{
    // Loop-invariant step: total number of threads in the launch.
    const int stride = blockDim.x * gridDim.x;

    for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N; tid += stride) {
        c[tid] = a[tid] + b[tid];
    }
}