#include
#include
#include
#define total_num 50000
bool cuda_initial(void)
else
printf("there is %d device beyond 1.0/n",device_count);
for(i=0;iif(cudasetdevice(i)==cudaerrorinvaliddevice)
return true;
}void generate_num(int *num,int data_num)
{int i;
for(i=0;i
__global__ void square_sum(int *num,int num_of_num,int * result,clock_t *time)
{int i;
int sum=0;
clock_t start,end;
start=clock();
for(i=0;i
int main()
{if(cuda_initial()==true)
printf("cuda initial successed!/n");
int num_str[total_num];
generate_num(num_str,total_num);
int *gpudata;
int *result;
clock_t *time;
cudamalloc((void **)&gpudata,sizeof(int)*total_num);
cudamalloc((void **)&result,sizeof(int));
cudamalloc((void **)&time,sizeof(clock_t));
cudamemcpy((void *)gpudata,num_str,sizeof(int)*total_num,cudamemcpyhosttodevice);
square_sum<<<1,1>>>(gpudata,total_num,result,time);
int result_in_gpu;
cudamemcpy((void *)&result_in_gpu,result,sizeof(int),cudamemcpydevicetohost);
clock_t time_used;
cudamemcpy((void *)&time_used,time,sizeof(clock_t),cudamemcpydevicetohost);
printf("in gpu result is %d/n",result_in_gpu);
printf("in gpu time used is %d/n",time_used);
int result_in_cpu=0;
int i;
for(i=0;i測試結果:
there is 1 device beyond 1.0
device properties is :
device name is geforce 9800 gt
totalglobalmem is 536543232
sharedmemperblock is 16384
regsperblock is 8192
warpsize is 32
mempitch is 262144
maxthreadsperblock is 512
maxthreadsdim [3] is 512 x 512 x 64
maxgridsize [3] is 65535 x 65535 x 1
totalconstmem is 65536
device version is major 1 ,minor 1
clockrate is 1350000
texturealignment is 256
deviceoverlap is 1
multiprocessorcount is 14
cuda initial successed!
in gpu result is 1419240
in gpu time used is 29763916
in cpu result is 1419240
請按任意鍵繼續. . .
記憶體頻寬:執行時間 = 29763916 / (1350000 × 1000) ≈ 0.022 s,故頻寬 = 50000 × 4 / 1048576 / 0.022 ≈ 8.67 MB/s
CUDA程式設計(一)第一個CUDA程式
CUDA(Compute Unified Device Architecture)是顯示卡廠商 NVIDIA 推出的運算平台,是一種通用平行計算架構,該架構使 GPU 能夠解決複雜的計算問題。說白了就是我們可以使用 GPU 來並行完成像神經網路、影象處理演算法這些在 CPU 上跑起來比較吃力的程式。通過 GPU 和高並...
CUDA程式設計(一)第一個CUDA程式
CUDA(Compute Unified Device Architecture)是顯示卡廠商 NVIDIA 推出的運算平台,是一種通用平行計算架構,該架構使 GPU 能夠解決複雜的計算問題。說白了就是我們可以使用 GPU 來並行完成像神經網路、影象處理演算法這些在 CPU 上跑起來比較吃力的程式。通過 GPU 和高並...
CUDA 第一個CUDA程式 addVector
本文主要將兩個浮點陣列中的資料相加,並將結果放入第三個陣列中。該演算法分別在 CPU 和 GPU 上執行,並比較了所需時間,強烈感受到 GPU 的平行計算能力。這裡,每個陣列的元素個數為 30000000。include include include include for the cuda r...