CUDA 學習筆記（二）：CUDA 與 CPU 時間對比

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <iostream>
#define row 1024
#define col 1024
long long g_cpu_calc_count;
//定義的kernel函式
__global__ void addkernel(int **c, int **a, int **b)
}void matrix_add_cpu(int** a_ptr, int** b_ptr, int** c_ptr, int width)	}}
int main()
for (size_t i = 0; i < row; i++)
const clock_t cpu_begin_time_2 = clock(); //開始計時
matrix_add_cpu(a_ptr, b_ptr, c_ptr, col); //cpu計算
float ms = float(clock() - cpu_begin_time_2);
std::cout << "矩陣加法運算cpu單核運算總次數：" << g_cpu_calc_count << std::endl;
printf("cpu cost_time: %.2f ms \n", ms);
//gpu計算
// set value
for (int i = 0; i < row * col; i++)
// 將主機指標a指向裝置資料位置，目的是讓裝置二級指標能夠指向裝置資料一級指標
for (size_t i = 0; i < row; i++)
//set value
for (int i = 0; i < row * col; i++)
const clock_t gpu_begin_time_2 = clock(); //開始計時
// malloc device memory
cudamalloc((void**)&d_a_ptr, sizeof(int**) * row);
cudamalloc((void**)&d_b_ptr, sizeof(int**) * row);
cudamalloc((void**)&d_c_ptr, sizeof(int**) * row);
cudamalloc((void**)&d_a, sizeof(int**) * row*col);
cudamalloc((void**)&d_b, sizeof(int**) * row*col);
cudamalloc((void**)&d_c, sizeof(int**) * row*col);
//memcpy host to device
cudamemcpy(d_a_ptr, a_ptr, sizeof(int*)* row, cudamemcpyhosttodevice);
cudamemcpy(d_b_ptr, b_ptr, sizeof(int*)* row, cudamemcpyhosttodevice);
cudamemcpy(d_c_ptr, c_ptr, sizeof(int*)* row, cudamemcpyhosttodevice);
cudamemcpy(d_a, a, sizeof(int)* row, cudamemcpyhosttodevice);
cudamemcpy(d_b, b, sizeof(int)* row, cudamemcpyhosttodevice);
dim3 threadperblock_2(16, 16); // 定義變數作為kernel的grid
dim3 blocknumber_2((col + threadperblock_2.x - 1) / threadperblock_2.x, (row + threadperblock_2.y - 1) / threadperblock_2.y); // 定義變數作為kernel的block
printf("block(%d, %d) grid(%d, %d).\n", threadperblock_2.x, threadperblock_2.y, blocknumber_2.x, blocknumber_2.y);
addkernel << > > (d_c_ptr, d_a_ptr, d_b_ptr);
// memcpy device to host
cudamemcpy(c_ptr, d_c_ptr, sizeof(int) * row * col, cudamemcpydevicetohost);
ms = float(clock() - gpu_begin_time_2);
std::cout << "矩陣加法運算所有執行緒數：" << threadperblock_2.x * threadperblock_2.y * blocknumber_2.x * blocknumber_2.y << std::endl;
std::cout << "矩陣加法運算gpu單執行緒運算次數：1" << std::endl;
std::cout << "矩陣加法運算gpu拷貝到gpu資料位元組數：" << sizeof(int*) * row * 3 + sizeof(int) * row * col * 2 << std::endl;
std::cout << "矩陣加法運算gpu拷貝到cpu資料位元組數：" << sizeof(int) * row * col << std::endl;
printf("gpu cost_time: %.2f ms \n", ms);
//釋放記憶體
free(a);
free(b);
free(c);
free(a_ptr);
free(b_ptr);
free(c_ptr);
cudafree(d_a);
cudafree(d_b);
cudafree(d_c);
cudafree(d_a_ptr);
cudafree(d_b_ptr);
cudafree(d_c_ptr);
system("pause");
return 0;
}

執行結果：

矩陣加法運算cpu單核運算總次數：1048576
cpu cost_time: 2.00 ms
block(16, 16) grid(64, 64).
矩陣加法運算所有執行緒數：1048576
矩陣加法運算gpu單執行緒運算次數：1
矩陣加法運算gpu拷貝到gpu資料位元組數：8413184
矩陣加法運算gpu拷貝到cpu資料位元組數：4194304
gpu cost_time: 439.00 ms

請按任意鍵繼續. . .

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include #include #include #include #include #define row  1024
#define col 1024
long long g_cpu_calc_count;
__global__ void matrix_mul_gpu(int *m, int* n, int* p, int width)

執行結果：

矩陣乘法運算cpu單核總運算次數：1073741824
cpu cost_time: 1743.00 ms
block(16,16) grid(64,64).
矩陣乘法運算所有執行緒數：1048576
矩陣乘法運算gpu單執行緒運算次數：1024
矩陣乘法運算cpu拷貝到gpu資料位元組數：8388608
矩陣乘法運算gpu拷貝到cpu資料位元組數：4194304
gpu cost_time: 10.00 ms

請按任意鍵繼續. . .

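結論：透過 CUDA 呼叫 GPU 運算時，必須在 CPU 與 GPU 之間傳輸資料，這會帶來額外開銷；也就是說，使用 GPU 加速本身就有一筆固定成本。以上面的結果為例：矩陣加法中每個執行緒只做 1 次運算，GPU 版本（439 ms）反而比 CPU（2 ms）慢；矩陣乘法中每個執行緒要做 1024 次運算，GPU 版本（10 ms）則遠快於 CPU（1743 ms）。因此，在 CPU 與 GPU 之間交換的資料量固定時，GPU 上執行的計算量越大，加速效果越明顯；不可盲目地使用 CUDA 的 GPU 加速。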

參考：【cuda程式設計系列】cuda程式設計基本入門學習筆記

cuda 學習筆記（二）cuda於cpu時間對比

二 cuda學習筆記之 cuda基本概念

cuda合併訪問的要求 CUDA學習筆記（二）

Cuda學習筆記（三） Cuda程式設計Tips

cuda 學習筆記（二）cuda於cpu時間對比

二 cuda學習筆記之 cuda基本概念

cuda合併訪問的要求 CUDA學習筆記（二）

Cuda學習筆記（三） Cuda程式設計Tips

相關推薦