1.向量點積 cuda dot product
Question 1: Dot product
A dot product is a reduction from two vectors to a scalar.
Please implement the dot-product kernel in CUDA. The host code is provided. Your work will be evaluated on accuracy and efficiency.
計算兩個向量的點積 經典的官方教程
#define imin(a,b) (aconst
int n = 4096 * 4096;
int threadsperblock = 512;
int blockspergrid = imin(32, (n+threadsperblock-1) / threadsperblock); // 向上取整 (ceiling division)
__global__ void dot( float *a, float *b, float *c )
temp[threadidx.x] = move;
// wait until all threads in the block have finished computing
// thread 0 sums the pairwise products
// calculate current block. 而不是將thread_sum放在thread_0計算
int i = blockdim.x/2;
while (i != 0)
if (threadidx.x == 0)
c[blockidx.x] = temp[0];
}int main( void )
// copy the arrays 'a' and 'b' to the gpu
cudamemcpy( dev_a, a, n*sizeof(float),
cudamemcpyhosttodevice ) ;
cudamemcpy( dev_b, b, n*sizeof(float),
cudamemcpyhosttodevice ) ;
dot<<>>( dev_a, dev_b,
dev_partial_c );
// copy the array 'c' back from the gpu to the cpu
cudamemcpy( partial_c, dev_partial_c,
cudamemcpydevicetohost ) ;
// finish up on the cpu side
// 統計block_sum in grid
c = 0;
for (int i=0; i//#define sum_squares(x) (x*(x+1)*(2*x+1)/6) //此程式相當於計算0,1,...n-1的平方和
//printf("does gpu value %.10g = %.10g?\n", c, 2*sum_squares((float)(n-1)));
//printf("does gpu value %f = %f?\n", c, 2*sum_squares((float)(n-1)));
// free memory on the gpu side
cudafree( dev_a ) ;
cudafree( dev_b ) ;
cudafree( dev_partial_c );
// free memory on the cpu side
free( a );
free( b );
free( partial_c );}/*
blockdim.x,y,z gives the number of threads in a block, in the particular direction
griddim.x,y,z gives the number of blocks in a grid, in the particular direction
blockdim.x * griddim.x gives the number of threads in a grid (in the x direction, in this case)
1 block的官方example
__global__ voiddot( int*a, int*b, int*c )
列舉排序(enumeration sort)是一種最簡單的排序演算法,通常也稱為秩排序(rank sort)。該演算法的具體思想是(假設按關鍵字遞增排序),對每一個待排序的元素統計小於它的所有元素的個數,從而得到該元素最終處於序列中的位置。假定待排序的n個數存在a[1], …, a[n]中。首先將a[1]與a[2],…, a[n]比較,記錄比其小的數的個數,假設為k,則a[1]就被存入有序的陣列b[1], b[2],…, b[n]的b[k+1]位置上;然後將a[2]與a[1], a[3],…, a[n]比較,記錄比其小的數的個數,以此類推。這樣的比較操作共n(n-1)次,所以序列秩排序的時間複雜度為$O(n^2)$。請用CUDA實現並行的列舉排序法。
//感覺block沒用好 是不是在遍歷全部array之前有個combine操作?
//目前計算是9s 有點慢呢
#define block_size 512
__global__ void ranksortkernel(const
int *ori, unsigned
int *sorted, unsigned
int len)
for(int i=0;iint main ( int argc, char *argv )
printf("sort ok\n");
delete ori;
delete gpusorted;
