基於CUDA在GPU上實現膨脹腐蝕加速

傳統的OpenCV形態學運算函式不能直接在GPU上執行，現提供幾種方法，使得膨脹、腐蝕能在GPU上實現加速。筆者使用的GPU是NVIDIA的，故以下程式碼基於CUDA。

下面的例子是，以一個尺寸為5×5的矩形元素分別進行腐蝕、膨脹，元素的支點為其中心，座標為(2, 2)。它相當於以3×3元素進行兩次操作。

iplconvkernel *elem = cvcreatestructuringelementex(5, 5, 2, 2, cv_shape_rect);
cverode(src, dst, elem, 1);	// 相當於cverode(src, dst, null, 2);
cvdilate(src, dst, elem, 1);	// 相當於cvdilate(src, dst, null, 2);
cvreleasestructuringelement(&elem);

// C++ API: 5x5 rectangular kernel anchored at its centre (2, 2).
cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(5, 5), cv::Point(2, 2));
cv::erode(src, dst, kernel, cv::Point(2, 2), 1);   // erosion, one iteration
cv::dilate(src, dst, kernel, cv::Point(2, 2), 1);  // dilation, one iteration

// 假設src、dst的型別為iplimage*
using namespace cv;
mat mat_src = cvarrtomat(src);
mat mat_dst(mat_src);
cuda::gpumat gpu_src(mat_src);
cuda::gpumat gpu_dst(mat_dst);
mat elem = getstructuringelement( morph_rect, size(5, 5), point(2, 2) );
ptrerodefilter = cuda::createmorphologyfilter( morph_erode, cv_8u, elem );
ptrdilatefilter = cuda::createmorphologyfilter( morph_dilate, cv_8u, elem );
gpu_dst.download( mat_dst );
*dst = iplimage( mat_dst );

需要注意的是，這種方法第一次使用時，耗時較多，從第二次呼叫開始，執行時間變得正常。

上面的方法執行時間還是挺長的，為了進一步提高執行速度，可使用以下方法

// Include guard for the two-step shared-memory morphology wrappers.
#ifndef _morphology_cuh_
#define _morphology_cuh_
// NOTE(review): the targets of the six #include directives on the next line are missing
// from this text (the header names appear to have been stripped). Restore them before
// compiling; the accompanying text says the OpenCV header paths depend on the version.
#include #include #include #include #include #include using namespace cv;
// Erosion entry point. Per the accompanying text: src and dst are buffers obtained from
// cudaMalloc, radio is (n - 1) / 2 for an n x n element, and the element is always a
// square rectangle (its shape cannot be customised).
void erodetwostepshared(unsigned char *src, unsigned char *dst, int radio, int width, int height);
// Dilation entry point; same parameter conventions as the erosion entry point.
void dilatetwostepshared(unsigned char *src, unsigned char *dst, int radio, int width, int height);
#endif

需要注意的是，傳入的引數 unsigned char *src 及輸出 unsigned char *dst 都必須是以 cudaMalloc 配置的裝置端（GPU）記憶體，通過

// Upload an 8-bit single-channel IplImage into the tightly packed device buffer src.
// IplImage rows are widthStep bytes apart and may carry padding, so use a 2-D copy
// (source pitch = widthStep, destination pitch = width) instead of assuming that
// width * height bytes are contiguous in imageData.
cudaMemcpy2D(src, img->width, img->imageData, img->widthStep, img->width, img->height, cudaMemcpyHostToDevice);

來獲得資料。

另外，當運算的元素為n×n時，傳給radio的值為(n-1)/2，例如：以5×5的元素進行腐蝕、膨脹，則radio的值為2。而且不能定製元素的形狀，固定為矩形元素，且長寬相等。

不同的OpenCV版本，包含的標頭檔案路徑有所差別，我用的版本是3.1.0。其他版本，若編譯不通過，需要對標頭檔案進行微小的改動。

#include "morphology.cuh"
// NOTE(review): this listing is incomplete as shown. The opening braces, the declarations
// of smem, tx, ty, bx, by, x and y, the statements guarded by the if/for lines, and the
// bodies of the two host wrappers at the end are not present in this text. The comments
// below describe only what the visible lines do; anything else is marked as an assumption.

// Erosion, step 1: the window is taken along the tx direction (same smem row).
__global__ void erodesharedstep1(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h)
// Each thread stages one source pixel, widened to int, in its own smem slot
// (smem is presumably the block's shared-memory tile; its declaration is not visible).
smem[ty * blockdim.x + tx] = (int)src[y * width + x];
// Block-wide barrier before neighbouring slots are read.
__syncthreads();
// Tests whether x lies outside [bx * tile_w, (bx + 1) * tile_w). The guarded statement is
// missing here (presumably an early return for threads that only load border pixels; confirm).
if( x < (bx * tile_w) || x >= (bx + 1) * tile_w ) 
// Window start: radio slots before this thread's own slot.
int *smem_thread = &smem[ty * blockdim.x + tx - radio];
int val = smem_thread[0];
// Iterates over the remaining 2 * radio window offsets. The loop body is missing
// (presumably val = min(val, smem_thread[i]) for erosion; confirm).
for( int i = 1; i <= 2 * radio; i++ ) 
// Store the result back as an 8-bit pixel.
dst[y * width + x] = (unsigned char)val;
// Erosion, step 2: the window is taken along the ty direction (same smem column).
}__global__ void erodesharedstep2(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h)
// Stage one source pixel per thread, widened to int.
smem[ty * blockdim.x + tx] = (int)src[y * width + x];
// Block-wide barrier before neighbouring slots are read.
__syncthreads();
// Tests whether y lies outside [by * tile_h, (by + 1) * tile_h); guarded statement missing
// (presumably an early return; confirm).
if( y < (by * tile_h) || y >= (by + 1) * tile_h ) 
// Window start: radio rows before this thread's own slot, same column.
int *smem_thread = &smem[(ty - radio) * blockdim.x + tx];
int val = smem_thread[0];
// Iterates over the remaining 2 * radio window offsets. The loop body is missing
// (presumably a min over entries blockdim.x apart; confirm).
for( int i = 1; i <= 2 * radio; i++ ) 
// Store the result back as an 8-bit pixel.
dst[y * width + x] = (unsigned char)val;
// Dilation, step 1: the visible lines are identical to those of erosion step 1.
}__global__ void dilatesharedstep1(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h)
// Stage one source pixel per thread, widened to int.
smem[ty * blockdim.x + tx] = (int)src[y * width + x];
// Block-wide barrier before neighbouring slots are read.
__syncthreads();
// Tests whether x lies outside [bx * tile_w, (bx + 1) * tile_w); guarded statement missing
// (presumably an early return; confirm).
if( x < (bx * tile_w) || x >= (bx + 1) * tile_w ) 
// Window start: radio slots before this thread's own slot.
int *smem_thread = &smem[ty * blockdim.x + tx - radio];
int val = smem_thread[0];
// Iterates over the remaining 2 * radio window offsets. The loop body is missing
// (presumably val = max(val, smem_thread[i]) for dilation; confirm).
for( int i = 1; i <= 2 * radio; i++ ) 
// Store the result back as an 8-bit pixel.
dst[y * width + x] = (unsigned char)val;
// Dilation, step 2: the visible lines are identical to those of erosion step 2.
}__global__ void dilatesharedstep2(unsigned char *src, unsigned char *dst, int radio, int width, int height, int tile_w, int tile_h)
// Stage one source pixel per thread, widened to int.
smem[ty * blockdim.x + tx] = (int)src[y * width + x];
// Block-wide barrier before neighbouring slots are read.
__syncthreads();
// Tests whether y lies outside [by * tile_h, (by + 1) * tile_h); guarded statement missing
// (presumably an early return; confirm).
if( y < (by * tile_h) || y >= (by + 1) * tile_h ) 
// Window start: radio rows before this thread's own slot, same column.
int *smem_thread = &smem[(ty - radio) * blockdim.x + tx];
int val = smem_thread[0];
// Iterates over the remaining 2 * radio window offsets. The loop body is missing
// (presumably a max over entries blockdim.x apart; confirm).
for( int i = 1; i <= 2 * radio; i++ ) 
// Store the result back as an 8-bit pixel.
dst[y * width + x] = (unsigned char)val;
// Host entry points matching the declarations in morphology.cuh. Their bodies (presumably
// the launches of the step 1 and step 2 kernels) are not present in this listing.
}void erodetwostepshared(unsigned char *src, unsigned char *dst, int radio, int width, int height)
void dilatetwostepshared(unsigned char *src, unsigned char *dst, int radio, int width, int height)

Luke Domanski, Pascal Vallotton, Dadong Wang. Parallel van Herk/Gil-Werman image morphology on GPUs using CUDA. CSIRO Mathematical & Information Sciences, Biotech Imaging

及開源程式碼

pytorch在CPU和GPU上載入模型

PyTorch允許把在GPU上訓練的模型載入到CPU上，也允許把在CPU上訓練的模型載入到GPU上。CPU→CPU、GPU→GPU：torch.load('gen_500000.pkl')；GPU→CPU：torch.load('gen_500000.pkl', map_location=lambda stor...

使用GPU進行字串匹配 cuda程式設計實現

CUDA程式的字尾為.cu，編譯時使用nvcc，其使用方法與gcc相似。例如：nvcc test.cu -o test（參見nvcc的官方文件[1]）。首先遇到錯誤：fatal error: cutil.h: No such file or directory；kmp.cu(62): error: identifier c...

CUDA 9 0在Ubuntu上的安裝

在終端執行如下命令：sudo dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64.deb；sudo apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub（執行完上一句指令後，末尾會有此指令的提示）；sudo apt-get...

基於CUDA在GPU上實現膨脹 腐蝕加速

pytorch在CPU和GPU上載入模型

使用GPU進行字串匹配 cuda程式設計實現

CUDA 9 0在Ubuntu上的安裝

相關推薦

基於CUDA在GPU上實現膨脹腐蝕加速