#ifndef TGSTKCUDAOPERATIONS_H
#define TGSTKCUDAOPERATIONS_H

#include <cuda_runtime.h>
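
// Element-wise kernels operating on 3D float volumes stored in pitched device
// memory (cudaPitchedPtr, typically allocated with cudaMalloc3D); w, h and d
// are the volume dimensions in voxels.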
__global__ void gpuAdd(cudaPitchedPtr devSrc1, cudaPitchedPtr devSrc2, cudaPitchedPtr devDest, int w, int h, int d);
__global__ void gpuConstMultiply(cudaPitchedPtr devSrc, float c, cudaPitchedPtr devDest, int w, int h, int d);
__global__ void gpuCopy(cudaPitchedPtr devSrc, cudaPitchedPtr devDest, int w, int h, int d);
__global__ void gpuDivide(cudaPitchedPtr devSrc1, cudaPitchedPtr devSrc2, cudaPitchedPtr devDest, int w, int h, int d);
__global__ void gpuMultiply(cudaPitchedPtr devSrc1, cudaPitchedPtr devSrc2, cudaPitchedPtr devDest, int w, int h, int d);
__global__ void gpuNorm(cudaPitchedPtr devSrcX, cudaPitchedPtr devSrcY, cudaPitchedPtr devSrcZ, cudaPitchedPtr devDest, int w, int h, int d);
__global__ void gpuSet(cudaPitchedPtr devDest, float value, int w, int h, int d);
__global__ void gpuSubstract(cudaPitchedPtr devSrc1, cudaPitchedPtr devSrc2, cudaPitchedPtr devDest, int w, int h, int d);
__global__ void gpuThreshold(cudaPitchedPtr devSrc, cudaPitchedPtr devDest, float threshold, int w, int h, int d);
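
// Separable convolution along x, using a shared-memory tile with halos. Each
// block processes pixelsPerThread sub-tiles of blockDimX voxels along x, plus
// kernelBlockRatio halo sub-tiles on each side, so kernelRadius must not
// exceed kernelBlockRatio*blockDimX. kernel must point to device-accessible
// memory holding the 2*kernelRadius+1 filter taps; voxels outside the volume
// are treated as 0.
//
// A minimal host-side launch sketch (the template arguments and the
// devKernel/radius variables are illustrative assumptions, not part of this
// header):
//
//     dim3 block(8, 8, 8);
//     dim3 grid((w + 4*8 - 1) / (4*8), (h + 8 - 1) / 8, (d + 8 - 1) / 8);
//     gpuConvolutionX<8, 8, 8, 4, 1><<<grid, block>>>(devSrc, devDest, devKernel, radius, w, h, d);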
template<int blockDimX, int blockDimY, int blockDimZ, int pixelsPerThread, int kernelBlockRatio>
__global__ void gpuConvolutionX(cudaPitchedPtr devSrc, cudaPitchedPtr devDest, float* kernel, int kernelRadius, int w, int h, int d) {

    __shared__ float sharedData[blockDimX*(pixelsPerThread + 2*kernelBlockRatio)][blockDimY][blockDimZ];

    const int baseX = (blockIdx.x * pixelsPerThread - kernelBlockRatio)*blockDimX + threadIdx.x;
    const int baseY = blockIdx.y * blockDimY + threadIdx.y;
    const int baseZ = blockIdx.z * blockDimZ + threadIdx.z;

    const size_t pitch = devSrc.pitch;
    const size_t slicePitch = pitch * h;

    // Main data

    for (int i = kernelBlockRatio; i != kernelBlockRatio + pixelsPerThread; i++) {

        sharedData[threadIdx.x + i*blockDimX][threadIdx.y][threadIdx.z] = (baseX + i*blockDimX < w && baseY < h && baseZ < d) ? *((float*)((char*)devSrc.ptr + baseZ * slicePitch + baseY * pitch) + baseX + i*blockDimX) : 0.0f;
    }

    // Left halo

    for (int i = 0; i != kernelBlockRatio; i++) {

        sharedData[threadIdx.x + i*blockDimX][threadIdx.y][threadIdx.z] = (baseX + i*blockDimX >= 0 && baseY < h && baseZ < d) ? *((float*)((char*)devSrc.ptr + baseZ * slicePitch + baseY * pitch) + baseX + i*blockDimX) : 0.0f;
    }

    // Right halo

    for (int i = kernelBlockRatio + pixelsPerThread; i != kernelBlockRatio + pixelsPerThread + kernelBlockRatio; i++) {

        sharedData[threadIdx.x + i*blockDimX][threadIdx.y][threadIdx.z] = (baseX + i*blockDimX < w && baseY < h && baseZ < d) ? *((float*)((char*)devSrc.ptr + baseZ * slicePitch + baseY * pitch) + baseX + i*blockDimX) : 0.0f;
    }

    __syncthreads();

    // Convolution

    for (int i = kernelBlockRatio; i != kernelBlockRatio + pixelsPerThread; i++) {

        float sum = 0.0f;

        for (int j = -kernelRadius; j != kernelRadius + 1; j++) {

            sum += kernel[kernelRadius + j] * sharedData[threadIdx.x + i*blockDimX + j][threadIdx.y][threadIdx.z];
        }

        if (baseX + i*blockDimX < w && baseY < h && baseZ < d) {

            *((float*)((char*)devDest.ptr + baseZ * slicePitch + baseY * pitch) + baseX + i*blockDimX) = sum;
        }
    }
}
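
// Separable convolution along y; same tiling scheme as gpuConvolutionX, with
// the halo sub-tiles along y.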
template<int blockDimX, int blockDimY, int blockDimZ, int pixelsPerThread, int kernelBlockRatio>
__global__ void gpuConvolutionY(cudaPitchedPtr devSrc, cudaPitchedPtr devDest, float* kernel, int kernelRadius, int w, int h, int d) {

    __shared__ float sharedData[blockDimX][blockDimY*(pixelsPerThread + 2*kernelBlockRatio)][blockDimZ];

    const int baseX = blockIdx.x * blockDimX + threadIdx.x;
    const int baseY = (blockIdx.y * pixelsPerThread - kernelBlockRatio)*blockDimY + threadIdx.y;
    const int baseZ = blockIdx.z * blockDimZ + threadIdx.z;

    const size_t pitch = devSrc.pitch;
    const size_t slicePitch = pitch * h;

    // Main data

    for (int i = kernelBlockRatio; i != kernelBlockRatio + pixelsPerThread; i++) {

        sharedData[threadIdx.x][threadIdx.y + i*blockDimY][threadIdx.z] = (baseX < w && baseY + i*blockDimY < h && baseZ < d) ? *((float*)((char*)devSrc.ptr + baseZ * slicePitch + (baseY + i*blockDimY) * pitch) + baseX) : 0.0f;
    }

    // Top halo

    for (int i = 0; i != kernelBlockRatio; i++) {

        sharedData[threadIdx.x][threadIdx.y + i*blockDimY][threadIdx.z] = (baseX < w && baseY + i*blockDimY >= 0 && baseZ < d) ? *((float*)((char*)devSrc.ptr + baseZ * slicePitch + (baseY + i*blockDimY) * pitch) + baseX) : 0.0f;
    }

    // Bottom halo

    for (int i = kernelBlockRatio + pixelsPerThread; i != kernelBlockRatio + pixelsPerThread + kernelBlockRatio; i++) {

        sharedData[threadIdx.x][threadIdx.y + i*blockDimY][threadIdx.z] = (baseX < w && baseY + i*blockDimY < h && baseZ < d) ? *((float*)((char*)devSrc.ptr + baseZ * slicePitch + (baseY + i*blockDimY) * pitch) + baseX) : 0.0f;
    }

    __syncthreads();

    // Convolution

    for (int i = kernelBlockRatio; i != kernelBlockRatio + pixelsPerThread; i++) {

        float sum = 0.0f;

        for (int j = -kernelRadius; j != kernelRadius + 1; j++) {

            sum += kernel[kernelRadius + j] * sharedData[threadIdx.x][threadIdx.y + i*blockDimY + j][threadIdx.z];
        }

        if (baseX < w && baseY + i*blockDimY < h && baseZ < d) {

            *((float*)((char*)devDest.ptr + baseZ * slicePitch + (baseY + i*blockDimY) * pitch) + baseX) = sum;
        }
    }
}
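
// Separable convolution along z; same tiling scheme as gpuConvolutionX, with
// the halo sub-tiles along z.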
template<int blockDimX, int blockDimY, int blockDimZ, int pixelsPerThread, int kernelBlockRatio>
__global__ void gpuConvolutionZ(cudaPitchedPtr devSrc, cudaPitchedPtr devDest, float* kernel, int kernelRadius, int w, int h, int d) {

    __shared__ float sharedData[blockDimX][blockDimY][blockDimZ*(pixelsPerThread + 2*kernelBlockRatio)];

    const int baseX = blockIdx.x * blockDimX + threadIdx.x;
    const int baseY = blockIdx.y * blockDimY + threadIdx.y;
    const int baseZ = (blockIdx.z * pixelsPerThread - kernelBlockRatio)*blockDimZ + threadIdx.z;

    const size_t pitch = devSrc.pitch;
    const size_t slicePitch = pitch * h;

    // Main data

    for (int i = kernelBlockRatio; i != kernelBlockRatio + pixelsPerThread; i++) {

        sharedData[threadIdx.x][threadIdx.y][threadIdx.z + i*blockDimZ] = (baseX < w && baseY < h && baseZ + i*blockDimZ < d) ? *((float*)((char*)devSrc.ptr + (baseZ + i*blockDimZ) * slicePitch + baseY * pitch) + baseX) : 0.0f;
    }

    // Front halo

    for (int i = 0; i != kernelBlockRatio; i++) {

        sharedData[threadIdx.x][threadIdx.y][threadIdx.z + i*blockDimZ] = (baseX < w && baseY < h && baseZ + i*blockDimZ >= 0) ? *((float*)((char*)devSrc.ptr + (baseZ + i*blockDimZ) * slicePitch + baseY * pitch) + baseX) : 0.0f;
    }

    // Back halo

    for (int i = kernelBlockRatio + pixelsPerThread; i != kernelBlockRatio + pixelsPerThread + kernelBlockRatio; i++) {

        sharedData[threadIdx.x][threadIdx.y][threadIdx.z + i*blockDimZ] = (baseX < w && baseY < h && baseZ + i*blockDimZ < d) ? *((float*)((char*)devSrc.ptr + (baseZ + i*blockDimZ) * slicePitch + baseY * pitch) + baseX) : 0.0f;
    }

    __syncthreads();

    // Convolution

    for (int i = kernelBlockRatio; i != kernelBlockRatio + pixelsPerThread; i++) {

        float sum = 0.0f;

        for (int j = -kernelRadius; j != kernelRadius + 1; j++) {

            sum += kernel[kernelRadius + j] * sharedData[threadIdx.x][threadIdx.y][threadIdx.z + i*blockDimZ + j];
        }

        if (baseX < w && baseY < h && baseZ + i*blockDimZ < d) {

            *((float*)((char*)devDest.ptr + (baseZ + i*blockDimZ) * slicePitch + baseY * pitch) + baseX) = sum;
        }
    }
}
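
// Block-wise sum reduction: each block sums a tile of pixelsPerDimension
// sub-tiles per dimension and writes its partial sum to devDest[blockIndex1D].
// devDest must hold one float per block; the partial sums still have to be
// combined afterwards (on the host or in a second pass).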
template<int blockDimX, int blockDimY, int blockDimZ, int pixelsPerDimension>
__global__ void gpuReduce(cudaPitchedPtr devSrc, float* devDest, int w, int h, int d) {

    const int numberOfThreads = blockDimX*blockDimY*blockDimZ;
    const int sharedSize = numberOfThreads*pixelsPerDimension*pixelsPerDimension*pixelsPerDimension;

    __shared__ float sharedData[sharedSize];

    const int localThreadIndex1D = blockDimX*blockDimY*threadIdx.z + blockDimX*threadIdx.y + threadIdx.x;
    const int blockIndex1D = gridDim.x*gridDim.y*blockIdx.z + gridDim.x*blockIdx.y + blockIdx.x;

    const int baseX = blockIdx.x * pixelsPerDimension * blockDimX + threadIdx.x;
    const int baseY = blockIdx.y * pixelsPerDimension * blockDimY + threadIdx.y;
    const int baseZ = blockIdx.z * pixelsPerDimension * blockDimZ + threadIdx.z;

    const size_t pitch = devSrc.pitch;
    const size_t slicePitch = pitch * h;

    // Load the block's tile into shared memory, padding out-of-bounds voxels with 0

    for (int k = 0; k != pixelsPerDimension; k++) {

        for (int j = 0; j != pixelsPerDimension; j++) {

            for (int i = 0; i != pixelsPerDimension; i++) {

                sharedData[blockDimX*pixelsPerDimension*blockDimY*pixelsPerDimension*(threadIdx.z + k*blockDimZ) + blockDimX*pixelsPerDimension*(threadIdx.y + j*blockDimY) + threadIdx.x + i*blockDimX] = (baseX + i*blockDimX < w && baseY + j*blockDimY < h && baseZ + k*blockDimZ < d) ? *((float*)((char*)devSrc.ptr + (baseZ + k*blockDimZ) * slicePitch + (baseY + j*blockDimY) * pitch) + baseX + i*blockDimX) : 0.0f;
            }
        }
    }

    __syncthreads();

    // Pairwise sum reduction in shared memory; each thread handles
    // s/numberOfThreads additions per step (at least one)

    int index;

    for (int s = sharedSize/2; s > 0; s /= 2) {

        int i = 0;

        do {

            index = localThreadIndex1D + i*numberOfThreads;

            if (index < s) {

                sharedData[index] += sharedData[index + s];
            }

            i++;
        }
        while (i < s/numberOfThreads);

        __syncthreads();
    }

    if (localThreadIndex1D == 0) {

        devDest[blockIndex1D] = sharedData[0];
    }
}
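
// A minimal host-side sketch for gpuReduce (the template arguments are
// illustrative, devPartial is a hypothetical device buffer of
// grid.x*grid.y*grid.z floats, and error checking is omitted):
//
//     dim3 block(8, 8, 8);
//     dim3 grid((w + 2*8 - 1) / (2*8), (h + 2*8 - 1) / (2*8), (d + 2*8 - 1) / (2*8));
//     gpuReduce<8, 8, 8, 2><<<grid, block>>>(devSrc, devPartial, w, h, d);
//     // Copy the grid.x*grid.y*grid.z partial sums back and sum them on the host.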

#endif // TGSTKCUDAOPERATIONS_H