TGSTK 0.0.1
The Tumour Growth Simulation ToolKit
tgstkCUDAOperations.h
/*==========================================================================

 This file is part of the Tumor Growth Simulation ToolKit (TGSTK)
 (<https://github.com/cormarte/TGSTK>, <https://cormarte.github.io/TGSTK>).

 Copyright (C) 2021 Corentin Martens

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program. If not, see <https://www.gnu.org/licenses/>.

 Contact: corentin.martens@ulb.be

==========================================================================*/

#ifndef TGSTKCUDAOPERATIONS_H
#define TGSTKCUDAOPERATIONS_H

#include <cuda_runtime.h>

__global__ void gpuAdd(cudaPitchedPtr devSrc1, cudaPitchedPtr devSrc2, cudaPitchedPtr devDest, int w, int h, int d);
__global__ void gpuConstMultiply(cudaPitchedPtr devSrc, float c, cudaPitchedPtr devDest, int w, int h, int d);
__global__ void gpuCopy(cudaPitchedPtr devSrc, cudaPitchedPtr devDest, int w, int h, int d);
__global__ void gpuDivide(cudaPitchedPtr devSrc1, cudaPitchedPtr devSrc2, cudaPitchedPtr devDest, int w, int h, int d);
__global__ void gpuMultiply(cudaPitchedPtr devSrc1, cudaPitchedPtr devSrc2, cudaPitchedPtr devDest, int w, int h, int d);
__global__ void gpuNorm(cudaPitchedPtr devSrcX, cudaPitchedPtr devSrcY, cudaPitchedPtr devSrcZ, cudaPitchedPtr devDest, int w, int h, int d);
__global__ void gpuSet(cudaPitchedPtr devDest, float value, int w, int h, int d);
__global__ void gpuSubstract(cudaPitchedPtr devSrc1, cudaPitchedPtr devSrc2, cudaPitchedPtr devDest, int w, int h, int d);
__global__ void gpuThreshold(cudaPitchedPtr devSrc, cudaPitchedPtr devDest, float threshold, int w, int h, int d);

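// Illustrative host-side launch sketch (an assumption for this listing, not
// part of the original TGSTK API): the element-wise kernels above are assumed
// to map one thread to one voxel of a w x h x d float volume allocated with
// cudaMalloc3D. The helper name and the 8x8x8 block size are made up for this
// example only.

#ifdef __CUDACC__
inline void exampleLaunchGpuAdd(cudaPitchedPtr devSrc1, cudaPitchedPtr devSrc2, cudaPitchedPtr devDest, int w, int h, int d) {

    // One thread per voxel, with the grid rounded up to cover the whole volume
    dim3 block(8, 8, 8);
    dim3 grid((w + block.x - 1)/block.x, (h + block.y - 1)/block.y, (d + block.z - 1)/block.z);

    gpuAdd<<<grid, block>>>(devSrc1, devSrc2, devDest, w, h, d);
    cudaDeviceSynchronize();
}
#endif // __CUDACC__
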
template<int blockDimX, int blockDimY, int blockDimZ, int pixelsPerThread, int kernelBlockRatio>
__global__ void gpuConvolutionX(cudaPitchedPtr devSrc, cudaPitchedPtr devDest, float* kernel, int kernelRadius, int w, int h, int d) {

    __shared__ float sharedData[blockDimX*(pixelsPerThread + 2*kernelBlockRatio)][blockDimY][blockDimZ]; // (pixelsPerThread + 2*kernelBlockRatio)*blockDimX*blockDimY*blockDimZ floats (tile + halos along x)


    // Base indices

    const int baseX = (blockIdx.x * pixelsPerThread - kernelBlockRatio)*blockDimX + threadIdx.x;
    const int baseY = blockIdx.y * blockDimY + threadIdx.y;
    const int baseZ = blockIdx.z * blockDimZ + threadIdx.z;

    const size_t pitch = devSrc.pitch;
    const size_t slicePitch = pitch * h;


    // Shared memory tile loading

    #pragma unroll
    for (int i = kernelBlockRatio; i != kernelBlockRatio + pixelsPerThread; i++) {

        sharedData[threadIdx.x + i*blockDimX][threadIdx.y][threadIdx.z] = (baseX + i*blockDimX < w && baseY < h && baseZ < d) ? *((float*)((char*)devSrc.ptr + baseZ * slicePitch + baseY * pitch) + baseX + i*blockDimX) : 0.0f;
    }


    // Shared memory left halo loading

    #pragma unroll
    for (int i = 0; i != kernelBlockRatio; i++) {

        sharedData[threadIdx.x + i*blockDimX][threadIdx.y][threadIdx.z] = (baseX + i*blockDimX >= 0 && baseY < h && baseZ < d) ? *((float*)((char*)devSrc.ptr + baseZ * slicePitch + baseY * pitch) + baseX + i*blockDimX) : 0.0f;
    }


    // Shared memory right halo loading

    #pragma unroll
    for (int i = kernelBlockRatio + pixelsPerThread; i != kernelBlockRatio + pixelsPerThread + kernelBlockRatio; i++) {

        sharedData[threadIdx.x + i*blockDimX][threadIdx.y][threadIdx.z] = (baseX + i*blockDimX < w && baseY < h && baseZ < d) ? *((float*)((char*)devSrc.ptr + baseZ * slicePitch + baseY * pitch) + baseX + i*blockDimX) : 0.0f;
    }

    __syncthreads();


    // Convolution

    #pragma unroll
    for (int i = kernelBlockRatio; i != kernelBlockRatio + pixelsPerThread; i++) {

        float sum = 0.0f;

        #pragma unroll
        for (int j = -kernelRadius; j != kernelRadius + 1; j++) {

            sum += kernel[kernelRadius + j] * sharedData[threadIdx.x + i*blockDimX + j][threadIdx.y][threadIdx.z];
        }

        if (baseX + i*blockDimX < w && baseY < h && baseZ < d) {

            *((float*)((char*)devDest.ptr + baseZ * slicePitch + baseY * pitch) + baseX + i*blockDimX) = sum;
        }
    }
}

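// The templated kernel above performs the x pass of a separable 3D
// convolution: each block stages a shared-memory tile of
// pixelsPerThread*blockDimX voxels along x, plus kernelBlockRatio*blockDimX
// halo voxels on each side, so a launch is only valid when
// kernelRadius <= kernelBlockRatio*blockDimX. gpuConvolutionY and
// gpuConvolutionZ below apply the same scheme along y and z. The sketch below
// is an illustrative instantiation only; the template parameters, the helper
// name and the assumption that the kernel pointer refers to device memory are
// choices made for this example, not TGSTK defaults.

#ifdef __CUDACC__
inline void exampleConvolveX(cudaPitchedPtr devSrc, cudaPitchedPtr devDest, float* devKernel, int kernelRadius, int w, int h, int d) {

    // 8x8x8 threads, 4 voxels per thread in x, one block-width halo on each side:
    // each block convolves a 32x8x8 tile and supports kernelRadius up to 8
    const int blockDimX = 8, blockDimY = 8, blockDimZ = 8;
    const int pixelsPerThread = 4, kernelBlockRatio = 1;

    dim3 block(blockDimX, blockDimY, blockDimZ);
    dim3 grid((w + pixelsPerThread*blockDimX - 1)/(pixelsPerThread*blockDimX), (h + blockDimY - 1)/blockDimY, (d + blockDimZ - 1)/blockDimZ);

    gpuConvolutionX<blockDimX, blockDimY, blockDimZ, pixelsPerThread, kernelBlockRatio><<<grid, block>>>(devSrc, devDest, devKernel, kernelRadius, w, h, d);
}
#endif // __CUDACC__
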
template<int blockDimX, int blockDimY, int blockDimZ, int pixelsPerThread, int kernelBlockRatio>
__global__ void gpuConvolutionY(cudaPitchedPtr devSrc, cudaPitchedPtr devDest, float* kernel, int kernelRadius, int w, int h, int d) {

    __shared__ float sharedData[blockDimX][blockDimY*(pixelsPerThread + 2*kernelBlockRatio)][blockDimZ]; // blockDimX*(pixelsPerThread + 2*kernelBlockRatio)*blockDimY*blockDimZ floats (tile + halos along y)


    // Base indices

    const int baseX = blockIdx.x * blockDimX + threadIdx.x;
    const int baseY = (blockIdx.y * pixelsPerThread - kernelBlockRatio)*blockDimY + threadIdx.y;
    const int baseZ = blockIdx.z * blockDimZ + threadIdx.z;

    const size_t pitch = devSrc.pitch;
    const size_t slicePitch = pitch * h;


    // Shared memory tile loading

    #pragma unroll
    for (int i = kernelBlockRatio; i != kernelBlockRatio + pixelsPerThread; i++) {

        sharedData[threadIdx.x][threadIdx.y + i*blockDimY][threadIdx.z] = (baseX < w && baseY + i*blockDimY < h && baseZ < d) ? *((float*)((char*)devSrc.ptr + baseZ * slicePitch + (baseY + i*blockDimY) * pitch) + baseX) : 0.0f;
    }


    // Shared memory left halo loading

    #pragma unroll
    for (int i = 0; i != kernelBlockRatio; i++) {

        sharedData[threadIdx.x][threadIdx.y + i*blockDimY][threadIdx.z] = (baseX < w && baseY + i*blockDimY >= 0 && baseZ < d) ? *((float*)((char*)devSrc.ptr + baseZ * slicePitch + (baseY + i*blockDimY) * pitch) + baseX) : 0.0f;
    }


    // Shared memory right halo loading

    #pragma unroll
    for (int i = kernelBlockRatio + pixelsPerThread; i != kernelBlockRatio + pixelsPerThread + kernelBlockRatio; i++) {

        sharedData[threadIdx.x][threadIdx.y + i*blockDimY][threadIdx.z] = (baseX < w && baseY + i*blockDimY < h && baseZ < d) ? *((float*)((char*)devSrc.ptr + baseZ * slicePitch + (baseY + i*blockDimY) * pitch) + baseX) : 0.0f;
    }

    __syncthreads();


    // Convolution

    #pragma unroll
    for (int i = kernelBlockRatio; i != kernelBlockRatio + pixelsPerThread; i++) {

        float sum = 0.0f;

        #pragma unroll
        for (int j = -kernelRadius; j != kernelRadius + 1; j++) {

            sum += kernel[kernelRadius + j] * sharedData[threadIdx.x][threadIdx.y + i*blockDimY + j][threadIdx.z];
        }

        if (baseX < w && baseY + i*blockDimY < h && baseZ < d) {

            *((float*)((char*)devDest.ptr + baseZ * slicePitch + (baseY + i*blockDimY) * pitch) + baseX) = sum;
        }
    }
}

template<int blockDimX, int blockDimY, int blockDimZ, int pixelsPerThread, int kernelBlockRatio>
__global__ void gpuConvolutionZ(cudaPitchedPtr devSrc, cudaPitchedPtr devDest, float* kernel, int kernelRadius, int w, int h, int d) {

    __shared__ float sharedData[blockDimX][blockDimY][blockDimZ*(pixelsPerThread + 2*kernelBlockRatio)]; // blockDimX*blockDimY*(pixelsPerThread + 2*kernelBlockRatio)*blockDimZ floats (tile + halos along z)


    // Base indices

    const int baseX = blockIdx.x * blockDimX + threadIdx.x;
    const int baseY = blockIdx.y * blockDimY + threadIdx.y;
    const int baseZ = (blockIdx.z * pixelsPerThread - kernelBlockRatio)*blockDimZ + threadIdx.z;

    const size_t pitch = devSrc.pitch;
    const size_t slicePitch = pitch * h;


    // Shared memory tile loading

    #pragma unroll
    for (int i = kernelBlockRatio; i != kernelBlockRatio + pixelsPerThread; i++) {

        sharedData[threadIdx.x][threadIdx.y][threadIdx.z + i*blockDimZ] = (baseX < w && baseY < h && baseZ + i*blockDimZ < d) ? *((float*)((char*)devSrc.ptr + (baseZ + i*blockDimZ) * slicePitch + baseY * pitch) + baseX) : 0.0f;
    }


    // Shared memory left halo loading

    #pragma unroll
    for (int i = 0; i != kernelBlockRatio; i++) {

        sharedData[threadIdx.x][threadIdx.y][threadIdx.z + i*blockDimZ] = (baseX < w && baseY < h && baseZ + i*blockDimZ >= 0) ? *((float*)((char*)devSrc.ptr + (baseZ + i*blockDimZ) * slicePitch + baseY * pitch) + baseX) : 0.0f;
    }


    // Shared memory right halo loading

    #pragma unroll
    for (int i = kernelBlockRatio + pixelsPerThread; i != kernelBlockRatio + pixelsPerThread + kernelBlockRatio; i++) {

        sharedData[threadIdx.x][threadIdx.y][threadIdx.z + i*blockDimZ] = (baseX < w && baseY < h && baseZ + i*blockDimZ < d) ? *((float*)((char*)devSrc.ptr + (baseZ + i*blockDimZ) * slicePitch + baseY * pitch) + baseX) : 0.0f;
    }

    __syncthreads();


    // Convolution

    #pragma unroll
    for (int i = kernelBlockRatio; i != kernelBlockRatio + pixelsPerThread; i++) {

        float sum = 0.0f;

        #pragma unroll
        for (int j = -kernelRadius; j != kernelRadius + 1; j++) {

            sum += kernel[kernelRadius + j] * sharedData[threadIdx.x][threadIdx.y][threadIdx.z + i*blockDimZ + j];
        }

        if (baseX < w && baseY < h && baseZ + i*blockDimZ < d) {

            *((float*)((char*)devDest.ptr + (baseZ + i*blockDimZ) * slicePitch + baseY * pitch) + baseX) = sum;
        }
    }
}

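// gpuReduce below computes one partial sum per block: each block stages a
// (pixelsPerDimension*blockDimX) x (pixelsPerDimension*blockDimY) x
// (pixelsPerDimension*blockDimZ) sub-volume in shared memory (out-of-range
// voxels are padded with zeros), performs an in-block tree reduction by
// repeatedly halving the active range, and thread 0 finally writes the block
// sum to devDest[blockIndex1D]. Note that the halving scheme is exact only
// when the shared buffer size (blockDimX*blockDimY*blockDimZ*pixelsPerDimension^3)
// is a power of two, and that the per-block partial sums still have to be
// combined afterwards (see the sketch after the kernel).
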
template<int blockDimX, int blockDimY, int blockDimZ, int pixelsPerDimension>
__global__ void gpuReduce(cudaPitchedPtr devSrc, float* devDest, int w, int h, int d) {

    const int numberOfThreads = blockDimX*blockDimY*blockDimZ;
    const int sharedSize = numberOfThreads*pixelsPerDimension*pixelsPerDimension*pixelsPerDimension;

    __shared__ float sharedData[sharedSize];

    const int localThreadIndex1D = blockDimX*blockDimY*threadIdx.z + blockDimX*threadIdx.y + threadIdx.x;
    const int blockIndex1D = gridDim.x*gridDim.y*blockIdx.z + gridDim.x*blockIdx.y + blockIdx.x;

    const int baseX = blockIdx.x * pixelsPerDimension * blockDimX + threadIdx.x;
    const int baseY = blockIdx.y * pixelsPerDimension * blockDimY + threadIdx.y;
    const int baseZ = blockIdx.z * pixelsPerDimension * blockDimZ + threadIdx.z;

    const size_t pitch = devSrc.pitch;
    const size_t slicePitch = pitch * h;


    // Copy

    #pragma unroll
    for (int k=0; k!=pixelsPerDimension; k++) {

        #pragma unroll
        for (int j=0; j!=pixelsPerDimension; j++) {

            #pragma unroll
            for (int i=0; i!=pixelsPerDimension; i++) {

                sharedData[blockDimX*pixelsPerDimension*blockDimY*pixelsPerDimension*(threadIdx.z + k*blockDimZ) + blockDimX*pixelsPerDimension*(threadIdx.y + j*blockDimY) + threadIdx.x + i*blockDimX] = (baseX + i*blockDimX < w && baseY + j*blockDimY < h && baseZ + k*blockDimZ < d) ? *((float*)((char*)devSrc.ptr + (baseZ + k*blockDimZ) * slicePitch + (baseY + j*blockDimY) * pitch) + baseX + i*blockDimX) : 0.0f;
            }
        }
    }

    __syncthreads();


    // Reduction

    int index, i;

    #pragma unroll
    for (int s=sharedSize/2; s>0; s/=2) {

        i=0;

        do {

            index = localThreadIndex1D + i*numberOfThreads;

            if (index < s) {

                sharedData[index] += sharedData[index+s];
            }

            i++;

        } while (i < s/numberOfThreads);

        __syncthreads(); // All partial sums of this stride must be visible to the whole block before the next halving
    }

    __syncthreads();

    if (localThreadIndex1D == 0) {

        devDest[blockIndex1D] = sharedData[0];
    }
}

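// Illustrative sketch of combining the per-block partial sums written by
// gpuReduce (an assumption for this listing, not part of the original TGSTK
// API): devDest holds gridDim.x*gridDim.y*gridDim.z partial sums, which are
// copied back and accumulated on the host here. The helper name and the
// host-side accumulation are choices made for this example only.

inline float exampleSumPartialSums(const float* devPartialSums, int numberOfBlocks) {

    float* partialSums = new float[numberOfBlocks];
    cudaMemcpy(partialSums, devPartialSums, numberOfBlocks*sizeof(float), cudaMemcpyDeviceToHost);

    float total = 0.0f;

    for (int b = 0; b != numberOfBlocks; b++) {

        total += partialSums[b];
    }

    delete[] partialSums;

    return total;
}
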
#endif // TGSTKCUDAOPERATIONS_H