Openholo  v4.0
Open Source Digital Holographic Library
ophWRPKernel.cu
Go to the documentation of this file.
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install, copy or use the software.
7 //
8 //
9 // License Agreement
10 // For Open Source Digital Holographic Library
11 //
12 // Openholo library is free software;
13 // you can redistribute it and/or modify it under the terms of the BSD 2-Clause license.
14 //
15 // Copyright (C) 2017-2024, Korea Electronics Technology Institute. All rights reserved.
16 // E-mail : contact.openholo@gmail.com
17 // Web : http://www.openholo.org
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 // 1. Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
24 //
25 // 2. Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the copyright holder or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 // This software contains opensource software released under GNU Generic Public License,
41 // NVDIA Software License Agreement, or CUDA supplement to Software License Agreement.
42 // Check whether software you use contains licensed software.
43 //
44 //M*/
45 
46 #ifndef ophWRPKernel_cu__
47 #define ophWRPKernel_cu__
48 #include "ophKernel.cuh"
49 #include <cuda_runtime_api.h>
50 #include <cuda.h>
51 #include <curand_kernel.h>
52 #include <curand_uniform.h>
53 #include <device_launch_parameters.h>
54 #include "ophWRP_GPU.h"
55 
56 __global__ void cudaKernel_CalcDataWRP(cufftDoubleComplex *src, const WRPGpuConst* config)
57 {
58  ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
59  __shared__ double ppX;
60  __shared__ double ppY;
61  __shared__ int pnX;
62  __shared__ int pnY;
63  __shared__ int pnXY;
64  __shared__ double ssX;
65  __shared__ double ssY;
66  __shared__ double z;
67  __shared__ double v;
68  __shared__ double lambda;
69  __shared__ double distance;
70  __shared__ double pi2;
71 
72  if (threadIdx.x == 0)
73  {
74  ppX = config->pp_X;
75  ppY = config->pp_Y;
76  pnX = config->pn_X;
77  pnY = config->pn_Y;
78  pnXY = pnX * pnY;
79  ssX = pnX * ppX * 2;
80  ssY = pnY * ppY * 2;
81  lambda = config->lambda;
82  distance = config->propa_d;
83  pi2 = config->pi2;
84  z = distance * pi2;
85  v = 1 / (lambda * lambda);
86  }
87  __syncthreads();
88 
89  if (tid < pnXY * 4)
90  {
91  int pnX2 = pnX * 2;
92 
93  int w = tid % pnX2;
94  int h = tid / pnX2;
95 
96  double fy = (-pnY + h) / ssY;
97  double fyy = fy * fy;
98  double fx = (-pnX + w) / ssX;
99  double fxx = fx * fx;
100  double sqrtpart = sqrt(v - fxx - fyy);
101 
102  cuDoubleComplex prop;
103  prop.x = 0;
104  prop.y = z * sqrtpart;
105 
106  exponent_complex(&prop);
107 
108  cuDoubleComplex val;
109  val.x = src[tid].x;
110  val.y = src[tid].y;
111 
112  cuDoubleComplex val2 = cuCmul(val, prop);
113  src[tid].x = val2.x;
114  src[tid].y = val2.y;
115  }
116 }
117 
118 __global__ void cudaKernel_MoveDataPostWRP(cuDoubleComplex *src, cuDoubleComplex *dst, const WRPGpuConst* config)
119 {
120  ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
121 
122  __shared__ int pnX;
123  __shared__ int pnY;
124  __shared__ ulonglong pnXY;
125 
126  if (threadIdx.x == 0)
127  {
128  pnX = config->pn_X;
129  pnY = config->pn_Y;
130  pnXY = pnX * pnY;
131  }
132  __syncthreads();
133 
134  if (tid < pnXY)
135  {
136  int w = tid % pnX;
137  int h = tid / pnX;
138  ulonglong iSrc = pnX * 2 * (pnY / 2 + h) + pnX / 2;
139 
140  dst[tid] = src[iSrc + w];
141  }
142 }
143 
144 __global__ void cudaKernel_MoveDataPreWRP(cuDoubleComplex *src, cuDoubleComplex *dst, const WRPGpuConst* config)
145 {
146  ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
147 
148  __shared__ int pnX;
149  __shared__ int pnY;
150  __shared__ ulonglong pnXY;
151 
152  if (threadIdx.x == 0)
153  {
154  pnX = config->pn_X;
155  pnY = config->pn_Y;
156  pnXY = pnX * pnY;
157  }
158  __syncthreads();
159 
160  if (tid < pnXY)
161  {
162  int w = tid % pnX;
163  int h = tid / pnX;
164  ulonglong iDst = pnX * 2 * (pnY / 2 + h) + pnX / 2;
165  dst[iDst + w] = src[tid];
166  }
167 }
168 
169 //__global__ void cudaKernel_GenWRP(Real* pc_dst, Real* amp_dst, const WRPGpuConst* config, const int n_points_stream, cuDoubleComplex* dst)
170 __global__
171 void cudaKernel_GenWRP(Vertex* pc_dst, const WRPGpuConst* config, const int n_points_stream, cuDoubleComplex* dst)
172 {
173  ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
174 
175  if (tid < n_points_stream)
176  {
177  __shared__ double ppX;
178  __shared__ double ppY;
179  __shared__ int pnX;
180  __shared__ int pnY;
181  __shared__ double dz;
182  __shared__ double dzz;
183  __shared__ double pi2;
184  __shared__ double k;
185  __shared__ double lambda;
186  __shared__ bool random_phase;
187  __shared__ bool sign;
188 
189  if (threadIdx.x == 0) {
190  ppX = config->pp_X;
191  ppY = config->pp_Y;
192  pnX = config->pn_X;
193  pnY = config->pn_Y;
194  dz = config->wrp_d - config->zmax;
195  dzz = dz * dz;
196  k = config->k;
197  lambda = config->lambda;
198  pi2 = config->pi2;
199  random_phase = config->bRandomPhase;
200  sign = dz > 0.0 ? true : false;
201  }
202  __syncthreads();
203 
204  //int idx = tid * 3;
205  double x = pc_dst[tid].point.pos[_X];
206  double y = pc_dst[tid].point.pos[_Y];
207  double z = pc_dst[tid].point.pos[_Z];
208  double amp = pc_dst[tid].color.color[_R + config->iAmplitude];
209 
210  int hpnX = pnX >> 1;
211  int hpnY = pnY >> 1;
212  double ppXX = ppX * ppX * 2;
213  //double dz = config->wrp_d - config->zmax;
214  double tw = fabs(lambda * dz / ppXX) * 2;
215 
216  int w = (int)tw;
217  int tx = (int)(x / ppX) + hpnX;
218  int ty = (int)(y / ppY) + hpnY;
219 
220  curandState state;
221  if (random_phase)
222  {
223  curand_init(4 * w * w, 0, 0, &state);
224  }
225 
226  for (int wy = -w; wy < w; wy++)
227  {
228  double dy = wy * ppY;
229  double dyy = dy * dy;
230  int tmpY = wy + ty;
231  int baseY = tmpY * pnX;
232 
233  for (int wx = -w; wx < w; wx++) //WRP coordinate
234  {
235  int tmpX = wx + tx;
236 
237  if (tmpX >= 0 && tmpX < pnX && tmpY >= 0 && tmpY < pnY) {
238  int iDst = tmpX + baseY;
239 
240  double dx = wx * ppX;
241 
242  double r = sign ? sqrt(dx * dx + dyy + dzz) : -sqrt(dx * dx + dyy + dzz);
243  double randomData = random_phase ? curand_uniform_double(&state) : 1.0;
244  double randVal = randomData * pi2;
245 
246  cuDoubleComplex tmp;
247  tmp.x = (amp * cos(k*r) * cos(randVal)) / r;
248  tmp.y = (-amp * sin(k*r) * sin(randVal)) / r;
249 
250 #if defined(__cplusplus) && defined(__CUDACC__)
251 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
252  dst[iDst].x = atomicAdd(&dst[iDst].x, tmp.x);
253  dst[iDst].y = atomicAdd(&dst[iDst].y, tmp.y);
254 
255 #else
256  dst[iDst].x += tmp.x; // <-- sync problem
257  dst[iDst].y += tmp.y; // <-- sync problem
258 #endif
259 #else
260  dst[iDst].x += tmp.x; // <-- sync problem
261  dst[iDst].y += tmp.y; // <-- sync problem
262 #endif
263  }
264  }
265  }
266  }
267 }
268 
269 extern "C"
270 {
271  void cudaFresnelPropagationWRP(
272  const int &nBlocks, const int&nBlocks2, const int &nThreads, const int &nx, const int &ny,
273  cuDoubleComplex *src, cuDoubleComplex *dst, cufftDoubleComplex *fftsrc, cufftDoubleComplex *fftdst,
274  const WRPGpuConst* cuda_config)
275  {
276  cudaKernel_MoveDataPreWRP << <nBlocks, nThreads >> > (src, dst, cuda_config);
277 
278  cudaFFT(nullptr, nx * 2, ny * 2, dst, fftsrc, CUFFT_FORWARD, false);
279 
280  cudaKernel_CalcDataWRP << <nBlocks2, nThreads >> > (fftsrc, cuda_config);
281 
282  cudaFFT(nullptr, nx * 2, ny * 2, fftsrc, fftdst, CUFFT_INVERSE, true);
283 
284  cudaKernel_MoveDataPostWRP << <nBlocks, nThreads >> > (fftdst, src, cuda_config);
285  }
286 
287  void cudaGenWRP(
288  const int &nBlocks, const int &nThreads, const int &n_pts_per_stream,
289  //Real* cuda_pc_data, Real* cuda_amp_data,
290  Vertex* cuda_pc_data,
291  cuDoubleComplex* cuda_dst, const WRPGpuConst* cuda_config)
292  {
293  //cudaKernel_GenWRP << <nBlocks, nThreads >> > (cuda_pc_data, cuda_amp_data, cuda_config, n_pts_per_stream, cuda_dst);
294  cudaKernel_GenWRP << <nBlocks, nThreads >> > (cuda_pc_data, cuda_config, n_pts_per_stream, cuda_dst);
295  }
296 }
297 
298 #endif // !OphWRPKernel_cu__