Openholo  v4.0
Open Source Digital Holographic Library
ophSigKernel.cu
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install, copy or use the software.
//
//
//                           License Agreement
//                For Open Source Digital Holographic Library
//
// Openholo library is free software;
// you can redistribute it and/or modify it under the terms of the BSD 2-Clause license.
//
// Copyright (C) 2017-2024, Korea Electronics Technology Institute. All rights reserved.
// E-mail : contact.openholo@gmail.com
// Web : http://www.openholo.org
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
//    this list of conditions and the following disclaimer in the documentation
//    and/or other materials provided with the distribution.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the copyright holder or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
// This software contains open source software released under the GNU General Public License,
// NVIDIA Software License Agreement, or the CUDA Supplement to the Software License Agreement.
// Check whether the software you use contains such licensed software.
//
//M*/

#ifndef __ophSigKernel_cu
#define __ophSigKernel_cu

#include <cuda_runtime_api.h>
#include <cuda.h>
#include <cufft.h>
#include <device_launch_parameters.h>
#include <device_functions.h>
#include <npp.h>
#include "typedef.h"
#include "ophSig.h"

static const int kBlockThreads = 1024;

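// Element-wise normalization kernel: divides each complex sample of 'input' by nx*ny and writes
// the result to 'output'. Used after cuFFT inverse transforms, which are unnormalized.
// (The bNormalized flag is currently unused; the kernel always normalizes.)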
__global__ void cudaKernel_FFT(int nx, int ny, cufftDoubleComplex* input, cufftDoubleComplex* output, bool bNormalized)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	double normalF = nx * ny;

	if (tid < nx*ny)
	{
		output[tid].x = input[tid].x / normalF;
		output[tid].y = input[tid].y / normalF;
	}
}

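// Placeholder kernel for spatial-carrier generation; the body is currently empty aside from the
// unused thread index.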
__global__ void cudaKernel_CreateSphtialCarrier() {
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
}

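// Off-axis carrier removal: per pixel, builds the spatial carrier
// F = exp(j*(2*pi/wl)*(x*sin(angle[_X]) + y*sin(angle[_Y]))) from the sensor geometry in
// device_config and stores the real part of src_data * F into dst_data.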
__global__ void cudaKernel_sigCvtOFF(cuDoubleComplex *src_data, Real *dst_data, ophSigConfig *device_config, int nx, int ny, Real wl, cuDoubleComplex *F, Real *angle) {
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	int r = tid % ny;
	int c = tid / ny;
	Real x, y;
	if (tid < nx*ny)
	{
		x = (device_config->height / (ny - 1)*r - device_config->height / 2);
		y = (device_config->width / (nx - 1)*c - device_config->width / 2);
		F[tid].x = cos(((2 * M_PI) / wl)*(x * sin(angle[_X]) + y * sin(angle[_Y])));
		F[tid].y = sin(((2 * M_PI) / wl)*(x * sin(angle[_X]) + y * sin(angle[_Y])));
		dst_data[tid] = src_data[tid].x * F[tid].x - src_data[tid].y * F[tid].y;
	}
}

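// HPO (horizontal-parallax-only) filter construction: fills F with the chirp
// F = exp(Rephase*y^2) * exp(j*Imphase*y^2), where y is the frequency coordinate derived from the
// column index and the sensor width; results are written in fftshifted order.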
__global__ void cudaKernel_sigCvtHPO(ophSigConfig *device_config, cuDoubleComplex *F, int nx, int ny, Real Rephase, Real Imphase) {
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	int r = tid % ny;
	int c = tid / ny;
	int xshift = nx / 2;
	int yshift = ny / 2;
	Real y;

	if (tid < nx*ny)
	{
		int ii = (r + yshift) % ny;
		int jj = (c + xshift) % nx;
		y = (2 * M_PI * (c) / device_config->width - M_PI * (nx - 1) / device_config->width);
		F[ny*jj + ii].x = exp(Rephase*y * y)*cos(Imphase*y * y);
		F[ny*jj + ii].y = exp(Rephase*y * y)*sin(Imphase*y * y);
	}
}

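// Chromatic-aberration compensation filter: fills FFZP with an fftshifted Fresnel zone pattern
// exp(-j*sigmaf*(x^2 + y^2)) on a frequency grid scaled by 'radius'.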
__global__ void cudaKernel_sigCvtCAC(cuDoubleComplex *FFZP, ophSigConfig *device_config, int nx, int ny, Real sigmaf, Real radius) {
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	int r = tid % ny;
	int c = tid / ny;
	int xshift = nx >> 1;
	int yshift = ny >> 1;
	Real x, y;
	if (tid < nx*ny)
	{
		int ii = (r + yshift) % ny;
		int jj = (c + xshift) % nx;
		y = (2 * M_PI * c) / radius - (M_PI*(nx - 1)) / radius;
		x = (2 * M_PI * r) / radius - (M_PI*(ny - 1)) / radius;

		FFZP[ny*jj + ii].x = cos(sigmaf * (x*x + y * y));
		FFZP[ny*jj + ii].y = -sin(sigmaf * (x*x + y * y));
	}
}

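// Element-wise complex multiplication: dst_data = src_data * F.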
__global__ void cudaKernel_multiply(cufftDoubleComplex *src_data, cufftDoubleComplex *dst_data, cuDoubleComplex *F, int nx, int ny)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	if (tid < nx*ny)
	{
		dst_data[tid].x = src_data[tid].x * F[tid].x - src_data[tid].y * F[tid].y;
		dst_data[tid].y = src_data[tid].y * F[tid].x + src_data[tid].x * F[tid].y;
	}
}

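// Element-wise complex multiplication keeping only the real part: dst_data = Re(src_data * F).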
__global__ void cudaKernel_Realmultiply(cufftDoubleComplex *src_data, Real *dst_data, cuDoubleComplex *F, int nx, int ny)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	if (tid < nx*ny)
	{
		dst_data[tid] = src_data[tid].x * F[tid].x - src_data[tid].y * F[tid].y;
	}
}

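// Propagation transfer function: fills FH with an fftshifted kernel exp(j*sigmaf*(x^2 + y^2)),
// where the frequency coordinates are derived from the sensor height/width in device_config.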
__global__ void cudaKernel_Propagation(cuDoubleComplex *FH, ophSigConfig *device_config, int nx, int ny, Real sigmaf) {
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	int r = tid % ny;
	int c = tid / ny;
	int xshift = nx >> 1;
	int yshift = ny >> 1;
	Real x, y;

	if (tid < nx*ny)
	{
		int ii = (r + yshift) % ny;
		int jj = (c + xshift) % nx;
		x = (2 * M_PI * (c)) / device_config->height - (M_PI*(nx - 1)) / (device_config->height);
		y = (2 * M_PI * (r)) / device_config->width - (M_PI*(ny - 1)) / (device_config->width);
		FH[ny*jj + ii].x = cos(sigmaf * (x * x + y * y));
		FH[ny*jj + ii].y = sin(sigmaf * (x * x + y * y));
	}
}

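// First pass of the AT parameter estimation: builds a Gaussian frequency window
// G = exp(-pi*(wl/(2*pi*NA_g))^2*(x^2 + y^2)) and splits src_data into its real part (Flr) and
// imaginary part (Fli), each stored as a real-valued complex array.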
__global__ void cudaKernel_GetParamAT1(cuDoubleComplex *src_data, cuDoubleComplex *Flr, cuDoubleComplex *Fli, cuDoubleComplex *G, ophSigConfig *device_config, int nx, int ny, Real_t NA_g, Real wl)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;

	int r = tid % ny;
	int c = tid / ny;
	Real x, y;
	if (tid < nx*ny)
	{
		x = 2 * M_PI * (c) / device_config->height - M_PI * (nx - 1) / device_config->height;
		y = 2 * M_PI * (r) / device_config->width - M_PI * (ny - 1) / device_config->width;

		G[tid].x = exp(-M_PI * pow((wl) / (2 * M_PI * NA_g), 2) * (x * x + y * y));

		Flr[tid].x = src_data[tid].x;
		Fli[tid].x = src_data[tid].y;
		Flr[tid].y = 0;
		Fli[tid].y = 0;
	}
}

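// Second pass of the AT parameter estimation: with Gr = Flr*G and Gi = Fli*G, writes the
// normalized quantity ((Gr^2 - Gi^2) + j*2*Gr*Gi) / (Gr^2 + Gi^2 + 1e-300) into temp_data in
// fftshifted order; the tiny constant prevents division by zero.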
__global__ void cudaKernel_GetParamAT2(cuDoubleComplex *Flr, cuDoubleComplex *Fli, cuDoubleComplex *G, cuDoubleComplex *temp_data, int nx, int ny)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;

	int r = tid % ny;
	int c = tid / ny;
	int xshift = nx >> 1;
	int yshift = ny >> 1;
	int ii = (c + xshift) % nx;
	int jj = (r + yshift) % ny;

	if (tid < nx*ny)
	{
		Real Gr = Flr[tid].x * G[tid].x;
		Real Gi = Fli[tid].x * G[tid].x;
		Real denom = Gr * Gr + Gi * Gi + pow(10., -300);

		temp_data[ny*ii + jj].x = (Gr * Gr - Gi * Gi) / denom;
		temp_data[ny*ii + jj].y = (2 * Gr * Gi) / denom;
	}
}

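// Sharpness metric for auto-focus: per pixel, takes the absolute difference of the real part with
// the pixel two columns to the right (ret1) and two rows below (ret2); if ret1 exceeds th its
// square is stored in f, else if ret2 exceeds th its square is stored, otherwise f is 0.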
__global__ void cudaKernel_sigGetParamSF(cufftDoubleComplex *src_data, Real *f, int nx, int ny, float th)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	int r = tid % ny;
	int c = tid / ny;
	Real ret1 = 0, ret2 = 0;
	if (tid < nx*ny)
	{
		f[tid] = 0;
		if (r != ny - 2 && r != ny - 1)
		{
			if (c != nx - 2 && c != nx - 1)
			{
				ret1 = abs(src_data[r + (c + 2)*ny].x - src_data[tid].x);
				ret2 = abs(src_data[(r + 2) + c * ny].x - src_data[tid].x);
			}
		}
		if (ret1 >= th) { f[tid] = ret1 * ret1; }
		else if (ret2 >= th) { f[tid] = ret2 * ret2; }
	}
}

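// Subtracts the scalar *Min (a device pointer) from every element of data.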
__global__ void cudaKernel_sub(Real *data, int nx, int ny, Real *Min)
{
	int tid = threadIdx.x + blockIdx.x*blockDim.x;
	if (tid < nx*ny)
	{
		data[tid] = data[tid] - *Min;
	}
}

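// Divides every element of src_data by the scalar *Max (a device pointer) and stores the result as
// the real part of dst_data, with the imaginary part set to zero.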
__global__ void cudaKernel_div(Real *src_data, cuDoubleComplex *dst_data, int nx, int ny, Real *Max)
{
	int tid = threadIdx.x + blockIdx.x*blockDim.x;
	if (tid < nx*ny)
	{
		dst_data[tid].x = src_data[tid] / *Max;
		dst_data[tid].y = 0;
	}
}

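// Grid-stride fftshift: swaps quadrants so the zero-frequency component moves between the corner
// and the center, dividing by nx*ny when bNormalized is true (to normalize inverse FFT output).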
__global__ void fftShift(int N, int nx, int ny, cufftDoubleComplex* input, cufftDoubleComplex* output, bool bNormalized)
{
	int tid = threadIdx.x + blockIdx.x*blockDim.x;

	double normalF = 1.0;
	if (bNormalized == true)
		normalF = nx * ny;

	while (tid < N)
	{
		int i = tid % nx;
		int j = tid / nx;

		int ti = i - nx / 2; if (ti < 0) ti += nx;
		int tj = j - ny / 2; if (tj < 0) tj += ny;

		int oindex = tj * nx + ti;

		output[tid].x = input[oindex].x / normalF;
		output[tid].y = input[oindex].y / normalF;

		tid += blockDim.x * gridDim.x;
	}
}

extern "C"
{
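	// One-shot 2-D complex-to-complex FFT: creates a cuFFT plan, runs a forward (direction == -1)
	// or inverse transform, synchronizes, and destroys the plan. The bNormalized flag is not
	// applied here.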
	void cudaFFT(int nx, int ny, cufftDoubleComplex* in_field, cufftDoubleComplex* output_field, int direction, bool bNormalized)
	{
		unsigned int nblocks = (nx*ny + kBlockThreads - 1) / kBlockThreads;
		int N = nx * ny;
		cufftHandle plan;
		cufftResult result = cufftPlan2d(&plan, ny, nx, CUFFT_Z2Z);

		if (direction == -1)
			result = cufftExecZ2Z(plan, output_field, in_field, CUFFT_FORWARD);
		else
			result = cufftExecZ2Z(plan, output_field, in_field, CUFFT_INVERSE);

		if (result != CUFFT_SUCCESS)
		{
			LOG("------------------FAIL: execute cufft, code=%d", result);
			cufftDestroy(plan);
			return;
		}

		if (cudaDeviceSynchronize() != cudaSuccess) {
			LOG("Cuda error: Failed to synchronize\n");
			cufftDestroy(plan);
			return;
		}

		cufftDestroy(plan);
	}

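	// Forward 2-D FFT (src_data -> dst_data) using a caller-provided cuFFT plan; the result is
	// left unnormalized.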
	void cudaCuFFT(cufftHandle* plan, cufftDoubleComplex *src_data, cufftDoubleComplex *dst_data, int nx, int ny, int direction) {
		unsigned int nBlocks = (nx*ny + kBlockThreads - 1) / kBlockThreads;

		cufftResult result;
		result = cufftExecZ2Z(*plan, src_data, dst_data, CUFFT_FORWARD);

		if (result != CUFFT_SUCCESS)
			return;

		if (cudaDeviceSynchronize() != cudaSuccess)
			return;
	}

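	// Inverse 2-D FFT (src_data -> dst_data) using a caller-provided plan; cudaKernel_FFT then
	// divides the unnormalized result by nx*ny and writes the normalized field back into src_data.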
	void cudaCuIFFT(cufftHandle* plan, cufftDoubleComplex *src_data, cufftDoubleComplex *dst_data, int nx, int ny, int direction) {
		unsigned int nBlocks = (nx*ny + kBlockThreads - 1) / kBlockThreads;
		cufftResult result;

		result = cufftExecZ2Z(*plan, src_data, dst_data, CUFFT_INVERSE);
		cudaKernel_FFT << < nBlocks, kBlockThreads >> > (nx, ny, dst_data, src_data, direction);

		if (result != CUFFT_SUCCESS)
			return;

		if (cudaDeviceSynchronize() != cudaSuccess)
			return;
	}

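	// Off-axis conversion driver: removes the spatial carrier with cudaKernel_sigCvtOFF, then uses
	// NPP min/max reductions to rescale the real-valued result to [0, 1] and stores it back into
	// src_data as a complex field with zero imaginary part.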
	void cudaCvtOFF(cuDoubleComplex *src_data, Real *dst_data, ophSigConfig *device_config, int nx, int ny, Real wl, cuDoubleComplex *F, Real *angle)
	{
		unsigned int nBlocks = (nx * ny + kBlockThreads - 1) / kBlockThreads;

		cudaKernel_sigCvtOFF << < nBlocks, kBlockThreads >> > (src_data, dst_data, device_config, nx, ny, wl, F, angle);

		// The NPP reductions and the sub/div kernels read the scalar result through a device
		// pointer, so it is allocated with cudaMalloc rather than taken from a host variable.
		uchar *pDeviceBuffer;
		Real *pM;
		int nBufferSize;
		nppsSumGetBufferSize_64f(nx*ny, &nBufferSize);
		cudaMalloc((void**)&pDeviceBuffer, nBufferSize);
		cudaMalloc((void**)&pM, sizeof(Real));
		nppsMin_64f(dst_data, nx*ny, pM, pDeviceBuffer);
		cudaKernel_sub << < nBlocks, kBlockThreads >> > (dst_data, nx, ny, pM);
		nppsMax_64f(dst_data, nx*ny, pM, pDeviceBuffer);
		cudaKernel_div << < nBlocks, kBlockThreads >> > (dst_data, src_data, nx, ny, pM);
		cudaFree(pDeviceBuffer);
		cudaFree(pM);
	}

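	// HPO conversion driver: builds the HPO filter F on the given stream and multiplies src_data
	// by it.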
	void cudaCvtHPO(CUstream_st* stream, cufftDoubleComplex *src_data, cufftDoubleComplex *dst_data, ophSigConfig *device_config, cuDoubleComplex *F, int nx, int ny, Real Rephase, Real Imphase) {
		unsigned int nBlocks = (nx * ny + kBlockThreads - 1) / kBlockThreads;
		cudaKernel_sigCvtHPO << < nBlocks, kBlockThreads, 0, stream >> > (device_config, F, nx, ny, Rephase, Imphase);
		cudaKernel_multiply << < nBlocks, kBlockThreads, 0, stream >> > (src_data, dst_data, F, nx, ny);
	}

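	// CAC conversion driver: builds the Fresnel-zone filter FFZP and multiplies src_data by it.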
	void cudaCvtCAC(cufftDoubleComplex *src_data, cufftDoubleComplex *dst_data, cuDoubleComplex *FFZP, ophSigConfig *device_config, int nx, int ny, Real sigmaf, Real radius) {
		unsigned int nBlocks = (nx * ny + kBlockThreads - 1) / kBlockThreads;
		cudaKernel_sigCvtCAC << < nBlocks, kBlockThreads >> > (FFZP, device_config, nx, ny, sigmaf, radius);
		cudaKernel_multiply << < nBlocks, kBlockThreads >> > (src_data, dst_data, FFZP, nx, ny);
	}

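	// Propagation driver: builds the transfer function FH for the distance encoded in sigmaf and
	// multiplies the input spectrum src_data by it, writing the result to dst_data.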
	void cudaPropagation(cufftDoubleComplex *src_data, cufftDoubleComplex *dst_data, cuDoubleComplex *FH, ophSigConfig *device_config, int nx, int ny, Real sigmaf) {
		unsigned int nBlocks = (nx * ny + kBlockThreads - 1) / kBlockThreads;
		cudaKernel_Propagation << < nBlocks, kBlockThreads >> > (FH, device_config, nx, ny, sigmaf);
		cudaKernel_multiply << < nBlocks, kBlockThreads >> > (src_data, dst_data, FH, nx, ny);
	}
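	// AT parameter estimation, step 1: launches cudaKernel_GetParamAT1 (Gaussian window and
	// real/imaginary split).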
	void cudaGetParamAT1(cuDoubleComplex *src_data, cuDoubleComplex *Flr, cuDoubleComplex *Fli, cuDoubleComplex *G, ophSigConfig *device_config, int nx, int ny, Real_t NA_g, Real wl) {
		unsigned int nBlocks = (nx * ny + kBlockThreads - 1) / kBlockThreads;

		cudaKernel_GetParamAT1 << < nBlocks, kBlockThreads >> > (src_data, Flr, Fli, G, device_config, nx, ny, NA_g, wl);
	}

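	// AT parameter estimation, step 2: launches cudaKernel_GetParamAT2 to form the normalized
	// ratio in fftshifted order.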
	void cudaGetParamAT2(cuDoubleComplex *Flr, cuDoubleComplex *Fli, cuDoubleComplex *G, cuDoubleComplex *temp_data, int nx, int ny) {
		unsigned int nBlocks = (nx * ny + kBlockThreads - 1) / kBlockThreads;

		cudaKernel_GetParamAT2 << < nBlocks, kBlockThreads >> > (Flr, Fli, G, temp_data, nx, ny);
	}

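	// Sharpness-based auto-focus: sweeps sampN + 1 propagation distances between zMin and zMax,
	// propagates the field to each distance, sums the sharpness metric with an NPP reduction, and
	// returns the distance that maximizes it.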
	double cudaGetParamSF(cufftHandle *fftplan, cufftDoubleComplex *src_data, cufftDoubleComplex *temp_data, cufftDoubleComplex *dst_data, Real *f, cuDoubleComplex *FH, ophSigConfig *device_config, int nx, int ny, float zMax, float zMin, int sampN, float th, Real wl) {
		unsigned int nBlocks = (nx * ny + kBlockThreads - 1) / kBlockThreads;

		int nBufferSize;
		Real max = MIN_DOUBLE;
		Real depth = 0;
		Real nSumHost = 0;
		uchar *pDeviceBuffer;
		Real *pSum;

		cudaMalloc((void **)&pSum, sizeof(Real));
		nppsSumGetBufferSize_64f(nx*ny, &nBufferSize);
		cudaMalloc((void **)&pDeviceBuffer, nBufferSize);
		Real dz = (zMax - zMin) / sampN;
		for (int n = 0; n < sampN + 1; n++)
		{
			Real z = (n * dz + zMin);
			Real_t sigmaf = (z*wl) / (4 * M_PI);

			// Propagate to depth z and transform back to the spatial domain.
			cudaPropagation(src_data, dst_data, FH, device_config, nx, ny, sigmaf);
			cudaCuIFFT(fftplan, dst_data, temp_data, nx, ny, CUFFT_INVERSE);

			// Evaluate the sharpness metric and sum it over the whole frame.
			cudaKernel_sigGetParamSF << < nBlocks, kBlockThreads >> > (dst_data, f, nx, ny, th);
			nppsSum_64f(f, nx*ny, pSum, pDeviceBuffer);
			cudaMemcpy(&nSumHost, pSum, sizeof(Real), cudaMemcpyDeviceToHost);
			cout << (float)n / sampN * 100 << " %" << endl;

			// Track the depth with the largest accumulated sharpness.
			if (nSumHost > max) {
				max = nSumHost;
				depth = z;
			}
		}
		cudaFree(pDeviceBuffer);
		cudaFree(pSum);
		return depth;
	}
}

#endif