1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install, copy or use the software.
10 // For Open Source Digital Holographic Library
12 // Openholo library is free software;
13 // you can redistribute it and/or modify it under the terms of the BSD 2-Clause license.
15 // Copyright (C) 2017-2024, Korea Electronics Technology Institute. All rights reserved.
16 // E-mail : contact.openholo@gmail.com
17 // Web : http://www.openholo.org
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
22 // 1. Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
25 // 2. Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the copyright holder or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
40 // This software contains opensource software released under GNU Generic Public License,
41 // NVDIA Software License Agreement, or CUDA supplement to Software License Agreement.
42 // Check whether software you use contains licensed software.
47 #ifndef __ophSigKernel_cu
48 #define __ophSigKernel_cu
50 #include <cuda_runtime_api.h>
53 #include <device_launch_parameters.h>
54 #include <device_functions.h>
59 static const int kBlockThreads = 1024;
62 __global__ void cudaKernel_FFT(int nx, int ny, cufftDoubleComplex* input, cufftDoubleComplex* output, bool bNormailzed)
64 int tid = threadIdx.x + blockIdx.x * blockDim.x;
65 double normalF = nx * ny;
67 output[tid].x = input[tid].x / normalF;
68 output[tid].y = input[tid].y / normalF;
71 __global__ void cudaKernel_CreateSphtialCarrier() {
72 int tid = threadIdx.x + blockIdx.x * blockDim.x;
75 //__global__ void cudaKernel_sigCvtOFF(cuDoubleComplex *src_data, Real *dst_data, ophSigConfig *device_config, int nx, int ny, Real wl, cuDoubleComplex *F, Real *angle) {
76 __global__ void cudaKernel_sigCvtOFF(cuDoubleComplex *src_data, Real *dst_data, ophSigConfig *device_config, int nx, int ny, Real wl, cuDoubleComplex*F, Real *angle) {
77 int tid = threadIdx.x + blockIdx.x * blockDim.x;
83 x = (device_config->height / (ny - 1)*r - device_config->height / 2);
84 y = (device_config->width / (nx - 1)*c - device_config->width / 2);
85 F[tid].x = cos(((2 * M_PI) / wl)*(x * sin(angle[_X]) + y * sin(angle[_Y])));
86 F[tid].y = sin(((2 * M_PI) / wl)*(x * sin(angle[_X]) + y * sin(angle[_Y])));
87 dst_data[tid] = src_data[tid].x * F[tid].x - src_data[tid].y * F[tid].y;
90 //__global__ void cudaKernel_sigCvtHPO(ophSigConfig *device_config, cuDoubleComplex *F, int nx, int ny, Real Rephase, Real Imphase) {
91 __global__ void cudaKernel_sigCvtHPO(ophSigConfig *device_config, cuDoubleComplex *F, int nx, int ny, Real Rephase, Real Imphase) {
92 int tid = threadIdx.x + blockIdx.x * blockDim.x;
101 int ii = (r + yshift) % ny;
102 int jj = (c + xshift) % nx;
103 y = (2 * M_PI * (c) / device_config->width - M_PI * (nx - 1) / device_config->width);
104 F[ny*jj + ii].x = exp(Rephase*y * y)*cos(Imphase*y * y);
105 F[ny*jj + ii].y = exp(Rephase*y * y)*sin(Imphase*y * y);
109 __global__ void cudaKernel_sigCvtCAC(cuDoubleComplex*FFZP, ophSigConfig *device_config, int nx, int ny, Real sigmaf, Real radius) {
110 int tid = threadIdx.x + blockIdx.x * blockDim.x;
113 int xshift = nx >> 1;
114 int yshift = ny >> 1;
118 int ii = (r + yshift) % ny;
119 int jj = (c + xshift) % nx;
120 y = (2 * M_PI * c) / radius - (M_PI*(nx - 1)) / radius;
121 x = (2 * M_PI * r) / radius - (M_PI*(ny - 1)) / radius;
123 FFZP[ny*jj + ii].x = cos(sigmaf * (x*x + y * y));
124 FFZP[ny*jj + ii].y = -sin(sigmaf * (x*x + y * y));
130 //__global__ void cudaKernel_multiply(cufftDoubleComplex *src_data, cufftDoubleComplex *dst_data, cuDoubleComplex *F, int nx, int ny) {
131 __global__ void cudaKernel_multiply(cufftDoubleComplex *src_data, cufftDoubleComplex *dst_data, cuDoubleComplex *F, int nx, int ny)
133 int tid = threadIdx.x + blockIdx.x * blockDim.x;
136 dst_data[tid].x = src_data[tid].x * F[tid].x - src_data[tid].y * F[tid].y;
137 dst_data[tid].y = src_data[tid].y * F[tid].x + src_data[tid].x * F[tid].y;
142 //__global__ void cudaKernel_Realmultiply(cufftDoubleComplex *src_data, Real *dst_data, cuDoubleComplex *F, int nx, int ny) {
143 __global__ void cudaKernel_Realmultiply(cufftDoubleComplex *src_data, Real *dst_data, cuDoubleComplex *F, int nx, int ny)
145 int tid = threadIdx.x + blockIdx.x * blockDim.x;
148 dst_data[tid] = src_data[tid].x * F[tid].x - src_data[tid].y * F[tid].y;
153 __global__ void cudaKernel_Propagation(cuDoubleComplex*FH, ophSigConfig *device_config, int nx, int ny, Real sigmaf) {
154 int tid = threadIdx.x + blockIdx.x * blockDim.x;
157 int xshift = nx >> 1;
158 int yshift = ny >> 1;
159 int ii = (r + yshift) % ny;
160 int jj = (c + xshift) % nx;
163 x = (2 * M_PI * (c)) / device_config->height - (M_PI*(nx - 1)) / (device_config->height);
164 y = (2 * M_PI * (r)) / device_config->width - (M_PI*(ny - 1)) / (device_config->width);
165 FH[ny*jj + ii].x = cos(sigmaf * (x * x + y * y));
166 FH[ny*jj + ii].y = sin(sigmaf * (x * x + y * y));
169 __global__ void cudaKernel_GetParamAT1(cuDoubleComplex*src_data, cuDoubleComplex*Flr, cuDoubleComplex*Fli, cuDoubleComplex*G, ophSigConfig *device_config, int nx, int ny, Real_t NA_g, Real wl)
171 int tid = threadIdx.x + blockIdx.x * blockDim.x;
178 x = 2 * M_PI * (c) / device_config->height - M_PI * (nx - 1) / device_config->height;
179 y = 2 * M_PI * (r) / device_config->width - M_PI * (ny - 1) / device_config->width;
181 G[tid].x = exp(-M_PI * pow((wl) / (2 * M_PI * NA_g), 2) * (x * x + y * y));
183 Flr[tid].x = src_data[tid].x;
184 Fli[tid].x = src_data[tid].y;
190 __global__ void cudaKernel_GetParamAT2(cuDoubleComplex*Flr, cuDoubleComplex*Fli, cuDoubleComplex*G, cuDoubleComplex *temp_data, int nx, int ny)
192 int tid = threadIdx.x + blockIdx.x * blockDim.x;
196 int xshift = nx >> 1;
197 int yshift = ny >> 1;
198 int ii = (c + xshift) % nx;
199 int jj = (r + yshift) % ny;
203 temp_data[ny*ii + jj].x =
205 ((Flr[tid].x * G[tid].x * Flr[tid].x * G[tid].x) -
206 (Fli[tid].x * G[tid].x * Fli[tid].x * G[tid].x)) /
207 ((Flr[tid].x * G[tid].x * Flr[tid].x * G[tid].x) +
208 (Fli[tid].x * G[tid].x * Fli[tid].x * G[tid].x) +
212 temp_data[ny*ii + jj].y =
213 (2 * Flr[tid].x * G[tid].x * (Fli[tid].x * G[tid].x)) /
214 ((Flr[tid].x * G[tid].x * Flr[tid].x * G[tid].x) +
215 (Fli[tid].x * G[tid].x * Fli[tid].x * G[tid].x) +
221 __global__ void cudaKernel_sigGetParamSF(cufftDoubleComplex *src_data, Real *f, int nx, int ny, float th)
223 int tid = threadIdx.x + blockIdx.x * blockDim.x;
226 Real ret1 = 0, ret2 = 0;
230 if (r != ny - 2 && r != ny - 1)
232 if (c != nx - 2 && c != nx - 1)
234 ret1 = abs(src_data[r + (c + 2)*ny].x - src_data[tid].x);
235 ret2 = abs(src_data[(r + 2) + c * ny].x - src_data[tid].x);
238 if (ret1 >= th) { f[tid] = ret1 * ret1; }
239 else if (ret2 >= th) { f[tid] = ret2 * ret2; }
243 __global__ void cudaKernel_sub(Real *data, int nx, int ny, Real *Min)
245 int tid = threadIdx.x + blockIdx.x*blockDim.x;
246 data[tid] = data[tid] - *Min;
249 __global__ void cudaKernel_div(Real *src_data, cuDoubleComplex *dst_data, int nx, int ny, Real *Max)
251 int tid = threadIdx.x + blockIdx.x*blockDim.x;
252 dst_data[tid].x = src_data[tid] / *Max;
255 __global__ void fftShift(int N, int nx, int ny, cufftDoubleComplex* input, cufftDoubleComplex* output, bool bNormailzed)
257 int tid = threadIdx.x + blockIdx.x*blockDim.x;
258 /*double normalF = 1.0;
259 if (bNormailzed == true)
267 int ii = (c + xshift) % nx;
268 int jj = (r + yshift) % ny;
270 output[ny*jj + ii].x = input[tid].x;
271 output[ny*jj + ii].y = input[tid].y;*/
272 double normalF = 1.0;
273 if (bNormailzed == true)
281 int ti = i - nx / 2; if (ti < 0) ti += nx;
282 int tj = j - ny / 2; if (tj < 0) tj += ny;
284 int oindex = tj * nx + ti;
287 output[tid].x = input[oindex].x / normalF;
288 output[tid].y = input[oindex].y / normalF;
290 tid += blockDim.x * gridDim.x;
296 void cudaFFT(int nx, int ny, cufftDoubleComplex* in_field, cufftDoubleComplex* output_field, int direction, bool bNormalized)
298 unsigned int nblocks = (nx*ny + kBlockThreads - 1) / kBlockThreads;
301 cufftResult result = cufftPlan2d(&plan, ny, nx, CUFFT_Z2Z);
304 result = cufftExecZ2Z(plan, output_field, in_field, CUFFT_FORWARD);
306 result = cufftExecZ2Z(plan, output_field, in_field, CUFFT_INVERSE);
308 if (result != CUFFT_SUCCESS)
310 LOG("------------------FAIL: execute cufft, code=%s", result);
314 if (cudaDeviceSynchronize() != cudaSuccess) {
315 LOG("Cuda error: Failed to synchronize\n");
323 void cudaCuFFT(cufftHandle* plan, cufftDoubleComplex *src_data, cufftDoubleComplex *dst_data, int nx, int ny, int direction) {
324 unsigned int nBlocks = (nx*ny + kBlockThreads - 1) / kBlockThreads;
328 result = cufftExecZ2Z(*plan, src_data, dst_data, CUFFT_FORWARD);
330 if (result != CUFFT_SUCCESS)
333 if (cudaDeviceSynchronize() != cudaSuccess)
339 void cudaCuIFFT(cufftHandle* plan, cufftDoubleComplex *src_data, cufftDoubleComplex *dst_data, int nx, int ny, int direction) {
340 unsigned int nBlocks = (nx*ny + kBlockThreads - 1) / kBlockThreads;
343 result = cufftExecZ2Z(*plan, src_data, dst_data, CUFFT_INVERSE);
344 cudaKernel_FFT << < nBlocks, kBlockThreads >> > (nx, ny, dst_data, src_data, direction);
346 if (result != CUFFT_SUCCESS)
349 if (cudaDeviceSynchronize() != cudaSuccess)
354 void cudaCvtOFF(cuDoubleComplex *src_data, Real *dst_data, ophSigConfig *device_config, int nx, int ny, Real wl, cuDoubleComplex*F, Real *angle)
356 unsigned int nBlocks = (nx * ny + kBlockThreads - 1) / kBlockThreads;
359 cudaKernel_sigCvtOFF << < nBlocks, kBlockThreads >> > (src_data, dst_data, device_config, nx, ny, wl, F, angle);
360 int tid = threadIdx.x + blockIdx.x*blockDim.x;
361 uchar * pDeviceBuffer;
364 nppsSumGetBufferSize_64f(nx*ny, &nBufferSize);
365 cudaMalloc((void**)(&pDeviceBuffer), nBufferSize);
366 nppsMin_64f(dst_data, nx*ny, &pM, pDeviceBuffer);
367 cudaKernel_sub << < nBlocks, kBlockThreads >> > (dst_data, nx, ny, &pM);
368 nppsMax_64f(dst_data, nx*ny, &pM, pDeviceBuffer);
369 cudaKernel_div << < nBlocks, kBlockThreads >> > (dst_data, src_data, nx, ny, &pM);
370 cudaFree(pDeviceBuffer);
374 void cudaCvtHPO(CUstream_st* stream, cufftDoubleComplex *src_data, cufftDoubleComplex *dst_data, ophSigConfig *device_config, cuDoubleComplex*F, int nx, int ny, Real Rephase, Real Imphase) {
375 unsigned int nBlocks = (nx * ny + kBlockThreads - 1) / kBlockThreads;
376 cudaKernel_sigCvtHPO << < nBlocks, kBlockThreads, 0, stream >> > (device_config, F, nx, ny, Rephase, Imphase);
377 cudaKernel_multiply << < nBlocks, kBlockThreads, 0, stream >> > (src_data, dst_data, F, nx, ny);
380 void cudaCvtCAC(cufftDoubleComplex *src_data, cufftDoubleComplex *dst_data, cuDoubleComplex *FFZP, ophSigConfig *device_config, int nx, int ny, Real sigmaf, Real radius) {
381 unsigned int nBlocks = (nx * ny + kBlockThreads - 1) / kBlockThreads;
382 cudaKernel_sigCvtCAC << < nBlocks, kBlockThreads >> > (FFZP, device_config, nx, ny, sigmaf, radius);
383 cudaKernel_multiply << < nBlocks, kBlockThreads >> > (src_data, dst_data, FFZP, nx, ny);
386 void cudaPropagation(cufftDoubleComplex *src_data, cufftDoubleComplex *dst_data, cuDoubleComplex *FH, ophSigConfig *device_config, int nx, int ny, Real sigmaf) {
387 unsigned int nBlocks = (nx * ny + kBlockThreads - 1) / kBlockThreads;
388 cudaKernel_Propagation << < nBlocks, kBlockThreads >> > (FH, device_config, nx, ny, sigmaf);
389 cudaKernel_multiply << < nBlocks, kBlockThreads >> > (src_data, dst_data, FH, nx, ny);
392 void cudaGetParamAT1(cuDoubleComplex *src_data, cuDoubleComplex *Flr, cuDoubleComplex *Fli, cuDoubleComplex *G, ophSigConfig *device_config, int nx, int ny, Real_t NA_g, Real wl) {
393 unsigned int nBlocks = (nx * ny + kBlockThreads - 1) / kBlockThreads;
395 cudaKernel_GetParamAT1 << < nBlocks, kBlockThreads >> > (src_data, Flr, Fli, G, device_config, nx, ny, NA_g, wl);
400 void cudaGetParamAT2(cuDoubleComplex *Flr, cuDoubleComplex *Fli, cuDoubleComplex *G, cuDoubleComplex *temp_data, int nx, int ny) {
401 unsigned int nBlocks = (nx * ny + kBlockThreads - 1) / kBlockThreads;
403 cudaKernel_GetParamAT2 << < nBlocks, kBlockThreads >> > (Flr, Fli, G, temp_data, nx, ny);
409 double cudaGetParamSF(cufftHandle *fftplan, cufftDoubleComplex *src_data, cufftDoubleComplex *temp_data, cufftDoubleComplex *dst_data, Real *f, cuDoubleComplex *FH, ophSigConfig *device_config, int nx, int ny, float zMax, float zMin, int sampN, float th, Real wl) {
410 unsigned int nBlocks = (nx * ny + kBlockThreads - 1) / kBlockThreads;
413 Real max = MIN_DOUBLE;
416 uchar * pDeviceBuffer;
419 cudaMalloc((void **)(&pSum), sizeof(Real));
420 nppsSumGetBufferSize_64f(nx*ny, &nBufferSize);
421 cudaMalloc((void**)(&pDeviceBuffer), nBufferSize);
422 Real dz = (zMax - zMin) / sampN;
423 for (int n = 0; n < sampN + 1; n++)
426 Real z = ((n)* dz + zMin);
427 Real_t sigmaf = (z*wl) / (4 * M_PI);
429 cudaPropagation(src_data, dst_data, FH, device_config, nx, ny, sigmaf);
430 cudaCuIFFT(fftplan, dst_data, temp_data, nx, ny, CUFFT_INVERSE);
432 cudaKernel_sigGetParamSF << < nBlocks, kBlockThreads >> > (dst_data, f, nx, ny, th);
433 //// //////sum matrix
435 nppsSum_64f(f, nx*ny, pSum, pDeviceBuffer);
436 cudaMemcpy(&nSumHost, pSum, sizeof(Real), cudaMemcpyDeviceToHost);
437 cout << (float)n / sampN * 100 << " %" << endl;
439 if (nSumHost > max) {
444 cudaFree(pDeviceBuffer);