57 const long long int pnXY = pnX * pnY;
60 const int nX = num_image[
_X];
61 const int nY = num_image[
_Y];
62 const int N = nX * nY;
63 const int rX = resolution_image[
_X];
64 const int rY = resolution_image[
_Y];
65 const long long int R = rX * rY;
66 const long long int NR = N * R;
67 const Real distance = distanceRS2Holo;
72 uchar1** device_LF =
nullptr;
73 uchar** device_LFData =
nullptr;
74 cufftDoubleComplex* device_FFT_src =
nullptr;
75 cufftDoubleComplex* device_FFT_dst =
nullptr;
76 cufftDoubleComplex *device_dst =
nullptr;
77 cufftDoubleComplex *device_FFT_tmp =
nullptr;
78 cufftDoubleComplex *device_FFT_tmp2 =
nullptr;
79 cufftDoubleComplex *device_FFT_tmp3 =
nullptr;
84 LOG(
"%s (Memory Allocation) : ", __FUNCTION__);
86 HANDLE_ERROR(cudaMalloc((
void **)& device_LF,
sizeof(uchar1*) * N));
87 device_LFData =
new uchar*[N];
90 for (
int i = 0; i < N; i++)
92 int size = m_vecImgSize[i];
93 HANDLE_ERROR(cudaMalloc((
void**)&device_LFData[i],
sizeof(uchar1) * size));
94 HANDLE_ERROR(cudaMemcpy(device_LFData[i], m_vecImages[i],
sizeof(
uchar) * size, cudaMemcpyHostToDevice));
96 HANDLE_ERROR(cudaMemcpy(device_LF, device_LFData,
sizeof(
uchar*) * N, cudaMemcpyHostToDevice));
99 HANDLE_ERROR(cudaMalloc((
void**)&device_FFT_src,
sizeof(cufftDoubleComplex) * NR));
100 HANDLE_ERROR(cudaMalloc((
void**)&device_FFT_dst,
sizeof(cufftDoubleComplex) * NR));
101 HANDLE_ERROR(cudaMalloc((
void **)&device_dst,
sizeof(cufftDoubleComplex) * NR));
103 HANDLE_ERROR(cudaMemset(device_FFT_src, 0,
sizeof(cufftDoubleComplex) * NR));
104 HANDLE_ERROR(cudaMemset(device_FFT_dst, 0,
sizeof(cufftDoubleComplex) * NR));
106 HANDLE_ERROR(cudaMalloc((
void**)&device_FFT_tmp,
sizeof(cufftDoubleComplex) * pnXY));
107 HANDLE_ERROR(cudaMalloc((
void**)&device_FFT_tmp2,
sizeof(cufftDoubleComplex) * pnXY * 4));
108 HANDLE_ERROR(cudaMalloc((
void**)&device_FFT_tmp3,
sizeof(cufftDoubleComplex) * pnXY * 4));
113 int nBlocks = (R + nThreads - 1) / nThreads;
114 int nBlocks2 = (NR + nThreads - 1) / nThreads;
115 int nBlocks3 = (NR * 4 + nThreads - 1) / nThreads;
116 int nBlocks4 = (N + nThreads - 1) / nThreads;
119 for (
uint ch = 0; ch < nWave; ch++)
121 HANDLE_ERROR(cudaMemset(device_dst, 0,
sizeof(cuDoubleComplex) * NR));
122 HANDLE_ERROR(cudaMemset(device_FFT_tmp, 0,
sizeof(cuDoubleComplex) * pnXY));
123 HANDLE_ERROR(cudaMemset(device_FFT_tmp2, 0,
sizeof(cuDoubleComplex) * pnXY * 4));
124 HANDLE_ERROR(cudaMemset(device_FFT_tmp3, 0,
sizeof(cuDoubleComplex) * pnXY * 4));
129 nWave, nWave - 1 - ch, pnX, pnY, ppX, ppY, nX, nY, rX, rY, distance, pi2 / lambda, lambda, bRandomPhase
149 cudaError error = cudaGetLastError();
150 if (error != cudaSuccess) {
151 LOG(
"cudaGetLastError(): %s\n", cudaGetErrorName(error));
152 if (error == cudaErrorLaunchOutOfResources) {
155 nBlocks = (R + nThreads - 1) / nThreads;
156 nBlocks2 = (NR + nThreads - 1) / nThreads;
157 nBlocks3 = (NR * 4 + nThreads - 1) / nThreads;
158 nBlocks4 = (N * 4 + nThreads - 1) / nThreads;
168 result = cufftPlan2d(&plan, nY, nX, CUFFT_Z2Z);
169 if (result != CUFFT_SUCCESS)
171 LOG(
"<FAILED> cufftPlan2d (%d)\n", result);
175 cufftDoubleComplex* in, *out;
176 for (
int r = 0; r < R; r++)
179 in = &device_FFT_src[offset];
180 out = &device_FFT_dst[offset];
181 cudaFFT_LF(&plan, 0, nBlocks4, nThreads, nX, nY, in, out, -1);
183 if (cudaDeviceSynchronize() != cudaSuccess)
184 LOG(
"<FAILED> Synchronize\n");
187 procMultiplyPhase(0, nBlocks, nThreads, device_config, device_FFT_dst, device_FFT_tmp);
188 cudaFresnelPropagationLF(nBlocks2, nBlocks3, nThreads, pnX, pnY, device_FFT_tmp, device_FFT_tmp2, device_FFT_tmp3, device_dst, device_config);
191 HANDLE_ERROR(cudaMemcpy(
complex_H[ch], device_dst,
sizeof(cuDoubleComplex) * pnXY, cudaMemcpyDeviceToHost));
196 delete[] host_FFT_tmp;
198 for (
int i = 0; i < N; i++)
199 cudaFree(device_LFData[i]);
200 delete[] device_LFData;
202 cudaFree(device_config);
203 cudaFree(device_FFT_src);
204 cudaFree(device_FFT_dst);
205 cudaFree(device_FFT_tmp);
206 cudaFree(device_FFT_tmp2);
207 cudaFree(device_FFT_tmp3);
208 cudaFree(device_dst);
static CUDA * getInstance()
void cudaConvertLF2ComplexField_Kernel(CUstream_st *stream, const int &nBlocks, const int &nThreads, const LFGpuConst *config, uchar1 **LF, cufftDoubleComplex *output)
struct KernelConst LFGpuConst
#define HANDLE_ERROR(err)
void convertLF2ComplexField_GPU()
bool GetRandomPhase()
Function for getting the random phase.
void cudaFresnelPropagationLF(const int &nBlocks, const int &nBlocks2, const int &nThreads, const int &nx, const int &ny, cufftDoubleComplex *src, cufftDoubleComplex *tmp, cufftDoubleComplex *tmp2, cufftDoubleComplex *dst, const LFGpuConst *cuda_config)
Complex< Real > ** complex_H
#define ELAPSED_TIME(x, y)
void cudaFFT_LF(cufftHandle *plan, CUstream_st *stream, const int &nBlocks, const int &nThreads, const int &nx, const int &ny, cufftDoubleComplex *in_field, cufftDoubleComplex *output_field, const int &direction)
void procMultiplyPhase(CUstream_st *stream, const int &nBlocks, const int &nThreads, const LFGpuConst *config, cufftDoubleComplex *in, cufftDoubleComplex *output)