Openholo  v4.0
Open Source Digital Holographic Library
ophPCKernel.cu
Go to the documentation of this file.
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install, copy or use the software.
7 //
8 //
9 // License Agreement
10 // For Open Source Digital Holographic Library
11 //
12 // Openholo library is free software;
13 // you can redistribute it and/or modify it under the terms of the BSD 2-Clause license.
14 //
15 // Copyright (C) 2017-2024, Korea Electronics Technology Institute. All rights reserved.
16 // E-mail : contact.openholo@gmail.com
17 // Web : http://www.openholo.org
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 // 1. Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
24 //
25 // 2. Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the copyright holder or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 // This software contains opensource software released under GNU Generic Public License,
41 // NVDIA Software License Agreement, or CUDA supplement to Software License Agreement.
42 // Check whether software you use contains licensed software.
43 //
44 //M*/
45 
46 /**
47 * @file ophPCKernel.cu
48 * @brief Openholo Point Cloud based CGH generation with CUDA GPGPU
49 * @author Hyeong-Hak Ahn, Minwoo Nam
50 * @date 2018/09
51 */
52 
53 #ifndef OphPCKernel_cu__
54 #define OphPCKernel_cu__
55 
56 #include <cuda_runtime.h>
57 #include <cuda_runtime_api.h>
58 #include <cuda.h>
59 
60 #include <device_launch_parameters.h>
61 #include <device_functions.h>
62 #include <math_functions.h>
63 #include "typedef.h"
64 #include "ophPointCloud_GPU.h"
65 
66 __global__
67 void cudaKernel_double_RS_Diffraction(uint channel, Vertex* vertex_data, const GpuConstNERS* config, const int N, cuDoubleComplex* dst)
68 {
69  __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
70  __shared__ double ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, det_tx, det_ty, k, lambda;
71 
72  if (threadIdx.x == 0) {
73  schannel = channel;
74  pnX = config->pn_X;
75  pnY = config->pn_Y;
76  ppX = config->pp_X;
77  ppY = config->pp_Y;
78  offsetX = config->offset_X;
79  offsetY = config->offset_Y;
80  scaleX = config->scale_X;
81  scaleY = config->scale_Y;
82  scaleZ = config->scale_Z;
83  half_ssX = config->half_ss_X;
84  half_ssY = config->half_ss_Y;
85  distance = config->offset_depth;
86  det_tx = config->det_tx;
87  det_ty = config->det_ty;
88  k = config->k;
89  lambda = config->lambda;
90  loop = N;
91  }
92  __syncthreads();
93 
94  ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
95  int xxtr = tid % pnX;
96  int yytr = tid / pnX;
97  ulonglong idx = xxtr + yytr * pnX;
98 
99  double xxx = -half_ssX + (xxtr - 1 + offsetX) * ppX;
100  double yyy = -half_ssY + (pnY - yytr + offsetY) * ppY;
101 
102  for (int j = 0; j < loop; ++j)
103  { // Create Fringe Pattern
104  double pcx = vertex_data[j].point.pos[_X] * scaleX;
105  double pcy = vertex_data[j].point.pos[_Y] * scaleY;
106  double pcz = vertex_data[j].point.pos[_Z] * scaleZ;
107 
108  pcz = pcz + distance;
109 
110  double amp = vertex_data[j].color.color[schannel];
111 
112  double abs_det_txy_pcz = abs(det_tx * pcz);
113 
114  double _xbound[2] = {
115  pcx + abs_det_txy_pcz,
116  pcx - abs_det_txy_pcz
117  };
118 
119  abs_det_txy_pcz = abs(det_ty * pcz);
120 
121  double _ybound[2] = {
122  pcy + abs_det_txy_pcz,
123  pcy - abs_det_txy_pcz
124  };
125 
126  double Xbound[2] = {
127  floor((_xbound[_X] + half_ssX) / ppX) + 1,
128  floor((_xbound[_Y] + half_ssX) / ppX) + 1
129  };
130 
131  double Ybound[2] = {
132  pnY - floor((_ybound[_Y] + half_ssY) / ppY),
133  pnY - floor((_ybound[_X] + half_ssY) / ppY)
134  };
135 
136  if (Xbound[_X] > pnX) Xbound[_X] = pnX;
137  if (Xbound[_Y] < 0) Xbound[_Y] = 0;
138  if (Ybound[_X] > pnY) Ybound[_X] = pnY;
139  if (Ybound[_Y] < 0) Ybound[_Y] = 0;
140 
141  if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) &&
142  ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
143 
144  double xxx_pcx_sq = (xxx - pcx) * (xxx - pcx);
145  double yyy_pcy_sq = (yyy - pcy) * (yyy - pcy);
146  double pcz_sq = pcz * pcz;
147 
148  // abs(det_tx * sqrt(yyy_pcy_sq + pcz_sq));
149  double abs_det_txy_sqrt = abs(det_tx * sqrt(yyy_pcy_sq + pcz_sq));
150 
151  double range_x[2] = {
152  pcx + abs_det_txy_sqrt,
153  pcx - abs_det_txy_sqrt
154  };
155 
156  abs_det_txy_sqrt = abs(det_ty * sqrt(xxx_pcx_sq + pcz_sq));
157 
158  double range_y[2] = {
159  pcy + abs_det_txy_sqrt,
160  pcy - abs_det_txy_sqrt
161  };
162 
163  if (((xxx < range_x[_X]) && (xxx > range_x[_Y])) &&
164  ((yyy < range_y[_X]) && (yyy > range_y[_Y]))) {
165 
166  double r = sqrt(xxx_pcx_sq + yyy_pcy_sq + pcz_sq);
167  double p = k * r;
168  double a = (amp * pcz) / (lambda * (r * r));
169  double res_real = sin(p) * a;
170  double res_imag = -cos(p) * a;
171  dst[idx].x += res_real;
172  dst[idx].y += res_imag;
173  }
174  }
175  }
176 }
177 
178 __global__
179 void cudaKernel_double_FastMath_RS_Diffraction(uint channel, Vertex* vertex_data, const GpuConstNERS* config, const int N, cuDoubleComplex* dst)
180 {
181  __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
182  __shared__ double ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, det_tx, det_ty, k, lambda;
183 
184  if (threadIdx.x == 0) {
185  schannel = channel;
186  pnX = config->pn_X;
187  pnY = config->pn_Y;
188  ppX = config->pp_X;
189  ppY = config->pp_Y;
190  offsetX = config->offset_X;
191  offsetY = config->offset_Y;
192  scaleX = config->scale_X;
193  scaleY = config->scale_Y;
194  scaleZ = config->scale_Z;
195  half_ssX = config->half_ss_X;
196  half_ssY = config->half_ss_Y;
197  distance = config->offset_depth;
198  det_tx = config->det_tx;
199  det_ty = config->det_ty;
200  k = config->k;
201  lambda = config->lambda;
202  loop = N;
203  }
204  __syncthreads();
205 
206  ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
207  int xxtr = tid % pnX;
208  int yytr = tid / pnX;
209  ulonglong idx = xxtr + yytr * pnX;
210 
211  double xxx = __dadd_rz(-half_ssX, __dmul_rz(__dadd_rz(__dsub_rz(xxtr, 1), offsetX), ppX));
212  double yyy = __dadd_rz(-half_ssY, __dmul_rz(__dadd_rz(__dsub_rz(pnY, yytr), offsetY), ppY));
213 
214  for (int j = 0; j < loop; ++j)
215  { // Create Fringe Pattern
216  double pcx = __dmul_rz(vertex_data[j].point.pos[_X], scaleX);
217  double pcy = __dmul_rz(vertex_data[j].point.pos[_Y], scaleY);
218  double pcz = __dmul_rz(vertex_data[j].point.pos[_Z], scaleZ);
219 
220  pcz = __dadd_rz(pcz, distance);
221 
222  double amp = vertex_data[j].color.color[schannel];
223 
224  double abs_det_txy_pcz = abs(__dmul_rz(det_tx, pcz));
225 
226  double _xbound[2] = {
227  __dadd_rz(pcx, abs_det_txy_pcz),
228  __dsub_rz(pcx, abs_det_txy_pcz)
229  };
230 
231  abs_det_txy_pcz = abs(__dmul_rz(det_ty, pcz));
232 
233  double _ybound[2] = {
234  __dadd_rz(pcy, abs_det_txy_pcz),
235  __dsub_rz(pcy, abs_det_txy_pcz)
236  };
237 
238  double Xbound[2] = {
239  __dadd_rz(floor(__ddiv_rz(__dadd_rz(_xbound[_X], half_ssX), ppX)), 1),
240  __dadd_rz(floor(__ddiv_rz(__dadd_rz(_xbound[_Y], half_ssX), ppX)), 1)
241  };
242 
243  double Ybound[2] = {
244  __dsub_rz(pnY, floor(__ddiv_rz(__dadd_rz(_ybound[_Y], half_ssY), ppY))),
245  __dsub_rz(pnY, floor(__ddiv_rz(__dadd_rz(_ybound[_X], half_ssY), ppY)))
246  };
247 
248  if (Xbound[_X] > pnX) Xbound[_X] = pnX;
249  if (Xbound[_Y] < 0) Xbound[_Y] = 0;
250  if (Ybound[_X] > pnY) Ybound[_X] = pnY;
251  if (Ybound[_Y] < 0) Ybound[_Y] = 0;
252 
253  if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) &&
254  ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
255  double xx = __dsub_rz(xxx, pcx);
256  double yy = __dsub_rz(yyy, pcy);
257 
258  double xxx_pcx_sq = __dmul_rz(xx, xx);
259  double yyy_pcy_sq = __dmul_rz(yy, yy);
260  double pcz_sq = __dmul_rz(pcz, pcz);
261 
262  double abs_det_txy_sqrt = abs(__dmul_rz(det_tx, __dsqrt_rz(__dadd_rz(yyy_pcy_sq, pcz_sq))));
263 
264  double range_x[2] = {
265  __dadd_rz(pcx, abs_det_txy_sqrt),
266  __dsub_rz(pcx, abs_det_txy_sqrt)
267  };
268 
269  abs_det_txy_sqrt = abs(__dmul_rz(det_ty, __dsqrt_rz(__dadd_rz(xxx_pcx_sq, pcz_sq))));
270 
271  double range_y[2] = {
272  __dadd_rz(pcy, abs_det_txy_sqrt),
273  __dsub_rz(pcy, abs_det_txy_sqrt)
274  };
275 
276  if (((xxx < range_x[_X]) && (xxx > range_x[_Y])) &&
277  ((yyy < range_y[_X]) && (yyy > range_y[_Y]))) {
278 
279  double r = __dsqrt_rz(__dadd_rz(__dadd_rz(xxx_pcx_sq, yyy_pcy_sq), pcz_sq));
280  double p = __dmul_rz(k, r);
281  double a = __ddiv_rz(__dmul_rz(amp, pcz), __dmul_rz(lambda, __dmul_rz(r, r)));
282  double res_real = __dmul_rz(sin(p), a);
283  double res_imag = __dmul_rz(-cos(p), a);
284  dst[idx].x = __dadd_rz(dst[idx].x, res_real);
285  dst[idx].y = __dadd_rz(dst[idx].y, res_imag);
286  }
287  }
288  }
289 }
290 
291 __global__
292 void cudaKernel_single_RS_Diffraction(uint channel, Vertex* vertex_data, const GpuConstNERS* config, const int N, cuDoubleComplex* dst)
293 {
294  __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
295  __shared__ float ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, det_tx, det_ty, k, lambda;
296 
297  if (threadIdx.x == 0) {
298  schannel = channel;
299  pnX = config->pn_X;
300  pnY = config->pn_Y;
301  ppX = config->pp_X;
302  ppY = config->pp_Y;
303  offsetX = config->offset_X;
304  offsetY = config->offset_Y;
305  scaleX = config->scale_X;
306  scaleY = config->scale_Y;
307  scaleZ = config->scale_Z;
308  half_ssX = config->half_ss_X;
309  half_ssY = config->half_ss_Y;
310  distance = config->offset_depth;
311  det_tx = config->det_tx;
312  det_ty = config->det_ty;
313  k = config->k;
314  lambda = config->lambda;
315  loop = N;
316  }
317  __syncthreads();
318 
319  ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
320  int xxtr = tid % pnX;
321  int yytr = tid / pnX;
322  ulonglong idx = xxtr + yytr * pnX;
323 
324  float xxx = -half_ssX + (xxtr - 1 + offsetX) * ppX;
325  float yyy = -half_ssY + (pnY - yytr + offsetY) * ppY;
326 
327  for (int j = 0; j < loop; ++j)
328  { // Create Fringe Pattern
329  float pcx = vertex_data[j].point.pos[_X] * scaleX;
330  float pcy = vertex_data[j].point.pos[_Y] * scaleY;
331  float pcz = vertex_data[j].point.pos[_Z] * scaleZ;
332 
333  pcz = pcz + distance;
334 
335  float amp = vertex_data[j].color.color[schannel];
336 
337  float abs_det_txy_pcz = abs(det_tx * pcz);
338 
339  float _xbound[2] = {
340  pcx + abs_det_txy_pcz,
341  pcx - abs_det_txy_pcz
342  };
343 
344  abs_det_txy_pcz = abs(det_ty * pcz);
345 
346  float _ybound[2] = {
347  pcy + abs_det_txy_pcz,
348  pcy - abs_det_txy_pcz
349  };
350 
351  float Xbound[2] = {
352  floor((_xbound[_X] + half_ssX) / ppX) + 1,
353  floor((_xbound[_Y] + half_ssX) / ppX) + 1
354  };
355 
356  float Ybound[2] = {
357  pnY - floor((_ybound[_Y] + half_ssY) / ppY),
358  pnY - floor((_ybound[_X] + half_ssY) / ppY)
359  };
360 
361  if (Xbound[_X] > pnX) Xbound[_X] = pnX;
362  if (Xbound[_Y] < 0) Xbound[_Y] = 0;
363  if (Ybound[_X] > pnY) Ybound[_X] = pnY;
364  if (Ybound[_Y] < 0) Ybound[_Y] = 0;
365 
366  if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) &&
367  ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
368 
369  float xxx_pcx_sq = (xxx - pcx) * (xxx - pcx);
370  float yyy_pcy_sq = (yyy - pcy) * (yyy - pcy);
371  float pcz_sq = pcz * pcz;
372 
373  // abs(det_tx * sqrt(yyy_pcy_sq + pcz_sq));
374  float abs_det_txy_sqrt = abs(det_tx * sqrt(yyy_pcy_sq + pcz_sq));
375 
376  float range_x[2] = {
377  pcx + abs_det_txy_sqrt,
378  pcx - abs_det_txy_sqrt
379  };
380 
381  abs_det_txy_sqrt = abs(det_ty * sqrt(xxx_pcx_sq + pcz_sq));
382 
383  float range_y[2] = {
384  pcy + abs_det_txy_sqrt,
385  pcy - abs_det_txy_sqrt
386  };
387 
388  if (((xxx < range_x[_X]) && (xxx > range_x[_Y])) &&
389  ((yyy < range_y[_X]) && (yyy > range_y[_Y]))) {
390 
391  float r = sqrt(xxx_pcx_sq + yyy_pcy_sq + pcz_sq);
392  float p = k * r;
393  float a = (amp * pcz) / (lambda * (r * r));
394  float res_real = sinf(p) * a;
395  float res_imag = -cosf(p) * a;
396  dst[idx].x += res_real;
397  dst[idx].y += res_imag;
398  }
399  }
400  }
401 }
402 
403 __global__
404 void cudaKernel_single_FastMath_RS_Diffraction(uint channel, Vertex* vertex_data, const GpuConstNERS* config, const int N, cuDoubleComplex* dst)
405 {
406  __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
407  __shared__ float ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, det_tx, det_ty, k, lambda;
408 
409  if (threadIdx.x == 0) {
410  schannel = channel;
411  pnX = config->pn_X;
412  pnY = config->pn_Y;
413  ppX = config->pp_X;
414  ppY = config->pp_Y;
415  offsetX = config->offset_X;
416  offsetY = config->offset_Y;
417  scaleX = config->scale_X;
418  scaleY = config->scale_Y;
419  scaleZ = config->scale_Z;
420  half_ssX = config->half_ss_X;
421  half_ssY = config->half_ss_Y;
422  distance = config->offset_depth;
423  det_tx = config->det_tx;
424  det_ty = config->det_ty;
425  k = config->k;
426  lambda = config->lambda;
427  loop = N;
428  }
429  __syncthreads();
430 
431  ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
432  int xxtr = tid % pnX;
433  int yytr = tid / pnX;
434  ulonglong idx = xxtr + yytr * pnX;
435 
436  float xxx = __fadd_rz(-half_ssX, __fmul_rz(__fadd_rz(__fsub_rz(xxtr, 1), offsetX), ppX));
437  float yyy = __fadd_rz(-half_ssY, __fmul_rz(__fadd_rz(__fsub_rz(pnY, yytr), offsetY), ppY));
438 
439  for (int j = 0; j < loop; ++j)
440  { // Create Fringe Pattern
441  float pcx = __fmul_rz(vertex_data[j].point.pos[_X], scaleX);
442  float pcy = __fmul_rz(vertex_data[j].point.pos[_Y], scaleY);
443  float pcz = __fmul_rz(vertex_data[j].point.pos[_Z], scaleZ);
444 
445  pcz = __fadd_rz(pcz, distance);
446 
447  float amp = vertex_data[j].color.color[schannel];
448 
449  float abs_det_txy_pcz = abs(__fmul_rz(det_tx, pcz));
450 
451  float _xbound[2] = {
452  __fadd_rz(pcx, abs_det_txy_pcz),
453  __fsub_rz(pcx, abs_det_txy_pcz)
454  };
455 
456  abs_det_txy_pcz = abs(__fmul_rz(det_ty, pcz));
457 
458  float _ybound[2] = {
459  __fadd_rz(pcy, abs_det_txy_pcz),
460  __fsub_rz(pcy, abs_det_txy_pcz)
461  };
462 
463  float Xbound[2] = {
464  __fadd_rz(floor(__fdiv_rz(__fadd_rz(_xbound[_X], half_ssX), ppX)), 1),
465  __fadd_rz(floor(__fdiv_rz(__fadd_rz(_xbound[_Y], half_ssX), ppX)), 1)
466  };
467 
468  float Ybound[2] = {
469  __fsub_rz(pnY, floor(__fdiv_rz(__fadd_rz(_ybound[_Y], half_ssY), ppY))),
470  __fsub_rz(pnY, floor(__fdiv_rz(__fadd_rz(_ybound[_X], half_ssY), ppY)))
471  };
472 
473  if (Xbound[_X] > pnX) Xbound[_X] = pnX;
474  if (Xbound[_Y] < 0) Xbound[_Y] = 0;
475  if (Ybound[_X] > pnY) Ybound[_X] = pnY;
476  if (Ybound[_Y] < 0) Ybound[_Y] = 0;
477 
478  if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) &&
479  ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
480  float xx = __fsub_rz(xxx, pcx);
481  float yy = __fsub_rz(yyy, pcy);
482 
483  float xxx_pcx_sq = __fmul_rz(xx, xx);
484  float yyy_pcy_sq = __fmul_rz(yy, yy);
485  float pcz_sq = __fmul_rz(pcz, pcz);
486 
487  float abs_det_txy_sqrt = abs(__fmul_rz(det_tx, __fsqrt_rz(__fadd_rz(yyy_pcy_sq, pcz_sq))));
488 
489  float range_x[2] = {
490  __fadd_rz(pcx, abs_det_txy_sqrt),
491  __fsub_rz(pcx, abs_det_txy_sqrt)
492  };
493 
494  abs_det_txy_sqrt = abs(__fmul_rz(det_ty, __fsqrt_rz(__fadd_rz(xxx_pcx_sq, pcz_sq))));
495 
496  float range_y[2] = {
497  __fadd_rz(pcy, abs_det_txy_sqrt),
498  __fsub_rz(pcy, abs_det_txy_sqrt)
499  };
500 
501  if (((xxx < range_x[_X]) && (xxx > range_x[_Y])) &&
502  ((yyy < range_y[_X]) && (yyy > range_y[_Y]))) {
503 
504  float r = __fsqrt_rz(__fadd_rz(__fadd_rz(xxx_pcx_sq, yyy_pcy_sq), pcz_sq));
505  float p = __fmul_rz(k, r);
506  float a = __fdiv_rz(__fmul_rz(amp, pcz), __fmul_rz(lambda, __fmul_rz(r, r)));
507  float res_real = __fmul_rz(sin(p), a);
508  float res_imag = __fmul_rz(-cos(p), a);
509  dst[idx].x = __dadd_rz(dst[idx].x, res_real);
510  dst[idx].y = __dadd_rz(dst[idx].y, res_imag);
511  }
512  }
513  }
514 }
515 
516 __global__
517 void cudaKernel_double_Fresnel_Diffraction(uint channel, Vertex* vertex_data, const GpuConstNEFR* config, const int N, cuDoubleComplex* dst)
518 {
519  __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
520  __shared__ double ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, k, tx, ty, lambda;
521 
522  if (threadIdx.x == 0) {
523  schannel = channel;
524  pnX = config->pn_X;
525  pnY = config->pn_Y;
526  ppX = config->pp_X;
527  ppY = config->pp_Y;
528  offsetX = config->offset_X;
529  offsetY = config->offset_Y;
530  scaleX = config->scale_X;
531  scaleY = config->scale_Y;
532  scaleZ = config->scale_Z;
533  half_ssX = config->half_ss_X;
534  half_ssY = config->half_ss_Y;
535  distance = config->offset_depth;
536  k = config->k;
537  tx = config->tx;
538  ty = config->ty;
539  lambda = config->lambda;
540  loop = N;
541  }
542  __syncthreads();
543 
544  ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
545  int xxtr = tid % pnX;
546  int yytr = tid / pnX;
547  ulonglong idx = xxtr + yytr * pnX;
548 
549  double xxx = -half_ssX + (xxtr - 1 + offsetX) * ppX;
550  double yyy = -half_ssY + (pnY - yytr + offsetY) * ppY;
551 
552  for (int j = 0; j < loop; ++j)
553  { //Create Fringe Pattern
554 
555  double pcx = vertex_data[j].point.pos[_X] * scaleX;
556  double pcy = vertex_data[j].point.pos[_Y] * scaleY;
557  double pcz = vertex_data[j].point.pos[_Z] * scaleZ;
558  pcz = pcz + distance;
559 
560  double amp = vertex_data[j].color.color[schannel];
561 
562  //boundary test
563  double abs_txy_pcz = abs(tx * pcz);
564  double _xbound[2] = {
565  pcx + abs_txy_pcz,
566  pcx - abs_txy_pcz
567  };
568 
569  abs_txy_pcz = abs(ty * pcz);
570  double _ybound[2] = {
571  pcy + abs_txy_pcz,
572  pcy - abs_txy_pcz
573  };
574 
575  double Xbound[2] = {
576  floor((_xbound[_X] + half_ssX) / ppX) + 1,
577  floor((_xbound[_Y] + half_ssX) / ppX) + 1
578  };
579 
580  double Ybound[2] = {
581  pnY - floor((_ybound[_Y] + half_ssY) / ppY),
582  pnY - floor((_ybound[_X] + half_ssY) / ppY)
583  };
584 
585  if (Xbound[_X] > pnX) Xbound[_X] = pnX;
586  if (Xbound[_Y] < 0) Xbound[_Y] = 0;
587  if (Ybound[_X] > pnY) Ybound[_X] = pnY;
588  if (Ybound[_Y] < 0) Ybound[_Y] = 0;
589  //
590 
591  if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) && ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
592  double p = k * ((xxx - pcx) * (xxx - pcx) + (yyy - pcy) * (yyy - pcy) + (2 * pcz * pcz)) / (2 * pcz);
593  double a = amp / (lambda * pcz);
594  double res_real = sin(p) * a;
595  double res_imag = -cos(p) * a;
596 
597  dst[idx].x += res_real;
598  dst[idx].y += res_imag;
599  }
600  }
601 }
602 
603 __global__
604 void cudaKernel_double_FastMath_Fresnel_Diffraction(uint channel, Vertex* vertex_data, const GpuConstNEFR* config, const int N, cuDoubleComplex* dst)
605 {
606  __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
607  __shared__ double ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, k, tx, ty, lambda;
608 
609  if (threadIdx.x == 0) {
610  schannel = channel;
611  pnX = config->pn_X;
612  pnY = config->pn_Y;
613  ppX = config->pp_X;
614  ppY = config->pp_Y;
615  offsetX = config->offset_X;
616  offsetY = config->offset_Y;
617  scaleX = config->scale_X;
618  scaleY = config->scale_Y;
619  scaleZ = config->scale_Z;
620  half_ssX = config->half_ss_X;
621  half_ssY = config->half_ss_Y;
622  distance = config->offset_depth;
623  k = config->k;
624  tx = config->tx;
625  ty = config->ty;
626  lambda = config->lambda;
627  loop = N;
628  }
629  __syncthreads();
630 
631  ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
632  int xxtr = tid % pnX;
633  int yytr = tid / pnX;
634  ulonglong idx = xxtr + yytr * pnX;
635 
636  double xxx = __dadd_rz(-half_ssX, __dmul_rz(__dadd_rz(__dsub_rz(xxtr, 1), offsetX), ppX));
637  double yyy = __dadd_rz(-half_ssY, __dmul_rz(__dadd_rz(__dsub_rz(pnY, yytr), offsetY), ppY));
638 
639  for (int j = 0; j < loop; ++j)
640  { //Create Fringe Pattern
641 
642  double pcx = __dmul_rz(vertex_data[j].point.pos[_X], scaleX);
643  double pcy = __dmul_rz(vertex_data[j].point.pos[_Y], scaleY);
644  double pcz = __dmul_rz(vertex_data[j].point.pos[_Z], scaleZ);
645  pcz = __dadd_rz(pcz, distance);
646 
647  double amp = vertex_data[j].color.color[schannel];
648 
649  //boundary test
650  double abs_txy_pcz = abs(__dmul_rz(tx, pcz));
651  double _xbound[2] = {
652  __dadd_rz(pcx, abs_txy_pcz),
653  __dsub_rz(pcx, abs_txy_pcz)
654  };
655 
656  abs_txy_pcz = abs(__dmul_rz(ty, pcz));
657  double _ybound[2] = {
658  __dadd_rz(pcy, abs_txy_pcz),
659  __dsub_rz(pcy, abs_txy_pcz)
660  };
661 
662  double Xbound[2] = {
663  __dadd_rz(floor(__ddiv_rz(__dadd_rz(_xbound[_X], half_ssX), ppX)), 1),
664  __dadd_rz(floor(__ddiv_rz(__dadd_rz(_xbound[_Y], half_ssX), ppX)), 1)
665  };
666 
667  double Ybound[2] = {
668  __dsub_rz(pnY, floor(__ddiv_rz(__dadd_rz(_ybound[_Y], half_ssY), ppY))),
669  __dsub_rz(pnY, floor(__ddiv_rz(__dadd_rz(_ybound[_X], half_ssY), ppY)))
670  };
671 
672  if (Xbound[_X] > pnX) Xbound[_X] = pnX;
673  if (Xbound[_Y] < 0) Xbound[_Y] = 0;
674  if (Ybound[_X] > pnY) Ybound[_X] = pnY;
675  if (Ybound[_Y] < 0) Ybound[_Y] = 0;
676  //
677 
678  if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) && ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
679  double xx = __dsub_rz(xxx, pcx);
680  double yy = __dsub_rz(yyy, pcy);
681  double z2 = __dmul_rz(2, pcz);
682 
683  double p = __ddiv_rz(__dmul_rz(k, __dadd_rz(__dadd_rz(__dmul_rz(xx, xx), __dmul_rz(yy, yy)), __dmul_rz(z2, pcz))), z2);
684 
685  double a = __ddiv_rz(amp, __dmul_rz(lambda, pcz));
686  double res_real = __dmul_rz(sin(p), a);
687  double res_imag = __dmul_rz(-cos(p), a);
688 
689  dst[idx].x = __dadd_rz(dst[idx].x, res_real);
690  dst[idx].y = __dadd_rz(dst[idx].y, res_imag);
691  }
692  }
693 }
694 
695 __global__
696 void cudaKernel_single_Fresnel_Diffraction(uint channel, Vertex* vertex_data, const GpuConstNEFR* config, const int N, cuDoubleComplex* dst)
697 {
698  __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
699  __shared__ float ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, k, tx, ty, lambda;
700 
701  if (threadIdx.x == 0) {
702  schannel = channel;
703  pnX = config->pn_X;
704  pnY = config->pn_Y;
705  ppX = config->pp_X;
706  ppY = config->pp_Y;
707  offsetX = config->offset_X;
708  offsetY = config->offset_Y;
709  scaleX = config->scale_X;
710  scaleY = config->scale_Y;
711  scaleZ = config->scale_Z;
712  half_ssX = config->half_ss_X;
713  half_ssY = config->half_ss_Y;
714  distance = config->offset_depth;
715  k = config->k;
716  tx = config->tx;
717  ty = config->ty;
718  lambda = config->lambda;
719  loop = N;
720  }
721  __syncthreads();
722 
723  ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
724  int xxtr = tid % pnX;
725  int yytr = tid / pnX;
726  ulonglong idx = xxtr + yytr * pnX;
727 
728  float xxx = -half_ssX + (xxtr - 1 + offsetX) * ppX;
729  float yyy = -half_ssY + (pnY - yytr + offsetY) * ppY;
730 
731  for (int j = 0; j < loop; ++j)
732  { //Create Fringe Pattern
733 
734  float pcx = vertex_data[j].point.pos[_X] * scaleX;
735  float pcy = vertex_data[j].point.pos[_Y] * scaleY;
736  float pcz = vertex_data[j].point.pos[_Z] * scaleZ;
737  pcz = pcz + distance;
738 
739  float amp = vertex_data[j].color.color[schannel];
740 
741  //boundary test
742  float abs_txy_pcz = abs(tx * pcz);
743  float _xbound[2] = {
744  pcx + abs_txy_pcz,
745  pcx - abs_txy_pcz
746  };
747 
748  abs_txy_pcz = abs(ty * pcz);
749  float _ybound[2] = {
750  pcy + abs_txy_pcz,
751  pcy - abs_txy_pcz
752  };
753 
754  float Xbound[2] = {
755  floor((_xbound[_X] + half_ssX) / ppX) + 1,
756  floor((_xbound[_Y] + half_ssX) / ppX) + 1
757  };
758 
759  float Ybound[2] = {
760  pnY - floor((_ybound[_Y] + half_ssY) / ppY),
761  pnY - floor((_ybound[_X] + half_ssY) / ppY)
762  };
763 
764  if (Xbound[_X] > pnX) Xbound[_X] = pnX;
765  if (Xbound[_Y] < 0) Xbound[_Y] = 0;
766  if (Ybound[_X] > pnY) Ybound[_X] = pnY;
767  if (Ybound[_Y] < 0) Ybound[_Y] = 0;
768  //
769 
770  if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) && ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
771  float p = k * ((xxx - pcx) * (xxx - pcx) + (yyy - pcy) * (yyy - pcy) + (2 * pcz * pcz)) / (2 * pcz);
772  float a = amp / (lambda * pcz);
773  float res_real = sinf(p) * a;
774  float res_imag = -cosf(p) * a;
775 
776  dst[idx].x += res_real;
777  dst[idx].y += res_imag;
778  }
779  }
780 }
781 
782 __global__
783 void cudaKernel_single_FastMath_Fresnel_Diffraction(uint channel, Vertex* vertex_data, const GpuConstNEFR* config, const int N, cuDoubleComplex* dst)
784 {
785  __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
786  __shared__ float ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, k, tx, ty, lambda;
787 
788  if (threadIdx.x == 0) {
789  schannel = channel;
790  pnX = config->pn_X;
791  pnY = config->pn_Y;
792  ppX = config->pp_X;
793  ppY = config->pp_Y;
794  offsetX = config->offset_X;
795  offsetY = config->offset_Y;
796  scaleX = config->scale_X;
797  scaleY = config->scale_Y;
798  scaleZ = config->scale_Z;
799  half_ssX = config->half_ss_X;
800  half_ssY = config->half_ss_Y;
801  distance = config->offset_depth;
802  k = config->k;
803  tx = config->tx;
804  ty = config->ty;
805  lambda = config->lambda;
806  loop = N;
807  }
808  __syncthreads();
809 
810  ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
811  int xxtr = tid % pnX;
812  int yytr = tid / pnX;
813  ulonglong idx = xxtr + yytr * pnX;
814 
815  float xxx = __fadd_rz(-half_ssX, __fmul_rz(__fadd_rz(__fsub_rz(xxtr, 1), offsetX), ppX));
816  float yyy = __fadd_rz(-half_ssY, __fmul_rz(__fadd_rz(__fsub_rz(pnY, yytr), offsetY), ppY));
817 
818  for (int j = 0; j < loop; ++j)
819  { //Create Fringe Pattern
820 
821  float pcx = __fmul_rz(vertex_data[j].point.pos[_X], scaleX);
822  float pcy = __fmul_rz(vertex_data[j].point.pos[_Y], scaleY);
823  float pcz = __fmul_rz(vertex_data[j].point.pos[_Z], scaleZ);
824  pcz = __fadd_rz(pcz, distance);
825 
826  float amp = vertex_data[j].color.color[schannel];
827 
828  //boundary test
829  float abs_txy_pcz = abs(__fmul_rz(tx, pcz));
830  float _xbound[2] = {
831  __fadd_rz(pcx, abs_txy_pcz),
832  __fsub_rz(pcx, abs_txy_pcz)
833  };
834 
835  abs_txy_pcz = abs(__fmul_rz(ty, pcz));
836  float _ybound[2] = {
837  __fadd_rz(pcy, abs_txy_pcz),
838  __fsub_rz(pcy, abs_txy_pcz)
839  };
840 
841  float Xbound[2] = {
842  __fadd_rz(floor(__fdiv_rz(__fadd_rz(_xbound[_X], half_ssX), ppX)), 1),
843  __fadd_rz(floor(__fdiv_rz(__fadd_rz(_xbound[_Y], half_ssX), ppX)), 1)
844  };
845 
846  float Ybound[2] = {
847  __fsub_rz(pnY, floor(__fdiv_rz(__fadd_rz(_ybound[_Y], half_ssY), ppY))),
848  __fsub_rz(pnY, floor(__fdiv_rz(__fadd_rz(_ybound[_X], half_ssY), ppY)))
849  };
850 
851  if (Xbound[_X] > pnX) Xbound[_X] = pnX;
852  if (Xbound[_Y] < 0) Xbound[_Y] = 0;
853  if (Ybound[_X] > pnY) Ybound[_X] = pnY;
854  if (Ybound[_Y] < 0) Ybound[_Y] = 0;
855  //
856 
857  if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) && ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
858  float xx = __fsub_rz(xxx, pcx);
859  float yy = __fsub_rz(yyy, pcy);
860  float z2 = __fmul_rz(2, pcz);
861 
862  float p = __fdiv_rz(__fmul_rz(k, __fadd_rz(__fadd_rz(__fmul_rz(xx, xx), __fmul_rz(yy, yy)), __fmul_rz(z2, pcz))), z2);
863 
864  float a = __fdiv_rz(amp, __fmul_rz(lambda, pcz));
865  float res_real = __fmul_rz(sin(p), a);
866  float res_imag = __fmul_rz(-cos(p), a);
867 
868  dst[idx].x = __fadd_rz(dst[idx].x, res_real);
869  dst[idx].y = __fadd_rz(dst[idx].y, res_imag);
870  }
871  }
872 }
873 
874 
875 extern "C"
876 {
877  void cudaPointCloud_RS(
878  const int& nBlocks, const int& nThreads, const int& n_pts_per_stream,
879  Vertex* cuda_vertex_data,
880  cuDoubleComplex* cuda_dst,
881  const GpuConstNERS* cuda_config, const uint& iChannel, const uint& mode
882  )
883  {
884  if (mode & MODE_FLOAT)
885  {
886  if (mode & MODE_FASTMATH)
887  cudaKernel_single_FastMath_RS_Diffraction << < nBlocks, nThreads >> > (iChannel, cuda_vertex_data, cuda_config, n_pts_per_stream, cuda_dst);
888  else
889  cudaKernel_single_RS_Diffraction << < nBlocks, nThreads >> > (iChannel, cuda_vertex_data, cuda_config, n_pts_per_stream, cuda_dst);
890  }
891  else
892  {
893  if (mode & MODE_FASTMATH)
894  cudaKernel_double_FastMath_RS_Diffraction << < nBlocks, nThreads >> > (iChannel, cuda_vertex_data, cuda_config, n_pts_per_stream, cuda_dst);
895  else
896  cudaKernel_double_RS_Diffraction << < nBlocks, nThreads >> > (iChannel, cuda_vertex_data, cuda_config, n_pts_per_stream, cuda_dst);
897  }
898  }
899 
900  void cudaPointCloud_Fresnel(
901  const int& nBlocks, const int& nThreads, const int& n_pts_per_stream,
902  Vertex* cuda_vertex_data,
903  cuDoubleComplex* cuda_dst,
904  const GpuConstNEFR* cuda_config, const uint& iChannel, const uint& mode
905  )
906  {
907  if (mode & MODE_FLOAT)
908  {
909  if (mode & MODE_FASTMATH)
910  cudaKernel_single_FastMath_Fresnel_Diffraction << < nBlocks, nThreads >> > (iChannel, cuda_vertex_data, cuda_config, n_pts_per_stream, cuda_dst);
911  else
912  cudaKernel_single_Fresnel_Diffraction << < nBlocks, nThreads >> > (iChannel, cuda_vertex_data, cuda_config, n_pts_per_stream, cuda_dst);
913  }
914  else
915  {
916  if (mode & MODE_FASTMATH)
917  cudaKernel_double_FastMath_Fresnel_Diffraction << < nBlocks, nThreads >> > (iChannel, cuda_vertex_data, cuda_config, n_pts_per_stream, cuda_dst);
918  else
919  cudaKernel_double_Fresnel_Diffraction << < nBlocks, nThreads >> > (iChannel, cuda_vertex_data, cuda_config, n_pts_per_stream, cuda_dst);
920  }
921  }
922 }
923 
924 #endif // !OphPCKernel_cu__