1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install, copy or use the software.
10 // For Open Source Digital Holographic Library
12 // Openholo library is free software;
13 // you can redistribute it and/or modify it under the terms of the BSD 2-Clause license.
15 // Copyright (C) 2017-2024, Korea Electronics Technology Institute. All rights reserved.
16 // E-mail : contact.openholo@gmail.com
17 // Web : http://www.openholo.org
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
22 // 1. Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
25 // 2. Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the copyright holder or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
40 // This software contains opensource software released under GNU Generic Public License,
41 // NVDIA Software License Agreement, or CUDA supplement to Software License Agreement.
42 // Check whether software you use contains licensed software.
47 * @file ophPCKernel.cu
48 * @brief Openholo Point Cloud based CGH generation with CUDA GPGPU
49 * @author Hyeong-Hak Ahn, Minwoo Nam
53 #ifndef OphPCKernel_cu__
54 #define OphPCKernel_cu__
56 #include <cuda_runtime.h>
57 #include <cuda_runtime_api.h>
60 #include <device_launch_parameters.h>
61 #include <device_functions.h>
62 #include <math_functions.h>
64 #include "ophPointCloud_GPU.h"
67 void cudaKernel_double_RS_Diffraction(uint channel, Vertex* vertex_data, const CudaPointCloudConfigRS* config, cuDoubleComplex* dst)
69 __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
70 __shared__ double ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, det_tx, det_ty, k, lambda;
72 if (threadIdx.x == 0) {
78 offsetX = config->offset_X;
79 offsetY = config->offset_Y;
80 scaleX = config->scale_X;
81 scaleY = config->scale_Y;
82 scaleZ = config->scale_Z;
83 half_ssX = config->half_ss_X;
84 half_ssY = config->half_ss_Y;
85 distance = config->offset_depth;
86 det_tx = config->det_tx;
87 det_ty = config->det_ty;
89 lambda = config->lambda;
90 loop = config->n_points;
94 ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
97 ulonglong idx = xxtr + yytr * pnX;
99 double xxx = -half_ssX + (xxtr - 1 + offsetX) * ppX;
100 double yyy = -half_ssY + (pnY - yytr + offsetY) * ppY;
102 for (int j = 0; j < loop; ++j)
103 { // Create Fringe Pattern
104 double pcx = vertex_data[j].point.pos[_X] * scaleX;
105 double pcy = vertex_data[j].point.pos[_Y] * scaleY;
106 double pcz = vertex_data[j].point.pos[_Z] * scaleZ;
108 pcz = pcz + distance;
110 double amp = vertex_data[j].color.color[schannel];
112 double abs_det_txy_pcz = abs(det_tx * pcz);
114 double _xbound[2] = {
115 pcx + abs_det_txy_pcz,
116 pcx - abs_det_txy_pcz
119 abs_det_txy_pcz = abs(det_ty * pcz);
121 double _ybound[2] = {
122 pcy + abs_det_txy_pcz,
123 pcy - abs_det_txy_pcz
127 floor((_xbound[_X] + half_ssX) / ppX) + 1,
128 floor((_xbound[_Y] + half_ssX) / ppX) + 1
132 pnY - floor((_ybound[_Y] + half_ssY) / ppY),
133 pnY - floor((_ybound[_X] + half_ssY) / ppY)
136 if (Xbound[_X] > pnX) Xbound[_X] = pnX;
137 if (Xbound[_Y] < 0) Xbound[_Y] = 0;
138 if (Ybound[_X] > pnY) Ybound[_X] = pnY;
139 if (Ybound[_Y] < 0) Ybound[_Y] = 0;
141 if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) &&
142 ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
144 double xxx_pcx_sq = (xxx - pcx) * (xxx - pcx);
145 double yyy_pcy_sq = (yyy - pcy) * (yyy - pcy);
146 double pcz_sq = pcz * pcz;
148 // abs(det_tx * sqrt(yyy_pcy_sq + pcz_sq));
149 double abs_det_txy_sqrt = abs(det_tx * sqrt(yyy_pcy_sq + pcz_sq));
151 double range_x[2] = {
152 pcx + abs_det_txy_sqrt,
153 pcx - abs_det_txy_sqrt
156 abs_det_txy_sqrt = abs(det_ty * sqrt(xxx_pcx_sq + pcz_sq));
158 double range_y[2] = {
159 pcy + abs_det_txy_sqrt,
160 pcy - abs_det_txy_sqrt
163 if (((xxx < range_x[_X]) && (xxx > range_x[_Y])) &&
164 ((yyy < range_y[_X]) && (yyy > range_y[_Y]))) {
166 double r = sqrt(xxx_pcx_sq + yyy_pcy_sq + pcz_sq);
168 double a = (amp * pcz) / (lambda * (r * r));
169 double res_real = sin(p) * a;
170 double res_imag = -cos(p) * a;
171 dst[idx].x += res_real;
172 dst[idx].y += res_imag;
179 void cudaKernel_double_FastMath_RS_Diffraction(uint channel, Vertex* vertex_data, const CudaPointCloudConfigRS* config, cuDoubleComplex* dst)
181 __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
182 __shared__ double ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, det_tx, det_ty, k, lambda;
184 if (threadIdx.x == 0) {
190 offsetX = config->offset_X;
191 offsetY = config->offset_Y;
192 scaleX = config->scale_X;
193 scaleY = config->scale_Y;
194 scaleZ = config->scale_Z;
195 half_ssX = config->half_ss_X;
196 half_ssY = config->half_ss_Y;
197 distance = config->offset_depth;
198 det_tx = config->det_tx;
199 det_ty = config->det_ty;
201 lambda = config->lambda;
202 loop = config->n_points;
206 ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
207 int xxtr = tid % pnX;
208 int yytr = tid / pnX;
209 ulonglong idx = xxtr + yytr * pnX;
211 double xxx = __dadd_rz(-half_ssX, __dmul_rz(__dadd_rz(__dsub_rz(xxtr, 1), offsetX), ppX));
212 double yyy = __dadd_rz(-half_ssY, __dmul_rz(__dadd_rz(__dsub_rz(pnY, yytr), offsetY), ppY));
214 for (int j = 0; j < loop; ++j)
215 { // Create Fringe Pattern
216 double pcx = __dmul_rz(vertex_data[j].point.pos[_X], scaleX);
217 double pcy = __dmul_rz(vertex_data[j].point.pos[_Y], scaleY);
218 double pcz = __dmul_rz(vertex_data[j].point.pos[_Z], scaleZ);
220 pcz = __dadd_rz(pcz, distance);
222 double amp = vertex_data[j].color.color[schannel];
224 double abs_det_txy_pcz = abs(__dmul_rz(det_tx, pcz));
226 double _xbound[2] = {
227 __dadd_rz(pcx, abs_det_txy_pcz),
228 __dsub_rz(pcx, abs_det_txy_pcz)
231 abs_det_txy_pcz = abs(__dmul_rz(det_ty, pcz));
233 double _ybound[2] = {
234 __dadd_rz(pcy, abs_det_txy_pcz),
235 __dsub_rz(pcy, abs_det_txy_pcz)
239 __dadd_rz(floor(__ddiv_rz(__dadd_rz(_xbound[_X], half_ssX), ppX)), 1),
240 __dadd_rz(floor(__ddiv_rz(__dadd_rz(_xbound[_Y], half_ssX), ppX)), 1)
244 __dsub_rz(pnY, floor(__ddiv_rz(__dadd_rz(_ybound[_Y], half_ssY), ppY))),
245 __dsub_rz(pnY, floor(__ddiv_rz(__dadd_rz(_ybound[_X], half_ssY), ppY)))
248 if (Xbound[_X] > pnX) Xbound[_X] = pnX;
249 if (Xbound[_Y] < 0) Xbound[_Y] = 0;
250 if (Ybound[_X] > pnY) Ybound[_X] = pnY;
251 if (Ybound[_Y] < 0) Ybound[_Y] = 0;
253 if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) &&
254 ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
255 double xx = __dsub_rz(xxx, pcx);
256 double yy = __dsub_rz(yyy, pcy);
258 double xxx_pcx_sq = __dmul_rz(xx, xx);
259 double yyy_pcy_sq = __dmul_rz(yy, yy);
260 double pcz_sq = __dmul_rz(pcz, pcz);
262 double abs_det_txy_sqrt = abs(__dmul_rz(det_tx, __dsqrt_rz(__dadd_rz(yyy_pcy_sq, pcz_sq))));
264 double range_x[2] = {
265 __dadd_rz(pcx, abs_det_txy_sqrt),
266 __dsub_rz(pcx, abs_det_txy_sqrt)
269 abs_det_txy_sqrt = abs(__dmul_rz(det_ty, __dsqrt_rz(__dadd_rz(xxx_pcx_sq, pcz_sq))));
271 double range_y[2] = {
272 __dadd_rz(pcy, abs_det_txy_sqrt),
273 __dsub_rz(pcy, abs_det_txy_sqrt)
276 if (((xxx < range_x[_X]) && (xxx > range_x[_Y])) &&
277 ((yyy < range_y[_X]) && (yyy > range_y[_Y]))) {
279 double r = __dsqrt_rz(__dadd_rz(__dadd_rz(xxx_pcx_sq, yyy_pcy_sq), pcz_sq));
280 double p = __dmul_rz(k, r);
281 double a = __ddiv_rz(__dmul_rz(amp, pcz), __dmul_rz(lambda, __dmul_rz(r, r)));
282 double res_real = __dmul_rz(sin(p), a);
283 double res_imag = __dmul_rz(-cos(p), a);
284 dst[idx].x = __dadd_rz(dst[idx].x, res_real);
285 dst[idx].y = __dadd_rz(dst[idx].y, res_imag);
292 void cudaKernel_single_RS_Diffraction(uint channel, Vertex* vertex_data, const CudaPointCloudConfigRS* config, cuDoubleComplex* dst)
294 __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
295 __shared__ float ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, det_tx, det_ty, k, lambda;
297 if (threadIdx.x == 0) {
303 offsetX = config->offset_X;
304 offsetY = config->offset_Y;
305 scaleX = config->scale_X;
306 scaleY = config->scale_Y;
307 scaleZ = config->scale_Z;
308 half_ssX = config->half_ss_X;
309 half_ssY = config->half_ss_Y;
310 distance = config->offset_depth;
311 det_tx = config->det_tx;
312 det_ty = config->det_ty;
314 lambda = config->lambda;
315 loop = config->n_points;
319 ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
320 int xxtr = tid % pnX;
321 int yytr = tid / pnX;
322 ulonglong idx = xxtr + yytr * pnX;
324 float xxx = -half_ssX + (xxtr - 1 + offsetX) * ppX;
325 float yyy = -half_ssY + (pnY - yytr + offsetY) * ppY;
327 for (int j = 0; j < loop; ++j)
328 { // Create Fringe Pattern
329 float pcx = vertex_data[j].point.pos[_X] * scaleX;
330 float pcy = vertex_data[j].point.pos[_Y] * scaleY;
331 float pcz = vertex_data[j].point.pos[_Z] * scaleZ;
333 pcz = pcz + distance;
335 float amp = vertex_data[j].color.color[schannel];
337 float abs_det_txy_pcz = abs(det_tx * pcz);
340 pcx + abs_det_txy_pcz,
341 pcx - abs_det_txy_pcz
344 abs_det_txy_pcz = abs(det_ty * pcz);
347 pcy + abs_det_txy_pcz,
348 pcy - abs_det_txy_pcz
352 floor((_xbound[_X] + half_ssX) / ppX) + 1,
353 floor((_xbound[_Y] + half_ssX) / ppX) + 1
357 pnY - floor((_ybound[_Y] + half_ssY) / ppY),
358 pnY - floor((_ybound[_X] + half_ssY) / ppY)
361 if (Xbound[_X] > pnX) Xbound[_X] = pnX;
362 if (Xbound[_Y] < 0) Xbound[_Y] = 0;
363 if (Ybound[_X] > pnY) Ybound[_X] = pnY;
364 if (Ybound[_Y] < 0) Ybound[_Y] = 0;
366 if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) &&
367 ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
369 float xxx_pcx_sq = (xxx - pcx) * (xxx - pcx);
370 float yyy_pcy_sq = (yyy - pcy) * (yyy - pcy);
371 float pcz_sq = pcz * pcz;
373 // abs(det_tx * sqrt(yyy_pcy_sq + pcz_sq));
374 float abs_det_txy_sqrt = abs(det_tx * sqrt(yyy_pcy_sq + pcz_sq));
377 pcx + abs_det_txy_sqrt,
378 pcx - abs_det_txy_sqrt
381 abs_det_txy_sqrt = abs(det_ty * sqrt(xxx_pcx_sq + pcz_sq));
384 pcy + abs_det_txy_sqrt,
385 pcy - abs_det_txy_sqrt
388 if (((xxx < range_x[_X]) && (xxx > range_x[_Y])) &&
389 ((yyy < range_y[_X]) && (yyy > range_y[_Y]))) {
391 float r = sqrt(xxx_pcx_sq + yyy_pcy_sq + pcz_sq);
393 float a = (amp * pcz) / (lambda * (r * r));
394 float res_real = sinf(p) * a;
395 float res_imag = -cosf(p) * a;
396 dst[idx].x += res_real;
397 dst[idx].y += res_imag;
404 void cudaKernel_single_FastMath_RS_Diffraction(uint channel, Vertex* vertex_data, const CudaPointCloudConfigRS* config, cuDoubleComplex* dst)
406 __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
407 __shared__ float ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, det_tx, det_ty, k, lambda;
409 if (threadIdx.x == 0) {
415 offsetX = config->offset_X;
416 offsetY = config->offset_Y;
417 scaleX = config->scale_X;
418 scaleY = config->scale_Y;
419 scaleZ = config->scale_Z;
420 half_ssX = config->half_ss_X;
421 half_ssY = config->half_ss_Y;
422 distance = config->offset_depth;
423 det_tx = config->det_tx;
424 det_ty = config->det_ty;
426 lambda = config->lambda;
427 loop = config->n_points;
431 ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
432 int xxtr = tid % pnX;
433 int yytr = tid / pnX;
434 ulonglong idx = xxtr + yytr * pnX;
436 float xxx = __fadd_rz(-half_ssX, __fmul_rz(__fadd_rz(__fsub_rz(xxtr, 1), offsetX), ppX));
437 float yyy = __fadd_rz(-half_ssY, __fmul_rz(__fadd_rz(__fsub_rz(pnY, yytr), offsetY), ppY));
439 for (int j = 0; j < loop; ++j)
440 { // Create Fringe Pattern
441 float pcx = __fmul_rz(vertex_data[j].point.pos[_X], scaleX);
442 float pcy = __fmul_rz(vertex_data[j].point.pos[_Y], scaleY);
443 float pcz = __fmul_rz(vertex_data[j].point.pos[_Z], scaleZ);
445 pcz = __fadd_rz(pcz, distance);
447 float amp = vertex_data[j].color.color[schannel];
449 float abs_det_txy_pcz = abs(__fmul_rz(det_tx, pcz));
452 __fadd_rz(pcx, abs_det_txy_pcz),
453 __fsub_rz(pcx, abs_det_txy_pcz)
456 abs_det_txy_pcz = abs(__fmul_rz(det_ty, pcz));
459 __fadd_rz(pcy, abs_det_txy_pcz),
460 __fsub_rz(pcy, abs_det_txy_pcz)
464 __fadd_rz(floor(__fdiv_rz(__fadd_rz(_xbound[_X], half_ssX), ppX)), 1),
465 __fadd_rz(floor(__fdiv_rz(__fadd_rz(_xbound[_Y], half_ssX), ppX)), 1)
469 __fsub_rz(pnY, floor(__fdiv_rz(__fadd_rz(_ybound[_Y], half_ssY), ppY))),
470 __fsub_rz(pnY, floor(__fdiv_rz(__fadd_rz(_ybound[_X], half_ssY), ppY)))
473 if (Xbound[_X] > pnX) Xbound[_X] = pnX;
474 if (Xbound[_Y] < 0) Xbound[_Y] = 0;
475 if (Ybound[_X] > pnY) Ybound[_X] = pnY;
476 if (Ybound[_Y] < 0) Ybound[_Y] = 0;
478 if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) &&
479 ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
480 float xx = __fsub_rz(xxx, pcx);
481 float yy = __fsub_rz(yyy, pcy);
483 float xxx_pcx_sq = __fmul_rz(xx, xx);
484 float yyy_pcy_sq = __fmul_rz(yy, yy);
485 float pcz_sq = __fmul_rz(pcz, pcz);
487 float abs_det_txy_sqrt = abs(__fmul_rz(det_tx, __fsqrt_rz(__fadd_rz(yyy_pcy_sq, pcz_sq))));
490 __fadd_rz(pcx, abs_det_txy_sqrt),
491 __fsub_rz(pcx, abs_det_txy_sqrt)
494 abs_det_txy_sqrt = abs(__fmul_rz(det_ty, __fsqrt_rz(__fadd_rz(xxx_pcx_sq, pcz_sq))));
497 __fadd_rz(pcy, abs_det_txy_sqrt),
498 __fsub_rz(pcy, abs_det_txy_sqrt)
501 if (((xxx < range_x[_X]) && (xxx > range_x[_Y])) &&
502 ((yyy < range_y[_X]) && (yyy > range_y[_Y]))) {
504 float r = __fsqrt_rz(__fadd_rz(__fadd_rz(xxx_pcx_sq, yyy_pcy_sq), pcz_sq));
505 float p = __fmul_rz(k, r);
506 float a = __fdiv_rz(__fmul_rz(amp, pcz), __fmul_rz(lambda, __fmul_rz(r, r)));
507 float res_real = __fmul_rz(sin(p), a);
508 float res_imag = __fmul_rz(-cos(p), a);
509 dst[idx].x = __dadd_rz(dst[idx].x, res_real);
510 dst[idx].y = __dadd_rz(dst[idx].y, res_imag);
517 void cudaKernel_double_Fresnel_Diffraction(uint channel, Vertex* vertex_data, const CudaPointCloudConfigFresnel* config, cuDoubleComplex* dst)
519 __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
520 __shared__ double ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, k, tx, ty, lambda;
522 if (threadIdx.x == 0) {
528 offsetX = config->offset_X;
529 offsetY = config->offset_Y;
530 scaleX = config->scale_X;
531 scaleY = config->scale_Y;
532 scaleZ = config->scale_Z;
533 half_ssX = config->half_ss_X;
534 half_ssY = config->half_ss_Y;
535 distance = config->offset_depth;
539 lambda = config->lambda;
540 loop = config->n_points;
544 ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
545 int xxtr = tid % pnX;
546 int yytr = tid / pnX;
547 ulonglong idx = xxtr + yytr * pnX;
549 double xxx = -half_ssX + (xxtr - 1 + offsetX) * ppX;
550 double yyy = -half_ssY + (pnY - yytr + offsetY) * ppY;
552 for (int j = 0; j < loop; ++j)
553 { //Create Fringe Pattern
555 double pcx = vertex_data[j].point.pos[_X] * scaleX;
556 double pcy = vertex_data[j].point.pos[_Y] * scaleY;
557 double pcz = vertex_data[j].point.pos[_Z] * scaleZ;
558 pcz = pcz + distance;
560 double amp = vertex_data[j].color.color[schannel];
563 double abs_txy_pcz = abs(tx * pcz);
564 double _xbound[2] = {
569 abs_txy_pcz = abs(ty * pcz);
570 double _ybound[2] = {
576 floor((_xbound[_X] + half_ssX) / ppX) + 1,
577 floor((_xbound[_Y] + half_ssX) / ppX) + 1
581 pnY - floor((_ybound[_Y] + half_ssY) / ppY),
582 pnY - floor((_ybound[_X] + half_ssY) / ppY)
585 if (Xbound[_X] > pnX) Xbound[_X] = pnX;
586 if (Xbound[_Y] < 0) Xbound[_Y] = 0;
587 if (Ybound[_X] > pnY) Ybound[_X] = pnY;
588 if (Ybound[_Y] < 0) Ybound[_Y] = 0;
591 if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) && ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
592 double p = k * ((xxx - pcx) * (xxx - pcx) + (yyy - pcy) * (yyy - pcy) + (2 * pcz * pcz)) / (2 * pcz);
593 double a = amp / (lambda * pcz);
594 double res_real = sin(p) * a;
595 double res_imag = -cos(p) * a;
597 dst[idx].x += res_real;
598 dst[idx].y += res_imag;
604 void cudaKernel_double_FastMath_Fresnel_Diffraction(uint channel, Vertex* vertex_data, const CudaPointCloudConfigFresnel* config, cuDoubleComplex* dst)
606 __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
607 __shared__ double ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, k, tx, ty, lambda;
609 if (threadIdx.x == 0) {
615 offsetX = config->offset_X;
616 offsetY = config->offset_Y;
617 scaleX = config->scale_X;
618 scaleY = config->scale_Y;
619 scaleZ = config->scale_Z;
620 half_ssX = config->half_ss_X;
621 half_ssY = config->half_ss_Y;
622 distance = config->offset_depth;
626 lambda = config->lambda;
627 loop = config->n_points;
631 ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
632 int xxtr = tid % pnX;
633 int yytr = tid / pnX;
634 ulonglong idx = xxtr + yytr * pnX;
636 double xxx = __dadd_rz(-half_ssX, __dmul_rz(__dadd_rz(__dsub_rz(xxtr, 1), offsetX), ppX));
637 double yyy = __dadd_rz(-half_ssY, __dmul_rz(__dadd_rz(__dsub_rz(pnY, yytr), offsetY), ppY));
639 for (int j = 0; j < loop; ++j)
640 { //Create Fringe Pattern
642 double pcx = __dmul_rz(vertex_data[j].point.pos[_X], scaleX);
643 double pcy = __dmul_rz(vertex_data[j].point.pos[_Y], scaleY);
644 double pcz = __dmul_rz(vertex_data[j].point.pos[_Z], scaleZ);
645 pcz = __dadd_rz(pcz, distance);
647 double amp = vertex_data[j].color.color[schannel];
650 double abs_txy_pcz = abs(__dmul_rz(tx, pcz));
651 double _xbound[2] = {
652 __dadd_rz(pcx, abs_txy_pcz),
653 __dsub_rz(pcx, abs_txy_pcz)
656 abs_txy_pcz = abs(__dmul_rz(ty, pcz));
657 double _ybound[2] = {
658 __dadd_rz(pcy, abs_txy_pcz),
659 __dsub_rz(pcy, abs_txy_pcz)
663 __dadd_rz(floor(__ddiv_rz(__dadd_rz(_xbound[_X], half_ssX), ppX)), 1),
664 __dadd_rz(floor(__ddiv_rz(__dadd_rz(_xbound[_Y], half_ssX), ppX)), 1)
668 __dsub_rz(pnY, floor(__ddiv_rz(__dadd_rz(_ybound[_Y], half_ssY), ppY))),
669 __dsub_rz(pnY, floor(__ddiv_rz(__dadd_rz(_ybound[_X], half_ssY), ppY)))
672 if (Xbound[_X] > pnX) Xbound[_X] = pnX;
673 if (Xbound[_Y] < 0) Xbound[_Y] = 0;
674 if (Ybound[_X] > pnY) Ybound[_X] = pnY;
675 if (Ybound[_Y] < 0) Ybound[_Y] = 0;
678 if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) && ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
679 double xx = __dsub_rz(xxx, pcx);
680 double yy = __dsub_rz(yyy, pcy);
681 double z2 = __dmul_rz(2, pcz);
683 double p = __ddiv_rz(__dmul_rz(k, __dadd_rz(__dadd_rz(__dmul_rz(xx, xx), __dmul_rz(yy, yy)), __dmul_rz(z2, pcz))), z2);
685 double a = __ddiv_rz(amp, __dmul_rz(lambda, pcz));
686 double res_real = __dmul_rz(sin(p), a);
687 double res_imag = __dmul_rz(-cos(p), a);
689 dst[idx].x = __dadd_rz(dst[idx].x, res_real);
690 dst[idx].y = __dadd_rz(dst[idx].y, res_imag);
696 void cudaKernel_single_Fresnel_Diffraction(uint channel, Vertex* vertex_data, const CudaPointCloudConfigFresnel* config, cuDoubleComplex* dst)
698 __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
699 __shared__ float ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, k, tx, ty, lambda;
701 if (threadIdx.x == 0) {
707 offsetX = config->offset_X;
708 offsetY = config->offset_Y;
709 scaleX = config->scale_X;
710 scaleY = config->scale_Y;
711 scaleZ = config->scale_Z;
712 half_ssX = config->half_ss_X;
713 half_ssY = config->half_ss_Y;
714 distance = config->offset_depth;
718 lambda = config->lambda;
719 loop = config->n_points;
723 ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
724 int xxtr = tid % pnX;
725 int yytr = tid / pnX;
726 ulonglong idx = xxtr + yytr * pnX;
728 float xxx = -half_ssX + (xxtr - 1 + offsetX) * ppX;
729 float yyy = -half_ssY + (pnY - yytr + offsetY) * ppY;
731 for (int j = 0; j < loop; ++j)
732 { //Create Fringe Pattern
734 float pcx = vertex_data[j].point.pos[_X] * scaleX;
735 float pcy = vertex_data[j].point.pos[_Y] * scaleY;
736 float pcz = vertex_data[j].point.pos[_Z] * scaleZ;
737 pcz = pcz + distance;
739 float amp = vertex_data[j].color.color[schannel];
742 float abs_txy_pcz = abs(tx * pcz);
748 abs_txy_pcz = abs(ty * pcz);
755 floor((_xbound[_X] + half_ssX) / ppX) + 1,
756 floor((_xbound[_Y] + half_ssX) / ppX) + 1
760 pnY - floor((_ybound[_Y] + half_ssY) / ppY),
761 pnY - floor((_ybound[_X] + half_ssY) / ppY)
764 if (Xbound[_X] > pnX) Xbound[_X] = pnX;
765 if (Xbound[_Y] < 0) Xbound[_Y] = 0;
766 if (Ybound[_X] > pnY) Ybound[_X] = pnY;
767 if (Ybound[_Y] < 0) Ybound[_Y] = 0;
770 if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) && ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
771 float p = k * ((xxx - pcx) * (xxx - pcx) + (yyy - pcy) * (yyy - pcy) + (2 * pcz * pcz)) / (2 * pcz);
772 float a = amp / (lambda * pcz);
773 float res_real = sinf(p) * a;
774 float res_imag = -cosf(p) * a;
776 dst[idx].x += res_real;
777 dst[idx].y += res_imag;
783 void cudaKernel_single_FastMath_Fresnel_Diffraction(uint channel, Vertex* vertex_data, const CudaPointCloudConfigFresnel* config, cuDoubleComplex* dst)
785 __shared__ int pnX, pnY, loop, schannel, offsetX, offsetY;
786 __shared__ float ppX, ppY, scaleX, scaleY, scaleZ, half_ssX, half_ssY, distance, k, tx, ty, lambda;
788 if (threadIdx.x == 0) {
794 offsetX = config->offset_X;
795 offsetY = config->offset_Y;
796 scaleX = config->scale_X;
797 scaleY = config->scale_Y;
798 scaleZ = config->scale_Z;
799 half_ssX = config->half_ss_X;
800 half_ssY = config->half_ss_Y;
801 distance = config->offset_depth;
805 lambda = config->lambda;
806 loop = config->n_points;
810 ulonglong tid = blockIdx.x * blockDim.x + threadIdx.x;
811 int xxtr = tid % pnX;
812 int yytr = tid / pnX;
813 ulonglong idx = xxtr + yytr * pnX;
815 float xxx = __fadd_rz(-half_ssX, __fmul_rz(__fadd_rz(__fsub_rz(xxtr, 1), offsetX), ppX));
816 float yyy = __fadd_rz(-half_ssY, __fmul_rz(__fadd_rz(__fsub_rz(pnY, yytr), offsetY), ppY));
818 for (int j = 0; j < loop; ++j)
819 { //Create Fringe Pattern
821 float pcx = __fmul_rz(vertex_data[j].point.pos[_X], scaleX);
822 float pcy = __fmul_rz(vertex_data[j].point.pos[_Y], scaleY);
823 float pcz = __fmul_rz(vertex_data[j].point.pos[_Z], scaleZ);
824 pcz = __fadd_rz(pcz, distance);
826 float amp = vertex_data[j].color.color[schannel];
829 float abs_txy_pcz = abs(__fmul_rz(tx, pcz));
831 __fadd_rz(pcx, abs_txy_pcz),
832 __fsub_rz(pcx, abs_txy_pcz)
835 abs_txy_pcz = abs(__fmul_rz(ty, pcz));
837 __fadd_rz(pcy, abs_txy_pcz),
838 __fsub_rz(pcy, abs_txy_pcz)
842 __fadd_rz(floor(__fdiv_rz(__fadd_rz(_xbound[_X], half_ssX), ppX)), 1),
843 __fadd_rz(floor(__fdiv_rz(__fadd_rz(_xbound[_Y], half_ssX), ppX)), 1)
847 __fsub_rz(pnY, floor(__fdiv_rz(__fadd_rz(_ybound[_Y], half_ssY), ppY))),
848 __fsub_rz(pnY, floor(__fdiv_rz(__fadd_rz(_ybound[_X], half_ssY), ppY)))
851 if (Xbound[_X] > pnX) Xbound[_X] = pnX;
852 if (Xbound[_Y] < 0) Xbound[_Y] = 0;
853 if (Ybound[_X] > pnY) Ybound[_X] = pnY;
854 if (Ybound[_Y] < 0) Ybound[_Y] = 0;
857 if (((xxtr >= Xbound[_Y]) && (xxtr < Xbound[_X])) && ((yytr >= Ybound[_Y]) && (yytr < Ybound[_X]))) {
858 float xx = __fsub_rz(xxx, pcx);
859 float yy = __fsub_rz(yyy, pcy);
860 float z2 = __fmul_rz(2, pcz);
862 float p = __fdiv_rz(__fmul_rz(k, __fadd_rz(__fadd_rz(__fmul_rz(xx, xx), __fmul_rz(yy, yy)), __fmul_rz(z2, pcz))), z2);
864 float a = __fdiv_rz(amp, __fmul_rz(lambda, pcz));
865 float res_real = __fmul_rz(sin(p), a);
866 float res_imag = __fmul_rz(-cos(p), a);
868 dst[idx].x = __fadd_rz(dst[idx].x, res_real);
869 dst[idx].y = __fadd_rz(dst[idx].y, res_imag);
875 void sumKernel(cuDoubleComplex* dst, cuDoubleComplex* src, int size)
877 int idx = blockIdx.x * blockDim.x + threadIdx.x;
879 dst[idx].x += src[idx].x;
880 dst[idx].y += src[idx].y;
887 const int& nBlocks, const int& nThreads, cuDoubleComplex* dst, cuDoubleComplex* src, int size
890 sumKernel << < nBlocks, nThreads >> > (dst, src, size);
893 void cudaPointCloud_RS(
894 const int& nBlocks, const int& nThreads, Vertex* cuda_vertex_data, cuDoubleComplex* cuda_dst,
895 const CudaPointCloudConfigRS* cuda_config, const uint& iColor, const uint& mode
898 if (mode & MODE_FLOAT)
900 if (mode & MODE_FASTMATH)
901 cudaKernel_single_FastMath_RS_Diffraction << < nBlocks, nThreads >> > (iColor, cuda_vertex_data, cuda_config, cuda_dst);
903 cudaKernel_single_RS_Diffraction << < nBlocks, nThreads >> > (iColor, cuda_vertex_data, cuda_config, cuda_dst);
907 if (mode & MODE_FASTMATH)
908 cudaKernel_double_FastMath_RS_Diffraction << < nBlocks, nThreads >> > (iColor, cuda_vertex_data, cuda_config, cuda_dst);
910 cudaKernel_double_RS_Diffraction << < nBlocks, nThreads >> > (iColor, cuda_vertex_data, cuda_config, cuda_dst);
914 void cudaPointCloud_Fresnel(
915 const int& nBlocks, const int& nThreads, Vertex* cuda_vertex_data, cuDoubleComplex* cuda_dst,
916 const CudaPointCloudConfigFresnel* cuda_config, const uint& iColor, const uint& mode
919 if (mode & MODE_FLOAT)
921 if (mode & MODE_FASTMATH)
922 cudaKernel_single_FastMath_Fresnel_Diffraction << < nBlocks, nThreads >> > (iColor, cuda_vertex_data, cuda_config, cuda_dst);
924 cudaKernel_single_Fresnel_Diffraction << < nBlocks, nThreads >> > (iColor, cuda_vertex_data, cuda_config, cuda_dst);
928 if (mode & MODE_FASTMATH)
929 cudaKernel_double_FastMath_Fresnel_Diffraction << < nBlocks, nThreads >> > (iColor, cuda_vertex_data, cuda_config, cuda_dst);
931 cudaKernel_double_Fresnel_Diffraction << < nBlocks, nThreads >> > (iColor, cuda_vertex_data, cuda_config, cuda_dst);
936 #endif // !OphPCKernel_cu__