Openholo  v4.0
Open Source Digital Holographic Library
ophPointCloud_GPU.cpp
Go to the documentation of this file.
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install, copy or use the software.
7 //
8 //
9 // License Agreement
10 // For Open Source Digital Holographic Library
11 //
12 // Openholo library is free software;
13 // you can redistribute it and/or modify it under the terms of the BSD 2-Clause license.
14 //
15 // Copyright (C) 2017-2024, Korea Electronics Technology Institute. All rights reserved.
16 // E-mail : contact.openholo@gmail.com
17 // Web : http://www.openholo.org
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 // 1. Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
24 //
25 // 2. Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the copyright holder or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 // This software contains opensource software released under GNU Generic Public License,
41 // NVDIA Software License Agreement, or CUDA supplement to Software License Agreement.
42 // Check whether software you use contains licensed software.
43 //
44 //M*/
45 
46 #include "ophPointCloud.h"
47 #include "ophPointCloud_GPU.h"
48 #include "CUDA.h"
49 #ifdef _USE_OPENCL
50 #include "OpenCL.h"
51 
52 void ophPointCloud::genCghPointCloudGPU(uint diff_flag)
53 {
54  int nErr;
55  auto begin = CUR_TIME;
57 
58  cl_context context = cl->getContext();
59  cl_command_queue commands = cl->getCommand();
60  cl_mem device_pc_data;
61  cl_mem device_amp_data;
62  cl_mem device_result;
63  cl_mem device_config;
64 
65  //threads number
67  const ulonglong bufferSize = pnXY * sizeof(Real);
68 
69  //Host Memory Location
70  const int n_colors = pc_data_.n_colors;
71  Real* host_pc_data = nullptr;
72  Real* host_amp_data = pc_data_.color;
73  Real* host_dst = nullptr;
74 
75  // Keep original buffer
76  if (is_ViewingWindow) {
77  host_pc_data = new Real[n_points * 3];
78  transVW(n_points * 3, host_pc_data, pc_data_.vertex);
79  }
80  else {
81  host_pc_data = pc_data_.vertex;
82  }
83 
84  uint nChannel = context_.waveNum;
85  bool bIsGrayScale = n_colors == 1 ? true : false;
86 
87  cl->LoadKernel();
88 
89  cl_kernel* kernel = cl->getKernel();
90 
91  cl_kernel* current_kernel = nullptr;
92  if ((diff_flag == PC_DIFF_RS) || (diff_flag == PC_DIFF_FRESNEL)) {
93  host_dst = new Real[pnXY * 2];
94  memset(host_dst, 0., bufferSize * 2);
95 
96  current_kernel = diff_flag == PC_DIFF_RS ? &kernel[0] : &kernel[1];
97 
98  device_pc_data = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(Real) * n_points * 3, nullptr, &nErr);
99  device_amp_data = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(Real) * n_points * n_colors, nullptr, &nErr);
100  nErr = clEnqueueWriteBuffer(commands, device_pc_data, CL_TRUE, 0, sizeof(Real) * n_points * 3, host_pc_data, 0, nullptr, nullptr);
101  nErr = clEnqueueWriteBuffer(commands, device_amp_data, CL_TRUE, 0, sizeof(Real) * n_points * n_colors, host_amp_data, 0, nullptr, nullptr);
102 
103  device_result = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(Real) * pnXY * 2, nullptr, &nErr);
104 
105 
106  size_t global[2] = { context_.pixel_number[_X], context_.pixel_number[_Y] };
107  size_t local[2] = { 32, 32 };
108 
109  clSetKernelArg(*current_kernel, 1, sizeof(cl_mem), &device_pc_data);
110  clSetKernelArg(*current_kernel, 2, sizeof(cl_mem), &device_amp_data);
111  clSetKernelArg(*current_kernel, 4, sizeof(uint), &n_points);
112  for (uint ch = 0; ch < nChannel; ch++)
113  {
114  uint nAdd = bIsGrayScale ? 0 : ch;
115  context_.k = (2 * M_PI) / context_.wave_length[ch];
116  Real ratio = 1.0; //context_.wave_length[nChannel - 1] / context_.wave_length[ch];
117 
118  GpuConst* host_config = new GpuConst(
119  n_points, n_colors, 1,
120  pc_config_.scale, pc_config_.distance,
123  context_.ss,
124  context_.k,
125  context_.wave_length[ch],
126  ratio
127  );
128 
129  if (diff_flag == PC_DIFF_RS)
130  {
131  host_config = new GpuConstNERS(*host_config);
132  device_config = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(GpuConstNERS), nullptr, &nErr);
133 
134  nErr = clEnqueueWriteBuffer(commands, device_result, CL_TRUE, 0, sizeof(Real) * pnXY * 2, host_dst, 0, nullptr, nullptr);
135  nErr = clEnqueueWriteBuffer(commands, device_config, CL_TRUE, 0, sizeof(GpuConstNERS), host_config, 0, nullptr, nullptr);
136  }
137  else if (diff_flag == PC_DIFF_FRESNEL)
138  {
139  host_config = new GpuConstNEFR(*host_config);
140  device_config = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(GpuConstNEFR), nullptr, &nErr);
141 
142  nErr = clEnqueueWriteBuffer(commands, device_result, CL_TRUE, 0, sizeof(Real) * pnXY * 2, host_dst, 0, nullptr, nullptr);
143  nErr = clEnqueueWriteBuffer(commands, device_config, CL_TRUE, 0, sizeof(GpuConstNEFR), host_config, 0, nullptr, nullptr);
144  }
145 
146  clSetKernelArg(*current_kernel, 0, sizeof(cl_mem), &device_result);
147  clSetKernelArg(*current_kernel, 3, sizeof(cl_mem), &device_config);
148  clSetKernelArg(*current_kernel, 5, sizeof(uint), &ch);
149 
150  nErr = clEnqueueNDRangeKernel(commands, *current_kernel, 2, nullptr, global, nullptr/*local*/, 0, nullptr, nullptr);
151 
152 
153  //nErr = clFlush(commands);
154  nErr = clFinish(commands);
155 
156  if (nErr != CL_SUCCESS) cl->errorCheck(nErr, "Check", __FILE__, __LINE__);
157 
158  nErr = clEnqueueReadBuffer(commands, device_result, CL_TRUE, 0, sizeof(Real) * pnXY * 2, complex_H[ch], 0, nullptr, nullptr);
159 
160  delete host_config;
161 
162  m_nProgress = (ch + 1) * 100 / nChannel;
163  }
164 
165  clReleaseMemObject(device_result);
166  clReleaseMemObject(device_amp_data);
167  clReleaseMemObject(device_pc_data);
168  if (host_dst) delete[] host_dst;
169  if (is_ViewingWindow && host_pc_data) delete[] host_pc_data;
170  }
171 
172  LOG("%s : %.5lf (sec)\n", __FUNCTION__, ELAPSED_TIME(begin, CUR_TIME));
173 }
174 #else
175 
176 using namespace oph;
177 void ophPointCloud::genCghPointCloudGPU(uint diff_flag)
178 {
179  if ((diff_flag != PC_DIFF_RS) && (diff_flag != PC_DIFF_FRESNEL))
180  {
181  LOG("<FAILED> Wrong parameters.");
182  return;
183  }
184 
185  //cudaStream_t* streams = nullptr;
186  auto begin = CUR_TIME;
187  const ulonglong pnXY = context_.pixel_number[_X] * context_.pixel_number[_Y];
188  int blockSize = CUDA::getInstance()->getMaxThreads(); //n_threads // blockSize < devProp.maxThreadsPerBlock
189  ulonglong gridSize = (pnXY + blockSize - 1) / blockSize; //n_blocks
190 
191  cout << ">>> All " << blockSize * gridSize << " threads in CUDA" << endl;
192  cout << ">>> " << blockSize << " threads/block, " << gridSize << " blocks/grid" << endl;
193 
194  //const int n_streams = OPH_CUDA_N_STREAM;
195  int n_streams;
196  if (getStream() == 0)
197  n_streams = pc_data_.n_points / 300 + 1;
198  else if (getStream() < 0)
199  {
200  LOG("<FAILED> Wrong parameters.");
201  return;
202  }
203  else
204  n_streams = getStream();
205 
206 
207 
208  // Keep original buffer
209  //if (is_ViewingWindow) {
210  // host_pc_data = new Vertex[pc_data_.n_points];
211  // transVW(pc_data_.n_points, host_pc_data, pc_data_.vertices);
212  //}
213  //else {
214  // host_pc_data = pc_data_.vertices;
215 
216  //}
217 
218  //threads number
219  const ulonglong bufferSize = pnXY * sizeof(cuDoubleComplex);
220  int n_points = pc_data_.n_points;
221 
222  //Host Memory Location
223  Vertex* host_vertex_data = nullptr;
224  if (!is_ViewingWindow)
225  host_vertex_data = pc_data_.vertices;
226  else
227  {
228  host_vertex_data = new Vertex[n_points];
229  std::memcpy(host_vertex_data, pc_data_.vertices, sizeof(Vertex) * n_points);
230  transVW(n_points, host_vertex_data, host_vertex_data);
231  }
232  cuDoubleComplex* host_dst = nullptr;
233  host_dst = new cuDoubleComplex[pnXY];
234  memset(host_dst, 0, bufferSize);
235 
236  Vertex* device_vertex_data;
237  HANDLE_ERROR(cudaMalloc((void**)&device_vertex_data, n_points * sizeof(Vertex)));
238 
239  cuDoubleComplex* device_dst = nullptr;
240  HANDLE_ERROR(cudaMalloc((void**)&device_dst, bufferSize));
241  HANDLE_ERROR(cudaMemsetAsync(device_dst, 0., bufferSize));
242 
243  uint nChannel = context_.waveNum;
244  size_t free, total;
245  cudaMemGetInfo(&free, &total);
246 
247 #ifdef _WIN64
248  LOG("CUDA Memory Total: %llu (byte) / Free: %llu (byte)\n", total, free);
249 #else
250  LOG("CUDA Memory Total: %zu (byte) / Free: %zu (byte)\n", total, free);
251 #endif
252  for (uint ch = 0; ch < nChannel; ch++)
253  {
254  context_.k = (2 * M_PI) / context_.wave_length[ch];
255 
256  GpuConst* host_config = new GpuConst(
257  n_points, n_streams,
258  pc_config_.scale, pc_config_.distance,
259  context_.pixel_number,
260  context_.offset,
261  context_.pixel_pitch,
262  context_.ss,
263  context_.k,
264  context_.wave_length[ch]
265  );
266 
267 
268  GpuConst* device_config = nullptr;
269  switch (diff_flag) {
270  case PC_DIFF_RS: {
271  host_config = new GpuConstNERS(*host_config);
272  HANDLE_ERROR(cudaMalloc((void**)&device_config, sizeof(GpuConstNERS)));
273  HANDLE_ERROR(cudaMemcpyAsync(device_config, host_config, sizeof(GpuConstNERS), cudaMemcpyHostToDevice));
274  break;
275  }
276  case PC_DIFF_FRESNEL: {
277  host_config = new GpuConstNEFR(*host_config);
278  HANDLE_ERROR(cudaMalloc((void**)&device_config, sizeof(GpuConstNEFR)));
279  HANDLE_ERROR(cudaMemcpyAsync(device_config, host_config, sizeof(GpuConstNEFR), cudaMemcpyHostToDevice));
280  break;
281  }
282  }
283 
284  int stream_points = n_points / n_streams;
285  int remainder = n_points % n_streams;
286 
287 
288  int offset = 0;
289  for (int i = 0; i < n_streams; ++i) {
290  offset = i * stream_points;
291  if (i == n_streams - 1) { // ¸¶Áö¸· ½ºÆ®¸² ¿¬»ê ½Ã,
292  stream_points += remainder;
293  }
294 
295  HANDLE_ERROR(cudaMemcpyAsync(device_vertex_data + offset, host_vertex_data + offset, stream_points * sizeof(Vertex), cudaMemcpyHostToDevice));
296 
297  switch (diff_flag) {
298  case PC_DIFF_RS: {
299  cudaPointCloud_RS(gridSize, blockSize, stream_points, device_vertex_data + offset, device_dst, (GpuConstNERS*)device_config, ch, m_mode);
300  break;
301  }
302  case PC_DIFF_FRESNEL: {
303  cudaPointCloud_Fresnel(gridSize, blockSize, stream_points, device_vertex_data + offset, device_dst, (GpuConstNEFR*)device_config, ch, m_mode);
304  break;
305  } // case
306  } // switch
307 
308  cudaError error = cudaGetLastError();
309  if (error != cudaSuccess) {
310  LOG("cudaGetLastError(): %s\n", cudaGetErrorName(error));
311  if (error == cudaErrorLaunchOutOfResources) {
312  i--;
313  blockSize /= 2;
314  gridSize *= 2;
315  continue;
316  }
317  }
318  HANDLE_ERROR(cudaMemcpyAsync(host_dst, device_dst, bufferSize, cudaMemcpyDeviceToHost));
319  HANDLE_ERROR(cudaMemsetAsync(device_dst, 0., bufferSize));
320 
321  for (ulonglong n = 0; n < pnXY; ++n) {
322  complex_H[ch][n][_RE] += host_dst[n].x;
323  complex_H[ch][n][_IM] += host_dst[n].y;
324  }
325 
326  m_nProgress = (int)((Real)(ch*n_streams + i + 1) * 100 / ((Real)n_streams * nChannel));
327  } // for
328 
330  HANDLE_ERROR(cudaFree(device_config));
331 
332  delete host_config;
333  }
334 
335  HANDLE_ERROR(cudaFree(device_vertex_data));
336  HANDLE_ERROR(cudaFree(device_dst));
337  delete[] host_dst;
338 
339  if (is_ViewingWindow) {
340  delete[] host_vertex_data;
341  }
342 
343  LOG("%s : %.5lf (sec)\n", __FUNCTION__, ELAPSED_TIME(begin, CUR_TIME));
344 }
345 #endif
static CUDA * getInstance()
Definition: CUDA.h:17
void errorCheck(cl_int err, const char *operation, char *filename, int line)
Definition: OpenCL.cpp:150
bool LoadKernel()
Definition: OpenCL.cpp:162
Real k
Definition: Openholo.h:67
#define HANDLE_ERROR(err)
Definition: CUDA.cpp:13
if(infile==nullptr)
Definition: Openholo.cpp:419
Real distance
Offset value of point cloud.
Definition: ophGen.h:560
Real * wave_length
Definition: Openholo.h:70
int getMaxThreads()
Definition: CUDA.h:34
float Real
Definition: typedef.h:55
cl_context & getContext()
Definition: OpenCL.h:45
#define CUR_TIME
Definition: function.h:58
Openholo Point Cloud based CGH generation with CUDA GPGPU.
vec2 ss
Definition: Openholo.h:68
void cudaPointCloud_RS(const int &nBlocks, const int &nThreads, const int &n_pts_per_stream, Vertex *cuda_vertex_data, cuDoubleComplex *cuda_dst, const GpuConstNERS *cuda_config, const uint &iChannel, const uint &mode)
#define _Y
Definition: define.h:96
#define _IM
Definition: complex.h:58
KernelConst_NotEncodedFrsn GpuConstNEFR
vec2 pixel_pitch
Definition: Openholo.h:65
vec3 scale
Scaling factor of coordinate of point cloud.
Definition: ophGen.h:558
unsigned long long ulonglong
Definition: typedef.h:67
#define _X
Definition: define.h:92
return true
Definition: Openholo.cpp:434
static OpenCL * getInstance()
Definition: OpenCL.h:17
KernelConst_NotEncodedRS GpuConstNERS
#define _RE
Definition: complex.h:55
cl_kernel * getKernel()
Definition: OpenCL.h:48
Definition: struct.h:102
#define ELAPSED_TIME(x, y)
Definition: function.h:59
uint waveNum
Definition: Openholo.h:69
int n_colors
Number of color channel.
Definition: ophGen.h:587
ivec2 pixel_number
Definition: Openholo.h:63
cl_command_queue & getCommand()
Definition: OpenCL.h:46
void cudaPointCloud_Fresnel(const int &nBlocks, const int &nThreads, const int &n_pts_per_stream, Vertex *cuda_vertex_data, cuDoubleComplex *cuda_dst, const GpuConstNEFR *cuda_config, const uint &iChannel, const uint &mode)
OphConfig context_
Definition: Openholo.h:449
Complex< Real > ** complex_H
Definition: Openholo.h:452
Definition: Bitmap.h:49
void transVW(int nVertex, Vertex *dst, Vertex *src)
Definition: ophGen.cpp:2067
unsigned int uint
Definition: typedef.h:62
#define M_PI
Definition: define.h:52
Definition: OpenCL.h:8