Openholo v5.0
Open Source Digital Holographic Library
ophPointCloud_GPU.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install, copy or use the software.
//
//
//                          License Agreement
//               For Open Source Digital Holographic Library
//
// Openholo library is free software;
// you can redistribute it and/or modify it under the terms of the BSD 2-Clause license.
//
// Copyright (C) 2017-2024, Korea Electronics Technology Institute. All rights reserved.
// E-mail : contact.openholo@gmail.com
// Web : http://www.openholo.org
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
//    this list of conditions and the following disclaimer in the documentation
//    and/or other materials provided with the distribution.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the copyright holder or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
// This software contains open source software released under the GNU General Public License,
// the NVIDIA Software License Agreement, or the CUDA Supplement to Software License Agreement.
// Check whether the software you use contains such licensed software.
//
//M*/

#include "ophPointCloud.h"
#include "ophPointCloud_GPU.h"
#include "sys.h"
#include "cudaWrapper.h"

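// This file implements the GPU back end of point-cloud CGH generation in two flavors:
// an OpenCL path (compiled when _USE_OPENCL is defined) and a CUDA path that supports
// single- and multi-GPU execution. Both paths evaluate, for every hologram sample, the
// Rayleigh-Sommerfeld (PC_DIFF_RS) or Fresnel (PC_DIFF_FRESNEL) diffraction contribution
// of every point in pc_data_ and accumulate the complex field into complex_H[channel].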
#ifdef _USE_OPENCL
#include "OpenCL.h"

void ophPointCloud::genCghPointCloudGPU(uint diff_flag)
{
    int nErr;
    auto begin = CUR_TIME;
    OpenCL* cl = OpenCL::getInstance();

    cl_context context = cl->getContext();
    cl_command_queue commands = cl->getCommand();
    cl_mem d_pc_data;
    cl_mem d_amp_data;
    cl_mem d_result;
    cl_mem d_config;

    // number of hologram samples and size of one real-valued plane
    const ulonglong pnXY = context_.pixel_number[_X] * context_.pixel_number[_Y];
    const ulonglong bufferSize = pnXY * sizeof(Real);

    // Host memory locations
    const int n_colors = pc_data_.n_colors;
    Real* h_pc_data = nullptr;
    Real* h_amp_data = pc_data_.color;
    Real* h_dst = nullptr;

    // Keep the original buffer; when the viewing-window transform is enabled,
    // work on a transformed copy of the vertices instead.
    if (is_ViewingWindow) {
        h_pc_data = new Real[n_points * 3];
        transVW(n_points * 3, h_pc_data, pc_data_.vertex);
    }
    else {
        h_pc_data = pc_data_.vertex;
    }

    uint nChannel = context_.waveNum;
    bool bIsGrayScale = (n_colors == 1);

    cl->LoadKernel();

    cl_kernel* kernel = cl->getKernel();

    cl_kernel* current_kernel = nullptr;
    if ((diff_flag == PC_DIFF_RS) || (diff_flag == PC_DIFF_FRESNEL)) {
        h_dst = new Real[pnXY * 2];
        memset(h_dst, 0, bufferSize * 2);

        current_kernel = (diff_flag == PC_DIFF_RS) ? &kernel[0] : &kernel[1];

        d_pc_data = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(Real) * n_points * 3, nullptr, &nErr);
        d_amp_data = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(Real) * n_points * n_colors, nullptr, &nErr);
        nErr = clEnqueueWriteBuffer(commands, d_pc_data, CL_TRUE, 0, sizeof(Real) * n_points * 3, h_pc_data, 0, nullptr, nullptr);
        nErr = clEnqueueWriteBuffer(commands, d_amp_data, CL_TRUE, 0, sizeof(Real) * n_points * n_colors, h_amp_data, 0, nullptr, nullptr);

        d_result = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(Real) * pnXY * 2, nullptr, &nErr);

        size_t global[2] = { context_.pixel_number[_X], context_.pixel_number[_Y] };
        size_t local[2] = { 32, 32 };

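        // Kernel argument layout: the point-cloud buffer (arg 1), the amplitude buffer
        // (arg 2) and the point count (arg 4) stay fixed, while the result buffer (arg 0),
        // the per-wavelength config (arg 3) and the channel index (arg 5) are set inside
        // the loop below. For each color channel the wave number k = 2*pi / lambda is
        // recomputed, the zeroed result plane and the RS/Fresnel constants are uploaded,
        // the kernel is enqueued over the full pixel grid, and the accumulated field is
        // read back into complex_H[ch].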
        clSetKernelArg(*current_kernel, 1, sizeof(cl_mem), &d_pc_data);
        clSetKernelArg(*current_kernel, 2, sizeof(cl_mem), &d_amp_data);
        clSetKernelArg(*current_kernel, 4, sizeof(uint), &n_points);
        for (uint ch = 0; ch < nChannel; ch++)
        {
            uint nAdd = bIsGrayScale ? 0 : ch;
            context_.k = (2 * M_PI) / context_.wave_length[ch];
            Real ratio = 1.0; //context_.wave_length[nChannel - 1] / context_.wave_length[ch];

            GpuConst* h_config = new GpuConst(
                n_points, n_colors, 1,
                pc_config_.scale, pc_config_.distance,
                context_.pixel_number,
                context_.pixel_pitch,
                context_.ss,
                context_.k,
                context_.wave_length[ch],
                ratio
            );

            if (diff_flag == PC_DIFF_RS)
            {
                h_config = new GpuConstNERS(*h_config);
                d_config = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(GpuConstNERS), nullptr, &nErr);

                nErr = clEnqueueWriteBuffer(commands, d_result, CL_TRUE, 0, sizeof(Real) * pnXY * 2, h_dst, 0, nullptr, nullptr);
                nErr = clEnqueueWriteBuffer(commands, d_config, CL_TRUE, 0, sizeof(GpuConstNERS), h_config, 0, nullptr, nullptr);
            }
            else if (diff_flag == PC_DIFF_FRESNEL)
            {
                h_config = new GpuConstNEFR(*h_config);
                d_config = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(GpuConstNEFR), nullptr, &nErr);

                nErr = clEnqueueWriteBuffer(commands, d_result, CL_TRUE, 0, sizeof(Real) * pnXY * 2, h_dst, 0, nullptr, nullptr);
                nErr = clEnqueueWriteBuffer(commands, d_config, CL_TRUE, 0, sizeof(GpuConstNEFR), h_config, 0, nullptr, nullptr);
            }

            clSetKernelArg(*current_kernel, 0, sizeof(cl_mem), &d_result);
            clSetKernelArg(*current_kernel, 3, sizeof(cl_mem), &d_config);
            clSetKernelArg(*current_kernel, 5, sizeof(uint), &ch);

            nErr = clEnqueueNDRangeKernel(commands, *current_kernel, 2, nullptr, global, nullptr/*local*/, 0, nullptr, nullptr);

            //nErr = clFlush(commands);
            nErr = clFinish(commands);

            if (nErr != CL_SUCCESS) cl->errorCheck(nErr, "Check", __FILE__, __LINE__);

            nErr = clEnqueueReadBuffer(commands, d_result, CL_TRUE, 0, sizeof(Real) * pnXY * 2, complex_H[ch], 0, nullptr, nullptr);

            clReleaseMemObject(d_config); // config buffer is recreated every channel
            delete h_config;

            m_nProgress = (ch + 1) * 100 / nChannel;
        }

        clReleaseMemObject(d_result);
        clReleaseMemObject(d_amp_data);
        clReleaseMemObject(d_pc_data);
        if (h_dst) delete[] h_dst;
        if (is_ViewingWindow && h_pc_data) delete[] h_pc_data;
    }

    LOG("%s : %.5lf (sec)\n", __FUNCTION__, ELAPSED_TIME(begin, CUR_TIME));
}
#else
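// CUDA implementation, compiled when _USE_OPENCL is not defined. A single-GPU path is
// used by default; when cudaWrapper reports more than one active device, the point cloud
// is split across the GPUs and the per-device partial fields are summed on device 0.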

using namespace oph;

void ophPointCloud::genCghPointCloudGPU(uint diff_flag)
{
    if ((diff_flag != PC_DIFF_RS) && (diff_flag != PC_DIFF_FRESNEL))
    {
        LOG("<FAILED> Wrong parameters.\n");
        return;
    }
    cudaWrapper* pCudaWrapper = cudaWrapper::getInstance();

    auto begin = CUR_TIME;
    const ulonglong pnXY = context_.pixel_number[_X] * context_.pixel_number[_Y];
    int blockSize = pCudaWrapper->getMaxThreads(0); // n_threads; must not exceed maxThreadsPerBlock
    ulonglong gridSize = (pnXY + blockSize - 1) / blockSize; // n_blocks
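    // gridSize is the ceiling of pnXY / blockSize, so blockSize * gridSize threads cover
    // every hologram sample; if a launch later fails with cudaErrorLaunchOutOfResources,
    // blockSize is halved and gridSize doubled before the channel is retried.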

    cout << ">>> All " << blockSize * gridSize << " threads in CUDA" << endl;
    cout << ">>> " << blockSize << " threads/block, " << gridSize << " blocks/grid" << endl;

    // Host memory location
    Vertex* h_vertex_data = nullptr;
    if (!is_ViewingWindow)
        h_vertex_data = pc_data_.vertices;
    else
    {
        h_vertex_data = new Vertex[pc_data_.n_points];
        std::memcpy(h_vertex_data, pc_data_.vertices, sizeof(Vertex) * pc_data_.n_points);
        transVW(pc_data_.n_points, h_vertex_data, h_vertex_data);
    }

    // size of one complex hologram buffer
    const ulonglong bufferSize = pnXY * sizeof(cuDoubleComplex);
    int gpu_num = pCudaWrapper->getActiveGPUs();

    uint nChannel = context_.waveNum;
    // calculate the per-device workload
    pCudaWrapper->setWorkload(pc_data_.n_points);

    // default: single GPU
    if (gpu_num == 1) {
        cuDoubleComplex* d_dst = nullptr;
        HANDLE_ERROR(cudaMalloc((void**)&d_dst, bufferSize));
        HANDLE_ERROR(cudaMemset(d_dst, 0, bufferSize));

        Vertex* d_vertex_data = nullptr;
        pCudaWrapper->printMemoryInfo(0);
        HANDLE_ERROR(cudaMalloc((void**)&d_vertex_data, pc_data_.n_points * sizeof(Vertex)));
        CudaPointCloudConfig* base_config = new CudaPointCloudConfig(
            pc_data_.n_points,
            pc_config_.scale,
            pc_config_.distance,
            context_.pixel_number,
            context_.offset,
            context_.pixel_pitch,
            context_.ss,
            context_.k,
            context_.wave_length[0]
        );

        HANDLE_ERROR(cudaMemcpy(d_vertex_data, h_vertex_data, pc_data_.n_points * sizeof(Vertex), cudaMemcpyHostToDevice));

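        // For each color channel: update k = 2*pi / lambda and the wavelength in the
        // config, upload an RS- or Fresnel-specific copy of it, launch the diffraction
        // kernel over all points, then copy the accumulated field into complex_H[ch].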
        for (uint ch = 0; ch < nChannel; ch++)
        {
            base_config->k = context_.k = (2 * M_PI) / context_.wave_length[ch];
            base_config->lambda = context_.wave_length[ch];
            CudaPointCloudConfig* h_config = nullptr;
            CudaPointCloudConfig* d_config = nullptr;
            switch (diff_flag) {
            case PC_DIFF_RS: {
                h_config = new CudaPointCloudConfigRS(*base_config);
                HANDLE_ERROR(cudaMalloc((void**)&d_config, sizeof(CudaPointCloudConfigRS)));
                HANDLE_ERROR(cudaMemcpy(d_config, h_config, sizeof(CudaPointCloudConfigRS), cudaMemcpyHostToDevice));
                cudaPointCloud_RS(gridSize, blockSize, d_vertex_data, d_dst, (CudaPointCloudConfigRS*)d_config, ch, m_mode);
                break;
            }
            case PC_DIFF_FRESNEL: {
                h_config = new CudaPointCloudConfigFresnel(*base_config);
                HANDLE_ERROR(cudaMalloc((void**)&d_config, sizeof(CudaPointCloudConfigFresnel)));
                HANDLE_ERROR(cudaMemcpy(d_config, h_config, sizeof(CudaPointCloudConfigFresnel), cudaMemcpyHostToDevice));
                cudaPointCloud_Fresnel(gridSize, blockSize, d_vertex_data, d_dst, (CudaPointCloudConfigFresnel*)d_config, ch, m_mode);
                break;
            }
            }
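            // A launch that requests too many threads per block fails with
            // cudaErrorLaunchOutOfResources; in that case halve blockSize, double
            // gridSize, and retry the same channel.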
            cudaError error = cudaGetLastError();
            if (error != cudaSuccess) {
                LOG("cudaGetLastError(): %s\n", cudaGetErrorName(error));
                if (error == cudaErrorLaunchOutOfResources) {
                    HANDLE_ERROR(cudaFree(d_config));
                    delete h_config;
                    ch--;
                    blockSize /= 2;
                    gridSize *= 2;
                    continue;
                }
            }
            HANDLE_ERROR(cudaMemcpy(complex_H[ch], d_dst, bufferSize, cudaMemcpyDeviceToHost));
            HANDLE_ERROR(cudaMemset(d_dst, 0, bufferSize));
            HANDLE_ERROR(cudaFree(d_config));
            delete h_config;

            m_nProgress = (ch + 1) * 100 / nChannel;
        }
        HANDLE_ERROR(cudaFree(d_vertex_data));
        HANDLE_ERROR(cudaFree(d_dst));
        delete base_config;

        if (is_ViewingWindow) {
            delete[] h_vertex_data;
        }
    }
    // multi-GPU
    else {
        int current_idx = 0;
        cuDoubleComplex* d_dst = nullptr;
        //Complex<Real>* h_dst[MAX_GPU];
        cuDoubleComplex* d_tmp[MAX_GPU];
        Vertex* d_vertex_data[MAX_GPU];
        CudaPointCloudConfig* d_config[MAX_GPU];

        HANDLE_ERROR(cudaSetDevice(0));
        HANDLE_ERROR(cudaMalloc((void**)&d_dst, bufferSize));
        HANDLE_ERROR(cudaMemset(d_dst, 0, bufferSize));

        CudaPointCloudConfig* base_config = new CudaPointCloudConfig(
            pc_data_.n_points,
            pc_config_.scale,
            pc_config_.distance,
            context_.pixel_number,
            context_.offset,
            context_.pixel_pitch,
            context_.ss,
            context_.k,
            context_.wave_length[0]
        );

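        // Per-device setup: each GPU gets its share of the point cloud (as assigned by
        // cudaWrapper's workload split), a zero-initialized accumulation buffer d_tmp,
        // and a device-side config sized for the selected diffraction kernel.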
        for (int devID = 0; devID < gpu_num && devID < MAX_GPU; devID++)
        {
            pCudaWrapper->printMemoryInfo(devID);
            HANDLE_ERROR(cudaSetDevice(devID));
            int n_point = pCudaWrapper->getWorkload(devID);
            int n_size = n_point * sizeof(Vertex);

            HANDLE_ERROR(cudaMalloc((void**)&d_vertex_data[devID], n_size));
            HANDLE_ERROR(cudaMemcpy((void*)d_vertex_data[devID], (const void*)&h_vertex_data[current_idx], n_size, cudaMemcpyHostToDevice));

            HANDLE_ERROR(cudaMalloc((void**)&d_tmp[devID], bufferSize));
            HANDLE_ERROR(cudaMemset(d_tmp[devID], 0, bufferSize));

            switch (diff_flag) {
            case PC_DIFF_RS: {
                HANDLE_ERROR(cudaMalloc((void**)&d_config[devID], sizeof(CudaPointCloudConfigRS)));
                break;
            }
            case PC_DIFF_FRESNEL: {
                HANDLE_ERROR(cudaMalloc((void**)&d_config[devID], sizeof(CudaPointCloudConfigFresnel)));
                break;
            }
            }

            current_idx += n_point;
        }

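        // For each color channel, every GPU integrates its subset of points into its own
        // d_tmp buffer using the RS or Fresnel kernel with the channel's k = 2*pi / lambda.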
        for (uint ch = 0; ch < nChannel; ch++)
        {
            base_config->k = context_.k = (2 * M_PI) / context_.wave_length[ch];
            base_config->lambda = context_.wave_length[ch];
            current_idx = 0;
            for (int devID = 0; devID < gpu_num && devID < MAX_GPU; devID++)
            {
                int n_point = pCudaWrapper->getWorkload(devID);
                HANDLE_ERROR(cudaSetDevice(devID));
                base_config->n_points = n_point;

                HANDLE_ERROR(cudaMemset(d_tmp[devID], 0, bufferSize));
                CudaPointCloudConfig* h_config = nullptr;

                switch (diff_flag) {
                case PC_DIFF_RS: {
                    h_config = new CudaPointCloudConfigRS(*base_config);
                    HANDLE_ERROR(cudaMemcpy(d_config[devID], h_config, sizeof(CudaPointCloudConfigRS), cudaMemcpyHostToDevice));
                    cudaPointCloud_RS(gridSize, blockSize, d_vertex_data[devID], d_tmp[devID], (CudaPointCloudConfigRS*)d_config[devID], ch, m_mode);
                    break;
                }
                case PC_DIFF_FRESNEL: {
                    h_config = new CudaPointCloudConfigFresnel(*base_config);
                    HANDLE_ERROR(cudaMemcpy(d_config[devID], h_config, sizeof(CudaPointCloudConfigFresnel), cudaMemcpyHostToDevice));
                    cudaPointCloud_Fresnel(gridSize, blockSize, d_vertex_data[devID], d_tmp[devID], (CudaPointCloudConfigFresnel*)d_config[devID], ch, m_mode);
                    break;
                }
                }
                delete h_config;
            }

            // wait for the cudaPointCloud_* kernels
            for (int devID = 0; devID < gpu_num && devID < MAX_GPU; devID++)
            {
                HANDLE_ERROR(cudaSetDevice(devID));
                cudaDeviceSynchronize();
            }

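            // Gather: copy each device's partial field to GPU 0 with cudaMemcpyPeer and
            // accumulate it into d_dst with sum_Kernel (device 0's own partial is summed
            // first, before d_tmp[0] is overwritten by the peer copies).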
            HANDLE_ERROR(cudaSetDevice(0));
            for (int devID = 0; devID < gpu_num && devID < MAX_GPU; devID++)
            {
                if (devID != 0)
                {
                    // GPU N to GPU 0
                    cudaMemcpyPeer(d_tmp[0], 0, d_tmp[devID], devID, bufferSize);
                }

                sum_Kernel(gridSize, blockSize, d_dst, d_tmp[0], pnXY);
                cudaDeviceSynchronize();
            }
            HANDLE_ERROR(cudaMemcpy(complex_H[ch], d_dst, bufferSize, cudaMemcpyDeviceToHost));
            HANDLE_ERROR(cudaMemset(d_dst, 0, bufferSize));

            m_nProgress = (ch + 1) * 100 / nChannel;
        }

        for (int devID = 0; devID < gpu_num && devID < MAX_GPU; devID++)
        {
            //delete[] h_dst[devID];
            cudaSetDevice(devID);
            HANDLE_ERROR(cudaFree(d_config[devID]));
            HANDLE_ERROR(cudaFree(d_vertex_data[devID]));
            HANDLE_ERROR(cudaFree(d_tmp[devID]));
        }

        HANDLE_ERROR(cudaSetDevice(0));
        HANDLE_ERROR(cudaFree(d_dst));
        delete base_config;
        if (is_ViewingWindow) {
            delete[] h_vertex_data;
        }
    }

    LOG("%s : %.5lf (sec)\n", __FUNCTION__, ELAPSED_TIME(begin, CUR_TIME));
}
#endif // _USE_OPENCL
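
/*
 * Usage sketch (illustrative only, not part of the library build): genCghPointCloudGPU()
 * is an internal GPU path of ophPointCloud. The call sequence below assumes the public
 * ophPointCloud interface (readConfig / loadPointCloud / SetMode / generateHologram and
 * the MODE_GPU flag); the exact names, signatures, and file names may differ.
 *
 *   ophPointCloud* pc = new ophPointCloud();
 *   pc->readConfig("Config_PointCloud.xml");      // hypothetical config file
 *   pc->loadPointCloud("PointCloud_sample.ply");  // hypothetical point-cloud file
 *   pc->SetMode(MODE_GPU);                        // select the GPU path in this file
 *   pc->generateHologram(PC_DIFF_RS);             // or PC_DIFF_FRESNEL
 */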