void ophPointCloud::genCghPointCloudGPU(uint diff_flag)
{
	// ...
	const int n_colors = pc_data_.n_colors;
	Real* h_pc_data = nullptr;
	Real* h_amp_data = pc_data_.color;
	Real* h_dst = nullptr;

	// Optionally transform the vertices into the viewing-window coordinate system.
	if (is_ViewingWindow) {
		h_pc_data = new Real[n_points * 3];
		transVW(n_points * 3, h_pc_data, pc_data_.vertex);
	}
	else {
		h_pc_data = pc_data_.vertex;
	}

	bool bIsGrayScale = (n_colors == 1);
	cl_kernel* current_kernel = nullptr;

	h_dst = new Real[pnXY * 2];
	memset(h_dst, 0, bufferSize * 2);

	// kernel[0]: Rayleigh-Sommerfeld, kernel[1]: Fresnel.
	current_kernel = (diff_flag == PC_DIFF_RS) ? &kernel[0] : &kernel[1];
	d_pc_data = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(Real) * n_points * 3, nullptr, &nErr);
	d_amp_data = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(Real) * n_points * n_colors, nullptr, &nErr);
	nErr = clEnqueueWriteBuffer(commands, d_pc_data, CL_TRUE, 0, sizeof(Real) * n_points * 3, h_pc_data, 0, nullptr, nullptr);
	nErr = clEnqueueWriteBuffer(commands, d_amp_data, CL_TRUE, 0, sizeof(Real) * n_points * n_colors, h_amp_data, 0, nullptr, nullptr);

	d_result = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(Real) * pnXY * 2, nullptr, &nErr);
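	// NOTE (not part of the original listing): each clCreateBuffer / clEnqueueWriteBuffer call
	// above also reports its status via nErr; the same cl->errorCheck(nErr, "...", __FILE__,
	// __LINE__) helper invoked after clFinish() below could be applied at these points as well.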
	size_t local[2] = { 32, 32 };

	clSetKernelArg(*current_kernel, 1, sizeof(cl_mem), &d_pc_data);
	clSetKernelArg(*current_kernel, 2, sizeof(cl_mem), &d_amp_data);
	clSetKernelArg(*current_kernel, 4, sizeof(uint), &n_points);

	for (uint ch = 0; ch < nChannel; ch++)
	{
		uint nAdd = bIsGrayScale ? 0 : ch;
		GpuConst* h_config = new GpuConst(
			n_points, n_colors, 1,
			// ... (remaining constructor arguments elided) ...
		);
		// ... Rayleigh-Sommerfeld (PC_DIFF_RS) branch:
		h_config = new GpuConstNERS(*h_config);
		d_config = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(GpuConstNERS), nullptr, &nErr);

		nErr = clEnqueueWriteBuffer(commands, d_result, CL_TRUE, 0, sizeof(Real) * pnXY * 2, h_dst, 0, nullptr, nullptr);
		nErr = clEnqueueWriteBuffer(commands, d_config, CL_TRUE, 0, sizeof(GpuConstNERS), h_config, 0, nullptr, nullptr);
		// ... Fresnel (PC_DIFF_FRESNEL) branch:
		h_config = new GpuConstNEFR(*h_config);
		d_config = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(GpuConstNEFR), nullptr, &nErr);

		nErr = clEnqueueWriteBuffer(commands, d_result, CL_TRUE, 0, sizeof(Real) * pnXY * 2, h_dst, 0, nullptr, nullptr);
		nErr = clEnqueueWriteBuffer(commands, d_config, CL_TRUE, 0, sizeof(GpuConstNEFR), h_config, 0, nullptr, nullptr);
		clSetKernelArg(*current_kernel, 0, sizeof(cl_mem), &d_result);
		clSetKernelArg(*current_kernel, 3, sizeof(cl_mem), &d_config);
		clSetKernelArg(*current_kernel, 5, sizeof(uint), &ch);

		nErr = clEnqueueNDRangeKernel(commands, *current_kernel, 2, nullptr, global, nullptr, 0, nullptr, nullptr);

		nErr = clFinish(commands);

		if (nErr != CL_SUCCESS) cl->errorCheck(nErr, "Check", __FILE__, __LINE__);

		nErr = clEnqueueReadBuffer(commands, d_result, CL_TRUE, 0, sizeof(Real) * pnXY * 2, complex_H[ch], 0, nullptr, nullptr);

		m_nProgress = (ch + 1) * 100 / nChannel;
	}

	clReleaseMemObject(d_result);
	clReleaseMemObject(d_amp_data);
	clReleaseMemObject(d_pc_data);
	if (h_dst)                          delete[] h_dst;
	if (is_ViewingWindow && h_pc_data)  delete[] h_pc_data;
}
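Both overloads dispatch on the same two diffraction models selected by diff_flag. For orientation only, commonly quoted point-source forms of these models (a generic reference, not extracted from the kernels in this listing; constant factors and sign conventions may differ from what kernel[0]/kernel[1] and cudaPointCloud_RS/cudaPointCloud_Fresnel actually compute) are, for a point p at (x_p, y_p, z_p) with amplitude a_p and wave number k = 2*PI/lambda:

$$U_{\mathrm{RS}}(x,y) \;\approx\; \sum_{p} \frac{a_p\, z_p}{\lambda\, r_p^{2}}\; e^{\,j k r_p},
\qquad r_p = \sqrt{(x-x_p)^2 + (y-y_p)^2 + z_p^2},$$

$$U_{\mathrm{Fresnel}}(x,y) \;\approx\; \sum_{p} \frac{a_p}{\lambda\, z_p}\;
\exp\!\left(\frac{j k}{2 z_p}\Big[(x-x_p)^2 + (y-y_p)^2\Big]\right).$$

The GpuConstNERS / GpuConstNEFR configs above, and the base_config->k = (2 * M_PI) / context_.wave_length[ch] update in the CUDA overload below, feed exactly these per-channel k and lambda values to the kernels.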
void ophPointCloud::genCghPointCloudGPU(uint diff_flag)
{
	if ((diff_flag != PC_DIFF_RS) && (diff_flag != PC_DIFF_FRESNEL))
	{
		LOG("<FAILED> Wrong parameters.");
		return;
	}

	// ...
	const ulonglong pnXY = context_.pixel_number[_X] * context_.pixel_number[_Y];
	// ...
	ulonglong gridSize = (pnXY + blockSize - 1) / blockSize;	// round up so every pixel gets a thread

	cout << ">>> All " << blockSize * gridSize << " threads in CUDA" << endl;
	cout << ">>> " << blockSize << " threads/block, " << gridSize << " blocks/grid" << endl;
	Vertex* h_vertex_data = nullptr;
	if (!is_ViewingWindow)
		h_vertex_data = pc_data_.vertices;
	else
	{
		// Work on a copy so the original point cloud is left untouched.
		h_vertex_data = new Vertex[pc_data_.n_points];
		std::memcpy(h_vertex_data, pc_data_.vertices, sizeof(Vertex) * pc_data_.n_points);
		transVW(pc_data_.n_points, h_vertex_data, h_vertex_data);
	}
	const ulonglong bufferSize = pnXY * sizeof(cuDoubleComplex);
	// ...
	uint nChannel = context_.waveNum;
	// ...
	cuDoubleComplex* d_dst = nullptr;
	// ...
	Vertex* d_vertex_data = nullptr;
	// ... (elided) the device-side point-cloud config is initialized from, among other
	// arguments, context_.pixel_number, context_.pixel_pitch and context_.wave_length[0].
	// ...
	HANDLE_ERROR(cudaMemcpy(d_vertex_data, h_vertex_data, pc_data_.n_points * sizeof(Vertex), cudaMemcpyHostToDevice));
	for (uint ch = 0; ch < nChannel; ch++)
	{
		// Per-channel wave number and wavelength.
		base_config->k = context_.k = (2 * M_PI) / context_.wave_length[ch];
		base_config->lambda = context_.wave_length[ch];

		// ... kernel launch per diffraction type:
		switch (diff_flag)
		{
		// ... (PC_DIFF_RS case elided)
		case PC_DIFF_FRESNEL: {
			// ...
		}
		}

		cudaError error = cudaGetLastError();
		if (error != cudaSuccess) {
			LOG("cudaGetLastError(): %s\n", cudaGetErrorName(error));
			if (error == cudaErrorLaunchOutOfResources) {
				// ...
			}
		}

		HANDLE_ERROR(cudaMemcpy(complex_H[ch], d_dst, bufferSize, cudaMemcpyDeviceToHost));
		// ...
		m_nProgress = (ch + 1) * 100 / nChannel;
	}

	// ...
	if (is_ViewingWindow) {
		delete[] h_vertex_data;
	}
	// ... (multi-GPU branch: each device processes a slice of the point cloud) ...
	cuDoubleComplex* d_dst = nullptr;
	// ...
	cuDoubleComplex* d_tmp[MAX_GPU];
	// ... (elided) the per-device configs are again initialized from context_.pixel_number,
	// context_.pixel_pitch and context_.wave_length[0].
	// ...
	for (int devID = 0; devID < gpu_num && devID < MAX_GPU; devID++)
	{
		// ...
		int n_size = n_point * sizeof(Vertex);
		// ...
		HANDLE_ERROR(cudaMalloc((void**)&d_vertex_data[devID], n_size));
		HANDLE_ERROR(cudaMemcpy((void*)d_vertex_data[devID], (const void*)&h_vertex_data[current_idx], n_size, cudaMemcpyHostToDevice));
		// ...
		HANDLE_ERROR(cudaMalloc((void**)&d_tmp[devID], bufferSize));
		// ...
		switch (diff_flag)
		{
		// ... (PC_DIFF_RS case elided)
		case PC_DIFF_FRESNEL: {
			// ...
		}
		}
		// ...
		current_idx += n_point;
	}
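	// NOTE (assumption, not in the original listing): current_idx advances by n_point per
	// device, so each GPU receives a contiguous slice of h_vertex_data; the per-device
	// n_point values appear to come from workload-splitting logic elided above.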
	for (uint ch = 0; ch < nChannel; ch++)
	{
		base_config->k = context_.k = (2 * M_PI) / context_.wave_length[ch];
		base_config->lambda = context_.wave_length[ch];

		// Launch the per-channel kernel on every device.
		for (int devID = 0; devID < gpu_num && devID < MAX_GPU; devID++)
		{
			// ...
			switch (diff_flag)
			{
			// ... (PC_DIFF_RS case elided)
			case PC_DIFF_FRESNEL: {
				// ...
			}
			}
			// ...
		}

		// Wait for every device to finish.
		for (int devID = 0; devID < gpu_num && devID < MAX_GPU; devID++)
		{
			// ...
			cudaDeviceSynchronize();
			// ...
		}

		// Gather the partial fields onto device 0 and accumulate them into d_dst.
		for (int devID = 0; devID < gpu_num && devID < MAX_GPU; devID++)
		{
			// ...
			cudaMemcpyPeer(d_tmp[0], 0, d_tmp[devID], devID, bufferSize);
			// ...
			sum_Kernel(gridSize, blockSize, d_dst, d_tmp[0], pnXY);
			cudaDeviceSynchronize();
			// ...
		}

		HANDLE_ERROR(cudaMemcpy(complex_H[ch], d_dst, bufferSize, cudaMemcpyDeviceToHost));
		// ...
		m_nProgress = (ch + 1) * 100 / nChannel;
	}
	// Per-device cleanup.
	for (int devID = 0; devID < gpu_num && devID < MAX_GPU; devID++)
	{
		// ...
		cudaSetDevice(devID);
		// ...
	}
	// ...
	if (is_ViewingWindow) {
		delete[] h_vertex_data;
	}
	// ...
}
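The bodies of cudaPointCloud_RS, cudaPointCloud_Fresnel and sum_Kernel do not appear in this listing. Purely for orientation, the sketch below shows the usual one-thread-per-pixel shape of such a Rayleigh-Sommerfeld accumulation kernel. It is not OpenHolo code: every name in it (pointCloudRS_sketch, VertexSketch, PointCloudConfigSketch and its fields) is invented for illustration, and constant factors are simplified.

// Hypothetical sketch of a point-cloud Rayleigh-Sommerfeld accumulation kernel.
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <math.h>

struct VertexSketch { double x, y, z, amp; };

struct PointCloudConfigSketch {
	int n_points;      // number of point sources
	int nx, ny;        // hologram resolution
	double ppx, ppy;   // pixel pitch [m]
	double k;          // wave number, 2 * pi / lambda
	double lambda;     // wavelength [m]
};

__global__ void pointCloudRS_sketch(const VertexSketch* pc,
                                    cuDoubleComplex* dst,
                                    PointCloudConfigSketch cfg)
{
	unsigned long long tid =
		(unsigned long long)blockIdx.x * blockDim.x + threadIdx.x;
	unsigned long long pnXY = (unsigned long long)cfg.nx * cfg.ny;
	if (tid >= pnXY) return;   // guard: grid size is rounded up, the last block may overshoot

	// Pixel coordinates, centred on the hologram plane.
	double x = ((double)(tid % cfg.nx) - cfg.nx / 2.0) * cfg.ppx;
	double y = (cfg.ny / 2.0 - (double)(tid / cfg.nx)) * cfg.ppy;

	double re = 0.0, im = 0.0;
	for (int p = 0; p < cfg.n_points; ++p) {
		double dx = x - pc[p].x;
		double dy = y - pc[p].y;
		double dz = pc[p].z;
		double r  = sqrt(dx * dx + dy * dy + dz * dz);
		// Spherical wave weighted by the z / r^2 falloff of the first
		// Rayleigh-Sommerfeld solution (constant factors omitted).
		double a  = pc[p].amp * dz / (cfg.lambda * r * r);
		double ph = cfg.k * r;
		re += a * cos(ph);
		im += a * sin(ph);
	}
	dst[tid] = make_cuDoubleComplex(re, im);
}

Launched with the rounded-up grid computed above, e.g. pointCloudRS_sketch<<<gridSize, blockSize>>>(d_vertex, d_dst, cfg), the early return absorbs the threads of the final, partially filled block; a Fresnel variant would differ only in the per-point phase and amplitude expression.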