52 void ophPointCloud::genCghPointCloudGPU(
uint diff_flag)
60 cl_mem device_pc_data;
61 cl_mem device_amp_data;
70 const int n_colors = pc_data_.
n_colors;
71 Real* host_pc_data =
nullptr;
72 Real* host_amp_data = pc_data_.color;
73 Real* host_dst =
nullptr;
76 if (is_ViewingWindow) {
77 host_pc_data =
new Real[n_points * 3];
78 transVW(n_points * 3, host_pc_data, pc_data_.vertex);
81 host_pc_data = pc_data_.vertex;
85 bool bIsGrayScale = n_colors == 1 ?
true :
false;
91 cl_kernel* current_kernel =
nullptr;
93 host_dst =
new Real[pnXY * 2];
94 memset(host_dst, 0., bufferSize * 2);
96 current_kernel = diff_flag ==
PC_DIFF_RS ? &kernel[0] : &kernel[1];
98 device_pc_data = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(
Real) * n_points * 3,
nullptr, &nErr);
99 device_amp_data = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(
Real) * n_points * n_colors,
nullptr, &nErr);
100 nErr = clEnqueueWriteBuffer(commands, device_pc_data, CL_TRUE, 0,
sizeof(
Real) * n_points * 3, host_pc_data, 0,
nullptr,
nullptr);
101 nErr = clEnqueueWriteBuffer(commands, device_amp_data, CL_TRUE, 0,
sizeof(
Real) * n_points * n_colors, host_amp_data, 0,
nullptr,
nullptr);
103 device_result = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(
Real) * pnXY * 2,
nullptr, &nErr);
107 size_t local[2] = { 32, 32 };
109 clSetKernelArg(*current_kernel, 1,
sizeof(cl_mem), &device_pc_data);
110 clSetKernelArg(*current_kernel, 2,
sizeof(cl_mem), &device_amp_data);
111 clSetKernelArg(*current_kernel, 4,
sizeof(
uint), &n_points);
112 for (
uint ch = 0; ch < nChannel; ch++)
114 uint nAdd = bIsGrayScale ? 0 : ch;
119 n_points, n_colors, 1,
132 device_config = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(
GpuConstNERS),
nullptr, &nErr);
134 nErr = clEnqueueWriteBuffer(commands, device_result, CL_TRUE, 0,
sizeof(
Real) * pnXY * 2, host_dst, 0,
nullptr,
nullptr);
135 nErr = clEnqueueWriteBuffer(commands, device_config, CL_TRUE, 0,
sizeof(
GpuConstNERS), host_config, 0,
nullptr,
nullptr);
140 device_config = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(
GpuConstNEFR),
nullptr, &nErr);
142 nErr = clEnqueueWriteBuffer(commands, device_result, CL_TRUE, 0,
sizeof(
Real) * pnXY * 2, host_dst, 0,
nullptr,
nullptr);
143 nErr = clEnqueueWriteBuffer(commands, device_config, CL_TRUE, 0,
sizeof(
GpuConstNEFR), host_config, 0,
nullptr,
nullptr);
146 clSetKernelArg(*current_kernel, 0,
sizeof(cl_mem), &device_result);
147 clSetKernelArg(*current_kernel, 3,
sizeof(cl_mem), &device_config);
148 clSetKernelArg(*current_kernel, 5,
sizeof(
uint), &ch);
150 nErr = clEnqueueNDRangeKernel(commands, *current_kernel, 2,
nullptr, global,
nullptr, 0,
nullptr,
nullptr);
154 nErr = clFinish(commands);
156 if (nErr != CL_SUCCESS) cl->
errorCheck(nErr,
"Check", __FILE__, __LINE__);
158 nErr = clEnqueueReadBuffer(commands, device_result, CL_TRUE, 0,
sizeof(
Real) * pnXY * 2,
complex_H[ch], 0,
nullptr,
nullptr);
162 m_nProgress = (ch + 1) * 100 / nChannel;
165 clReleaseMemObject(device_result);
166 clReleaseMemObject(device_amp_data);
167 clReleaseMemObject(device_pc_data);
168 if (host_dst)
delete[] host_dst;
169 if (is_ViewingWindow && host_pc_data)
delete[] host_pc_data;
177 void ophPointCloud::genCghPointCloudGPU(
uint diff_flag)
179 if ((diff_flag != PC_DIFF_RS) && (diff_flag != PC_DIFF_FRESNEL))
181 LOG(
"<FAILED> Wrong parameters.");
187 const ulonglong pnXY = context_.pixel_number[
_X] * context_.pixel_number[
_Y];
189 ulonglong gridSize = (pnXY + blockSize - 1) / blockSize;
191 cout <<
">>> All " << blockSize * gridSize <<
" threads in CUDA" << endl;
192 cout <<
">>> " << blockSize <<
" threads/block, " << gridSize <<
" blocks/grid" << endl;
196 if (getStream() == 0)
197 n_streams = pc_data_.n_points / 300 + 1;
198 else if (getStream() < 0)
200 LOG(
"<FAILED> Wrong parameters.");
204 n_streams = getStream();
219 const ulonglong bufferSize = pnXY *
sizeof(cuDoubleComplex);
220 int n_points = pc_data_.n_points;
223 Vertex* host_vertex_data =
nullptr;
224 if (!is_ViewingWindow)
225 host_vertex_data = pc_data_.vertices;
228 host_vertex_data =
new Vertex[n_points];
229 std::memcpy(host_vertex_data, pc_data_.vertices,
sizeof(
Vertex) * n_points);
230 transVW(n_points, host_vertex_data, host_vertex_data);
232 cuDoubleComplex* host_dst =
nullptr;
233 host_dst =
new cuDoubleComplex[pnXY];
234 memset(host_dst, 0, bufferSize);
236 Vertex* device_vertex_data;
239 cuDoubleComplex* device_dst =
nullptr;
240 HANDLE_ERROR(cudaMalloc((
void**)&device_dst, bufferSize));
241 HANDLE_ERROR(cudaMemsetAsync(device_dst, 0., bufferSize));
243 uint nChannel = context_.waveNum;
245 cudaMemGetInfo(&free, &total);
248 LOG(
"CUDA Memory Total: %llu (byte) / Free: %llu (byte)\n", total, free);
250 LOG(
"CUDA Memory Total: %zu (byte) / Free: %zu (byte)\n", total, free);
252 for (
uint ch = 0; ch < nChannel; ch++)
254 context_.k = (2 *
M_PI) / context_.wave_length[ch];
258 pc_config_.scale, pc_config_.distance,
259 context_.pixel_number,
261 context_.pixel_pitch,
264 context_.wave_length[ch]
276 case PC_DIFF_FRESNEL: {
284 int stream_points = n_points / n_streams;
285 int remainder = n_points % n_streams;
289 for (
int i = 0; i < n_streams; ++i) {
290 offset = i * stream_points;
291 if (i == n_streams - 1) {
292 stream_points += remainder;
295 HANDLE_ERROR(cudaMemcpyAsync(device_vertex_data + offset, host_vertex_data + offset, stream_points *
sizeof(
Vertex), cudaMemcpyHostToDevice));
302 case PC_DIFF_FRESNEL: {
308 cudaError error = cudaGetLastError();
309 if (error != cudaSuccess) {
310 LOG(
"cudaGetLastError(): %s\n", cudaGetErrorName(error));
311 if (error == cudaErrorLaunchOutOfResources) {
318 HANDLE_ERROR(cudaMemcpyAsync(host_dst, device_dst, bufferSize, cudaMemcpyDeviceToHost));
319 HANDLE_ERROR(cudaMemsetAsync(device_dst, 0., bufferSize));
322 complex_H[ch][n][
_RE] += host_dst[n].x;
323 complex_H[ch][n][
_IM] += host_dst[n].y;
326 m_nProgress = (int)((
Real)(ch*n_streams + i + 1) * 100 / ((
Real)n_streams * nChannel));
339 if (is_ViewingWindow) {
340 delete[] host_vertex_data;
static CUDA * getInstance()
void errorCheck(cl_int err, const char *operation, char *filename, int line)
#define HANDLE_ERROR(err)
Real distance
Offset value of point cloud.
cl_context & getContext()
Openholo Point Cloud based CGH generation with CUDA GPGPU.
void cudaPointCloud_RS(const int &nBlocks, const int &nThreads, const int &n_pts_per_stream, Vertex *cuda_vertex_data, cuDoubleComplex *cuda_dst, const GpuConstNERS *cuda_config, const uint &iChannel, const uint &mode)
KernelConst_NotEncodedFrsn GpuConstNEFR
vec3 scale
Scaling factor of coordinate of point cloud.
unsigned long long ulonglong
static OpenCL * getInstance()
KernelConst_NotEncodedRS GpuConstNERS
#define ELAPSED_TIME(x, y)
int n_colors
Number of color channel.
cl_command_queue & getCommand()
void cudaPointCloud_Fresnel(const int &nBlocks, const int &nThreads, const int &n_pts_per_stream, Vertex *cuda_vertex_data, cuDoubleComplex *cuda_dst, const GpuConstNEFR *cuda_config, const uint &iChannel, const uint &mode)
Complex< Real > ** complex_H
void transVW(int nVertex, Vertex *dst, Vertex *src)