Openholo  v5.0
Open Source Digital Holographic Library
cudaWrapper.cpp
Go to the documentation of this file.
1 #include "cudaWrapper.h"
2 #include "define.h"
3 
4 static void HandleError(cudaError_t err,
5  const char *file,
6  int line) {
7  if (err != cudaSuccess) {
8  printf("%s in %s at line %d\n", cudaGetErrorString(err),
9  file, line);
10  exit(EXIT_FAILURE);
11  }
12 }
// Wraps a CUDA runtime call: on failure, prints the error and its call site, then exits.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


// Aborts with a diagnostic when a host-side allocation returned NULL.
#define HANDLE_NULL( a ) {if (a == NULL) { \
 printf( "Host memory failed in %s at line %d\n", \
 __FILE__, __LINE__ ); \
 exit( EXIT_FAILURE );}}
20 
21 
// Singleton instance pointer; created on first use.
cudaWrapper* cudaWrapper::instance = nullptr;
// NOTE(review): presumably guards singleton construction — confirm against the header's getInstance().
std::mutex cudaWrapper::mtx;
24 
25 cudaWrapper::cudaWrapper()
26  : m_nThread(512)
27  , num_gpu(1)
28  , cur_gpu(0)
29 {
30  initGPU();
31 }
32 
33 
34 cudaWrapper::~cudaWrapper()
35 {
36 }
37 
38 
39 void cudaWrapper::initGPU()
40 {
41  cudaGetDeviceCount(&num_gpu);
42  printf("%d GPU Detected.\n", num_gpu);
43  active_gpus = num_gpu;
44  cudaDeviceProp devProp;
45 
46  for (int i = 0; i < num_gpu && i < MAX_GPU; i++)
47  {
48  int devID;
49  cudaError_t res = cudaGetDevice(&devID);
50  if (res == cudaSuccess) {
51  cudaSetDevice(i);
52  cur_gpu = i;
53  HANDLE_ERROR(cudaGetDeviceProperties(&devProp, cur_gpu));
54  devProps[i] = devProp;
55  int smPerCore = getSMPerCore(devProp.major, devProp.minor);
56  cuda_cores[i] = smPerCore * devProp.multiProcessorCount;
57 
58  sprintf(devInfos[i][DEVICE_INFO::DEVICE_NAME], "GPU Spec: %s\n", devProp.name);
59 #ifdef _WIN64
60  sprintf(devInfos[i][DEVICE_INFO::GLOBAL_MEMORY], "Global Memory: %llu\n", devProp.totalGlobalMem);
61  sprintf(devInfos[i][DEVICE_INFO::CONSTANT_MEMORY], "Const Memory: %llu\n", devProp.totalConstMem);
62 #else
63  sprintf(devInfos[i][DEVICE_INFO::GLOBAL_MEMORY], "Global Memory: %zu\n", devProp.totalGlobalMem);
64  sprintf(devInfos[i][DEVICE_INFO::CONSTANT_MEMORY], "Const Memory: %zu\n", devProp.totalConstMem);
65 #endif
66  sprintf(devInfos[i][DEVICE_INFO::MANAGED_MEMORY], "Managed Memory: %d\n", devProp.managedMemory);
67  sprintf(devInfos[i][DEVICE_INFO::MP_COUNT], "MP(Multiprocessor) Count : %d\n", devProp.multiProcessorCount);
68 
69  sprintf(devInfos[i][DEVICE_INFO::TOTAL_MP_COUNT], "Total MP Count: %d\n", cuda_cores[i]);
70  sprintf(devInfos[i][DEVICE_INFO::MAX_THREADS_PER_MP], "Maximum Threads per Block : %d\n", devProp.maxThreadsPerBlock);
71 
72  sprintf(devInfos[i][DEVICE_INFO::WARP_SIZE], "Warp Size : %u\n", devProp.warpSize);
73  sprintf(devInfos[i][DEVICE_INFO::BLOCK_PER_MP], "Block per MP : %d\n", devProp.maxThreadsPerMultiProcessor / devProp.maxThreadsPerBlock);
74  sprintf(devInfos[i][DEVICE_INFO::SHARED_MEMORY_PER_MP], "Shared Memory per MP : %llu\n", devProp.sharedMemPerMultiprocessor);
75  sprintf(devInfos[i][DEVICE_INFO::SHARED_MEMORY_PER_BLOCK], "Shared Memory per Block : %llu\n", devProp.sharedMemPerBlock);
76  sprintf(devInfos[i][DEVICE_INFO::MAX_THREADS_PER_BLOCK], "Maximum Threads per Block : %d\n", devProp.maxThreadsPerBlock);
77  sprintf(devInfos[i][DEVICE_INFO::MAX_THREADS_DIMENSION], "Maximum Threads of each Dimension of a Block (X: %d / Y: %d / Z: %d)\n",
78  devProp.maxThreadsDim[_X], devProp.maxThreadsDim[_Y], devProp.maxThreadsDim[_Z]);
79  sprintf(devInfos[i][DEVICE_INFO::MAX_GRID_SIZE], "Maximum Blocks of each Dimension of a Grid, (X: %d / Y: %d / Z: %d)\n",
80  devProp.maxGridSize[_X], devProp.maxGridSize[_Y], devProp.maxGridSize[_Z]);
81  }
82  else {
83  printf("<FAILED> cudaGetDevice(%d)\n", i);
84  }
85  }
86  printDevInfo();
87 }
88 
89 bool cudaWrapper::printDevInfo()
90 {
91  cudaDeviceProp devProp;
92 
93  for (int i = 0; i < num_gpu && i < MAX_GPU; i++)
94  {
95  devProp = devProps[i];
96  printf("%d] ", i);
97  for (int j = 0; j < MAX_INFO; j++)
98  {
99  printf("%s", devInfos[i][j]);
100  }
101  }
102 
103  return true;
104 }
105 
107 {
108  cudaError_t ret = cudaSuccess;
109 
110  int old_idx = 0;
111  if (cudaGetDevice(&old_idx) == cudaSuccess) {
112  if (idx < active_gpus && idx < MAX_GPU) {
113  if (cudaSetDevice(idx) == cudaSuccess) {
114  size_t free, total;
115  if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
116  uint64_t gb = 1024 * 1024 * 1024;
117  printf("%d] CUDA Memory: %.1f/%.1fGB\n", idx, static_cast<double>(total - free) / gb, static_cast<double>(total) / gb);
118  }
119  }
120  }
121  cudaSetDevice(old_idx);
122  }
123 }
124 
125 int cudaWrapper::getSMPerCore(int major, int minor)
126 {
127  int smPerMultiproc = 0;
128 
129  switch (major) {
130  case 2: // Fermi
131  smPerMultiproc = 32;
132  break;
133  case 3: // Kepler
134  smPerMultiproc = 192;
135  break;
136  case 5: // Maxwell
137  smPerMultiproc = 128;
138  break;
139  case 6: // Pascal
140  smPerMultiproc = (minor == 1) ? 128 : 64;
141  break;
142  case 7: // Volta, Turing
143  smPerMultiproc = 64;
144  break;
145  case 8: // Ampere, Ada Lovelace
146  smPerMultiproc = 128;
147  break;
148  default:
149  printf("<FAILED> Unsupported cudaWrapper architecture");
150  }
151 
152  return smPerMultiproc;
153 }
154 
155 
157 {
158  int total_cores = 0;
159  int max_core = 0;
160  int max_idx = 0;
161  int total_workload = 0;
162  for (int i = 0; i < active_gpus; i++)
163  {
164  work_load[i] = 0;
165  total_cores += cuda_cores[i];
166  if (cuda_cores[i] > max_core) {
167  max_core = cuda_cores[i];
168  max_idx = i;
169  }
170  }
171  // distributed data
172  for (int i = 0; i < active_gpus; i++)
173  {
174  work_load[i] = (size * cuda_cores[i]) / total_cores;
175  total_workload += work_load[i];
176  }
177  // added loss data
178  work_load[max_idx] += (size - total_workload);
179 }
#define HANDLE_ERROR(err)
Definition: cudaWrapper.cpp:13
void printMemoryInfo(int idx)
#define _Y
Definition: define.h:96
void setWorkload(int size)
#define _X
Definition: define.h:92
#define MAX_INFO
Definition: cudaWrapper.h:10
#define MAX_GPU
Definition: cudaWrapper.h:9
#define _Z
Definition: define.h:100