// Fatal CUDA error handler (CUDA-by-example style): if `err` is not
// cudaSuccess, print the runtime's error string with the failing
// source location and terminate the process.
static void HandleError(cudaError_t err, const char* file, int line)
{
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err),
               file, line);
        exit(EXIT_FAILURE);
    }
}

// Wrap a CUDA runtime call so failures abort with file/line context.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

// Abort when a host-side allocation returned NULL.
#define HANDLE_NULL( a ) {if (a == NULL) { \
                            printf( "Host memory failed in %s at line %d\n", \
                                    __FILE__, __LINE__ ); \
                            exit( EXIT_FAILURE );}}

// Definition of the class-wide mutex declared in the header.
// NOTE(review): what it guards isn't visible here — presumably the
// wrapper's shared device state; confirm against the header.
std::mutex cudaWrapper::mtx;
25 cudaWrapper::cudaWrapper()
34 cudaWrapper::~cudaWrapper()
// Enumerates CUDA devices (capped at MAX_GPU), caches each device's
// properties and CUDA-core count, and pre-formats the human-readable
// info strings later emitted by printDevInfo().
void cudaWrapper::initGPU()
{
    cudaGetDeviceCount(&num_gpu);
    printf("%d GPU Detected.\n", num_gpu);
    active_gpus = num_gpu;
    cudaDeviceProp devProp;

    for (int i = 0; i < num_gpu && i < MAX_GPU; i++)
    {
        cudaError_t res = cudaGetDevice(&devID);
        if (res == cudaSuccess) {
            // NOTE(review): properties are queried for `cur_gpu`, not the
            // loop index i — confirm cur_gpu is advanced per iteration,
            // otherwise every devProps[i] slot describes the same device.
            HANDLE_ERROR(cudaGetDeviceProperties(&devProp, cur_gpu));
            devProps[i] = devProp;
            int smPerCore = getSMPerCore(devProp.major, devProp.minor);
            cuda_cores[i] = smPerCore * devProp.multiProcessorCount;

            sprintf(devInfos[i][DEVICE_INFO::DEVICE_NAME],
                    "GPU Spec: %s\n", devProp.name);
            // totalGlobalMem / totalConstMem are size_t -> %zu.
            // (The original formatted these twice, once with %llu; the
            // redundant %llu pair is removed.)
            sprintf(devInfos[i][DEVICE_INFO::GLOBAL_MEMORY],
                    "Global Memory: %zu\n", devProp.totalGlobalMem);
            sprintf(devInfos[i][DEVICE_INFO::CONSTANT_MEMORY],
                    "Const Memory: %zu\n", devProp.totalConstMem);
            sprintf(devInfos[i][DEVICE_INFO::MANAGED_MEMORY],
                    "Managed Memory: %d\n", devProp.managedMemory);
            sprintf(devInfos[i][DEVICE_INFO::MP_COUNT],
                    "MP(Multiprocessor) Count : %d\n", devProp.multiProcessorCount);
            // cuda_cores[i] = cores-per-SM * SM count (total CUDA cores).
            sprintf(devInfos[i][DEVICE_INFO::TOTAL_MP_COUNT],
                    "Total MP Count: %d\n", cuda_cores[i]);
            // Fixed: this entry previously duplicated the per-block limit
            // ("Maximum Threads per Block" / maxThreadsPerBlock).
            sprintf(devInfos[i][DEVICE_INFO::MAX_THREADS_PER_MP],
                    "Maximum Threads per MP : %d\n", devProp.maxThreadsPerMultiProcessor);
            // warpSize is int -> %d (was %u).
            sprintf(devInfos[i][DEVICE_INFO::WARP_SIZE],
                    "Warp Size : %d\n", devProp.warpSize);
            sprintf(devInfos[i][DEVICE_INFO::BLOCK_PER_MP],
                    "Block per MP : %d\n",
                    devProp.maxThreadsPerMultiProcessor / devProp.maxThreadsPerBlock);
            // sharedMemPerMultiprocessor / sharedMemPerBlock are size_t
            // -> %zu (was %llu, which is wrong on LP64 Linux).
            sprintf(devInfos[i][DEVICE_INFO::SHARED_MEMORY_PER_MP],
                    "Shared Memory per MP : %zu\n", devProp.sharedMemPerMultiprocessor);
            sprintf(devInfos[i][DEVICE_INFO::SHARED_MEMORY_PER_BLOCK],
                    "Shared Memory per Block : %zu\n", devProp.sharedMemPerBlock);
            sprintf(devInfos[i][DEVICE_INFO::MAX_THREADS_PER_BLOCK],
                    "Maximum Threads per Block : %d\n", devProp.maxThreadsPerBlock);
            sprintf(devInfos[i][DEVICE_INFO::MAX_THREADS_DIMENSION],
                    "Maximum Threads of each Dimension of a Block (X: %d / Y: %d / Z: %d)\n",
                    devProp.maxThreadsDim[_X], devProp.maxThreadsDim[_Y],
                    devProp.maxThreadsDim[_Z]);
            sprintf(devInfos[i][DEVICE_INFO::MAX_GRID_SIZE],
                    "Maximum Blocks of each Dimension of a Grid, (X: %d / Y: %d / Z: %d)\n",
                    devProp.maxGridSize[_X], devProp.maxGridSize[_Y],
                    devProp.maxGridSize[_Z]);
        } else {
            printf("<FAILED> cudaGetDevice(%d)\n", i);
        }
    }
}
// Prints the per-device info strings pre-formatted by initGPU() for
// every detected GPU (capped at MAX_GPU). Returns true on completion.
bool cudaWrapper::printDevInfo()
{
    for (int i = 0; i < num_gpu && i < MAX_GPU; i++)
    {
        // Emit every DEVICE_INFO slot in declaration order.
        // NOTE(review): the loop bound was lost in extraction; DEVICE_NAME
        // is the first and MAX_GRID_SIZE the last entry filled by
        // initGPU() — confirm against the DEVICE_INFO enum.
        for (int j = DEVICE_INFO::DEVICE_NAME; j <= DEVICE_INFO::MAX_GRID_SIZE; j++)
        {
            printf("%s", devInfos[i][j]);
        }
    }
    // (Removed an unused local cudaDeviceProp copy.)
    return true;
}
// Prints used/total device memory (in GiB) for device `idx`, then
// restores the previously active device. Out-of-range idx is a no-op.
void cudaWrapper::printMemoryInfo(int idx)
{
    int old_idx = 0;
    if (cudaGetDevice(&old_idx) == cudaSuccess) {
        // Added idx >= 0 guard: a negative index previously passed the
        // upper-bound checks straight into cudaSetDevice.
        if (idx >= 0 && idx < active_gpus && idx < MAX_GPU) {
            if (cudaSetDevice(idx) == cudaSuccess) {
                size_t free = 0, total = 0;
                if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
                    // Bytes per GiB; double to keep the division exact
                    // enough for one-decimal display.
                    const double gb = 1024.0 * 1024.0 * 1024.0;
                    printf("%d] CUDA Memory: %.1f/%.1fGB\n", idx,
                           static_cast<double>(total - free) / gb,
                           static_cast<double>(total) / gb);
                }
            }
            // Always restore the caller's device selection.
            cudaSetDevice(old_idx);
        }
    }
}
// Maps a device compute capability (major.minor) to the number of
// CUDA cores per multiprocessor, in the style of helper_cuda.h's
// _ConvertSMVer2Cores. Returns 0 for unrecognized architectures.
// NOTE(review): the case labels for the elided branches (Volta/Turing,
// Ampere) were reconstructed from the visible fragments — confirm
// against the CUDA samples' _ConvertSMVer2Cores table.
int cudaWrapper::getSMPerCore(int major, int minor)
{
    int smPerMultiproc = 0;

    switch (major) {
    case 3:  // Kepler
        smPerMultiproc = 192;
        break;
    case 5:  // Maxwell
        smPerMultiproc = 128;
        break;
    case 6:  // Pascal: 6.1 has 128 cores/SM, 6.0/6.2 have 64
        smPerMultiproc = (minor == 1) ? 128 : 64;
        break;
    case 7:  // Volta / Turing
        smPerMultiproc = 64;
        break;
    case 8:  // Ampere
        smPerMultiproc = 128;
        break;
    default:
        printf("<FAILED> Unsupported cudaWrapper architecture");
        break;
    }

    return smPerMultiproc;
}
// Splits `size` work items across the active GPUs proportionally to
// each device's CUDA-core count. The integer-division remainder is
// assigned to the most capable GPU so the per-device workloads always
// sum exactly to `size`.
void cudaWrapper::setWorkload(int size)
{
    int total_cores = 0;
    int max_core = 0;
    int max_idx = 0;
    int total_workload = 0;

    // First pass: total core count and the index of the strongest GPU.
    for (int i = 0; i < active_gpus; i++)
    {
        total_cores += cuda_cores[i];
        if (cuda_cores[i] > max_core) {
            max_core = cuda_cores[i];
            max_idx = i;
        }
    }

    // Guard: no GPUs (or zero reported cores) — avoid division by zero.
    if (total_cores <= 0) {
        return;
    }

    // Second pass: proportional split. Widen to 64-bit so
    // size * cuda_cores[i] cannot overflow int for large sizes.
    for (int i = 0; i < active_gpus; i++)
    {
        work_load[i] = static_cast<int>(
            (static_cast<long long>(size) * cuda_cores[i]) / total_cores);
        total_workload += work_load[i];
    }

    // Hand the rounding remainder to the strongest device.
    work_load[max_idx] += (size - total_workload);
}
#define HANDLE_ERROR(err)
void printMemoryInfo(int idx)
void setWorkload(int size)