00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
00013 #define _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
00014
00015 #ifdef _WIN32
00016 #ifdef _DEBUG // Do this only in debug mode...
00017 # define WINDOWS_LEAN_AND_MEAN
00018 # include <windows.h>
00019 # include <stdlib.h>
00020 # undef min
00021 # undef max
00022 #endif
00023 #endif
00024
00025 #include <stdio.h>
00026 #include <string.h>
00027 #include <stdlib.h>
00028
00029 #include <cufft.h>
00030 #include <curand.h>
00031
00032
00033
00034 #define cutilSafeCallNoSync(err) __cudaSafeCallNoSync(err, __FILE__, __LINE__)
00035 #define cutilSafeCall(err) __cudaSafeCall (err, __FILE__, __LINE__)
00036 #define cutilSafeThreadSync() __cudaSafeThreadSync(__FILE__, __LINE__)
00037 #define cufftSafeCall(err) __cufftSafeCall (err, __FILE__, __LINE__)
00038 #define curandSafeCall(err) __curandSafeCall (err, __FILE__, __LINE__)
00039 #define cutilCheckError(err) __cutilCheckError (err, __FILE__, __LINE__)
00040 #define cutilCheckMsg(msg) __cutilGetLastError (msg, __FILE__, __LINE__)
00041 #define cutilCheckMsgAndSync(msg) __cutilGetLastErrorAndSync (msg, __FILE__, __LINE__)
00042 #define cutilSafeMalloc(mallocCall) __cutilSafeMalloc ((mallocCall), __FILE__, __LINE__)
00043 #define cutilCondition(val) __cutilCondition (val, __FILE__, __LINE__)
00044 #define cutilExit(argc, argv) __cutilExit (argc, argv)
00045
00046 inline cudaError cutilDeviceSynchronize()
00047 {
00048 #if CUDART_VERSION >= 4000
00049 return cudaDeviceSynchronize();
00050 #else
00051 return cudaThreadSynchronize();
00052 #endif
00053 }
00054
00055 inline cudaError cutilDeviceReset()
00056 {
00057 #if CUDART_VERSION >= 4000
00058 return cudaDeviceReset();
00059 #else
00060 return cudaThreadExit();
00061 #endif
00062 }
00063
00064 inline void __cutilCondition(int val, char *file, int line)
00065 {
00066 if( CUTFalse == cutCheckCondition( val, file, line ) ) {
00067 exit(EXIT_FAILURE);
00068 }
00069 }
00070
00071 inline void __cutilExit(int argc, char **argv)
00072 {
00073 if (!cutCheckCmdLineFlag(argc, (const char**)argv, "noprompt")) {
00074 printf("\nPress ENTER to exit...\n");
00075 fflush( stdout);
00076 fflush( stderr);
00077 getchar();
00078 }
00079 exit(EXIT_SUCCESS);
00080 }
00081
00082 #define MIN(a,b) ((a < b) ? a : b)
00083 #define MAX(a,b) ((a > b) ? a : b)
00084
00085
00086 inline int _ConvertSMVer2Cores_local(int major, int minor)
00087 {
00088
00089 typedef struct {
00090 int SM;
00091 int Cores;
00092 } sSMtoCores;
00093
00094 sSMtoCores nGpuArchCoresPerSM[] =
00095 { { 0x10, 8 },
00096 { 0x11, 8 },
00097 { 0x12, 8 },
00098 { 0x13, 8 },
00099 { 0x20, 32 },
00100 { 0x21, 48 },
00101 { -1, -1 }
00102 };
00103
00104 int index = 0;
00105 while (nGpuArchCoresPerSM[index].SM != -1) {
00106 if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
00107 return nGpuArchCoresPerSM[index].Cores;
00108 }
00109 index++;
00110 }
00111 printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
00112 return -1;
00113 }
00114
00115
00116
00117 inline int cutGetMaxGflopsDeviceId()
00118 {
00119 int current_device = 0, sm_per_multiproc = 0;
00120 int max_compute_perf = 0, max_perf_device = 0;
00121 int device_count = 0, best_SM_arch = 0;
00122 cudaDeviceProp deviceProp;
00123
00124 cudaGetDeviceCount( &device_count );
00125
00126 while ( current_device < device_count ) {
00127 cudaGetDeviceProperties( &deviceProp, current_device );
00128 if (deviceProp.major > 0 && deviceProp.major < 9999) {
00129 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
00130 }
00131 current_device++;
00132 }
00133
00134
00135 current_device = 0;
00136 while( current_device < device_count ) {
00137 cudaGetDeviceProperties( &deviceProp, current_device );
00138 if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
00139 sm_per_multiproc = 1;
00140 } else {
00141 sm_per_multiproc = _ConvertSMVer2Cores_local(deviceProp.major, deviceProp.minor);
00142 }
00143
00144 int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
00145 if( compute_perf > max_compute_perf ) {
00146
00147 if ( best_SM_arch > 2 ) {
00148
00149 if (deviceProp.major == best_SM_arch) {
00150 max_compute_perf = compute_perf;
00151 max_perf_device = current_device;
00152 }
00153 } else {
00154 max_compute_perf = compute_perf;
00155 max_perf_device = current_device;
00156 }
00157 }
00158 ++current_device;
00159 }
00160 return max_perf_device;
00161 }
00162
00163
00164 inline int cutGetMaxGflopsGraphicsDeviceId()
00165 {
00166 int current_device = 0, sm_per_multiproc = 0;
00167 int max_compute_perf = 0, max_perf_device = 0;
00168 int device_count = 0, best_SM_arch = 0;
00169 int bTCC = 0;
00170 cudaDeviceProp deviceProp;
00171
00172 cudaGetDeviceCount( &device_count );
00173
00174 while ( current_device < device_count ) {
00175 cudaGetDeviceProperties( &deviceProp, current_device );
00176
00177 #if CUDA_VERSION >= 3020
00178 if (deviceProp.tccDriver) bTCC = 1;
00179 #else
00180
00181 if (deviceProp.name[0] == 'T') bTCC = 1;
00182 #endif
00183
00184 if (!bTCC) {
00185 if (deviceProp.major > 0 && deviceProp.major < 9999) {
00186 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
00187 }
00188 }
00189 current_device++;
00190 }
00191
00192
00193 current_device = 0;
00194 while( current_device < device_count ) {
00195 cudaGetDeviceProperties( &deviceProp, current_device );
00196 if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
00197 sm_per_multiproc = 1;
00198 } else {
00199 sm_per_multiproc = _ConvertSMVer2Cores_local(deviceProp.major, deviceProp.minor);
00200 }
00201
00202 #if CUDA_VERSION >= 3020
00203 if (deviceProp.tccDriver) bTCC = 1;
00204 #else
00205
00206 if (deviceProp.name[0] == 'T') bTCC = 1;
00207 #endif
00208
00209 if (!bTCC)
00210 {
00211 int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
00212 if( compute_perf > max_compute_perf ) {
00213
00214 if ( best_SM_arch > 2 ) {
00215
00216 if (deviceProp.major == best_SM_arch) {
00217 max_compute_perf = compute_perf;
00218 max_perf_device = current_device;
00219 }
00220 } else {
00221 max_compute_perf = compute_perf;
00222 max_perf_device = current_device;
00223 }
00224 }
00225 }
00226 ++current_device;
00227 }
00228 return max_perf_device;
00229 }
00230
00231
00232 #ifdef _WIN32
00233 # if 1//ndef UNICODE
00234 # ifdef _DEBUG // Do this only in debug mode...
00235 inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
00236 {
00237 size_t fmt2_sz = 2048;
00238 char *fmt2 = (char*)malloc(fmt2_sz);
00239 va_list vlist;
00240 va_start(vlist, fmt);
00241 while((_vsnprintf(fmt2, fmt2_sz, fmt, vlist)) < 0)
00242 {
00243 fmt2_sz *= 2;
00244 if(fmt2) free(fmt2);
00245 fmt2 = (char*)malloc(fmt2_sz);
00246 }
00247 OutputDebugStringA(fmt2);
00248 fprintf(file, fmt2);
00249 free(fmt2);
00250 }
00251 # define FPRINTF(a) VSPrintf a
00252 # else //debug
00253 # define FPRINTF(a) fprintf a
00254
00255 # endif //debug
00256 # else //unicode
00257
00258 # define FPRINTF(a) fprintf a
00259 # endif //unicode
00260 #else //win32
00261 # define FPRINTF(a) fprintf a
00262 #endif //win32
00263
00264
00265
00266
00267 inline void __cudaSafeCallNoSync( cudaError err, const char *file, const int line )
00268 {
00269 if( cudaSuccess != err) {
00270 FPRINTF((stderr, "%s(%i) : cudaSafeCallNoSync() Runtime API error %d : %s.\n",
00271 file, line, (int)err, cudaGetErrorString( err ) ));
00272 exit(-1);
00273 }
00274 }
00275
00276 inline void __cudaSafeCall( cudaError err, const char *file, const int line )
00277 {
00278 if( cudaSuccess != err) {
00279 FPRINTF((stderr, "%s(%i) : cudaSafeCall() Runtime API error %d: %s.\n",
00280 file, line, (int)err, cudaGetErrorString( err ) ));
00281 exit(-1);
00282 }
00283 }
00284
00285 inline void __cudaSafeThreadSync( const char *file, const int line )
00286 {
00287 cudaError err = cutilDeviceSynchronize();
00288 if ( cudaSuccess != err) {
00289 FPRINTF((stderr, "%s(%i) : cudaDeviceSynchronize() Runtime API error %d: %s.\n",
00290 file, line, (int)err, cudaGetErrorString( err ) ));
00291 exit(-1);
00292 }
00293 }
00294
00295 inline void __cufftSafeCall( cufftResult err, const char *file, const int line )
00296 {
00297 if( CUFFT_SUCCESS != err) {
00298 FPRINTF((stderr, "%s(%i) : cufftSafeCall() CUFFT error %d: ",
00299 file, line, (int)err));
00300 switch (err) {
00301 case CUFFT_INVALID_PLAN: FPRINTF((stderr, "CUFFT_INVALID_PLAN\n"));
00302 case CUFFT_ALLOC_FAILED: FPRINTF((stderr, "CUFFT_ALLOC_FAILED\n"));
00303 case CUFFT_INVALID_TYPE: FPRINTF((stderr, "CUFFT_INVALID_TYPE\n"));
00304 case CUFFT_INVALID_VALUE: FPRINTF((stderr, "CUFFT_INVALID_VALUE\n"));
00305 case CUFFT_INTERNAL_ERROR: FPRINTF((stderr, "CUFFT_INTERNAL_ERROR\n"));
00306 case CUFFT_EXEC_FAILED: FPRINTF((stderr, "CUFFT_EXEC_FAILED\n"));
00307 case CUFFT_SETUP_FAILED: FPRINTF((stderr, "CUFFT_SETUP_FAILED\n"));
00308 case CUFFT_INVALID_SIZE: FPRINTF((stderr, "CUFFT_INVALID_SIZE\n"));
00309 case CUFFT_UNALIGNED_DATA: FPRINTF((stderr, "CUFFT_UNALIGNED_DATA\n"));
00310 default: FPRINTF((stderr, "CUFFT Unknown error code\n"));
00311 }
00312 exit(-1);
00313 }
00314 }
00315
00316 inline void __curandSafeCall( curandStatus_t err, const char *file, const int line )
00317 {
00318 if( CURAND_STATUS_SUCCESS != err) {
00319 FPRINTF((stderr, "%s(%i) : curandSafeCall() CURAND error %d: ",
00320 file, line, (int)err));
00321 switch (err) {
00322 case CURAND_STATUS_VERSION_MISMATCH: FPRINTF((stderr, "CURAND_STATUS_VERSION_MISMATCH"));
00323 case CURAND_STATUS_NOT_INITIALIZED: FPRINTF((stderr, "CURAND_STATUS_NOT_INITIALIZED"));
00324 case CURAND_STATUS_ALLOCATION_FAILED: FPRINTF((stderr, "CURAND_STATUS_ALLOCATION_FAILED"));
00325 case CURAND_STATUS_TYPE_ERROR: FPRINTF((stderr, "CURAND_STATUS_TYPE_ERROR"));
00326 case CURAND_STATUS_OUT_OF_RANGE: FPRINTF((stderr, "CURAND_STATUS_OUT_OF_RANGE"));
00327 case CURAND_STATUS_LENGTH_NOT_MULTIPLE: FPRINTF((stderr, "CURAND_STATUS_LENGTH_NOT_MULTIPLE"));
00328 case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
00329 FPRINTF((stderr, "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"));
00330 case CURAND_STATUS_LAUNCH_FAILURE: FPRINTF((stderr, "CURAND_STATUS_LAUNCH_FAILURE"));
00331 case CURAND_STATUS_PREEXISTING_FAILURE: FPRINTF((stderr, "CURAND_STATUS_PREEXISTING_FAILURE"));
00332 case CURAND_STATUS_INITIALIZATION_FAILED:
00333 FPRINTF((stderr, "CURAND_STATUS_INITIALIZATION_FAILED"));
00334 case CURAND_STATUS_ARCH_MISMATCH: FPRINTF((stderr, "CURAND_STATUS_ARCH_MISMATCH"));
00335 case CURAND_STATUS_INTERNAL_ERROR: FPRINTF((stderr, "CURAND_STATUS_INTERNAL_ERROR"));
00336 default: FPRINTF((stderr, "CURAND Unknown error code\n"));
00337 }
00338 exit(-1);
00339 }
00340 }
00341
00342
00343 inline void __cutilCheckError( CUTBoolean err, const char *file, const int line )
00344 {
00345 if( CUTTrue != err) {
00346 FPRINTF((stderr, "%s(%i) : CUTIL CUDA error.\n",
00347 file, line));
00348 exit(-1);
00349 }
00350 }
00351
00352 inline void __cutilGetLastError( const char *errorMessage, const char *file, const int line )
00353 {
00354 cudaError_t err = cudaGetLastError();
00355 if( cudaSuccess != err) {
00356 FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
00357 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00358 exit(-1);
00359 }
00360 }
00361
00362 inline void __cutilGetLastErrorAndSync( const char *errorMessage, const char *file, const int line )
00363 {
00364 cudaError_t err = cudaGetLastError();
00365 if( cudaSuccess != err) {
00366 FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
00367 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00368 exit(-1);
00369 }
00370
00371 err = cutilDeviceSynchronize();
00372 if( cudaSuccess != err) {
00373 FPRINTF((stderr, "%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : (%d) %s.\n",
00374 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00375 exit(-1);
00376 }
00377 }
00378
00379 inline void __cutilSafeMalloc( void *pointer, const char *file, const int line )
00380 {
00381 if( !(pointer)) {
00382 FPRINTF((stderr, "%s(%i) : cutilSafeMalloc host malloc failure\n",
00383 file, line));
00384 exit(-1);
00385 }
00386 }
00387
00388 inline int cutilDeviceInit(int ARGC, char **ARGV)
00389 {
00390 int deviceCount;
00391 cutilSafeCallNoSync(cudaGetDeviceCount(&deviceCount));
00392 if (deviceCount == 0) {
00393 FPRINTF((stderr, "CUTIL CUDA error: no devices supporting CUDA.\n"));
00394 exit(-1);
00395 }
00396 int dev = 0;
00397 cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);
00398 if (dev < 0)
00399 dev = 0;
00400 if (dev > deviceCount-1) {
00401 fprintf(stderr, "\n");
00402 fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
00403 fprintf(stderr, ">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
00404 fprintf(stderr, "\n");
00405 return -dev;
00406 }
00407 cudaDeviceProp deviceProp;
00408 cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev));
00409 if (deviceProp.major < 1) {
00410 FPRINTF((stderr, "cutil error: GPU device does not support CUDA.\n"));
00411 exit(-1); \
00412 }
00413 printf("> Using CUDA device [%d]: %s\n", dev, deviceProp.name);
00414 cutilSafeCall(cudaSetDevice(dev));
00415
00416 return dev;
00417 }
00418
00419
00420 inline int cutilChooseCudaDevice(int argc, char **argv)
00421 {
00422 cudaDeviceProp deviceProp;
00423 int devID = 0;
00424
00425 if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
00426 devID = cutilDeviceInit(argc, argv);
00427 if (devID < 0) {
00428 printf("exiting...\n");
00429 cutilExit(argc, argv);
00430 exit(0);
00431 }
00432 } else {
00433
00434 devID = cutGetMaxGflopsDeviceId();
00435 cutilSafeCallNoSync( cudaSetDevice( devID ) );
00436 cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
00437 printf("> Using CUDA device [%d]: %s\n", devID, deviceProp.name);
00438 }
00439 return devID;
00440 }
00441
00443 inline void cutilCudaCheckCtxLost(const char *errorMessage, const char *file, const int line )
00444 {
00445 cudaError_t err = cudaGetLastError();
00446 if( cudaSuccess != err) {
00447 FPRINTF((stderr, "%s(%i) : CUDA error: %s : (%d) %s.\n",
00448 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00449 exit(-1);
00450 }
00451 err = cutilDeviceSynchronize();
00452 if( cudaSuccess != err) {
00453 FPRINTF((stderr, "%s(%i) : CUDA error: %s : (%d) %s.\n",
00454 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00455 exit(-1);
00456 }
00457 }
00458
00459 #ifndef STRCASECMP
00460 #ifdef _WIN32
00461 #define STRCASECMP _stricmp
00462 #else
00463 #define STRCASECMP strcasecmp
00464 #endif
00465 #endif
00466
00467 #ifndef STRNCASECMP
00468 #ifdef _WIN32
00469 #define STRNCASECMP _strnicmp
00470 #else
00471 #define STRNCASECMP strncasecmp
00472 #endif
00473 #endif
00474
00475 inline void __cutilQAFinish(int argc, char **argv, bool bStatus)
00476 {
00477 const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
00478
00479 bool bFlag = false;
00480 for (int i=1; i < argc; i++) {
00481 if (!STRCASECMP(argv[i], "-qatest") || !STRCASECMP(argv[i], "-noprompt")) {
00482 bFlag |= true;
00483 }
00484 }
00485
00486 if (bFlag) {
00487 printf("&&&& %s %s", sStatus[bStatus], argv[0]);
00488 for (int i=1; i < argc; i++) printf(" %s", argv[i]);
00489 } else {
00490 printf("[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
00491 }
00492 }
00493
00494
00495 inline bool cutilCudaCapabilities(int major_version, int minor_version)
00496 {
00497 cudaDeviceProp deviceProp;
00498 deviceProp.major = 0;
00499 deviceProp.minor = 0;
00500 int dev;
00501
00502 cutilSafeCall( cudaGetDevice(&dev) );
00503 cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev));
00504
00505 if((deviceProp.major > major_version) ||
00506 (deviceProp.major == major_version && deviceProp.minor >= minor_version))
00507 {
00508 printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
00509 return true;
00510 }
00511 else
00512 {
00513 printf("No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
00514 return false;
00515 }
00516 }
00517
00518 #endif // _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_