00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
00013 #define _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
00014
00015 #ifdef _WIN32
00016 #ifdef _DEBUG // Do this only in debug mode...
00017 # define WINDOWS_LEAN_AND_MEAN
00018 # include <windows.h>
00019 # include <stdlib.h>
00020 # undef min
00021 # undef max
00022 #endif
00023 #endif
00024
00025 #include <stdio.h>
00026 #include <string.h>
00027 #include <stdlib.h>
00028
00029 #include <cufft.h>
00030
00031
00032
00033 #define cutilSafeCallNoSync(err) __cudaSafeCallNoSync(err, __FILE__, __LINE__)
00034 #define cutilSafeCall(err) __cudaSafeCall (err, __FILE__, __LINE__)
00035 #define cutilSafeThreadSync() __cudaSafeThreadSync(__FILE__, __LINE__)
00036 #define cufftSafeCall(err) __cufftSafeCall (err, __FILE__, __LINE__)
00037 #define cutilCheckError(err) __cutilCheckError (err, __FILE__, __LINE__)
00038 #define cutilCheckMsg(msg) __cutilGetLastError (msg, __FILE__, __LINE__)
00039 #define cutilCheckMsgAndSync(msg) __cutilGetLastErrorAndSync (msg, __FILE__, __LINE__)
00040 #define cutilSafeMalloc(mallocCall) __cutilSafeMalloc ((mallocCall), __FILE__, __LINE__)
00041 #define cutilCondition(val) __cutilCondition (val, __FILE__, __LINE__)
00042 #define cutilExit(argc, argv) __cutilExit (argc, argv)
00043
00044 inline cudaError cutilDeviceSynchronize()
00045 {
00046 #if CUDART_VERSION >= 4000
00047 return cudaDeviceSynchronize();
00048 #else
00049 return cudaThreadSynchronize();
00050 #endif
00051 }
00052
00053 inline cudaError cutilDeviceReset()
00054 {
00055 #if CUDART_VERSION >= 4000
00056 return cudaDeviceReset();
00057 #else
00058 return cudaThreadExit();
00059 #endif
00060 }
00061
00062 inline void __cutilCondition(int val, char *file, int line)
00063 {
00064 if( CUTFalse == cutCheckCondition( val, file, line ) ) {
00065 exit(EXIT_FAILURE);
00066 }
00067 }
00068
00069 inline void __cutilExit(int argc, char **argv)
00070 {
00071 if (!cutCheckCmdLineFlag(argc, (const char**)argv, "noprompt")) {
00072 printf("\nPress ENTER to exit...\n");
00073 fflush( stdout);
00074 fflush( stderr);
00075 getchar();
00076 }
00077 exit(EXIT_SUCCESS);
00078 }
00079
00080 #define MIN(a,b) ((a < b) ? a : b)
00081 #define MAX(a,b) ((a > b) ? a : b)
00082
00083
00084 inline int _ConvertSMVer2Cores(int major, int minor)
00085 {
00086
00087 typedef struct {
00088 int SM;
00089 int Cores;
00090 } sSMtoCores;
00091
00092 sSMtoCores nGpuArchCoresPerSM[] =
00093 { { 0x10, 8 },
00094 { 0x11, 8 },
00095 { 0x12, 8 },
00096 { 0x13, 8 },
00097 { 0x20, 32 },
00098 { 0x21, 48 },
00099 { -1, -1 }
00100 };
00101
00102 int index = 0;
00103 while (nGpuArchCoresPerSM[index].SM != -1) {
00104 if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
00105 return nGpuArchCoresPerSM[index].Cores;
00106 }
00107 index++;
00108 }
00109 printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
00110 return -1;
00111 }
00112
00113
00114
00115 inline int cutGetMaxGflopsDeviceId()
00116 {
00117 int current_device = 0, sm_per_multiproc = 0;
00118 int max_compute_perf = 0, max_perf_device = 0;
00119 int device_count = 0, best_SM_arch = 0;
00120 cudaDeviceProp deviceProp;
00121
00122 cudaGetDeviceCount( &device_count );
00123
00124 while ( current_device < device_count ) {
00125 cudaGetDeviceProperties( &deviceProp, current_device );
00126 if (deviceProp.major > 0 && deviceProp.major < 9999) {
00127 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
00128 }
00129 current_device++;
00130 }
00131
00132
00133 current_device = 0;
00134 while( current_device < device_count ) {
00135 cudaGetDeviceProperties( &deviceProp, current_device );
00136 if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
00137 sm_per_multiproc = 1;
00138 } else {
00139 sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
00140 }
00141
00142 int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
00143 if( compute_perf > max_compute_perf ) {
00144
00145 if ( best_SM_arch > 2 ) {
00146
00147 if (deviceProp.major == best_SM_arch) {
00148 max_compute_perf = compute_perf;
00149 max_perf_device = current_device;
00150 }
00151 } else {
00152 max_compute_perf = compute_perf;
00153 max_perf_device = current_device;
00154 }
00155 }
00156 ++current_device;
00157 }
00158 return max_perf_device;
00159 }
00160
00161
00162 inline int cutGetMaxGflopsGraphicsDeviceId()
00163 {
00164 int current_device = 0, sm_per_multiproc = 0;
00165 int max_compute_perf = 0, max_perf_device = 0;
00166 int device_count = 0, best_SM_arch = 0;
00167 int bTCC = 0;
00168 cudaDeviceProp deviceProp;
00169
00170 cudaGetDeviceCount( &device_count );
00171
00172 while ( current_device < device_count ) {
00173 cudaGetDeviceProperties( &deviceProp, current_device );
00174
00175 #if CUDA_VERSION >= 3020
00176 if (deviceProp.tccDriver) bTCC = 1;
00177 #else
00178
00179 if (deviceProp.name[0] == 'T') bTCC = 1;
00180 #endif
00181
00182 if (!bTCC) {
00183 if (deviceProp.major > 0 && deviceProp.major < 9999) {
00184 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
00185 }
00186 }
00187 current_device++;
00188 }
00189
00190
00191 current_device = 0;
00192 while( current_device < device_count ) {
00193 cudaGetDeviceProperties( &deviceProp, current_device );
00194 if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
00195 sm_per_multiproc = 1;
00196 } else {
00197 sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
00198 }
00199
00200 #if CUDA_VERSION >= 3020
00201 if (deviceProp.tccDriver) bTCC = 1;
00202 #else
00203
00204 if (deviceProp.name[0] == 'T') bTCC = 1;
00205 #endif
00206
00207 if (!bTCC)
00208 {
00209 int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
00210 if( compute_perf > max_compute_perf ) {
00211
00212 if ( best_SM_arch > 2 ) {
00213
00214 if (deviceProp.major == best_SM_arch) {
00215 max_compute_perf = compute_perf;
00216 max_perf_device = current_device;
00217 }
00218 } else {
00219 max_compute_perf = compute_perf;
00220 max_perf_device = current_device;
00221 }
00222 }
00223 }
00224 ++current_device;
00225 }
00226 return max_perf_device;
00227 }
00228
00229
00230 #ifdef _WIN32
00231 # if 1//ndef UNICODE
00232 # ifdef _DEBUG // Do this only in debug mode...
00233 inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
00234 {
00235 size_t fmt2_sz = 2048;
00236 char *fmt2 = (char*)malloc(fmt2_sz);
00237 va_list vlist;
00238 va_start(vlist, fmt);
00239 while((_vsnprintf(fmt2, fmt2_sz, fmt, vlist)) < 0)
00240 {
00241 fmt2_sz *= 2;
00242 if(fmt2) free(fmt2);
00243 fmt2 = (char*)malloc(fmt2_sz);
00244 }
00245 OutputDebugStringA(fmt2);
00246 fprintf(file, fmt2);
00247 free(fmt2);
00248 }
00249 # define FPRINTF(a) VSPrintf a
00250 # else //debug
00251 # define FPRINTF(a) fprintf a
00252
00253 # endif //debug
00254 # else //unicode
00255
00256 # define FPRINTF(a) fprintf a
00257 # endif //unicode
00258 #else //win32
00259 # define FPRINTF(a) fprintf a
00260 #endif //win32
00261
00262
00263
00264
00265 inline void __cudaSafeCallNoSync( cudaError err, const char *file, const int line )
00266 {
00267 if( cudaSuccess != err) {
00268 FPRINTF((stderr, "%s(%i) : cudaSafeCallNoSync() Runtime API error %d : %s.\n",
00269 file, line, (int)err, cudaGetErrorString( err ) ));
00270 exit(-1);
00271 }
00272 }
00273
00274 inline void __cudaSafeCall( cudaError err, const char *file, const int line )
00275 {
00276 if( cudaSuccess != err) {
00277 FPRINTF((stderr, "%s(%i) : cudaSafeCall() Runtime API error %d: %s.\n",
00278 file, line, (int)err, cudaGetErrorString( err ) ));
00279 exit(-1);
00280 }
00281 }
00282
00283 inline void __cudaSafeThreadSync( const char *file, const int line )
00284 {
00285 cudaError err = cutilDeviceSynchronize();
00286 if ( cudaSuccess != err) {
00287 FPRINTF((stderr, "%s(%i) : cudaDeviceSynchronize() Runtime API error %d: %s.\n",
00288 file, line, (int)err, cudaGetErrorString( err ) ));
00289 exit(-1);
00290 }
00291 }
00292
00293 inline void __cufftSafeCall( cufftResult err, const char *file, const int line )
00294 {
00295 if( CUFFT_SUCCESS != err) {
00296 FPRINTF((stderr, "%s(%i) : cufftSafeCall() CUFFT error %d: ",
00297 file, line, (int)err));
00298 switch (err) {
00299 case CUFFT_INVALID_PLAN: FPRINTF((stderr, "CUFFT_INVALID_PLAN\n"));
00300 case CUFFT_ALLOC_FAILED: FPRINTF((stderr, "CUFFT_ALLOC_FAILED\n"));
00301 case CUFFT_INVALID_TYPE: FPRINTF((stderr, "CUFFT_INVALID_TYPE\n"));
00302 case CUFFT_INVALID_VALUE: FPRINTF((stderr, "CUFFT_INVALID_VALUE\n"));
00303 case CUFFT_INTERNAL_ERROR: FPRINTF((stderr, "CUFFT_INTERNAL_ERROR\n"));
00304 case CUFFT_EXEC_FAILED: FPRINTF((stderr, "CUFFT_EXEC_FAILED\n"));
00305 case CUFFT_SETUP_FAILED: FPRINTF((stderr, "CUFFT_SETUP_FAILED\n"));
00306 case CUFFT_INVALID_SIZE: FPRINTF((stderr, "CUFFT_INVALID_SIZE\n"));
00307 case CUFFT_UNALIGNED_DATA: FPRINTF((stderr, "CUFFT_UNALIGNED_DATA\n"));
00308 default: FPRINTF((stderr, "CUFFT Unknown error code\n"));
00309 }
00310 exit(-1);
00311 }
00312 }
00313
00314 inline void __cutilCheckError( CUTBoolean err, const char *file, const int line )
00315 {
00316 if( CUTTrue != err) {
00317 FPRINTF((stderr, "%s(%i) : CUTIL CUDA error.\n",
00318 file, line));
00319 exit(-1);
00320 }
00321 }
00322
00323 inline void __cutilGetLastError( const char *errorMessage, const char *file, const int line )
00324 {
00325 cudaError_t err = cudaGetLastError();
00326 if( cudaSuccess != err) {
00327 FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
00328 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00329 exit(-1);
00330 }
00331 }
00332
00333 inline void __cutilGetLastErrorAndSync( const char *errorMessage, const char *file, const int line )
00334 {
00335 cudaError_t err = cudaGetLastError();
00336 if( cudaSuccess != err) {
00337 FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
00338 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00339 exit(-1);
00340 }
00341
00342 err = cutilDeviceSynchronize();
00343 if( cudaSuccess != err) {
00344 FPRINTF((stderr, "%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : (%d) %s.\n",
00345 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00346 exit(-1);
00347 }
00348 }
00349
00350 inline void __cutilSafeMalloc( void *pointer, const char *file, const int line )
00351 {
00352 if( !(pointer)) {
00353 FPRINTF((stderr, "%s(%i) : cutilSafeMalloc host malloc failure\n",
00354 file, line));
00355 exit(-1);
00356 }
00357 }
00358
00359 #if __DEVICE_EMULATION__
00360 inline int cutilDeviceInit(int ARGC, char **ARGV) { }
00361 inline int cutilChooseCudaDevice(int ARGC, char **ARGV) { }
00362 #else
00363 inline int cutilDeviceInit(int ARGC, char **ARGV)
00364 {
00365 int deviceCount;
00366 cutilSafeCallNoSync(cudaGetDeviceCount(&deviceCount));
00367 if (deviceCount == 0) {
00368 FPRINTF((stderr, "CUTIL CUDA error: no devices supporting CUDA.\n"));
00369 exit(-1);
00370 }
00371 int dev = 0;
00372 cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);
00373 if (dev < 0)
00374 dev = 0;
00375 if (dev > deviceCount-1) {
00376 fprintf(stderr, "\n");
00377 fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
00378 fprintf(stderr, ">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
00379 fprintf(stderr, "\n");
00380 return -dev;
00381 }
00382 cudaDeviceProp deviceProp;
00383 cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev));
00384 if (deviceProp.major < 1) {
00385 FPRINTF((stderr, "cutil error: GPU device does not support CUDA.\n"));
00386 exit(-1); \
00387 }
00388 printf("> Using CUDA device [%d]: %s\n", dev, deviceProp.name);
00389 cutilSafeCall(cudaSetDevice(dev));
00390
00391 return dev;
00392 }
00393
00394
00395 inline int cutilChooseCudaDevice(int argc, char **argv)
00396 {
00397 cudaDeviceProp deviceProp;
00398 int devID = 0;
00399
00400 if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
00401 devID = cutilDeviceInit(argc, argv);
00402 if (devID < 0) {
00403 printf("exiting...\n");
00404 cutilExit(argc, argv);
00405 exit(0);
00406 }
00407 } else {
00408
00409 devID = cutGetMaxGflopsDeviceId();
00410 cutilSafeCallNoSync( cudaSetDevice( devID ) );
00411 cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
00412 printf("> Using CUDA device [%d]: %s\n", devID, deviceProp.name);
00413 }
00414 return devID;
00415 }
00416 #endif
00417
00418
00420 inline void cutilCudaCheckCtxLost(const char *errorMessage, const char *file, const int line )
00421 {
00422 cudaError_t err = cudaGetLastError();
00423 if( cudaSuccess != err) {
00424 FPRINTF((stderr, "%s(%i) : CUDA error: %s : (%d) %s.\n",
00425 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00426 exit(-1);
00427 }
00428 err = cutilDeviceSynchronize();
00429 if( cudaSuccess != err) {
00430 FPRINTF((stderr, "%s(%i) : CUDA error: %s : (%d) %s.\n",
00431 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00432 exit(-1);
00433 }
00434 }
00435
00436 #ifndef STRCASECMP
00437 #ifdef _WIN32
00438 #define STRCASECMP _stricmp
00439 #else
00440 #define STRCASECMP strcasecmp
00441 #endif
00442 #endif
00443
00444 #ifndef STRNCASECMP
00445 #ifdef _WIN32
00446 #define STRNCASECMP _strnicmp
00447 #else
00448 #define STRNCASECMP strncasecmp
00449 #endif
00450 #endif
00451
00452 inline void __cutilQAFinish(int argc, char **argv, bool bStatus)
00453 {
00454 const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
00455
00456 bool bFlag = false;
00457 for (int i=1; i < argc; i++) {
00458 if (!STRCASECMP(argv[i], "-qatest") || !STRCASECMP(argv[i], "-noprompt")) {
00459 bFlag |= true;
00460 }
00461 }
00462
00463 if (bFlag) {
00464 printf("&&&& %s %s", sStatus[bStatus], argv[0]);
00465 for (int i=1; i < argc; i++) printf(" %s", argv[i]);
00466 } else {
00467 printf("[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
00468 }
00469 }
00470
00471
00472 inline bool cutilCudaCapabilities(int major_version, int minor_version, int argc, char **argv)
00473 {
00474 cudaDeviceProp deviceProp;
00475 deviceProp.major = 0;
00476 deviceProp.minor = 0;
00477 int dev;
00478
00479 #ifdef __DEVICE_EMULATION__
00480 printf("> Compute Device Emulation Mode \n");
00481 #endif
00482
00483 cutilSafeCall( cudaGetDevice(&dev) );
00484 cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev));
00485
00486 if((deviceProp.major > major_version) ||
00487 (deviceProp.major == major_version && deviceProp.minor >= minor_version))
00488 {
00489 printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
00490 return true;
00491 }
00492 else
00493 {
00494 printf("There is no device supporting CUDA compute capability %d.%d.\n", major_version, minor_version);
00495
00496 return false;
00497 }
00498 }
00499
00500 #endif // _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_