fupermod: Functional Performance Models of heterogeneous processors

File List
00001 /*
00002  * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
00003  *
00004  * Please refer to the NVIDIA end user license agreement (EULA) associated
00005  * with this source code for terms and conditions that govern your use of
00006  * this software. Any use, reproduction, disclosure, or distribution of
00007  * this software and related documentation outside the terms of the EULA
00008  * is strictly prohibited.
00009  *
00010  */
00011  
00012 #ifndef _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
00013 #define _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
00014 
00015 #ifdef _WIN32
00016 #ifdef _DEBUG // Do this only in debug mode...
00017 #  define WINDOWS_LEAN_AND_MEAN
00018 #  include <windows.h>
00019 #  include <stdlib.h>
00020 #  undef min
00021 #  undef max
00022 #endif
00023 #endif
00024 
00025 #include <stdio.h>
00026 #include <string.h>
00027 #include <stdlib.h>
00028 
00029 #include <cufft.h>
00030 #include <curand.h>
00031 
00032 // We define these calls here, so the user doesn't need to include __FILE__ and __LINE__
00033 // The advantage is the developers gets to use the inline function so they can debug
00034 #define cutilSafeCallNoSync(err)     __cudaSafeCallNoSync(err, __FILE__, __LINE__)
00035 #define cutilSafeCall(err)           __cudaSafeCall      (err, __FILE__, __LINE__)
00036 #define cutilSafeThreadSync()        __cudaSafeThreadSync(__FILE__, __LINE__)
00037 #define cufftSafeCall(err)           __cufftSafeCall     (err, __FILE__, __LINE__)
00038 #define curandSafeCall(err)          __curandSafeCall    (err, __FILE__, __LINE__)
00039 #define cutilCheckError(err)         __cutilCheckError   (err, __FILE__, __LINE__)
00040 #define cutilCheckMsg(msg)           __cutilGetLastError (msg, __FILE__, __LINE__)
00041 #define cutilCheckMsgAndSync(msg)    __cutilGetLastErrorAndSync (msg, __FILE__, __LINE__)
00042 #define cutilSafeMalloc(mallocCall)  __cutilSafeMalloc   ((mallocCall), __FILE__, __LINE__)
00043 #define cutilCondition(val)          __cutilCondition    (val, __FILE__, __LINE__)
00044 #define cutilExit(argc, argv)        __cutilExit         (argc, argv)
00045 
00046 inline cudaError cutilDeviceSynchronize()
00047 {
00048 #if CUDART_VERSION >= 4000
00049     return cudaDeviceSynchronize();
00050 #else
00051     return cudaThreadSynchronize();
00052 #endif
00053 }
00054 
00055 inline cudaError cutilDeviceReset()
00056 {
00057 #if CUDART_VERSION >= 4000
00058     return cudaDeviceReset();
00059 #else
00060     return cudaThreadExit();
00061 #endif
00062 }
00063 
00064 inline void __cutilCondition(int val, char *file, int line) 
00065 {
00066     if( CUTFalse == cutCheckCondition( val, file, line ) ) {
00067         exit(EXIT_FAILURE);
00068     }
00069 }
00070 
00071 inline void __cutilExit(int argc, char **argv)
00072 {     
00073     if (!cutCheckCmdLineFlag(argc, (const char**)argv, "noprompt")) {
00074         printf("\nPress ENTER to exit...\n");
00075         fflush( stdout);
00076         fflush( stderr);
00077         getchar();
00078     }
00079     exit(EXIT_SUCCESS);
00080 }
00081 
00082 #define MIN(a,b) ((a < b) ? a : b)
00083 #define MAX(a,b) ((a > b) ? a : b)
00084 
00085 // Beginning of GPU Architecture definitions
00086 inline int _ConvertSMVer2Cores_local(int major, int minor)
00087 {
00088     // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
00089     typedef struct {
00090         int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
00091         int Cores;
00092     } sSMtoCores;
00093 
00094     sSMtoCores nGpuArchCoresPerSM[] = 
00095     { { 0x10,  8 },
00096       { 0x11,  8 },
00097       { 0x12,  8 },
00098       { 0x13,  8 },
00099       { 0x20, 32 },
00100       { 0x21, 48 },
00101       {   -1, -1 } 
00102     };
00103 
00104     int index = 0;
00105     while (nGpuArchCoresPerSM[index].SM != -1) {
00106         if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
00107             return nGpuArchCoresPerSM[index].Cores;
00108         }
00109         index++;
00110     }
00111     printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
00112     return -1;
00113 }
00114 // end of GPU Architecture definitions
00115 
00116 // This function returns the best GPU (with maximum GFLOPS)
00117 inline int cutGetMaxGflopsDeviceId()
00118 {
00119     int current_device   = 0, sm_per_multiproc = 0;
00120     int max_compute_perf = 0, max_perf_device  = 0;
00121     int device_count     = 0, best_SM_arch     = 0;
00122     cudaDeviceProp deviceProp;
00123 
00124     cudaGetDeviceCount( &device_count );
00125     // Find the best major SM Architecture GPU device
00126     while ( current_device < device_count ) {
00127         cudaGetDeviceProperties( &deviceProp, current_device );
00128         if (deviceProp.major > 0 && deviceProp.major < 9999) {
00129             best_SM_arch = MAX(best_SM_arch, deviceProp.major);
00130         }
00131         current_device++;
00132     }
00133 
00134     // Find the best CUDA capable GPU device
00135     current_device = 0;
00136     while( current_device < device_count ) {
00137         cudaGetDeviceProperties( &deviceProp, current_device );
00138         if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
00139             sm_per_multiproc = 1;
00140         } else {
00141             sm_per_multiproc = _ConvertSMVer2Cores_local(deviceProp.major, deviceProp.minor);
00142         }
00143 
00144         int compute_perf  = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
00145         if( compute_perf  > max_compute_perf ) {
00146             // If we find GPU with SM major > 2, search only these
00147             if ( best_SM_arch > 2 ) {
00148                 // If our device==dest_SM_arch, choose this, or else pass
00149                 if (deviceProp.major == best_SM_arch) { 
00150                     max_compute_perf  = compute_perf;
00151                     max_perf_device   = current_device;
00152                 }
00153             } else {
00154                 max_compute_perf  = compute_perf;
00155                 max_perf_device   = current_device;
00156             }
00157         }
00158         ++current_device;
00159     }
00160     return max_perf_device;
00161 }
00162 
00163 // This function returns the best GPU (with maximum GFLOPS)
00164 inline int cutGetMaxGflopsGraphicsDeviceId()
00165 {
00166     int current_device   = 0, sm_per_multiproc = 0;
00167     int max_compute_perf = 0, max_perf_device  = 0;
00168     int device_count     = 0, best_SM_arch     = 0;
00169     int bTCC = 0;
00170     cudaDeviceProp deviceProp;
00171 
00172     cudaGetDeviceCount( &device_count );
00173     // Find the best major SM Architecture GPU device that is graphics capable
00174     while ( current_device < device_count ) {
00175         cudaGetDeviceProperties( &deviceProp, current_device );
00176 
00177 #if CUDA_VERSION >= 3020
00178         if (deviceProp.tccDriver) bTCC = 1;
00179 #else
00180         // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
00181         if (deviceProp.name[0] == 'T') bTCC = 1;
00182 #endif
00183 
00184         if (!bTCC) {
00185             if (deviceProp.major > 0 && deviceProp.major < 9999) {
00186                 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
00187             }
00188         }
00189         current_device++;
00190     }
00191 
00192     // Find the best CUDA capable GPU device
00193     current_device = 0;
00194     while( current_device < device_count ) {
00195         cudaGetDeviceProperties( &deviceProp, current_device );
00196         if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
00197             sm_per_multiproc = 1;
00198         } else {
00199             sm_per_multiproc = _ConvertSMVer2Cores_local(deviceProp.major, deviceProp.minor);
00200         }
00201 
00202 #if CUDA_VERSION >= 3020
00203         if (deviceProp.tccDriver) bTCC = 1;
00204 #else
00205         // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
00206         if (deviceProp.name[0] == 'T') bTCC = 1;
00207 #endif
00208 
00209         if (!bTCC) // Is this GPU running the TCC driver?  If so we pass on this
00210         {
00211             int compute_perf  = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
00212             if( compute_perf  > max_compute_perf ) {
00213                 // If we find GPU with SM major > 2, search only these
00214                 if ( best_SM_arch > 2 ) {
00215                     // If our device==dest_SM_arch, choose this, or else pass
00216                     if (deviceProp.major == best_SM_arch) { 
00217                         max_compute_perf  = compute_perf;
00218                         max_perf_device   = current_device;
00219                     }
00220                 } else {
00221                     max_compute_perf  = compute_perf;
00222                     max_perf_device   = current_device;
00223                 }
00224             }
00225         }
00226         ++current_device;
00227     }
00228     return max_perf_device;
00229 }
00230 
00231 // Give a little more for Windows : the console window often disapears before we can read the message
00232 #ifdef _WIN32
00233 # if 1//ndef UNICODE
00234 #  ifdef _DEBUG // Do this only in debug mode...
00235     inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
00236     {
00237         size_t fmt2_sz  = 2048;
00238         char *fmt2      = (char*)malloc(fmt2_sz);
00239         va_list  vlist;
00240         va_start(vlist, fmt);
00241         while((_vsnprintf(fmt2, fmt2_sz, fmt, vlist)) < 0) // means there wasn't anough room
00242         {
00243             fmt2_sz *= 2;
00244             if(fmt2) free(fmt2);
00245             fmt2 = (char*)malloc(fmt2_sz);
00246         }
00247         OutputDebugStringA(fmt2);
00248         fprintf(file, fmt2);
00249         free(fmt2);
00250     }
00251 #   define FPRINTF(a) VSPrintf a
00252 #  else //debug
00253 #   define FPRINTF(a) fprintf a
00254 // For other than Win32
00255 #  endif //debug
00256 # else //unicode
00257 // Unicode case... let's give-up for now and keep basic printf
00258 #   define FPRINTF(a) fprintf a
00259 # endif //unicode
00260 #else //win32
00261 #   define FPRINTF(a) fprintf a
00262 #endif //win32
00263 
00264 // NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line
00265 // when the user double clicks on the error line in the Output pane. Like any compile error.
00266 
00267 inline void __cudaSafeCallNoSync( cudaError err, const char *file, const int line )
00268 {
00269     if( cudaSuccess != err) {
00270         FPRINTF((stderr, "%s(%i) : cudaSafeCallNoSync() Runtime API error %d : %s.\n",
00271                 file, line, (int)err, cudaGetErrorString( err ) ));
00272         exit(-1);
00273     }
00274 }
00275 
00276 inline void __cudaSafeCall( cudaError err, const char *file, const int line )
00277 {
00278     if( cudaSuccess != err) {
00279         FPRINTF((stderr, "%s(%i) : cudaSafeCall() Runtime API error %d: %s.\n",
00280                 file, line, (int)err, cudaGetErrorString( err ) ));
00281         exit(-1);
00282     }
00283 }
00284 
00285 inline void __cudaSafeThreadSync( const char *file, const int line )
00286 {
00287     cudaError err = cutilDeviceSynchronize();
00288     if ( cudaSuccess != err) {
00289         FPRINTF((stderr, "%s(%i) : cudaDeviceSynchronize() Runtime API error %d: %s.\n",
00290                 file, line, (int)err, cudaGetErrorString( err ) ));
00291         exit(-1);
00292     }
00293 }
00294 
00295 inline void __cufftSafeCall( cufftResult err, const char *file, const int line )
00296 {
00297     if( CUFFT_SUCCESS != err) {
00298         FPRINTF((stderr, "%s(%i) : cufftSafeCall() CUFFT error %d: ",
00299                 file, line, (int)err));
00300         switch (err) {
00301             case CUFFT_INVALID_PLAN:   FPRINTF((stderr, "CUFFT_INVALID_PLAN\n"));
00302             case CUFFT_ALLOC_FAILED:   FPRINTF((stderr, "CUFFT_ALLOC_FAILED\n"));
00303             case CUFFT_INVALID_TYPE:   FPRINTF((stderr, "CUFFT_INVALID_TYPE\n"));
00304             case CUFFT_INVALID_VALUE:  FPRINTF((stderr, "CUFFT_INVALID_VALUE\n"));
00305             case CUFFT_INTERNAL_ERROR: FPRINTF((stderr, "CUFFT_INTERNAL_ERROR\n"));
00306             case CUFFT_EXEC_FAILED:    FPRINTF((stderr, "CUFFT_EXEC_FAILED\n"));
00307             case CUFFT_SETUP_FAILED:   FPRINTF((stderr, "CUFFT_SETUP_FAILED\n"));
00308             case CUFFT_INVALID_SIZE:   FPRINTF((stderr, "CUFFT_INVALID_SIZE\n"));
00309             case CUFFT_UNALIGNED_DATA: FPRINTF((stderr, "CUFFT_UNALIGNED_DATA\n"));
00310             default: FPRINTF((stderr, "CUFFT Unknown error code\n"));
00311         }
00312         exit(-1);
00313     }
00314 }
00315 
00316 inline void __curandSafeCall( curandStatus_t err, const char *file, const int line )
00317     {
00318     if( CURAND_STATUS_SUCCESS != err) {
00319         FPRINTF((stderr, "%s(%i) : curandSafeCall() CURAND error %d: ",
00320                 file, line, (int)err));
00321         switch (err) {
00322             case CURAND_STATUS_VERSION_MISMATCH:    FPRINTF((stderr, "CURAND_STATUS_VERSION_MISMATCH"));
00323             case CURAND_STATUS_NOT_INITIALIZED:     FPRINTF((stderr, "CURAND_STATUS_NOT_INITIALIZED"));
00324             case CURAND_STATUS_ALLOCATION_FAILED:   FPRINTF((stderr, "CURAND_STATUS_ALLOCATION_FAILED"));
00325             case CURAND_STATUS_TYPE_ERROR:          FPRINTF((stderr, "CURAND_STATUS_TYPE_ERROR"));
00326             case CURAND_STATUS_OUT_OF_RANGE:        FPRINTF((stderr, "CURAND_STATUS_OUT_OF_RANGE")); 
00327             case CURAND_STATUS_LENGTH_NOT_MULTIPLE: FPRINTF((stderr, "CURAND_STATUS_LENGTH_NOT_MULTIPLE"));
00328             case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: 
00329                                                     FPRINTF((stderr, "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"));
00330             case CURAND_STATUS_LAUNCH_FAILURE:      FPRINTF((stderr, "CURAND_STATUS_LAUNCH_FAILURE")); 
00331             case CURAND_STATUS_PREEXISTING_FAILURE: FPRINTF((stderr, "CURAND_STATUS_PREEXISTING_FAILURE"));
00332             case CURAND_STATUS_INITIALIZATION_FAILED:     
00333                                                     FPRINTF((stderr, "CURAND_STATUS_INITIALIZATION_FAILED"));
00334             case CURAND_STATUS_ARCH_MISMATCH:       FPRINTF((stderr, "CURAND_STATUS_ARCH_MISMATCH"));
00335             case CURAND_STATUS_INTERNAL_ERROR:      FPRINTF((stderr, "CURAND_STATUS_INTERNAL_ERROR"));
00336             default: FPRINTF((stderr, "CURAND Unknown error code\n"));
00337         }
00338         exit(-1);
00339     }
00340 }
00341 
00342 
00343 inline void __cutilCheckError( CUTBoolean err, const char *file, const int line )
00344 {
00345     if( CUTTrue != err) {
00346         FPRINTF((stderr, "%s(%i) : CUTIL CUDA error.\n",
00347                 file, line));
00348         exit(-1);
00349     }
00350 }
00351 
00352 inline void __cutilGetLastError( const char *errorMessage, const char *file, const int line )
00353 {
00354     cudaError_t err = cudaGetLastError();
00355     if( cudaSuccess != err) {
00356         FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
00357                 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00358         exit(-1);
00359     }
00360 }
00361 
00362 inline void __cutilGetLastErrorAndSync( const char *errorMessage, const char *file, const int line )
00363 {
00364     cudaError_t err = cudaGetLastError();
00365     if( cudaSuccess != err) {
00366         FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
00367                 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00368         exit(-1);
00369     }
00370 
00371     err = cutilDeviceSynchronize();
00372     if( cudaSuccess != err) {
00373         FPRINTF((stderr, "%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : (%d) %s.\n",
00374                 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00375         exit(-1);
00376     }
00377 }
00378 
00379 inline void __cutilSafeMalloc( void *pointer, const char *file, const int line )
00380 {
00381     if( !(pointer)) {
00382         FPRINTF((stderr, "%s(%i) : cutilSafeMalloc host malloc failure\n",
00383                 file, line));
00384         exit(-1);
00385     }
00386 }
00387 
00388 inline int cutilDeviceInit(int ARGC, char **ARGV)
00389 {
00390     int deviceCount;
00391     cutilSafeCallNoSync(cudaGetDeviceCount(&deviceCount));
00392     if (deviceCount == 0) {
00393         FPRINTF((stderr, "CUTIL CUDA error: no devices supporting CUDA.\n"));
00394         exit(-1);
00395     }
00396     int dev = 0;
00397     cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);
00398     if (dev < 0) 
00399         dev = 0;
00400     if (dev > deviceCount-1) {
00401         fprintf(stderr, "\n");
00402         fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
00403         fprintf(stderr, ">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
00404         fprintf(stderr, "\n");
00405         return -dev;
00406     }  
00407     cudaDeviceProp deviceProp;
00408     cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev));
00409     if (deviceProp.major < 1) {
00410         FPRINTF((stderr, "cutil error: GPU device does not support CUDA.\n"));
00411         exit(-1);                                                  \
00412     }
00413     printf("> Using CUDA device [%d]: %s\n", dev, deviceProp.name);
00414     cutilSafeCall(cudaSetDevice(dev));
00415 
00416     return dev;
00417 }
00418 
00419 // General initialization call to pick the best CUDA Device
00420 inline int cutilChooseCudaDevice(int argc, char **argv)
00421 {
00422     cudaDeviceProp deviceProp;
00423     int devID = 0;
00424     // If the command-line has a device number specified, use it
00425     if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
00426         devID = cutilDeviceInit(argc, argv);
00427         if (devID < 0) {
00428            printf("exiting...\n");
00429            cutilExit(argc, argv);
00430            exit(0);
00431         }
00432     } else {
00433         // Otherwise pick the device with highest Gflops/s
00434         devID = cutGetMaxGflopsDeviceId();
00435         cutilSafeCallNoSync( cudaSetDevice( devID ) );
00436         cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
00437         printf("> Using CUDA device [%d]: %s\n", devID, deviceProp.name);
00438     }
00439     return devID;
00440 }
00441 
00443 inline void cutilCudaCheckCtxLost(const char *errorMessage, const char *file, const int line ) 
00444 {
00445     cudaError_t err = cudaGetLastError();
00446     if( cudaSuccess != err) {
00447         FPRINTF((stderr, "%s(%i) : CUDA error: %s : (%d) %s.\n",
00448                  file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00449         exit(-1);
00450     }
00451     err = cutilDeviceSynchronize();
00452     if( cudaSuccess != err) {
00453         FPRINTF((stderr, "%s(%i) : CUDA error: %s : (%d) %s.\n",
00454         file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00455         exit(-1);
00456     }
00457 }
00458 
00459 #ifndef STRCASECMP
00460 #ifdef _WIN32
00461 #define STRCASECMP  _stricmp
00462 #else
00463 #define STRCASECMP  strcasecmp
00464 #endif
00465 #endif
00466 
00467 #ifndef STRNCASECMP
00468 #ifdef _WIN32
00469 #define STRNCASECMP _strnicmp
00470 #else
00471 #define STRNCASECMP strncasecmp
00472 #endif
00473 #endif
00474 
00475 inline void __cutilQAFinish(int argc, char **argv, bool bStatus)
00476 {
00477     const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
00478 
00479     bool bFlag = false;
00480     for (int i=1; i < argc; i++) {
00481         if (!STRCASECMP(argv[i], "-qatest") || !STRCASECMP(argv[i], "-noprompt")) {
00482             bFlag |= true;
00483         }
00484     }
00485 
00486     if (bFlag) {
00487         printf("&&&& %s %s", sStatus[bStatus], argv[0]);
00488         for (int i=1; i < argc; i++) printf(" %s", argv[i]);
00489     } else {
00490         printf("[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
00491     }
00492 }
00493 
00494 // General check for CUDA GPU SM Capabilities
00495 inline bool cutilCudaCapabilities(int major_version, int minor_version)
00496 {
00497     cudaDeviceProp deviceProp;
00498     deviceProp.major = 0;
00499     deviceProp.minor = 0;
00500     int dev;
00501 
00502     cutilSafeCall( cudaGetDevice(&dev) );
00503     cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev));
00504 
00505     if((deviceProp.major > major_version) ||
00506        (deviceProp.major == major_version && deviceProp.minor >= minor_version))
00507     {
00508         printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
00509         return true;
00510     }
00511     else
00512     {
00513         printf("No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
00514         return false;
00515     }
00516 }
00517 
00518 #endif // _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
fupermod: Functional Performance Models of heterogeneous processors

cblas_wrappers/cuda-cc-sdk/4.1/cutil_inline_runtime.h

Projects

Hosted by UCD Open Source Lab