fupermod: Functional Performance Models of heterogeneous processors

File List
00001 /*
00002  * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
00003  *
00004  * Please refer to the NVIDIA end user license agreement (EULA) associated
00005  * with this source code for terms and conditions that govern your use of
00006  * this software. Any use, reproduction, disclosure, or distribution of
00007  * this software and related documentation outside the terms of the EULA
00008  * is strictly prohibited.
00009  *
00010  */
00011  
00012 #ifndef _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
00013 #define _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
00014 
00015 #ifdef _WIN32
00016 #ifdef _DEBUG // Do this only in debug mode...
00017 #  define WINDOWS_LEAN_AND_MEAN
00018 #  include <windows.h>
00019 #  include <stdlib.h>
00020 #  undef min
00021 #  undef max
00022 #endif
00023 #endif
00024 
00025 #include <stdio.h>
00026 #include <string.h>
00027 #include <stdlib.h>
00028 
00029 #include <cufft.h>
00030 
00031 // We define these calls here, so the user doesn't need to include __FILE__ and __LINE__
00032 // The advantage is the developers gets to use the inline function so they can debug
00033 #define cutilSafeCallNoSync(err)     __cudaSafeCallNoSync(err, __FILE__, __LINE__)
00034 #define cutilSafeCall(err)           __cudaSafeCall      (err, __FILE__, __LINE__)
00035 #define cutilSafeThreadSync()        __cudaSafeThreadSync(__FILE__, __LINE__)
00036 #define cufftSafeCall(err)           __cufftSafeCall     (err, __FILE__, __LINE__)
00037 #define cutilCheckError(err)         __cutilCheckError   (err, __FILE__, __LINE__)
00038 #define cutilCheckMsg(msg)           __cutilGetLastError (msg, __FILE__, __LINE__)
00039 #define cutilCheckMsgAndSync(msg)    __cutilGetLastErrorAndSync (msg, __FILE__, __LINE__)
00040 #define cutilSafeMalloc(mallocCall)  __cutilSafeMalloc   ((mallocCall), __FILE__, __LINE__)
00041 #define cutilCondition(val)          __cutilCondition    (val, __FILE__, __LINE__)
00042 #define cutilExit(argc, argv)        __cutilExit         (argc, argv)
00043 
00044 inline cudaError cutilDeviceSynchronize()
00045 {
00046 #if CUDART_VERSION >= 4000
00047     return cudaDeviceSynchronize();
00048 #else
00049     return cudaThreadSynchronize();
00050 #endif
00051 }
00052 
00053 inline cudaError cutilDeviceReset()
00054 {
00055 #if CUDART_VERSION >= 4000
00056     return cudaDeviceReset();
00057 #else
00058     return cudaThreadExit();
00059 #endif
00060 }
00061 
00062 inline void __cutilCondition(int val, char *file, int line) 
00063 {
00064     if( CUTFalse == cutCheckCondition( val, file, line ) ) {
00065         exit(EXIT_FAILURE);
00066     }
00067 }
00068 
00069 inline void __cutilExit(int argc, char **argv)
00070 {     
00071     if (!cutCheckCmdLineFlag(argc, (const char**)argv, "noprompt")) {
00072         printf("\nPress ENTER to exit...\n");
00073         fflush( stdout);
00074         fflush( stderr);
00075         getchar();
00076     }
00077     exit(EXIT_SUCCESS);
00078 }
00079 
00080 #define MIN(a,b) ((a < b) ? a : b)
00081 #define MAX(a,b) ((a > b) ? a : b)
00082 
00083 // Beginning of GPU Architecture definitions
00084 inline int _ConvertSMVer2Cores(int major, int minor)
00085 {
00086     // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
00087     typedef struct {
00088         int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
00089         int Cores;
00090     } sSMtoCores;
00091 
00092     sSMtoCores nGpuArchCoresPerSM[] = 
00093     { { 0x10,  8 },
00094       { 0x11,  8 },
00095       { 0x12,  8 },
00096       { 0x13,  8 },
00097       { 0x20, 32 },
00098       { 0x21, 48 },
00099       {   -1, -1 } 
00100     };
00101 
00102     int index = 0;
00103     while (nGpuArchCoresPerSM[index].SM != -1) {
00104         if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
00105             return nGpuArchCoresPerSM[index].Cores;
00106         }
00107         index++;
00108     }
00109     printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
00110     return -1;
00111 }
00112 // end of GPU Architecture definitions
00113 
00114 // This function returns the best GPU (with maximum GFLOPS)
00115 inline int cutGetMaxGflopsDeviceId()
00116 {
00117     int current_device   = 0, sm_per_multiproc = 0;
00118     int max_compute_perf = 0, max_perf_device  = 0;
00119     int device_count     = 0, best_SM_arch     = 0;
00120     cudaDeviceProp deviceProp;
00121 
00122     cudaGetDeviceCount( &device_count );
00123     // Find the best major SM Architecture GPU device
00124     while ( current_device < device_count ) {
00125         cudaGetDeviceProperties( &deviceProp, current_device );
00126         if (deviceProp.major > 0 && deviceProp.major < 9999) {
00127             best_SM_arch = MAX(best_SM_arch, deviceProp.major);
00128         }
00129         current_device++;
00130     }
00131 
00132     // Find the best CUDA capable GPU device
00133     current_device = 0;
00134     while( current_device < device_count ) {
00135         cudaGetDeviceProperties( &deviceProp, current_device );
00136         if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
00137             sm_per_multiproc = 1;
00138         } else {
00139             sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
00140         }
00141 
00142         int compute_perf  = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
00143         if( compute_perf  > max_compute_perf ) {
00144             // If we find GPU with SM major > 2, search only these
00145             if ( best_SM_arch > 2 ) {
00146                 // If our device==dest_SM_arch, choose this, or else pass
00147                 if (deviceProp.major == best_SM_arch) { 
00148                     max_compute_perf  = compute_perf;
00149                     max_perf_device   = current_device;
00150                 }
00151             } else {
00152                 max_compute_perf  = compute_perf;
00153                 max_perf_device   = current_device;
00154             }
00155         }
00156         ++current_device;
00157     }
00158     return max_perf_device;
00159 }
00160 
00161 // This function returns the best GPU (with maximum GFLOPS)
00162 inline int cutGetMaxGflopsGraphicsDeviceId()
00163 {
00164     int current_device   = 0, sm_per_multiproc = 0;
00165     int max_compute_perf = 0, max_perf_device  = 0;
00166     int device_count     = 0, best_SM_arch     = 0;
00167     int bTCC = 0;
00168     cudaDeviceProp deviceProp;
00169 
00170     cudaGetDeviceCount( &device_count );
00171     // Find the best major SM Architecture GPU device that is graphics capable
00172     while ( current_device < device_count ) {
00173         cudaGetDeviceProperties( &deviceProp, current_device );
00174 
00175 #if CUDA_VERSION >= 3020
00176         if (deviceProp.tccDriver) bTCC = 1;
00177 #else
00178         // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
00179         if (deviceProp.name[0] == 'T') bTCC = 1;
00180 #endif
00181 
00182         if (!bTCC) {
00183             if (deviceProp.major > 0 && deviceProp.major < 9999) {
00184                 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
00185             }
00186         }
00187         current_device++;
00188     }
00189 
00190     // Find the best CUDA capable GPU device
00191     current_device = 0;
00192     while( current_device < device_count ) {
00193         cudaGetDeviceProperties( &deviceProp, current_device );
00194         if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
00195             sm_per_multiproc = 1;
00196         } else {
00197             sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
00198         }
00199 
00200 #if CUDA_VERSION >= 3020
00201         if (deviceProp.tccDriver) bTCC = 1;
00202 #else
00203         // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
00204         if (deviceProp.name[0] == 'T') bTCC = 1;
00205 #endif
00206 
00207         if (!bTCC) // Is this GPU running the TCC driver?  If so we pass on this
00208         {
00209             int compute_perf  = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
00210             if( compute_perf  > max_compute_perf ) {
00211                 // If we find GPU with SM major > 2, search only these
00212                 if ( best_SM_arch > 2 ) {
00213                     // If our device==dest_SM_arch, choose this, or else pass
00214                     if (deviceProp.major == best_SM_arch) { 
00215                         max_compute_perf  = compute_perf;
00216                         max_perf_device   = current_device;
00217                     }
00218                 } else {
00219                     max_compute_perf  = compute_perf;
00220                     max_perf_device   = current_device;
00221                 }
00222             }
00223         }
00224         ++current_device;
00225     }
00226     return max_perf_device;
00227 }
00228 
00229 // Give a little more for Windows : the console window often disapears before we can read the message
00230 #ifdef _WIN32
00231 # if 1//ndef UNICODE
00232 #  ifdef _DEBUG // Do this only in debug mode...
00233     inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
00234     {
00235         size_t fmt2_sz  = 2048;
00236         char *fmt2      = (char*)malloc(fmt2_sz);
00237         va_list  vlist;
00238         va_start(vlist, fmt);
00239         while((_vsnprintf(fmt2, fmt2_sz, fmt, vlist)) < 0) // means there wasn't anough room
00240         {
00241             fmt2_sz *= 2;
00242             if(fmt2) free(fmt2);
00243             fmt2 = (char*)malloc(fmt2_sz);
00244         }
00245         OutputDebugStringA(fmt2);
00246         fprintf(file, fmt2);
00247         free(fmt2);
00248     }
00249 #   define FPRINTF(a) VSPrintf a
00250 #  else //debug
00251 #   define FPRINTF(a) fprintf a
00252 // For other than Win32
00253 #  endif //debug
00254 # else //unicode
00255 // Unicode case... let's give-up for now and keep basic printf
00256 #   define FPRINTF(a) fprintf a
00257 # endif //unicode
00258 #else //win32
00259 #   define FPRINTF(a) fprintf a
00260 #endif //win32
00261 
00262 // NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line
00263 // when the user double clicks on the error line in the Output pane. Like any compile error.
00264 
00265 inline void __cudaSafeCallNoSync( cudaError err, const char *file, const int line )
00266 {
00267     if( cudaSuccess != err) {
00268         FPRINTF((stderr, "%s(%i) : cudaSafeCallNoSync() Runtime API error %d : %s.\n",
00269                 file, line, (int)err, cudaGetErrorString( err ) ));
00270         exit(-1);
00271     }
00272 }
00273 
00274 inline void __cudaSafeCall( cudaError err, const char *file, const int line )
00275 {
00276     if( cudaSuccess != err) {
00277         FPRINTF((stderr, "%s(%i) : cudaSafeCall() Runtime API error %d: %s.\n",
00278                 file, line, (int)err, cudaGetErrorString( err ) ));
00279         exit(-1);
00280     }
00281 }
00282 
00283 inline void __cudaSafeThreadSync( const char *file, const int line )
00284 {
00285     cudaError err = cutilDeviceSynchronize();
00286     if ( cudaSuccess != err) {
00287         FPRINTF((stderr, "%s(%i) : cudaDeviceSynchronize() Runtime API error %d: %s.\n",
00288                 file, line, (int)err, cudaGetErrorString( err ) ));
00289         exit(-1);
00290     }
00291 }
00292 
00293 inline void __cufftSafeCall( cufftResult err, const char *file, const int line )
00294 {
00295     if( CUFFT_SUCCESS != err) {
00296         FPRINTF((stderr, "%s(%i) : cufftSafeCall() CUFFT error %d: ",
00297                 file, line, (int)err));
00298         switch (err) {
00299             case CUFFT_INVALID_PLAN:   FPRINTF((stderr, "CUFFT_INVALID_PLAN\n"));
00300             case CUFFT_ALLOC_FAILED:   FPRINTF((stderr, "CUFFT_ALLOC_FAILED\n"));
00301             case CUFFT_INVALID_TYPE:   FPRINTF((stderr, "CUFFT_INVALID_TYPE\n"));
00302             case CUFFT_INVALID_VALUE:  FPRINTF((stderr, "CUFFT_INVALID_VALUE\n"));
00303             case CUFFT_INTERNAL_ERROR: FPRINTF((stderr, "CUFFT_INTERNAL_ERROR\n"));
00304             case CUFFT_EXEC_FAILED:    FPRINTF((stderr, "CUFFT_EXEC_FAILED\n"));
00305             case CUFFT_SETUP_FAILED:   FPRINTF((stderr, "CUFFT_SETUP_FAILED\n"));
00306             case CUFFT_INVALID_SIZE:   FPRINTF((stderr, "CUFFT_INVALID_SIZE\n"));
00307             case CUFFT_UNALIGNED_DATA: FPRINTF((stderr, "CUFFT_UNALIGNED_DATA\n"));
00308             default: FPRINTF((stderr, "CUFFT Unknown error code\n"));
00309         }
00310         exit(-1);
00311     }
00312 }
00313 
00314 inline void __cutilCheckError( CUTBoolean err, const char *file, const int line )
00315 {
00316     if( CUTTrue != err) {
00317         FPRINTF((stderr, "%s(%i) : CUTIL CUDA error.\n",
00318                 file, line));
00319         exit(-1);
00320     }
00321 }
00322 
00323 inline void __cutilGetLastError( const char *errorMessage, const char *file, const int line )
00324 {
00325     cudaError_t err = cudaGetLastError();
00326     if( cudaSuccess != err) {
00327         FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
00328                 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00329         exit(-1);
00330     }
00331 }
00332 
00333 inline void __cutilGetLastErrorAndSync( const char *errorMessage, const char *file, const int line )
00334 {
00335     cudaError_t err = cudaGetLastError();
00336     if( cudaSuccess != err) {
00337         FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
00338                 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00339         exit(-1);
00340     }
00341 
00342     err = cutilDeviceSynchronize();
00343     if( cudaSuccess != err) {
00344         FPRINTF((stderr, "%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : (%d) %s.\n",
00345                 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00346         exit(-1);
00347     }
00348 }
00349 
00350 inline void __cutilSafeMalloc( void *pointer, const char *file, const int line )
00351 {
00352     if( !(pointer)) {
00353         FPRINTF((stderr, "%s(%i) : cutilSafeMalloc host malloc failure\n",
00354                 file, line));
00355         exit(-1);
00356     }
00357 }
00358 
00359 #if __DEVICE_EMULATION__
00360     inline int cutilDeviceInit(int ARGC, char **ARGV) { }
00361     inline int cutilChooseCudaDevice(int ARGC, char **ARGV) { }
00362 #else
00363     inline int cutilDeviceInit(int ARGC, char **ARGV)
00364     {
00365         int deviceCount;
00366         cutilSafeCallNoSync(cudaGetDeviceCount(&deviceCount));
00367         if (deviceCount == 0) {
00368             FPRINTF((stderr, "CUTIL CUDA error: no devices supporting CUDA.\n"));
00369             exit(-1);
00370         }
00371         int dev = 0;
00372         cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);
00373         if (dev < 0) 
00374             dev = 0;
00375         if (dev > deviceCount-1) {
00376             fprintf(stderr, "\n");
00377             fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
00378             fprintf(stderr, ">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
00379             fprintf(stderr, "\n");
00380             return -dev;
00381         }  
00382         cudaDeviceProp deviceProp;
00383         cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev));
00384         if (deviceProp.major < 1) {
00385             FPRINTF((stderr, "cutil error: GPU device does not support CUDA.\n"));
00386             exit(-1);                                                  \
00387         }
00388         printf("> Using CUDA device [%d]: %s\n", dev, deviceProp.name);
00389         cutilSafeCall(cudaSetDevice(dev));
00390 
00391         return dev;
00392     }
00393 
00394     // General initialization call to pick the best CUDA Device
00395     inline int cutilChooseCudaDevice(int argc, char **argv)
00396     {
00397         cudaDeviceProp deviceProp;
00398         int devID = 0;
00399         // If the command-line has a device number specified, use it
00400         if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
00401             devID = cutilDeviceInit(argc, argv);
00402             if (devID < 0) {
00403                printf("exiting...\n");
00404                cutilExit(argc, argv);
00405                exit(0);
00406             }
00407         } else {
00408             // Otherwise pick the device with highest Gflops/s
00409             devID = cutGetMaxGflopsDeviceId();
00410             cutilSafeCallNoSync( cudaSetDevice( devID ) );
00411             cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
00412             printf("> Using CUDA device [%d]: %s\n", devID, deviceProp.name);
00413         }
00414         return devID;
00415     }
00416 #endif
00417 
00418 
00420 inline void cutilCudaCheckCtxLost(const char *errorMessage, const char *file, const int line ) 
00421 {
00422     cudaError_t err = cudaGetLastError();
00423     if( cudaSuccess != err) {
00424         FPRINTF((stderr, "%s(%i) : CUDA error: %s : (%d) %s.\n",
00425                  file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00426         exit(-1);
00427     }
00428     err = cutilDeviceSynchronize();
00429     if( cudaSuccess != err) {
00430         FPRINTF((stderr, "%s(%i) : CUDA error: %s : (%d) %s.\n",
00431         file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
00432         exit(-1);
00433     }
00434 }
00435 
00436 #ifndef STRCASECMP
00437 #ifdef _WIN32
00438 #define STRCASECMP  _stricmp
00439 #else
00440 #define STRCASECMP  strcasecmp
00441 #endif
00442 #endif
00443 
00444 #ifndef STRNCASECMP
00445 #ifdef _WIN32
00446 #define STRNCASECMP _strnicmp
00447 #else
00448 #define STRNCASECMP strncasecmp
00449 #endif
00450 #endif
00451 
00452 inline void __cutilQAFinish(int argc, char **argv, bool bStatus)
00453 {
00454     const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
00455 
00456     bool bFlag = false;
00457     for (int i=1; i < argc; i++) {
00458         if (!STRCASECMP(argv[i], "-qatest") || !STRCASECMP(argv[i], "-noprompt")) {
00459             bFlag |= true;
00460         }
00461     }
00462 
00463     if (bFlag) {
00464         printf("&&&& %s %s", sStatus[bStatus], argv[0]);
00465         for (int i=1; i < argc; i++) printf(" %s", argv[i]);
00466     } else {
00467         printf("[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
00468     }
00469 }
00470 
00471 // General check for CUDA GPU SM Capabilities
00472 inline bool cutilCudaCapabilities(int major_version, int minor_version, int argc, char **argv)
00473 {
00474     cudaDeviceProp deviceProp;
00475     deviceProp.major = 0;
00476     deviceProp.minor = 0;
00477     int dev;
00478 
00479 #ifdef __DEVICE_EMULATION__
00480     printf("> Compute Device Emulation Mode \n");
00481 #endif
00482 
00483     cutilSafeCall( cudaGetDevice(&dev) );
00484     cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev));
00485 
00486     if((deviceProp.major > major_version) ||
00487        (deviceProp.major == major_version && deviceProp.minor >= minor_version))
00488     {
00489         printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
00490         return true;
00491     }
00492     else
00493     {
00494         printf("There is no device supporting CUDA compute capability %d.%d.\n", major_version, minor_version);
00495 //        __cutilQAFinish(argc, argv, true);
00496         return false;
00497     }
00498 }
00499 
00500 #endif // _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
fupermod: Functional Performance Models of heterogeneous processors

cblas_wrappers/cuda-cc-sdk/4.0/cutil_inline_runtime.h

Projects

Hosted by UCD Open Source Lab