fupermod: Functional Performance Models of heterogeneous processors

File List
00001 #ifndef CUDA_AUXILIARY_H_
00002 #define CUDA_AUXILIARY_H_
00003 
00004 #ifdef HAVE_LIBCUBLAS
00005 #include "fupermod_cblas.h"
00006 #include <stdio.h>
00007 #include <stdlib.h>
00008 #include <string.h>
00009 #include <cublas_v2.h>
00010 #include <cuda_runtime.h>
00011 
00012 #define gama 0.6
00013 #define PRINT 1
00014 
00015 /* CUBLAS status type returns: 
00016 typedef enum{
00017       CUBLAS_STATUS_SUCCESS          =0,
00018       CUBLAS_STATUS_NOT_INITIALIZED  =1,
00019       CUBLAS_STATUS_ALLOC_FAILED     =3,
00020       CUBLAS_STATUS_INVALID_VALUE    =7,
00021       CUBLAS_STATUS_ARCH_MISMATCH    =8,
00022       CUBLAS_STATUS_MAPPING_ERROR    =11,
00023       CUBLAS_STATUS_EXECUTION_FAILED =13,
00024       CUBLAS_STATUS_INTERNAL_ERROR   =14
00025 } cublasStatus_t;
00026 */
00027 
00029 #  define CUBLAS_SAFE_CALL( call ) {                                         \
00030     cublasStatus_t err = call;                                               \
00031     if( CUBLAS_STATUS_SUCCESS != err) {                                      \
00032         fprintf(stderr, "Cublas error %d in file '%s' in line %i.\n",        \
00033                 err, __FILE__, __LINE__ );                                   \
00034         exit(EXIT_FAILURE);                                                  \
00035     } }
00036 
00038 inline cublasOperation_t trans_cblas2cublas(enum CBLAS_TRANSPOSE trans) {
00039   return (cublasOperation_t)(trans - CblasNoTrans);
00040 }
00041 
00042 #ifdef ENABLE_BLAS_SP
00043 #define CUBLAS_GEMM cublasSgemm
00044 #else
00045 #define CUBLAS_GEMM cublasDgemm
00046 #endif
00047 
00048 #define hA_ref(i)   (A+(i)*(offset_a))
00049 #define hB_ref(i)   (B+(i)*(offset_b))
00050 #define hC_ref(i,j) (C+(i)*(offset_c)+(j)*(mm)*(N))
00051 #define dA_ref(i)   (dA+(i)*(offset_a))
00052 #define dB_ref(i)   (dB+(i)*(offset_b))
00053 #define dC_ref(i,j) (dC+(i)*(offset_c)+(j)*(mm)*(dN))
00054 
00055 #define hA_rref(i)   ((i)*(offset_a))
00056 #define hB_rref(i)   ((i)*(offset_b))
00057 #define hC_rref(i,j) ((i)*(offset_c)+(j)*(mm)*(N))
00058 #define dA_rref(i)   ((i)*(offset_a))
00059 #define dB_rref(i)   ((i)*(offset_b))
00060 #define dC_rref(i,j) ((i)*(offset_c)+(j)*(mm)*(dN))
00061 
00067 typedef struct cublasxgemm_params {
00069   fupermod_float **dA, **dB, **dC;
00071   // try to avoid conflicts with IB when allocating huge pinned memory for C
00072   // it will only allocate pinned memory of the memory capacuties of devices
00073   // use stream callback function to schedule data transfers
00074   fupermod_float **pC;
00076   fupermod_float *workspace_WA, *workspace_WB;
00077   // somehow MPI_Bcast does not work on pinned memory, so workspace_WB is 
00078   fupermod_float *dma_WB;
00080   cublasHandle_t* handles;
00082   int idx_outer; 
00084   int num_iteration; // note: starting from 1, not 0
00086   int kernel_type;
00088   int num_device_use;
00089     
00090   // parameters from conf
00092   int device_id;
00094   int num_devices;
00096   int memory_mode;
00098   int algorithm;
00100   int is_overlap;
00101 } cublasxgemm_params;
00102 
00103 inline size_t getDeviceMemory(int device) {
00104   if (cudaGetDevice(&device) != cudaSuccess)
00105     return 0;
00106   struct cudaDeviceProp properties;
00107   if (cudaGetDeviceProperties(&properties, device) != cudaSuccess)
00108     return 0;
00109   return properties.totalGlobalMem; //bytes
00110 }
00111 
00112 // get the sum of an array
00113 inline int summer(int* array, int size)
00114 {
00115   if (size == 0) {
00116     return 0;
00117   } else {
00118     int i = 0;
00119     int sum = 0;
00120     for (i = 0; i < size; i++) {
00121       sum += *(array + i);
00122     }
00123     return sum;
00124   }
00125 }
00126 
00127 // get the address of a block from index. Plz refer to my matrix multiplicaiton kernel documentation
00128 inline fupermod_float* addr(const fupermod_float* ptr, int i, int j, int* height, int* width, int N, int K, int flag)
00129 {
00130   fupermod_float* address = (fupermod_float*)ptr;
00131   if (flag == 1) // A(i)
00132     address += summer(height, i) * K * K;
00133   else if (flag == 2) // B(i)
00134     address += summer(width, j) * K;
00135   else if (flag == 3) // C(i,j)
00136     address += summer(height, i) * K * N + summer(width, j) * K;
00137   else if (flag == 4) // A_(i)
00138     address += i * height[0] * K * K;
00139   else // C_(i)
00140     address += i * height[0] * width[0] * K * K;
00141     
00142   return address;
00143 }
00144 
00145 // get the address of a block from index. Plz refer to my matrix multiplicaiton kernel documentation
00146 inline fupermod_float* addr1D(const fupermod_float* ptr, int i, int j, int* height, int* width, int h_, int w_, int K, int flag)
00147 {
00148   fupermod_float* address = (fupermod_float*)ptr;
00149   if (flag == 1) // A(i)
00150     address += summer(height, i) * K * K;
00151   else if (flag == 2) // B(i)
00152     address += summer(width, j) * K * K;
00153   else if (flag == 3) {// C(i,j) 
00154     int l;
00155     for (l = 0; l < j; l++) {
00156       address += summer(height, h_) * K * width[j] * K;
00157     }
00158     address += summer(height, i) * K * width[j] * K;
00159   }
00160   else if (flag == 4) // A_(i)
00161     address += i * height[0] * K * K;
00162   else // C_(i)
00163     address += i * height[0] * width[0] * K * K;
00164     
00165   return address;
00166 }
00167 
00168 // struct for transfer data between C and temporary pinned buffer for C
00169 typedef struct transfer_t {
00170   cublasxgemm_params *params;
00171   fupermod_float *C;
00172   int i;
00173   int j;
00174   int buf;
00175   int *height;
00176   int *width;
00177   int N;
00178   int K;
00179   int id;
00180   int direction;
00181 } transfer_t;
00182 
00183 inline transfer_t *get_transfer(cublasxgemm_params *params, fupermod_float *C, int i, int j, int buf, int *h, int *w, int N, int K, int id, int direction) {
00184   transfer_t *t = (transfer_t *) malloc(sizeof(transfer_t));
00185   t->params = params;
00186   t->C = C;
00187   t->i = i;
00188   t->j = j;
00189   t->buf = buf;
00190   t->height = h;
00191   t->width = w;
00192   t->N = N;
00193   t->K = K;
00194   t->id = id;
00195   t->direction = direction;
00196   return t;
00197 }
00198 /*
00199 void CUDART_CB transfer_cb(cudaStream_t stream, cudaError_t status, void *transfer) {
00200   if (status != cudaSuccess) {
00201     fprintf(stderr, "%s %s: call back function status error", __FILE__, __func__);
00202   }  
00203   transfer_t *t = (transfer_t *) transfer;
00204   cublasxgemm_params *params = t->params;
00205   fupermod_float *C = t->C;
00206   int i = t->i;
00207   int j = t->j;
00208   int buf = t->buf;
00209   int *height = t->height;
00210   int *width = t->width;
00211   int N = t->N;
00212   int K = t->K;
00213   int id = t->id;
00214   int direction = t->direction;
00215   
00216   size_t count = sizeof(fupermod_float) * width[j+id] * K;
00217   int l;
00218   if (direction == 0) {
00219     // C --> pC, followed by pC --> dC
00220     fupermod_float *src = addr(C,i,j+id,height,width,N,K,3);
00221     fupermod_float *des = addr(params->pC[id],buf,j,height,width,N,K,5);
00222     for (l = 0; l < height[i] * K; l++) {
00223       memcpy(des + l * width[0] * K, src + l * N, count);
00224     }
00225   } else {
00226     // pC --> C, after dC --> pC done
00227     fupermod_float *src = addr(params->pC[id],buf,j,height,width,N,K,5);
00228     fupermod_float *des = addr(C,i,j+id,height,width,N,K,3);
00229     for (l = 0; l < height[i] * K; l++) {
00230       memcpy(des + l * N, src + l * width[0] * K, count);
00231     }
00232   }
00233   free(t); t = NULL;
00234 }*/
00235 
00236 /*-------------------------------------------------------------------------------------------------------------------*/
00237 void cuda_gemm_basic(fupermod_gemm* gemm,
00238           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00239           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00240           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00241 
00242 void cuda_gemm(fupermod_gemm* gemm,
00243           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00244           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00245           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00246 
00247 /*-------------------------------------------------------------------------------------------------------------------*/
00248 void gemm_kernel_gpu_single_fit(fupermod_gemm* gemm,
00249           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00250           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00251           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00252 
00253 void gemm_kernel_gpu_single_exceed(fupermod_gemm* gemm,
00254           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00255           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00256           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00257 
00258 void gemm_kernel_gpu_single_exceed_0(fupermod_gemm* gemm,
00259           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00260           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00261           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00262 
00263 void gemm_kernel_gpu_single_exceed_1(fupermod_gemm* gemm,
00264           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00265           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00266           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00267 
00268 void gemm_kernel_gpu_single_exceed_2(fupermod_gemm* gemm,
00269           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00270           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00271           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00272 
00273 void gemm_kernel_gpu_single_exceed_3(fupermod_gemm* gemm,
00274           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00275           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00276           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00277 /*
00278 void gemm_kernel_gpu_single_exceed_4(fupermod_gemm* gemm,
00279           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00280           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00281           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00282 */          
00283 /*-------------------------------------------------------------------------------------------------------------------*/
00284 void gemm_kernel_gpu_server_fit(fupermod_gemm* gemm,
00285           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00286           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00287           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00288 
00289 void gemm_kernel_gpu_server_exceed(fupermod_gemm* gemm,
00290           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00291           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00292           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00293 
00294 void gemm_kernel_gpu_server_exceed_nd2_1(fupermod_gemm* gemm,
00295           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00296           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00297           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00298 
00299 void gemm_kernel_gpu_server_exceed_nd2_2(fupermod_gemm* gemm,
00300           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00301           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00302           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00303 
00304 void gemm_kernel_gpu_server_exceed_nd2_3(fupermod_gemm* gemm,
00305           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00306           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00307           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00308           
00309 void gemm_kernel_gpu_server_exceed_np2_1(fupermod_gemm* gemm,
00310           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00311           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00312           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00313 
00314 void gemm_kernel_gpu_server_exceed_np2_2(fupermod_gemm* gemm,
00315           const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00316           const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00317           const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00318 
00319 /*-------------------------------------------------------------------------------------------------------------------*/
00320 void matrix_split_single_fit(int M, int N, int K, int *h_, int *w_, int **height, int **width);
00321 void matrix_split_single_exceed(int M, int N, int K, int mem_size, int *h_, int *w_, int **height, int **width);
00322 void matrix_split_server_exceed(int M, int N, int K, int mem_size_device, int num_device_use, int *h_, int *w_, int **height, int **width);
00323 
00324 /*-------------------------------------------------------------------------------------------------------------------*/
00325 #endif          
00326 #endif /* CUDA_AUXILIARY_H_ */
fupermod: Functional Performance Models of heterogeneous processors

cblas_wrappers/cuda_auxiliary.hpp

Projects

Hosted by UCD Open Source Lab