00001 #ifndef CUDA_AUXILIARY_H_
00002 #define CUDA_AUXILIARY_H_
00003
00004 #ifdef HAVE_LIBCUBLAS
00005 #include "fupermod_cblas.h"
00006 #include <stdio.h>
00007 #include <stdlib.h>
00008 #include <string.h>
00009 #include <cublas_v2.h>
00010 #include <cuda_runtime.h>
00011
00012 #define gama 0.6
00013 #define PRINT 1
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00029 # define CUBLAS_SAFE_CALL( call ) { \
00030 cublasStatus_t err = call; \
00031 if( CUBLAS_STATUS_SUCCESS != err) { \
00032 fprintf(stderr, "Cublas error %d in file '%s' in line %i.\n", \
00033 err, __FILE__, __LINE__ ); \
00034 exit(EXIT_FAILURE); \
00035 } }
00036
00038 inline cublasOperation_t trans_cblas2cublas(enum CBLAS_TRANSPOSE trans) {
00039 return (cublasOperation_t)(trans - CblasNoTrans);
00040 }
00041
00042 #ifdef ENABLE_BLAS_SP
00043 #define CUBLAS_GEMM cublasSgemm
00044 #else
00045 #define CUBLAS_GEMM cublasDgemm
00046 #endif
00047
00048 #define hA_ref(i) (A+(i)*(offset_a))
00049 #define hB_ref(i) (B+(i)*(offset_b))
00050 #define hC_ref(i,j) (C+(i)*(offset_c)+(j)*(mm)*(N))
00051 #define dA_ref(i) (dA+(i)*(offset_a))
00052 #define dB_ref(i) (dB+(i)*(offset_b))
00053 #define dC_ref(i,j) (dC+(i)*(offset_c)+(j)*(mm)*(dN))
00054
00055 #define hA_rref(i) ((i)*(offset_a))
00056 #define hB_rref(i) ((i)*(offset_b))
00057 #define hC_rref(i,j) ((i)*(offset_c)+(j)*(mm)*(N))
00058 #define dA_rref(i) ((i)*(offset_a))
00059 #define dB_rref(i) ((i)*(offset_b))
00060 #define dC_rref(i,j) ((i)*(offset_c)+(j)*(mm)*(dN))
00061
00067 typedef struct cublasxgemm_params {
00069 fupermod_float **dA, **dB, **dC;
00071
00072
00073
00074 fupermod_float **pC;
00076 fupermod_float *workspace_WA, *workspace_WB;
00077
00078 fupermod_float *dma_WB;
00080 cublasHandle_t* handles;
00082 int idx_outer;
00084 int num_iteration;
00086 int kernel_type;
00088 int num_device_use;
00089
00090
00092 int device_id;
00094 int num_devices;
00096 int memory_mode;
00098 int algorithm;
00100 int is_overlap;
00101 } cublasxgemm_params;
00102
00103 inline size_t getDeviceMemory(int device) {
00104 if (cudaGetDevice(&device) != cudaSuccess)
00105 return 0;
00106 struct cudaDeviceProp properties;
00107 if (cudaGetDeviceProperties(&properties, device) != cudaSuccess)
00108 return 0;
00109 return properties.totalGlobalMem;
00110 }
00111
00112
00113 inline int summer(int* array, int size)
00114 {
00115 if (size == 0) {
00116 return 0;
00117 } else {
00118 int i = 0;
00119 int sum = 0;
00120 for (i = 0; i < size; i++) {
00121 sum += *(array + i);
00122 }
00123 return sum;
00124 }
00125 }
00126
00127
00128 inline fupermod_float* addr(const fupermod_float* ptr, int i, int j, int* height, int* width, int N, int K, int flag)
00129 {
00130 fupermod_float* address = (fupermod_float*)ptr;
00131 if (flag == 1)
00132 address += summer(height, i) * K * K;
00133 else if (flag == 2)
00134 address += summer(width, j) * K;
00135 else if (flag == 3)
00136 address += summer(height, i) * K * N + summer(width, j) * K;
00137 else if (flag == 4)
00138 address += i * height[0] * K * K;
00139 else
00140 address += i * height[0] * width[0] * K * K;
00141
00142 return address;
00143 }
00144
00145
00146 inline fupermod_float* addr1D(const fupermod_float* ptr, int i, int j, int* height, int* width, int h_, int w_, int K, int flag)
00147 {
00148 fupermod_float* address = (fupermod_float*)ptr;
00149 if (flag == 1)
00150 address += summer(height, i) * K * K;
00151 else if (flag == 2)
00152 address += summer(width, j) * K * K;
00153 else if (flag == 3) {
00154 int l;
00155 for (l = 0; l < j; l++) {
00156 address += summer(height, h_) * K * width[j] * K;
00157 }
00158 address += summer(height, i) * K * width[j] * K;
00159 }
00160 else if (flag == 4)
00161 address += i * height[0] * K * K;
00162 else
00163 address += i * height[0] * width[0] * K * K;
00164
00165 return address;
00166 }
00167
00168
00169 typedef struct transfer_t {
00170 cublasxgemm_params *params;
00171 fupermod_float *C;
00172 int i;
00173 int j;
00174 int buf;
00175 int *height;
00176 int *width;
00177 int N;
00178 int K;
00179 int id;
00180 int direction;
00181 } transfer_t;
00182
00183 inline transfer_t *get_transfer(cublasxgemm_params *params, fupermod_float *C, int i, int j, int buf, int *h, int *w, int N, int K, int id, int direction) {
00184 transfer_t *t = (transfer_t *) malloc(sizeof(transfer_t));
00185 t->params = params;
00186 t->C = C;
00187 t->i = i;
00188 t->j = j;
00189 t->buf = buf;
00190 t->height = h;
00191 t->width = w;
00192 t->N = N;
00193 t->K = K;
00194 t->id = id;
00195 t->direction = direction;
00196 return t;
00197 }
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237 void cuda_gemm_basic(fupermod_gemm* gemm,
00238 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00239 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00240 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00241
00242 void cuda_gemm(fupermod_gemm* gemm,
00243 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00244 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00245 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00246
00247
00248 void gemm_kernel_gpu_single_fit(fupermod_gemm* gemm,
00249 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00250 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00251 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00252
00253 void gemm_kernel_gpu_single_exceed(fupermod_gemm* gemm,
00254 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00255 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00256 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00257
00258 void gemm_kernel_gpu_single_exceed_0(fupermod_gemm* gemm,
00259 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00260 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00261 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00262
00263 void gemm_kernel_gpu_single_exceed_1(fupermod_gemm* gemm,
00264 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00265 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00266 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00267
00268 void gemm_kernel_gpu_single_exceed_2(fupermod_gemm* gemm,
00269 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00270 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00271 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00272
00273 void gemm_kernel_gpu_single_exceed_3(fupermod_gemm* gemm,
00274 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00275 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00276 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00277
00278
00279
00280
00281
00282
00283
00284 void gemm_kernel_gpu_server_fit(fupermod_gemm* gemm,
00285 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00286 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00287 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00288
00289 void gemm_kernel_gpu_server_exceed(fupermod_gemm* gemm,
00290 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00291 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00292 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00293
00294 void gemm_kernel_gpu_server_exceed_nd2_1(fupermod_gemm* gemm,
00295 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00296 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00297 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00298
00299 void gemm_kernel_gpu_server_exceed_nd2_2(fupermod_gemm* gemm,
00300 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00301 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00302 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00303
00304 void gemm_kernel_gpu_server_exceed_nd2_3(fupermod_gemm* gemm,
00305 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00306 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00307 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00308
00309 void gemm_kernel_gpu_server_exceed_np2_1(fupermod_gemm* gemm,
00310 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00311 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00312 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00313
00314 void gemm_kernel_gpu_server_exceed_np2_2(fupermod_gemm* gemm,
00315 const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
00316 const int M, const int N, const int K, const fupermod_float alpha, const fupermod_float *A, const int lda,
00317 const fupermod_float *B, const int ldb, const fupermod_float beta, fupermod_float *C, const int ldc);
00318
00319
00320 void matrix_split_single_fit(int M, int N, int K, int *h_, int *w_, int **height, int **width);
00321 void matrix_split_single_exceed(int M, int N, int K, int mem_size, int *h_, int *w_, int **height, int **width);
00322 void matrix_split_server_exceed(int M, int N, int K, int mem_size_device, int num_device_use, int *h_, int *w_, int **height, int **width);
00323
00324
00325 #endif
00326 #endif