
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include <hmpi.h>
#include <cblas.h>


/* Global problem dimension; main() derives its per-process block edge
 * from it as N / sqrt(number of processors). */
#define N 4096

/*
 * max(a, b): the larger of the two arguments (yields `a` when neither
 * compares smaller than the other).  Any earlier definition of `max` is
 * discarded first.  Being a function-like macro, the selected argument is
 * evaluated twice, so do not pass expressions with side effects.
 */
#if defined(max)
#  undef max
#endif
#define max(a, b)       ((b) > (a) ? (b) : (a))

/*
 * Per-process DGEMM driver.
 *
 * Initializes the HMPI runtime, derives a square block edge
 * recon_n = N / sqrt(number of processors), allocates three zero-filled
 * recon_n x recon_n matrices of doubles, prints the dimensions, runs
 * cblas_dgemm on them (alpha = 1, beta = 0, i.e. C = A*B), then frees the
 * matrices and finalizes HMPI.
 *
 * Returns EXIT_SUCCESS when the multiplication was issued, EXIT_FAILURE
 * when the processor count is unusable or an allocation fails.
 */
int main( int argc, char **argv)
{
    double *a = NULL, *b = NULL, *c = NULL;

    int status = EXIT_FAILURE;

    int nump;

    struct timeval start;

    /* Start timestamp; nothing in this function reads it back (yet). */
    gettimeofday(&start, NULL);

    // Initialize HMPI runtime
    HMPI_Init(&argc, &argv);

    nump = HMPI_Get_number_of_processors();
    if (nump < 1) {
        /* sqrt(0) would make the division below infinite, and converting
         * that to int is undefined behaviour. */
        fprintf(stderr, "invalid number of processors: %d\n", nump);
        goto out;
    }

    {
      int m,n,k,lda,ldb,ldc;
      double alpha=1.0, beta=0.0;
      int recon_n = N/sqrt(nump);
      size_t nelem;

      if (recon_n < 1) {
          /* More processors than N*N: an empty block would also yield a
           * zero leading dimension for the BLAS call. */
          fprintf(stderr, "block size %d too small (N=%d, nump=%d)\n",
                  recon_n, N, nump);
          goto out;
      }

      nelem = (size_t)recon_n * (size_t)recon_n;

      /* calloc so that the inputs read by dgemm hold defined values
       * (zeros) rather than indeterminate heap contents. */
      a = calloc(nelem, sizeof *a);
      b = calloc(nelem, sizeof *b);
      c = calloc(nelem, sizeof *c);
      if (a == NULL || b == NULL || c == NULL) {
          fprintf(stderr, "failed to allocate three %dx%d matrices\n",
                  recon_n, recon_n);
          goto out;
      }

      /*
       * call BLAS routine to compute C=A*B
       */
      m = n = k = lda =ldb = ldc = recon_n;
      printf("me=%d, m=%d, n=%d, k=%d, lda=%d, ldb=%d, ldc=%d\n", HMPI_Group_rank(HMPI_COMM_WORLD_GROUP), m, n, k, lda, ldb, ldc);

      cblas_dgemm(
           101,     /* storage order: CblasRowMajor in the reference CBLAS header */
           111,     /* op(A): CblasNoTrans in the reference CBLAS header */
           111,     /* op(B): CblasNoTrans in the reference CBLAS header */
           m,
           n,
           k,
           alpha,
           a,
           lda,
           b,
           ldb,
           beta,
           c,
           ldc
      );

      status = EXIT_SUCCESS;
    }

out:
    /* free(NULL) is a no-op, so this is safe on every path. */
    free(a);
    free(b);
    free(c);

    /* NOTE(review): argument kept at 0 as in the original call; confirm
     * whether failures should be reported through it as well. */
    HMPI_Finalize(0);

    return status;
}
