
/*
   This version automatically tunes the number of processes and
   the block size.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <hmpi.h>
#include "reloj.h"
#include "Performance_model.c"
#include "LSWindowing.h"

/* Fixed seed handed to srand() before the random fill of G, so the
   generated data is the same on every run. */
#define  SEED  23
/* Larger of two values. Function-like macro: each argument may be evaluated
   twice, so do not pass expressions with side effects. */
#define max(a, b)       ((a) < (b) ? (b) : (a))

/* Global variables */
/* Timing slot that Perf_func() hands to reloj_() and that main() prints as
   the "bench" time of each process. */
double elapsed_bench;

/*
 * Driver of the auto-tuned run.
 *
 * Command line (after "hmpirun <prog> --"): n filter nbinit nbinc nbend nbb
 *   n       global number of rows handed to numroc_/descinit_
 *   filter  number of columns of G
 *   nbinit, nbend, nbinc
 *           range and step of candidate block sizes (the bounds are swapped
 *           if given in the wrong order)
 *   nbb     block size passed to the benchmark (Benchmark_function)
 *
 * Steps:
 *   1. HMPI_Recon runs Benchmark_function on the members of the world group.
 *   2. The host calls HMPI_Timeof for every (procs, nb) candidate and keeps
 *      the pair with the smallest predicted time.
 *   3. An HMPI group is created with the selected parameters and a p x 1
 *      BLACS grid is built on its communicator.
 *   4. G is filled with pseudo-random values and pdtrf_ is timed between two
 *      BLACS barriers; the timings are printed.
 *
 * Returns EXIT_SUCCESS on success and -1 on a usage or configuration error.
 */
int main( int argc, char **argv ) {

  /* Smallest number of processes the search below will consider. */
  enum { MIN_PROCS = 4 };

  HMPI_Group gid;
  int param_count = 5;
  int modelparams[5] = { 0 };
  int dim = 1;

  int output_p;
  int input_p[2];
  int *coord = NULL;

  int p;

  /* Wall-clock time of the factorization; stays 0 when NOCOMPILE is set */
  double elapsed = 0.0;

  // Variables for the test
  int n, filter, nb;
  int nbb;
  int nbinit, nbend, nbinc;

  MPI_Comm mxmcomm;
  int ictxt;

  char prog_name[80];

  /* Base name of the executable, taken without modifying argv[0] (which is
     still handed to HMPI_Init below) and truncated to fit prog_name. */
  const char *base = ( argc > 0 && argv[0] != NULL ) ? argv[0] : "";
  const char *slash = strrchr( base, '/' );
  if( slash != NULL && slash[1] != '\0' ) base = slash + 1;
  snprintf( prog_name, sizeof prog_name, "%s", base );

  /* Initialization of the HMPI env. */
  HMPI_Init(&argc, &argv);

  /* Input parameters */
  if( argc<7 ) {
    if( HMPI_Is_host() ) printf("usage: hmpirun %s -- n filter nbinit nbinc nbend nbofbench \n",prog_name);
    HMPI_Finalize(0);
    return -1;
  }
  int parsed = 0;
  parsed += ( sscanf( argv[1], "%d", &n ) == 1 );
  parsed += ( sscanf( argv[2], "%d", &filter ) == 1 );
  parsed += ( sscanf( argv[3], "%d", &nbinit ) == 1 );
  parsed += ( sscanf( argv[4], "%d", &nbinc ) == 1 );
  parsed += ( sscanf( argv[5], "%d", &nbend ) == 1 );
  parsed += ( sscanf( argv[6], "%d", &nbb ) == 1 );

  /* Every value must be a positive integer: a non-positive nbinc would keep
     the block-size loop from terminating, and a zero block size would divide
     by zero further down. The parse check comes first so that unparsed
     variables are never read. */
  if( parsed != 6 || n <= 0 || filter <= 0 || nbinit <= 0 ||
      nbinc <= 0 || nbend <= 0 || nbb <= 0 ) {
    if( HMPI_Is_host() ) {
      printf("%s: all arguments must be positive integers\n",prog_name);
      printf("usage: hmpirun %s -- n filter nbinit nbinc nbend nbofbench \n",prog_name);
    }
    HMPI_Finalize(0);
    return -1;
  }

  if( nbinit>nbend ) {
    int swap = nbinit;
    nbinit = nbend;
    nbend = swap;
  }

  // Estimation of the speed of processors
  if( HMPI_Is_member(HMPI_COMM_WORLD_GROUP ) ) {
    input_p[0] = filter;
    input_p[1] = nbb;
    HMPI_Recon( &Benchmark_function, input_p, 2, &output_p );
  }

  int processes = HMPI_Group_size(HMPI_COMM_WORLD_GROUP);
  if( processes < MIN_PROCS ) {
    /* The search loop would not run at all and nb/p would stay undefined. */
    if( HMPI_Is_host() ) printf("%s: at least %d processes are required\n",prog_name,(int)MIN_PROCS);
    HMPI_Finalize(0);
    return -1;
  }
  if( (long long)nbinit*processes>n || (long long)nbend*processes>n ) {
    if( HMPI_Is_host() ) printf("%s: not enough work for all processes\n",prog_name);
    HMPI_Finalize(0);
    return -1;
  }
#ifdef SYSTEM_INFO
  int processors = HMPI_Get_number_of_processors();
  double *speeds = (double *) malloc( processors*sizeof(double));
  if( speeds == NULL ) {
     printf("Problems allocating memory for array speeds\n");
     HMPI_Finalize(0);
     exit(-1);
  }
  HMPI_Get_processors_info(speeds);
  if( HMPI_Is_host() ) {
    printf("Total number of processors = %d\n", processors );
    int k;
    for( k = 0; k < processors; k++ ) {
      printf("%12.2lf\n", speeds[k]/1000000.0);
      printf("%lf\n", speeds[k]);
    }
  }
  free( speeds );
  speeds = (double *) malloc( processes*sizeof(double));
  if( speeds == NULL ) {
     printf("Problems allocating memory for array speeds\n");
     HMPI_Finalize(0);
     exit(-1);
  }
  HMPI_Get_processes_info(speeds);
  if( HMPI_Is_host() ) {
    printf("Total number of processes = %d\n", processes );
    int k;
    for( k = 0; k < processes; k++ ) {
      printf("%12.2lf\n", speeds[k]/1000000.0);
      printf("%lf\n", speeds[k]);
    }
  }
  free( speeds );
#endif

  /* Fallback configuration, used only if no prediction ever improves on the
     initial "infinite" time. On the other processes p is later overwritten by
     HMPI_Group_size() and nb by the broadcast. */
  p  = processes;
  nb = nbend;

  /* Estimation of the execution time */
  if (HMPI_Is_host()) {
     /* Largest finite double (DBL_MAX), used as "no prediction yet" */
     double best_time = 1.79769313486231570e+308;
     double best_total = best_time;
     /* Credit counter: +1 when a process count improves the summed
        prediction, -1 otherwise; the search stops when it reaches 0. */
     int keep_going = 1;
     int procs, nbiter;

     modelparams[0] = n;
     modelparams[1] = filter;
     modelparams[3] = nbb;

     for( procs = processes; procs>=MIN_PROCS; procs-- ) {
         double total = 0.0;
         modelparams[4] = procs;
         printf("Predicted Time for p = %d: \n",procs);
         for( nbiter = nbend; nbiter >= nbinit; nbiter = nbiter - nbinc ) {
           double predicted;
           modelparams[2] = nbiter;
           predicted = HMPI_Timeof( &MPC_NetType_LSWindowing_grid2,
                                    modelparams,
                                    param_count);
           total += predicted;
           printf("--> nbiter = %d: %lf\n",nbiter,predicted);
           if( best_time>predicted ) { best_time=predicted; p=procs; nb=nbiter; }
        }
        if( best_total>total ) { best_total = total; keep_going++; }
        else keep_going--;
        if( !keep_going ) break;
     }
     modelparams[2] = nb;
     modelparams[4] = p;
  }

  /* Creating the model param and HMPI group */
  if (HMPI_Is_host()) {
     printf(" nb = %d\n",nb);
     printf(" p = %d\n",p);
     HMPI_Group_create( &gid,
                        &MPC_NetType_LSWindowing_grid2,
                        modelparams,
                        param_count);
  }

  if (HMPI_Is_free()) {
     HMPI_Group_create( &gid, &MPC_NetType_LSWindowing_grid2, NULL, 0);
  }

  /* NOTE(review): processes still free after the group creation finalize
     here; presumably HMPI_Finalize() does not return for them, otherwise
     they would fall through into the group code below - confirm. */
  if (HMPI_Is_free()) {
     HMPI_Finalize(0);
  }

  /* Each process knows the total number of processes in the group */
  p = HMPI_Group_size(&gid);
#ifdef SYSTEM_INFO
  processes = HMPI_Group_size(&gid);
  speeds = (double *) malloc( processes*sizeof(double));
  if( speeds == NULL ) {
     printf("Problems allocating memory for array speeds\n");
     HMPI_Finalize(0);
     exit(-1);
  }
  HMPI_Group_performances(&gid,speeds);
  if( HMPI_Is_host() ) {
    printf("Number of processes in the group = %d\n", processes );
    int k;
    for( k = 0; k < processes; k++ ) {
      printf("%12.2lf\n", speeds[k]/1000000.0);
    }
  }
  free( speeds );
#endif

  char pname[MPI_MAX_PROCESSOR_NAME];
  int plen;

  if( HMPI_Group_coordof( &gid, &dim, &coord )!=HMPI_SUCCESS ) {
    printf("Error in HMPI_Group_coordof\n");
    coord = NULL;   /* never dereference an unset coordinate array */
  }
  /* NOTE(review): coord is presumably allocated by HMPI_Group_coordof and
     should be released by the caller - confirm before adding a free(). */

  MPI_Get_processor_name( pname, &plen );

  /****************************************/
  /* Configuring the BLACS environment */
  /****************************************/
  mxmcomm = *(MPI_Comm*)HMPI_Get_comm(&gid);
  /* Sending nb: rank 0 of the group communicator broadcasts its block size */
  MPI_Bcast( &nb, 1, MPI_INT, 0, mxmcomm);

  /* Translate the group communicator to a BLACS handle */
  ictxt = Csys2blacs_handle(mxmcomm);

  /*
   * Form a p x 1 BLACS grid on that handle
   */
  int myrow, mycol, nprow, npcol;
  Cblacs_gridinit( &ictxt, "r", p, 1 );
  blacs_gridinfo__( &ictxt, &nprow, &npcol, &myrow, &mycol);

  double *G = NULL, *L = NULL, *V = NULL;

  int desc[9];

  int rsrc = 0, csrc = 0;

  /* Number of local rows of this process */
  int np = numroc_( &n, &nb, &myrow, &rsrc, &p );

  if( !myrow && !mycol ) {
    printf(" p \t=\t %d\n n \t=\t %d\n filter =\t %d\n nb \t=\t %d\n n blks =\t %d\n blk pp =\t %d + %d\n",p,n,filter,nb,n/nb,(n/nb)/p,(n/nb+(n%nb?1:0))%p);
  }

  // Initialize the array descriptor for G
  int lld = max(1,np);
  int info;
  descinit_( desc, &n, &filter, &nb, &filter, &rsrc, &csrc, &ictxt, &lld, &info );
  if( info != 0 ) {
     printf("Error in descinit_: info = %d\n",info);
     HMPI_Finalize(0);
     exit(-1);
  }

  // Allocating workspace for the arrays; element counts are computed in
  // size_t so that large np*n products do not overflow int
  const size_t g_len = (size_t)np * (size_t)filter;
  const size_t l_len = (size_t)np * (size_t)n;
  const size_t v_len = (size_t)nb * (size_t)filter;

  if( ( G = calloc(g_len,sizeof *G) )==NULL ) {
     printf("Problems allocating memory for array G\n");
     HMPI_Finalize(0);
     exit(-1);
  }
  if( ( L = calloc(l_len,sizeof *L) )==NULL ) {
     printf("Problems allocating memory for array L\n");
     HMPI_Finalize(0);
     exit(-1);
  }
  if( ( V = calloc(v_len,sizeof *V) )==NULL ) {
     printf("Problems allocating memory for array V\n");
     HMPI_Finalize(0);
     exit(-1);
  }

  // Initialization of G with pseudo-random values in [0,1]
  srand( SEED );
  for( size_t idx = 0; idx < g_len; idx++ ) {
    G[idx] = ( (double) rand() )/(double)RAND_MAX;
  }
#ifndef NOCOMPILE

  /* Time pdtrf_ between two barriers so all processes measure the same span */
  char topo = 'C';
  blacs_barrier__( &ictxt, &topo );
  elapsed = MPI_Wtime();
  pdtrf_( G, desc, L, V );
  blacs_barrier__( &ictxt, &topo );
  elapsed = MPI_Wtime()-elapsed;

#endif
  free( G );
  free( L );
  free( V );

  if(myrow==0 && mycol==0 ) {
    printf("--> Time of program = %12.3lf\n",elapsed);
  }
  /* -1 is printed as the coordinate when HMPI_Group_coordof failed */
  printf("Time of %19s (%2d) = %8.3lf (bench = %8.3lf)\n",
     pname, coord != NULL ? coord[0] : -1, elapsed, elapsed_bench );

  blacs_gridexit__( &ictxt );

  /****************************************/
  /* Finalization of the HMPI environment */
  /****************************************/
  if (HMPI_Is_member(&gid)) {
     HMPI_Group_free(&gid);
  }

  HMPI_Finalize(0);

  /* Report success with a zero exit status */
  return EXIT_SUCCESS;
}


/*
 * Benchmark kernel: allocates an nbb x filter array G and an nbb x nbb array
 * L, fills G with pseudo-random values in [0,1] and calls dtrfx_ on them.
 *
 * reloj_ is called with elapsed_bench before the fill and again after
 * dtrfx_; presumably this leaves the elapsed benchmark time in the global
 * elapsed_bench - confirm against reloj.h.
 *
 *   filter  number of columns of G
 *   nbb     block size (rows of G, order of L)
 *
 * Returns HMPI_OK. On an allocation failure it prints a message, calls
 * HMPI_Finalize(0) and exits with status -1.
 */
int Perf_func ( int filter, int nbb ) {
  double ucpu, scpu;

  /* Element counts are computed in size_t so that large nbb*filter or
     nbb*nbb products do not overflow int before reaching calloc. */
  const size_t g_len = (size_t)nbb * (size_t)filter;
  const size_t l_len = (size_t)nbb * (size_t)nbb;

  // Allocating workspace for the arrays
  double *G = calloc( g_len, sizeof *G );
  if( G == NULL ) {
     printf("Problems allocating memory for array G\n");
     HMPI_Finalize(0);
     exit(-1);
  }

  double *L = calloc( l_len, sizeof *L );
  if( L == NULL ) {
     printf("Problems allocating memory for array L\n");
     HMPI_Finalize(0);
     exit(-1);
  }

  reloj_(&elapsed_bench,&ucpu,&scpu);
  srand( SEED );
  // Initialization of arrays
  for( size_t idx = 0; idx < g_len; idx++ ) {
    G[idx] = ( (double) rand() )/(double)RAND_MAX;
  }
  dtrfx_( &nbb, &filter, G, &nbb, L, &nbb );
  reloj_(&elapsed_bench,&ucpu,&scpu);

  free(G);
  free(L);

  return HMPI_OK;

}

/*
 * Adapter that lets Perf_func be used as the benchmark callback given to
 * HMPI_Recon.
 *
 *   input_p   array of at least two ints: [0] is forwarded as Perf_func's
 *             first argument (filter), [1] as its second (nbb)
 *   num_of_p  number of entries in input_p (not inspected)
 *   output_p  points to an int that receives Perf_func's return value
 */
void Benchmark_function ( const void* input_p, int num_of_p, void* output_p ) {
  const int *args = (const int *) input_p;
  int *status = (int *) output_p;
  int filter_arg = args[0];
  int nbb_arg = args[1];

  (void) num_of_p;

  *status = Perf_func( filter_arg, nbb_arg );
}

