
   #include "mxm_i.h"

   /*-----------------------------------------------------*/

   /*
    * Fill the three recon benchmark matrices with their starting values.
    *
    * a - written as recon_r rows of recon_n elements, every element 2.0
    * b - written as recon_n rows of recon_t elements, every element 2.0
    * c - written as recon_r rows of recon_t elements, every element 0.0
    *
    * The caller supplies the storage; nothing is allocated here.
    * Always returns HMPI_OK.
    */
   int
   Input_recon
   (
      double *a,
      double *b,
      double *c
   )
   {
      int row;
      int col;

      /* Left operand: recon_r x recon_n. */
      for (row = 0; row < recon_r; ++row)
      {
          const int base = row * recon_n;

          for (col = 0; col < recon_n; ++col)
          {
              a[base + col] = 2.0;
          }
      }

      /* Right operand: recon_n x recon_t. */
      for (row = 0; row < recon_n; ++row)
      {
          const int base = row * recon_t;

          for (col = 0; col < recon_t; ++col)
          {
              b[base + col] = 2.0;
          }
      }

      /* Accumulator: recon_r x recon_t, cleared. */
      for (row = 0; row < recon_r; ++row)
      {
          const int base = row * recon_t;

          for (col = 0; col < recon_t; ++col)
          {
              c[base + col] = 0.0;
          }
      }

      return HMPI_OK;
   }

   /*-----------------------------------------------------*/

   /*
    * Benchmark kernel: accumulates the product of an x-by-y matrix and a
    * y-by-z matrix into an x-by-z matrix (c += a * b).
    *
    * x - rows of a and c            (r)
    * y - columns of a, rows of b    (n)
    * z - columns of b and c         (t)
    *
    * a, b and c are not parameters; they are matrix pointers declared
    * outside this function. All three are indexed row-major, so the row
    * stride of a is y (its column count), and the row stride of b and c
    * is z.
    *
    * Always returns HMPI_OK.
    */
   int Perf_func
   (
       int x, /* --r-- */
       int y, /* --n-- */
       int z  /* --t-- */
   )
   {
      int i, j, k;

      for (i = 0; i < x; i++)
      {
          for (j = 0; j < z; j++)
          {
              for (k = 0; k < y; k++)
              {
                  /*
                   * Row i of a starts at i*y. Using i*x here (the row
                   * count) would read past the end of a whenever x > y.
                   */
                  c[i*z + j] += a[i*y + k] * b[k*z + j];
              }
          }
      }

      return HMPI_OK;
   }

   /*-----------------------------------------------------*/

   /*
    * Adapter with the callback shape handed to HMPI_Recon(): unpacks three
    * ints from input_p, runs Perf_func() on them and stores its return
    * value through output_p.
    *
    * input_p  - points to at least three ints, in the order { r, n, t }
    * num_of_p - number of entries in input_p; not consulted here
    * output_p - points to an int that receives Perf_func()'s return value
    */
   void Benchmark_function
   (
      const void* input_p,
      int num_of_p,
      void* output_p
   )
   {
      /* Keep the const qualifier: the parameters are only read. */
      const int* params = (const int*)input_p;

      (void)num_of_p;

      *(int*)output_p = Perf_func(
                            params[0],
                            params[1],
                            params[2]
      );
   }

   /*-----------------------------------------------------*/

   /*
    * Refresh the processor performance estimates by running the
    * matrix-multiplication benchmark through HMPI_Recon().
    *
    * Allocates the benchmark matrices a (recon_r x recon_n),
    * b (recon_n x recon_t) and c (recon_r x recon_t), initialises them
    * with Input_recon(), and, on processes that are members of
    * HMPI_COMM_WORLD_GROUP, calls HMPI_Recon() with Benchmark_function
    * and the three dimensions. The matrices are released on every path.
    *
    * Returns HMPI_OK on success, -1 if an allocation fails, or the error
    * code returned by HMPI_Recon().
    */
   int Do_recon()
   {
       int status = HMPI_OK;

       /* Multiply the dimensions as size_t so the element count cannot wrap in int. */
       a = (double*)malloc(
                    sizeof(double)
                    *
                    ((size_t)recon_r * (size_t)recon_n)
       );

       b = (double*)malloc(
                    sizeof(double)
                    *
                    ((size_t)recon_n * (size_t)recon_t)
       );

       c = (double*)malloc(
                    sizeof(double)
                    *
                    ((size_t)recon_r * (size_t)recon_t)
       );

       if ((a == NULL) || (b == NULL) || (c == NULL))
       {
           printf("Panic: heap problems, allocation of recon matrices\n");
           status = -1;
           goto cleanup;
       }

       Input_recon(
         a,
         b,
         c
       );

       if (HMPI_Is_member(HMPI_COMM_WORLD_GROUP))
       {
            int output_p;
            int input_p[3] =
                {
                   recon_r,
                   recon_n,
                   recon_t
            };

            status = HMPI_Recon(
                         &Benchmark_function,
                         input_p,
                         3,
                         &output_p
            );

            if (status != HMPI_OK)
            {
                printf("Panic: HMPI_Recon failed\n");
                goto cleanup;
            }
       }

       if ((VERBOSE > 0)
           && (HMPI_Is_host()
          )
       )
       {
          printf("Processor performances refreshed\n");
       }

   cleanup:
       /* free(NULL) is a no-op, so partially allocated state is handled too. */
       free(a);
       free(b);
       free(c);

       /* Do not leave the shared pointers dangling. */
       a = NULL;
       b = NULL;
       c = NULL;

       return status;
   }

  /*-----------------------------------------------------*/

  /*
   * Top-level driver of the heterogeneous matrix multiplication.
   *
   * Steps, in order:
   *   1. allocate the model parameter array (holding p) and the
   *      block_allocations array;
   *   2. on the host and on free processes, create the HMPI group gid from
   *      HMPI_NetType_simple;
   *   3. free processes release their buffers and call HMPI_Finalize(0);
   *   4. time one HMPI_Barrier() on the group and store the elapsed time in
   *      barrier_time;
   *   5. query the group's relative performances, optionally print them on
   *      the host, and pass them to Distribute_load();
   *   6. run Perform_mxm(), synchronise, release memory and free the group.
   *
   * Returns HMPI_OK on success, -1 if an allocation fails, or the error
   * code of the first failing HMPI / helper call. The buffers allocated
   * here are released on every return path.
   */
  int Execute_algorithm()
  {
     int rc = HMPI_OK;
     int i, me;

     HMPI_Group gid;
     int *model_params = NULL;
     int model_count = 1;
     double *dperf = NULL;

     /* The rank is not used below; the call is kept as in the original flow. */
     me = HMPI_Group_rank(HMPI_COMM_WORLD_GROUP);
     (void)me;

     model_params = (int*)malloc(
                          sizeof(int)
                          *
                          model_count
     );

     if (model_params == NULL)
     {
        return -1;
     }

     model_params[0] = p;

     /*
      * NOTE(review): the buffer is sized with sizeof(double) although the
      * result is cast to int*. The size is left unchanged because the
      * declared element type of block_allocations and the amount written
      * by Distribute_load() are not visible here - confirm and tighten.
      */
     block_allocations = (int*)malloc(
                               sizeof(double)
                               *
                               p
     );

     if (block_allocations == NULL)
     {
        rc = -1;
        goto cleanup;
     }

     if (HMPI_Is_host()
         || HMPI_Is_free()
     )
     {
        rc = HMPI_Group_create(
                 &gid,
                 &HMPI_NetType_simple,
                 model_params,
                 model_count
        );

        if (rc != HMPI_OK)
        {
           goto cleanup;
        }
     }

     if (HMPI_Is_free())
     {
        free(model_params);
        free(block_allocations);

        /* Cleared so the shared cleanup path cannot free them twice. */
        model_params = NULL;
        block_allocations = NULL;

        /* Presumably does not return for a free process - TODO confirm. */
        HMPI_Finalize(0);
     }

     /*
      * Calculate the barrier time for possible use.
      */
     {
        double i_barrier_time, f_barrier_time;

        i_barrier_time = MPI_Wtime();

        rc = HMPI_Barrier(&gid);

        if (rc != HMPI_OK)
        {
           goto cleanup;
        }

        f_barrier_time = MPI_Wtime();

        barrier_time = (
                        f_barrier_time
                        -
                        i_barrier_time
        );
     }

     dperf = (double*)malloc(
                     sizeof(double)
                     *
                     p
     );

     if (dperf == NULL)
     {
        rc = -1;
        goto cleanup;
     }

     rc = HMPI_Group_performances(
              &gid,
              dperf
     );

     if (rc != HMPI_OK)
     {
        goto cleanup;
     }

     if ((VERBOSE > 0)
         && (HMPI_Is_host()
        )
     )
     {
        printf("Performances are: ");

        for (i = 0; i < p; i++)
        {
            printf("%0.1f ", dperf[i]);
        }

        printf("\n");
     }

     /*
      * Distribute the load taking into account the
      * relative performances.
      */
     rc = Distribute_load(
            p,
            dperf,
            block_allocations
     );

     if (rc != HMPI_OK)
     {
        printf(
          "MXM:Problems HEHE distributing the load "
          "...Exiting...\n"
        );

        goto cleanup;
     }

     rc = Perform_mxm(
                 &gid
     );

     if (rc != HMPI_OK)
     {
        goto cleanup;
     }

     rc = HMPI_Barrier(&gid);

     if (rc != HMPI_OK)
     {
        goto cleanup;
     }

     /*
      * Generalised_block is not allocated in this function; it is released
      * only on the success path, exactly where the original released it.
      */
     free(Generalised_block);

     /*
      * Destroy the group
      */
     if (HMPI_Is_member(&gid))
     {
        rc = HMPI_Group_free(&gid);
     }

  cleanup:
     /* free(NULL) is a no-op, so buffers not yet allocated are fine here. */
     free(dperf);
     free(model_params);
     free(block_allocations);
     block_allocations = NULL;

     return rc;
  }

  /*-----------------------------------------------------*/

  /*
   * Allocate the three N x N matrices A, B and C, run mxm() on them for
   * the given group, and release them again.
   *
   * gid - group handle forwarded unchanged to mxm()
   *
   * Returns HMPI_OK on success, -1 if an allocation fails, or the error
   * code returned by mxm(). Whatever was allocated is released on every
   * return path.
   */
  int
  Perform_mxm
  (
     const HMPI_Group* gid
  )
  {
     int rc = -1;

     /* Multiply as size_t so the element count cannot wrap in int. */
     const size_t elements = (size_t)N * (size_t)N;

     /* Start from a known state so the cleanup path only frees what this call allocated. */
     A = NULL;
     B = NULL;
     C = NULL;

     A = (double*)malloc(
                  sizeof(double)
                  *
                  elements
     );

     if (A == NULL)
     {
        printf("PANIC: heap problems, Allocation of A\n");
        goto cleanup;
     }

     B = (double*)malloc(
                  sizeof(double)
                  *
                  elements
     );

     if (B == NULL)
     {
        printf("PANIC: heap problems, Allocation of B\n");
        goto cleanup;
     }

     C = (double*)malloc(
                  sizeof(double)
                  *
                  elements
     );

     if (C == NULL)
     {
        printf("PANIC: heap problems, Allocation of C\n");
        goto cleanup;
     }

     rc = mxm(
            gid,
            A,
            B,
            C
     );

     if (rc != HMPI_OK)
     {
        printf(
          "MXM:Problems multiplying the matrices A and B "
          "...Exiting...\n"
        );
     }

  cleanup:
     free(A);
     free(B);
     free(C);

     /* Do not leave the shared pointers dangling. */
     A = NULL;
     B = NULL;
     C = NULL;

     return rc;
  }

  /*-----------------------------------------------------*/

  /*
   * Initialise this process's share of the matrices and run the grid
   * computation on it.
   *
   * gid     - group the calling process belongs to
   * a, b, c - N x N row-major matrices; the elements owned by this process
   *           are set to MXM_CONSTANT_NUMBER (a, b) and 0.0 (c)
   *
   * The process coordinate obtained from HMPI_Group_coordof() selects the
   * entry of block_allocations that belongs to this process. The sum of
   * the preceding entries gives the start position (istart, jstart)
   * inside the generalised block, and the entry itself gives myalloc.
   * istart, jstart and myalloc are stored in variables declared outside
   * this function and are read again by Grid_computations().
   *
   * Returns HMPI_OK on success, -1 if the coordinate holder cannot be
   * allocated or the group communicator is NULL, otherwise the error code
   * of the failing call. The coordinate storage is released on every path.
   */
  int
  mxm
  (
     const HMPI_Group* gid,
     double *a,
     double *b,
     double *c
  )
  {
     int result;
     int x, y;
     int coord;
     int i, k, l;
     int num_of_coordinates;
     int temp, ind;
     int span;
     MPI_Comm* grid_comm;
     int** my_coordinates;

     my_coordinates = (int**)malloc(
                             sizeof(int*)
     );

     if (my_coordinates == NULL)
     {
        return -1;
     }

     grid_comm = (MPI_Comm*)HMPI_Get_comm(
                               gid
     );

     if (grid_comm == NULL)
     {
        HMPI_Printf("Panic: grid communicator is NULL\n");
        free(my_coordinates);
        return -1;
     }

     result =  HMPI_Group_coordof(
                   gid,
                   &num_of_coordinates,
                   my_coordinates
     );

     if (result != HMPI_OK)
     {
        HMPI_Printf("Error while getting the coordinates\n");

        /*
         * Only the holder is released: whether *my_coordinates was set by
         * the failed call is not known here.
         */
        free(my_coordinates);
        return result;
     }

     /*
      * Initialize the respective array elements A & B at
      * the processors.
      * Each array element is a r*r matrix.
      */
     coord = (*my_coordinates)[0];

     /* Number of blocks handed to the processes that come before this one. */
     temp = 0;
     for (i = 0; i < coord; i++)
     {
         temp += block_allocations[i];
     }

     istart = (temp)/(Generalised_block_size);
     jstart = (temp)%(Generalised_block_size);

     myalloc = (int)block_allocations[coord];

     /* Width (and height) of one generalised block, in matrix elements. */
     span = Generalised_block_size*r;

     for (x = (istart*r); x < N; x += span)
     {
         for (y = (jstart*r), ind = 0; y < N; y += span, ind += span)
         {
             for (i = 0; i < (myalloc*r); i+=r)
             {
                 /*
                  * offset walks the owned blocks from the start column;
                  * when it runs past the end of the generalised block it
                  * wraps to the next block row (offset / span) at column
                  * offset % span.
                  */
                 const int offset = y - ind + i;
                 const int base = (x*N) + ind + (offset % span) + ((offset / span)*r*N);

                 for (k = 0; k < r; k++)
                 {
                     for (l = 0; l < r; l++)
                     {
                         const int idx = base + (k*N) + l;

                         a[idx] = MXM_CONSTANT_NUMBER;
                         b[idx] = MXM_CONSTANT_NUMBER;
                         c[idx] = 0.0;
                     }
                 }
             }
         }
     }

     /*
      * Check to see if the matrix elements are properly
      * initialised.
      */
     if (VERBOSE > 0)
     {
        for (x = (istart*r); x < N; x += span)
        {
            for (y = (jstart*r), ind = 0; y < N; y += span, ind += span)
            {
                for (i = 0; i < (myalloc*r); i+=r)
                {
                     const int offset = y - ind + i;
                     const int base = (x*N) + ind + (offset % span) + ((offset / span)*r*N);

                     for (k = 0; k < r; k++)
                     {
                         for (l = 0; l < r; l++)
                         {
                             /*
                              * NOTE(review): the printed row label adds
                              * (offset / span) without the factor r that
                              * the element address uses; the label looks
                              * off for wrapped blocks when r > 1. Left
                              * unchanged - confirm before fixing.
                              */
                             printf(
                                "a[%d][%d] = %0.2f\n",
                                x + (offset / span) + k,
                                ind + (offset % span) + l,
                                a[base + (k*N) + l]
                             );
                        }
                    }
                }
            }
        }
     }

     result = Grid_computations(
                  gid,
                  grid_comm,
                  (*my_coordinates),
                  a,
                  b,
                  c
     );

     if (result != HMPI_OK)
     {
        HMPI_Printf("Error while performing grid computations\n");
     }

     /* Released on success and on failure alike. */
     free(my_coordinates[0]);
     free(my_coordinates);

     return result;
  }

  /*-----------------------------------------------------*/

  /*
   * Perform the block matrix multiplication c += a * b on the processor
   * grid described by Generalised_block.
   *
   * gid            - group handle, used to map a grid coordinate to a rank
   *                  via HMPI_Rank()
   * grid_comm      - communicator of the whole group; split below into
   *                  per-row and per-column communicators
   * my_coordinates - coordinates of the calling process; only element 0
   *                  is read
   * a, b, c        - N x N row-major matrices, handled in r x r blocks
   *
   * Phases:
   *   1. One MPI_Comm_split() per row and per column of the generalised
   *      block. myrows[i] / mycolumns[j] record whether this process
   *      appears in row i / column j.
   *   2. For every block step k: the owner of block (i,k) of a broadcasts
   *      it along row i, the owner of block (k,j) of b broadcasts it along
   *      column j, then every process updates the blocks of c it owns.
   *   3. The communicators this process belongs to are freed.
   *   4. With VERBOSE > 0 the owned elements of c are printed.
   *
   * The loops over owned blocks read istart, jstart and myalloc, which are
   * not set in this function.
   *
   * Returns HMPI_OK on success, otherwise the error code of the first
   * failing MPI / helper call.
   *
   * NOTE(review): the early error returns leave the communicators created
   * in phase 1 unfreed.
   */
  int
  Grid_computations
  (
     const HMPI_Group* gid,
     MPI_Comm* grid_comm,
     const int* my_coordinates,
     double *a,
     double *b,
     double *c
  )
  {
     int x, y, i, j, k, l, m, t;

     /* Variable-length arrays: one slot per row / column of the generalised block. */
     MPI_Comm row_comm[Generalised_block_size];
     MPI_Comm column_comm[Generalised_block_size];
     int myrows[Generalised_block_size];
     int mycolumns[Generalised_block_size];
     int ind;

     /* 
      * Communicators for the rows and columns of the grid.
      * For example:
      * P00, P00, P00, P00, P01, P01
      * P00, P00, P00, P00, P01, P01
      * P00, P00, P00, P00, P01, P01
      * P00, P00, P00, P00, P11, P11
      * P10, P10, P10, P10, P11, P11
      * P10, P10, P10, P10, P11, P11
      *
      * Every processor stores above grid. Each grid element
      * consists of the coordinates of the processor.
      *     
      * There will be six row communicators with the groups
      * {P00, P01} , {P00, P01} , {P00, P01} , {P00, P11} 
      * {P10, P11} , {P10, P11}
      *
      * There will be six column communicators with the groups
      * {P00, P10} , {P00, P10} , {P00, P10} , {P00, P10} 
      * {P01, P11} , {P01, P11}
      */
     for (i = 0; i < Generalised_block_size; i++)
     {
         int result;
         /* Stays MPI_UNDEFINED unless this process appears in row i. */
         int color = MPI_UNDEFINED;
         myrows[i] = 0;

         for (j = 0; j < Generalised_block_size; j++)
         {
             int coord = Generalised_block[(i*Generalised_block_size) + j];

             if (my_coordinates[0] == coord)
             {
                color = 0;
                myrows[i] = 1;
                break;
             }
         }

         /*
          * Called by every process for every row, member or not.
          * Non-members pass MPI_UNDEFINED and presumably receive
          * MPI_COMM_NULL (per MPI_Comm_split semantics - confirm); below,
          * row_comm[i] is only used when myrows[i] is set.
          */
         result = MPI_Comm_split(
                     *grid_comm,
                     color,
                     1,
                     &row_comm[i]
         );

         if (result != MPI_SUCCESS)
         {
            HMPI_Printf(
                "Problems with MPI_Comm_split for row communicator %d\n",
                i
            );
            return result;
         }
     }

     /* 
      * Communicators for the columns 
      */
     for (j = 0; j < Generalised_block_size; j++)
     {
         int result;
         /* Stays MPI_UNDEFINED unless this process appears in column j. */
         int color = MPI_UNDEFINED;
         mycolumns[j] = 0;

         for (i = 0; i < Generalised_block_size; i++)
         {
             int coord = Generalised_block[(i*Generalised_block_size) + j];

             if (coord == my_coordinates[0])
             {
                color = 0;
                mycolumns[j] = 1;
                break;
             }
         }

         result = MPI_Comm_split(
                     *grid_comm,
                     color,
                     1,
                     &column_comm[j]
         );

         if (result != MPI_SUCCESS)
         {
            HMPI_Printf(
                "Problems with MPI_Comm_split for column communicator %d\n",
                j
            );
            return result;
         }
      }

      /* One iteration per r-wide block column of a / block row of b. */
      for (k = 0; k < N; k+=r)
      {
          /*
           * P(i,k) broadcasts a(i,k) to p(i,*) horizontally.
           */
          for (i = 0; i < N; i+=r)
          {
              /* Skip block rows whose communicator this process is not part of. */
              if (!myrows[((i/r)%Generalised_block_size)])            
              {
                 continue;
              }

              {
                 int Pik;
                 int coord;
                 int root;
                 int rank;
                 /* Staging buffer for one r x r block (variable-length array). */
                 double temp[r][r];
                 int result;

                 /* Coordinate of the process that owns block (i,k). */
                 coord = Generalised_block[(((i/r)%Generalised_block_size)*Generalised_block_size) + ((k/r)%Generalised_block_size)];

                 Pik = HMPI_Rank(
                           gid,
                           &coord
                 );

                 /* Express the owner's grid rank as a rank inside the row communicator. */
                 result = Translate_from_rank(
                                   Pik,
                                   grid_comm,
                                   &row_comm[((i/r)%Generalised_block_size)],
                                   &root
                 );

                 if (result != HMPI_OK)
                 {
                    HMPI_Printf(
                        "Problems with translating the rank "
                        " with row communicator %d for step %d\n",
                        ((i/r)%Generalised_block_size),
                        k
                    );
                    return result;
                 }

                 result = MPI_Comm_rank(
                             row_comm[((i/r)%Generalised_block_size)],
                             &rank
                 );

                 if (result != MPI_SUCCESS)
                 {
                    HMPI_Printf(
                        "Problems with getting the rank "
                        " with row communicator for step %d\n",
                        k
                    );
                    return result;
                 }

                 if (rank == root)
                 {
                    /* Owner: pack block (i,k) of a into temp, then send it. */
                    for (l = 0; l < r; l++)
                    {
                        for (m = 0; m < r; m++)
                        {
                            temp[l][m] = a[i*N + k + l*N + m];
                        }
                    }

                    result = MPI_Bcast(
                                &temp,
                                r*r,
                                MPI_DOUBLE,
                                root,
                                row_comm[((i/r)%Generalised_block_size)]
                    );

                    if (result != MPI_SUCCESS)
                    {
                       HMPI_Printf(
                           "Problems with MPI_Bcast for row communicator"
                           " for step %d\n",
                           k
                       );

                       return result;
                    }

                 }
                 else
                 {
                    /* Receiver: take part in the same broadcast, then unpack into a. */
                    result = MPI_Bcast(
                                &temp,
                                r*r,
                                MPI_DOUBLE,
                                root,
                                row_comm[((i/r)%Generalised_block_size)]
                    );

                    if (result != MPI_SUCCESS)
                    {
                       HMPI_Printf(
                           "Problems with MPI_Bcast"
                           " for row communicator for step %d\n",
                           k
                       );

                       return result;
                    }

                    for (l = 0; l < r; l++)
                    {
                        for (m = 0; m < r; m++)
                        {
                            a[i*N + k + l*N + m] = temp[l][m];
                        }
                    }
                 }
              }
          }

          /*
           * P(k,j) broadcasts b(k,j) to p(*,j) vertically.
           */ 
          for (j = 0; j < N; j+=r)
          {
              /* Skip block columns whose communicator this process is not part of. */
              if (!mycolumns[((j/r)%Generalised_block_size)])
              {
                 continue;
              }

              {
                 int result;
                 /* Staging buffer for one r x r block (variable-length array). */
                 double temp[r][r];
                 int coord;
                 int Pkj;
                 int root;
                 int rank;

                 /* Coordinate of the process that owns block (k,j). */
                 coord = Generalised_block[(((k/r)%Generalised_block_size))*Generalised_block_size + ((j/r)%Generalised_block_size)];

                 Pkj =  HMPI_Rank(
                            gid,
                            &coord
                 ); 

                 /* Express the owner's grid rank as a rank inside the column communicator. */
                 result = Translate_from_rank(
                                   Pkj,
                                   grid_comm,
                                   &column_comm[((j/r)%Generalised_block_size)],
                                   &root
                 );

                 if (result != HMPI_OK)
                 {
                    HMPI_Printf(
                        "Problems with translating the rank"
                        " with column communicator for step %d\n",
                        k
                    );
                    return result;
                 }

                 result = MPI_Comm_rank(
                             column_comm[((j/r)%Generalised_block_size)],
                             &rank
                 );

                 if (result != MPI_SUCCESS)
                 {
                    HMPI_Printf(
                        "Problems with getting the rank"
                        " with column communicator for step %d\n",
                        k
                    );
                    return result;
                 }

                 if (rank == root)
                 {
                    /* Owner: pack block (k,j) of b into temp, then send it. */
                    for (l = 0; l < r; l++)
                    {
                        for (m = 0; m < r; m++)
                        {
                            temp[l][m] = b[k*N + j + l*N + m];
                        }
                    }

                    result = MPI_Bcast(
                                &temp,
                                r*r,
                                MPI_DOUBLE,
                                root,
                                column_comm[((j/r)%Generalised_block_size)]
                    );

                    if (result != MPI_SUCCESS)
                    {
                       HMPI_Printf(
                           "Problems with MPI_Bcast"
                           " for column communicator for step %d\n",
                           k
                       );

                       return result;
                    }
                 }
                 else
                 {
                    /* Receiver: take part in the same broadcast, then unpack into b. */
                    result = MPI_Bcast(
                                &temp,
                                r*r,
                                MPI_DOUBLE,
                                root,
                                column_comm[((j/r)%Generalised_block_size)]
                    );

                    if (result != MPI_SUCCESS)
                    {
                       HMPI_Printf(
                           "Problems with MPI_Bcast"
                           " for column communicator for step %d\n",
                           k
                       );

                       return result;
                    }

                    for (l = 0; l < r; l++)
                    {
                        for (m = 0; m < r; m++)
                        {
                            b[k*N + j + l*N + m] = temp[l][m];
                        }
                    }
                 }
              }
          }

          /*
           * Local update: visit every r x r block of c owned by this
           * process, using the same x / y / ind / i traversal that mxm()
           * uses to initialise the owned elements.
           */
          for (x = (istart*r); x < N; x+=(Generalised_block_size*r))
          {
              for (y = (jstart*r), ind = 0; y < N; y+=(Generalised_block_size*r), ind+=(Generalised_block_size*r))
              {
                  for (i = 0; i < (myalloc*r); i+=r)
                  {
                       /*
                        * Multiplication of a[i][k] * b[k][j]
                        * is equivalent to multiplying 2 r*r
                        * matrices.
                        */
                       for (l = 0; l < r; l++)
                       {
                           for (m = 0; m < r; m++)
                           {
                               for (t = 0; t < r; t++)
                               {
				   /*
				    * step is the column offset used to index b.
				    * It does not depend on l, m or t.
				    * NOTE(review): the three cases below pick
				    * different offsets depending on ind and on
				    * whether the block wrapped to the next block
				    * row; the intent is not documented - confirm.
				    */
				   int step;

				   if (ind == 0)
                                   {
                                      step = ((y + i)%(Generalised_block_size*r));
				   }
                                   else
			           {
                                      step = ((y - ind + i)/(Generalised_block_size*r))
                                              ?
                                              (y - ind + i)
                                              :
                                              (y + i)
                                      ;
                                   }

                                   c[x*N + ind + ((y - ind + i)%(Generalised_block_size*r)) + (((y - ind + i)/(Generalised_block_size*r))*r*N) + l*N + m] 
                                   += 
                                   a[x*N + 0 + (((y - ind + i)/(Generalised_block_size*r))*r*N) + k   + l*N + t]
                                   *
                                   b[0   + step + k*N + t*N + m]
                                   ;
                              }
                          }
                      }
                  }
              }
          }
      }

      /*
       * Free the communicators.
       */
      for (i = 0; i < Generalised_block_size; i++)
      {
          /* Only communicators this process is a member of are freed. */
          if (myrows[i])
          {
             int result = MPI_Comm_free(&row_comm[i]);

             if (result != MPI_SUCCESS)
             {
                HMPI_Printf(
                    "Problems with freeing the row communicator %d\n",
                    i
                );

                return result;
             }
          }
      }

      for (j = 0; j < Generalised_block_size; j++)
      {
          if (mycolumns[j])
          {
             int result = MPI_Comm_free(&column_comm[j]);

             if (result != MPI_SUCCESS)
             {
                HMPI_Printf(
                    "Problems with freeing the column communicator %d\n",
                    j
                );

                return result;
             }
          }
      }

      /*
       * The result of the computations.
       */
      if (VERBOSE > 0)
      {
        for (x = (istart*r); x < N; x+=(Generalised_block_size*r))
        {
            for (y = (jstart*r), ind = 0; y < N; y+=(Generalised_block_size*r), ind+=(Generalised_block_size*r))
            {
                for (i = 0; i < (myalloc*r); i+=r)
                {
                     for (k = 0; k < r; k++)
                     {
                         for (l = 0; l < r; l++)
                         {
                             /*
                              * NOTE(review): the printed row label adds the
                              * block-row quotient without the factor r that
                              * the element address uses; the label looks
                              * off for wrapped blocks when r > 1 - confirm.
                              */
                             printf(
                                "c[%d][%d] = %0.2f\n",
                                x + ((y - ind + i)/(Generalised_block_size*r)) + k,
                                ind + ((y - ind + i)%(Generalised_block_size*r)) + l,
                                c[x*N + ind + ((y - ind + i)%(Generalised_block_size*r)) + (((y - ind +i)/(Generalised_block_size*r))*r*N) + k*N + l]
                             );
                         }
                     }
                 }
             }
         }
      }

      return HMPI_OK;
   }

  /*-----------------------------------------------------*/

   /*
    * Translate a rank in the grid communicator into the corresponding rank
    * in a row or column communicator.
    *
    * pij        - rank of the process in *grid_comm
    * grid_comm  - communicator in which pij is expressed
    * local_comm - communicator whose rank numbering is wanted
    * root       - receives the result of MPI_Group_translate_ranks() for
    *              pij; the value is stored as delivered and not inspected
    *
    * Returns HMPI_OK on success, otherwise the code of the first MPI call
    * that failed. Both group handles obtained here are released on every
    * path on which they were obtained.
    */
   int
   Translate_from_rank
   (
      int pij,
      MPI_Comm* grid_comm,
      MPI_Comm* local_comm,
      int* root
   )
   {
      MPI_Group grid_group;
      MPI_Group local_group;
      int rc;
      int free_rc;

      rc = MPI_Comm_group(
              *grid_comm,
              &grid_group
      );

      if (rc != MPI_SUCCESS)
      {
         HMPI_Printf(
             "Problems with getting the group of grid communicator\n"
         );
         return rc;
      }

      rc = MPI_Comm_group(
              *local_comm,
              &local_group
      );

      if (rc != MPI_SUCCESS)
      {
         HMPI_Printf(
             "Problems with getting the group of local communicator\n"
         );

         /* The grid group was obtained already; do not leak it. */
         (void)MPI_Group_free(&grid_group);
         return rc;
      }

      rc =  MPI_Group_translate_ranks(
               grid_group,
               1,
               &pij,
               local_group,
               root
      );

      if (rc != MPI_SUCCESS)
      {
         HMPI_Printf(
             "Problems with translating ranks, problems with pij\n"
         );
      }

      /*
       * Release both handles whether or not the translation worked, and
       * keep the first failure as the reported status.
       */
      free_rc = MPI_Group_free(&grid_group);

      if (rc == MPI_SUCCESS)
      {
         rc = free_rc;
      }

      free_rc = MPI_Group_free(&local_group);

      if (rc == MPI_SUCCESS)
      {
         rc = free_rc;
      }

      if (rc != MPI_SUCCESS)
      {
         return rc;
      }

      return HMPI_OK;
   }

  /*-----------------------------------------------------*/
