
  #include "mxm_i.h"

  /*-----------------------------------------------------*/

  /*
   * Execute_algorithm
   *
   * Drives one complete run:
   *   1. allocates the global distribution tables (row_allocations,
   *      column_allocations, Generalised_block, w, h, trow);
   *   2. times BARRIER_ITERATIONS calls to MPI_Barrier on *acomm and
   *      stores the elapsed wall-clock time in barrier_time;
   *   3. calls Distribute_load() and
   *      Determine_distribution_parameters();
   *   4. sets n = N/r and runs Perform_mxm();
   *   5. synchronises on a final barrier.
   *
   * Returns MPI_SUCCESS on success, -1 when an allocation fails, or
   * the return code of the first failing call otherwise.
   *
   * Every table allocated here is released, and the corresponding
   * global reset to NULL, on every exit path (success and failure).
   */
  int Execute_algorithm(MPI_Comm* acomm)
  {
     int i;
     int rc = -1;   /* reported if any allocation below fails */
     double i_barrier_time, f_barrier_time;

     /*
      * Sizes are computed in size_t so that products such as
      * p*q*p*q are not evaluated in (overflow-prone) int arithmetic.
      */
     const size_t grid_cells = (size_t)p * (size_t)q;
     const size_t block_cells = (size_t)generalised_block_size_row
                                * (size_t)generalised_block_size_col;

     row_allocations    = (int*)malloc(sizeof(int) * grid_cells);
     column_allocations = (int*)malloc(sizeof(int) * (size_t)q);
     Generalised_block  = (int(*)[2])malloc(sizeof(int[2]) * block_cells);
     w                  = (int*)malloc(sizeof(int) * (size_t)q);
     h                  = (int*)malloc(sizeof(int) * grid_cells * grid_cells);
     trow               = (int*)malloc(sizeof(int) * grid_cells);

     if ((row_allocations == NULL)
         || (column_allocations == NULL)
         || (Generalised_block == NULL)
         || (w == NULL)
         || (h == NULL)
         || (trow == NULL))
     {
        goto cleanup;
     }

     /*
      * Calculate the barrier time for possible use.
      */
     i_barrier_time = MPI_Wtime();

     for (i = 0; i < BARRIER_ITERATIONS; i++)
     {
        rc = MPI_Barrier(*acomm);

        if (rc != MPI_SUCCESS)
        {
           goto cleanup;
        }
     }

     f_barrier_time = MPI_Wtime();

     barrier_time = (f_barrier_time - i_barrier_time);

     /*
      * Distribute the load taking into account the
      * relative performances.
      */
     rc = Distribute_load(
            p,
            q,
            generalised_block_size_row,
            generalised_block_size_col,
            row_allocations,
            column_allocations
     );

     if (rc != MPI_SUCCESS)
     {
        printf(
          "MXM:Problems distributing the load "
          "...Exiting...\n"
        );

        goto cleanup;
     }

     /*
      * Perform the mxm for the square N x N matrices.
      *
      * n is the matrix size expressed in r x r blocks.
      */
     n = (N/r);

     rc = Determine_distribution_parameters(
            p,
            q,
            row_allocations,
            column_allocations,
            w,
            h,
            trow
     );

     if (rc != MPI_SUCCESS)
     {
        goto cleanup;
     }

     rc = Perform_mxm(acomm);

     if (rc != MPI_SUCCESS)
     {
        goto cleanup;
     }

     /* rc of the final barrier is the function result. */
     rc = MPI_Barrier(*acomm);

  cleanup:
     /* free(NULL) is a no-op, so partially allocated state is fine. */
     free(row_allocations);
     row_allocations = NULL;
     free(column_allocations);
     column_allocations = NULL;
     free(Generalised_block);
     Generalised_block = NULL;

     free(h);
     h = NULL;
     free(w);
     w = NULL;
     free(trow);
     trow = NULL;

     return rc;
  }

  /*-----------------------------------------------------*/

  /*
   * Perform_mxm
   *
   * Allocates the three global N x N matrices A, B and C, hands them
   * to mxm() and releases them again.
   *
   * Returns MPI_SUCCESS on success, -1 when an allocation fails, or
   * the return code of mxm() when the multiplication fails.
   *
   * The matrices are freed (and A, B, C reset to NULL) on every exit
   * path, so a failure no longer leaks the arrays already allocated.
   */
  int
  Perform_mxm(MPI_Comm* acomm)
  {
     int rc = -1;   /* reported if any allocation below fails */

     /* size_t arithmetic: N*N evaluated in int can overflow. */
     const size_t elements = (size_t)N * (size_t)N;

     A = NULL;
     B = NULL;
     C = NULL;

     A = (double*)malloc(sizeof(double) * elements);

     if (A == NULL)
     {
        printf("PANIC: heap problems, Allocation of A\n");
        goto cleanup;
     }

     B = (double*)malloc(sizeof(double) * elements);

     if (B == NULL)
     {
        printf("PANIC: heap problems, Allocation of B\n");
        goto cleanup;
     }

     C = (double*)malloc(sizeof(double) * elements);

     if (C == NULL)
     {
        printf("PANIC: heap problems, Allocation of C\n");
        goto cleanup;
     }

     rc = mxm(
            acomm,
            A,
            B,
            C
     );

     if (rc != MPI_SUCCESS)
     {
        printf(
          "MXM:Problems multiplying the matrices A and B "
          "...Exiting...\n"
        );
     }

  cleanup:
     /* free(NULL) is a no-op, so partially allocated state is fine. */
     free(A);
     A = NULL;
     free(B);
     B = NULL;
     free(C);
     C = NULL;

     return rc;
  }

  /*-----------------------------------------------------*/

  /*
   * mxm
   *
   * Initialises this process's share of the matrices and then runs
   * the grid algorithm.
   *
   *   acomm   communicator passed through to Grid_computations()
   *   a, b    input matrices; the elements owned by this process are
   *           set to MXM_CONSTANT_NUMBER
   *   c       result matrix; the elements owned by this process are
   *           set to 0.0
   *
   * Side effects: sets the globals istart, jstart, myialloc and
   * myjalloc from row_allocations / column_allocations and this
   * process's grid coordinates (derived from algo_comm_rank).
   *
   * Returns MPI_SUCCESS on success, -1 when the coordinate buffer
   * cannot be allocated, or the return code of Grid_computations().
   */
  int
  mxm
  (
     MPI_Comm* acomm,
     double *a,
     double *b,
     double *c
  )
  {
     int rc;
     int x, y;
     int icoord, jcoord;
     int i, j, k, l;
     const int ppar[] = {p, q};
     int* my_coordinates = (int*)malloc(
                                   sizeof(int)
                                   *
                                   COORDN
     );

     /* rank2coord() writes through this pointer, so it must be valid. */
     if (my_coordinates == NULL)
     {
        printf("PANIC: heap problems, Allocation of my_coordinates\n");
        return -1;
     }

     rank2coord(algo_comm_rank, ppar, my_coordinates);

     /*
      * Initialize the respective array elements A & B at
      * the processors.
      * Each array element is a r*r matrix.
      */
     icoord = my_coordinates[0];
     jcoord = my_coordinates[1];

     /* Sum of the row allocations of the processors above in this column. */
     istart = 0;
     for (i = 0; i < icoord; i++)
     {
         istart += row_allocations[(i*q) + jcoord];
     }

     /* Sum of the column allocations of the processor columns to the left. */
     jstart = 0;
     for (j = 0; j < jcoord; j++)
     {
         jstart += (column_allocations[j]);
     }

     myialloc = row_allocations[(icoord*q) + jcoord];
     myjalloc = (column_allocations[jcoord]);

     /*
      * (x, y) walks the origin of this process's rectangle in every
      * generalised block; (i, j) walks the r x r blocks inside the
      * rectangle; (k, l) walks the elements of one r x r block.
      */
     for (x = (istart*r); x < N; x+=(generalised_block_size_row*r))
     {
         for (y = (jstart*r); y < N; y+=(generalised_block_size_col*r))
         {
             for (i = 0; i < (myialloc*r); i+=r)
             {
                 for (j = 0; j < (myjalloc*r); j+=r)
                 {
                     for (k = 0; k < r; k++)
                     {
                         for (l = 0; l < r; l++)
                         {
                             a[(x*N) + y + (i*N) + j + (k*N) + l]
                             =
                             MXM_CONSTANT_NUMBER
                             ;

                             b[(x*N) + y + (i*N) + j + (k*N) + l]
                             =
                             MXM_CONSTANT_NUMBER
                             ;

                             c[(x*N) + y + (i*N) + j + (k*N) + l]
                             =
                             0.0
                             ;
                         }
                     }
                 }
             }
         }
     }

     /*
      * Check to see if the matrix elements are properly
      * initialised.
      */
     if (MPI_VERBOSE > 1)
     {
        for (x = (istart*r); x < N; x+=(generalised_block_size_row*r))
        {
            for (y = (jstart*r); y < N; y+=(generalised_block_size_col*r))
            {
                for (i = 0; i < (myialloc*r); i+=r)
                {
                    for (j = 0; j < (myjalloc*r); j+=r)
                    {
                        for (k = 0; k < r; k++)
                        {
                            for (l = 0; l < r; l++)
                            {
                                printf(
                                    "a[%d][%d] = %0.2f\n",
                                    x + i + k,
                                    y + j + l,
                                    a[(x*N) + y + (i*N) + j + (k*N) + l]
                                );
                            }
                        }
                    }
                }
            }
        }
     }

     /*
      * Computations on network nid_grid
      */
     rc = Grid_computations(
            acomm,
            my_coordinates,
            a,
            b,
            c
     );

     if (rc != MPI_SUCCESS)
     {
        printf("Error while performing grid computations\n");
     }

     /* Released on both the success and the failure path. */
     free(my_coordinates);

     return rc;
  }

  /*-----------------------------------------------------*/

  /*
   * Grid_computations
   *
   * Runs the n steps of the block matrix multiplication on the
   * p x q processor grid. For every step k:
   *   - the owners of block column (k % generalised_block_size_col)
   *     send their pieces of column k of a[] to other processors;
   *   - the owners of block row (k % generalised_block_size_row)
   *     send their pieces of row k of b[] to the other processors of
   *     their own grid column;
   *   - every process accumulates a(.,k) * b(k,.) into the parts of
   *     c[] it owns (r x r block products).
   *
   *   acomm            communicator used for MPI_Send / MPI_Recv
   *   my_coordinates   [0] = grid row, [1] = grid column of this process
   *   a, b             input matrices, updated in place with received data
   *   c                result matrix, accumulated in place
   *
   * Always returns MPI_SUCCESS; the return codes of MPI_Send and
   * MPI_Recv are not checked.
   *
   * Reads the globals n, r, N, p, q, trow, w, istart, jstart,
   * myialloc, myjalloc and the generalised block sizes.
   *
   * NOTE(review): the pack/unpack code addresses a[] and b[] with a
   * row stride of n*r, the multiply loop with a row stride of N.
   * Execute_algorithm sets n = N/r, so the two agree only when r
   * divides N - confirm that this is guaranteed by the callers.
   *
   * NOTE(review): the temp[] buffers are variable-length arrays on
   * the stack; their size grows with n, r and the common height or
   * width - confirm stack limits for large problem sizes.
   */
  int
  Grid_computations
  (
     MPI_Comm* acomm,
     const int* my_coordinates,
     double *a,
     double *b,
     double *c
  )
  {
     int x, y, z, i, j, k, l, m, t;
     int ppar[] = {p, q};
     int cheight, cwidth;

     Block Ablock, Bblock;
     Processor Root, Me, Receiver;

     Me.I = my_coordinates[0];
     Me.J = my_coordinates[1];

     for (k = 0; k < n; k++)
     {
         /* Position of step k inside one generalised block. */
         int Acolumn = (k%generalised_block_size_col), Arow;
         int Brow = (k%generalised_block_size_row), Bcolumn;

         /*
          * P(i,k) broadcasts a(i,k) to p(i,*) horizontally.
          *
          * Arow is advanced at the bottom of the loop body by
          * H(Root, Root), i.e. the loop visits one owner (Root) per
          * iteration.
          * NOTE(review): H() is defined outside this file view;
          * presumably it yields the number of generalised-block rows
          * two processors have in common - confirm.
          */
         for (Arow = 0; Arow < generalised_block_size_row;)
         {
             GetBlock(Arow, Acolumn, &Ablock);
             GetProcessor(&Ablock, &Root);
             cheight = H((Root.I), (Root.J), (Me.I), (Me.J), p, q);

             if (cheight > 0)
             {
                if (((Me.I) == (Root.I)) && ((Me.J) == (Root.J)))
                {
                   /* This process is the root: send to every eligible receiver. */
                   for (Receiver.I = 0; Receiver.I < p; Receiver.I++)
                   {
                       for (Receiver.J = 0; Receiver.J < q; Receiver.J++)
                       {
                           /*
                            * NOTE(review): this condition reduces to
                            * (Root.J != Receiver.J), so processors in
                            * the root's own grid column are never
                            * sent to. The receive branch below has no
                            * such column test; confirm that H() is 0
                            * for distinct processors of one column,
                            * otherwise their MPI_Recv has no matching
                            * send.
                            */
                           if (((Root.I != Receiver.I) || (Root.J != Receiver.J))
                               && (Root.J != Receiver.J))
                           {
                              cheight = H((Root.I), (Root.J), (Receiver.I), (Receiver.J), p, q);
                              if (cheight > 0)
                              {
                                 /* One cheight x 1 strip of r x r blocks per generalised block row. */
                                 double temp[(n/generalised_block_size_row)*cheight*r*r];
                                 int ProcessorCoords[2] = {Receiver.I, Receiver.J};
                                 int dest = coord2rank(ProcessorCoords, ppar);
                                 int ReceiverTopRow = trow[(Receiver.I)*q+(Receiver.J)];
                                 /* Start at the lower of the two top rows (larger index). */
                                 int TopRowIndex = (ReceiverTopRow > Arow) ? ReceiverTopRow : Arow;
                                 for (x = 0; x < (n/generalised_block_size_row); x++)
                                 {
                                     int RowIndex = TopRowIndex + (x*generalised_block_size_row);
                                     for (y = 0; y < cheight; y++)
                                     {
                                         for (z = 0; z < r; z++)
                                         {
                                             for (t = 0; t < r; t++)
                                             {
                                                 /* Source element: row (RowIndex+y)*r + z, column k*r + t, row stride n*r. */
                                                 temp[x*cheight*r*r + y*r*r + z*r + t] = a[RowIndex*r*n*r + (k*r) + y*r*n*r + z*n*r + t];
                                             }
                                         }
                                     }
                                 }

                                 /* &temp has the same address as temp; return code is ignored. */
                                 MPI_Send(&temp, (n/generalised_block_size_row)*cheight*r*r, MPI_DOUBLE, dest, HMPI_MSG_TAG,*acomm);
                              }
                           }
                       }
                   }
                }
                else
                {
                   /*
                    * This process shares rows with the root: receive
                    * the packed strip and scatter it into a[] using
                    * the same index mapping the sender used to pack.
                    */
                   MPI_Status stat;
                   double temp[(n/generalised_block_size_row)*cheight*r*r];
                   int RootCoords[2] = {Root.I, Root.J};
                   int root = coord2rank(RootCoords, ppar);
                   int ReceiverTopRow = trow[(Me.I)*q+(Me.J)];
                   int TopRowIndex = (ReceiverTopRow > Arow) ? ReceiverTopRow : Arow;
                   MPI_Recv(&temp, (n/generalised_block_size_row)*cheight*r*r, MPI_DOUBLE, root, HMPI_MSG_TAG, *acomm, &stat);
                   for (x = 0; x < (n/generalised_block_size_row); x++)
                   {
                       int RowIndex = TopRowIndex + (x*generalised_block_size_row);
                       for (y = 0; y < cheight; y++)
                       {
                           for (z = 0; z < r; z++)
                           {
                               for (t = 0; t < r; t++)
                               {
                                   a[RowIndex*r*n*r + (k*r) + y*r*n*r + z*n*r + t] = temp[x*cheight*r*r + y*r*r + z*r + t];
                               }
                           }
                       }
                   }
                }
             }
             /* Skip the rows owned by Root to reach the next owner. */
             Arow += H((Root.I), (Root.J), (Root.I), (Root.J), p, q);
          }

          /*
           * P(k,j) broadcasts b(k,j) to p(*,j) vertically.
           *
           * Only processors in the root's grid column take part.
           * Bcolumn is advanced by w[Root.J] per iteration.
           */
          for (Bcolumn = 0; Bcolumn < generalised_block_size_col;)
          {
              GetBlock(Brow, Bcolumn, &Bblock);
              GetProcessor(&Bblock, &Root);
              if (Me.J == Root.J)
              {
                  cwidth = w[(Root.J)];
                  if (Me.I == Root.I)
                  {
                     /* This process is the root: send to every other row of its column. */
                     for (Receiver.I = 0; Receiver.I < p; Receiver.I++)
                     {
                         if (Root.I != Receiver.I)
                         {
                            double temp[(n/generalised_block_size_col)*cwidth*r*r];
                            int ProcessorCoords[2] = { Receiver.I, Root.J};
                            int dest = coord2rank(ProcessorCoords, ppar);
                            for (x = 0; x < (n/generalised_block_size_col); x++)
                            {
                                for (y = 0; y < cwidth; y++)
                                {
                                    for (z = 0; z < r; z++)
                                    {
                                        for (t = 0; t < r; t++)
                                        {
                                            /* Source element: row k*r + z, column (x*generalised_block_size_col + Bcolumn + y)*r + t. */
                                            temp[x*cwidth*r*r + y*r + z*cwidth*r + t] = b[k*r*n*r + x*generalised_block_size_col*r + (Bcolumn*r) + y*r + z*n*r + t];
                                        }
                                    }
                                }
                            }
                            /* Return code is ignored. */
                            MPI_Send(&temp, (n/generalised_block_size_col)*cwidth*r*r, MPI_DOUBLE, dest, HMPI_MSG_TAG,*acomm);
                         }
                     }
                  }
                  else
                  {
                     /* Same grid column as the root: receive and unpack into b[]. */
                     MPI_Status stat;
                     double temp[(n/generalised_block_size_col)*cwidth*r*r];
                     int RootCoords[2] = {Root.I, Root.J};
                     int root = coord2rank(RootCoords, ppar);
                     MPI_Recv(&temp, (n/generalised_block_size_col)*cwidth*r*r, MPI_DOUBLE, root, HMPI_MSG_TAG, *acomm, &stat);
                     for (x = 0; x < (n/generalised_block_size_col); x++)
                     {
                         for (y = 0; y < cwidth; y++)
                         {
                             for (z = 0; z < r; z++)
                             {
                                 for (t = 0; t < r; t++)
                                 {
                                     b[k*r*n*r + x*generalised_block_size_col*r + (Bcolumn*r) + y*r + z*n*r + t] = temp[x*cwidth*r*r + y*r + z*cwidth*r + t];
                                 }
                             }
                         }
                     }
                  }
              }
              Bcolumn += w[(Root.J)];
          }

          /*
           * Local update for step k over every r x r block of c[]
           * owned by this process (same traversal as the
           * initialisation in mxm()).
           */
          for (x = (istart*r); x < N; x+=(generalised_block_size_row*r))
          {
              for (y = (jstart*r); y < N; y+=(generalised_block_size_col*r))
              {
                  for (i = 0; i < (myialloc*r); i+=r)
                  {
                      for (j = 0; j < (myjalloc*r); j+=r)
                      {
                          /*
                           * Multiplication of a[i][k] * b[k][j]
                           * is equivalent to multiplying 2 r*r
                           * matrices.
                           */
                          for (l = 0; l < r; l++)
                          {
                              for (m = 0; m < r; m++)
                              {
                                  for (t = 0; t < r; t++)
                                  {
                                      c[x*N + y + i*N + j + l*N + m]
                                      +=
                                      a[x*N + 0 + i*N + (k*r) + l*N + t]
                                      *
                                      b[0   + y + (k*r*N) + j + t*N + m]
                                      ;
                                  }
                              }
                          }
                      }
                  }
              }
          }
      }

      /*
       * The result of the computations.
       * Printed only when MPI_VERBOSE > 1.
       */
      if (MPI_VERBOSE > 1)
      {
         for (x = (istart*r); x < N; x+=(generalised_block_size_row*r))
         {
             for (y = (jstart*r); y < N; y+=(generalised_block_size_col*r))
             {
                 for (i = 0; i < (myialloc*r); i+=r)
                 {
                     for (j = 0; j < (myjalloc*r); j+=r)
                     {
                          for (l = 0; l < r; l++)
                          {
                              for (m = 0; m < r; m++)
                              {
                                  printf(
                                      "c[%d][%d] = %0.2f\n",
                                      x + i + l,
                                      y + j + m,
                                      c[x*N + y + i*N + j + l*N + m]
                                  );
                              }
                          }
                     }
                 }
             }
         }
      }

      return MPI_SUCCESS;
   }

  /*-----------------------------------------------------*/

/*
 * Common_height
 *
 * Compares two row ranges, (top_row_1, bottom_row_1) and
 * (top_row_2, bottom_row_2), and returns a height chosen by the
 * first matching case, tested in this order:
 *
 *   1. range 1 lies inside range 2   -> bottom_row_1 - top_row_1
 *   2. range 2 lies inside range 1   -> bottom_row_2 - top_row_2
 *   3. range 1 starts first and ends
 *      inside range 2                -> bottom_row_1 - top_row_2
 *   4. range 1 starts inside range 2
 *      and ends after it             -> bottom_row_2 - top_row_1
 *   otherwise                        -> 0
 */
int Common_height
(
    int top_row_1,
    int bottom_row_1,
    int top_row_2,
    int bottom_row_2
)
{
    /* How range 1's edges compare with the matching edges of range 2. */
    const int starts_not_before = (top_row_1 >= top_row_2);
    const int starts_not_after  = (top_row_1 <= top_row_2);
    const int ends_not_after    = (bottom_row_1 <= bottom_row_2);
    const int ends_not_before   = (bottom_row_1 >= bottom_row_2);

    /* Default: none of the four cases applies. */
    int height = 0;

    if (starts_not_before && ends_not_after)
    {
        /* Range 1 is contained in range 2. */
        height = bottom_row_1 - top_row_1;
    }
    else if (starts_not_after && ends_not_before)
    {
        /* Range 2 is contained in range 1. */
        height = bottom_row_2 - top_row_2;
    }
    else if (starts_not_after
             && (bottom_row_1 >= top_row_2)
             && ends_not_after)
    {
        /* Range 1 comes first and runs into range 2. */
        height = bottom_row_1 - top_row_2;
    }
    else if (starts_not_before
             && (top_row_1 <= bottom_row_2)
             && ends_not_before)
    {
        /* Range 2 comes first and runs into range 1. */
        height = bottom_row_2 - top_row_1;
    }

    return height;
}

  /*-----------------------------------------------------*/

  /*
   * GetBlock
   *
   * Stores the pair (x, y) in the i and j members of *b.
   */
  void GetBlock(int x, int y, Block *b)
  {
     b->i = x;
     b->j = y;
  }

  /*-----------------------------------------------------*/

  /*
   * GetProcessor
   *
   * Looks up entry (b->i, b->j) of the global Generalised_block
   * table, stored row by row with generalised_block_size_col
   * entries per row, and copies its two values into p->I and p->J.
   */
  void GetProcessor(Block *b, Processor* p)
  {
     /* Each table entry is a pair of ints: [0] goes to I, [1] to J. */
     const int *entry =
        Generalised_block[((b->i)*generalised_block_size_col) + (b->j)];

     p->I = entry[0];
     p->J = entry[1];
  }

  /*-----------------------------------------------------*/

  /*
   * rank2coord
   *
   * Splits the linear rank pnum into two coordinates, using ppar[1]
   * as the divisor:
   *   pcoord[0] = pnum / ppar[1]
   *   pcoord[1] = pnum % ppar[1]
   *
   * ppar[0] is not read.
   */
  void rank2coord
  (
     int pnum,
     const int *ppar,
     int *pcoord
  )
  {
     const int columns = ppar[1];

     pcoord[0] = pnum / columns;
     pcoord[1] = pnum % columns;
  }

  /*-----------------------------------------------------*/

  /*
   * coord2rank
   *
   * Combines two coordinates into a linear rank:
   *   pcoord[0] * ppar[1] + pcoord[1]
   *
   * ppar[0] is not read.
   */
  int coord2rank
  (
    const int *pcoord,
    const int *ppar
  )
  {
    const int row = pcoord[0];
    const int column = pcoord[1];
    const int columns = ppar[1];

    return (row * columns) + column;
  }

  /*-----------------------------------------------------*/
