
  #include "mxm_i.h"

   /*-----------------------------------------------------*/

   /*
    * Stores `value` in every element of a row-major matrix that has
    * `rows` rows and `cols` columns.
    */
   static void Fill_recon_matrix
   (
      double *matrix,
      int rows,
      int cols,
      double value
   )
   {
      int row;

      for (row = 0; row < rows; row++)
      {
         int base = row*cols;
         int col;

         for (col = 0; col < cols; col++)
         {
            matrix[base + col] = value;
         }
      }
   }

   /*
    * Initialises the three matrices used by the benchmark kernel.
    *
    *   a  recon_r x recon_n matrix, every element set to 2.0
    *   b  recon_n x recon_t matrix, every element set to 2.0
    *   c  recon_r x recon_t matrix, every element set to 0.0
    *
    * All matrices are stored row-major. Always returns HMPI_OK.
    */
   int
   Input_recon
   (
      double *a,
      double *b,
      double *c
   )
   {
      Fill_recon_matrix(a, recon_r, recon_n, 2.0);
      Fill_recon_matrix(b, recon_n, recon_t, 2.0);
      Fill_recon_matrix(c, recon_r, recon_t, 0.0);

      return HMPI_OK;
   }

   /*-----------------------------------------------------*/

   /*
    * Benchmark kernel: accumulates the product of an x*y matrix and a
    * y*z matrix into an x*z matrix.
    *
    * The operands are the file-level buffers a (x rows, y columns),
    * b (y rows, z columns) and c (x rows, z columns), all row-major.
    * The product is added to whatever c already holds.
    *
    *   x  number of rows of a and c        (--r--)
    *   y  columns of a / rows of b         (--n--)
    *   z  number of columns of b and c     (--t--)
    *
    * Always returns HMPI_OK.
    */
   int Perf_func
   (
       int x, /* --r-- */
       int y, /* --n-- */
       int z  /* --t-- */
   )
   {
      int i,j,k;
      for (i = 0; i < x; i++)
      {
          for (j = 0; j < z; j++)
          {
              for (k = 0; k < y; k++)
              {
                  /*
                   * a has y columns, so its row stride is y (not x).
                   * Using x here read the wrong elements and ran past
                   * the end of the buffer whenever x > y.
                   */
                  c[i*z + j] += a[i*y + k] * b[k*z + j];
              }
          }
      }

      return HMPI_OK;
   }

   /*-----------------------------------------------------*/

   /*
    * Adapter that lets Perf_func be passed to HMPI_Recon.
    *
    *   input_p   points to at least three ints, forwarded in order as
    *             the x, y and z arguments of Perf_func
    *   num_of_p  not used by this function
    *   output_p  points to an int that receives Perf_func's return value
    */
   void Benchmark_function
   (
      const void* input_p,
      int num_of_p,
      void* output_p
   )
   {
      const int *dims = (const int*)input_p;
      int *status_out = (int*)output_p;

      (void)num_of_p;

      *status_out = Perf_func(dims[0], dims[1], dims[2]);
   }

   /*-----------------------------------------------------*/

   int Do_recon()
   {
       a = (double*)malloc(
                    sizeof(double)
                    *
                    (recon_r*recon_n)
       );

       b = (double*)malloc(
                    sizeof(double)
                    *
                    (recon_n*recon_t)
       );

       c = (double*)malloc(
                    sizeof(double)
                    *
                    (recon_r*recon_t)
       );

       Input_recon(
         a,
         b,
         c
       );

       if (HMPI_Is_member(HMPI_COMM_WORLD_GROUP))
       {
            int rc;
            int output_p;
            int input_p[3] =
                {
                   recon_r,
                   recon_n,
                   recon_t
            };

            rc = HMPI_Recon(
                     &Benchmark_function,
                     input_p,
                     3,
                     &output_p
            );

            if  (rc != HMPI_OK)
            {
                printf("Panic: HMPI_Recon failed\n");
                return rc;
            }
       }

       if (HMPI_Is_host())
       {
          printf("Processor performances refreshed\n");
       }

       free(a);
       free(b);
       free(c);

       return HMPI_OK;
   }

  /*-----------------------------------------------------*/

  /*
   * Flattens the performance-model arguments into model_params.
   *
   * Layout of model_params:
   *   [0] p   [1] q   [2] n   [3] r
   *   [4] Generalised_block_size_row
   *   [5] Generalised_block_size_col
   *   [6 .. 6+q-1]               the q entries of w
   *   [6+q .. 6+q+p*q*p*q-1]     the p*q*p*q entries of h
   *
   * p, q, n and r are the file-level variables of those names.
   * model_count is accepted but not consulted, so the caller must
   * provide room for 6 + q + p*q*p*q ints.
   *
   * Always returns HMPI_OK.
   */
  int Pack_model_parameters
  (
     int Generalised_block_size_row,
     int Generalised_block_size_col,
     int *w,
     int *h,
     int *model_params,
     int model_count
  )
  {
     enum
     {
        PACK_SLOT_P = 0,
        PACK_SLOT_Q,
        PACK_SLOT_N,
        PACK_SLOT_R,
        PACK_SLOT_BLOCK_ROW,
        PACK_SLOT_BLOCK_COL,
        PACK_SLOT_ARRAYS      /* first slot of the w entries */
     };

     int k;

     (void)model_count;

     model_params[PACK_SLOT_P] = p;
     model_params[PACK_SLOT_Q] = q;
     model_params[PACK_SLOT_N] = n;
     model_params[PACK_SLOT_R] = r;
     model_params[PACK_SLOT_BLOCK_ROW] = Generalised_block_size_row;
     model_params[PACK_SLOT_BLOCK_COL] = Generalised_block_size_col;

     k = 0;
     while (k < q)
     {
        model_params[PACK_SLOT_ARRAYS + k] = w[k];
        k++;
     }

     k = 0;
     while (k < p*q*p*q)
     {
        model_params[PACK_SLOT_ARRAYS + q + k] = h[k];
        k++;
     }

     return HMPI_OK;
  }

  /*-----------------------------------------------------*/

  /*
   * Releases every work buffer that Execute_algorithm() allocates: the
   * file-level distribution arrays and the three buffers local to the
   * caller. Each pointer is reset to NULL afterwards, so calling this
   * a second time is a harmless no-op.
   */
  static void Release_algorithm_buffers
  (
     int **model_params,
     double **dperf,
     int **iperf
  )
  {
     free(row_allocations);
     row_allocations = NULL;

     free(column_allocations);
     column_allocations = NULL;

     free(Generalised_block);
     Generalised_block = NULL;

     free(*model_params);
     *model_params = NULL;

     free(*dperf);
     *dperf = NULL;

     free(*iperf);
     *iperf = NULL;

     free(h);
     h = NULL;

     free(w);
     w = NULL;

     free(trow);
     trow = NULL;

     free(tcol);
     tcol = NULL;
  }

  /*
   * Drives one complete run of the parallel matrix multiplication.
   *
   * Steps, in order:
   *   1. allocate the distribution arrays and the model parameter vector;
   *   2. obtain the processor performances of HMPI_COMM_WORLD_GROUP;
   *   3. on the host, try every (row, column) generalised block size
   *      that divides n evenly, estimate its execution time with
   *      HMPI_Timeof and remember the fastest;
   *   4. broadcast the chosen block sizes from rank 0;
   *   5. compute the final distribution, pack the model parameters and
   *      create the HMPI group;
   *   6. measure the average HMPI_Barrier time into barrier_time;
   *   7. run Perform_mxm on the group and free the group.
   *
   * Returns HMPI_OK on success, -1 when an allocation fails, or the
   * error code of the first HMPI/MPI call that fails. All work buffers
   * are released on every path out of the function.
   */
  int Execute_algorithm()
  {
     int rc, i;
     int failed = 1;   /* cleared only once the whole run has succeeded */

     HMPI_Group gid;
     int *model_params;
     int model_count;
     double *dperf;
     int *iperf;

     model_count = 6 + q + (p*q*p*q);

     /*
      * Allocate everything first and check afterwards. Every pointer
      * then holds either a valid block or NULL, which lets the single
      * cleanup path free them unconditionally.
      */
     model_params       = (int*)malloc(sizeof(int) * model_count);
     w                  = (int*)malloc(sizeof(int) * q);
     h                  = (int*)malloc(sizeof(int) * (p*q*p*q));
     trow               = (int*)malloc(sizeof(int) * (p*q));
     tcol               = (int*)malloc(sizeof(int) * q);
     row_allocations    = (int*)malloc(sizeof(int) * (p*q));
     column_allocations = (int*)malloc(sizeof(int) * q);
     dperf              = (double*)malloc(sizeof(double) * (p*q));
     iperf              = (int*)malloc(sizeof(int) * (p*q));
     Generalised_block  = NULL;

     if ((model_params == NULL)
         || (w == NULL)
         || (h == NULL)
         || (trow == NULL)
         || (tcol == NULL)
         || (row_allocations == NULL)
         || (column_allocations == NULL)
         || (dperf == NULL)
         || (iperf == NULL))
     {
        rc = -1;
        goto cleanup;
     }

     rc = HMPI_Group_performances(
              HMPI_COMM_WORLD_GROUP,
              dperf
     );

     if (rc != HMPI_OK)
     {
        goto cleanup;
     }

     /* The load distribution works on integer performances. */
     for (i = 0; i < (p*q); i++)
     {
        iperf[i] = (int)dperf[i];
     }

     if (HMPI_Is_host())
     {
        printf("Performances are: ");

        for (i = 0; i < (p*q); i++)
        {
            printf("%d ", iperf[i]);
        }

        printf("\n");
     }

     if (HMPI_Is_host())
     {
        int bsize_r, bsize_c;
        double estimate, min_time = 1.7976931348623157E+308;

        for (bsize_r = p; bsize_r <= n; bsize_r++)
        {
           for (bsize_c = q; bsize_c <= n; bsize_c++)
           {
               /*
                * The generalised blocks should fit
                * the square matrix
                */
               if (((n%bsize_r) != 0) || ((n%bsize_c) != 0))
               {
                  continue;
               }

               printf("=========row block size=%d, column block size=%d=============\n",
                      bsize_r,
                      bsize_c
               );

               Generalised_block = (int(*)[2])malloc(
                                    sizeof(int[2])
                                    *
                                    (bsize_r)*(bsize_c)
               );

               if (Generalised_block == NULL)
               {
                  rc = -1;
                  goto cleanup;
               }

               rc = Distribute_load(
                      p,
                      q,
                      iperf,
                      bsize_r,
                      bsize_c,
                      row_allocations,
                      column_allocations
               );

               if (rc != HMPI_OK)
               {
                  goto cleanup;
               }

               rc = Determine_distribution_parameters(
                      p,
                      q,
                      row_allocations,
                      column_allocations,
                      w,
                      h,
                      trow,
                      tcol
               );

               if (rc != HMPI_OK)
               {
                  goto cleanup;
               }

               rc = Pack_model_parameters(
                        bsize_r,
                        bsize_c,
                        w,
                        h,
                        model_params,
                        model_count
               );

               if (rc != HMPI_OK)
               {
                  goto cleanup;
               }

               printf("TIMEOF = ");

               estimate = HMPI_Timeof(
                              &MPC_NetType_ParallelAxB,
                              model_params,
                              model_count
               );

               printf("%0.3f\n", estimate);

               if (estimate < min_time)
               {
                  Optimal_generalised_block_size_row = bsize_r;
                  Optimal_generalised_block_size_col = bsize_c;
                  min_time = estimate;
               }

               /*
                * Reset to NULL so that the cleanup path never frees
                * this candidate block a second time.
                */
               free(Generalised_block);
               Generalised_block = NULL;

               printf("===================================\n");
            }
        }
     }

     if (HMPI_Is_host())
     {
        printf("\n\n");
        printf("Optimal generalised block size row = %d, Optimal generalised block size col = %d\n",
           Optimal_generalised_block_size_row,
           Optimal_generalised_block_size_col
        );
        printf("\n\n");
     }

     /* Every process needs the block sizes chosen on rank 0. */
     rc = MPI_Bcast(
             &Optimal_generalised_block_size_row,
             1,
             MPI_INT,
             0,
             HMPI_COMM_WORLD
     );

     if (rc != MPI_SUCCESS)
     {
        goto cleanup;
     }

     rc = MPI_Bcast(
             &Optimal_generalised_block_size_col,
             1,
             MPI_INT,
             0,
             HMPI_COMM_WORLD
     );

     if (rc != MPI_SUCCESS)
     {
        goto cleanup;
     }

     Generalised_block = (int(*)[2])malloc(
                         sizeof(int[2])
                         *
                         (Optimal_generalised_block_size_row)
                         *
                         (Optimal_generalised_block_size_col)
     );

     if (Generalised_block == NULL)
     {
        rc = -1;
        goto cleanup;
     }

     /*
      * Distribute the load taking into account the
      * relative performances.
      */
     rc = Distribute_load(
             p,
             q,
             iperf,
             Optimal_generalised_block_size_row,
             Optimal_generalised_block_size_col,
             row_allocations,
             column_allocations
     );

     if (rc != HMPI_OK)
     {
        goto cleanup;
     }

     rc = Determine_distribution_parameters(
            p,
            q,
            row_allocations,
            column_allocations,
            w,
            h,
            trow,
            tcol
     );

     if (rc != HMPI_OK)
     {
        goto cleanup;
     }

     rc = Pack_model_parameters(
              Optimal_generalised_block_size_row,
              Optimal_generalised_block_size_col,
              w,
              h,
              model_params,
              model_count
     );

     if (rc != HMPI_OK)
     {
        goto cleanup;
     }

     /* The host supplies the model parameters for the group... */
     if (HMPI_Is_host())
     {
        rc = HMPI_Group_create(
                 &gid,
                 &MPC_NetType_ParallelAxB,
                 model_params,
                 model_count
        );

        if (rc != HMPI_OK)
        {
           goto cleanup;
        }
     }

     /* ...while free processes join the creation without any. */
     if (HMPI_Is_free())
     {
        rc = HMPI_Group_create(
                 &gid,
                 &MPC_NetType_ParallelAxB,
                 NULL,
                 0
        );

        if (rc != HMPI_OK)
        {
           goto cleanup;
        }
     }

     /*
      * NOTE(review): processes that are still free after the group
      * creation release their buffers and finalize here. The code
      * below presumably relies on HMPI_Finalize not returning for
      * them - confirm against the HMPI documentation.
      */
     if (HMPI_Is_free())
     {
        Release_algorithm_buffers(&model_params, &dperf, &iperf);

        HMPI_Finalize(0);
     }

     /*
      * Calculate the barrier time for possible use.
      */
     {
        struct timeval start, end;
        double i_barrier_time, f_barrier_time;

        gettimeofday(&start, NULL);

        for (i = 0; i < BARRIER_ITERATIONS; i++)
        {
           rc = HMPI_Barrier(&gid);

           if (rc != HMPI_OK)
           {
              goto cleanup;
           }
        }

        gettimeofday(&end, NULL);

        /* Convert both timestamps to seconds. */
        i_barrier_time = start.tv_sec + (start.tv_usec/1.0e6);
        f_barrier_time = end.tv_sec + (end.tv_usec/1.0e6);

        barrier_time = (f_barrier_time - i_barrier_time)/BARRIER_ITERATIONS;
     }

     rc = Perform_mxm(
                 &gid
     );

     if (rc != HMPI_OK)
     {
        goto cleanup;
     }

     rc = HMPI_Barrier(&gid);

     if (rc != HMPI_OK)
     {
        goto cleanup;
     }

     failed = 0;

  cleanup:
     Release_algorithm_buffers(&model_params, &dperf, &iperf);

     if (failed)
     {
        return rc;
     }

     /*
      * Destroy the group
      */
     if (HMPI_Is_member(&gid))
     {
        rc = HMPI_Group_free(&gid);

        if (rc != HMPI_OK)
        {
           return rc;
        }
     }

     return HMPI_OK;
  }

  /*-----------------------------------------------------*/

  /*
   * Allocates this process's share of the matrices A, B and C, runs
   * the multiplication on the given group and releases the storage.
   *
   * The share size is derived from this process's grid coordinates
   * (obtained with HMPI_Group_coordof), its entries in row_allocations
   * and column_allocations, and the optimal generalised block sizes.
   * The results are stored in the file-level variables myialloc,
   * myjalloc, number_of_generalized_blocks_row/col and myalloc.
   *
   *   gid  group on which the multiplication is performed
   *
   * Returns HMPI_OK on success, -1 when an allocation fails, or the
   * error code reported by HMPI_Group_coordof or mxm. Every buffer
   * allocated here is released on all paths.
   */
  int
  Perform_mxm(const HMPI_Group *gid)
  {
     int rc;
     int icoord, jcoord;
     int num_of_coordinates;
     int** my_coordinates = (int**)malloc(
                                   sizeof(int*)
     );

     if (my_coordinates == NULL)
     {
        printf("PANIC: heap problems, Allocation of my_coordinates\n");
        return -1;
     }

     rc =  HMPI_Group_coordof(
                   gid,
                   &num_of_coordinates,
                   my_coordinates
     );

     if (rc != HMPI_OK)
     {
        HMPI_Printf("Error while getting the coordinates\n");

        /*
         * Only the holder is known to be valid here; the coordinate
         * array it should point to may never have been set.
         */
        free(my_coordinates);
        return rc;
     }

     /*
      * Initialize the respective array elements A & B at
      * the processors.
      * Each array element is a r*r matrix.
      */
     icoord = (*my_coordinates)[0];
     jcoord = (*my_coordinates)[1];

     myialloc = row_allocations[(icoord*q) + jcoord];
     myjalloc = (column_allocations[jcoord]);

     number_of_generalized_blocks_row = (N/(Optimal_generalised_block_size_row*r));
     number_of_generalized_blocks_col = (N/(Optimal_generalised_block_size_col*r));

     myalloc = myialloc*myjalloc*number_of_generalized_blocks_row*number_of_generalized_blocks_col*r*r;

     /*
      * Start from NULL so the cleanup path can free all three
      * matrices even after a partial allocation.
      */
     A = NULL;
     B = NULL;
     C = NULL;

     A = (double*)malloc(
                  sizeof(double)
                  *
                  myalloc
     );

     if (A == NULL)
     {
        printf("PANIC: heap problems, Allocation of A\n");
        rc = -1;
        goto cleanup;
     }

     B = (double*)malloc(
                  sizeof(double)
                  *
                  myalloc
     );

     if (B == NULL)
     {
        printf("PANIC: heap problems, Allocation of B\n");
        rc = -1;
        goto cleanup;
     }

     C = (double*)malloc(
                  sizeof(double)
                  *
                  myalloc
     );

     if (C == NULL)
     {
        printf("PANIC: heap problems, Allocation of C\n");
        rc = -1;
        goto cleanup;
     }

     rc = mxm(
            gid,
            my_coordinates[0],
            A,
            B,
            C
     );

     if (rc != MPI_SUCCESS)
     {
        printf(
          "MXM:Problems multiplying the matrices A and B "
          "...Exiting...\n"
        );

        goto cleanup;
     }

     rc = HMPI_OK;

  cleanup:
     free(A);
     free(B);
     free(C);

     A = NULL;
     B = NULL;
     C = NULL;

     free(my_coordinates[0]);
     free(my_coordinates);

     return rc;
  }

  /*-----------------------------------------------------*/

  /*
   * Fills the local matrix shares and hands them to Grid_computations.
   *
   * The first myalloc elements of a and b are set to
   * MXM_CONSTANT_NUMBER and the first myalloc elements of c to 0.0;
   * myalloc is the file-level element count of one local share.
   *
   *   gid             group performing the multiplication
   *   my_coordinates  coordinates of the calling process in the group
   *   a, b            operand shares, overwritten here
   *   c               result share, zeroed here and then passed on
   *
   * Returns MPI_SUCCESS, or the code reported by Grid_computations
   * when that call fails.
   */
  int
  mxm
  (
     const HMPI_Group* gid,
     const int* my_coordinates,
     double *a,
     double *b,
     double *c
  )
  {
     int status;
     int idx = 0;

     /* Initialise the local shares of A, B and C. */
     while (idx < (myalloc))
     {
         a[idx] = MXM_CONSTANT_NUMBER;
         b[idx] = MXM_CONSTANT_NUMBER;
         c[idx] = 0.0;
         idx++;
     }

     /* Computations on network nid_grid. */
     status = Grid_computations(gid, my_coordinates, a, b, c);

     if (status == MPI_SUCCESS)
     {
        return MPI_SUCCESS;
     }

     printf("Error while performing grid computations\n");
     return status;
  }

  /*-----------------------------------------------------*/

  int
  Grid_computations
  (
     const HMPI_Group* gid,
     const int* my_coordinates,
     double *a,
     double *b,
     double *c
  )
  {
     int x, y, z, i, j, k, l, m, t;

     int cheight, cwidth;
     int nborheight[p];
     double *nborrow[p], *nborcol;
     double *root_row;
     int nbors;

     Block Ablock, Bblock;
     Processor Root, Me, Receiver;

     MPI_Comm *acomm = (MPI_Comm*)HMPI_Get_comm(gid);

     if (acomm == NULL)
     {
        HMPI_Printf("Panic: grid communicator is NULL\n");
        return -1;
     }

     Me.I = my_coordinates[0];
     Me.J = my_coordinates[1];

     if (HMPI_Is_host())
     {
        printf("Starting the matrix-matrix multiplication\n");
     }

     for (k = 0; k < n; k++)
     {
         int Acolumn = (k%Optimal_generalised_block_size_col), Arow;
         int Brow = (k%Optimal_generalised_block_size_row), Bcolumn;

         nbors = 0;

         /*
          * P(i,k) broadcasts a(i,k) to p(i,*) horizontally.
          */
         for (Arow = 0; Arow < Optimal_generalised_block_size_row;)
         {
             GetBlock(Arow, Acolumn, &Ablock);
             GetProcessor(&Ablock, Optimal_generalised_block_size_col, &Root);
             cheight = h[H((Root.I), (Root.J), (Me.I), (Me.J), p, q)];

             if (cheight > 0)
             {
                if (((Me.I) == (Root.I)) && ((Me.J) == (Root.J)))
                {
                   for (Receiver.I = 0; Receiver.I < p; Receiver.I++)
                   {
                       for (Receiver.J = 0; Receiver.J < q; Receiver.J++)
                       {
                           if (((Root.I != Receiver.I) || (Root.J != Receiver.J))
                               && (Root.J != Receiver.J))
                           {
                              cheight = h[H((Root.I), (Root.J), (Receiver.I), (Receiver.J), p, q)];

                              if (cheight > 0)
                              {
                                 int ProcessorCoords[2] = {Receiver.I, Receiver.J};
                                 int dest = HMPI_Rank(gid, ProcessorCoords);
                                 int ColumnIndex = Acolumn - tcol[(Root.J)];
                                 
                                 root_row    = (double*)malloc(
                                                 sizeof(double)
                                                 *
                                                 ((n/Optimal_generalised_block_size_row)*cheight*r*r)
                                 );

                                 for (x = 0; x < (n/Optimal_generalised_block_size_row); x++)
                                 {
                                     for (y = 0; y < cheight; y++)
                                     {
                                         for (z = 0; z < r; z++)
                                         {
                                             for (t = 0; t < r; t++)
                                             {
                                                 root_row[x*cheight*r*r + y*r*r + z*r + t] = a[x*myialloc*r*myjalloc*number_of_generalized_blocks_col*r + (ColumnIndex*r) + y*r*myjalloc*number_of_generalized_blocks_col*r + z*myjalloc*number_of_generalized_blocks_col*r + t];
                                             }
                                         }
                                     }
                                 }

                                 MPI_Send(root_row, (n/Optimal_generalised_block_size_row)*cheight*r*r, MPI_DOUBLE, dest, HMPI_MSG_TAG,*acomm);

                                 free(root_row);
                              }
                           }
                       }
                   }
                }
                else
                {
                   {
                      MPI_Status stat;
                      int RootCoords[2] = {Root.I, Root.J};
                      int root = HMPI_Rank(gid, RootCoords);

                      nborheight[nbors] = cheight;

                      nborrow[nbors] = (double*)malloc(
                                                   sizeof(double)
                                                   *
                                                   ((n/Optimal_generalised_block_size_row)*cheight*r*r)
                      );

                      if ((nborrow[nbors]) == NULL)
                      {
                         return -1;
                      }

                      MPI_Recv(nborrow[nbors], (n/Optimal_generalised_block_size_row)*cheight*r*r, MPI_DOUBLE, root, HMPI_MSG_TAG, *acomm, &stat);

                      nbors++;
                   }
                }
             }
             Arow += h[H((Root.I), (Root.J), (Root.I), (Root.J), p, q)];
          }

          /*
           * P(k,j) broadcasts a(k,j) to p(*,j) vertically.
           */
          for (Bcolumn = 0; Bcolumn < Optimal_generalised_block_size_col;)
          {
              GetBlock(Brow, Bcolumn, &Bblock);
              GetProcessor(&Bblock, Optimal_generalised_block_size_col, &Root);

              if (Me.J == Root.J)
              {
                  cwidth = w[(Root.J)];

                  nborcol = (double*)malloc(
                                     sizeof(double)
                                     *
                                     ((n/Optimal_generalised_block_size_col)*cwidth*r*r)
                  );

                  if (nborcol == NULL)
                  {
                     return -1;
                  }

                  if (Me.I == Root.I)
                  {
                     for (Receiver.I = 0; Receiver.I < p; Receiver.I++)
                     {
                         if (Root.I != Receiver.I)
                         {
                            {
                               int ProcessorCoords[2] = { Receiver.I, Root.J};
                               int dest = HMPI_Rank(gid, ProcessorCoords);
                               int RowIndex = Brow - trow[(Root.I)*q + (Root.J)];

                               for (x = 0; x < (n/Optimal_generalised_block_size_col); x++)
                               {
                                   for (y = 0; y < cwidth; y++)
                                   {
                                       for (z = 0; z < r; z++)
                                       {
                                           for (t = 0; t < r; t++)
                                           {
                                               nborcol[x*cwidth*r*r + y*r + z*cwidth*r + t] = b[(k/Optimal_generalised_block_size_row)*myialloc*r*myjalloc*number_of_generalized_blocks_col*r + RowIndex*r*myjalloc*number_of_generalized_blocks_col*r + x*myjalloc*r + y*r + z*myjalloc*number_of_generalized_blocks_col*r + t];
                                           }
                                       }
                                   }
                               }

                               MPI_Send(nborcol, (n/Optimal_generalised_block_size_col)*cwidth*r*r, MPI_DOUBLE, dest, HMPI_MSG_TAG,*acomm);
                            }
                         }
                     }
                  }
                  else
                  {
                     {
                         MPI_Status stat;
                         int RootCoords[2] = {Root.I, Root.J};
                         int root = HMPI_Rank(gid, RootCoords);

                         nborcol = (double*)malloc(
                                            sizeof(double)
                                            *
                                            ((n/Optimal_generalised_block_size_col)*cwidth*r*r)
                         );

                         if (nborcol == NULL)
                         {
                            return -1;
                         }

                         MPI_Recv(nborcol, (n/Optimal_generalised_block_size_col)*cwidth*r*r, MPI_DOUBLE, root, HMPI_MSG_TAG, *acomm, &stat);
                     }
                  }
              }
              Bcolumn += w[(Root.J)];
          }

          if ((p == 1)
              && (q == 1
             )
          )
          {
             for (x = 0; x < number_of_generalized_blocks_row; x++)
             {
                 for (y = 0; y < number_of_generalized_blocks_col; y++)
                 {
                     for (i = 0; i < (myialloc*r); i+=r)
                     {
                         for (j = 0; j < (myjalloc*r); j+=r)
                         {
                             int cpos = x*myialloc*r*myjalloc*number_of_generalized_blocks_col*r + y*myjalloc*r + i*myjalloc*number_of_generalized_blocks_col*r + j;
                             int apos = x*myialloc*r*myjalloc*number_of_generalized_blocks_col*r + i*myjalloc*number_of_generalized_blocks_col*r + k*r;
                             int bpos = (k/Optimal_generalised_block_size_row)*r*myjalloc*number_of_generalized_blocks_col*r + (k%Optimal_generalised_block_size_row)*r*myjalloc*number_of_generalized_blocks_col*r + y*myjalloc*r + j;

                             for (l = 0; l < r; l++)
                             {
                                 for (m = 0; m < r; m++)
                                 {
                                     double cval = 0;

                                     for (t = 0; t < r; t++)
                                     {
                                         cval
                                         +=
                                         a[apos + l*myjalloc*number_of_generalized_blocks_col*r + t]
                                         *
                                         b[bpos + t*myjalloc*number_of_generalized_blocks_col*r + m]
                                         ;
                                     }
                                     
                                     c[cpos + l*myjalloc*number_of_generalized_blocks_col*r + m] += cval;
                                 }
                             }
                         }
                     }
                 }
             }

             continue;
          }

          if (p == 1)
          {
             if (nbors == 0)
             {
                for (x = 0; x < number_of_generalized_blocks_row; x++)
                {
                    for (y = 0; y < number_of_generalized_blocks_col; y++)
                    {
                        for (i = 0; i < (myialloc*r); i+=r)
                        {
                            for (j = 0; j < (myjalloc*r); j+=r)
                            {
                                int cpos = x*myialloc*r*myjalloc*number_of_generalized_blocks_col*r + y*myjalloc*r + i*myjalloc*number_of_generalized_blocks_col*r + j;
                                int apos = x*myialloc*r*myjalloc*number_of_generalized_blocks_col*r + (k/Optimal_generalised_block_size_col)*myjalloc*r + i*myjalloc*number_of_generalized_blocks_col*r + ((k%Optimal_generalised_block_size_col) - tcol[(Me.J)])*r;
                                int bpos = (k/Optimal_generalised_block_size_row)*r*myjalloc*number_of_generalized_blocks_col*r + (k%Optimal_generalised_block_size_row)*r*myjalloc*number_of_generalized_blocks_col*r + y*myjalloc*r + j;

                                for (l = 0; l < r; l++)
                                {
                                    for (m = 0; m < r; m++)
                                    {
                                        double cval = 0;

                                        for (t = 0; t < r; t++)
                                        {
                                            cval
                                            +=
                                            a[apos + l*myjalloc*number_of_generalized_blocks_col*r + t]
                                            *
                                            b[bpos + t*myjalloc*number_of_generalized_blocks_col*r + m]
                                            ;
                                        }
                                     
                                        c[cpos + l*myjalloc*number_of_generalized_blocks_col*r + m] += cval;
                                    }
                                }
                            }
                        }
                    }
                }

                continue;
             }

             for (x = 0; x < number_of_generalized_blocks_row; x++)
             {
                 for (y = 0; y < number_of_generalized_blocks_col; y++)
                 {
                     int nbors = 0;
                     int nborcum = 0;

                     for (i = 0; i < (myialloc*r); i+=r)
                     {
                         if ((i - nborcum) >= (nborheight[nbors]*r))
                         {
                            nborcum = i;
                            nbors++;
                         }

                         for (j = 0; j < (myjalloc*r); j+=r)
                         {
                             int cpos = x*myialloc*r*myjalloc*number_of_generalized_blocks_col*r + y*myjalloc*r + i*myjalloc*number_of_generalized_blocks_col*r + j;
                             int bpos = (k/Optimal_generalised_block_size_row)*r*myjalloc*number_of_generalized_blocks_col*r + (k%Optimal_generalised_block_size_row)*r*myjalloc*number_of_generalized_blocks_col*r + y*myjalloc*r + j;

                             for (l = 0; l < r; l++)
                             {
                                 for (m = 0; m < r; m++)
                                 {
                                     double cval = 0;

                                     for (t = 0; t < r; t++)
                                     {
                                         cval
                                         +=
                                         nborrow[nbors][x*(nborheight[nbors])*r*r + 0 + (i-nborcum)*r + l*r + t]
                                         *
                                         b[bpos + t*myjalloc*number_of_generalized_blocks_col*r + m]
                                         ;
                                     }
                                     
                                     c[cpos + l*myjalloc*number_of_generalized_blocks_col*r + m] += cval;
                                 }
                             }
                         }
                     }
                 }
             }

             for (i = 0; i < nbors; i++)
             {
                 free(nborrow[i]);
             }

             continue;
          }

          if (q == 1)
          {
             for (x = 0; x < number_of_generalized_blocks_row; x++)
             {
                 for (y = 0; y < number_of_generalized_blocks_col; y++)
                 {
                     for (i = 0; i < (myialloc*r); i+=r)
                     {
                         for (j = 0; j < (myjalloc*r); j+=r)
                         {
                             int cpos = x*myialloc*r*myjalloc*number_of_generalized_blocks_col*r + y*myjalloc*r + i*myjalloc*number_of_generalized_blocks_col*r + j;
                             int apos = x*myialloc*r*myjalloc*number_of_generalized_blocks_col*r + (k/Optimal_generalised_block_size_col)*myjalloc*r + i*myjalloc*number_of_generalized_blocks_col*r + ((k%Optimal_generalised_block_size_col) - tcol[(Me.J)])*r;

                             for (l = 0; l < r; l++)
                             {
                                 for (m = 0; m < r; m++)
                                 {
                                     double cval = 0;

                                     for (t = 0; t < r; t++)
                                     {
                                         cval
                                         +=
                                         a[apos + l*myjalloc*number_of_generalized_blocks_col*r + t]
                                         *
                                         nborcol[0 + y*cwidth*r*r + j*r + t*r + m]
                                         ;
                                     }
                                     
                                     c[cpos + l*myjalloc*number_of_generalized_blocks_col*r + m] += cval;
                                 }
                             }
                         }
                     }
                 }
             }

             free(nborcol);

             continue;
          }

          if (nbors == 0)
          {
             for (x = 0; x < number_of_generalized_blocks_row; x++)
             {
                 for (y = 0; y < number_of_generalized_blocks_col; y++)
                 {
                     for (i = 0; i < (myialloc*r); i+=r)
                     {
                         for (j = 0; j < (myjalloc*r); j+=r)
                         {
                             int cpos = x*myialloc*r*myjalloc*number_of_generalized_blocks_col*r + y*myjalloc*r + i*myjalloc*number_of_generalized_blocks_col*r + j;
                             int apos = x*myialloc*r*myjalloc*number_of_generalized_blocks_col*r + (k/Optimal_generalised_block_size_col)*myjalloc*r + i*myjalloc*number_of_generalized_blocks_col*r + ((k%Optimal_generalised_block_size_col) - tcol[(Me.J)])*r;

                             for (l = 0; l < r; l++)
                             {
                                 for (m = 0; m < r; m++)
                                 {
                                     double cval  = 0;
   
                                     for (t = 0; t < r; t++)
                                     {
                                         cval
                                         +=
                                         a[apos + l*myjalloc*number_of_generalized_blocks_col*r + t]
                                         *
                                         nborcol[0 + y*cwidth*r*r + j*r + t*r + m]
                                         ;
                                     }
                                     
                                     c[cpos + l*myjalloc*number_of_generalized_blocks_col*r + m] += cval;
                                 }
                             }
                         }
                     }
                 }
             }

             free(nborcol);

             continue;
          }

          for (x = 0; x < number_of_generalized_blocks_row; x++)
          {
              for (y = 0; y < number_of_generalized_blocks_col; y++)
              {
                  int nbors = 0;
                  int nborcum = 0;

                  for (i = 0; i < (myialloc*r); i+=r)
                  {
                      if ((i - nborcum) >= (nborheight[nbors]*r))
                      {
                         nborcum = i;
                         nbors++;
                      }

                      for (j = 0; j < (myjalloc*r); j+=r)
                      {
                          int cpos = x*myialloc*r*myjalloc*number_of_generalized_blocks_col*r + y*myjalloc*r + i*myjalloc*number_of_generalized_blocks_col*r + j;

                          for (l = 0; l < r; l++)
                          {
                              for (m = 0; m < r; m++)
                              {
                                  double cval  = 0;

                                  for (t = 0; t < r; t++)
                                  {
                                      cval
                                      +=
                                      nborrow[nbors][x*(nborheight[nbors])*r*r + 0 + (i - nborcum)*r + l*r + t]
                                      *
                                      nborcol[0 + y*cwidth*r*r + j*r + t*r + m]
                                      ;
                                  }
                                     
                                  c[cpos + l*myjalloc*number_of_generalized_blocks_col*r + m] += cval;
                              }
                          }
                      }
                  }
              }
          }

          for (i = 0; i < nbors; i++)
          {
              free(nborrow[i]);
          }

          free(nborcol);
      }

      /*
       * The result of the computations.
       */
      if (VERBOSE > 1)
      {
         for (x = 0; x < myalloc; x++)
         {
             printf(
                 "c[%d] = %0.2f\n",
                  x,
                  c[x]
             );
         }
      }

      return HMPI_OK;
   }

  /*-----------------------------------------------------*/

/*
 * Common_height: height of the part shared by two row ranges.
 *
 * top_row_1 / bottom_row_1 bound the first range,
 * top_row_2 / bottom_row_2 bound the second one.
 *
 * The overlap patterns are tested in a fixed order and the first
 * one that matches decides the result; when none matches the
 * result is 0. The bounds are not checked for top <= bottom, so
 * a range given with top > bottom can produce a negative result.
 *
 * NOTE(review): the results are plain differences (bottom - top),
 * which suggests bottom_row_* is one past the last row - confirm
 * against the callers.
 */
int Common_height
(
    int top_row_1,
    int bottom_row_1,
    int top_row_2,
    int bottom_row_2
)
{
    const int starts_not_before = (top_row_1 >= top_row_2);
    const int starts_not_after  = (top_row_1 <= top_row_2);
    const int ends_not_after    = (bottom_row_1 <= bottom_row_2);
    const int ends_not_before   = (bottom_row_1 >= bottom_row_2);

    /* The first range lies inside the second one. */
    if (starts_not_before && ends_not_after)
    {
       return (bottom_row_1 - top_row_1);
    }

    /* The second range lies inside the first one. */
    if (starts_not_after && ends_not_before)
    {
       return (bottom_row_2 - top_row_2);
    }

    /* The first range starts earlier and ends within the second one. */
    if (starts_not_after && ends_not_after && (bottom_row_1 >= top_row_2))
    {
       return (bottom_row_1 - top_row_2);
    }

    /* The first range starts within the second one and ends later. */
    if (starts_not_before && ends_not_before && (top_row_1 <= bottom_row_2))
    {
       return (bottom_row_2 - top_row_1);
    }

    /* Every remaining arrangement has no overlap. */
    return 0;
}

  /*-----------------------------------------------------*/

  /*
   * GetBlock: record the pair (x, y) in the block *b.
   *
   * x is stored in b->I and y in b->J. Always returns 0.
   */
  int GetBlock(int x, int y, Block *b)
  {
     Block *const target = b;

     target->I = x;
     target->J = y;

     return 0;
  }

  /*-----------------------------------------------------*/

  /*
   * GetProcessor: look up the processor recorded for block *b.
   *
   * The entry of Generalised_block at index
   *     b->I * Optimal_generalised_block_size_col + b->J
   * is read; its element [0] is stored in p->I and its element [1]
   * in p->J. Always returns 0.
   */
  int
  GetProcessor(Block *b, int Optimal_generalised_block_size_col, Processor* p)
  {
     /* Both coordinates come from the same table entry. */
     const int entry = ((b->I) * Optimal_generalised_block_size_col) + (b->J);

     p->I = Generalised_block[entry][0];
     p->J = Generalised_block[entry][1];

     return 0;
  }

  /*-----------------------------------------------------*/

  /*
   * rank2coord: split pnum into a pair of coordinates.
   *
   * ppar[1] is used as the divisor: pcoord[0] receives the quotient
   * pnum / ppar[1] and pcoord[1] the remainder pnum % ppar[1].
   * Only ppar[1] is read. It is not checked against zero here.
   */
  void rank2coord
  (
     int pnum,
     const int *ppar,
     int *pcoord
  )
  {
     /* Read the divisor before anything is written through pcoord. */
     const int width = ppar[1];

     pcoord[0] = pnum / width;
     pcoord[1] = pnum % width;
  }

  /*-----------------------------------------------------*/

  /*
   * coord2rank: combine a pair of coordinates into a single number.
   *
   * Returns pcoord[0] * ppar[1] + pcoord[1]. Only ppar[1] is read.
   */
  int coord2rank
  (
    const int *pcoord,
    const int *ppar
  )
  {
    const int width = ppar[1];
    const int major = pcoord[0];
    const int minor = pcoord[1];

    return major * width + minor;
  }

  /*-----------------------------------------------------*/
