
   #include "mxm_i.h"

   /*-----------------------------------------------------*/

   /*
    * Split nn elements among pp processors in proportion to their speeds.
    *
    * pp          - number of processors (entries in speeds/allocations).
    * nn          - number of elements to distribute.
    * speeds      - pp relative speeds; their sum must be positive.
    * allocations - output, pp entries; allocations[i] receives the number
    *               of elements given to processor i.
    *
    * Every processor first receives the truncated share
    * (speeds[i] / sum) * nn, raised to 1 if that share is 0. Any shortfall
    * is then handed out one element at a time to the processor whose ratio
    * (allocation + 1) / speed is smallest (the lowest index wins ties).
    * If raising empty shares to 1 pushed the total above nn, elements are
    * taken back one at a time from the processor with the largest
    * allocation / speed ratio, never going below one element each.
    *
    * Returns MPI_SUCCESS on success and -1 if the speeds do not add up to a
    * positive value. When nn < pp the one-element minimum cannot be
    * honoured together with the total, so every processor keeps one
    * element, the total stays above nn and MPI_SUCCESS is still returned.
    */
   int Number_of_elements_proportional_to_speed
   (
       int pp,
       int nn,
       const double *speeds,
       int *allocations
   )
   {
       int i;
       int total = 0;
       double sum = 0.0;

       /* Nothing to index into; only an empty request can be satisfied. */
       if (pp <= 0)
       {
          return (nn > 0) ? -1 : MPI_SUCCESS;
       }

       for (i = 0; i < pp; i++)
       {
           sum += speeds[i];
       }

       /*
        * A zero (or NaN) sum would turn every share into NaN, and
        * converting NaN to int is undefined.
        */
       if (!(sum > 0.0))
       {
          return -1;
       }

       /* Initial proportional shares, truncated, with a floor of one. */
       for (i = 0; i < pp; i++)
       {
           allocations[i] = (int)((speeds[i] / sum) * nn);

           if (allocations[i] == 0)
           {
              allocations[i] = 1;
           }

           total += allocations[i];
       }

       /*
        * Top up: give each missing element to the processor that would
        * have the smallest load-to-speed ratio after receiving it.
        */
       while (total < nn)
       {
           int optimal_p = 0;
           double best = (double)(allocations[0] + 1) / speeds[0];

           for (i = 1; i < pp; i++)
           {
               double ratio = (double)(allocations[i] + 1) / speeds[i];

               if (best > ratio)
               {
                  best = ratio;
                  optimal_p = i;
               }
           }

           allocations[optimal_p] = allocations[optimal_p] + 1;
           total++;
       }

       /*
        * Trim: the floor of one can overshoot nn. Take elements back from
        * the most heavily loaded processor that still holds more than one.
        */
       while (total > nn)
       {
           int heaviest_p = -1;
           double worst = 0.0;

           for (i = 0; i < pp; i++)
           {
               if (allocations[i] > 1)
               {
                  double ratio = (double)allocations[i] / speeds[i];

                  if ((heaviest_p < 0) || (ratio > worst))
                  {
                     worst = ratio;
                     heaviest_p = i;
                  }
               }
           }

           /* Everybody is already at the minimum of one element. */
           if (heaviest_p < 0)
           {
              break;
           }

           allocations[heaviest_p] = allocations[heaviest_p] - 1;
           total--;
       }

       return MPI_SUCCESS;
   }

   /*-----------------------------------------------------*/

   /*
    * Forward declaration: the function is defined further down in this
    * file but is called by Distribute_load before that definition.
    *
    * p, q                       - dimensions of the processor grid.
    * perf                       - p*q performance values, read as
    *                              perf[row*q + column].
    * Generalised_block_size_row - number of rows to distribute.
    * Generalised_block_size_col - number of columns to distribute.
    * row_allocations            - output, p*q entries.
    * column_allocations         - output, q entries.
    */
   int
   Calculate_row_and_column_distributions
   (
      int p,
      int q,
      const int *perf,
      int Generalised_block_size_row,
      int Generalised_block_size_col,
      int *row_allocations,
      int *column_allocations
   );

   /*-----------------------------------------------------*/

   /*
    * Distribute the generalised block over a p x q processor grid,
    * treating every processor as equally fast.
    *
    * A performance table of p*q ones is built and handed to
    * Calculate_row_and_column_distributions, which fills
    * row_allocations (p*q entries) and column_allocations (q entries).
    *
    * Returns -1 if the performance table cannot be allocated; otherwise
    * returns whatever Calculate_row_and_column_distributions returns
    * (MPI_SUCCESS on success).
    */
   int Distribute_load
   (
      int p,
      int q,
      int Generalised_block_size_row,
      int Generalised_block_size_col,
      int *row_allocations,
      int *column_allocations
   )
   {
      int i;
      int rc;
      int power = (p*q);
      int *rperf = (int*)malloc(
                         sizeof(int)
                         *
                         power
      );

      /*
       * malloc(0) may legitimately return NULL, so NULL only counts as a
       * failure when memory was actually requested.
       */
      if ((rperf == NULL) && (power > 0))
      {
         return -1;
      }

      /*
       * In homogeneous distribution of data over heterogeneous
       * distribution of processes, we assume even distribution
       * of matrices. For simplicity, we assume that all the
       * processors have the same performance. This way the code
       * for the HEHE algorithm will work for HOHE with few changes.
       */
      for (i = 0; i < power; i++)
      {
          rperf[i] = 1;
      }

      rc = Calculate_row_and_column_distributions(
               p,
               q,
               rperf,
               Generalised_block_size_row,
               Generalised_block_size_col,
               row_allocations,
               column_allocations
      );

      free(rperf);

      /* Report a failed distribution instead of masking it. */
      return rc;
   }

   /*-----------------------------------------------------*/

   /*
    * Compute the column widths and per-column row heights for a p x q
    * processor grid and fill the global Generalised_block map.
    *
    * p, q                       - dimensions of the processor grid.
    * perf                       - p*q performance values, read as
    *                              perf[row*q + column].
    * Generalised_block_size_row - number of rows to distribute.
    * Generalised_block_size_col - number of columns to distribute.
    * row_allocations            - output, p*q entries; entry [j*q + i] is
    *                              the number of rows given to the
    *                              processor in grid row j, grid column i.
    * column_allocations         - output, q entries; number of columns
    *                              given to each grid column.
    *
    * Columns are split in proportion to the summed performance of each
    * grid column; within each grid column the rows are split in
    * proportion to the performance of its processors. Both splits are
    * done by Number_of_elements_proportional_to_speed.
    *
    * Returns MPI_SUCCESS on success, -1 if a scratch buffer cannot be
    * allocated, or the error code reported by
    * Number_of_elements_proportional_to_speed. All scratch buffers are
    * released on every path.
    */
   int
   Calculate_row_and_column_distributions
   (
      int p,
      int q,
      const int *perf,
      int Generalised_block_size_row,
      int Generalised_block_size_col,
      int *row_allocations,
      int *column_allocations
   )
   {
      int i, j;
      int x, y, tempy;
      int rc = -1;
      int *tempx = NULL;
      int *row_np_sub = NULL;
      double *row_speed_sums = NULL;
      double *column_speed_sums = NULL;
      const int verbose = (MPI_VERBOSE > 0) && (algo_comm_rank == 0);

      /*
       * All scratch buffers are allocated once up front so that a single
       * cleanup path can release them, whatever fails later.
       */
      column_speed_sums = (double*)malloc(sizeof(double) * q);
      row_speed_sums = (double*)malloc(sizeof(double) * p);
      row_np_sub = (int*)malloc(sizeof(int) * p);
      tempx = (int*)calloc((size_t)q, sizeof(int));

      if ((column_speed_sums == NULL)
          || (row_speed_sums == NULL)
          || (row_np_sub == NULL)
          || (tempx == NULL)
      )
      {
         goto cleanup;
      }

      /* Total performance of each grid column. */
      for (i = 0; i < q; i++)
      {
          column_speed_sums[i] = 0.0;
          for (j = 0; j < p; j++)
          {
             column_speed_sums[i] += perf[j*q+i];
          }
      }

      if (verbose)
      {
         printf("Generalised block sizes are (%d, %d)\n",  Generalised_block_size_row, Generalised_block_size_col);

         printf("HMPI---LOAD BALANCE-->: Sums are :\n");

         for (i = 0; i < q; i++)
         {
             printf("%0.2f ", column_speed_sums[i]);
         }

         printf("\n");
      }

      /* Split the columns among the q grid columns. */
      rc = Number_of_elements_proportional_to_speed(
               q,
               Generalised_block_size_col,
               column_speed_sums,
               column_allocations
      );

      if (rc != MPI_SUCCESS)
      {
         goto cleanup;
      }

      if (verbose)
      {
         printf("Column allocations are :\n");

         for (i = 0; i < q; i++)
         {
             printf("%d ", column_allocations[i]);
         }

         printf("\n");
      }

      /* Split the rows independently inside every grid column. */
      for (i = 0; i < q; i++)
      {
          for (j = 0; j < p; j++)
          {
              row_speed_sums[j] = perf[j*q + i];
          }

          rc = Number_of_elements_proportional_to_speed(
                   p,
                   Generalised_block_size_row,
                   row_speed_sums,
                   row_np_sub
          );

          if (rc != MPI_SUCCESS)
          {
             goto cleanup;
          }

          for (j = 0; j < p; j++)
          {
              row_allocations[j*q + i] = row_np_sub[j];
          }
      }

      if (verbose)
      {
         printf("Row allocations are :\n");

         for (i = 0; i < p; i++)
         {
             for (j = 0; j < q; j++)
             {
                 printf("%d ", row_allocations[(i*q) + j]);
             }
         }

         printf("\n");
      }

      /*
       * Create the heterogeneous processor grid.
       *
       * Consider the performances of processors
       * 2, 1
       * 1, 1
       *
       * The row and column allocations are
       *
       * {4, 3    {4, 2}
       *  2, 3}
       *
       * The heterogeneous processor grid would be
       *
       * P00, P00, P00, P00, P01, P01
       * P00, P00, P00, P00, P01, P01
       * P00, P00, P00, P00, P01, P01
       * P00, P00, P00, P00, P11, P11
       * P10, P10, P10, P10, P11, P11
       * P10, P10, P10, P10, P11, P11
       *
       * tempx[y] is the first row of the current processor inside grid
       * column y (it starts at zero, courtesy of calloc); tempy is the
       * first column of grid column y.
       *
       * Elements are assigned to processors in the order
       * P00, P01, P02, P03...
       * P10, P11, P12, P13...
       */
      for (x = 0; x < p; x++)
      {
          tempy = 0;
          for (y = 0; y < q; y++)
          {
              /* Skip past the rows owned by the processor above. */
              if (x)
              {
                 tempx[y] += row_allocations[(x-1)*q + y];
              }

              for (i = 0; i < row_allocations[x*q + y]; i++)
              {
                  for (j = 0; j < (column_allocations[y]); j++)
                  {
                      Generalised_block[(tempx[y] + i)*(Generalised_block_size_col) + tempy + j][0] = x;
                      Generalised_block[(tempx[y] + i)*(Generalised_block_size_col) + tempy + j][1] = y;
                  }
              }

              tempy += (column_allocations[y]);
          }
      }

      if (verbose)
      {
         for (x = 0; x < Generalised_block_size_row; x++)
         {
             for (y = 0; y < Generalised_block_size_col; y++)
             {
                 printf(
                   "Grid element(%d,%d): %d, %d\n",
                   x,
                   y,
                   Generalised_block[(x*Generalised_block_size_col) + y][0],
                   Generalised_block[(x*Generalised_block_size_col) + y][1]
                 );
             }
         }
      }

      rc = MPI_SUCCESS;

   cleanup:
      /* free(NULL) is a no-op, so partially completed setups are fine. */
      free(tempx);
      free(row_np_sub);
      free(row_speed_sums);
      free(column_speed_sums);

      return rc;
   }

   /*-----------------------------------------------------*/

   /*
    * Derive the distribution parameters of a p x q processor grid from
    * its row and column allocations.
    *
    * p, q               - dimensions of the processor grid.
    * row_allocations    - p*q entries, read as row_allocations[i*q + j].
    * column_allocations - q entries.
    * w                  - output, q entries; copy of column_allocations.
    * h                  - output heights, accessed only through the H
    *                      macro, which is defined elsewhere (presumably it
    *                      indexes h as a p x q x p x q table - confirm
    *                      against its definition).
    * trow               - output, p*q entries; trow[i*q + j] is the sum of
    *                      the heights H(k, j, k, j) of the processors
    *                      k < i in grid column j.
    *
    * H(i, j, i, j) is first set to the row allocation of processor
    * (i, j). Every H(i, j, x, y) is then set to the result of
    * Common_height for the row ranges [trow, trow + height) of processors
    * (i, j) and (x, y). Common_height is defined elsewhere; presumably it
    * returns the length of the overlap of the two ranges.
    *
    * Always returns MPI_SUCCESS.
    */
   int Determine_distribution_parameters
   (
       int p,
       int q, 
       const int *row_allocations,
       const int *column_allocations,
       int *w,
       int *h,
       int *trow
   )
   {
      int i, j, x, y;

      for (i = 0; i < q; i++)
      {
         w[i] = column_allocations[i];
      }

      for (i = 0; i < p; i++)
      {
         for (j = 0; j < q; j++)
         {
             H(i, j, i, j, p, q) = row_allocations[(i*q) + j];
         }
      }

      /*
       * Running sum down each grid column: the entry for processor (i, j)
       * is the total height of the processors above it.
       */
      for (j = 0; j < q; j++)
      {
         int offset = 0;

         for (i = 0; i < p; i++)
         {
            trow[i*q+j] = offset;
            offset += H(i, j, i, j, p, q);
         }
      }

      /*
       * The loop order matters: the diagonal entries H(i, j, i, j) are
       * read as inputs while the table is being overwritten.
       */
      for (i = 0; i < p; i++) 
      {
         for (j = 0; j < q; j++) 
         {
            for (x = 0; x < p; x++) 
            {
               for (y = 0; y < q; y++) 
               {
                  int height = Common_height(
                                     trow[i*q+j],
                                     trow[i*q+j]+H(i, j, i, j, p, q),
                                     trow[x*q+y],
                                     trow[x*q+y]+H(x, y, x, y, p, q)
                  );

                  H(i, j, x, y, p, q) = height;
               }
            }
         }
      }

      if ((MPI_VERBOSE > 0)
         && (algo_comm_rank == 0)
      )
      {       
         printf("=======================\n");
         printf("Distribution parameters\n");
         printf("=======================\n");
         printf("Widths are:\n");
        
         for (i = 0; i < q; i++)
         {
            printf("%d ", w[i]);
         } 
            
         printf("\n");
            
         printf("Heights are:\n");
            
         for (i = 0; i < p; i++) 
         {
            for (j = 0; j < q; j++) 
            {
               for (x = 0; x < p; x++) 
               {
                  for (y = 0; y < q; y++) 
                  {
                      printf("%d ", H(i, j, x, y, p, q));
                  }
               }

               printf("\n");
            }
         }

         printf("\n");
      
         printf("=======================\n");
         printf("=======================\n");
      }

      return MPI_SUCCESS;
   }

   /*-----------------------------------------------------*/
