
    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/time.h>

    #include <hmpi.h>
    #include "mxm_i.c"
    #include "mxm_data.c"
    #include "counter.h"

    /*
     * Convert a gettimeofday() sample to seconds expressed as a double.
     */
    static double main_tv_seconds(const struct timeval *tv)
    {
        return tv->tv_sec + (tv->tv_usec / 1.0e6);
    }

    /*
     * Seconds elapsed between two gettimeofday() samples (to - from).
     */
    static double main_elapsed_seconds(
        const struct timeval *from,
        const struct timeval *to
    )
    {
        return main_tv_seconds(to) - main_tv_seconds(from);
    }

    /*
     * Striped parallel matrix multiplication driver.
     *
     * Flow:
     *   1. Initialise MPI and split MPI_COMM_WORLD so that the highest
     *      rank is left out of the working communicator (mxmcomm); that
     *      rank finalizes and exits immediately.
     *   2. Rank 0 fills the speed/limit inputs, calls HMPI_Partition_set
     *      to obtain the allocation array np, and broadcasts np.
     *   3. Every working rank passes np through GetStripes, then
     *      allocates and initialises its myNp x N slices of A, B and C.
     *   4. For each i in [0, N) the rank returned by GetPivotProcessor
     *      broadcasts row i of B and every rank accumulates
     *      C[x][y] += A[x][i] * row[y] over its local rows.
     *   5. Rank 0 prints 2*N^3*1e-6 divided by the wall time (minus the
     *      time spent in the final barrier), presumably MFLOPS.
     *
     * Returns 0 on success; every failure path calls MPI_Abort on
     * MPI_COMM_WORLD with a distinct negative code.
     */
    int main(int argc, char **argv)
    {
        int p, size, myNp, i, x, y, rc, *np, me;
        int *mlimits = NULL;       /* only allocated on rank 0 */
        double *speeds = NULL;     /* only allocated on rank 0 */
        double *A, *B, *C;
        double *pivot_row;         /* receives the broadcast row of B */
        double barrier_time;
        size_t stripe_elems;
        MPI_Comm mxmcomm;
        struct timeval start, end;
        struct timeval starts, ends;
        double computation_time = 0.0;
        double communication_time = 0.0;
        int color = 0;
        int steps = 1;

        gettimeofday(&start, NULL);

        rc = MPI_Init(&argc, &argv);

        if (rc != MPI_SUCCESS)
        {
           printf(
              "MAIN:Problems initializing MPI "
              "...Exiting...\n"
           );

           MPI_Abort(MPI_COMM_WORLD, -1);
        }

        rc = MPI_Comm_rank(MPI_COMM_WORLD, &me);

        if (rc != MPI_SUCCESS)
        {
           printf(
              "MAIN:Problems getting rank "
              "...Exiting...\n"
           );

           MPI_Abort(MPI_COMM_WORLD, -2);
        }

        rc = MPI_Comm_size(MPI_COMM_WORLD, &size);

        if (rc != MPI_SUCCESS)
        {
           printf(
              "MAIN:Problems getting size "
              "...Exiting...\n"
           );

           MPI_Abort(MPI_COMM_WORLD, -3);
        }

        /*
         * The highest world rank does not take part in the computation.
         * Because the split key is the world rank and only the highest
         * rank is excluded, ranks inside mxmcomm equal the world ranks,
         * which is why `me` is used to index np and as a broadcast root.
         */
        if (me == (size - 1))
        {
           color = MPI_UNDEFINED;
        }

        rc = MPI_Comm_split(
                MPI_COMM_WORLD,
                color,
                me,
                &mxmcomm
        );

        if (rc != MPI_SUCCESS)
        {
           printf("Problems with MPI_Comm_split for MPI_COMM_WORLD\n");

           MPI_Abort(MPI_COMM_WORLD, -4);
        }

        if (me == (size - 1))
        {
           MPI_Finalize();

           return 0;
        }

        /* Number of ranks that actually compute. */
        p = size - 1;

        np = malloc(sizeof *np * (size_t)p);

        if (np == NULL)
        {
           printf("Cannot allocate np\n");
           MPI_Abort(MPI_COMM_WORLD, -5);
        }

        /*
         * Rank 0 computes the partition; the time spent is accounted
         * as computation time.
         */
        if (me == 0)
        {
           gettimeofday(&starts, NULL);

           speeds = malloc(sizeof *speeds * (size_t)p * (size_t)steps);

           if (speeds == NULL)
           {
              printf("No memory to allocate speeds\n");
              MPI_Abort(MPI_COMM_WORLD, -5);
           }

           mlimits = malloc(sizeof *mlimits * (size_t)p * (size_t)steps);

           if (mlimits == NULL)
           {
              printf("No memory to allocate mlimits\n");
              MPI_Abort(MPI_COMM_WORLD, -5);
           }

           Fill_input_parameters(
               p,
               steps,
               speeds,
               mlimits
           );

           if (VERBOSE > 1)
           {
              Display_input_parameters(
                  p,
                  steps,
                  speeds,
                  mlimits
              );
           }

           /* Partition N*N units among the p ranks; result lands in np. */
           rc = HMPI_Partition_set(
                    p,
                    1,
                    speeds,
                    mlimits,
                    NULL,
                    N*N,
                    NULL,
                    0,
                    0,
                    -1,
                    NULL,
                    NULL,
                    np
           );

           if (rc != HMPI_OK)
           {
              printf("Problems partitioning\n");
              MPI_Abort(MPI_COMM_WORLD, -6);
           }

           for (i = 0; i < p; i++)
           {
               if (np[i] < 0)
               {
                  printf("Problems partitioning\n");
                  MPI_Abort(MPI_COMM_WORLD, -6);
               }
           }

           gettimeofday(&ends, NULL);

           computation_time += main_elapsed_seconds(&starts, &ends);
        }

        /*
         * Broadcast the partition parameter np to all the processors.
         * On the root np is the send buffer, elsewhere the receive
         * buffer, so one call serves every rank.
         */
        rc = MPI_Bcast(
                np,
                p,
                MPI_INT,
                0,
                mxmcomm
        );

        if (rc != MPI_SUCCESS)
        {
           printf("Problems broadcasting partition parameter\n");
           MPI_Abort(MPI_COMM_WORLD, -12);
        }

        gettimeofday(&starts, NULL);

        /*
         * NOTE(review): GetStripes is defined elsewhere; it presumably
         * rewrites np in place into per-rank row counts - confirm.
         */
        rc = GetStripes(
                p,
                N,
                np
        );

        if (rc != HMPI_OK)
        {
           printf("Problems getting striped partitioning\n");
           MPI_Abort(MPI_COMM_WORLD, -7);
        }

        if (VERBOSE > 0)
        {
           if (me == 0)
           {
              printf("Allocations are: \n");

              for (i = 0; i < p; i++)
              {
                  printf("%d ", np[i]);
              }
              printf("\n");
           }
        }

        myNp = np[me];

        /* Size the local slices in size_t so the product cannot wrap in int. */
        stripe_elems = (size_t)N * (size_t)myNp;

        A = malloc(sizeof *A * stripe_elems);

        if (A == NULL)
        {
           printf("Cannot allocate A\n");
           MPI_Abort(MPI_COMM_WORLD, -8);
        }

        B = malloc(sizeof *B * stripe_elems);

        if (B == NULL)
        {
           printf("Cannot allocate B\n");
           MPI_Abort(MPI_COMM_WORLD, -9);
        }

        C = malloc(sizeof *C * stripe_elems);

        if (C == NULL)
        {
           printf("Cannot allocate C\n");
           MPI_Abort(MPI_COMM_WORLD, -10);
        }

        InitializeMatrices(
           N*myNp,
           A,
           B,
           C
        );

        /*
         * One N-element buffer is reused for every pivot row instead of
         * being allocated and freed on each of the N iterations.
         */
        pivot_row = malloc(sizeof *pivot_row * (size_t)N);

        if (pivot_row == NULL)
        {
           printf("Cannot allocate temp\n");
           MPI_Abort(MPI_COMM_WORLD, -11);
        }

        gettimeofday(&ends, NULL);

        computation_time += main_elapsed_seconds(&starts, &ends);

        for (i = 0; i < N; i++)
        {
            int PivotProcessor;

            gettimeofday(&starts, NULL);

            PivotProcessor = GetPivotProcessor(
                                i,
                                N,
                                p,
                                np
            );

            gettimeofday(&ends, NULL);

            computation_time += main_elapsed_seconds(&starts, &ends);

            /*
             * The owner of global row i copies it out of its slice of B.
             */
            if (PivotProcessor == me)
            {
               int myrow;

               gettimeofday(&starts, NULL);

               /* Translate the global row index into a local one. */
               myrow = i;
               for (x = 0; x < me; x++)
               {
                   myrow -= np[x];
               }

               for (x = 0; x < N; x++)
               {
                   pivot_row[x] = B[myrow*N + x];
               }

               gettimeofday(&ends, NULL);

               computation_time += main_elapsed_seconds(&starts, &ends);
            }

            /*
             * Broadcast the pivot row
             */
            gettimeofday(&starts, NULL);

            rc = MPI_Bcast(
                    pivot_row,
                    N,
                    MPI_DOUBLE,
                    PivotProcessor,
                    mxmcomm
            );

            gettimeofday(&ends, NULL);

            communication_time += main_elapsed_seconds(&starts, &ends);

            if (rc != MPI_SUCCESS)
            {
               printf("Problems broadcasting pivot row\n");
               MPI_Abort(MPI_COMM_WORLD, -12);
            }

            gettimeofday(&starts, NULL);

            /* Rank-1 update of the local rows of C with column i of A. */
            for (x = 0; x < myNp; x++)
            {
                for (y = 0; y < N; y++)
                {
                    C[x*N+y] += A[x*N + i]
                                *
                                pivot_row[y];
                }
            }

            gettimeofday(&ends, NULL);

            computation_time += main_elapsed_seconds(&starts, &ends);
        }

        /*
         * Print the contents of the matrices
         */
        if (VERBOSE > 0)
        {
           for (x = 0; x < myNp; x++)
           {
               for (y = 0; y < N; y++)
               {
                   printf("C[%d][%d]=%0.4f\n", x, y, C[x*N+y]);
               }
           }
        }

        gettimeofday(&starts, NULL);

        /* speeds and mlimits are NULL on non-root ranks; free(NULL) is a no-op. */
        free(speeds);
        free(mlimits);
        free(pivot_row);
        free(np);
        free(A);
        free(B);
        free(C);

        gettimeofday(&ends, NULL);

        computation_time += main_elapsed_seconds(&starts, &ends);

        /*
         * Time the final barrier separately so the wait for the slowest
         * rank can be subtracted from the reported execution time.
         */
        {
           double i_barrier_time, f_barrier_time;

           i_barrier_time = MPI_Wtime();

           rc = MPI_Barrier(mxmcomm);

           if (rc != MPI_SUCCESS)
           {
              printf("Problems with MPI_Barrier for MXM comm\n");
              MPI_Abort(MPI_COMM_WORLD, -14);
           }

           f_barrier_time = MPI_Wtime();

           barrier_time = (
                           f_barrier_time
                           -
                           i_barrier_time
           );
        }

        rc = MPI_Comm_free(&mxmcomm);

        if (rc != MPI_SUCCESS)
        {
           printf(
              "MAIN:Problems freeing MXM comm "
              "...Exiting...\n"
           );

           MPI_Abort(MPI_COMM_WORLD, -13);
        }

        gettimeofday(&end, NULL);

        /*
         * Print computation and communication times
         * Print Execution time
         *
         * The per-rank breakdown below is disabled; only rank 0 reports
         * the overall figure.
         */
        /*
        printf(
          "me = %d, computation time=%0.10f, communication_time=%0.10f\n",
          me,
          computation_time,
          communication_time
        );
        */

        if (me == 0)
        {
           double wall_time = main_elapsed_seconds(&start, &end);
           double speed = (double)(2*N*0.01*N*0.0001*N)
                          /
                          (double)(wall_time - barrier_time);

           printf(
             "%0.3f\n",
             speed
           );
        }

        MPI_Finalize();

        return 0;
    }

