
    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/time.h>

    #include <hmpi.h>
    #include "counter.h"
    #include "mxm_i.c"

    /*
     * Benchmark driver: multiplies two matrices that are distributed by
     * block rows over the first p MPI ranks and prints the elapsed time on
     * rank 0.
     *
     * N, p, r and VERBOSE, as well as InitializeMatrices() and
     * GetPivotProcessor(), are not defined in this part of the file; they
     * are presumably supplied by "counter.h" / "mxm_i.c" (TODO confirm).
     *
     * Outline:
     *   1. Initialise MPI; ranks >= p leave immediately.
     *   2. HMPI_Partition_set() fills d[] with the number of block rows
     *      (each block row is recon_r matrix rows) owned by every rank.
     *   3. For every block row i, the owning rank broadcasts its slice of B
     *      and every rank accumulates into its local slice of C.
     *   4. Rank 0 prints the wall-clock time measured from before MPI_Init
     *      up to the end of the final barrier, minus the barrier time.
     *
     * Returns 0 on normal completion. Every detected failure terminates the
     * job through MPI_Abort() with a distinct negative code.
     */
    int main(int argc, char **argv)
    {
        int myNp, i, x, y, l, m, t, rc, *d, me, size, color = 0;
        int nblocks;
        double *A, *B, *C;
        double barrier_time;
        struct timeval start, end;
        double *temp;
        double *gperf;
        int recon_r = r;
        size_t pivot_elems;   /* doubles in one block row: recon_r rows of N */
        size_t local_elems;   /* doubles in this rank's slice of A, B and C  */

        MPI_Comm mxmcomm;

        gettimeofday(&start, NULL);

        rc = MPI_Init(&argc, &argv);
        if (rc != MPI_SUCCESS)
        {
            printf(
                "Problems initializing MPI runtime "
                "...Exiting...\n"
            );
            MPI_Abort(MPI_COMM_WORLD, -1);
        }

        rc = MPI_Comm_rank(MPI_COMM_WORLD, &me);
        if (rc != MPI_SUCCESS)
        {
            printf(
                "MAIN:Problems getting rank "
                "...Exiting...\n"
            );
            MPI_Abort(MPI_COMM_WORLD, -2);
        }

        rc = MPI_Comm_size(MPI_COMM_WORLD, &size);
        if (rc != MPI_SUCCESS)
        {
            printf(
                "MAIN:Problems getting size of MPI_COMM_WORLD "
                "...Exiting...\n"
            );
            MPI_Abort(MPI_COMM_WORLD, -3);
        }

        /*
         * The rows are partitioned over p ranks, so ranks 0..p-1 must all
         * exist; otherwise a broadcast root would lie outside mxmcomm.
         */
        if (size < p)
        {
            if (me == 0)
            {
                printf(
                    "MAIN:Need at least %d processes, got %d "
                    "...Exiting...\n",
                    p,
                    size
                );
            }
            MPI_Abort(MPI_COMM_WORLD, -14);
        }

        /* Only ranks 0..p-1 join the working communicator. */
        if (me >= p)
        {
            color = MPI_UNDEFINED;
        }

        rc = MPI_Comm_split(MPI_COMM_WORLD, color, me, &mxmcomm);
        if (rc != MPI_SUCCESS)
        {
            printf("Problems with MPI_Comm_split for MPI_COMM_WORLD\n");
            MPI_Abort(MPI_COMM_WORLD, -4);
        }

        /* Surplus ranks have nothing to compute. */
        if (me >= p)
        {
            MPI_Finalize();
            return 0;
        }

        nblocks = N / recon_r;
        pivot_elems = (size_t)N * (size_t)recon_r;

        d = malloc(sizeof *d * (size_t)p);
        if (d == NULL)
        {
            printf(
                "Problems allocating distribution parameter d"
                "...Exiting...\n"
            );
            MPI_Abort(MPI_COMM_WORLD, -5);
        }

        gperf = malloc(sizeof *gperf * (size_t)p);
        if (gperf == NULL)
        {
            printf(
                "Problems allocating speeds gperf"
                "...Exiting...\n"
            );
            MPI_Abort(MPI_COMM_WORLD, -6);
        }

        /* Every rank is given the same relative speed. */
        for (i = 0; i < p; i++)
        {
            gperf[i] = 1.00;
        }

        /* Distribute the nblocks block rows over the p ranks; result in d. */
        rc = HMPI_Partition_set(
                 p,
                 1,
                 gperf,
                 NULL,
                 NULL,
                 nblocks,
                 NULL,
                 0,
                 0,
                 -1,
                 NULL,
                 NULL,
                 d
        );
        if (rc != HMPI_OK)
        {
            printf("Problems partitioning\n");
            MPI_Abort(MPI_COMM_WORLD, -7);
        }

        free(gperf);

        myNp = d[me];

        /* Computed in size_t so large N cannot overflow int arithmetic. */
        local_elems = pivot_elems * (size_t)myNp;

        A = malloc(sizeof *A * local_elems);
        if (A == NULL)
        {
            printf("Cannot allocate A, N=%d, myNp=%d, me=%d\n", N, myNp, me);
            MPI_Abort(MPI_COMM_WORLD, -8);
        }

        B = malloc(sizeof *B * local_elems);
        if (B == NULL)
        {
            printf("Cannot allocate B, N=%d, myNp=%d, me=%d\n", N, myNp, me);
            MPI_Abort(MPI_COMM_WORLD, -9);
        }

        C = malloc(sizeof *C * local_elems);
        if (C == NULL)
        {
            printf("Cannot allocate C, N=%d, myNp=%d, me=%d \n", N, myNp, me);
            MPI_Abort(MPI_COMM_WORLD, -10);
        }

        /*
         * Set up the local slices of A, B and C. The helper is defined
         * outside the visible source; the original note here read
         * "Initialization can be expensive, hence ignored".
         */
        InitializeMatrices(
            N*myNp*recon_r,
            A,
            B,
            C
        );

        /* Receive buffer for one broadcast block row of B. */
        temp = malloc(sizeof *temp * pivot_elems);
        if (temp == NULL)
        {
            printf("Cannot allocate temp\n");
            MPI_Abort(MPI_COMM_WORLD, -11);
        }

        for (i = 0; i < nblocks; i++)
        {
            int PivotProcessor;

            PivotProcessor = GetPivotProcessor(
                                 i,
                                 nblocks,
                                 p,
                                 d
            );

            if (VERBOSE > 0 && me == 0)
            {
                printf("Step %d, pivot processor is %d\n", i, PivotProcessor);
            }

            /*
             * The owner of block row i copies it out of its slice of B.
             */
            if (PivotProcessor == me)
            {
                int myrow = i;
                size_t pivot_off;

                /* Convert the global block-row index to a local one. */
                for (x = 0; x < me; x++)
                {
                    myrow -= d[x];
                }

                pivot_off = (size_t)myrow * pivot_elems;

                for (x = 0; x < (N*recon_r); x++)
                {
                    temp[x] = B[pivot_off + x];
                }
            }

            /*
             * Broadcast the pivot block row. On the owner PivotProcessor
             * equals me, so a single call serves both sender and receivers.
             */
            rc = MPI_Bcast(
                     temp,
                     N*recon_r,
                     MPI_DOUBLE,
                     PivotProcessor,
                     mxmcomm
            );
            if (rc != MPI_SUCCESS)
            {
                printf("Problems broadcasting pivot row\n");
                MPI_Abort(MPI_COMM_WORLD, -12);
            }

            /*
             * C(x,y) += A(x,i) * B(i,y) for every local block row x and
             * every block column y; blocks are recon_r x recon_r and all
             * three buffers are row-major with a row stride of N.
             */
            for (x = 0; x < myNp; x++)
            {
                const size_t block_row = (size_t)x * pivot_elems;

                for (y = 0; y < nblocks; y++)
                {
                    const size_t col_y = (size_t)y * (size_t)recon_r;

                    for (l = 0; l < recon_r; l++)
                    {
                        const size_t row = block_row + (size_t)l * (size_t)N;

                        for (m = 0; m < recon_r; m++)
                        {
                            double val = 0;

                            for (t = 0; t < recon_r; t++)
                            {
                                /*
                                 * Row t of the pivot block starts at t*N in
                                 * temp (it was copied as recon_r rows of N
                                 * doubles). The former t*recon_r stride
                                 * read the wrong elements for recon_r > 1.
                                 */
                                val += A[row
                                         + (size_t)i * (size_t)recon_r
                                         + (size_t)t]
                                       *
                                       temp[(size_t)t * (size_t)N
                                            + col_y
                                            + (size_t)m];
                            }

                            C[row + col_y + (size_t)m] += val;
                        }
                    }
                }
            }
        }

        /*
         * Print the contents of the local slice of C
         */
        if (VERBOSE > 0)
        {
            for (x = 0; x < (myNp*recon_r); x++)
            {
                for (y = 0; y < N; y++)
                {
                    printf(
                        "C[%d][%d]=%0.4f\n",
                        x,
                        y,
                        C[(size_t)x * (size_t)N + (size_t)y]
                    );
                }
            }
        }

        free(temp);
        free(d);
        free(A);
        free(B);
        free(C);

        /*
         * Synchronise the working ranks and time the barrier on its own so
         * that it can be subtracted from the reported total.
         */
        {
            double i_barrier_time, f_barrier_time;

            i_barrier_time = MPI_Wtime();

            rc = MPI_Barrier(mxmcomm);
            if (rc != MPI_SUCCESS)
            {
                /* Abort like every other failure path instead of
                 * returning with MPI still initialised. */
                printf("Problems with MPI_Barrier\n");
                MPI_Abort(MPI_COMM_WORLD, -13);
            }

            f_barrier_time = MPI_Wtime();

            barrier_time = f_barrier_time - i_barrier_time;
        }

        gettimeofday(&end, NULL);

        if (me == 0)
        {
            double tstart = start.tv_sec + (start.tv_usec / 1.0e6);
            double tend = end.tv_sec + (end.tv_usec / 1.0e6);

            printf(
                "N=%d, p=%d, time(sec)=%0.9f\n",
                N,
                p,
                (tend - tstart - barrier_time)
            );
        }

        MPI_Finalize();

        return 0;
    }

