#include "config.h"
#include "fupermod/fupermod.h"
#include "libjacobi.h"
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
/* Small floating-point constant. It is not referenced in the visible part of
 * this file; presumably a "treat as zero" tolerance -- confirm against its users. */
#define ZERO 1e-13
/*
 * Load-balancing strategy selected with the -a command-line option.
 * The numeric values are the ones listed in the help text printed by main()
 * and are also embedded in the names of the output files.
 */
typedef enum fupermod_algorithm {
    partial     = 0,    /* partial functional performance model */
    full        = 1,    /* full functional performance model */
    constant1   = 2,    /* constant performance model 1 (small benchmark) */
    constant2   = 3,    /* constant performance model 2 (homogeneous benchmark) */
    homogeneous = 4,    /* homogeneous distribution */
    manual      = 5     /* manual distribution */
} fupermod_algorithm;
/* Forward declarations of the two logging helpers defined further down. */
void fupermod_balance_init(MPI_Comm comm, int root, fupermod_algorithm algorithm, int D, char* info);
void fupermod_balance_finalise(int size, fupermod_model** models);
/* Timing log. Opened by fupermod_balance_init on the root rank and closed by
 * fupermod_balance_finalise; stays NULL on every other rank. */
FILE* balance_times = NULL;
/* iter and balance are reset to 0 by fupermod_balance_init on the root rank.
 * Nothing else in the visible part of this file reads or writes them. */
int iter;
int balance;
/* Starts at 1; not referenced in the visible part of this file. */
int continue_balancing = 1;
/* Not referenced in the visible part of this file. Note that main() declares
 * its own local `threshold`, which hides this one there. */
double threshold;
/* Algorithm id recorded by fupermod_balance_init and used in output file names.
 * -1 is not one of the enumerators; presumably it marks "not initialised yet"
 * -- confirm. */
fupermod_algorithm algorithm = -1;
/* Problem size recorded by fupermod_balance_init and used in output file names. */
int D;
/* Wall-clock time taken at the end of fupermod_balance_init on the root rank;
 * fupermod_balance_finalise reports the time elapsed since then. */
struct timeval total_start;
/*
 * Record the run parameters and start the balance timing log.
 *
 * Every rank of comm must call this: the processor names are collected on
 * root with MPI_Gather. On root it then creates
 * "balance_times.<algorithm>.<D>.<size>.dat" (size zero-padded to two digits),
 * writes the header lines, resets iter/balance and stores the start time in
 * total_start.
 *
 * comm        communicator whose ranks take part in the run
 * root        rank that owns the log file
 * _algorithm  balancing algorithm id, stored in the file-scope `algorithm`
 * _D          problem size, stored in the file-scope `D`
 * info        free-form description written to the header; NULL is written
 *             as an empty string
 *
 * If the log file cannot be created a diagnostic is printed and
 * balance_times is left NULL; the run itself is not aborted.
 */
void fupermod_balance_init(MPI_Comm comm, int root, fupermod_algorithm _algorithm, int _D, char* info) {
    int rank;
    int size;
    int len;
    int i;
    char hostname[MPI_MAX_PROCESSOR_NAME];
    char *hostnames = NULL;
    char filename[1024];
    char datetime[25];
    time_t now;
    struct tm *tm_now;

    algorithm = _algorithm;
    D = _D;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    /* The whole buffer is sent below, so clear the bytes after the name. */
    memset(hostname, 0, sizeof hostname);
    MPI_Get_processor_name(hostname, &len);

    if (rank == root) {
        hostnames = malloc((size_t)size * MPI_MAX_PROCESSOR_NAME);
        if (hostnames == NULL) {
            perror("fupermod_balance_init: malloc");
            /* abort() rather than return: the other ranks are about to enter
             * MPI_Gather and would wait for this rank indefinitely. */
            abort();
        }
    }
    MPI_Gather(hostname, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, hostnames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, root, comm);
    if (rank != root)
        return;

    iter = 0;
    balance = 0;

    /* %02d yields the same text as the former "0%d" / "%d" pair. */
    snprintf(filename, sizeof filename, "balance_times.%d.%d.%02d.dat", (int)algorithm, D, size);
    balance_times = fopen(filename, "w");
    if (balance_times == NULL) {
        perror(filename);
    } else {
        now = time(NULL);
        tm_now = localtime(&now);
        if (tm_now == NULL || strftime(datetime, sizeof datetime, "%d-%m-%y %H:%M", tm_now) == 0)
            datetime[0] = '\0';

        fprintf(balance_times, "#Fupermod Balance iteration times. Num machines:%d\tAt:%s\n", size, datetime);
        fprintf(balance_times, "# %s \n", info != NULL ? info : "");

        /* One column pair per rank: host names first, then the column titles. */
        fprintf(balance_times, "#host\t");
        for (i = 0; i < size; i++) {
            fprintf(balance_times, "        \t%12s\t", (hostnames + i * MPI_MAX_PROCESSOR_NAME));
        }
        fprintf(balance_times, "\n");
        fprintf(balance_times, "#itt \t");
        for (i = 0; i < size; i++) {
            fprintf(balance_times, "dist%4d\ttime%3d     \t", i, i);
        }
        fprintf(balance_times, "\n");
    }

    free(hostnames);
    hostnames = NULL;
    gettimeofday(&total_start, NULL);
}
/*
 * Finish the timing log and dump the measured data points of every model.
 *
 * Appends the wall-clock time elapsed since total_start to balance_times and
 * closes it (skipped when the log is not open). Then, for each i in
 * [0, size), writes "datapoints.<algorithm>.<D>.<i>.dat" with one line per
 * point of models[i]->data: d, t and complexity(d) / t.
 *
 * size    number of entries in models
 * models  array of `size` model pointers
 *
 * A file that cannot be created is reported on stderr and skipped; the
 * remaining files are still written.
 */
void fupermod_balance_finalise(int size, fupermod_model** models) {
    struct timeval end;
    double elapsed;
    char filename[1024];
    int i;

    gettimeofday(&end, NULL);
    /* Subtract field by field: differencing two large absolute timestamps as
     * doubles would throw away sub-microsecond precision. */
    elapsed = (double)(end.tv_sec - total_start.tv_sec)
            + (double)(end.tv_usec - total_start.tv_usec) / 1000000.;

    if (balance_times != NULL) {
        fprintf(balance_times, "#total time: %le\n", elapsed);
        if (fclose(balance_times) != 0)
            perror("balance_times");
        balance_times = NULL;
    }

    for (i = 0; i < size; i++) {
        FILE *datapoints;
        int j, d;
        double t;

        snprintf(filename, sizeof filename, "datapoints.%d.%d.%d.dat", (int)algorithm, D, i);
        datapoints = fopen(filename, "w");
        if (datapoints == NULL) {
            perror(filename);
            continue;
        }
        fprintf(datapoints, "# Datapoints for rank:%d, algorithm:%d problem size:%d\n", i, (int)algorithm, D);
        fprintf(datapoints, "#d\ttime\tspeed\n");
        for (j = 0; j < models[i]->data->count; j++) {
            d = models[i]->data->points[j].d;
            t = models[i]->data->points[j].t;
            fprintf(datapoints, "%d\t%le\t%le\n", d, t, models[i]->complexity(d) / t);
        }
        if (fclose(datapoints) != 0)
            perror(filename);
    }
}
int main(int argc, char **argv){
    MPI_Init(&argc,&argv);
    MPI_Comm comm = MPI_COMM_WORLD;
    int rank;
    MPI_Comm_rank(comm, &rank);
    int size;
    MPI_Comm_size(comm, &size);
    int root = 0;
    
    int seed = 2;
    int D = 100;
    int max_itt = 20;
    fupermod_algorithm algorithm = 0;
    double threshold = 0.01;
    int verbose = 0;
    int exit = 0;
    if(rank == root){
        int ret;
        while ((ret = getopt(argc, argv, "hD:i:a:vt:")) >=0) {
            switch(ret) {
                case 'h': 
                    printf("jacobi.c help\n"
                            ""
                            "Description: Jacobi method for solving a system of liner "
                            "equations on a heterogeneous cluster. \n"
                            "\n"
                            "Usage:\n"
                            "-D I Size of matrix\n"
                            "-i I Number of Iterations\n"
                            "-a I   Balancing algorithm. (default: %d)\n"
                            "\t\t0: Partial functional performance model\n"
                            "\t\t1: Full functional performance model\n"
                            "\t\t2: Constant performance model 1 (small benchmark)\n"
                            "\t\t3: Constant performance model 2 (homogeneous benchmark)\n"
                            "\t\t4: Homogeneous distribution\n"
                            "\t\t5: Manual distribution\n"
                            "-t D Relative threshold between 0 and 1, above which balancing is done\n"
                            "-v I Verbose mode\n"
                            , algorithm);
                    exit = 1;
                    break;
                case 'D': 
                    D = atoi(optarg);
                    break;
                case 'i': 
                    max_itt = atoi(optarg);
                    break;
                case 'a': 
                    algorithm = atoi(optarg);
                    break;
                case 'v':
                    verbose=1;
                    break;
                case 't': 
                    threshold = atof(optarg);
                    break;
            }
        }
    }
    MPI_Bcast(&exit, 1, MPI_INT, root, comm);
    if (exit) {
        MPI_Finalize();
        return 0;
    }
    MPI_Bcast(&D, 1, MPI_INT, root, comm);
    MPI_Bcast(&max_itt, 1, MPI_INT, root, comm);
    MPI_Bcast(&algorithm, 1, MPI_INT, root, comm);
    MPI_Bcast(&threshold, 1, MPI_DOUBLE, root, comm);
    
    if (D < size){
        fprintf(stderr,"Error!\nMatrix smaller then number of processors\n");
        MPI_Finalize();
        return FUPERMOD_FAIL;
    }
    fupermod_dist* distr = ((rank == root) ? fupermod_dist_alloc(size, D) : NULL);
    int* d = NULL; 
    int* old_d = (int *)malloc(size * sizeof(int));
    int* offset = (int *)malloc(size * sizeof(int));
    jacobi_set_D(D);
    fupermod_data** data = (fupermod_data**)malloc(sizeof(fupermod_data*) * size);
    fupermod_model** models = (fupermod_model**)malloc(sizeof(fupermod_model*) * size);
    int i;
    for (i = 0; i < size; i++) {
        data[i] = fupermod_data_alloc();
        models[i] = fupermod_model_interp_alloc(data[i], jacobi_complexity, D, 0);
    }
    double *a, *b, *x;
    
    
    
    char info[1024];
    sprintf(info, "Jacobi. D:%d threshold:%le iterations:%d algorithm:%d", D, threshold, max_itt, algorithm);
    fupermod_balance_init(comm, root, algorithm, D, info);
    int itt;
    for (itt = 0; itt < max_itt; itt++){ 
        if (rank == root)
            fprintf(stderr, "P%d starting itt: %d\n", rank, itt);
        
        double diff = 0.0;
        MPI_Bcast(d, size, MPI_INT, root, comm);
        int i;
        for (i=0; i<size; i++) {
            offset[i] = i == 0 ? 0 : (offset[i-1] + d[i-1]);
        }
        if (itt == 0) {
            
            jacobi_fill_matrix(&a, &b, &x, D, d[rank], offset[rank], seed, rank);
            MPI_Bcast(b, D, MPI_DOUBLE, root, comm);
        } else {
            a = jacobi_redistribute(comm, a, D, old_d, d);
        }
        memcpy(old_d, d, sizeof(int) * size);
        fupermod_dynamic balancer = {
                fupermod_partition_multiroot,
                size,
                models,
                distr
        };
        struct timeval start;
        gettimeofday(&start, NULL);
        double* new_x = jacobi_compute(a, b, x, D, d[rank], offset[rank], &diff);
        fupermod_balancer_iterate(&balancer, comm, root, start);
        if(verbose) printf("P%d x before: %f %f %f %f\n",rank, x[0], x[1], x[2], x[3]);
        MPI_Allgatherv(new_x, old_d[rank], MPI_DOUBLE, x, old_d, offset, MPI_DOUBLE, comm);
        if(verbose) printf("P%d x after: %f %f %f %f\n",rank, x[0], x[1], x[2], x[3]);
        free(new_x);
        if (rank == root && verbose > 0) {  
            fprintf(stderr, "dist: ");
            for (i=0; i<size; i++) {
                fprintf(stderr, "%d ", d[i]);
            }
            fprintf(stderr, "\n");
        }
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
        
    } 
    if (rank == root) 
        fupermod_balance_finalise(size, models);
    
    
    
    
    
    
    
    
    
    
    jacobi_final_test(comm, a, x, b, D, old_d, offset);
    
    
    
    
    
    
    
    
    
    fupermod_dist_free(distr);
    if (rank != root) {
        free(d); d = NULL;
    }
    free(x);
    free(b);
    free(a);
    MPI_Finalize();
    return FUPERMOD_SUCCESS;
}