#include <stdlib.h>
#include <stdio.h>
#include "mpi.h"
#include "PLA.h"

#define MATHLIB_STANDALONE
#include <Rmath.h>

//global 
//declare a template (will be initialized in main subroutine)
PLA_Template                    templat = NULL;

#include "sub.c"

void myPLA_getLoglik(PLA_Obj dist, // n x n
		     PLA_Obj X,    // n x p
		     PLA_Obj Z,    // n x 1
		     double phi, double as, double bs,
		     int me,
		     //output
		     //PLA_Obj betahat,    // p x 1 
		     //PLA_Obj lXtSigInvX, // p x p
		     //PLA_Obj loglik,     // 1 x 1 multiscalar
		     //PLA_Obj quadForm    // 1 x 1 multiscalar
		     double* betahat_val,
		     double* lXtSigInvX_val,
		     double* loglik_val,
		     double* quadForm_val
		     ) {
  PLA_Obj 
    lSpat = NULL, 
    lSpatInv = NULL, 
    lSpatInvX = NULL,
    lSpatInvZ = NULL,
    XtSigInvX = NULL, 
    XtSigInvZ = NULL,
    quadForm = NULL;

  PLA_Obj
    betahat = NULL, lXtSigInvX = NULL;
  
  PLA_Obj //constants
    minus_one = NULL, one = NULL, zero = NULL;

  double logSqrtDet;
  double logSqrtDetXtSigInvX;

  int XtSigInvX_ldim;

  void* buffer;
  double* bufp;
  int i, info;

  int n, p, XtXN;
  int one = 1;
  double one = 1.0, dZero = 0.0, negone = -1.0;
  char LEFT = 'L', LO = 'L', TRANS = 'T', NTRANS = 'N', NDIAG = 'N';
  
  PLA_Obj_global_length(X, &n);
  PLA_Obj_global_width(X, &p);
  XtXN = n * n;

  PLA_Mvector_create ( MPI_DOUBLE, p, 1, templat, 0, &betahat);
  PLA_Matrix_create ( MPI_DOUBLE, p, p, templat, 0, 0, &lXtSigInvX);

  //set up constants
  //note that "conf_to": we wish to create a new object one conformable to dist
  //at (row,column)=(minus_one,zero) relate to the owner of the originalOA
  PLA_Create_constants_conf_to(dist, &minus_one, &zero, &one);

  //computing starts here
  MPI_Barrier(MPI_COMM_WORLD);

  //create conformable matrix for spatial correlation
  //then we will decompose it and find inverse
  PLA_Matrix_create_conf_to (dist, &lSpat);
  PLA_Copy(dist, lSpat);
  //calculate spatial correlation matrix
  myPLA_local_apply(lSpat, exponential, phi);

  MPI_Barrier(MPI_COMM_WORLD);

  //sum logs of diagonal elements
  logSqrtDet = logProd_diag_matrix(lSpat);
  //decompose matrix
  PLA_Chol(PLA_LOWER_TRIANGULAR, lSpat);

  //get logSqrtDet.
  logSqrtDet = logProd_diag_matrix(lSpat);

  //make lSpatInvX to be conformal to X and put X in it
  PLA_Matrix_create_conf_to(X, &lSpatInvX);
  PLA_Copy(X, lSpatInvX);
  MPI_Barrier(MPI_COMM_WORLD);


  // And here we solve lSpatInvX<-lSpat^-1 * lSpatInvX 
  // (which is at this moment just X)
  PLA_Trsm (PLA_SIDE_LEFT, PLA_LOWER_TRIANGULAR, PLA_NO_TRANSPOSE, PLA_NONUNIT_DIAG, one, lSpat, lSpatInvX);

  //lSpatInvZ<-1*Z (make empty lSpatInvZ just like Z, set to 0, add Z)
  PLA_Mvector_create_conf_to(Z, 1, &lSpatInvZ);
  PLA_Obj_set_to_zero(lSpatInvZ);
  PLA_Axpy(one, Z, lSpatInvZ);

  //here ve get lSpatInvZ<-lSpat^-1 * lSpatInvZ (i.e. * z)
  PLA_Trsv (PLA_LOWER_TRIANGULAR, PLA_NO_TRANSPOSE, PLA_NONUNIT_DIAG, lSpat, lSpatInvZ);

  //create XtSigInvX<-lSpatInvX'lSpatInvX
  PLA_Matrix_create(MPI_DOUBLE, p, p, templat, 0, 0, &XtSigInvX);
  PLA_Gemm(PLA_TRANSPOSE, PLA_NO_TRANSPOSE, one, lSpatInvX, lSpatInvX, zero, XtSigInvX);
  
  //create XtSigInvZ<-lSpatInvX'lSpatInvZ
  PLA_Mvector_create_conf_to(XtSigInvX, 1, &XtSigInvZ);
  PLA_Gemv(PLA_TRANSPOSE, one, lSpatInvX, lSpatInvZ, zero, XtSigInvZ);


  //make a copy of XtSigInvX: lXtSigInvX and factorize this copy
  PLA_Copy(XtSigInvX, lXtSigInvX);
  PLA_Chol(PLA_LOWER_TRIANGULAR, lXtSigInvX);

  //get logSqrtDetXtSigInvX
  logSqrtDetXtSigInvX = logProd_diag_matrix(lXtSigInvX);
  
  //make local copies of the values that will need: betahat_val, lXtSigInvX_val
  myPLA_copy_vector_to_local(XtSigInvZ, betahat_val);
  myPLA_copy_matrix_to_local(lXtSigInvX, lXtSigInvX_val);

  //solve betahat_val<- lXtSigInvX_val^-1 * betahat_val for not tr and tr
  PLA_dtrsv(&LO, &NTRANS, &NDIAG, &p, lXtSigInvX_val, &p, betahat_val, &one);
  PLA_dtrsv(&LO, &TRANS, &NDIAG, &p, lXtSigInvX_val, &p, betahat_val, &one);
    //get loglik
  myPLA_copy_vector_to_global(betahat, betahat_val);
  MPI_Barrier(MPI_COMM_WORLD);

  //lSpatInvZ<-lSpatInvZ-lSpatInvX*betahat
  PLA_Gemv(PLA_NO_TRANSPOSE, minus_one, lSpatInvX, betahat, one, lSpatInvZ);
  PLA_Mscalar_create(MPI_DOUBLE, PLA_ALL_ROWS, PLA_ALL_COLS, 1, 1, templat, &quadForm);
  //quadForm<-lSpatInvZ'lSpatInvZ
  PLA_Dot(lSpatInvZ, lSpatInvZ, quadForm);

  *quadForm_val = value_of_scalar(quadForm);
  *loglik_val = logSqrtDet + logSqrtDetXtSigInvX + (as + (n - p) / 2.0) * log( *quadForm_val / 2.0 + bs);

  MPI_Barrier(MPI_COMM_WORLD);

  //free memories
  PLA_Obj_free(&lSpat);
  PLA_Obj_free(&lSpatInv);
  PLA_Obj_free(&lSpatInvX);
  PLA_Obj_free(&lSpatInvZ);
  PLA_Obj_free(&XtSigInvX);
  PLA_Obj_free(&XtSigInvZ);
  PLA_Obj_free(&quadForm);
  PLA_Obj_free(&betahat);
  PLA_Obj_free(&lXtSigInvX);
  PLA_Obj_free(&minus_one);
  PLA_Obj_free(&one);          
  PLA_Obj_free(&zero);
}


void fitUltraSimple (PLA_Obj dist,     // n x n
		     PLA_Obj X,        // n x p
		     PLA_Obj z,        // n x 1
		      long int iters,
		     //output
		     double* beta,     //nIter x p
		     double* sigmasq,  //nIter x 1
		     double* phi       //nIter x 1
		     ) {
  int command, me, np;
  MPI_Comm comm;
  long int iIter, acceptCtr, neval;
  
    int n, p, XtXN, pSq, done;
  int iOne = 1, i;
  double dOne = 1.0, dZero = 0.0, negDOne = -1.0;
  char LEFT = 'L', LO = 'L', TRANS = 'T', NTRANS = 'N', NDIAG = 'N';

  double as = 2.5, bs = 12.5;
  double as2;// = as + (n - p) / 2.0;
  double l1 = 0.2, r1 = 5.0, currl, currr;

  double loglik, quadForm, loglikc, quadFormc;
  double *betahat, *lXtSigInvX, *betahatc, *lXtSigInvXc;

  //temporary
  double cand, logR, u, bs2, e, zstar;
  double tmp;
  int batcum = 0, batchsize = 20;

  //set up a random number
  unsigned int ii = 1234;
  unsigned int jj = 5678;
  set_seed(ii, jj);
  get_seed(&ii, &jj);

  // extract this node's rank and size
  MPI_Comm_rank( MPI_COMM_WORLD, &me );
  MPI_Comm_size( MPI_COMM_WORLD, &np );
  
  //here we have width and length of the X:
  PLA_Obj_global_length(X, &n);
  PLA_Obj_global_width(X, &p);
  //so we can calculate dimensions for X'X
  XtXN = n * n; 
  pSq = p * p;
  
  as2 = as + (n - p) / 2.0;


  /* computation */
  /*
  PLA_Mvector_create ( MPI_DOUBLE, p, 1, templat, 0, &betahat);
  PLA_Mvector_create_conf_to ( betahat, 1, &betahatc);
  PLA_Matrix_create ( MPI_DOUBLE, p, p, templat, 0, 0, &lXtSigInvX);
  PLA_Matrix_create_conf_to ( lXtSigInvX, &lXtSigInvXc);
  */
  //memory allocation
  betahat = (double *) calloc(p, sizeof(double));
  lXtSigInvX = (double *) calloc(p * p, sizeof(double));
  betahatc = (double *) calloc(p, sizeof(double));
  lXtSigInvXc = (double *) calloc(p * p, sizeof(double));
  

  phi[0] = 1.0;
  sigmasq[0] = var_vector(z);

  MPI_Barrier(MPI_COMM_WORLD);
  myPLA_getLoglik(dist, X, z, phi[0], as, bs, me,
		  betahat, lXtSigInvX, &loglik, &quadForm);
  loglik = - loglik; // change to the correct sign

//copy a vector of doubles of length p at addresses beta<-betahat (localOA)
  PLA_dcopy(&p, betahat, &iOne, beta, &iOne);

  acceptCtr = 0;
  neval = 0;
  for (iIter = 1; iIter < iters; iIter++) {
    if (me == 0) {
      batcum++;
      if (batcum == batchsize){
	printf("iIter = %d\n", iIter);
	batcum = 0;
      }
      e = rexp(1.0);
    }

    MPI_Bcast ( &e, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD );
     
    MPI_Barrier(MPI_COMM_WORLD);
    zstar = loglik - e;
    currl = l1;
    currr = r1;
    done = 0;
    while (done == 0) {
      if (me == 0) {
	cand = runif(currl, currr);
      }
      MPI_Bcast ( &cand, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD );
      MPI_Barrier(MPI_COMM_WORLD);
      
      myPLA_getLoglik(dist, X, z, cand, as, bs, me,
		      betahatc, lXtSigInvXc, &loglikc, &quadFormc);
      loglikc = - loglikc;
      neval++;

      if (me == 0)  printf("iIter = %d, e = %f, cand = %f, zstar = %f, loglikc = %f\n", iIter, e, cand, zstar, loglikc);
   
      if (zstar < loglikc) { // accept
	done = 1;
	phi[iIter] = cand;
	loglik = loglikc;
	quadForm = quadFormc;
	dcopy_(&p, betahatc, &iOne, betahat, &iOne);
	dcopy_(&pSq, lXtSigInvXc, &iOne, lXtSigInvX, &iOne);
      }
      else {// shrink interval
	if (cand > phi[iIter - 1]) currr = cand;
	else currl = cand;
      }
    }
    if (me == 0) { // only do this on the master node
      bs2 = bs + quadForm / 2.0;

      sigmasq[iIter] = 1.0 / rgamma(as2, 1.0 / bs2);

      
      for (i = 0; i < p; i++) beta[iIter * p + i] = rnorm(0.0, 1.0);

      PLA_dtrsv(&LO, &TRANS, &NDIAG, &p, lXtSigInvX, &p, &beta[iIter * p], &iOne);

      tmp = sqrt(sigmasq[iIter]);
      dscal_(&p, &tmp, &beta[iIter * p], &iOne);
      daxpy_(&p, &dOne, betahat, &iOne, &beta[iIter * p], &iOne);
    }
  } 
  MPI_Barrier(MPI_COMM_WORLD);

  /* Clean up */
  free(betahat);          free(lXtSigInvX);
  free(betahatc);         free(lXtSigInvXc);
  
  return;
}


int main(int argc, char** argv) {
  int me, np, block_size = 64, block_size_alg = 128;
  //supporting MPI communicator
  MPI_Comm comm;
  long int nIter = 20000;

  //These will be width and length of the matrix X
  int p, n;
  double *beta, *phi, *sigmasq;
  PLA_Obj dist = NULL, X = NULL, Z = NULL;

  double start, end;

  MPI_Init(&argc, &argv);//Initialize MPI

  MPI_Comm_rank(MPI_COMM_WORLD, &me); //Extract the rank of this node
  MPI_Comm_size(MPI_COMM_WORLD, &np); //Indicates number of processors available

//proposes suggested ratio for topology to be nprows/npcols=1
  PLA_Comm_1D_to_2D_ratio(MPI_COMM_WORLD, 1.0, &comm);

  PLA_Init(comm);//initialize communicator
//create template with block_size=64, that is length of subvectors of template
//PLAPACK breaks vectors and matrices in logical mash of that size
//0 - means that indexing starts with 0 (which is standard for C)
//if you needed to work with Fortran you would set 1
  PLA_Temp_create( block_size, 0, &templat );

//set algorithmic blocking size
  pla_Environ_set_nb_alg(PLA_OP_ALL_ALG, block_size_alg );


  // a subfunction that does required preparations of:
  // read in the input matrices: dist, X, Z
  DataPrep ( me, argv[1], &X, &Z, &dist );

 //and syncronize as you remember with MPI: MPI_Barrier( MPI_COMM_WORLD )
  MPI_Barrier( MPI_COMM_WORLD );
 
//get width and length of the matrix X (global width and lengthOB)
  PLA_Obj_global_width(X, &p);
  PLA_Obj_global_length(X, &n);

  //allocate memories for mcmc samples
  //allocate memory enough for 20000 iterations as it was the default value
  beta = (double *) calloc(nIter * p, sizeof(double));
  sigmasq = (double *) calloc(nIter, sizeof(double));
  phi = (double *) calloc(nIter, sizeof(double));

  //get actual number of iterations to doOB
  nIter = atoi(argv[2]);
  //call the engine

  if (me == 0) start = Timing();
  //fit the model given all the information we have by now
  fitUltraSimple(dist, X, Z, nIter, beta, sigmasq, phi);
 
  if (me == 0) {
    end = Timing();
    //printf("**** fitUltraSimple RUNNING TIME = ( %3.4lf ) ****\n", end - start);
    printf("%3.4lf\n", end - start);
  }
  //output the results, write beta, sigmasq, and phi into a file?
  MPI_Barrier(MPI_COMM_WORLD);
  if (me == 0) {
    //printf("save to file\n");
    WriteArrayToFile(beta, nIter, p, "beta.out");
    WriteArrayToFile(sigmasq, nIter, 1, "sigmasq.out");
    WriteArrayToFile(phi, nIter, 1, "phi.out");
    //printf("after save\n");  }
  }
  MPI_Barrier(MPI_COMM_WORLD);
  //clean up
  free(beta);     
  free(sigmasq);     
  free(phi);
  
  if (me == 0) {
    //printf("between free\n");
  }
  PLA_Obj_free(&dist);
  PLA_Obj_free(&X);
  PLA_Obj_free(&Z);

  if (me == 0) {
    //printf("after free\n");
  }


  //PLA_Temp_free(&templat);
  
  PLA_Finalize();
  MPI_Finalize();

  //if (me == 0) printf("free template?\n");
  //free(&templat); //why can not it be freed????? p4error
}