#include #include #include "mpi.h" #include "PLA.h" #define MATHLIB_STANDALONE #include //global //declare a template (will be initialized in main subroutine) PLA_Template templat = NULL; #include "sub.c" void myPLA_getLoglik(PLA_Obj dist, // n x n PLA_Obj X, // n x p PLA_Obj Z, // n x 1 double phi, double as, double bs, int me, //output //PLA_Obj betahat, // p x 1 //PLA_Obj lXtSigInvX, // p x p //PLA_Obj loglik, // 1 x 1 multiscalar //PLA_Obj quadForm // 1 x 1 multiscalar double* betahat_val, double* lXtSigInvX_val, double* loglik_val, double* quadForm_val ) { PLA_Obj lSpat = NULL, lSpatInv = NULL, lSpatInvX = NULL, lSpatInvZ = NULL, XtSigInvX = NULL, XtSigInvZ = NULL, quadForm = NULL; PLA_Obj betahat = NULL, lXtSigInvX = NULL; PLA_Obj //constants minus_one = NULL, one = NULL, zero = NULL; double logSqrtDet; double logSqrtDetXtSigInvX; int XtSigInvX_ldim; void* buffer; double* bufp; int i, info; int n, p, XtXN; int one = 1; double one = 1.0, dZero = 0.0, negone = -1.0; char LEFT = 'L', LO = 'L', TRANS = 'T', NTRANS = 'N', NDIAG = 'N'; PLA_Obj_global_length(X, &n); PLA_Obj_global_width(X, &p); XtXN = n * n; PLA_Mvector_create ( MPI_DOUBLE, p, 1, templat, 0, &betahat); PLA_Matrix_create ( MPI_DOUBLE, p, p, templat, 0, 0, &lXtSigInvX); //set up constants //note that "conf_to": we wish to create a new object one conformable to dist //at (row,column)=(minus_one,zero) relate to the owner of the originalOA PLA_Create_constants_conf_to(dist, &minus_one, &zero, &one); //computing starts here MPI_Barrier(MPI_COMM_WORLD); //create conformable matrix for spatial correlation //then we will decompose it and find inverse PLA_Matrix_create_conf_to (dist, &lSpat); PLA_Copy(dist, lSpat); //calculate spatial correlation matrix myPLA_local_apply(lSpat, exponential, phi); MPI_Barrier(MPI_COMM_WORLD); //sum logs of diagonal elements logSqrtDet = logProd_diag_matrix(lSpat); //decompose matrix PLA_Chol(PLA_LOWER_TRIANGULAR, lSpat); //get logSqrtDet. logSqrtDet = logProd_diag_matrix(lSpat); //make lSpatInvX to be conformal to X and put X in it PLA_Matrix_create_conf_to(X, &lSpatInvX); PLA_Copy(X, lSpatInvX); MPI_Barrier(MPI_COMM_WORLD); // And here we solve lSpatInvX<-lSpat^-1 * lSpatInvX // (which is at this moment just X) PLA_Trsm (PLA_SIDE_LEFT, PLA_LOWER_TRIANGULAR, PLA_NO_TRANSPOSE, PLA_NONUNIT_DIAG, one, lSpat, lSpatInvX); //lSpatInvZ<-1*Z (make empty lSpatInvZ just like Z, set to 0, add Z) PLA_Mvector_create_conf_to(Z, 1, &lSpatInvZ); PLA_Obj_set_to_zero(lSpatInvZ); PLA_Axpy(one, Z, lSpatInvZ); //here ve get lSpatInvZ<-lSpat^-1 * lSpatInvZ (i.e. * z) PLA_Trsv (PLA_LOWER_TRIANGULAR, PLA_NO_TRANSPOSE, PLA_NONUNIT_DIAG, lSpat, lSpatInvZ); //create XtSigInvX<-lSpatInvX'lSpatInvX PLA_Matrix_create(MPI_DOUBLE, p, p, templat, 0, 0, &XtSigInvX); PLA_Gemm(PLA_TRANSPOSE, PLA_NO_TRANSPOSE, one, lSpatInvX, lSpatInvX, zero, XtSigInvX); //create XtSigInvZ<-lSpatInvX'lSpatInvZ PLA_Mvector_create_conf_to(XtSigInvX, 1, &XtSigInvZ); PLA_Gemv(PLA_TRANSPOSE, one, lSpatInvX, lSpatInvZ, zero, XtSigInvZ); //make a copy of XtSigInvX: lXtSigInvX and factorize this copy PLA_Copy(XtSigInvX, lXtSigInvX); PLA_Chol(PLA_LOWER_TRIANGULAR, lXtSigInvX); //get logSqrtDetXtSigInvX logSqrtDetXtSigInvX = logProd_diag_matrix(lXtSigInvX); //make local copies of the values that will need: betahat_val, lXtSigInvX_val myPLA_copy_vector_to_local(XtSigInvZ, betahat_val); myPLA_copy_matrix_to_local(lXtSigInvX, lXtSigInvX_val); //solve betahat_val<- lXtSigInvX_val^-1 * betahat_val for not tr and tr PLA_dtrsv(&LO, &NTRANS, &NDIAG, &p, lXtSigInvX_val, &p, betahat_val, &one); PLA_dtrsv(&LO, &TRANS, &NDIAG, &p, lXtSigInvX_val, &p, betahat_val, &one); //get loglik myPLA_copy_vector_to_global(betahat, betahat_val); MPI_Barrier(MPI_COMM_WORLD); //lSpatInvZ<-lSpatInvZ-lSpatInvX*betahat PLA_Gemv(PLA_NO_TRANSPOSE, minus_one, lSpatInvX, betahat, one, lSpatInvZ); PLA_Mscalar_create(MPI_DOUBLE, PLA_ALL_ROWS, PLA_ALL_COLS, 1, 1, templat, &quadForm); //quadForm<-lSpatInvZ'lSpatInvZ PLA_Dot(lSpatInvZ, lSpatInvZ, quadForm); *quadForm_val = value_of_scalar(quadForm); *loglik_val = logSqrtDet + logSqrtDetXtSigInvX + (as + (n - p) / 2.0) * log( *quadForm_val / 2.0 + bs); MPI_Barrier(MPI_COMM_WORLD); //free memories PLA_Obj_free(&lSpat); PLA_Obj_free(&lSpatInv); PLA_Obj_free(&lSpatInvX); PLA_Obj_free(&lSpatInvZ); PLA_Obj_free(&XtSigInvX); PLA_Obj_free(&XtSigInvZ); PLA_Obj_free(&quadForm); PLA_Obj_free(&betahat); PLA_Obj_free(&lXtSigInvX); PLA_Obj_free(&minus_one); PLA_Obj_free(&one); PLA_Obj_free(&zero); } void fitUltraSimple (PLA_Obj dist, // n x n PLA_Obj X, // n x p PLA_Obj z, // n x 1 long int iters, //output double* beta, //nIter x p double* sigmasq, //nIter x 1 double* phi //nIter x 1 ) { int command, me, np; MPI_Comm comm; long int iIter, acceptCtr, neval; int n, p, XtXN, pSq, done; int iOne = 1, i; double dOne = 1.0, dZero = 0.0, negDOne = -1.0; char LEFT = 'L', LO = 'L', TRANS = 'T', NTRANS = 'N', NDIAG = 'N'; double as = 2.5, bs = 12.5; double as2;// = as + (n - p) / 2.0; double l1 = 0.2, r1 = 5.0, currl, currr; double loglik, quadForm, loglikc, quadFormc; double *betahat, *lXtSigInvX, *betahatc, *lXtSigInvXc; //temporary double cand, logR, u, bs2, e, zstar; double tmp; int batcum = 0, batchsize = 20; //set up a random number unsigned int ii = 1234; unsigned int jj = 5678; set_seed(ii, jj); get_seed(&ii, &jj); // extract this node's rank and size MPI_Comm_rank( MPI_COMM_WORLD, &me ); MPI_Comm_size( MPI_COMM_WORLD, &np ); //here we have width and length of the X: PLA_Obj_global_length(X, &n); PLA_Obj_global_width(X, &p); //so we can calculate dimensions for X'X XtXN = n * n; pSq = p * p; as2 = as + (n - p) / 2.0; /* computation */ /* PLA_Mvector_create ( MPI_DOUBLE, p, 1, templat, 0, &betahat); PLA_Mvector_create_conf_to ( betahat, 1, &betahatc); PLA_Matrix_create ( MPI_DOUBLE, p, p, templat, 0, 0, &lXtSigInvX); PLA_Matrix_create_conf_to ( lXtSigInvX, &lXtSigInvXc); */ //memory allocation betahat = (double *) calloc(p, sizeof(double)); lXtSigInvX = (double *) calloc(p * p, sizeof(double)); betahatc = (double *) calloc(p, sizeof(double)); lXtSigInvXc = (double *) calloc(p * p, sizeof(double)); phi[0] = 1.0; sigmasq[0] = var_vector(z); MPI_Barrier(MPI_COMM_WORLD); myPLA_getLoglik(dist, X, z, phi[0], as, bs, me, betahat, lXtSigInvX, &loglik, &quadForm); loglik = - loglik; // change to the correct sign //copy a vector of doubles of length p at addresses beta<-betahat (localOA) PLA_dcopy(&p, betahat, &iOne, beta, &iOne); acceptCtr = 0; neval = 0; for (iIter = 1; iIter < iters; iIter++) { if (me == 0) { batcum++; if (batcum == batchsize){ printf("iIter = %d\n", iIter); batcum = 0; } e = rexp(1.0); } MPI_Bcast ( &e, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD ); MPI_Barrier(MPI_COMM_WORLD); zstar = loglik - e; currl = l1; currr = r1; done = 0; while (done == 0) { if (me == 0) { cand = runif(currl, currr); } MPI_Bcast ( &cand, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD ); MPI_Barrier(MPI_COMM_WORLD); myPLA_getLoglik(dist, X, z, cand, as, bs, me, betahatc, lXtSigInvXc, &loglikc, &quadFormc); loglikc = - loglikc; neval++; if (me == 0) printf("iIter = %d, e = %f, cand = %f, zstar = %f, loglikc = %f\n", iIter, e, cand, zstar, loglikc); if (zstar < loglikc) { // accept done = 1; phi[iIter] = cand; loglik = loglikc; quadForm = quadFormc; dcopy_(&p, betahatc, &iOne, betahat, &iOne); dcopy_(&pSq, lXtSigInvXc, &iOne, lXtSigInvX, &iOne); } else {// shrink interval if (cand > phi[iIter - 1]) currr = cand; else currl = cand; } } if (me == 0) { // only do this on the master node bs2 = bs + quadForm / 2.0; sigmasq[iIter] = 1.0 / rgamma(as2, 1.0 / bs2); for (i = 0; i < p; i++) beta[iIter * p + i] = rnorm(0.0, 1.0); PLA_dtrsv(&LO, &TRANS, &NDIAG, &p, lXtSigInvX, &p, &beta[iIter * p], &iOne); tmp = sqrt(sigmasq[iIter]); dscal_(&p, &tmp, &beta[iIter * p], &iOne); daxpy_(&p, &dOne, betahat, &iOne, &beta[iIter * p], &iOne); } } MPI_Barrier(MPI_COMM_WORLD); /* Clean up */ free(betahat); free(lXtSigInvX); free(betahatc); free(lXtSigInvXc); return; } int main(int argc, char** argv) { int me, np, block_size = 64, block_size_alg = 128; //supporting MPI communicator MPI_Comm comm; long int nIter = 20000; //These will be width and length of the matrix X int p, n; double *beta, *phi, *sigmasq; PLA_Obj dist = NULL, X = NULL, Z = NULL; double start, end; MPI_Init(&argc, &argv);//Initialize MPI MPI_Comm_rank(MPI_COMM_WORLD, &me); //Extract the rank of this node MPI_Comm_size(MPI_COMM_WORLD, &np); //Indicates number of processors available //proposes suggested ratio for topology to be nprows/npcols=1 PLA_Comm_1D_to_2D_ratio(MPI_COMM_WORLD, 1.0, &comm); PLA_Init(comm);//initialize communicator //create template with block_size=64, that is length of subvectors of template //PLAPACK breaks vectors and matrices in logical mash of that size //0 - means that indexing starts with 0 (which is standard for C) //if you needed to work with Fortran you would set 1 PLA_Temp_create( block_size, 0, &templat ); //set algorithmic blocking size pla_Environ_set_nb_alg(PLA_OP_ALL_ALG, block_size_alg ); // a subfunction that does required preparations of: // read in the input matrices: dist, X, Z DataPrep ( me, argv[1], &X, &Z, &dist ); //and syncronize as you remember with MPI: MPI_Barrier( MPI_COMM_WORLD ) MPI_Barrier( MPI_COMM_WORLD ); //get width and length of the matrix X (global width and lengthOB) PLA_Obj_global_width(X, &p); PLA_Obj_global_length(X, &n); //allocate memories for mcmc samples //allocate memory enough for 20000 iterations as it was the default value beta = (double *) calloc(nIter * p, sizeof(double)); sigmasq = (double *) calloc(nIter, sizeof(double)); phi = (double *) calloc(nIter, sizeof(double)); //get actual number of iterations to doOB nIter = atoi(argv[2]); //call the engine if (me == 0) start = Timing(); //fit the model given all the information we have by now fitUltraSimple(dist, X, Z, nIter, beta, sigmasq, phi); if (me == 0) { end = Timing(); //printf("**** fitUltraSimple RUNNING TIME = ( %3.4lf ) ****\n", end - start); printf("%3.4lf\n", end - start); } //output the results, write beta, sigmasq, and phi into a file? MPI_Barrier(MPI_COMM_WORLD); if (me == 0) { //printf("save to file\n"); WriteArrayToFile(beta, nIter, p, "beta.out"); WriteArrayToFile(sigmasq, nIter, 1, "sigmasq.out"); WriteArrayToFile(phi, nIter, 1, "phi.out"); //printf("after save\n"); } } MPI_Barrier(MPI_COMM_WORLD); //clean up free(beta); free(sigmasq); free(phi); if (me == 0) { //printf("between free\n"); } PLA_Obj_free(&dist); PLA_Obj_free(&X); PLA_Obj_free(&Z); if (me == 0) { //printf("after free\n"); } //PLA_Temp_free(&templat); PLA_Finalize(); MPI_Finalize(); //if (me == 0) printf("free template?\n"); //free(&templat); //why can not it be freed????? p4error }