- Data structures and some basic operations
- How to manage memory
- Simple vector operations: An RK4 routine
- Using routines for lists of arguments
- A least squares problem
- A sparse matrix example
- How do I ....?

#include "matrix.h" .............. main() { MAT *A; VEC *x; PERM *p; .......... A = m_get(3,4); x = v_get(10); p = px_get(10); .......... }This initialises these data structures to have the given size. The matrix

#include "matrix.h" ............. main() { FILE *fp; MAT *A; ............. fp = fopen("fred","w"); m_foutput(fp,A); ............. }These input routines allow for the presence of comments in the data. A comment in the input starts with a ``hash'' character ``#'', and continues to the end of the line. For example, the following is valid input for a 3-dimensional vector:

# The initial vector must not be zero # x = Vector: dim: 3 -7 0 3For general input/output which conforms to this format, allowing comments in the input files, use the

input("Input number of steps: "," fp = stdin; finput(fp,"Input number of steps: "," fp = fopen("fred","r"); finput(fp,"Input number of steps: ","The

#include "matrix2.h"at the beginning of your program. It contains factorisation and solution routines for LU, Cholesky and QR-factorisation methods, as well as update routines for Cholesky and QR factorisations. Supporting these are a number of Householder transformation and Givens' rotation routines. Also there is a routine for generating the

#include "zmatrix.h"for using the basic routines, and

#include "zmatrix2.h"for the complex matrix factorisation routines. The

#include "sparse.h"or, if you use any sparse factorisation routines

#include "sparse2.h"at the beginning of your file. The routines contained in the library include routines for creating, destroying, initialising and updating sparse matrices, and also routines for sparse matrix--dense vector multiplication, sparse LU factorisation and sparse Cholesky factorisation. For using the iterative routines you need to use

#include "iter.h"This includes the

#includeThis file is

There are three main strategies that are recommended for deciding how to
allocate, deallocate and resize objects.
These are ``*no deallocation*'' which is really only useful for
demonstration programs, ``*allocate and deallocate*'' which minimises
overall memory requirements at the expense of speed, and ``*resize on
demand*'' which is useful for routines that are called repeatedly.
A new technique for static workspace arrays is to ``*register workspace
variables*''.

QR = m_copy(A,MNULL); /* allocate memory for QR */to allocate the memory, but without the call

for ( ... ; ... ; ... ) { QR = m_copy(A,MNULL); /* allocate memory for QR */ /* could have been allocated by m_get() */ /* use QR */ ...... ...... /* deallocate QR so memory can be reused */ M_FREE(QR); }The allocate and deallocate statements could also have come at the beginning and end of a function or procedure, so that when the function returns, all the memory that the function has allocated has been deallocated. This is most suitable for functions or sections of code that are called repeatedly but involve fairly extensive calculations (at least a matrix--matrix multiply, or solving a system of equations).

rk4(...,x,...) { static VEC *v1=VNULL, *v2=VNULL, *v3=VNULL, *v4=VNULL, *temp=VNULL; ....... v1 = v_resize(v1,x->dim); v2 = v_resize(v2,x->dim); v3 = v_resize(v3,x->dim); v4 = v_resize(v4,x->dim); temp = v_resize(temp,x->dim); ....... }The intention is that the

static VEC *v1, *v2, *v3, *v4, *temp;This strategy of resizing static workspace variables is not so useful if the object being allocated is extremely large. The previous ``allocate and deallocate'' strategy is much more efficient for memory in those circumstances. However, the following section shows how to get the best of both worlds.

rk4(...,x,...) { static VEC *v1, *v2, *v3, *v4, *temp; ....... v1 = v_resize(v1,x->dim); MEM_STAT_REG(v1,TYPE_VEC); v2 = v_resize(v2,x->dim); MEM_STAT_REG(v2,TYPE_VEC); ...... }Normally, these registered workspace variables remain allocated. However, to implement the ``deallocate on exit'' approach, use the following code:

...... mem_stat_mark(1); rk4(...,x,...) mem_stat_free(1); ......To keep the workspace vectors allocated for the duration of a loop, but then deallocated, use

...... mem_stat_mark(1); for (i = 0; i < N; i++ ) rk4(...,x,...); mem_stat_free(1); ......The number used in the

*x'=f(t,x), x(t_0)=x_0*

for *x(t_i)*, *i=1,2,3,...* where *t_i=t_0+i h* and *h*
is the step size.
The formulae for the 4th order Runge--Kutta method are:

- x_{i+1}=x_i+ (h/6){v_1+2v_2+2v_3+v_4} where
- v_1=f(t_i,x_i)
- v_2=f(t_i+ h/2,x_i+(h/2) v_1)
- v_3=f(t_i+ h/2,x_i+(h/2) v_2)
- v_4=f(t_i+h,x_i+h v_3)

#include "matrix.h" /* rk4 -- 4th order Runge--Kutta method */ double rk4(f,t,x,h) double t, h; VEC *(*f)(), *x; { static VEC *v1=VNULL, *v2=VNULL, *v3=VNULL, *v4=VNULL; static VEC *temp=VNULL; /* do not work with NULL initial vector */ if ( x == VNULL ) error(E_NULL,"rk4"); /* ensure that v1, v2, etc. are of the correct size */ v1 = v_resize(v1,x->dim); v2 = v_resize(v2,x->dim); v3 = v_resize(v3,x->dim); v4 = v_resize(v4,x->dim); temp = v_resize(temp,x->dim); /* register workspace variables */ MEM_STAT_REG(v1,TYPE_VEC); MEM_STAT_REG(v2,TYPE_VEC); MEM_STAT_REG(v3,TYPE_VEC); MEM_STAT_REG(v4,TYPE_VEC); MEM_STAT_REG(temp,TYPE_VEC); /* end of memory allocation */ (*f)(t,x,v1); /* most compilers allow: "f(t,x,v1);" */ v_mltadd(x,v1,0.5*h,temp); /* temp = x+.5*h*v1 */ (*f)(t+0.5*h,temp,v2); v_mltadd(x,v2,0.5*h,temp); /* temp = x+.5*h*v2 */ (*f)(t+0.5*h,temp,v3); v_mltadd(x,v3,h,temp); /* temp = x+h*v3 */ (*f)(t+h,temp,v4); /* now add: v1+2*v2+2*v3+v4 */ v_copy(v1,temp); /* temp = v1 */ v_mltadd(temp,v2,2.0,temp); /* temp = v1+2*v2 */ v_mltadd(temp,v3,2.0,temp); /* temp = v1+2*v2+2*v3 */ v_add(temp,v4,temp); /* temp = v1+2*v2+2*v3+v4 */ /* adjust x */ v_mltadd(x,temp,h/6.0,x); /* x = x+(h/6)*temp */ return t+h; /* return the new time */ }Note that the last parameter of

v1 = v_resize(v1,x->dim); v2 = v_resize(v2,x->dim); ....Here

static VEC *v1=VNULL, *v2=VNULL, *v3=VNULL, *v4=VNULL;or

static VEC *v1, *v2, *v3, *v4;The operations of vector addition and scalar addition are really the only

v_add(v1,v2, out), where

v_mltadd(x,v1,0.5*h,temp); /* temp = x+.5*h*v1 */We also need a number of ``utility'' operations. For example

Here is an implementation of the function *f* for simple harmonic motion:

/* f -- right-hand side of ODE solver */ VEC *f(t,x,out) VEC *x, *out; double t; { if ( x == VNULL || out == VNULL ) error(E_NULL,"f"); if ( x->dim != 2 || out->dim != 2 ) error(E_SIZES,"f"); out->ve[0] = x->ve[1]; out->ve[1] = - x->ve[0]; return out; }As can be seen, most of this code is error checking code, which, of course, makes the routine safer but a little slower. For a procedure like

#includeHere the initial values are entered as a vector by#include "matrix.h" main() { VEC *x; VEC *f(); double h, t, t_fin; double rk4(); input("Input initial time: "," input("Input final time: ", " x = v_get(2); /* this is the size needed by f() */ prompter("Input initial state:\n"); x = v_input(VNULL); input("Input step size: ", " printf("# At time v_output(x); while ( t < t_fin ) { t = rk4(f,t,x,min(h,t_fin-t));/* new t is returned */ printf("# At time v_output(x); t += h; } }

x = v_get(2); x = v_input(x);

To compile the program under Unix*(TM)*, if it is in a file
`tutorial.c` is:

cc -o tutorial tutorial.c meschach.aor, if you have an ANSI compiler,

cc -DANSI_C -o tutorial tutorial.c meschach.aHere is a sample session with the above program:

...... Input initial time: 0 Input final time: 1 Input initial state: Vector: dim: 2 entry 0: -1 entry 1: b entry 0: old -1 new: 1 entry 1: old 0 new: 0 Input step size: 0.1 At time 0, the state is Vector: dim: 2 1 0 At time 0.1, the state is Vector: dim: 2 0.995004167 -0.0998333333 ................. At time 1, the state is Vector: dim: 2 0.540302967 -0.841470478By way of comparison, the state at

...... Input initial time: 0 Input final time: 1 Input initial state: Vector: dim: 3 entry 0: 3 entry 1: 2 entry 2: -1 Input step size: 0.1 At time 0, the state is Vector: dim: 3 3 2 -1 "tutorial.c", line 79: sizes of objects don't match in function f() Sorry, aborting programThe error handler prints out the error message giving the source code file and line number as well as the function name where the error was raised. The relevant section of

if ( x->dim != 2 || out->dim != 2 ) error(E_SIZES,"f"); /* line 79 */The standard routines in this system perform error checking of this type, and also checking for undefined results such as division by zero in the routines for solving systems of linear equations. There are also error messages for incorrectly formatted input and end-of-file conditions.

To round off the discussion of this program, note that we have seen
interactive input of vectors.
If the input file or stream is not a tty (e.g., a file, a pipeline or
a device) then it expects the input to *have the same form as the
output for each of the data structures*.
Each of the input routines (`v_input()`, `m_input()`,
`px_input()`) skips over ``comments'' in the input data, as do the
macros `input()` and `finput()`.
Anything from a `#' to the end of the line (or EOF) is considered to
be a comment.
For example, the initial value problem could be set up in a file
`ivp.dat` as:

# Initial time 0 # Final time 1 # Solution is x(t) = (cos(t),-sin(t)) # x(0) = Vector: dim: 2 1 0 # Step size 0.1The output of the above program with the above input (from a file) gives essentially the same output as shown above, except that no prompts are sent to the screen.

#include "matrix.h" /* rk4 -- 4th order Runge--Kutta method */ double rk4(f,t,x,h) double t, h; VEC *(*f)(), *x; { static VEC *v1, *v2, *v3, *v4, *temp; /* do not work with NULL initial vector */ if ( x == VNULL ) error(E_NULL,"rk4"); /* ensure that v1, v2 etc. are of the correct size */ v_resize_vars(x->dim,&v1,&v2,&v3,&v4,&temp,NULL); /* register workspace variables */ mem_stat_reg_vars(0,TYPE_VEC,&v1,&v2,&v3,&v4,&temp,NULL); /* end of memory allocation */ (*f)(t,x,v1); v_mltadd(x,v1,0.5*h,temp); (*f)(t+0.5*h,temp,v2); v_mltadd(x,v2,0.5*h,temp); (*f)(t+0.5*h,temp,v3); v_mltadd(x,v3,h,temp); (*f)(t+h,temp,v4); /* now add: temp = v1+2*v2+2*v3+v4 */ v_linlist(temp,v1,1.0,v2,2.0,v3,2.0,v4,1.0,VNULL) /* adjust x */ v_mltadd(x,temp,h/6.0,x); /* x = x+(h/6)*temp */ return t+h; /* return the new time */ }

*Ax approx= b,*

for *x* where *A* is an *m x n* matrix (*m>n*) we
really need to solve the optimisation problem

min_*x ||Ax-b||_2^2.*
If we write *A=QR* where *Q* is an orthogonal *m x m* matrix and
*R* is an upper triangular *m x n* matrix then

||Ax-b||_2=||Rx-Q^T b||_2 = ||[R_1]b [Q_1^T]b|| ||[ O ] - [Q_2^T] ||_2

where *R_1* is an *n x n* upper triangular matrix.
If *A* has full rank then *R_1* will be an invertible matrix,
and the best least squares solution of *Ax approx= b* is
*x=inverse(R_1) Q_1^T b*.
These calculations can be be done quite easily as there is a
`QRfactor()` function available with the system.
`QRfactor()` is declared to have the prototype

MAT *QRfactor(MAT *A, VEC *diag);The matrix

#include "matrix2.h" main() { MAT *A, *QR; VEC *b, *x, *diag; /* read in A matrix */ printf("Input A matrix:\n"); A = m_input(MNULL); /* A has whatever size is input */

if ( A->m < A->n ) { printf("Need m >= n to obtain least squares fit\n"); exit(0); } printf("# A =\n"); m_output(A); diag = v_get(A->m); /* QR is to be the QR factorisation of A */ QR = m_copy(A,MNULL); QRfactor(QR,diag); /* read in b vector */ printf("Input b vector:\n"); b = v_get(A->m); b = v_input(b); printf("# b =\n"); v_output(b); /* solve for x */ x = QRsolve(QR,diag,b,VNULL); printf("Vector of best fit parameters is\n"); v_output(x); /* ... and work out norm of errors... */ printf("||A*x-b|| = v_norm2(v_sub(mv_mlt(A,x,VNULL),b,VNULL))); }Note that as well as the usual memory allocation functions like

mv_mlt(MAT *A, VEC *x, VEC *out). and also vector--matrix multiplication (with the vector on the left):

vm_mlt(MAT *A, VEC *x, VEC *out), with

cc -o leastsq leastsq.c meschach.a -lmA sample session using

Input A matrix: Matrix: rows cols:5 3 row 0: entry (0,0): 3 entry (0,1): -1 entry (0,2): 2 Continue: row 1: entry (1,0): 2 entry (1,1): -1 entry (1,2): 1 Continue: n row 1: entry (1,0): old 2 new: 2 entry (1,1): old -1 new: -1 entry (1,2): old 1 new: 1.2 Continue: row 2: entry (2,0): old 0 new: 2.5 .... .... (Data entry) .... # A = Matrix: 5 by 3 row 0: 3 -1 2 row 1: 2 -1 1.2 row 2: 2.5 1 -1.5 row 3: 3 1 1 row 4: -1 1 -2.2 Input b vector: entry 0: old 0 new: 5 entry 1: old 0 new: 3 entry 2: old 0 new: 2 entry 3: old 0 new: 4 entry 4: old 0 new: 6 # b = Vector: dim: 5 5 3 2 4 6 Vector of best fit parameters is Vector: dim: 3 1.47241555 -0.402817858 -1.14411815 ||A*x-b|| = 6.78938The

*u_{i,j+1}+u_{i,j-1}+u_{i+1,j}+u_{i-1,j}-4u_{ij}=
h^2 f(x_i,y_j)*,

for *i,j=1,...,N*
where *u_{0,j}=u_{i,0}=u_{N+1,j}=u_{i,N+1}=0* for *i,j=1,...,N* and
*h* is the common distance between grid points.
The first task is to set up the matrix describing this system of
linear equations.
The next is to set up the right-hand side.
The third is to form the incomplete Cholesky factorisation of this
matrix, and finally to use the sparse matrix conjugate gradient
routine with the incomplete Cholesky factorisation as preconditioner.
Setting up the matrix and right-hand side can be done by the following
code:

#define N 100 #define index(i,j) (N*((i)-1)+(j)-1) ...... A = sp_get(N*N,N*N,5); b = v_get(N*N); h = 1.0/(N+1); /* for a unit square */ ......

for ( i = 1; i <= N; i++ ) for ( j = 1; j <= N; j++ ) { if ( i < N ) sp_set_val(A,index(i,j),index(i+1,j),-1.0); if ( i > 1 ) sp_set_val(A,index(i,j),index(i-1,j),-1.0); if ( j < N ) sp_set_val(A,index(i,j),index(i,j+1),-1.0); if ( j > 1 ) sp_set_val(A,index(i,j),index(i,j-1),-1.0); sp_set_val(A,index(i,j),index(i,j),4.0); b->ve[index(i,j)] = -h*h*f(h*i,h*j); }Once the matrix and right-hand side are set up, the next task is to compute the sparse incomplete Cholesky factorisation of

LLT = sp_copy(A); spICHfactor(LLT);Now when that is done, the remainder is easy:

out = v_get(A->m); ...... iter_spcg(A,LLT,b,1e-6,out,1000,&num_steps); printf("Number of iterations = ......and the output can be used in whatever way desired. For graphical output of the results, the solution vector can be copied into a square matrix, which is then saved in MATLAB(TM) format using

VEC *x, *b; MAT *A, *LU; PERM *pivot; ...... LU = m_get(A->m,A->n); LU = m_copy(A,LU); pivot = px_get(A->m); LUfactor(LU,pivot); /* set values of b here */ x = LUsolve(LU,pivot,b,VNULL);

MAT *A, *QR; VEC *diag, *b, *x; ...... QR = m_get(A->m,A->n); QR = m_copy(A,QR); diag = v_get(A->n); QRfactor(QR,diag); /* set values of b here */ x = QRsolve(QR,diag,b,x);

/* A is the matrix whose e-vals and e-vecs are sought */ MAT *A, *T, *Q, *X_re, *X_im; VEC *evals_re, *evals_im; ...... Q = m_get(A->m,A->n); T = m_copy(A,MNULL); /* compute Schur form: A = Q.T.Q^T */ schur(T,Q); /* extract eigenvalues */ evals_re = v_get(A->m); evals_im = v_get(A->m); schur_evals(T,evals_re,evals_im); /* Q not needed for eiegenvalues */ X_re = m_get(A->m,A->n); X_im = m_get(A->m,A->n); schur_vecs(T,Q,X_re,X_im); /* k'th eigenvector is k'th column of (X_re + i*X_im) */

/* A.x == b is the system to be solved */ sp_mat *A, *LLT; VEC *x, *b; int num_steps; ...... /* set up A and b */ ...... x = m_get(A->m); LLT = sp_copy(A); /* preconditioning using incomplete Cholesky */ spICHfactor(LLT); /* now use pre-conditioned conjugate gradients */ x = iter_spcg(A,LLT,b,1e-7,x,1000,&num_steps); /* solution computed with relative residual < 10^{-7} */If explicitly storing such a matrix takes up too much memory, then if you can write a routine to perform the calculation of

VEC *mult_routine(user_def,x,out) void *user_def; VEC *x, *out; { /* compute out = A*x */ ...... return out; }

main() { ITER *ip; VEC *x, *b; ...... b = v_get(BIG_DIM); /* right-hand side */ x = v_get(BIG_DIM); /* solution */ /* set up b */ ...... ip = iter_get(b->dim, x->dim); ip->rhs = v_copy(b,ip->rhs); ip->info = NULL; /* if you don't want information about solution process */ v_zero(ip->x); /* initial guess is zero */ iter_Ax(ip,mult_routine,user_def); iter_cg(ip); printf("# Solution is:\n"); v_output(ip->x); ...... ITER_FREE(ip); /* destroy ip */ }The