/* File: minsplay.c Author: Douglas Jones, Dept. of Comp. Sci., U. of Iowa, Iowa City, IA 52242. Date: Nov. 1. 1990 (major revision of the Splay data compression utility written by Jeffrey Chilton and Douglas Jones, as revised by Jung Joowon in the summer of 1990) Copyright: This material is derived from code Copyrighted 1988 by Jeffrey Chilton and Douglas Jones. That code contained a copyright notice allowing copying for personal or research purposes, so long as copies of the code were not sold for direct commercial advantage. This version of the code has been stripped of most of the material added by Jeff Chilton, and this release of the code may be used or copied for any purpose, public or private. Patents: The algorithm central to this code is entirely the invention of Douglas Jones, and it has not been patented. Any patents claiming to cover this material are invalid. Exportability: Splay-tree based compression algorithms may be used for cryptography, and when used as such, they may not be exported from the United States without appropriate approval. All cryptographic features of the original version of this code have been removed. Language: C Purpose: Data compression program, a companion to minunsplay.c Algorithm: Uses a splay-tree based prefix code. For a full understanding of the operation of this data compression scheme, refer to the paper "Applications of Splay Trees to Data Compression" by Douglas W. Jones in Communications of the ACM, Aug. 1988, pages 996-1007. Operation: This program takes a stream of bytes from standard input, and delivers a compressed stream of bytes to standard output. Internally, all I/O is done with calls to putchar and getchar. All calls to getc are in the main program, and can easily be replaced with code to obtain data from other sources. Putchar is called in the main program to output a prefix to the compressed code, and it is called in the compress and compfini routines to output the compressed byte sequence. Warning: This code has been highly optimized in the compress procedure! the intent of the optimization is that each line should correspond to roughly one machine instruction on typical register-memory machines. Additionally, the code has been arranged to minimize the number of branches and maximize the amount of straight-line code; this allows pipelined processors to get the most out of it */ #include /* The magic numbers are a two byte prefix on the stream of compressed data. This prefix is applied by the main program, and it serves to distinguish files compressed this way from files produced by other software. The particular prefix used here is compatible with the splay-compress UNIX data compression utility. In other contexts, there is no reason to use this particular prefix or any prefix at all. */ #define MAGIC1 0x93 /* ^S with the high bit set */ #define MAGIC2 0x10 /* ^P */ /* The following constants depend on the size of the alphabet being compressed, and are used to dimension the arrays that hold the tree used for compression and to mark key elements in these trees. Throughout, it is assumed that the alphabet of source characters is contiguous and runs from zero to some maximum. */ #define MAXCHAR 256 /* number of distinct character codes in alphabet */ #define TWICEMAX 513 /* 2 * MAXCHAR + 1 */ #define ROOT 0 /* the root of the tree in left[0],right[0],up[0] */ /* Throughout, it is assumed that bytes are 8 bits and words are 16 bits; these types are defined here */ #define BYTE char #define WORD short int /* prefix macro - every reference to the arrays left, right and up is prefixed with this. The effect is to make the array an array of 16 bit integers, but with cells at offsets 0 2 4 6 ... instead of the usual 0 1 2 3 ... ; as a result, no multiply by two is required in the array subscript computation. The point of this is to allow simple indexed addressing, with the array name as an index constant and the array subscript as an index register. */ #define prefix *(WORD *) & /* global variables that hold the tree. These are logically arrays of 16 bit words, but the storage is deliberately allocated in bytes. These must not be used for any other purpose between a call to compinit and compfini. These arrays are used to store the nodes of the tree used during compression. Node i of the tree is stored in left[i*2], right[i*2] and up[i*2]. Nodes from the root (node 0) to node MAXCHAR-1 are internal nodes of the tree, and nodes MAXCHAR to TWICEMAX are leaves. For an alphabet of MAXCHAR characters, this allows one extra character to be used as an end-of-file mark. */ BYTE left [MAXCHAR * 2]; /* left[i] holds left child of node i */ BYTE right [MAXCHAR * 2]; /* right[i] holds right child of node i */ BYTE up [TWICEMAX * 2]; /* up[i] holds parent of node i */ /* data structures used to compress pack successive compressed bits into bytes. These must not be used for any other purpose between a call to compinit and compfini. */ #define MAXBITCNT 8 /* bits per byte */ WORD bitbuf; /* buffer holding at least 8 bits */ WORD bitcnt; /* count of unused bits in bitbuf, values 0 to 8 */ #define MAXBYTES 8 /* bytes per frame */ BYTE bytebuf [MAXBYTES]; /* buffer holding bytes awaiting transmission */ BYTE *bytemax; /* address of byte beyond last byte in buffer */ BYTE *byteptr; /* next position in bytebuf to be filled */ /* data structure used to reverse the order of bits within compress. Outside of compress, this storage area may be freely used for other purposes. The only reason it is statically allocated is to allow fast addressing. On the average, only a few bits of the stack will be used, but there is a very small chance that the tree will become completely unbalanced, in which case, a tree of MAXCHAR + 1 leaves will require a stack depth of MAXCHAR */ BYTE stack[MAXCHAR]; /* begin procedure to put bytebuf out */ sendbuf() { /* The following loop is a dumb model of some kind of data transmission interface; this is not to be taken as a model of any sensible way of sending the buffer. In a production system, multiple buffering of some form would almost certainly be used, where each call to this procedure initiates the transmission of one buffer while changing byteptr to use some other buffer. */ { WORD i; for (i = 0; &bytebuf[i] < byteptr; i++) { putchar(bytebuf[i]); }} byteptr = &bytebuf[0]; bytemax = &bytebuf[MAXBYTES]; return; } /* end procedure to put bytebuf out */ /* begin procedure to put a byte in bytebuf */ putbyte(b) BYTE b; { *byteptr = b; ++byteptr; if (byteptr == bytemax) { /* the buffer is full, send it */ sendbuf(); } } /* end procedure to put a byte in bytebuf */ /* begin build initial splay tree and initialize for compression */ compinit() { /* Note: Fast versions of compinit may be built by statically constructing the initial splay tree in ROM and then doing a block copy operation from ROM to RAM. */ WORD i,j; /* loop control variables used to build the tree */ /* first, make each node point to its parent */ for (i = 4; i <= TWICEMAX * 2; i += 2) { prefix up[i-2] = ((i>>2)<<1)-2; } /* then, make each node point to its two children */ for (j = 2; j <= MAXCHAR * 2; j += 2) { prefix left[j-2] = (j << 1)-2; prefix right[j-2] = ((j + 1) << 1)-2; } bitcnt = MAXBITCNT; /* no bits have been accumulated */ byteptr = &bytebuf[0]; bytemax = &bytebuf[MAXBYTES]; } /* end build initial splay tree and initialize for compression */ /* begin compress function */ compress(plain) WORD plain; { register BYTE *sp; /* stack pointer used to reverse bit order */ register WORD a, b; /* children of nodes to semi-rotate */ register WORD c, d; /* pair of nodes to semi-rotate */ a = plain + MAXCHAR; a <<= 1; sp = &stack[0]; /* the following loop walks up the tree semi-rotating pairs of nodes and pushing bits on the stack. It is written as an infinite loop with break and continue statements at the end of each arm of the nested conditionals in the body to make it loop. In fact, it could be coded as a do { } while loop that breaks when the root is reached. */ for (;;) { c = prefix up[a]; if (a == prefix left[c]) { /* a left son of c */ *sp = 0; sp++; if (c == ROOT) break; d = prefix up[c]; b = prefix left[d]; if (c == b) { /* c is left son of d */ *sp = 0; sp++; b = prefix right[d]; prefix right[d] = a; prefix left[c] = b; prefix up[b] = c; prefix up[a] = d; a = d; if (a != ROOT) continue; break; /* loop exit! */ } else { /* a left son of c right son of d */ *sp = 1; sp++; prefix left[d] = a; prefix left[c] = b; prefix up[b] = c; prefix up[a] = d; a = d; if (a != ROOT) continue; break; /* loop exit! */ } /* control flow never reaches this point */ } else { /* a right son of c */ *sp = 1; sp++; if (c == ROOT) break; d = prefix up[c]; b = prefix left[d]; if (c == b) { /* c is left son of d */ *sp = 0; sp++; b = prefix right[d]; prefix right[d] = a; prefix right[c] = b; prefix up[b] = c; prefix up[a] = d; a = d; if (a != ROOT) continue; break; /* loop exit! */ } else { /* a right son of c right son of d */ *sp = 1; sp++; prefix left[d] = a; prefix right[c] = b; prefix up[b] = c; prefix up[a] = d; a = d; if (a != ROOT) continue; break; /* loop exit! */ } } } /* control flow never reaches this point */ /* all break statements in the above code branch to this point */ /* the following loop could be coded as do {} while (sp != &stack[0]); but it has been optimized so the loop continuation is from within each arm of the if statement and the function return is direct from inside the loop */ for (;;) { /* pop the stack and transmit the bits */ --sp; bitbuf <<= 1; bitbuf |= *sp; --bitcnt; if (bitcnt != 0) { /* normal case, bitbuf not full */ if (sp != &stack[0]) continue; return; } else { /* abnormal case, bitbuf full, transmit it */ /* this directly puts characters in the bytebuffer instead of calling putbyte to avoid the cost of a procedure call in the normal case */ *byteptr = (char)(bitbuf & 0xff); ++byteptr; if (byteptr != bytemax) { /* normal, bytebuf not full */ bitcnt = MAXBITCNT; if (sp != &stack[0]) continue; return; } else { /* abnormal case, bytebuf full, transmit it */ sendbuf(); bitcnt = MAXBITCNT; if (sp != &stack[0]) continue; return; } } } } /* control flow never reaches this point */ /* end compress function */ /* begin flush the buffers */ compfini() { if (bitcnt != MAXBITCNT) { /* there are buffered bits */ do { /* shift the bits to the left */ bitbuf <<= 1; --bitcnt; } while (bitcnt != 0); *byteptr = (char)(bitbuf & 0xff); ++byteptr; } if (byteptr != (bytemax-MAXBYTES)) { /* there is a partial buffer */ /* This code is entirely dependent on the data transmission interface. */ sendbuf(); } } /* end flush the buffers */ /* THE MAIN PROGRAM */ main(argc, argv) int argc; char *argv[]; { WORD plain; compinit(); /* output the magic number required by the UNIX unsplay utility */ putbyte((char)MAGIC1); putbyte((char)MAGIC2); putbyte((char)1); /* begin compress the input stream */ for (;;) { if (((plain = getchar()) == EOF) && feof(stdin)) break; compress((WORD)(plain & 0xff)); } compress(MAXCHAR); /* end compress the input stream */ compfini(); exit(0); }