/* startstop.c */ /* read BNF from standard input, find start set and follow set for each nonterminal */ /* written by Douglas Jones, March 1990 in Pascal, based on pieces of cruncher, rewritten in C, Jan 2007 */ /* NOTE: This program was developed as 'hack work' and as such it is not a polished product. Code in it was written and then massaged with no attention to clarity until it worked, and then left that way. Computational efficiency was completely ignored, and recursive structures have been used for such applications as linear searching when iterative solutions could have been used. */ /* INPUT FORMAT: Terminal symbols can be quoted as '+' and nonterminals can be bracketed as , although in the processing does not depend on this -- nonterminals are those that appear on the left-hand side of production rules, terminals do not. Rules are written as follows: | | ::= 'b' 'd' | Indenting indicated continuation. Multiple rules that reduce to the same nonterminal may be written using the vertical bar separator as: | | ::= 'b' | | 'd' | The distinguished symbol from which the grammar hangs must be documented, since the output of the processing is done by traversing the grammar starting at the distinguished symbol as the root: | |> | */ /* OUTPUT FORMAT: The grammare is pretty printed with each nonterminal listed once, on the left-hand side of a vertical bar separated list of production rules. A single nonterminal might be documented as follows -- the symbol <<< introduces the start set of the nonterminal, while the symbol >>> introduces the follow set: | <<< '+' '-' | >>> ')' | ::= 'b' | | 'd' */ #include #include /* LIMIT ON NUMBER OF NONTERMINAL CHARACTERS */ #define STRINGLIMIT 10000 /* LIMIT ON SIZE OF ANY SYMBOL (T OR NT) */ #define SYMLEN 50 /* BASIC CONSTANTS */ #define TAB '\t' #define BOOLEAN int #define TRUE 1 #define FALSE 0 /* DISPLACEMENT INTO STRING TABLE OF A STRING (0 .. STRINGLIMIT) */ #define STRINGPT int /* TYPES OF STRINGS */ #define STYPE int #define ISEMPTY 0 #define CANBEEMPTY 1 #define TOUCHED 2 #define UNTOUCHED 3 #define STYPES 4 /* POINTER TYPES */ #define PSYMBOL struct symbol * #define PPRODUCTION struct production * #define PELEMENT struct element * struct symbol { /* HAS LIST OF PRODUCTIONS ASSOCIATED WITH SYMBOL */ STRINGPT name; PSYMBOL next; /* POINTS TO NEXT ITEM IN LIST */ PPRODUCTION data; /* POINTS TO THE CONTENTS OF THE NODE */ STYPE state; PELEMENT starter; /* LIST OF TERMINALS SYM MAY START WITH */ PELEMENT ender; /* LIST OF TERMINALS SYM MAY BE FOLLOWED BY */ }; struct production { /* HAS LIST OF ELEMENTS MAKING UP RULE */ PPRODUCTION next; PELEMENT data; STYPE state; PELEMENT starter; /* LIST OF TERMINALS PROD MAY START WITH */ PELEMENT ender; /* LIST OF TERMINALS PROD MAY BE FOLLOWED BY */ }; struct element { /* USED TO MAKE LISTS OF SYMBOLS */ PELEMENT next; PSYMBOL data; STYPE state; }; /* ALLOCATORS */ #define NEWSYMBOL (PSYMBOL) malloc(sizeof(struct symbol)) #define NEWPRODUCTION (PPRODUCTION)malloc(sizeof(struct production)) #define NEWELEMENT (PELEMENT)malloc(sizeof(struct element)) char stringtab[STRINGLIMIT]; STRINGPT stringlim; /* FREE POINTER, 0 INITIALLY */ PSYMBOL symlist; /* HEAD OF MAIN LIST OF TERMINALS AND NONTERM */ PSYMBOL emptypt; /* IDENTITY OF EMPTY SYMBOL */ PSYMBOL head; /* IDENTIFIES DISTINGUISHED SYMBOL OF GRAMMAR */ void writenam(PSYMBOL s) /* output symbol to standard error */ { STRINGPT x,y; x = s->name; y = x + (int) stringtab[x]; for (x = x+1; x <= y; x++) putc(stringtab[x], stderr); } /* * formatting package for output of grammar to standard output */ /* pretty printing stuff */ int lpos; /* position on line */ int ltab; /* tab position on line */ /* statistics about grammar */ int terminals, nonterms, rules, symbols; /* element lists */ PELEMENT list; /* NONTERMINALS TO BE PRINTED */ PELEMENT tlist; /* TERMINALS TO BE PRINTED */ PELEMENT sublist; void writesym(PSYMBOL l) { STRINGPT x,y; x = l->name; y = (int) stringtab[x]; lpos = lpos + y; y = x + y; for (x = x+1; x <= y; x++) putc(stringtab[x], stdout); } void putlist(PELEMENT * e, PELEMENT e1) /* add e1 to end of list e */ { if (*e == NULL) { *e = e1; } else { PELEMENT p = *e; while (p->next != NULL) p = p->next; p->next = e1; } } void writelist(PELEMENT l) { PELEMENT ep; rules = rules + 1; while (l != NULL) { int i; i = (int) stringtab[l->data->name]; if ((lpos+i) > 75) { putc('\n', stdout); for (i = 0; i <= ltab; i++) putc(' ', stdout); putc(' ', stdout); putc(' ', stdout); putc(' ', stdout); putc(' ', stdout); lpos = ltab + 5; } writesym(l->data); putc(' ', stdout); lpos = lpos + 1; if (l->data->state == UNTOUCHED){ ep = NEWELEMENT; ep->next = NULL; ep->data = l->data; symbols = symbols + 1; l->data->state = TOUCHED; if (l->data->data == NULL) { /* TERMINAL */ putlist(&tlist, ep); } else { /* NONTERMINAL */ putlist(&sublist, ep); } } l = l->next; } } void dumbwrite(PELEMENT l) { int i; while (l != NULL) { i = stringtab[l->data->name]; if ((lpos+i) > 75) { putc('\n', stdout); for (i=0; i<=ltab; i++) putc(' ', stdout); putc(' ', stdout); putc(' ', stdout); putc(' ', stdout); putc(' ', stdout); lpos = ltab + 5; } writesym(l->data); putc(' ', stdout); lpos = ltab + 1; l = l->next; } } void writeprod(PELEMENT l) { PPRODUCTION p; int i; sublist = NULL; putc('\n', stdout); putc('\n', stdout); lpos = 0; writesym(l->data); ltab = lpos; putc(' ', stdout); putc('<', stdout); putc('<', stdout); putc('<', stdout); putc(' ', stdout); dumbwrite(l->data->starter); putc('\n', stdout); lpos = 0; writesym(l->data); ltab = lpos; putc(' ', stdout); putc('>', stdout); putc('>', stdout); putc('>', stdout); putc(' ', stdout); dumbwrite(l->data->ender); putc('\n', stdout); lpos = 0; writesym(l->data); ltab = lpos; l->data->state = TOUCHED; p = l->data->data; putc(' ', stdout); putc(':', stdout); putc(':', stdout); putc('=', stdout); putc(' ', stdout); lpos = lpos + 5; while (p != NULL) { writelist(p->data); p = p->next; if (p != NULL) { putc('\n', stdout); for (i = 0; i <= ltab; i++) putc(' ', stdout); putc(' ', stdout); putc('|', stdout); putc(' ', stdout); putc(' ', stdout); lpos = ltab + 5; } } putlist(&sublist, list); list = sublist; } void writegram() { PSYMBOL s; PELEMENT l; fputs(" WRITING MODIFIED GRAMMAR\n", stderr); symbols = 1; rules = 0; terminals = 0; nonterms = 0; list = NULL; tlist = NULL; s = symlist; while (s != NULL) { /* mark all symbols as untouched */ s->state = UNTOUCHED; s = s->next; } putc('>', stdout); writesym(head); if (emptypt != NULL) { /* output name of empty symbol */ putc('\n', stdout); putc('/', stdout); writesym(emptypt); } list = NEWELEMENT; list->next = NULL; list->data = head; while (list != NULL) { nonterms = nonterms + 1; l = list; list = list->next; writeprod(l); free(l); } while (tlist != NULL) { terminals = terminals + 1; l = tlist; tlist = l->next; free(l); } putc('\n', stdout); fputs(" nonterminal symbols =", stderr); fprintf(stderr, "%d\n", nonterms); fputs(" terminal symbols =", stderr); fprintf(stderr, "%d\n", terminals); fputs(" symbols (TOTAL) =", stderr); fprintf(stderr, "%d\n", symbols); fputs(" production rules =", stderr); fprintf(stderr, "%d\n", rules); s = symlist; symbols = 0; while (s != NULL) { if (s->state == UNTOUCHED) { symbols = symbols + 1; } s = s->next; } fputs(" unreferenced symbols=", stderr); fprintf(stderr, "%d\n", symbols); } /* * input grammar from standard input */ int ch; /* most recent char read from stdin, initially blank */ char str[SYMLEN+1]; /* most recent symbol from stdin */ int strend; /* index of last used space in str */ PSYMBOL lookup(PSYMBOL * link) /* lookup a symbol in the symbol list */ { int j,k; PSYMBOL l; PSYMBOL m; if ((*link) == NULL) { symbols = symbols + 1; *link = NEWSYMBOL; (*link)->name = stringlim; (*link)->next = NULL; (*link)->data = NULL; for (j = 0; j <= strend; j++) { stringtab[stringlim] = str[j]; stringlim = stringlim+1; if (stringlim > STRINGLIMIT) { fputs(" >>STRING POOL OVERVLOW<<\n", stderr); } } return *link; } else { l = *link; do { m = l; j = 0; k = l->name; while (str[j] == stringtab[k]) { j = j+1; k = k+1; } l = l->next; } while ( (j <= strend) && (l != NULL) ); if (j > strend) { return m; } else { return lookup( &(m->next) /* KNOWN TO BE NULL */ ); } } } PSYMBOL getsymbol() /* get symbol from input to str */ { /* MUST BE CALLED WITH CH NONBLANK, FIRST CHAR OF SYMBOL */ strend = 1; str[strend] = ch; if (ch == '<') { ch = getchar(); if ((ch != EOF) && (ch != '\n') && (ch != TAB) && (ch != ' ')) { strend = 2; str[strend] = ch; ch = getchar(); if ((ch != EOF) && (ch != '\n') && (ch != TAB) && (ch != ' ')) { strend = 3; str[ strend ] = ch; if ( ((ch <= 'z') && (ch >= 'a')) || ((ch <= 'Z') && (ch >= 'A')) || ((ch <= '9') && (ch >= '0')) ) { do { ch = getchar(); if (ch == '\n') break; if (ch == EOF) break; if (strend >= SYMLEN) { fputs( " >>SYMBOL TOO LONG<<\n" ,stderr); } else { strend = strend + 1; str[strend] = ch; } } while (ch != '>'); } } } if (ch == '>') ch = getchar(); } else { ch = getchar(); while ((ch != ' ') && (ch != TAB) && (ch != '\n') && (ch != EOF )) { if (strend >= SYMLEN) { fputs(" >>SYMBOL TOO LONG<<\n", stderr); } else { strend = strend + 1; str[strend] = ch; } ch = getchar(); } } str[0] = strend; return lookup(&symlist); } BOOLEAN endlist; /* set by nonblank at end of list */ BOOLEAN endrule; /* set by nonblank at end of rule */ void nonblank() /* scan for a nonblank character in ch */ { while ((ch == '|') || (ch == ' ') || (ch == TAB) || (ch == '\n') || (ch == EOF)) { if (ch =='|') { endlist = TRUE; ch = getchar(); return; } else if (ch == EOF) { endrule = TRUE; endlist = TRUE; return; } else if (ch == '\n') { ch = getchar(); if ((ch != ' ') && (ch != TAB)) { /* line starts with nonblank */ endrule = TRUE; endlist = TRUE; return; } } else { /* must have been blank or tab */ ch = getchar(); } } return; } PELEMENT getsymlist() /* get the list of symbols on RHS of rule */ { PELEMENT s; nonblank(); if (endlist == TRUE) { endlist = FALSE; return NULL; } else { s = NEWELEMENT; s->data = getsymbol(); s->next = getsymlist(); return s; } } PPRODUCTION getprod() /* get a production rule */ { PPRODUCTION h; nonblank(); if (endrule == TRUE) { endrule = FALSE; endlist = FALSE; return NULL; } else { rules = rules + 1; h = NEWPRODUCTION; h->data = getsymlist(); h->next = getprod(); return h; } } void readgram() /* read grammar from stdin */ { PSYMBOL s; PPRODUCTION p; fputs(" READING GRAMMAR\n", stderr); rules = 0; symbols = 0; endlist = FALSE; endrule = FALSE; ch = getchar(); while (ch != EOF) { if (ch == '>') { /* IDENTIFY DISTINGUISHED SYMBOL */ ch = getchar(); nonblank(); head = getsymbol(); while ((ch != '\n') && (ch != EOF)) { ch = getchar(); }; if (ch == '\n') ch = getchar(); } else if (ch == '/') { /* IDENTIFY THE EMPTY (/)SYMBOL */ ch = getchar(); nonblank(); emptypt = getsymbol(); while ((ch != '\n') && (ch != EOF)) { ch = getchar(); }; if (ch == '\n') ch = getchar(); } else if (ch == '\\') { /* COMMENT */ while ((ch != '\n') && (ch != EOF)) { ch = getchar(); }; if (ch == '\n') ch = getchar(); } else { /*WE HAVE A RULE*/ s = getsymbol(); nonblank(); if (ch == ':') { ch = getchar(); if (ch == ':') { ch = getchar(); if (ch == '=') { ch = getchar(); } } else if (ch == '=') { ch = getchar(); } } else if (ch == '=') { ch = getchar(); } p = s->data; if (p == NULL) { s->data = getprod(); } else { while (p->next != NULL) p = p->next; p->next = getprod(); } } } if (head != NULL) { fputs(" distinguished symbol =", stderr); writenam(head); putc('\n', stderr); } else { fputs(" no distinguished symbol\n", stderr); } if (emptypt != NULL) { fputs(" empty symbol =", stderr); writenam(emptypt); putc('\n', stderr); } else { fputs(" no empty symbol\n", stderr); } fputs(" symbols (TOTAL) =", stderr); fprintf(stderr, "%d\n", symbols); fputs(" production rules =", stderr); fprintf(stderr, "%d\n", rules); } /* * package to find start symbols for each nonterminal */ BOOLEAN changed; /* has a symbol been added */ void addsym( PELEMENT * head, PSYMBOL s) /* add symbol s to set head */ { if (*head == NULL) { /* APPEND IT */ *head = NEWELEMENT; (*head)->next = NULL; (*head)->data = s; changed = TRUE; } else { if ((*head)->data != s) addsym(&((*head)->next), s); } } void addsymbols( PELEMENT * head, PELEMENT e ) { while (e != NULL) { addsym( head, e->data ); e = e->next; } } void dosymbol(PSYMBOL s) { PPRODUCTION p; s->starter = NULL; s->state = ISEMPTY; p = s->data; if (p == NULL) { /* TERMINAL */ addsym( &(s->starter), s ); } else { /* NONTERMINAL */ do { if (p->data->data->state == UNTOUCHED) { dosymbol( p->data->data ); } addsymbols( &(s->starter), p->data->data->starter ); p = p->next; } while (p != NULL); } s->state = TOUCHED; } void getstarts() { PSYMBOL s; s = symlist; while (s != NULL) { s->state = UNTOUCHED; s = s->next; } s = symlist; while (s != NULL) { if (s->state == UNTOUCHED) dosymbol( s ); s = s->next; } } /* * package to find stop symbols for each nonterminal */ void getstoprule( PPRODUCTION p ) { PELEMENT e; /* the current element */ PELEMENT pe; /* the previous element in this rule */ e = p->data; pe = NULL; while (e != NULL) { /* for each symbol making up rule */ if (pe != NULL) { if (pe->data->data != NULL) { /* pe is nonterminal */ if (e->data->data == NULL) { /* terminal e */ /* add e to enders of pe */ addsym( &(pe->data->ender), e->data ); } else { /* add starters of e to enders of pe */ addsymbols( &(pe->data->ender), e->data->starter ); } } } pe = e; e = e->next; } } void pushstops( PPRODUCTION p, PSYMBOL s ) /* apply stoppers of s to p */ { PELEMENT e; /* the current element */ e = p->data; while (e->next != NULL) e = e->next; if (e->data->data != NULL) { /* e is last element of rule p, e is nonterminal so add stoppers of s to stoppers of e */ addsymbols(&(e->data->ender), s->ender); } } void getstops() { PSYMBOL s; PPRODUCTION p; s = symlist; while (s != NULL) { s->state = UNTOUCHED; s->ender = NULL; s = s->next; } /* first pass: get local info on stop set from within rules */ s = symlist; while (s != NULL) { p = s->data; while (p != NULL) { /* for each rule of symbol */ getstoprule( p ); p = p->next; } s = s->next; } /* second pass: push stop set down from each symbol */ do { changed = FALSE; s = symlist; while (s != NULL) { p = s->data; while (p != NULL) { /* for each rule of symbol */ pushstops( p, s ); p = p->next; } s = s->next; } } while (changed == TRUE); } int main(int argc, int argv[]) { stringlim = 0; emptypt = NULL; symlist = NULL; head = NULL; readgram(); getstarts(); if (head != NULL) { getstops(); writegram(); } else { fputs(" >>NO DISTINGUISHED SYMBOL, CANT LIST<<\n", stderr); } }