Parsing nested structures with yacc

amoe <amoebae@gmail.com>
Thu, 7 Aug 2008 09:59:21 -0700 (PDT)

          From comp.compilers

Related articles
Parsing nested structures with yacc amoebae@gmail.com (amoe) (2008-08-07)
Re: Parsing nested structures with yacc cfc@shell01.TheWorld.com (Chris F Clark) (2008-08-08)
Re: Parsing nested structures with yacc max@gustavus.edu (Max Hailperin) (2008-08-09)
Re: Parsing nested structures with yacc kamalpr@hp.com (kamal) (2008-08-11)
Re: Parsing nested structures with yacc amoebae@gmail.com (amoe) (2008-08-19)
Re: Parsing nested structures with yacc max@gustavus.edu (Max Hailperin) (2008-08-20)
| List of all articles for this month |

From: amoe <amoebae@gmail.com>
Newsgroups: comp.compilers
Date: Thu, 7 Aug 2008 09:59:21 -0700 (PDT)
Organization: Compilers Central
Keywords: parse, question
Posted-Date: 07 Aug 2008 18:55:50 EDT

I'm trying to write a basic parser for S-expressions, basic list
structures used in Lisp. This is mainly to help me understand Lex and
Yacc. Since Yacc is meant to be good for recursive balanced
structures, I thought it would be an easy project. I did find it easy
to write a grammar that recognizes the correct form for an
S-expression, however, I'm having a massive mental block when it comes
to putting the data into a form in which it can be manipulated.
Basically I want to convert the list syntax into an in-memory
equivalent, using pairs (basically a linked list).


I've managed to parse a flat list into a standard linked list, but I'm
having problems seeing how I can extend this to do arbitrarily nested
lists. The only way I can think is to maintain my own stack, which
seems to defeat the point of using yacc. There must be a way to
leverage yacc's own recursion to build the data structures, but I just
can't think of one.


Here's my lexer (change yylval.v to yylval.s to work with the second
parser):


/* list lexer */
%{
#include "list.tab.h"
%}


%%
\( return OPEN;
\) return CLOSE;
[a-z]+ {
        yylval.v = yytext; /* for identifier */
        return SYMBOL;
}
[[:space:]] /* silently eat */
%%




I have two attempts at the parser; one has the proper types for how
I'd want the
end program to look, but I can't figure out how to get it into a
working state;
the other works, but only does half the job (only builds a flat non-
delimited
list).


Here's the first. At the moment the grammar recognizes just single
pairs, but it
doesn't work because it consumes both identifiers for each match to
the 'pair'
rule; I would want it to consume the first, then match again with the
second
identifier now becoming the first. In this way it might be possible
to build
the linked list as we walked down the list.




/* list parser */
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


enum type {
        TYPE_SYMBOL, TYPE_PAIR
};


struct object {
        enum type t;
        union {
                char *symbol;
                struct pair *pair;
        };
};


struct pair {
        struct object *car;
        struct object *cdr;
};




struct object *symbol(char *value);
struct object *pair(struct object *car, struct object *cdr);
void write(struct object *obj);
void *xmalloc(size_t size);
void fatal();
%}


%union {
        char *v;
        struct object *o;
}


%token OPEN CLOSE INVALID
%token <v> SYMBOL
%type <o> identifier pair


%%


input: /* empty */
              | input pair {
        printf("writing pair:\n");
        write($2);
        printf("\n");
}


pair: identifier identifier {
        printf("pair read\n");
        printf("$1 = '%s'\n", $1->symbol);
        printf("$2 = '%s'\n", $2->symbol);
        $$ = pair($1, $2);
}


identifier: SYMBOL {
        printf("creating symbol: %s\n", $1);
        $$ = symbol($1);
}




%%


struct object *symbol(char *value) {
        struct object *ret;
        ret = xmalloc(sizeof(struct object));


        ret->t = TYPE_SYMBOL;
        ret->symbol = strdup(value);


        return ret;
}


struct object *pair(struct object *car, struct object *cdr) {
        struct object *ret;


        ret = xmalloc(sizeof(struct object));


        ret->t = TYPE_PAIR;
        ret->pair = xmalloc(sizeof(struct pair));


        ret->pair->car = car;
        ret->pair->cdr = cdr;


        return ret;
}


void write(struct object *obj) {
        switch (obj->t) {
                case TYPE_SYMBOL:
                        printf("%s", obj->symbol);
                        break;
                case TYPE_PAIR:
                        printf("(");
                        write(obj->pair->car);
                        printf(" . ");
                        write(obj->pair->cdr);
                        printf(")");
                        break;
                default:
                        fatal();
        }
}


void *xmalloc(size_t size) {
        void *ret = malloc(size);
        if (ret == NULL) fatal();
        return ret;
}


void fatal() {
        perror("parser");
        exit(1);
}




Here's the second. As you can see, the 'current' node is global, so
it can only
ever support a flat list.




/* list parser */
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


struct node {
        char *item;
        struct node *next;
};


struct node *first;
struct node *current;


void walk(struct node *root);
void *xmalloc(size_t size);
void fatal();
%}


%union {
        char *s;
}


%token <s> SYMBOL


%%


input: /* empty */
        | input SYMBOL {
        printf("encountered symbol: %s\n", $2);


        if (first == NULL) {
                puts("mallocing first node");
                first = xmalloc(sizeof(struct node));
                first->item = strdup($2);
        }


        if (current == NULL) {
                puts("setting current = first");
                current = first;
        } else {
                struct node *n = xmalloc(sizeof(struct node));
                n->item = strdup($2);
                n->next = NULL; // for the moment


                current->next = n;
                current = n;
        }
}


%%


int main() {
        yyparse();
        walk(first);
}


void walk(struct node *root) {
        puts("walking linked list");


        do printf("item: %s\n", root->item);
        while (root = root->next);
}


void *xmalloc(size_t size) {
        void *ret = malloc(size);
        if (ret == NULL) fatal();
        return ret;
}


void fatal() {
        perror("parser");
        exit(1);
}




And this is the action-free grammar that seems to reliably match the
expressions I want to parse, but I can't seem to retrofit the
structure building onto it:




input: /* empty */
              | input list


list: OPEN expressions CLOSE


expressions: /* empty */
                          | expressions expression;


expression: identifier | list


identifier: SYMBOL




(I'm using the default main() that gets used when compiling with -ly -
ll.)


I apologize for the length of this post. I'd be very grateful for a
hint.


Thanks, David


Post a followup to this message

Return to the comp.compilers page.
Search the comp.compilers archives again.