Re: html and flex

deleveld@dds.nl
3 Feb 1999 23:57:22 -0500

          From comp.compilers

Related articles
html and flex deleveld@dds.nl (1999-01-25)
Re: html and flex rogerb@sco.COM (1999-01-27)
Re: html and flex mikee@cetasoft.cog (1999-01-27)
Re: html and flex deleveld@dds.nl (1999-02-03)
| List of all articles for this month |

From: deleveld@dds.nl
Newsgroups: comp.compilers
Date: 3 Feb 1999 23:57:22 -0500
Organization: Compilers Central
References: 99-01-093 99-01-106
Keywords: lex, WWW

On 25 Jan 1999 21:53:44 -0500, deleveld@dds.nl wrote:
> [ want advice on lexing and parsing HTML ]


mikee@cetasoft.cog (Mike Enright) wrote:
> Many browsers (both browsers?) allow broken HTML to work. Therefore to
> read those same pages, you will have to accept code that doesn't match
> the specs, like <A HREF="somewhere.html> (missing quote within a tag)
> or #include &ltstdio.h&gt; (missing semicolon after &lt). Maybe a
> lexer can be written in lex that deals with such things. My feeling is
> it would be a large grammar if it worked.


Yes, thank you for the advice. I had expected HTML to be somewhat standardized,
but looking at some pages I got from the web, I see that there is a lot of bad
HTML code around. I'll do my best to get around any mistakes.


Anyway, if anyone is interested, here is the flex file for the simple scanning
I have done so far...




%{
#include <string.h>
#include "v_html.h"


%}




/* Scanner options.
 *   noyywrap          - no yywrap() is needed; end of input ends the scan
 *   never-interactive - the input is never treated as interactive
 *   yyclass           - the generated yylex() becomes a member of HtmlLexer
 *   case-insensitive  - HTML tag and attribute names are not case-sensitive,
 *                       so <HTML>, <Body> and <A HREF=...> have to match the
 *                       lowercase literals used in the patterns below.
 *                       The text returned by YYText() keeps its original case.
 */
%option noyywrap
%option never-interactive
%option yyclass="HtmlLexer"
%option case-insensitive




/* Start conditions.
 * Both are declared with %s (inclusive), so rules that carry no start
 * condition stay active while one of these is selected.
 *   INBODY - selected by the BODY rule, left again by the OUTBODY rule
 *   INHEAD - selected by the HEAD rule, left again by the OUTHEAD rule
 */
%s INBODY
%s INHEAD




/* Basic document-structure tags.
 * Every pattern is the exact literal tag. A tag that carries attributes
 * (for example a body tag with a bgcolor attribute) therefore does not
 * match here and is picked up by UNKNOWNTAG instead.
 */
HTML "<html>"
OUTHTML "</html>"
HEAD "<head>"
OUTHEAD "</head>"
BODY "<body>"
OUTBODY "</body>"




/* Anchor tags.
 *   HYPERLINK  - anchor whose double-quoted href value does not begin with #
 *   PAGELINK   - anchor whose double-quoted href value begins with #
 *   PAGETARGET - anchor with a name attribute
 *   LINKEND    - the closing anchor tag
 * Only space characters are accepted between the tag name and the attribute
 * and around the equals sign, and the href value must open with a double
 * quote. Anchors written differently fall through to UNKNOWNTAG.
 */
HYPERLINK "<a"[ ]+"href"[ ]*"="[ ]*"\""[^#][^>]*">"
PAGELINK "<a"[ ]+"href"[ ]*"="[ ]*"\"#"[^>]*">"
PAGETARGET "<a"[ ]+"name"[^>]*">"
LINKEND "</a>"




/* Unordered-list tags: list open/close and list-item open/close.
 * Exact literal tags only; no attributes are accepted.
 */
UL "<ul>"
OUTUL "</ul>"
LI "<li>"
OUTLI "</li>"




/* Tags that influence line layout: paragraph, line break and the
 * open/close pair for preformatted text. Exact literal tags only.
 */
PARAGRAPH "<p>"
LINEBREAK "<br>"
PRE "<pre>"
OUTPRE "</pre>"


/* Catch-all for tags: everything from a < up to the next >.
 * The negated class also matches newlines, so such a tag may span lines.
 * This pattern matches every specific tag above as well; the specific ones
 * win only because their rules are listed first in the rules section.
 */
UNKNOWNTAG "<"[^>]*">"




/* Character entities that the rules section translates to plain characters.
 * Each one must include its terminating semicolon to match.
 */
NBSP "&nbsp;"
GREATERTHAN "&gt;"
LESSTHAN "&lt;"




/* Body text.
 *   WHITESPACE - a run of spaces and tabs
 *   RETURN     - a run of newlines
 *   TEXTWORD   - a run of characters that are none of: space, tab, newline,
 *                the two angle brackets, ampersand
 */
WHITESPACE [ \t]+
RETURN [\n]+
TEXTWORD [^ \t\n<>&]+




%%




{HTML} { // Basic Tag related -----------------------------------------------
    // Opening html tag: raise the InHtml flag. The HEAD, BODY and
    // UNKNOWNTAG rules test this flag before they act.


    InHtml = 1;
}




{OUTHTML} {
    // Closing html tag: the document is over.
    InHtml = 0;

    // Also fall back to INITIAL. Pages often omit the closing body or
    // head tag; without this reset the scanner would remain in INBODY
    // and keep rendering whatever trails the closing html tag.
    BEGIN(INITIAL);
}




{HEAD} {
    // The head section is entered only once an opening html tag has been
    // seen; otherwise the tag is dropped and the state stays as it is.
    if(InHtml != 0)
    {
        BEGIN(INHEAD);
    }
}




<INHEAD>{OUTHEAD} {
    // End of the head section (recognised only while in INHEAD):
    // return to the INITIAL start condition.


    BEGIN(INITIAL);
}




{BODY} {
    // The body is entered only once an opening html tag has been seen;
    // otherwise the tag is dropped and the state stays as it is.
    if(InHtml != 0)
    {
        BEGIN(INBODY);
    }
}




<INBODY>{OUTBODY} {
    // End of the body (recognised only while in INBODY):
    // return to the INITIAL start condition.


    BEGIN(INITIAL);
}




<INBODY>{UL} { // List related ----------------------------------------------
    // A list opens: go one nesting level deeper and start a fresh line.
    ++ListLevel;

    Print("\n");

    // Bookkeeping for the line that was just started.
    WidthCount = 0;
    ++LineCount;
}




<INBODY>{OUTUL} {
    // A list closes: go one nesting level up, but never below zero
    // (a stray closing tag must not push the level negative).
    if(ListLevel != 0)
    {
        --ListLevel;
    }

    Print("\n");

    // Bookkeeping for the line that was just started.
    WidthCount = 0;
    ++LineCount;
}




<INBODY>{LI} {
    // If the previous item was never closed, finish its line first.
    if(InListElem)
    {
        Print("\n");

        WidthCount = 0;
        ++LineCount;
    }
    InListElem = true;

    // Indent by one space per nesting level, tracking the width used.
    int indent = 0;
    while(indent < ListLevel)
    {
        Print(" ");
        WidthCount += Width(" ");
        ++indent;
    }

    // The bullet itself.
    Print("* ");
    WidthCount += Width("* ");
}




<INBODY>{OUTLI} {
    // A list item closes: end its line and note that no item is open.
    Print("\n");

    WidthCount = 0;
    ++LineCount;

    InListElem = false;
}




<INBODY>{LINEBREAK} { // Spacing related ------------------------------------
    // Forced line break: emit one newline and restart the column count.
    Print("\n");

    WidthCount = 0;
    ++LineCount;
}




<INBODY>{PARAGRAPH} {
    // New paragraph: emit two newlines (ending the current line and
    // leaving one empty), then restart the column count.
    Print("\n\n");

    WidthCount = 0;
    LineCount += 2;
}




<INBODY>{PRE} {
    // Start of preformatted text. While InPreformat is set, the text
    // rules below call Print with the matched text instead of WrapPrint.


    InPreformat = 1;
}




<INBODY>{OUTPRE} {
    // End of preformatted text: the text rules go back to WrapPrint.


    InPreformat = 0;
}




<INBODY>{HYPERLINK} { // Link related ---------------------------------------
    const char *tag = YYText();

    // Debug trace of the matched anchor tag.
    cout << "HLink: " << tag << endl;

    // Remember this link until the closing anchor tag arrives. Any
    // destination still pending from an unclosed link is discarded.
    // Note that the whole matched tag text is stored, not just the URL.
    free(LastLinkDest);
    LastLinkDest = strdup(tag);

    // Where the linked text starts: current line and column.
    LastLinkLine = LineCount;
    LastLinkBegin = WidthCount;

    InLinkedText = 1;
}




<INBODY>{PAGELINK} {
    const char *tag = YYText();

    // Debug trace of the matched anchor tag.
    cout << "PLink: " << tag << endl;

    // Remember this link until the closing anchor tag arrives. Any
    // destination still pending from an unclosed link is discarded.
    // Note that the whole matched tag text is stored, not just the target.
    if(LastLinkDest != 0)
    {
        free(LastLinkDest);
    }
    LastLinkDest = strdup(tag);

    // Where the linked text starts: current line and column.
    LastLinkLine = LineCount;
    LastLinkBegin = WidthCount;

    InLinkedText = 1;
}




<INBODY>{PAGETARGET} {
    // Named anchor (a jump target inside the page). Only a debug trace is
    // written and the linked-text flag is raised; no destination is
    // recorded here, so LastLinkDest is left untouched.


    cout << "Page: " << YYText() << endl;


    InLinkedText = 1;
}




<INBODY>{LINKEND} {
    // Closing anchor tag: the linked text ends at the current column.
    InLinkedText = 0;

    // Store the finished link, if an opening href anchor recorded one.
    if(LastLinkDest)
    {
        // NOTE(review): the destination is freed right after the append,
        // as before, so HtmlLink presumably keeps its own copy of the
        // string - confirm against v_html.h.
        HtmlLink NewLink(LastLinkDest,LastLinkLine,LastLinkBegin,WidthCount);
        Link.append(NewLink);

        // Release the destination AND forget the pointer. Leaving the
        // freed pointer in place made a second closing anchor tag (or the
        // closing tag of a named anchor) build a link from freed memory
        // and free it again, and made the next href anchor free it twice.
        free(LastLinkDest);
        LastLinkDest = NULL;
    }
}




{UNKNOWNTAG} { // Ignore unknown tags ---------------------------------------
    // Any tag that no earlier rule claimed. It produces no page output;
    // inside the html element a debug trace of the tag is written.
    if(InHtml != 0)
    {
        cout << "Tag: " << YYText() << endl;
    }
}




<INBODY>{NBSP} { // Special characters --------------------------------------
    // The nbsp entity is passed to WrapPrint as one ordinary space.


    WrapPrint(" ");
}




<INBODY>{GREATERTHAN} {
    // The gt entity is passed to WrapPrint as a literal greater-than sign.


    WrapPrint(">");
}




<INBODY>{LESSTHAN} {
    // The lt entity is passed to WrapPrint as a literal less-than sign.


    WrapPrint("<");
}




<INBODY>{WHITESPACE} { // Printable -----------------------------------------
    if(!InPreformat)
    {
        // Normal text: a whole run of blanks collapses to one space.
        WrapPrint(" ");
    }
    else
    {
        // Preformatted text: reproduce the run exactly as written.
        const char *blanks = YYText();

        Print(blanks);
        WidthCount += Width(blanks);
    }
}




<INBODY>{RETURN} {
    if(InPreformat)
    {
        // Preformatted text: reproduce the line breaks exactly as written.
        Print(YYText());

        // The match consists of newlines only, so its length is the
        // number of lines that were just started.
        LineCount += strlen(YYText());
        WidthCount = 0;
    }
    else
    {
        // Normal text: a line break in the source is plain white space.
        // Dropping it entirely glued together two words that were
        // separated only by a newline, so emit a space here exactly as
        // the WHITESPACE rule does. No line counters change because
        // WrapPrint decides where the rendered lines break.
        WrapPrint(" ");
    }
}


<INBODY>{TEXTWORD} {
    const char *word = YYText();

    if(!InPreformat)
    {
        // Normal text: hand the word to the wrapping printer.
        WrapPrint(word);
    }
    else
    {
        // Preformatted text: print as is and advance the column count.
        Print(word);
        WidthCount += Width(word);
    }
}




<INBODY>. { // Unknown text in body
    // A single character that no earlier body rule claimed, such as a
    // stray angle bracket or ampersand. It is printed like a word.
    const char *stray = YYText();

    if(!InPreformat)
    {
        WrapPrint(stray);
    }
    else
    {
        Print(stray);
        WidthCount += Width(stray);
    }
}




<*>.|\n { // Ignore totally unknown text
    // Swallow every character that no other rule claimed, in every start
    // condition. The newline alternative is required because a dot never
    // matches a newline: without it, newlines outside the body fell
    // through to the default rule of flex, which copies unmatched input
    // to the output.
}




%%


Post a followup to this message

Return to the comp.compilers page.
Search the comp.compilers archives again.