Lexical Analysis |
|
Some people have suggested that Lua's regular expressions are limited.
The beauty of Lua is that it is so easy to add extra functionality. We don't need fancy regular expressions because we can easily add a lexical analyser with Peter Bumbulis' re2c
[1].
Here is a lexical scanner that recognizes Lua 5.0's syntax and keywords. The function LexLua
takes the string to be scanned and returns a function which does the scanning. The returned function is a closure that is bound to the C function scan
and two upvalues: the string to be scanned and a userdata to keep track of the state. Every time it is called, it returns the next token (followed by its type and the whitespace that preceded it), or nil
when it reaches the end of the string.
re2c
replaces the regular expressions in the special comments with code
for the scanner.
Here is the input [2] and output
[3] of re2c
.
/*
==============================================================================
  LexLua.c

  re2c input for a lexical scanner that tokenizes Lua source text.
  Run this file through re2c to obtain compilable C; the special
  "!re2c" comments below are replaced by the generated matching code.

  Lua interface:
    LexLua(str) -> iterator function
    iterator()  -> token, type, whitespace   (nil, nil, whitespace at end)
==============================================================================
*/

#include "lua.h"
#include "lauxlib.h"

/* Type tags returned as the second result of the iterator.  Keywords,
   operators and single characters have no tag (nil is returned instead). */
const char *name    = "<name>";
const char *number  = "<number>";
const char *literal = "<literal>";

/* Glue between re2c's generated code and the local variables of scan().
   NOTE(review): plain char may be signed; the re2c documentation generally
   suggests an unsigned YYCTYPE -- confirm behaviour for bytes >= 0x80. */
#define YYCTYPE     char
#define YYCURSOR    cursor
#define YYMARKER    marker
#define YYLIMIT     limit
/* No refilling: the whole input is already in memory. */
#define YYFILL(n)

/* Store the scanner pointers back into the Scanner as offsets from the
   start of the input (i), so that the state does not hold raw pointers
   between calls.  Wrapped in do/while(0) so it behaves as one statement. */
#define save_state(i,s,c,m,l) do {\
  (s)->cursor = (int)((c)-(i)); \
  (s)->marker = (int)((m)-(i)); \
  (s)->limit  = (int)((l)-(i)); \
} while (0)

/* Persistent scanner state, kept in a userdata upvalue of the closure.
   All three fields are byte offsets into the input string. */
typedef struct Scanner {
  int cursor, marker, limit;
} Scanner;

/* Return the Scanner stored at the given stack/upvalue index, raising a
   Lua error if the value there is not a userdata. */
static Scanner *check_Scanner(lua_State *L, int index)
{
  luaL_check_type(L, index, LUA_TUSERDATA);
  return (Scanner*)lua_touserdata(L,index);
}

/* The iterator body.  Upvalue 1 is the input string, upvalue 2 the Scanner.
   Pushes three results: the token text, its type tag (or nil) and the text
   skipped before the token (white space and comments).  At the end of the
   input the first two results are nil. */
static int scan (lua_State *L)
{
  const char *input = luaL_check_string(L, lua_upvalueindex(1));
  Scanner *state = check_Scanner(L,lua_upvalueindex(2));
  /* Rebuild the working pointers from the saved offsets. */
  char *cursor = (char*)input + state->cursor;
  char *marker = (char*)input + state->marker;
  char *limit  = (char*)input + state->limit;
  char *white_space, *token;
  const char *ret = 0;      /* type tag of the token, 0 means "no tag" */
  int nest_count = 0;       /* nesting depth of [[ ... ]] brackets */

/*!re2c
  D       = [0-9] ;
  E       = [Ee] [+-]? D+ ;
  L       = [a-zA-Z_] ;
  NUMBER  = ( D+ | D* "." D+ | D+ "." D* ) E? ;
  WS      = [ \t\n\v\f]+ ;
  LF      = [\n] ;
  END     = [\000] ;
  ANY     = [\000-\377] \ END ;
  ESC     = [\\] ;
  SQ      = ['] ;
  DQ      = ["] ;
  STRING1 = SQ ( ANY \ SQ \ ESC | ESC ANY )* SQ ;
  STRING2 = DQ ( ANY \ DQ \ ESC | ESC ANY )* DQ ;
*/

Begin:
  white_space = cursor;   /* start of white space */
Space:
  token = cursor;         /* start of token */

/*!re2c
  WS               { goto Space; }
  "--[["           { nest_count=0; goto LongComment; }
  "--" | "#"       { goto Comment; }
  "and"            { goto Return; }
  "break"          { goto Return; }
  "do"             { goto Return; }
  "else"           { goto Return; }
  "elseif"         { goto Return; }
  "end"            { goto Return; }
  "false"          { goto Return; }
  "for"            { goto Return; }
  "function"       { goto Return; }
  "global"         { goto Return; }
  "if"             { goto Return; }
  "in"             { goto Return; }
  "local"          { goto Return; }
  "nil"            { goto Return; }
  "not"            { goto Return; }
  "or"             { goto Return; }
  "repeat"         { goto Return; }
  "return"         { goto Return; }
  "then"           { goto Return; }
  "true"           { goto Return; }
  "until"          { goto Return; }
  "while"          { goto Return; }
  "..."            { goto Return; }
  ".."             { goto Return; }
  "=="             { goto Return; }
  ">="             { goto Return; }
  "<="             { goto Return; }
  "~="             { goto Return; }
  "[["             { nest_count=0; goto LongString; }
  L ( L | D )*     { ret = name; goto Return; }
  NUMBER           { ret = number; goto Return; }
  STRING1          { ret = literal; goto Return; }
  STRING2          { ret = literal; goto Return; }
  ANY              { goto Return; }
  END              { goto TheEnd; }
*/

LongString:
/*!re2c
  "[["             { nest_count++; goto LongString; }
  "]]"             { if( nest_count == 0 ) { ret = literal; goto Return; }
                     nest_count--; goto LongString; }
  ANY              { goto LongString; }
  END              { luaL_error(L,"unfinished long string"); }
*/

Comment:
/*!re2c
  ( ANY \ LF )*    { goto Space; }
  END              { goto TheEnd; }
*/

LongComment:
/*!re2c
  "[["             { nest_count++; goto LongComment; }
  "]]"             { if( nest_count == 0 ) goto Space;
                     nest_count--; goto LongComment; }
  ANY              { goto LongComment; }
  END              { luaL_error(L,"unfinished long comment"); }
*/

  luaL_error(L,"impossible"); /* die */

TheEnd:
  /* END matched a zero byte; step back onto it.  If that byte is not the
     terminator at the recorded limit, the input contained an embedded zero. */
  if( --cursor != limit )
    luaL_error(L,"didn't reach end of input"); /* die */
  lua_pushnil(L);
  lua_pushnil(L);
  lua_pushlstring(L, white_space, (size_t)(token - white_space) );
  save_state(input,state,cursor,marker,limit);
  return 3; /* nil, nil, ws */

Return:
  lua_pushlstring(L, token, (size_t)(cursor - token) );
  if( ret )
    lua_pushstring(L, ret );
  else
    lua_pushnil(L);
  lua_pushlstring(L, white_space, (size_t)(token - white_space) );
  save_state(input,state,cursor,marker,limit);
  return 3; /* token, type, ws */
}

/* LexLua(str): create a scanning closure over str.
   Returns one value, a C closure of scan() whose upvalues are the input
   string and a freshly initialised Scanner userdata. */
static int scanner (lua_State *L)
{
  Scanner *s;
  size_t len;   /* luaL_check_lstr writes through a size_t pointer */
  (void)luaL_check_lstr(L, 1, &len);   /* only the length is needed here */
  /* Drop any extra arguments so that exactly the string and the userdata
     are on top of the stack when the closure captures its two upvalues. */
  lua_settop(L, 1);
  s = (Scanner*)lua_newuserdata(L, sizeof(Scanner));
  s->cursor = 0;
  s->marker = 0;
  s->limit  = (int)len;
  lua_pushcclosure(L, scan, 2); /* string, userdata */
  return 1;
}

/* Library entry point: registers the global function "LexLua". */
int openLexLua (lua_State *L)
{
  lua_register(L, "LexLua", scanner);
  return 0;
}
This code can be compiled into a Unix shared library as follows:
# Generate the C scanner (lex.c) from the re2c-annotated source.
re2c -s LexLua.c > lex.c
# Compile to a position-independent object file.
gcc -fPIC -g -c lex.c -o lexlua.o
# Link the object into a versioned shared library against the Lua libraries.
gcc -g -shared -Wl,-soname,liblexlua.so -o liblexlua.so.1.0.0 lexlua.o -L/usr/local/lib/ -llua -llualib
# Switch user before installing (presumably to root, so /usr/local/lib is writable).
su
# Install the library and create the unversioned symlink.
cp liblexlua.so.1.0.0 /usr/local/lib
cd /usr/local/lib
ln -s liblexlua.so.1.0.0 liblexlua.so
# Run ldconfig on the install directory.
ldconfig -v /usr/local/lib
$ lua Lua 5.0 (alpha) Copyright (C) 1994-2002 Tecgraf, PUC-Rio > assert(loadlib('/usr/local/lib/liblexlua.so','openLexLua'))() > for tok, tt in LexLua[[ for i = 1,10 do print(i*2) end ]] do print(tok,tt) end for nil i <name> = nil 1 <number> , nil 10 <number> do nil print <name> ( nil i <name> * nil 2 <number> ) nil end nil >
For an example of how to add some colour to your Lua code, see [4] or LuaToHtml