- Subject: Re: comparing Lua with other languages
- From: Wim Couwenberg <w.couwenberg@...>
- Date: Thu, 30 Dec 2004 21:08:29 +0100
Below is an implementation of the "tokens" test for APLC that counts the
number of tokens in a Lua 5.0.2 script file. It should correctly count
all allowed tokens, where:
- (Multi-line) strings and (multi-line) comments each count as a single
token.
- Each + and - sign always counts as a single token. (So -5 is not a
single number but consists of two tokens; see the example below.)
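Under these rules, for instance, the line

  x = -5 + y -- adjust

counts seven tokens: x, =, -, 5, +, y and the trailing comment.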
The lexer below only *counts* tokens, so in its current form it is not
suited to feed a syntactic parser, but it should not be hard to adapt.
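A minimal sketch of such an adaptation (all names here are purely
illustrative) would be to collect the lexemes in a table instead of
counting them:

  local tokens = {}
  local function token(lexeme)
    table.insert(tokens, lexeme)
  end

Each call site would then pass the matched text, e.g.
token(string.sub(chunk, index, t)) in the identifier case instead of a
bare token().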
Since this is a fairly tricky business, would someone with some Lua
lexer knowledge care to run it through some tests? It has not been
submitted to the APLC site yet.
--
Wim
-- print short description if no filename is provided
if not arg[1] then
  print("usage: " .. arg[0] .. [[ filename
counts the number of tokens in a Lua script file.]])
  return
end
-- parse file in chunks of this size
local chunk_size = 1024
-- total tokens
local count = 0
-- report a token
local function token()
  count = count + 1
end
-- forward declaration of main lexer
local lex
-- forward declaration of current parse state
local parse
-- helper to parse to first newline (for single-line comments)
local function single_line(chunk, index, more)
  local _, t = string.find(chunk, "\n", index)
  if t then
    token()
    parse = lex
    return lex(chunk, t + 1, more)
  elseif more then
    -- no newline in this chunk yet: stay in this state
    parse = single_line
    return ""
  else
    -- comment runs to the end of the file
    token()
  end
end
-- nesting level in multi-line comments and strings
local level
-- helper to parse multi-line token (string or comment)
local function multi_line(chunk, index, more)
  local len = string.len(chunk)
  while index < len do
    local _, t, c = string.find(chunk, "([%[%]][%[%]])", index)
    if not c then index = len
    elseif c == "[[" then
      level = level + 1
      index = t + 1
    elseif c == "]]" then
      level = level - 1
      index = t + 1
      if level == 0 then
        token()
        parse = lex
        return lex(chunk, index, more)
      end
    else
      -- "[]" or "][": resume the search at the second bracket
      index = t
    end
  end
  if more then
    parse = multi_line
    return string.sub(chunk, index)
  end
  error "]] expected"
end
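-- For example, on the input [[a[[b]]c]] the main lexer enters this
-- helper with level 1; the inner "[[" raises it to 2 and the two "]]"
-- bring it back through 1 to 0, so the whole nested string is counted
-- as a single token.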
-- quote character that opened the current string token
local quote
-- helper to parse a quoted string
local function quoted_string(chunk, index, more)
  local len = string.len(chunk)
  local pat = "([\\\n" .. quote .. "])"
  while index <= len do
    local _, c
    _, index, c = string.find(chunk, pat, index)
    if not c then index = len + 1
    elseif c == "\n" then
      error("unexpected newline: " .. quote .. " expected")
    elseif c == "\\" then
      -- skip the escaped character; a backslash at the very end
      -- of the chunk needs more lookahead
      if index < len then index = index + 2
      else break
      end
    elseif c == quote then
      token()
      parse = lex
      return lex(chunk, index + 1, more)
    end
  end
  if more then
    parse = quoted_string
    return string.sub(chunk, index)
  else
    error("unexpected end of file: " .. quote .. " expected")
  end
end
-- main lexer function.
-- try to parse as many tokens as possible from the chunk.
-- returns a remaining tail of the chunk that has to be
-- reexamined, or a false value if there's nothing left to do.
function lex(chunk, index, more)
  local len = string.len(chunk)
  if index > len then
    -- more input needed
    return more and ""
  elseif len - index < 3 and more then
    -- more lookahead needed ("--[[", the longest token prefix,
    -- is four characters)
    return string.sub(chunk, index)
  end
  -- skip white space
  local _, t = string.find(chunk, "^%s+", index)
  if t then return lex(chunk, t + 1, more) end
  -- keywords and identifiers
  _, t = string.find(chunk, "^[_%a][_%w]*", index)
  if t == len and more then
    return string.sub(chunk, index)
  elseif t then
    token()
    return lex(chunk, t + 1, more)
  end
  -- numbers
  _, t = string.find(chunk, "^[0-9]+%.?[0-9]*", index)
  if not t then
    _, t = string.find(chunk, "^%.?[0-9]+", index)
  end
  if t then
    -- optional exponent part
    local _, e = string.find(chunk, "^[eE][+-]?[0-9]+", t + 1)
    t = e or t
    if t == len and more then
      return string.sub(chunk, index)
    end
    token()
    return lex(chunk, t + 1, more)
  end
  -- special tokens: comments and strings (a lone "-" falls
  -- through to the single char tokens below)
  if string.find(chunk, "^[%-%[%'%\"]", index) then
    if string.find(chunk, "^%-%-%[%[", index) then
      level = 1
      return multi_line(chunk, index + 4, more)
    elseif string.find(chunk, "^%-%-", index) then
      return single_line(chunk, index + 2, more)
    elseif string.find(chunk, "^%[%[", index) then
      level = 1
      return multi_line(chunk, index + 2, more)
    elseif string.find(chunk, "^%'", index) then
      quote = "'"
      return quoted_string(chunk, index + 1, more)
    elseif string.find(chunk, '^%"', index) then
      quote = '"'
      return quoted_string(chunk, index + 1, more)
    end
  end
  -- multichar tokens
  if not t then
    _, t = string.find(chunk, "^[%~%=%<%>]=", index)
  end
  if not t then
    _, t = string.find(chunk, "^%.%.%.?", index)
  end
  -- single char tokens
  if not t then
    _, t = string.find(chunk, "^[%^%*%(%)%-%+%=%{%}%[%]%:%;%<%>%,%.%/]", index)
  end
  if t then
    token()
    return lex(chunk, t + 1, more)
  end
  -- still no match?
  error("unrecognised token: " .. string.sub(chunk, index, index + 5) .. "...")
end
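-- To summarise the protocol: every parse state returns the unconsumed
-- tail of its input (or a false value when there is nothing left) and
-- stores itself in `parse` before asking for more input.  Lexing a
-- complete string in a single call would therefore look like:
--   parse = lex
--   lex("local x = 1", 1, false)  -- counts 4 tokens: local, x, =, 1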
-- start in the main lexer state
parse = lex
-- read and parse chunks from specified file
local function start()
  io.input(arg[1])
  -- an empty file yields nil on the first read
  local chunk = io.read(chunk_size) or ""
  local res = parse(chunk, 1, true)
  while res do
    chunk = io.read(chunk_size)
    if chunk then
      -- prepend the unconsumed tail to the fresh chunk
      res = parse(res .. chunk, 1, true)
    else
      -- end of file: let the current state finish up
      res = parse(res, 1, false)
    end
  end
end
-- start parsing the file and catch any errors
local rc, err = pcall(start)
if rc then
  print(count .. " tokens")
else
  print(err)
end
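For what it's worth: assuming the script is saved as tokens.lua (any
name will do), running it on itself looks like

  lua tokens.lua tokens.lua

and prints the total token count, or an error message if the file does
not lex.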