[Date Prev][Date Next][Thread Prev][Thread Next]
[Date Index]
[Thread Index]
- Subject: Re: Lua to LPEG?
- From: Roberto Ierusalimschy <roberto@...>
- Date: Tue, 21 Jun 2011 10:03:21 -0300
> I am tempted to ask (but won't) why there isn't any lpeg.L(lpat) that
> returns an lpeg pattern that matches the same strings, and returns
> the same captures, that the Lua pattern lpat would.
>
> Instead, I'm asking whether a question similar to this has burgeoned
> in the brain of a list member for long enough that some work in this
> direction has been done.
We have been working on it. Note that the task is not as easy as it may
sound, because the semantics of LPeg is different from the semantics of
Lua patterns. For instance, repetition in Lua is greedy, but in LPeg
it is possessive. The translation from one to the other is not difficult,
but the translation with captures is.
I have attached some code I wrote related to this translation. It
implements an equivalent to the 'string.find' function. It passes most
(all?) tests in the standard Lua test suite that do not use captures.
-- Roberto
local m = require'lpeg'
local re = require're'
-- require 'pt'
local any = m.P(1)
local allchars = ""
for i = 1, 255 do allchars = allchars..string.char(i) end
-- return the set of all chars of a given string that match a given class
local function classsub (s, p)
p = m.Cs( ((any - p) / "" + p)^0 )
return p:match(s)
end
-- return the set of all chars that match a given class
local function class2chars (p)
return classsub(allchars, p)
end
-- return the intersection of two sets of chars, represented
-- as strings
local function intersection (c1, c2)
return classsub(c1, m.S(c2))
end
-- return the set of all chars in a given category
local function charscat (c)
return class2chars(re.compile(c))
end
local function charsfromto (c1, c2)
local s = ""
for i = string.byte(c1), string.byte(c2) do
s = s .. string.char(i)
end
return s
end
local function concat (s1, s2)
return s1 .. s2
end
local g = m.P{ "pattern",
pattern = m.Ct(m.Cg("^", "init")^0
* m.V"item"^0
* (m.Cg("$" * m.P(-1), "fin"))^0) * -1,
setelem = "%" * m.R("AZ", "az") / charscat
+ "%" * m.C(any)
+ (m.C(any) * "-" * m.C(any - "]")) / charsfromto
+ m.C(any - "]"),
set = "["
* m.Cg(m.Cf(m.C"^"^-1 * m.C"]"^-1 * m.V"setelem"^0, concat), "set")
* "]",
esc = "%" * m.Cg(m.S"acdglpsuwxzACDGLPSUWXZ", "escape")
+ "%" * m.C(any),
class = m.C(any - m.S"[].$()%%.*+-?" + ("$" * #m.P(1)))
+ m.Cg(".", "all")
+ m.V"esc"
+ m.V"set",
item = "%" * m.Ct(m.Cg(m.R"09", "backref"))
+ "%b" * m.Ct(m.Cg(2, "balanced"))
+ "%f" * m.V"set"
+ m.Ct(m.V"class" * m.Cg(m.S"*+-?", "rep")^0)
+ m.Ct(m.Cg("()", "position"))
+ "(" * m.Cg(m.Ct(m.V"item"^1), "capture") * ")"
}
local function balanced (s)
local a, b = s:sub(1, 1), s:sub(2, 2)
return m.P{ a * ( (1 - m.S(s))^1 + m.V(1) )^0 * b }
end
local function class (e)
if e.escape then return re.compile("%"..e.escape)
elseif e.set then
if (string.sub(e.set, 1, 1) == "^") then
return any - m.S(string.sub(e.set, 2, -1))
else
return m.S(e.set)
end
elseif e.all then return any
else return m.P(e[1])
end
end
local count = 0
local function nextvar ()
count = count + 1
return "v" .. count
end
local function rep (e, k, r)
local c = class(e)
local fc = class2chars(c)
local inter = (intersection(fc, k.first) ~= "")
local union = fc .. k.first
if r == "?" then
if inter then
k[1] = c * k[1] + k[1]
else
k[1] = c^-1 * k[1]
end
elseif r == "*" then
local v = nextvar()
if inter then
k[v] = c * m.V(v) + k[1]
else
k[v] = c^0 * k[1]
end
k[1] = m.V(v)
elseif r == "-" then
local v = nextvar()
if inter or k.empty then
k[v] = k[1] + c * m.V(v)
else
k[v] = c^0 * k[1]
end
k[1] = m.V(v)
elseif r == "+" then
local k = rep(e, k, "*")
k[1] = c * k[1]
union = fc -- must start with a char from class 'c'
k.empty = false
end
k.first = union
return k
end
local function traduz (p, i)
if i > #p then
return {p.fin and m.P(-1) or m.P(""); first = ""; empty = true}
end
local k = traduz(p, i + 1)
local e = p[i]
local et, first, empty
if e.position then et = m.Cp(); first = k.first
elseif e.balanced then
et = balanced(e.balanced)
first = string.sub(e.balanced, 1, 1)
k.empty = false
elseif e.rep then return rep(e, k, e.rep)
else et = class(e); first = class2chars(et); k.empty = false
end
k[1] = et * k[1]
k.first = first
return k
end
local pg = m.P(g)
if arg[1] then
local p = pg:match(arg[1])
print(pt(p))
p = traduz(p, 1)
print(p.first, p.empty)
p.first, p.empty = nil
p = m.P(p); p:print()
if arg[2] then
print(p:match(arg[2]))
end
return
end
local function compile (p)
local tree = pg:match(p)
if not tree then error("invalid pattern " .. p) end
local g = traduz(tree, 1); g.first, g.empty = nil
local pp = m.P(g)
pp = m.Cp() * pp * m.Cp()
if tree.init then return pp
else return m.P{ pp + 1 * m.V(1) }
end
end
function find (s, p, i)
local pp = compile(p)
local a, b = pp:match(s, i)
if a then return a, b - 1 end
end
--------------------------------------------------------------------------------
local function f(s, p)
local i,e = find(s, p)
if i then return string.sub(s, i, e) end
end
a,b = find('', '') -- empty patterns are tricky
assert(a == 1 and b == 0);
a,b = find('alo', '')
assert(a == 1 and b == 0)
a,b = find('a\0o a\0o a\0o', 'a', 1) -- first position
assert(a == 1 and b == 1)
a,b = find('a\0o a\0o a\0o', 'a\0o', 2) -- starts in the midle
assert(a == 5 and b == 7)
a,b = find('a\0o a\0o a\0o', 'a\0o', 9) -- starts in the midle
assert(a == 9 and b == 11)
a,b = find('a\0a\0a\0a\0\0ab', '\0ab', 2); -- finds at the end
assert(a == 9 and b == 11);
a,b = find('a\0a\0a\0a\0\0ab', 'b') -- last position
assert(a == 11 and b == 11)
assert(find('a\0a\0a\0a\0\0ab', 'b\0') == nil) -- check ending
assert(find('', '\0') == nil)
assert(find('alo123alo', '12') == 4)
assert(find('alo123alo', '^12') == nil)
assert(f('aloALO', '%l*') == 'alo')
assert(f('aLo_ALO', '%a*') == 'aLo')
assert(f(" \n\r*&\n\r xuxu \n\n", "%g%g%g+") == "xuxu")
assert(f('aaab', 'a*') == 'aaa');
assert(f('aaa', '^.*$') == 'aaa');
assert(f('aaa', 'b*') == '');
assert(f('aaa', 'ab*a') == 'aa')
assert(f('aba', 'ab*a') == 'aba')
assert(f('aaab', 'a+') == 'aaa')
assert(f('aaa', '^.+$') == 'aaa')
assert(f('aaa', 'b+') == nil)
assert(f('aaa', 'ab+a') == nil)
assert(f('aba', 'ab+a') == 'aba')
assert(f('a$a', '.$') == 'a')
assert(f('a$a', '.%$') == 'a$')
assert(f('a$a', '.$.') == 'a$a')
assert(f('a$a', '$$') == nil)
assert(f('a$b', 'a$') == nil)
assert(f('a$a', '$') == '')
assert(f('', 'b*') == '')
assert(f('aaa', 'bb*') == nil)
assert(f('aaab', 'a-') == '')
assert(f('aaa', '^.-$') == 'aaa')
assert(f('aabaaabaaabaaaba', 'b.*b') == 'baaabaaabaaab')
assert(f('aabaaabaaabaaaba', 'b.-b') == 'baaab')
assert(f('alo xo', '.o$') == 'xo')
assert(f(' \n isto é assim', '%S%S*') == 'isto')
assert(f(' \n isto é assim', '%S*$') == 'assim')
assert(f(' \n isto é assim', '[a-z]*$') == 'assim')
assert(f('um caracter ? extra', '[^%sa-z]') == '?')
assert(f('', 'a?') == '')
assert(f('x', 'x?') == 'x')
assert(f('xbl', 'x?b?l?') == 'xbl')
assert(f(' xbl', 'x?b?l?') == '')
assert(f('aa', '^aa?a?a') == 'aa')
assert(f(']]]xb', '[^]]') == 'x')
assert(f("0alo alo", "%x*") == "0a")
assert(f("alo alo", "%C+") == "alo alo")
print('+')
assert(f("alo <xuxu> alo", "%b<>") == "<xuxu>")
assert(f("alo (xuxu>batata() and)) alo", "%b()") == "(xuxu>batata() and)")
assert(f("((( (xuxu>batata() and) alo", "%b()") == "(xuxu>batata() and)")
assert(f("alo alo 123 chegou", ".*123") == "alo alo 123")
assert(f("alo alo 123 chegou", ".*%d+") == "alo alo 123")
assert(f("alo alo 123 chegou", "^.*%d+") == "alo alo 123")
assert(not find("alo alo chegou", ".*%d+"))
print 'ok'