Ex Pattern |
|
Lua patterns [1] are a limited form of regular expressions. Unlike regular expressions, patterns do not support grouping and quantification [2] operations on anything except single character sets. For full regular expression (or Perl-style regular expression) support, you may instead compile a regular expression module such as lrexlib [3]. For even greater flexibility, for parsing and lexical analysis applications or even for the simpler case, one can use parsing expression grammars (PEG) via the LuaPeg library [4] implemented in C.
Sometimes, however, Lua patterns are almost sufficient, as we may want to extend the functionality of patterns just a bit more without compiling in any additional C code. That's where this module comes in. This module builds extended pattern matching on top of Lua patterns. It takes a pattern expression written in a form somewhat similar to LPeg and translates it to a string of Lua code containing string.match
calls, which is then compiled to a Lua function via loadstring
. This is similar to the code you might write yourself, although this module automates it with code generation. It should have similar efficiency assuming you precompile any match objects that are used repeatedly.
Here is a simple example:
local M = require "xpattern" local P = M.P local m = ( (P'(b+)' + P'(c+)') * P'[A-Z][a-z]'^0 * P'(.)()' ):compile() local a,b,c,d = m('mmcccZzYybbZzYyddd') -- match c not b first assert(a == nil and b == 'ccc' and c == 'b' and d == 11)
which internally generates this Lua function:
local match = ... -- string.match return function(s,pos) for pos1=(pos or 1),#s do local pos2 local c1,c2,c3,c4 local pos3 = pos1 c1,pos3 = match(s, "^(b+)()", pos1) if not pos3 then c2,pos3 = match(s, "^(c+)()", pos1) end pos2 = pos3 if pos2 then do local pos3=pos2 pos2 = pos2 while 1 do pos3 = match(s, "^[A-Z][a-z]()", pos3) if pos3 then pos2=pos3 else break end end end end if pos2 then c3,c4,pos2 = match(s, "^(.)()()", pos2) end if pos2 then return c1,c2,c3,c4 end end end
See the test suite below for additional examples.
Warning: this code should be considered somewhat experimental. It is not fully developed or tested. Fix up the code if you want to use it in production.
-- xpattern.lua -- Preliminary regular expression-like support in Lua -- Uses Lua patterns as the core building block. -- -- Implemented in pure Lua with code generation technique. -- It translates an expression into a snippet of Lua code -- having a series of `string.match` calls, which is then -- compiled (via `loadstring`). -- -- Like lpeg, does not support backtracking. -- -- WARNING: This is experimental code. The design and implementation -- has not been thoroughly tested. -- -- Version v20091021. -- (c) 2008-2009 David Manura. Licensed under the same terms as Lua (MIT license). -- Please post patches. local M = {} local string = string local format = string.format local match = string.match local assert = assert local error = error local ipairs = ipairs local loadstring = loadstring local setmetatable = setmetatable local type = type local print = print -- Adds whitespace to string `s`. -- Whitespace string `ws` (default to '' if omitted) is prepended to each line -- of `s`. Also ensures `s` is is terminated by a newline. local function add_whitespace(s, ws) ws = ws or '' s = s:gsub('[^\r\n]+', ws .. '%1') if s:match('[^\r\n]$') then s = s .. '\n' end return s end -- Counts the number `count` of captures '()' in Lua pattern string `pat`. local function count_captures(pat) local count = 0 local pos = 1 while pos <= #pat do local pos2 = pat:match('^[^%(%%%[]+()', pos) if pos2 then pos = pos2 elseif pat:match('^%(', pos) then count = count + 1 pos = pos + 1 elseif pat:match('^%%b..', pos) then pos = pos + 3 elseif pat:match('^%%.', pos) then pos = pos + 2 else local pos2 = pat:match('^%[[^%]%%]*()', pos) if pos2 then pos = pos2 while 1 do local pos2 = pat:match('^%%.[^%]%%]*()', pos) if pos2 then pos = pos2 elseif pat:match('^%]', pos) then pos = pos + 1 break else error('syntax', 2) end end else error('syntax', 2) end end end return count end M._count_captures = count_captures -- Appends '()' to Lua pattern string `pat`. local function pat_append_pos(pat) local prefix = pat:match'^(.*)%$$' pat = prefix and prefix .. '()$' or pat .. '()' return pat end -- Prepends '()' to Lua pattern string `pat`. local function pat_prepend_pos(pat) local postfix = pat:match'^%^(.*)' pat = postfix and '^()' .. postfix or '()' .. pat return pat end -- Prepends '^' to Lua pattern string `pat`. local function pat_prepend_carrot(pat) local postfix = pat:match'^%^(.*)' pat = postfix and pat or '^' .. pat return pat end -- Return a string listing pattern capture variables with indices `firstidx` -- to `lastidx`. -- Ex: code_vars(1,2) --> 'c1,c2' local function code_vars(firstidx, lastidx) local code = '' for i=firstidx,lastidx do code = code .. (i == firstidx and '' or ',') .. 'c' .. i end return code end -- Metatable for expression objects local epat_mt = {} epat_mt.__index = epat_mt -- Builds an extended pattern object `epat` from Lua string pattern `pat`. local function pattern(pat) local epat = setmetatable({}, epat_mt) epat.call = function(srcidx0, destidx0, totncaptures0) local ncaptures = count_captures(pat) local lvars = code_vars(totncaptures0+1, totncaptures0+ncaptures) .. (ncaptures == 0 and '' or ',') .. 'pos' .. destidx0 local pat = pat_append_pos(pat) pat = pat_prepend_carrot(pat) local str = format('%q', pat) local code = lvars .. ' = match(s, ' .. str .. ', pos' .. srcidx0 .. ')\n' return code, ncaptures end epat.anchored = pat:sub(1,1) == '^' return epat end -- Generates code from pattern `anypat` (either Lua pattern string or extended -- pattern object). -- `anypat` - either Lua pattern string or extended pattern object -- `srcidx0` - index of variable holding position to start matching at -- `destidx0` - index of variable holding position to store subsequent -- match position at. stores nil if no match -- `totncaptures0` - number of captures prior to this match -- `code` - Lua code string (code) and number of -- `ncaptures` - number of captures in pattern. local function gen(anypat, srcidx0, destidx0, totncaptures0) if type(anypat) == 'string' then anypat = pat_prepend_carrot(anypat) anypat = pattern(anypat) end local code, ncaptures = anypat(srcidx0, destidx0, totncaptures0) return code, ncaptures end -- Creates a new extended pattern object `epat` that is the concatenation of -- the given list (of size >= 0) of pattern objects. -- Specify a single string argument to convert a Lua pattern to an extended -- pattern object. local function seq(...) -- epats... -- Ensure args are extended pattern objects. local epats = {...} for i=1,#epats do if type(epats[i]) == 'string' then epats[i] = pattern(epats[i]) end end local epat = setmetatable({}, epat_mt) epat.call = function(srcidx0, destidx0, totncaptures0, ws) if #epats == 0 then return 'pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n', 0 elseif #epats == 1 then return epats[1](srcidx0, destidx0, totncaptures0, ws) else local ncaptures = 0 local pat_code, pat_ncaptures = gen(epats[1], srcidx0, destidx0, totncaptures0+ncaptures, true) ncaptures = ncaptures + pat_ncaptures local code = add_whitespace(pat_code, '') for i=2,#epats do local pat_code, pat_ncaptures = gen(epats[i], destidx0, destidx0, totncaptures0+ncaptures, true) ncaptures = ncaptures + pat_ncaptures code = code .. 'if pos' .. destidx0 .. ' then\n' .. add_whitespace(pat_code, ' ') .. 'end\n' end return code, ncaptures end end if epats[1] and epats[1].anchored then epat.anchored = true end return epat end M.P = seq -- Creates new extended pattern object `epat` that is the alternation of the -- given list of pattern objects `epats...`. local function alt(...) -- epats... -- Ensure args are extended pattern objects. local epats = {...} for i=1,#epats do if type(epats[i]) == 'string' then epats[i] = pattern(epats[i]) end end local epat = setmetatable({}, epat_mt) epat.call = function(srcidx0, destidx0, totncaptures0) if #epats == 0 then return 'pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n', 0 elseif #epats == 1 then return epats[1](srcidx0, destidx0, totncaptures0) else local ncaptures = 0 local pat_code, pat_ncaptures = gen(epats[1], srcidx0, destidx0+1, totncaptures0+ncaptures, true) ncaptures = ncaptures + pat_ncaptures local code = 'local pos' .. destidx0+1 .. ' = pos' .. srcidx0 .. '\n' .. add_whitespace(pat_code, '') for i=2,#epats do local pat_code, pat_ncaptures = gen(epats[i], srcidx0, destidx0+1, totncaptures0+ncaptures, true) ncaptures = ncaptures + pat_ncaptures code = code .. 'if not pos' .. destidx0+1 .. ' then\n' .. add_whitespace(pat_code, ' ') .. 'end\n' end code = code .. 'pos' .. destidx0 .. ' = pos' .. destidx0+1 .. '\n' return code, ncaptures end end return epat end M.alt = alt -- Creates new extended pattern object `epat` that is zero or more repetitions -- of the given pattern object `pat` (if one evaluates to false) or one or more -- (if one evaluates to true). local function star(pat, one) local epat = setmetatable({}, epat_mt) epat.call = function(srcidx0, destidx0, totncaptures0) local ncaptures = 0 local destidx = destidx0 + 1 local code = 'do\n' .. ' local pos' .. destidx .. '=pos' .. srcidx0 .. '\n' local pat_code, pat_ncaptures = gen(pat, destidx, destidx, totncaptures0+ncaptures, true) ncaptures = ncaptures + pat_ncaptures code = code .. (one and (' pos' .. destidx0 .. ' = nil\n') or (' pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n')) .. ' while 1 do\n' .. add_whitespace(pat_code, ' ') .. ' if pos' .. destidx .. ' then\n' .. ' pos' .. destidx0 .. '=pos' .. destidx .. '\n' .. ' else break end\n' .. ' end\n' .. 'end\n' return code, ncaptures end return epat end M.star = star -- Creates new extended pattern object `epat` that is zero or one of the -- given pattern object `epat0`. local function zero_or_one(epat0) local epat = setmetatable({}, epat_mt) epat.call = function(srcidx0, destidx0, totncaptures0) local ncaptures = 0 local destidx = destidx0 + 1 local code = 'do\n' .. ' local pos' .. destidx .. '=pos' .. srcidx0 .. '\n' local epat0_code, epat0_ncaptures = gen(epat0, destidx, destidx, totncaptures0+ncaptures, true) ncaptures = ncaptures + epat0_ncaptures code = code .. add_whitespace(epat0_code) .. ' if pos' .. destidx .. ' then\n' .. ' pos' .. destidx0 .. '=pos' .. destidx .. '\n' .. ' else\n' .. ' pos' .. destidx0 .. '=pos' .. srcidx0 .. '\n' .. ' end\n' .. 'end\n' return code, ncaptures end return epat end M.zero_or_one = zero_or_one -- Gets Lua core code string `code` corresponding to pattern object `epat` local function basic_code_of(epat) local pat_code, ncaptures = epat(1, 2, 0, true) local lvars = code_vars(1, ncaptures) if epat.anchored then local code = 'local pos1=pos or 1\n' .. 'local pos2\n' .. (lvars ~= '' and ' local ' .. lvars .. '\n' or '') .. add_whitespace(pat_code) .. '\n' .. 'if pos2 then return ' .. (lvars ~= '' and lvars or 's:sub(pos1,pos2-1)') .. ' end\n' return code else local code = 'for pos1=(pos or 1),#s do\n' .. ' local pos2\n' if lvars == '' then code = code .. add_whitespace(pat_code, ' ') .. ' if pos2 then return s:sub(pos1,pos2-1) end\n' else code = code .. ' local ' .. lvars .. '\n' .. add_whitespace(pat_code, ' ') .. ' if pos2 then return ' .. lvars .. ' end\n' end code = code .. 'end\n' return code end end M.basic_code_of = basic_code_of -- Gets Lua complete code string `code` corresponding to pattern object `epat`. local function code_of(epat) local code = 'local match = ...\n' .. 'return function(s,pos)\n' .. add_whitespace(basic_code_of(epat), ' ') .. 'end\n' return code end M.code_of = code_of -- Compiles pattern object `epat` to Lua function `f`. local function compile(epat) local code = code_of(epat) if M.debug then print('DEBUG:\n' .. code) end local f = assert(loadstring(code))(match) return f end M.compile = compile -- operator for pattern matching function epat_mt.__call(epat, ...) return epat.call(...) end -- operator for pattern alternation function epat_mt.__add(a_epat, b_epat) return alt(a_epat, b_epat) end -- operator for pattern concatenation function epat_mt.__mul(a_epat, b_epat) return seq(a_epat, b_epat) end -- operator for pattern repetition function epat_mt.__pow(epat, n) if n == 0 then return star(epat) elseif n == 1 then return star(epat, true) elseif n == -1 then return zero_or_one(epat) else error 'FIX - unimplemented' end end -- IMPROVE design? epat_mt.compile = compile epat_mt.basic_code_of = basic_code_of epat_mt.code_of = code_of return M
-- xpattern_test.lua - test suite for xpattern.lua -- utility function: convert list of values to string. local function str(...) local n = select('#', ...) local t = {...} for i=1,n do t[i] = tostring(t[i]) end return table.concat(t, ',') end local M = require "xpattern" local P = M.P M.debug = true -- internal: _count_captures assert(M._count_captures'' == 0) assert(M._count_captures'a' == 0) assert(not pcall(function() M._count_captures'%' end)) assert(M._count_captures'()' == 1) assert(M._count_captures'%(%)' == 0) -- %( assert(M._count_captures'[()]' == 0) -- () inside [] assert(M._count_captures'[%(%)]' == 0) -- %( inside [] assert(M._count_captures'[%]()]' == 0) -- %] inside [] assert(M._count_captures'[]()]' == 1) assert(M._count_captures'%b()' == 0) -- () on %b.. assert(M._count_captures'(()().())' == 4) -- nested -- more complex example assert(M._count_captures'.(.%))[(]%(()' == 2) -- simple matching assert(str(P'':compile()('')) == '') assert(str(P'':compile()('a')) == '') assert(str(P'a':compile()('')) == '') assert(str(P'a':compile()('a')) == 'a') assert(str(P'a':compile()('ba')) == 'a') assert(str(P'a+':compile()('baa')) == 'aa') -- simple anchors assert(str(P'^a+':compile()('aa')) == 'aa') assert(str(P'^a+':compile()('baab')) == '') -- $ fail assert(str(P'a+$':compile()('baa')) == 'aa') assert(str(P'a+$':compile()('baab')) == '') -- $ fail -- simple captures assert(str(P'(a+)(b+)':compile()('baab')) == 'aa,b') assert(str(P'^(a+)(b+)':compile()('baab')) == '') -- simple sequences local m = P():compile() assert(str( m('') ) == '') assert(str( m('a') ) == '') local m = P(''):compile() assert(str( m('') ) == '') assert(str( m('a') ) == '') local m = P('a', 'b', 'c'):compile() assert(str( m('.a.') ) == '') assert(str( m('.abc.') ) == 'abc') local m = (P'a' * P'b' * P'c'):compile() -- identical assert(str( m('.a.') ) == '') assert(str( m('.abc.') ) == 'abc') local m = P(P'a', 'b', P'c'):compile() -- identical assert(str( m('.a.') ) == '') assert(str( m('.abc.') ) == 'abc') local m = P(P'a+', 'b+', P'c+'):compile() assert(str( m('.abaabcc.') ) == 'aabcc') -- simple alternation local m = (P'aa+' + P'bb+'):compile() assert(str( m('abbaa') ) == 'bb') local m = (P'aa+' + P'bb+' + P'cc+'):compile() assert(str( m('abccaa') ) == 'cc') -- simple combinations local m = ((P'(a+)' + P'b(b*)') * P'(c+)()'):compile() assert(str( m("aacccdd")) == 'aa,nil,ccc,6') assert(str( m("bbcccdd")) == 'nil,b,ccc,6') assert(str( m("bbdd")) == '') print('?'..str( m("aabbcc"))) assert(str( m("aabbcc")) == 'nil,b,cc,7') -- alternative -- simple replication (*) local m = ( P'a'^0 ):compile() assert(str(m'') == '') assert(str(m'a') == 'a') assert(str(m'aab') == 'aa') -- replication (*) local m = ( (P'a+' + P'b+')^0 ):compile() assert(str(m'zabaabbc') == '') assert(str(m'abaabb') == 'abaabb') local m = ( (P'a+' * P'b+' + P'c+' * P'd+')^0 ):compile() assert(str(m'aabbccddaa') == 'aabbccdd') local m = ( P'aa'^0 * P'bb' * P'aa'^0 ):compile() assert(str(m'aaccaaaabbaa') == 'aaaabbaa') -- simple replication (+) local m = ( P'a'^1 ):compile() assert(str(m'') == '') assert(str(m'a') == 'a') assert(str(m'aab') == 'aa') -- replacation (+) local m = ( P'b' * P'a'^1 ):compile() print(m'b') assert(str(m'b') == '') assert(str(m'ba') == 'ba') assert(str(m'baab') == 'baa') -- simple replication (?) local m = ( P'a'^-1 ):compile() assert(str(m'') == '') assert(str(m'a') == 'a') assert(str(m'aab') == 'a') -- replication (?) local m = ( P'c' * (P'a+' + P'b+')^-1 ):compile() assert(str(m'caabb') == 'caa') -- Some of these examples from Mastering Regular Expressions (MRE), -- 2nd Ed. Jeffrey .Friedl. -- MRE p.19 local m = ( P'^' * (P'From' + P'Subject' + P'Date') * P':%s*(.*)' ):compile() assert(str(m('Subject: test')) == 'test') -- MRE p.13 local m = ( (P'Geo' + P'Je') * P'ff' * (P're' + P'er') * P'y' ):compile() assert(str(m'Jeffrey') == 'Jeffrey') assert(str(m'Jeffery') == 'Jeffery') assert(str(m'Geoffrey') == 'Geoffrey') assert(str(m'Geoffery') == 'Geoffery') assert(str(m'Jefery') == '') assert(str(m'Geofferi') == '') assert(str(m'GeoffrezGeoffery') == 'Geoffery') -- skips assert(str(m'JefferzGeoffery') == 'Geoffery') -- skips assert(str(m'GeoffJeffery') == 'Jeffery') -- skips -- MRE p.24 local m = ( P'%$[0-9]+' * P'%.[0-9][0-9]'^-1 ):compile() assert(str(m'$20.00') == '$20.00') assert(str(m'$20') == '$20') assert(str(m'$20.00.00') == '$20.00') -- example print 'example' local M = require "xpattern" local P = M.P local m = ( (P'(b+)' + P'(c+)') * P'[A-Z][a-z]'^0 * P'(.)()' ):compile() local a,b,c,d = m('mmcccZzYybbZzYyddd') -- match c not b first assert(a == nil and b == 'ccc' and c == 'b' and d == 11) -- example local m = P('foo', P'bar'+P'baz', 'qux'):compile() assert(str(m'afoobazfoobarquxbar', 'foobarqux')) local m = P('^foo', P'bar'+P'baz', 'qux'):compile() -- anchored assert(str(m'afoobazfoobarquxbar', '')) assert(str(m'foobarquxbar', '')) -- http://lua-users.org/lists/lua-l/2009-10/msg00752.html local m = ( P'^' * ( ( P'ceil'+P'abs' +P'floor'+P'mod' +P'exp'+P'log'+P'pow'+ P'sqrt'+P'acos'+P'asin' +P'atan'+P'cos'+P'sin'+P'tan'+ P'deg' +P'rad' +P'random' ) * P'%(' + P'[0-9%(%)%-%+%*%/%.%,]' + P'pi' )^1 * P'$' ):compile() assert(m'cos(1+pi)' == 'cos(1+pi)') assert(m'cos(1+p)' == nil) -- 'p' assert(m'cos(12.3/2)+mod(2,3)' == 'cos(12.3/2)+mod(2,3)') assert(m'cos(12.3/2)+mod(2,3) ' == nil) -- ' ' assert(m' cos(12.3/2)+mod(2,3)' == nil) -- ' ' assert(m'cos(12.3/2)+mod+2' == nil) -- no '(' print 'DONE'
Changes: