Yet Another Pure Lua XML Parser
At work, I needed a pure-lua XML parser that produced results
similar to the LOM parser included with LuaExpat. I wrote one,
stealing ideas from LOM and the pure-lua parser seen on the Wiki.
What makes mine special?
* Strips out comments and processing directives
* Unescapes XML entities (&amp;, &lt;, &gt;, &quot;, &apos;) in text and attributes
* (optionally) Strips leading/trailing whitespace from text elements
* (optionally) Creates an 'innertext' property that holds the
concatenation of all child text for each element.
* (optionally) Allows access to child elements by name:
  * mylomnode.elementname -- gives the first child element with that name
  * mylomnode.elements.elementname -- gives an array of all children with that name
* Sort of handles CDATA sections (doesn't strip whitespace from
them, but DOES incorrectly translate entities inside them)
* Provides a method for giving single-element string representations
Simple usage:
local theRoot = AKLOM.parse( '<root><message string="Hello World"/></root>' )
print( theRoot.message.attr.string )
local theRoot = AKLOM.parse( '<root><message>Hello <null/> World</message></root>' )
print( theRoot[1][1] )
print( theRoot.message.innertext )
I include the code below for your review and comments.
-- Capture every standard-library name this file uses BEFORE calling module().
-- module() swaps the chunk's environment for the new (empty) module table, so
-- after that point globals such as string, table, assert, error and pairs are
-- no longer reachable by name; the locals below stay visible as upvalues.
local string, table = string, table
local assert, error, pairs = assert, error, pairs
local sub, gsub, find, push, pop = string.sub, string.gsub,
  string.find, table.insert, table.remove

module 'AKLOM'

-- Creates the 'elements' collection and named access to the first child element
useElementCollectionFlag = true

-- Creates an 'innertext' property that is the sum of all text objects
useInnerTextFlag = true

-- Strips all leading/trailing whitespace between nodes and text
stripWhitespaceFlag = true
--- Replaces the five predefined XML entities with their literal characters.
-- Handles &lt; &gt; &quot; &apos; and &amp;. Numeric character references
-- (such as &#38;) are left untouched.
-- @param inString  text or attribute value that may contain entities
-- @return          the decoded string (exactly one value)
function unescape( inString )
  inString = gsub( inString, '&lt;', '<' )
  inString = gsub( inString, '&gt;', '>' )
  inString = gsub( inString, '&quot;', '"' )
  inString = gsub( inString, '&apos;', "'" )
  -- &amp; must be decoded last, otherwise '&amp;lt;' would first become
  -- '&lt;' and then wrongly turn into '<'.
  -- The extra parentheses drop gsub's second result (the match count) so
  -- callers such as table.insert( t, unescape( s ) ) receive a single value.
  return ( gsub( inString, '&amp;', '&' ) )
end
--- Parses an XML string into a LOM-style tree of nested tables.
--
-- Each element becomes a table with:
--   name      the tag name
--   attr      table mapping attribute names to unescaped values
--   [1..n]    children in document order (element tables and text strings)
--   innertext (when useInnerTextFlag) concatenation of the element's direct
--             text children; nil when there is no text or the tag is empty
--   elements  (when useElementCollectionFlag) map of child name -> array of
--             children with that name; the first child with a given name is
--             also stored directly under that name on the parent, unless the
--             key is already taken
--
-- Comments and processing instructions are removed before parsing. CDATA
-- markers are removed as well, so CDATA content is treated like ordinary text
-- (entities inside it are still translated and markup inside it is parsed).
-- Text outside the root element is ignored. Parsing stops once the root
-- element has been closed.
--
-- @param inXMLString  the XML source text
-- @return             the root element table, or nil if no element was found
-- Raises an error when a close tag does not match the open element, when a
-- close tag appears with no element open, or when input ends with elements
-- still open.
function parse( inXMLString )
  -- Throw out SGML comments and processing directives
  inXMLString = gsub( inXMLString, '<!%-%-.-%-%->', '' )
  inXMLString = gsub( inXMLString, '<%?.-%?>', '' )

  if stripWhitespaceFlag then
    -- Throw out leading and trailing whitespace in text blocks
    inXMLString = gsub( inXMLString, '>%s+', '>' )
    inXMLString = gsub( inXMLString, '%s+<', '<' )
  end

  -- Drop the CDATA markers but keep their content (done after the whitespace
  -- pass so whitespace inside a CDATA section is preserved).
  inXMLString = gsub( inXMLString, '<!%[CDATA%[', '' )
  inXMLString = gsub( inXMLString, '%]%]>', '' )

  local theDoc = useElementCollectionFlag and { elements={} } or { }
  local theCurrentElement = theDoc
  -- Stack of open elements. Depth is taken with '#': an 'n' field is not
  -- maintained by table.insert/table.remove in Lua 5.1 and later.
  local theStack = { }
  -- First character not yet consumed; also where pending text begins.
  local thePos = 1
  local theStart, theEnd, theClose, theName, theAttr, theEmpty
  local theLeadingText

  while true do
    -- Element names accept the same characters as attribute names below.
    theStart, theEnd, theClose, theName, theAttr, theEmpty = find( inXMLString, '<(%/?)([%a_:][%w._:-]*)(.-)(%/?)>', thePos )
    if not theStart then break end

    local theIsParentFlag = ( theEmpty == '' )

    -- Text is only recorded inside an element; anything before the root
    -- would otherwise land in theDoc and be returned in place of the root.
    if theCurrentElement ~= theDoc then
      theLeadingText = unescape( sub( inXMLString, thePos, theStart - 1 ) )
      if theLeadingText ~= '' then
        push( theCurrentElement, theLeadingText )
        if useInnerTextFlag then
          theCurrentElement.innertext = theCurrentElement.innertext .. theLeadingText
        end
      end
    end

    thePos = theEnd + 1

    if theClose ~= '' then
      assert( #theStack > 0, "Found close element '"..theName.."' but no element is open" )
      if useInnerTextFlag and theCurrentElement.innertext == '' then
        theCurrentElement.innertext = nil
      end
      theCurrentElement = pop( theStack )
      assert( theName == theCurrentElement.name, "Found close element '"..theName.."', expected '"..theCurrentElement.name.."'" )
      theCurrentElement = theStack[ #theStack ]
      -- An empty stack means the root element was just closed: finished.
      if not theCurrentElement then break end
    else
      local theElement = {
        name = theName,
        attr = {},
        elements = useElementCollectionFlag and {} or nil,
        innertext = ( useInnerTextFlag and theIsParentFlag ) and '' or nil
      }

      -- Parse the attribute string (gsub is used only for its callback)
      gsub(
        theAttr,
        '([%a_:][%w._:-]*)%s*=%s*([\'"])(.-)%2',
        function( inAttName, _, inAttValue )
          theElement.attr[ inAttName ] = unescape( inAttValue )
        end
      )

      -- Add the element to the parent
      push( theCurrentElement, theElement )
      if useElementCollectionFlag then
        if not theCurrentElement[ theName ] then
          theCurrentElement[ theName ] = theElement
        end
        if not theCurrentElement.elements[ theName ] then
          theCurrentElement.elements[ theName ] = {}
        end
        push( theCurrentElement.elements[ theName ], theElement )
      end

      if theIsParentFlag then
        push( theStack, theElement )
        theCurrentElement = theElement
      end
    end
  end

  if #theStack > 0 then
    error( "AKLOM parsing ended early; I was still inside the '"..theStack[ #theStack ].name.."' element." )
  end

  return theDoc[ 1 ]
end
--- Builds a one-line description of a single LOM element.
-- The result looks like an opening tag: the element name, the number of
-- children in parentheses, then each attribute as name="value". Children are
-- not rendered, attribute values are not re-escaped, and attribute order
-- follows pairs(), so it is unspecified.
-- @param inLOM  element table with 'name' and 'attr' fields, as built by parse
-- @return       string such as '<msg (2 children) id="7">'
function lomstring( inLOM )
  -- '#' replaces table.getn, which is deprecated in Lua 5.1 and removed in 5.2
  local theOutput = "<" .. inLOM.name .. " (" .. #inLOM .. " children)"
  for k,v in pairs( inLOM.attr ) do
    theOutput = theOutput .. ' ' .. k .. '="' .. v .. '"'
  end
  return theOutput .. '>'
end