[Date Prev][Date Next][Thread Prev][Thread Next]
[Date Index]
[Thread Index]
- Subject: Re: What would be a good representation of XML tree as Lua tables?
- From: Enrico Colombini <erix@...>
- Date: Sun, 21 Aug 2016 17:09:56 +0200
On 21-Aug-16 16:45, Marc Balmer wrote:
I am working on code to easily create XML trees from Lua tables. What would,
in your opinion, be a comfortable Lua table notation of XML trees, expressed
as Lua tables?
Some years ago I wrote a minimalist, gmatch-based XML parser, ~50 lines
apart from testing code, to read .ods (Open Document Spreadsheet) files.
I never published it because it had limitations and I did not have the
time to finish the job properly beyond my immediate needs, but I attach
it here in case its approach to table organization (line 61) could have
something useful for your design... perhaps just to see what is best
avoided ;-)
The parser both builds an XML tree and calls an external handler (if
any) when tags are encountered.
('+++' comments mean 'work in progress')
P.S. I put this code (from 2008) under the Lua license.
--
Enrico
-- demo/test driver: read a whole XML file, parse it with handlers that
-- print every start tag (with its attributes), end tag and data chunk,
-- wait for a line on standard input, then print the resulting tree.
-- raises an error (with the system message) if the file cannot be opened.
function Test()
  local fname = 'spazi.xml' -- +++ test +++
  -- fname = 'content.xml'

  -- assert() turns io.open's (nil, message) failure result into a readable
  -- error instead of a later "attempt to index a nil value"
  local f = assert(io.open(fname, 'r'))
  local txt = f:read('*a')
  f:close()

  -- print the element name and each attribute/value pair
  local startf = function(element)
    print('start: ' .. element.name)
    for attr, val in pairs(element.attrib) do
      print(' ' .. attr .. ' = ' .. val)
    end
  end

  local endf = function(element)
    print('end: ' .. element.name)
  end

  -- element.data holds the chunk that has just been stored by Parse
  local dataf = function(element)
    print('data: ' .. element.data)
  end

  local root = Parse(txt, startf, endf, dataf)
  io.read() -- pause until a line is entered before dumping the tree
  PrintTree(root) -- +++
end
---------------------------------------------------------------------------
-- todo: +++
-- handle CDATA (put away in table, get them later?)
-- note: space in character data is preserved (not really standard)
-- note: currently only latest data chunk is preserved in tree
-- numeric chars for escape currently not supported
-- handle comments?
-- numeric escape (e.g. &#160;) not supported because of UTF-8,
-- can be handled at content level
-- Print (use write)
-- test/demo files
-- +++ data --> text
-- +++ multiple text: use array for children & data, select on type
-- +++ children/data order is not preserved, use handlers +++
-- +++ is this a design problem?
---------------------------------------------------------------------------
-- parse xml text, build element tree,
-- call startTagHandler(element), if any, at every start or empty tag,
-- call endTagHandler(element), if any, at every end or empty tag,
-- call dataHandler(element), if any, at every character data chunk,
-- return root element of the created element tree:
--
-- element = {
--   parent = parent (nil for root)
--   name = name
--   data = character data (as string, latest chunk only)
--   attrib = { attr=val, attr=val, ... }
--   [1] = child element
--   [2] = child element
--   [n] = child element
-- }
--
-- the returned root is a synthetic element named 'root'; the elements found
-- in the text are its children.
-- all three handlers are optional (pass nil to skip).
-- raises an error (via assert) on an end tag that does not match the
-- currently open element, or on an end tag with no open element.
function Parse(xmlText, startTagHandler, endTagHandler, dataHandler)
  -- unescape function (numeric escape currently not supported);
  -- keys are whole entities, exactly as captured by the gsub pattern below,
  -- unknown entities are left untouched (table lookup yields nil)
  local escapeTable = {
    ['&amp;']='&', ['&lt;']='<', ['&gt;']='>', ['&apos;']="'", ['&quot;']='"'
  }
  local function unescape(s)
    -- outer parentheses drop gsub's second result (the substitution count)
    return (string.gsub(s, '(%&%a+%;)', escapeTable))
  end

  -- start with root element
  local root = { parent=nil, name='root', data=nil, attrib={} }
  local currentElement = root -- local: must not leak into the global table

  -- get (<startTag attributes> | <emptyTag/> | </endTag>) and following data (if any)
  local namePattern = '%a[%w%.%-%_%:%&%;]*' -- (primitive but enough for basic parsing)
  local tagPattern = '%<([%/]?)(' .. namePattern .. ')(.-)([%/]?)%>([^%<]*)' -- (5 captures)
  local attributePattern = '(' .. namePattern .. ')%s*%=%s*([\'\"])(.-)%2' -- (3 captures)

  for endTagChar, name, attributes, emptyTagChar, data in string.gmatch(xmlText, tagPattern) do
    local isEndTag = (endTagChar ~= '')
    local isEmptyTag = (emptyTagChar ~= '')
    assert(not (isEndTag and isEmptyTag), 'endTag-emptyTag conflict in: ' .. name)

    -- handle start tag or empty tag
    if (not isEndTag) then
      -- create a new child element, add it to current element
      local newElement = { parent=currentElement, name=name, data=nil, attrib={} }
      currentElement[#currentElement + 1] = newElement
      -- move to the new element
      currentElement = newElement
      -- add attributes and their values to current element
      if attributes ~= '' then
        local attribTable = currentElement.attrib
        -- second capture is the quote character, only needed by the pattern
        for attr, _, val in string.gmatch(attributes, attributePattern) do
          attribTable[attr] = unescape(val)
        end
      end
      -- call start tag handler, if any
      if startTagHandler then
        startTagHandler(currentElement)
      end
    end

    -- handle close tag or empty tag
    if (isEndTag or isEmptyTag) then
      -- the synthetic root can never be closed: an end tag seen while no
      -- element is open is as inconsistent as a mismatched name
      assert((currentElement ~= root) and (name == currentElement.name),
        'inconsistent startTag/endTag in :' .. name)
      -- call end tag handler, if any
      if endTagHandler then
        endTagHandler(currentElement)
      end
      -- return to parent element
      currentElement = currentElement.parent
    end

    -- if any character data, store it and call data handler, if any
    -- (note: only last data chunk is stored, use handlers to get full data)
    if data and (data ~= '') then
      currentElement.data = unescape(data)
      if dataHandler then
        dataHandler(currentElement)
      end
    end
  end

  return root
end
---------------------------------------------------------------------------
-- recursively print element tree from given element
-- with optional indent step (default 2) and initial indent (default 0);
-- output goes through io.write, one item per line:
-- the open (or empty) tag, '-attr=val' for each attribute, the latest
-- data chunk (if any), the children, then the close tag.
-- an element is printed as an empty tag only when it has no attributes,
-- no children and no character data.
function PrintTree(element, indentStep, indent)
  indentStep = indentStep or 2
  indent = indent or 0
  local spc = string.rep(' ', indent)

  -- print as empty tag only if there is nothing to show inside the element;
  -- data counts as content, otherwise it would follow a '<name/>' line
  -- without any matching close tag
  local emptyTag = (next(element.attrib) == nil)
    and (#element == 0)
    and (element.data == nil)

  -- show open tag or empty tag
  if emptyTag then
    io.write(spc, '<', element.name, '/>\n')
  else
    io.write(spc, '<', element.name, '>\n')
  end

  -- show attributes-value pairs (order is unspecified, as with any pairs loop)
  for attr, val in pairs(element.attrib) do
    io.write(spc, '-', attr, '=', val, '\n')
  end

  -- show (latest) data
  if element.data then
    io.write(spc, element.data, '\n')
  end

  -- show children, one indent step deeper
  for _, child in ipairs(element) do
    PrintTree(child, indentStep, indent + indentStep)
  end

  -- show close tag unless the element was printed as an empty tag
  if not emptyTag then
    io.write(spc, '</', element.name, '>\n')
  end
end
---------------------------------------------------------------------------
-- run the demo as soon as this file is executed
Test()