String Trim |
|
-- trim implementations function trim1(s) return (s:gsub("^%s*(.-)%s*$", "%1")) end -- from PiL2 20.4 function trim2(s) return s:match "^%s*(.-)%s*$" end -- variant of trim1 (match) function trim3(s) return s:gsub("^%s+", ""):gsub("%s+$", "") end -- two gsub's function trim4(s) return s:match"^%s*(.*)":match"(.-)%s*$" end -- variant of trim3 (match) function trim5(s) return s:match'^%s*(.*%S)' or '' end -- warning: has bad performance when s:match'^%s*$' and #s is large function trim6(s) return s:match'^()%s*$' and '' or s:match'^%s*(.*%S)' end -- fixes performance problem in trim5. -- note: the '()' avoids the overhead of default string capture. -- This overhead is small, ~ 10% for successful whitespace match call -- alone, and may not be noticeable in the overall benchmarks here, -- but there's little harm either. Instead replacing the first `match` -- with a `find` has a similar effect, but that requires localizing -- two functions in the trim7 variant below. local match = string.match function trim7(s) return match(s,'^()%s*$') and '' or match(s,'^%s*(.*%S)') end -- variant of trim6 (localize functions) local find = string.find local sub = string.sub function trim8(s) local i1,i2 = find(s,'^%s*') if i2 >= i1 then s = sub(s,i2+1) end local i1,i2 = find(s,'%s*$') if i2 >= i1 then s = sub(s,1,i1-1) end return s end -- based on penlight 0.7.2 function trim9(s) local _, i1 = find(s,'^%s*') local i2 = find(s,'%s*$') return sub(s, i1 + 1, i2 - 1) end -- simplification of trim8 function trim10(s) local a = s:match('^%s*()') local b = s:match('()%s*$', a) return s:sub(a,b-1) end -- variant of trim9 (match) function trim11(s) local n = s:find"%S" return n and s:match(".*%S", n) or "" end -- variant of trim6 (use n position) -- http://lua-users.org/lists/lua-l/2009-12/msg00904.html function trim12(s) local from = s:match"^%s*()" return from > #s and "" or s:match(".*%S", from) end -- variant of trim11 (performs better for all -- whitespace string). See Roberto's comments -- on ^%s*$" v.s. "%S" performance: -- http://lua-users.org/lists/lua-l/2009-12/msg00921.html do local lpeg = require("lpeg") local space = lpeg.S' \t\n\v\f\r' local nospace = 1 - space local ptrim = space^0 * lpeg.C((space^0 * nospace^1)^0) local match = lpeg.match function trim13(s) return match(ptrim, s) end end -- lpeg. based on http://lua-users.org/lists/lua-l/2009-12/msg00921.html do local lpeg = require("lpeg") local re = require("re") local ptrim = re.compile"%s* {(%s* %S+)*}" local match = lpeg.match function trim14(s) return match(ptrim, s) end end -- variant with re module. require 'trim' local trim15 = trim -- C implementation (see separate trim.c file) -- test utilities local function trimtest(trim) assert(trim'' == '') assert(trim' ' == '') assert(trim' ' == '') assert(trim'a' == 'a') assert(trim' a' == 'a') assert(trim'a ' == 'a') assert(trim' a ' == 'a') assert(trim' a ' == 'a') assert(trim' ab cd ' == 'ab cd') assert(trim' \t\r\n\f\va\000b \r\t\n\f\v' == 'a\000b') end local function perftest(f, s) local time = os.clock -- os.time or os.clock local t1 = time() for i=1,100000 do f(s) f(s) f(s) f(s) f(s) f(s) f(s) f(s) f(s) f(s) end local dt = time() - t1 io.stdout:write(("%4.1f"):format(dt) .. " ") end local trims = {trim1, trim2, trim3, trim4, trim5, trim6, trim7, trim8, trim9, trim10, trim11, trim12, trim13, trim14, trim15} -- correctness tests for _,trim in ipairs(trims) do trimtest(trim) end -- performance tests for j = 1, 3 do for i,trim in ipairs(trims) do io.stdout:write(string.format("%2d",i) .. ": ") perftest(trim, "") perftest(trim, "abcdef") perftest(trim, " abcdef ") perftest(trim, "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef") perftest(trim, " a b c d e f g h i j k l m n o p q r s t u v w x y z A B C ") perftest(trim, " a ") perftest(trim, " ") print() end end
Optional C module for test15 based on LuaList:2009-12/msg00951.html:
/* trim.c - based on http://lua-users.org/lists/lua-l/2009-12/msg00951.html from Sean Conner */ #include <stddef.h> #include <ctype.h> #include <lua.h> #include <lauxlib.h> int trim(lua_State* L) { const char* front; const char* end; size_t size; front = luaL_checklstring(L,1,&size); end = &front[size - 1]; while (size && isspace((unsigned char)*front)) { size--; front++; } while (size && isspace((unsigned char)*end)) { size--; end--; } lua_pushlstring(L,front,(size_t)(end - front) + 1); return 1; } int luaopen_trim(lua_State* L) { lua_register(L, "trim", trim); return 0; }
Test results (lower numbers = faster):
Lua 5.1.4/Cygwin1.7 1: 0.9 1.9 2.1 9.6 10.3 2.2 2.0 2: 0.7 1.6 1.7 8.9 10.0 2.0 1.8 3: 1.0 1.3 1.7 3.8 6.4 2.6 2.1 4: 1.2 2.2 2.2 10.1 11.2 2.7 2.2 5: 0.6 0.9 1.1 1.2 1.3 2.8 77.6 6: 0.6 1.2 1.6 1.6 1.8 4.1 1.7 7: 0.6 1.1 1.5 1.5 1.6 3.9 1.6 8: 1.0 1.7 2.5 7.5 9.7 3.0 2.4 9: 1.2 2.0 2.7 8.0 9.3 21.2 3.4 10: 1.5 2.4 2.6 9.8 10.8 2.8 2.6 11: 0.5 1.2 1.5 1.6 1.7 3.5 2.5 12: 0.7 1.3 1.6 1.7 1.8 3.0 1.8 13: 0.8 0.9 1.0 1.3 2.5 1.1 1.0 14: 0.8 0.9 1.0 1.3 2.5 1.1 1.0 15: 0.2 0.2 0.2 0.4 0.4 0.3 0.3 1: 0.9 1.9 2.0 9.6 10.3 2.2 1.9 2: 0.7 1.6 1.8 8.9 10.0 2.0 1.8 3: 1.0 1.3 1.7 3.8 6.4 2.6 2.1 4: 1.1 2.2 2.2 10.1 11.4 2.7 2.2 5: 0.6 0.9 1.2 1.2 1.2 2.8 78.2 6: 0.6 1.2 1.7 1.6 1.8 4.2 1.7 7: 0.6 1.1 1.5 1.5 1.7 3.9 1.6 8: 1.0 1.7 2.5 7.5 9.6 3.1 2.3 9: 1.2 2.0 2.7 8.0 9.3 21.1 3.3 10: 1.5 2.4 2.5 9.8 10.8 2.8 2.5 11: 0.5 1.2 1.5 1.6 1.7 3.5 2.5 12: 0.7 1.3 1.6 1.7 1.8 3.0 1.8 13: 0.8 0.9 1.0 1.3 2.4 1.1 1.0 14: 0.8 0.9 1.0 1.3 2.5 1.1 1.0 15: 0.2 0.2 0.2 0.4 0.4 0.3 0.3 1: 0.9 1.9 2.0 9.6 10.3 2.2 2.0 2: 0.7 1.6 1.7 8.9 10.0 2.0 1.8 3: 1.0 1.3 1.7 3.8 6.4 2.6 2.1 4: 1.1 2.2 2.2 10.1 11.2 2.7 2.2 5: 0.6 0.9 1.2 1.2 1.3 2.8 77.3 6: 0.6 1.2 1.7 1.6 1.8 4.2 1.7 7: 0.6 1.1 1.5 1.5 1.7 3.9 1.6 8: 1.0 1.7 2.6 7.4 9.6 3.1 2.3 9: 1.2 2.0 2.7 8.0 9.3 21.1 3.3 10: 1.5 2.4 2.6 9.8 10.8 2.8 2.6 11: 0.5 1.2 1.5 1.6 1.7 3.5 2.5 12: 0.7 1.3 1.6 1.6 1.8 3.0 1.8 13: 0.8 0.9 1.0 1.3 2.5 1.1 1.0 14: 0.8 0.9 1.0 1.3 2.5 1.1 1.0 15: 0.2 0.2 0.2 0.4 0.4 0.3 0.3 LuaJIT 2.0.0-beta2/Cygwin1.7 1: 0.6 1.5 1.5 7.7 8.3 1.3 1.2 2: 0.4 1.2 1.2 7.1 7.8 1.1 1.0 3: 0.6 1.0 1.2 3.1 4.9 1.7 1.3 4: 0.8 1.6 1.8 8.4 9.0 1.9 1.3 5: 0.4 0.6 0.8 1.2 1.2 2.3 99.2 6: 0.4 0.9 1.1 1.5 1.5 3.2 0.9 7: 0.3 0.8 1.1 1.4 1.5 3.0 0.8 8: 0.6 1.2 1.6 5.3 6.8 1.7 1.3 9: 0.7 1.2 1.8 5.6 6.7 14.4 1.7 10: 0.9 1.6 1.7 7.6 8.3 1.5 1.5 11: 0.3 0.8 1.1 1.4 1.5 2.9 2.1 12: 0.4 0.9 1.1 1.5 1.5 2.7 0.9 13: 0.6 0.7 0.7 1.0 1.4 0.8 0.7 14: 0.6 0.7 0.7 1.0 1.4 0.8 0.8 15: 0.1 0.1 0.1 0.3 0.3 0.2 0.2 ...
The speed of trim5
is relatively favorable or competitive over the data set except it is quite poor in the case of a long string with only whitespace. The use of ".*
" (rather than a non-greedy ".-
") quickly plows to the end of a long string, and the "%S
" then triggers backtracking to avoid the trailing whitespace, but the juxtaposition of "%s*
" and ".*
" imposes substantial backtracking (O(N^2)) if no "%S
" match is found. trim6
handles the worst cast condition specially and behaves well over the entire data set. trim7
is a variant that localizes functions, which gives a bit larger code size and does not give a substantial speed improvement. (On further testing, each localization gives maybe 10% in best case involving empty string data and inlining the function call, but here there are two calls to match
). trim11
is also a variant of trim6
with similar performance characteristics, perhaps slightly better, except speed is about half in the case of a long string with only whitespace (this is fixed in trim12
). trim13
(or its variant trim14
) is an lpeg (LuaPeg) implementation, which generally meets or exceeds the performance of the best pattern matching implementations, particularly exceeding in the case of lots of whitespace, but it is a bit slower in its worst case condition of alternating space and whitespace. The C implementation (trim15
) is by far the fastest over the entire data set.
Reusing the same strings across all test iterations, as done above, might not properly gauge the effect of temporary string creation due to Lua's string interning. A quick test with local ss = {}; for i = 1,80000 do ss[i] = " " .. (" "..i):rep(10) .. " " end
as a string cache doesn't seem to affect the basic conclusions above.