字串 Trim |
|
-- trim implementations function trim1(s) return (s:gsub("^%s*(.-)%s*$", "%1")) end -- from PiL2 20.4 function trim2(s) return s:match "^%s*(.-)%s*$" end -- variant of trim1 (match) function trim3(s) return s:gsub("^%s+", ""):gsub("%s+$", "") end -- two gsub's function trim4(s) return s:match"^%s*(.*)":match"(.-)%s*$" end -- variant of trim3 (match) function trim5(s) return s:match'^%s*(.*%S)' or '' end -- warning: has bad performance when s:match'^%s*$' and #s is large function trim6(s) return s:match'^()%s*$' and '' or s:match'^%s*(.*%S)' end -- fixes performance problem in trim5. -- note: the '()' avoids the overhead of default string capture. -- This overhead is small, ~ 10% for successful whitespace match call -- alone, and may not be noticeable in the overall benchmarks here, -- but there's little harm either. Instead replacing the first `match` -- with a `find` has a similar effect, but that requires localizing -- two functions in the trim7 variant below. local match = string.match function trim7(s) return match(s,'^()%s*$') and '' or match(s,'^%s*(.*%S)') end -- variant of trim6 (localize functions) local find = string.find local sub = string.sub function trim8(s) local i1,i2 = find(s,'^%s*') if i2 >= i1 then s = sub(s,i2+1) end local i1,i2 = find(s,'%s*$') if i2 >= i1 then s = sub(s,1,i1-1) end return s end -- based on penlight 0.7.2 function trim9(s) local _, i1 = find(s,'^%s*') local i2 = find(s,'%s*$') return sub(s, i1 + 1, i2 - 1) end -- simplification of trim8 function trim10(s) local a = s:match('^%s*()') local b = s:match('()%s*$', a) return s:sub(a,b-1) end -- variant of trim9 (match) function trim11(s) local n = s:find"%S" return n and s:match(".*%S", n) or "" end -- variant of trim6 (use n position) -- https://lua-users.dev.org.tw/lists/lua-l/2009-12/msg00904.html function trim12(s) local from = s:match"^%s*()" return from > #s and "" or s:match(".*%S", from) end -- variant of trim11 (performs better for all -- whitespace string). See Roberto's comments -- on ^%s*$" v.s. "%S" performance: -- https://lua-users.dev.org.tw/lists/lua-l/2009-12/msg00921.html do local lpeg = require("lpeg") local space = lpeg.S' \t\n\v\f\r' local nospace = 1 - space local ptrim = space^0 * lpeg.C((space^0 * nospace^1)^0) local match = lpeg.match function trim13(s) return match(ptrim, s) end end -- lpeg. based on https://lua-users.dev.org.tw/lists/lua-l/2009-12/msg00921.html do local lpeg = require("lpeg") local re = require("re") local ptrim = re.compile"%s* {(%s* %S+)*}" local match = lpeg.match function trim14(s) return match(ptrim, s) end end -- variant with re module. require 'trim' local trim15 = trim -- C implementation (see separate trim.c file) -- test utilities local function trimtest(trim) assert(trim'' == '') assert(trim' ' == '') assert(trim' ' == '') assert(trim'a' == 'a') assert(trim' a' == 'a') assert(trim'a ' == 'a') assert(trim' a ' == 'a') assert(trim' a ' == 'a') assert(trim' ab cd ' == 'ab cd') assert(trim' \t\r\n\f\va\000b \r\t\n\f\v' == 'a\000b') end local function perftest(f, s) local time = os.clock -- os.time or os.clock local t1 = time() for i=1,100000 do f(s) f(s) f(s) f(s) f(s) f(s) f(s) f(s) f(s) f(s) end local dt = time() - t1 io.stdout:write(("%4.1f"):format(dt) .. " ") end local trims = {trim1, trim2, trim3, trim4, trim5, trim6, trim7, trim8, trim9, trim10, trim11, trim12, trim13, trim14, trim15} -- correctness tests for _,trim in ipairs(trims) do trimtest(trim) end -- performance tests for j = 1, 3 do for i,trim in ipairs(trims) do io.stdout:write(string.format("%2d",i) .. ": ") perftest(trim, "") perftest(trim, "abcdef") perftest(trim, " abcdef ") perftest(trim, "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef") perftest(trim, " a b c d e f g h i j k l m n o p q r s t u v w x y z A B C ") perftest(trim, " a ") perftest(trim, " ") print() end end
針對 test15 的選擇性 C 模組,根據 LuaList:2009-12/msg00951.html 衍生而成
/* trim.c - based on https://lua-users.dev.org.tw/lists/lua-l/2009-12/msg00951.html from Sean Conner */ #include <stddef.h> #include <ctype.h> #include <lua.h> #include <lauxlib.h> int trim(lua_State* L) { const char* front; const char* end; size_t size; front = luaL_checklstring(L,1,&size); end = &front[size - 1]; while (size && isspace((unsigned char)*front)) { size--; front++; } while (size && isspace((unsigned char)*end)) { size--; end--; } lua_pushlstring(L,front,(size_t)(end - front) + 1); return 1; } int luaopen_trim(lua_State* L) { lua_register(L, "trim", trim); return 0; }
測試結果(數字越小表示越快)
Lua 5.1.4/Cygwin1.7 1: 0.9 1.9 2.1 9.6 10.3 2.2 2.0 2: 0.7 1.6 1.7 8.9 10.0 2.0 1.8 3: 1.0 1.3 1.7 3.8 6.4 2.6 2.1 4: 1.2 2.2 2.2 10.1 11.2 2.7 2.2 5: 0.6 0.9 1.1 1.2 1.3 2.8 77.6 6: 0.6 1.2 1.6 1.6 1.8 4.1 1.7 7: 0.6 1.1 1.5 1.5 1.6 3.9 1.6 8: 1.0 1.7 2.5 7.5 9.7 3.0 2.4 9: 1.2 2.0 2.7 8.0 9.3 21.2 3.4 10: 1.5 2.4 2.6 9.8 10.8 2.8 2.6 11: 0.5 1.2 1.5 1.6 1.7 3.5 2.5 12: 0.7 1.3 1.6 1.7 1.8 3.0 1.8 13: 0.8 0.9 1.0 1.3 2.5 1.1 1.0 14: 0.8 0.9 1.0 1.3 2.5 1.1 1.0 15: 0.2 0.2 0.2 0.4 0.4 0.3 0.3 1: 0.9 1.9 2.0 9.6 10.3 2.2 1.9 2: 0.7 1.6 1.8 8.9 10.0 2.0 1.8 3: 1.0 1.3 1.7 3.8 6.4 2.6 2.1 4: 1.1 2.2 2.2 10.1 11.4 2.7 2.2 5: 0.6 0.9 1.2 1.2 1.2 2.8 78.2 6: 0.6 1.2 1.7 1.6 1.8 4.2 1.7 7: 0.6 1.1 1.5 1.5 1.7 3.9 1.6 8: 1.0 1.7 2.5 7.5 9.6 3.1 2.3 9: 1.2 2.0 2.7 8.0 9.3 21.1 3.3 10: 1.5 2.4 2.5 9.8 10.8 2.8 2.5 11: 0.5 1.2 1.5 1.6 1.7 3.5 2.5 12: 0.7 1.3 1.6 1.7 1.8 3.0 1.8 13: 0.8 0.9 1.0 1.3 2.4 1.1 1.0 14: 0.8 0.9 1.0 1.3 2.5 1.1 1.0 15: 0.2 0.2 0.2 0.4 0.4 0.3 0.3 1: 0.9 1.9 2.0 9.6 10.3 2.2 2.0 2: 0.7 1.6 1.7 8.9 10.0 2.0 1.8 3: 1.0 1.3 1.7 3.8 6.4 2.6 2.1 4: 1.1 2.2 2.2 10.1 11.2 2.7 2.2 5: 0.6 0.9 1.2 1.2 1.3 2.8 77.3 6: 0.6 1.2 1.7 1.6 1.8 4.2 1.7 7: 0.6 1.1 1.5 1.5 1.7 3.9 1.6 8: 1.0 1.7 2.6 7.4 9.6 3.1 2.3 9: 1.2 2.0 2.7 8.0 9.3 21.1 3.3 10: 1.5 2.4 2.6 9.8 10.8 2.8 2.6 11: 0.5 1.2 1.5 1.6 1.7 3.5 2.5 12: 0.7 1.3 1.6 1.6 1.8 3.0 1.8 13: 0.8 0.9 1.0 1.3 2.5 1.1 1.0 14: 0.8 0.9 1.0 1.3 2.5 1.1 1.0 15: 0.2 0.2 0.2 0.4 0.4 0.3 0.3 LuaJIT 2.0.0-beta2/Cygwin1.7 1: 0.6 1.5 1.5 7.7 8.3 1.3 1.2 2: 0.4 1.2 1.2 7.1 7.8 1.1 1.0 3: 0.6 1.0 1.2 3.1 4.9 1.7 1.3 4: 0.8 1.6 1.8 8.4 9.0 1.9 1.3 5: 0.4 0.6 0.8 1.2 1.2 2.3 99.2 6: 0.4 0.9 1.1 1.5 1.5 3.2 0.9 7: 0.3 0.8 1.1 1.4 1.5 3.0 0.8 8: 0.6 1.2 1.6 5.3 6.8 1.7 1.3 9: 0.7 1.2 1.8 5.6 6.7 14.4 1.7 10: 0.9 1.6 1.7 7.6 8.3 1.5 1.5 11: 0.3 0.8 1.1 1.4 1.5 2.9 2.1 12: 0.4 0.9 1.1 1.5 1.5 2.7 0.9 13: 0.6 0.7 0.7 1.0 1.4 0.8 0.7 14: 0.6 0.7 0.7 1.0 1.4 0.8 0.8 15: 0.1 0.1 0.1 0.3 0.3 0.2 0.2 ...
trim5
的速度相對來說在資料集的表現不錯或具有競爭力,但是如果遇到僅有空白字元的長字串時,速度卻很差。「.*
」(而不是非貪婪的「.-
」)的使用會迅速處理到長字串的結尾,而接著的「%S
」會觸發回溯以避免尾隨的空白字元,但是「%s*
」和「.*
」的並置會產生大量的回溯(O(N^2)),如果找不到「%S
」相符項。trim6
特別處理最糟情況,並在整個資料集內表現良好。trim7
是將函式區域化的變異形式,其會提供較大的程式碼大小,不會大幅度地提升執行速度。(在進一步的測試中,每個區域化可能會在涉及空字串資料和內聯函式呼叫時,在最佳情況下提昇 10%,但是在此處針對 match
進行了兩次呼叫)。trim11
也是 trim6
的變異形式,具有類似的執行效能,或許稍好一點,但是如果遇到僅有空白字元的長字串,其速度會減半(這在 trim12
中已修正)。trim13
(或其變異形式 trim14
)是一種 lpeg (LuaPeg) 實作,其通常會達到或超越最佳模式配對實作的執行效能,特別是在有大量空白字元的情況下超越,但是處理空白字元與非空白字元交替出現的最糟情況時,其速度會慢一點。C 的實作(trim15
)在整個資料集中絕對是最快的。
如同上述,在所有測試反覆運算中重複使用相同的字串,可能無法適當地衡量因為 Lua 的字串內部儲存而導致暫時性字串產生的效應。快速測試 local ss = {}; for i = 1,80000 do ss[i] = " " .. (" "..i):rep(10) .. " " end
,其作為字串快取,貌似不會影響上述的基本結論。