字串 Trim

lua-users home
wiki

在 Lua 中實作「trim」函式的有許多方法 [1]

-- trim implementations

function trim1(s)
   return (s:gsub("^%s*(.-)%s*$", "%1"))
end
-- from PiL2 20.4

function trim2(s)
   return s:match "^%s*(.-)%s*$"
end
-- variant of trim1 (match)

function trim3(s)
   return s:gsub("^%s+", ""):gsub("%s+$", "")
end
-- two gsub's

function trim4(s)
   return s:match"^%s*(.*)":match"(.-)%s*$"
end
-- variant of trim3 (match)

function trim5(s)
   return s:match'^%s*(.*%S)' or ''
end
-- warning: has bad performance when s:match'^%s*$' and #s is large

function trim6(s)
   return s:match'^()%s*$' and '' or s:match'^%s*(.*%S)'
end
-- fixes performance problem in trim5.
-- note: the '()' avoids the overhead of default string capture.
-- This overhead is small, ~ 10% for successful whitespace match call
-- alone, and may not be noticeable in the overall benchmarks here,
-- but there's little harm either.  Instead replacing the first `match`
-- with a `find` has a similar effect, but that requires localizing
-- two functions in the trim7 variant below.

local match = string.match
function trim7(s)
   return match(s,'^()%s*$') and '' or match(s,'^%s*(.*%S)')
end
-- variant of trim6 (localize functions)

local find = string.find
local sub = string.sub
function trim8(s)
   local i1,i2 = find(s,'^%s*')
   if i2 >= i1 then
      s = sub(s,i2+1)
   end
   local i1,i2 = find(s,'%s*$')
   if i2 >= i1 then
      s = sub(s,1,i1-1)
   end
   return s
end
-- based on penlight 0.7.2

function trim9(s)
   local _, i1 = find(s,'^%s*')
   local i2 = find(s,'%s*$')
   return sub(s, i1 + 1, i2 - 1)
end
-- simplification of trim8

function trim10(s)
   local a = s:match('^%s*()')
   local b = s:match('()%s*$', a)
   return s:sub(a,b-1)
end
-- variant of trim9 (match)

function trim11(s)
   local n = s:find"%S"
   return n and s:match(".*%S", n) or ""
end
-- variant of trim6 (use n position)
-- https://lua-users.dev.org.tw/lists/lua-l/2009-12/msg00904.html

function trim12(s)
   local from = s:match"^%s*()"
   return from > #s and "" or s:match(".*%S", from)
end
-- variant of trim11 (performs better for all
-- whitespace string). See Roberto's comments
-- on ^%s*$" v.s. "%S" performance:
-- https://lua-users.dev.org.tw/lists/lua-l/2009-12/msg00921.html

do
   local lpeg = require("lpeg")
   local space = lpeg.S' \t\n\v\f\r'
   local nospace = 1 - space
   local ptrim = space^0 * lpeg.C((space^0 * nospace^1)^0)
   local match = lpeg.match
   function trim13(s)
      return match(ptrim, s)
   end
end
-- lpeg.  based on https://lua-users.dev.org.tw/lists/lua-l/2009-12/msg00921.html

do
   local lpeg = require("lpeg")
   local re = require("re")
   local ptrim = re.compile"%s* {(%s* %S+)*}"
   local match = lpeg.match
   function trim14(s)
      return match(ptrim, s)
   end
end
-- variant with re module.

require 'trim'
local trim15 = trim
-- C implementation (see separate trim.c file)


-- test utilities

local function trimtest(trim)
   assert(trim'' == '')
   assert(trim' ' == '')
   assert(trim'  ' == '')
   assert(trim'a' == 'a')
   assert(trim' a' == 'a')
   assert(trim'a ' == 'a')
   assert(trim' a ' == 'a')
   assert(trim'  a  ' == 'a')
   assert(trim'  ab cd  ' == 'ab cd')
   assert(trim' \t\r\n\f\va\000b \r\t\n\f\v' == 'a\000b')
end

local function perftest(f, s)
   local time = os.clock  -- os.time or os.clock
   local t1 = time()
   for i=1,100000 do
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
   end
   local dt = time() - t1
   io.stdout:write(("%4.1f"):format(dt) .. " ")
end

local trims = {trim1, trim2, trim3, trim4, trim5, trim6, trim7,
               trim8, trim9, trim10, trim11, trim12, trim13, trim14, trim15}

-- correctness tests
for _,trim in ipairs(trims) do
   trimtest(trim)
end

-- performance tests
for j = 1, 3 do
   for i,trim in ipairs(trims) do
      io.stdout:write(string.format("%2d",i) .. ": ")
      perftest(trim,  "")
      perftest(trim,  "abcdef")
      perftest(trim,  "   abcdef   ")
      perftest(trim,  "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef")
      perftest(trim,  "  a b c d e f g h i j k l m n o p q r s t u v w x y z A B C ")
      perftest(trim,  "                               a                            ")
      perftest(trim,  "                                                            ")
      print()
   end
end

針對 test15 的選擇性 C 模組,根據 LuaList:2009-12/msg00951.html 衍生而成

/* trim.c - based on https://lua-users.dev.org.tw/lists/lua-l/2009-12/msg00951.html
            from Sean Conner */
#include <stddef.h>
#include <ctype.h>
#include <lua.h>
#include <lauxlib.h>

int trim(lua_State* L) {
   const char* front;
   const char* end;
   size_t size;

   front = luaL_checklstring(L,1,&size);
   end = &front[size - 1];

   while (size && isspace((unsigned char)*front)) {
      size--;
      front++;
   }

   while (size && isspace((unsigned char)*end)) {
      size--;
      end--;
   }

   lua_pushlstring(L,front,(size_t)(end - front) + 1);
   return 1;
}

int luaopen_trim(lua_State* L) {
   lua_register(L, "trim", trim);
   return 0;
}

測試結果(數字越小表示越快)

Lua 5.1.4/Cygwin1.7
 1:  0.9  1.9  2.1  9.6 10.3  2.2  2.0
 2:  0.7  1.6  1.7  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.2  2.2  2.2 10.1 11.2  2.7  2.2
 5:  0.6  0.9  1.1  1.2  1.3  2.8 77.6
 6:  0.6  1.2  1.6  1.6  1.8  4.1  1.7
 7:  0.6  1.1  1.5  1.5  1.6  3.9  1.6
 8:  1.0  1.7  2.5  7.5  9.7  3.0  2.4
 9:  1.2  2.0  2.7  8.0  9.3 21.2  3.4
10:  1.5  2.4  2.6  9.8 10.8  2.8  2.6
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.7  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3
 1:  0.9  1.9  2.0  9.6 10.3  2.2  1.9
 2:  0.7  1.6  1.8  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.1  2.2  2.2 10.1 11.4  2.7  2.2
 5:  0.6  0.9  1.2  1.2  1.2  2.8 78.2
 6:  0.6  1.2  1.7  1.6  1.8  4.2  1.7
 7:  0.6  1.1  1.5  1.5  1.7  3.9  1.6
 8:  1.0  1.7  2.5  7.5  9.6  3.1  2.3
 9:  1.2  2.0  2.7  8.0  9.3 21.1  3.3
10:  1.5  2.4  2.5  9.8 10.8  2.8  2.5
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.7  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.4  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3
 1:  0.9  1.9  2.0  9.6 10.3  2.2  2.0
 2:  0.7  1.6  1.7  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.1  2.2  2.2 10.1 11.2  2.7  2.2
 5:  0.6  0.9  1.2  1.2  1.3  2.8 77.3
 6:  0.6  1.2  1.7  1.6  1.8  4.2  1.7
 7:  0.6  1.1  1.5  1.5  1.7  3.9  1.6
 8:  1.0  1.7  2.6  7.4  9.6  3.1  2.3
 9:  1.2  2.0  2.7  8.0  9.3 21.1  3.3
10:  1.5  2.4  2.6  9.8 10.8  2.8  2.6
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.6  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3

LuaJIT 2.0.0-beta2/Cygwin1.7
 1:  0.6  1.5  1.5  7.7  8.3  1.3  1.2
 2:  0.4  1.2  1.2  7.1  7.8  1.1  1.0
 3:  0.6  1.0  1.2  3.1  4.9  1.7  1.3
 4:  0.8  1.6  1.8  8.4  9.0  1.9  1.3
 5:  0.4  0.6  0.8  1.2  1.2  2.3 99.2
 6:  0.4  0.9  1.1  1.5  1.5  3.2  0.9
 7:  0.3  0.8  1.1  1.4  1.5  3.0  0.8
 8:  0.6  1.2  1.6  5.3  6.8  1.7  1.3
 9:  0.7  1.2  1.8  5.6  6.7 14.4  1.7
10:  0.9  1.6  1.7  7.6  8.3  1.5  1.5
11:  0.3  0.8  1.1  1.4  1.5  2.9  2.1
12:  0.4  0.9  1.1  1.5  1.5  2.7  0.9
13:  0.6  0.7  0.7  1.0  1.4  0.8  0.7
14:  0.6  0.7  0.7  1.0  1.4  0.8  0.8
15:  0.1  0.1  0.1  0.3  0.3  0.2  0.2
...

trim5 的速度相對來說在資料集的表現不錯或具有競爭力,但是如果遇到僅有空白字元的長字串時,速度卻很差。「.*」(而不是非貪婪的「.-」)的使用會迅速處理到長字串的結尾,而接著的「%S」會觸發回溯以避免尾隨的空白字元,但是「%s*」和「.*」的並置會產生大量的回溯(O(N^2)),如果找不到「%S」相符項。trim6 特別處理最糟情況,並在整個資料集內表現良好。trim7 是將函式區域化的變異形式,其會提供較大的程式碼大小,不會大幅度地提升執行速度。(在進一步的測試中,每個區域化可能會在涉及空字串資料和內聯函式呼叫時,在最佳情況下提昇 10%,但是在此處針對 match 進行了兩次呼叫)。trim11 也是 trim6 的變異形式,具有類似的執行效能,或許稍好一點,但是如果遇到僅有空白字元的長字串,其速度會減半(這在 trim12 中已修正)。trim13(或其變異形式 trim14)是一種 lpeg (LuaPeg) 實作,其通常會達到或超越最佳模式配對實作的執行效能,特別是在有大量空白字元的情況下超越,但是處理空白字元與非空白字元交替出現的最糟情況時,其速度會慢一點。C 的實作(trim15)在整個資料集中絕對是最快的。

如同上述,在所有測試反覆運算中重複使用相同的字串,可能無法適當地衡量因為 Lua 的字串內部儲存而導致暫時性字串產生的效應。快速測試 local ss = {}; for i = 1,80000 do ss[i] = " " .. (" "..i):rep(10) .. " " end,其作為字串快取,貌似不會影響上述的基本結論。

--DavidManura

另請參閱


RecentChanges · 偏好設定
編輯 · 歷程記錄
最後更新時間格林威治時間 2022 年 4 月 15 日上午 3:02 (diff)