diff --git a/druid/base/text.lua b/druid/base/text.lua index e6ccf07..1641626 100644 --- a/druid/base/text.lua +++ b/druid/base/text.lua @@ -64,6 +64,18 @@ local function update_text_area_size(self) end +-- calculate space width with font +local function get_space_width(self, font) + if not self._space_width[font] then + local no_space = gui.get_text_metrics(font, "1", 0, false, 0, 0).width + local with_space = gui.get_text_metrics(font, " 1", 0, false, 0, 0).width + self._space_width[font] = with_space - no_space + end + + return self._space_width[font] +end + + --- Component init function -- @function text:init -- @tparam node node Gui text node @@ -88,11 +100,34 @@ function M.init(self, node, value, no_adjust) self.on_set_pivot = Event() self.on_update_text_scale = Event() + self._space_width = {} + self:set_to(value or gui.get_text(self.node)) return self end +--- Calculate text width with font with respect to trailing space +-- @function text:get_text_width +-- @tparam[opt] string text +function M.get_text_width(self, text) + text = text or self.last_value + local font = gui.get_font(self.node) + local scale = gui.get_scale(self.node) + local result = gui.get_text_metrics(font, text, 0, false, 0, 0).width + for i = #text, 1, -1 do + local c = string.sub(text, i, i) + if c ~= ' ' then + break + end + + result = result + get_space_width(self, font) + end + + return result * scale.x +end + + --- Set text to text field -- @function text:set_to -- @tparam string set_to Text for node diff --git a/druid/system/utf8.lua b/druid/system/utf8.lua new file mode 100644 index 0000000..7f13914 --- /dev/null +++ b/druid/system/utf8.lua @@ -0,0 +1,1045 @@ +-- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $ +-- +-- Provides UTF-8 aware string functions implemented in pure lua: +-- * utf8len(s) +-- * utf8sub(s, i, j) +-- * utf8reverse(s) +-- * utf8char(unicode) +-- * utf8unicode(s, i, j) +-- * utf8gensub(s, sub_len) +-- * utf8find(str, regex, init, plain) +-- * utf8match(str, regex, init) +-- * utf8gmatch(str, regex, all) +-- * utf8gsub(str, regex, repl, limit) +-- +-- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these +-- additional functions are available: +-- * utf8upper(s) +-- * utf8lower(s) +-- +-- All functions behave as their non UTF-8 aware counterparts with the exception +-- that UTF-8 characters are used instead of bytes for all units. + +--[[ +Copyright (c) 2006-2007, Kyle Smith +All rights reserved. + +Contributors: + Alimov Stepan + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the author nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--]] + +-- ABNF from RFC 3629 +-- +-- UTF8-octets = *( UTF8-char ) +-- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 +-- UTF8-1 = %x00-7F +-- UTF8-2 = %xC2-DF UTF8-tail +-- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / +-- %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) +-- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / +-- %xF4 %x80-8F 2( UTF8-tail ) +-- UTF8-tail = %x80-BF +-- + +local byte = string.byte +local char = string.char +local dump = string.dump +local find = string.find +local format = string.format +local len = string.len +local lower = string.lower +local rep = string.rep +local sub = string.sub +local upper = string.upper + +-- returns the number of bytes used by the UTF-8 character at byte i in s +-- also doubles as a UTF-8 character validator +local function utf8charbytes (s, i) + -- argument defaults + i = i or 1 + + -- argument checking + if type(s) ~= "string" then + error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")") + end + if type(i) ~= "number" then + error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")") + end + + local c = byte(s, i) + + -- determine bytes needed for character, based on RFC 3629 + -- validate byte 1 + if c > 0 and c <= 127 then + -- UTF8-1 + return 1 + + elseif c >= 194 and c <= 223 then + -- UTF8-2 + local c2 = byte(s, i + 1) + + if not c2 then + error("UTF-8 string terminated early") + end + + -- validate byte 2 + if c2 < 128 or c2 > 191 then + error("Invalid UTF-8 character") + end + + return 2 + + elseif c >= 224 and c <= 239 then + -- UTF8-3 + local c2 = byte(s, i + 1) + local c3 = byte(s, i + 2) + + if not c2 or not c3 then + error("UTF-8 string terminated early") + end + + -- validate byte 2 + if c == 224 and (c2 < 160 or c2 > 191) then + error("Invalid UTF-8 character") + elseif c == 237 and (c2 < 128 or c2 > 159) then + error("Invalid UTF-8 character") + elseif c2 < 128 or c2 > 191 then + error("Invalid UTF-8 character") + end + + -- validate byte 3 + if c3 < 128 or c3 > 191 then + error("Invalid UTF-8 character") + end + + return 3 + + elseif c >= 240 and c <= 244 then + -- UTF8-4 + local c2 = byte(s, i + 1) + local c3 = byte(s, i + 2) + local c4 = byte(s, i + 3) + + if not c2 or not c3 or not c4 then + error("UTF-8 string terminated early") + end + + -- validate byte 2 + if c == 240 and (c2 < 144 or c2 > 191) then + error("Invalid UTF-8 character") + elseif c == 244 and (c2 < 128 or c2 > 143) then + error("Invalid UTF-8 character") + elseif c2 < 128 or c2 > 191 then + error("Invalid UTF-8 character") + end + + -- validate byte 3 + if c3 < 128 or c3 > 191 then + error("Invalid UTF-8 character") + end + + -- validate byte 4 + if c4 < 128 or c4 > 191 then + error("Invalid UTF-8 character") + end + + return 4 + + else + error("Invalid UTF-8 character") + end +end + +-- returns the number of characters in a UTF-8 string +local function utf8len (s) + -- argument checking + if type(s) ~= "string" then + for k,v in pairs(s) do print('"',tostring(k),'"',tostring(v),'"') end + error("bad argument #1 to 'utf8len' (string expected, got ".. type(s).. ")") + end + + local pos = 1 + local bytes = len(s) + local length = 0 + + while pos <= bytes do + length = length + 1 + pos = pos + utf8charbytes(s, pos) + end + + return length +end + +-- functions identically to string.sub except that i and j are UTF-8 characters +-- instead of bytes +local function utf8sub (s, i, j) + -- argument defaults + j = j or -1 + + local pos = 1 + local bytes = len(s) + local length = 0 + + -- only set l if i or j is negative + local l = (i >= 0 and j >= 0) or utf8len(s) + local startChar = (i >= 0) and i or l + i + 1 + local endChar = (j >= 0) and j or l + j + 1 + + -- can't have start before end! + if startChar > endChar then + return "" + end + + -- byte offsets to pass to string.sub + local startByte,endByte = 1,bytes + + while pos <= bytes do + length = length + 1 + + if length == startChar then + startByte = pos + end + + pos = pos + utf8charbytes(s, pos) + + if length == endChar then + endByte = pos - 1 + break + end + end + + if startChar > length then startByte = bytes+1 end + if endChar < 1 then endByte = 0 end + + return sub(s, startByte, endByte) +end + +--[[ +-- replace UTF-8 characters based on a mapping table +local function utf8replace (s, mapping) + -- argument checking + if type(s) ~= "string" then + error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")") + end + if type(mapping) ~= "table" then + error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")") + end + + local pos = 1 + local bytes = len(s) + local charbytes + local newstr = "" + + while pos <= bytes do + charbytes = utf8charbytes(s, pos) + local c = sub(s, pos, pos + charbytes - 1) + + newstr = newstr .. (mapping[c] or c) + + pos = pos + charbytes + end + + return newstr +end + + +-- identical to string.upper except it knows about unicode simple case conversions +local function utf8upper (s) + return utf8replace(s, utf8_lc_uc) +end + +-- identical to string.lower except it knows about unicode simple case conversions +local function utf8lower (s) + return utf8replace(s, utf8_uc_lc) +end +]] + +-- identical to string.reverse except that it supports UTF-8 +local function utf8reverse (s) + -- argument checking + if type(s) ~= "string" then + error("bad argument #1 to 'utf8reverse' (string expected, got ".. type(s).. ")") + end + + local bytes = len(s) + local pos = bytes + local charbytes + local newstr = "" + + while pos > 0 do + local c = byte(s, pos) + while c >= 128 and c <= 191 do + pos = pos - 1 + c = byte(s, pos) + end + + charbytes = utf8charbytes(s, pos) + + newstr = newstr .. sub(s, pos, pos + charbytes - 1) + + pos = pos - 1 + end + + return newstr +end + +-- http://en.wikipedia.org/wiki/Utf8 +-- http://developer.coronalabs.com/code/utf-8-conversion-utility +local function utf8char(unicode) + if unicode <= 0x7F then return char(unicode) end + + if (unicode <= 0x7FF) then + local Byte0 = 0xC0 + math.floor(unicode / 0x40); + local Byte1 = 0x80 + (unicode % 0x40); + return char(Byte0, Byte1); + end; + + if (unicode <= 0xFFFF) then + local Byte0 = 0xE0 + math.floor(unicode / 0x1000); + local Byte1 = 0x80 + (math.floor(unicode / 0x40) % 0x40); + local Byte2 = 0x80 + (unicode % 0x40); + return char(Byte0, Byte1, Byte2); + end; + + if (unicode <= 0x10FFFF) then + local code = unicode + local Byte3= 0x80 + (code % 0x40); + code = math.floor(code / 0x40) + local Byte2= 0x80 + (code % 0x40); + code = math.floor(code / 0x40) + local Byte1= 0x80 + (code % 0x40); + code = math.floor(code / 0x40) + local Byte0= 0xF0 + code; + + return char(Byte0, Byte1, Byte2, Byte3); + end; + + error 'Unicode cannot be greater than U+10FFFF!' +end + +local shift_6 = 2^6 +local shift_12 = 2^12 +local shift_18 = 2^18 + +local utf8unicode +utf8unicode = function(str, i, j, byte_pos) + i = i or 1 + j = j or i + + if i > j then return end + + local ch,bytes + + if byte_pos then + bytes = utf8charbytes(str,byte_pos) + ch = sub(str,byte_pos,byte_pos-1+bytes) + else + ch,byte_pos = utf8sub(str,i,i), 0 + bytes = #ch + end + + local unicode + + if bytes == 1 then unicode = byte(ch) end + if bytes == 2 then + local byte0,byte1 = byte(ch,1,2) + local code0,code1 = byte0-0xC0,byte1-0x80 + unicode = code0*shift_6 + code1 + end + if bytes == 3 then + local byte0,byte1,byte2 = byte(ch,1,3) + local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80 + unicode = code0*shift_12 + code1*shift_6 + code2 + end + if bytes == 4 then + local byte0,byte1,byte2,byte3 = byte(ch,1,4) + local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80 + unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3 + end + + return unicode,utf8unicode(str, i+1, j, byte_pos+bytes) +end + +-- Returns an iterator which returns the next substring and its byte interval +local function utf8gensub(str, sub_len) + sub_len = sub_len or 1 + local byte_pos = 1 + local length = #str + return function(skip) + if skip then byte_pos = byte_pos + skip end + local char_count = 0 + local start = byte_pos + repeat + if byte_pos > length then return end + char_count = char_count + 1 + local bytes = utf8charbytes(str,byte_pos) + byte_pos = byte_pos+bytes + + until char_count == sub_len + + local last = byte_pos-1 + local slice = sub(str,start,last) + return slice, start, last + end +end + +local function binsearch(sortedTable, item, comp) + local head, tail = 1, #sortedTable + local mid = math.floor((head + tail)/2) + if not comp then + while (tail - head) > 1 do + if sortedTable[tonumber(mid)] > item then + tail = mid + else + head = mid + end + mid = math.floor((head + tail)/2) + end + end + if sortedTable[tonumber(head)] == item then + return true, tonumber(head) + elseif sortedTable[tonumber(tail)] == item then + return true, tonumber(tail) + else + return false + end +end +local function classMatchGenerator(class, plain) + local codes = {} + local ranges = {} + local ignore = false + local range = false + local firstletter = true + local unmatch = false + + local it = utf8gensub(class) + + local skip + for c, _, be in it do + skip = be + if not ignore and not plain then + if c == "%" then + ignore = true + elseif c == "-" then + table.insert(codes, utf8unicode(c)) + range = true + elseif c == "^" then + if not firstletter then + error('!!!') + else + unmatch = true + end + elseif c == ']' then + break + else + if not range then + table.insert(codes, utf8unicode(c)) + else + table.remove(codes) -- removing '-' + table.insert(ranges, {table.remove(codes), utf8unicode(c)}) + range = false + end + end + elseif ignore and not plain then + if c == 'a' then -- %a: represents all letters. (ONLY ASCII) + table.insert(ranges, {65, 90}) -- A - Z + table.insert(ranges, {97, 122}) -- a - z + elseif c == 'c' then -- %c: represents all control characters. + table.insert(ranges, {0, 31}) + table.insert(codes, 127) + elseif c == 'd' then -- %d: represents all digits. + table.insert(ranges, {48, 57}) -- 0 - 9 + elseif c == 'g' then -- %g: represents all printable characters except space. + table.insert(ranges, {1, 8}) + table.insert(ranges, {14, 31}) + table.insert(ranges, {33, 132}) + table.insert(ranges, {134, 159}) + table.insert(ranges, {161, 5759}) + table.insert(ranges, {5761, 8191}) + table.insert(ranges, {8203, 8231}) + table.insert(ranges, {8234, 8238}) + table.insert(ranges, {8240, 8286}) + table.insert(ranges, {8288, 12287}) + elseif c == 'l' then -- %l: represents all lowercase letters. (ONLY ASCII) + table.insert(ranges, {97, 122}) -- a - z + elseif c == 'p' then -- %p: represents all punctuation characters. (ONLY ASCII) + table.insert(ranges, {33, 47}) + table.insert(ranges, {58, 64}) + table.insert(ranges, {91, 96}) + table.insert(ranges, {123, 126}) + elseif c == 's' then -- %s: represents all space characters. + table.insert(ranges, {9, 13}) + table.insert(codes, 32) + table.insert(codes, 133) + table.insert(codes, 160) + table.insert(codes, 5760) + table.insert(ranges, {8192, 8202}) + table.insert(codes, 8232) + table.insert(codes, 8233) + table.insert(codes, 8239) + table.insert(codes, 8287) + table.insert(codes, 12288) + elseif c == 'u' then -- %u: represents all uppercase letters. (ONLY ASCII) + table.insert(ranges, {65, 90}) -- A - Z + elseif c == 'w' then -- %w: represents all alphanumeric characters. (ONLY ASCII) + table.insert(ranges, {48, 57}) -- 0 - 9 + table.insert(ranges, {65, 90}) -- A - Z + table.insert(ranges, {97, 122}) -- a - z + elseif c == 'x' then -- %x: represents all hexadecimal digits. + table.insert(ranges, {48, 57}) -- 0 - 9 + table.insert(ranges, {65, 70}) -- A - F + table.insert(ranges, {97, 102}) -- a - f + else + if not range then + table.insert(codes, utf8unicode(c)) + else + table.remove(codes) -- removing '-' + table.insert(ranges, {table.remove(codes), utf8unicode(c)}) + range = false + end + end + ignore = false + else + if not range then + table.insert(codes, utf8unicode(c)) + else + table.remove(codes) -- removing '-' + table.insert(ranges, {table.remove(codes), utf8unicode(c)}) + range = false + end + ignore = false + end + + firstletter = false + end + + table.sort(codes) + + local function inRanges(charCode) + for _,r in ipairs(ranges) do + if r[1] <= charCode and charCode <= r[2] then + return true + end + end + return false + end + if not unmatch then + return function(charCode) + return binsearch(codes, charCode) or inRanges(charCode) + end, skip + else + return function(charCode) + return charCode ~= -1 and not (binsearch(codes, charCode) or inRanges(charCode)) + end, skip + end +end + +--[[ +-- utf8sub with extra argument, and extra result value +local function utf8subWithBytes (s, i, j, sb) + -- argument defaults + j = j or -1 + + local pos = sb or 1 + local bytes = len(s) + local length = 0 + + -- only set l if i or j is negative + local l = (i >= 0 and j >= 0) or utf8len(s) + local startChar = (i >= 0) and i or l + i + 1 + local endChar = (j >= 0) and j or l + j + 1 + + -- can't have start before end! + if startChar > endChar then + return "" + end + + -- byte offsets to pass to string.sub + local startByte,endByte = 1,bytes + + while pos <= bytes do + length = length + 1 + + if length == startChar then + startByte = pos + end + + pos = pos + utf8charbytes(s, pos) + + if length == endChar then + endByte = pos - 1 + break + end + end + + if startChar > length then startByte = bytes+1 end + if endChar < 1 then endByte = 0 end + + return sub(s, startByte, endByte), endByte + 1 +end +]] + +local cache = setmetatable({},{ + __mode = 'kv' +}) +local cachePlain = setmetatable({},{ + __mode = 'kv' +}) +local function matcherGenerator(regex, plain) + local matcher = { + functions = {}, + captures = {} + } + if not plain then + cache[regex] = matcher + else + cachePlain[regex] = matcher + end + local function simple(func) + return function(cC) + if func(cC) then + matcher:nextFunc() + matcher:nextStr() + else + matcher:reset() + end + end + end + local function star(func) + return function(cC) + if func(cC) then + matcher:fullResetOnNextFunc() + matcher:nextStr() + else + matcher:nextFunc() + end + end + end + local function minus(func) + return function(cC) + if func(cC) then + matcher:fullResetOnNextStr() + end + matcher:nextFunc() + end + end + local function question(func) + return function(cC) + if func(cC) then + matcher:fullResetOnNextFunc() + matcher:nextStr() + end + matcher:nextFunc() + end + end + + local function capture(id) + return function(_) + local l = matcher.captures[id][2] - matcher.captures[id][1] + local captured = utf8sub(matcher.string, matcher.captures[id][1], matcher.captures[id][2]) + local check = utf8sub(matcher.string, matcher.str, matcher.str + l) + if captured == check then + for _ = 0, l do + matcher:nextStr() + end + matcher:nextFunc() + else + matcher:reset() + end + end + end + local function captureStart(id) + return function(_) + matcher.captures[id][1] = matcher.str + matcher:nextFunc() + end + end + local function captureStop(id) + return function(_) + matcher.captures[id][2] = matcher.str - 1 + matcher:nextFunc() + end + end + + local function balancer(str) + local sum = 0 + local bc, ec = utf8sub(str, 1, 1), utf8sub(str, 2, 2) + local skip = len(bc) + len(ec) + bc, ec = utf8unicode(bc), utf8unicode(ec) + return function(cC) + if cC == ec and sum > 0 then + sum = sum - 1 + if sum == 0 then + matcher:nextFunc() + end + matcher:nextStr() + elseif cC == bc then + sum = sum + 1 + matcher:nextStr() + else + if sum == 0 or cC == -1 then + sum = 0 + matcher:reset() + else + matcher:nextStr() + end + end + end, skip + end + + matcher.functions[1] = function(_) + matcher:fullResetOnNextStr() + matcher.seqStart = matcher.str + matcher:nextFunc() + if (matcher.str > matcher.startStr and matcher.fromStart) or matcher.str >= matcher.stringLen then + matcher.stop = true + matcher.seqStart = nil + end + end + + local lastFunc + local ignore = false + local skip = nil + local it = (function() + local gen = utf8gensub(regex) + return function() + return gen(skip) + end + end)() + local cs = {} + for c, bs, be in it do + skip = nil + if plain then + table.insert(matcher.functions, simple(classMatchGenerator(c, plain))) + else + if ignore then + if find('123456789', c, 1, true) then + if lastFunc then + table.insert(matcher.functions, simple(lastFunc)) + lastFunc = nil + end + table.insert(matcher.functions, capture(tonumber(c))) + elseif c == 'b' then + if lastFunc then + table.insert(matcher.functions, simple(lastFunc)) + lastFunc = nil + end + local b + b, skip = balancer(sub(regex, be + 1, be + 9)) + table.insert(matcher.functions, b) + else + lastFunc = classMatchGenerator('%' .. c) + end + ignore = false + else + if c == '*' then + if lastFunc then + table.insert(matcher.functions, star(lastFunc)) + lastFunc = nil + else + error('invalid regex after ' .. sub(regex, 1, bs)) + end + elseif c == '+' then + if lastFunc then + table.insert(matcher.functions, simple(lastFunc)) + table.insert(matcher.functions, star(lastFunc)) + lastFunc = nil + else + error('invalid regex after ' .. sub(regex, 1, bs)) + end + elseif c == '-' then + if lastFunc then + table.insert(matcher.functions, minus(lastFunc)) + lastFunc = nil + else + error('invalid regex after ' .. sub(regex, 1, bs)) + end + elseif c == '?' then + if lastFunc then + table.insert(matcher.functions, question(lastFunc)) + lastFunc = nil + else + error('invalid regex after ' .. sub(regex, 1, bs)) + end + elseif c == '^' then + if bs == 1 then + matcher.fromStart = true + else + error('invalid regex after ' .. sub(regex, 1, bs)) + end + elseif c == '$' then + if be == len(regex) then + matcher.toEnd = true + else + error('invalid regex after ' .. sub(regex, 1, bs)) + end + elseif c == '[' then + if lastFunc then + table.insert(matcher.functions, simple(lastFunc)) + end + lastFunc, skip = classMatchGenerator(sub(regex, be + 1)) + elseif c == '(' then + if lastFunc then + table.insert(matcher.functions, simple(lastFunc)) + lastFunc = nil + end + table.insert(matcher.captures, {}) + table.insert(cs, #matcher.captures) + table.insert(matcher.functions, captureStart(cs[#cs])) + if sub(regex, be + 1, be + 1) == ')' then matcher.captures[#matcher.captures].empty = true end + elseif c == ')' then + if lastFunc then + table.insert(matcher.functions, simple(lastFunc)) + lastFunc = nil + end + local cap = table.remove(cs) + if not cap then + error('invalid capture: "(" missing') + end + table.insert(matcher.functions, captureStop(cap)) + elseif c == '.' then + if lastFunc then + table.insert(matcher.functions, simple(lastFunc)) + end + lastFunc = function(cC) return cC ~= -1 end + elseif c == '%' then + ignore = true + else + if lastFunc then + table.insert(matcher.functions, simple(lastFunc)) + end + lastFunc = classMatchGenerator(c) + end + end + end + end + if #cs > 0 then + error('invalid capture: ")" missing') + end + if lastFunc then + table.insert(matcher.functions, simple(lastFunc)) + end + + table.insert(matcher.functions, function() + if matcher.toEnd and matcher.str ~= matcher.stringLen then + matcher:reset() + else + matcher.stop = true + end + end) + + matcher.nextFunc = function(self) + self.func = self.func + 1 + end + matcher.nextStr = function(self) + self.str = self.str + 1 + end + matcher.strReset = function(self) + local oldReset = self.reset + local str = self.str + self.reset = function(s) + s.str = str + s.reset = oldReset + end + end + matcher.fullResetOnNextFunc = function(self) + local oldReset = self.reset + local func = self.func +1 + local str = self.str + self.reset = function(s) + s.func = func + s.str = str + s.reset = oldReset + end + end + matcher.fullResetOnNextStr = function(self) + local oldReset = self.reset + local str = self.str + 1 + local func = self.func + self.reset = function(s) + s.func = func + s.str = str + s.reset = oldReset + end + end + + matcher.process = function(self, str, start) + + self.func = 1 + start = start or 1 + self.startStr = (start >= 0) and start or utf8len(str) + start + 1 + self.seqStart = self.startStr + self.str = self.startStr + self.stringLen = utf8len(str) + 1 + self.string = str + self.stop = false + + self.reset = function(s) + s.func = 1 + end + + -- local lastPos = self.str + -- local lastByte + local ch + while not self.stop do + if self.str < self.stringLen then + --[[ if lastPos < self.str then + print('last byte', lastByte) + ch, lastByte = utf8subWithBytes(str, 1, self.str - lastPos - 1, lastByte) + ch, lastByte = utf8subWithBytes(str, 1, 1, lastByte) + lastByte = lastByte - 1 + else + ch, lastByte = utf8subWithBytes(str, self.str, self.str) + end + lastPos = self.str ]] + ch = utf8sub(str, self.str,self.str) + --print('char', ch, utf8unicode(ch)) + self.functions[self.func](utf8unicode(ch)) + else + self.functions[self.func](-1) + end + end + + if self.seqStart then + local captures = {} + for _,pair in pairs(self.captures) do + if pair.empty then + table.insert(captures, pair[1]) + else + table.insert(captures, utf8sub(str, pair[1], pair[2])) + end + end + return self.seqStart, self.str - 1, unpack(captures) + end + end + + return matcher +end + +-- string.find +local function utf8find(str, regex, init, plain) + local matcher = cache[regex] or matcherGenerator(regex, plain) + return matcher:process(str, init) +end + +-- string.match +local function utf8match(str, regex, init) + init = init or 1 + local found = {utf8find(str, regex, init)} + if found[1] then + if found[3] then + return unpack(found, 3) + end + return utf8sub(str, found[1], found[2]) + end +end + +-- string.gmatch +local function utf8gmatch(str, regex, all) + regex = (utf8sub(regex,1,1) ~= '^') and regex or '%' .. regex + local lastChar = 1 + return function() + local found = {utf8find(str, regex, lastChar)} + if found[1] then + lastChar = found[2] + 1 + if found[all and 1 or 3] then + return unpack(found, all and 1 or 3) + end + return utf8sub(str, found[1], found[2]) + end + end +end + +local function replace(repl, args) + local ret = '' + if type(repl) == 'string' then + local ignore = false + local num + for c in utf8gensub(repl) do + if not ignore then + if c == '%' then + ignore = true + else + ret = ret .. c + end + else + num = tonumber(c) + if num then + ret = ret .. args[num] + else + ret = ret .. c + end + ignore = false + end + end + elseif type(repl) == 'table' then + ret = repl[args[1] or args[0]] or '' + elseif type(repl) == 'function' then + if #args > 0 then + ret = repl(unpack(args, 1)) or '' + else + ret = repl(args[0]) or '' + end + end + return ret +end +-- string.gsub +local function utf8gsub(str, regex, repl, limit) + limit = limit or -1 + local ret = '' + local prevEnd = 1 + local it = utf8gmatch(str, regex, true) + local found = {it()} + local n = 0 + while #found > 0 and limit ~= n do + local args = {[0] = utf8sub(str, found[1], found[2]), unpack(found, 3)} + ret = ret .. utf8sub(str, prevEnd, found[1] - 1) + .. replace(repl, args) + prevEnd = found[2] + 1 + n = n + 1 + found = {it()} + end + return ret .. utf8sub(str, prevEnd), n +end + +local utf8 = {} +utf8.len = utf8len +utf8.sub = utf8sub +utf8.reverse = utf8reverse +utf8.char = utf8char +utf8.unicode = utf8unicode +utf8.gensub = utf8gensub +utf8.byte = utf8unicode +utf8.find = utf8find +utf8.match = utf8match +utf8.gmatch = utf8gmatch +utf8.gsub = utf8gsub +utf8.dump = dump +utf8.format = format +utf8.lower = lower +utf8.upper = upper +utf8.rep = rep +return utf8