lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


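That code converts the ASCII value[1] of a digit character into the equivalent decimal digit by subtracting the code of the first ASCII digit (0x30 == '0'). Before the new digit is added, the running total n is shifted left one decimal place (multiplied by 10), so the new digit lands in the ones place. For example, parsing "42" gives n = 4, then n = 10 * 4 + 2 = 42.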


[1] https://duckduckgo.com/?q=ascii+table

On Tue, Jun 2, 2015 at 7:52 AM, Lionel Duboeuf <lionel.duboeuf@gmail.com> wrote:
Great, I will test it, thanks! I'm not familiar with byte decoding. Do you know of any resource that deals with byte manipulation, so that I can understand this kind of line:  "n = 10 * n + (c1 - 0x30)"

lionel



2015-06-02 16:22 GMT+02:00 Shunsuke Shimizu <grafi@grafi.jp>:
Sorry, the line `n = tostring(n)` does nothing. This line is intended
to be `n = tonumber(n)` (and tonumber should also be cached as a local
variable). This change has little performance effect.

Interestingly, erasing the line `n = tostring(n)` makes the string.find
version faster than the string.byte version (it still works because Lua
coerces the captured string to a number in `pos + n`). If you do not need
error handling, this seems to be the best way (even though the string.byte
version wins when nbRows = 99 instead of 999).

On 06/02/2015 10:46 PM, Shunsuke Shimizu wrote:
> If you can permit tricky code, you can achieve a little better speed by
> parsing numbers manually using string.byte(). Parsing of large numbers
> can be slower this way, but I suppose this effect is negligible
> since the cost of creating a substring is large when a number is large.
>
> Following is benchmark code, decoding data 1000 times. The length of
> strings within the data is between 1 and 999.
>
> The result of the benchmark on my machine with Lua 5.1.5 is
>   tonumber version:     about 3.5 sec
>   string.byte version:  about 3.1 sec.
>
> If you make strings shorter, string.byte version performs better.
>
> ----
> local tostring, byte, find, sub = tostring, string.byte, string.find,
> string.sub
> local times = 1000
>
> local nbRows = 999
> local cols = { "col1", "col2", "col3", "col4" }
>
> local data = ""
> local as = ""
> for i = 1, nbRows do
>       for j = 1, #cols do
>               data = data .. "<" .. tostring(i) .. " " .. as .. tostring(j) .. "/> "
>       end
>       as = as .. "a"
> end
>
> local t1 = os.clock()
> for t = 1, times do
>       local pos, rs = 0, {}
>       for i = 1, nbRows do
>               local row = {}
>
>               local _, n
>               for j = 1, #cols do
>                       _, pos, n = find(data, "<(%d+)%s", pos)
>                       n = tostring(n)
>                       local endpos = pos + n
>                       row[cols[j]] = sub(data, pos + 1, endpos)
>                       pos = endpos + 1
>               end
>
>               rs[i] = row
>       end
> end
>
> local t2 = os.clock()
> for t = 1, times do
>       local pos, rs = 0, {}
>       for i = 1, nbRows do
>               local row = {}
>
>               for j = 1, #cols do
>                       pos = find(data, "<", pos, true)
>                       local n, c1, c2, c3 = byte(data, pos + 1, pos + 4)
>                       n = n - 0x30
>                       if n < 1 or 9 < n then
>                               error()
>                       end
>                       while true do
>                               if c1 < 0x30 or 0x3A <= c1 then
>                                       if c1 == 0x20 then
>                                               pos = pos + 3
>                                               break
>                                       else
>                                               error()
>                                       end
>                               end
>                               n = 10 * n + (c1 - 0x30)
>                               if c2 < 0x30 or 0x3A <= c2 then
>                                       if c2 == 0x20 then
>                                               pos = pos + 4
>                                               break
>                                       else
>                                               error()
>                                       end
>                               end
>                               n = 10 * n + (c2 - 0x30)
>                               if c3 < 0x30 or 0x3A <= c3 then
>                                       if c3 == 0x20 then
>                                               pos = pos + 5
>                                               break
>                                       else
>                                               error()
>                                       end
>                               end
>                               n = 10 * n + (c3 - 0x30)
>                               pos = pos + 3
>                               c1, c2, c3 = byte(data, pos + 2, pos + 4)
>                       end
>
>                       local newpos = pos + n
>                       row[cols[j]] = sub(data, pos, newpos - 1)
>                       pos = newpos
>               end
>
>               rs[i] = row
>       end
> end
>
> local t3 = os.clock()
> print(t2 - t1, t3 - t2)
>
>
> On 05/30/2015 03:25 AM, Lionel Duboeuf wrote:
>> hello you all,
>>
>> Just in case I'm not doing it efficiently, and to learn best practices:
>> I have a character stream that is formatted like this one:
>>
>> ...<6 orange/> <2 20/> <1 1/> <2 20/> <5 false/> <1 0/> <16 orange
>> mechanics/> <2 25/>...
>>
>> which corresponds to a row/column format like this
>> t = {
>>     {  "col1" = "orange" ,  "col2" = 20  },
>>     {  "col1" = 1 ,  "col2" = 20  },
>>     {  "col1" = false ,  "col2" = 0  },
>>     {  "col1" = "orange mechanics" ,  "col2" = 25  },
>> ...
>> }
>>
>>
>>
>> to do so, i parse it like this:
>>
>> pos = current position of the stream
>>
>> local rs = { }
>> local sNbByte, nbByte, val, _
>> local nbRows = 4
>> cols = { "col1","col2" }
>> for i = 1,  nbRows do
>>
>>       local row = {}
>>
>>       for j = 1, #cols do
>>
>>         _, pos, sNbByte = string.find(data, "<(%d+)%s",pos)
>>         nbByte = tonumber(sNbByte)
>>
>>         if (nbByte > 0) then
>>           val = string.sub(data, pos, pos + nbByte)
>>           pos = pos + nbByte
>>         end
>>
>>         pos = pos + 1 --just after value
>>
>>         row[cols[j]] = val
>>
>>       end
>>
>>     rs[i] = row
>>   end
>>
>>
>>
>> I did some benchmarks, and found using gmatch and iterating through
>> captures more efficient, but it is not usable when we need to specify a
>> starting offset position (like string.find) and I don't want to split my
>> string to avoid copies.
>>
>> Any advice will be much appreciated.
>>
>> thanks
>>
>> lionel
>>
>>
>





--
Brigham Toskin