Read Defined Chunks |
|
io.stdin
.
It's a complete rewrite of a prior version: now it's fast, doesn't concatenate strings unnecessarily, needs less memory and is flexible.
The purpose is to process a) files that are many megabytes in size and b) mixed-format input, e.g. MIME multipart messages, which are a mix of \r\n-terminated lines and binary data.
Please note a simple enhancement that is not part of standard Lua: I use the number variable lua.maxread
to set the chunk size used for the file reads (chunks of bytes) at a central point. Replace this variable with your preferred chunk size, e.g. 2^13 for 8 KB.
-- a simple example without using all the specials
-- Reads 'File' line by line. coroutine.resume() returns the status flag
-- first and the yielded values after it, so the line is the SECOND result.
-- The loop ends when the coroutine finishes (no line returned) or reports
-- an error (Ok == false).
local Handle = io.open('File','r')
local ReadUntil = io.readuntil(Handle)
local Ok, Line
repeat
  Ok, Line = coroutine.resume(ReadUntil,'\n',true)
  if Ok and Line then
    -- process Line here
  end
until not Ok or Line == nil
-- another example
-- Scans 'File' for a first marker (streaming, Collect = false) and, once it
-- is found, collects everything up to a second marker (Collect = true).
local Handle = io.open('File', 'r')
local ReadUntil = io.readuntil(Handle)
local Chunk, Found
while true do
  -- skip forward until the first marker shows up
  _, Chunk, Found = coroutine.resume(ReadUntil,
    'search this string in a huge file', false)
  if Found then
    -- gather the text up to the second marker in one piece
    _, Chunk, Found = coroutine.resume(ReadUntil,
      'search another string in the same file', true)
    if Found then
      break
    end
  end
  -- nothing more was yielded: stop
  if Chunk == nil then
    break
  end
end
-- Now if chunk ~= nil, then chunk is the stuff between
-- 'search this string in a huge file' and 'search another
-- string in the same file'. Yes, it's possible to do the same
-- very simply, but the advantage here is that the large file
-- isn't loaded at once into memory.
Code:
function io.readuntil(Filehandle, Delimiter, Collect, Limit)
  -- Creates a coroutine that reads <Filehandle> in chunks and yields the
  -- text found between occurrences of a (plain, non-pattern) delimiter.
  --
  -- Parameters
  --   Filehandle  object with a :read(n) method (e.g. from io.open)
  --   Delimiter   (string, optional) max. length is the chunk size;
  --               optional here because coroutine.resume() also accepts it
  --   Collect     (boolean, optional, default true)
  --               = true   buffer until <Delimiter>, end of file or <Limit>
  --                        is reached and return the text in one piece
  --               = false  also return partial text before <Delimiter> is
  --                        found (one piece per chunk read)
  --   Limit       (number, optional) maximum number of bytes to read from
  --               <Filehandle>; default is unlimited
  --   The optional arguments may be shifted: a boolean in the <Delimiter>
  --   slot is taken as <Collect>, a number in the <Delimiter> or <Collect>
  --   slot is taken as <Limit>.
  --
  -- Returns (thread) a coroutine. Drive it with
  --   Ok, Text, Found = coroutine.resume(Thread, Delimiter, Collect)
  --     Delimiter, Collect  optional; replace the current settings for the
  --                         following reads (a lone boolean is <Collect>);
  --                         nil keeps the current value
  --     Ok     = true; no error
  --            = false; an error occurred and <Text> is the error message
  --     Text   (string) the text read, or nil at the end of the input
  --     Found  = true; <Text> ends directly before a delimiter
  --            = false; delimiter not found
  --   Note: if Ok, Text, Found = true, <string>, false then
  --     with Collect = false it does not have to be the end of the file,
  --     with Collect = true the end of the file (or <Limit>) is reached and
  --     the next coroutine.resume() returns true, nil.
  --
  -- The chunk size is taken from the non-standard global lua.maxread; when
  -- that is not set, 8192 bytes are used.

  -- sort out shifted optional arguments
  if type(Delimiter) == 'boolean' then Collect, Delimiter = Delimiter, Collect end
  if type(Delimiter) == 'number' then Limit, Delimiter = Delimiter, nil end
  if type(Collect) == 'number' then Limit, Collect = Collect, nil end
  -- apply the documented default
  if Collect == nil then Collect = true end

  return coroutine.create(function(NewDelimiter, NewCollect)
    local Chunksize = (type(lua) == 'table' and lua.maxread) or 8192
    local Length = 0            -- bytes read from Filehandle so far

    -- Takes over the settings handed to coroutine.resume(). Only values
    -- that were actually passed replace the current ones, so an explicit
    -- `false` for Collect is honoured.
    local function Next(NewDelimiter, NewCollect)
      if type(NewDelimiter) == 'boolean' then
        NewCollect, NewDelimiter = NewDelimiter, nil
      end
      if NewDelimiter ~= nil then Delimiter = NewDelimiter end
      if NewCollect ~= nil then Collect = NewCollect end
    end

    -- Rejects delimiters the search below cannot handle.
    local function CheckDelimiter()
      if Delimiter == nil or Delimiter == '' then
        -- an empty delimiter would match everywhere and never advance
        error('io.readuntil: delimiter must be a non-empty string')
      end
      if string.len(Delimiter) > Chunksize then
        error('io.readuntil: delimiter to long')
      end
    end

    -- Reads the next chunk, never exceeding Limit; returns nil when the
    -- input or the limit is exhausted.
    local function ReadChunk()
      local Want = Chunksize
      if Limit and Length + Chunksize > Limit then
        Want = Limit - Length
        if Want <= 0 then return nil end
      end
      local Data = Filehandle:read(Want)
      if Data then Length = Length + string.len(Data) end
      return Data
    end

    Next(NewDelimiter, NewCollect)

    -- Chunk[First] is the buffer being searched, Chunk[Second] the chunk
    -- read ahead. GetFrom marks the first byte not yet yielded, SearchFrom
    -- the first byte not yet searched.
    local Chunk, First, Second = {}, 1, 2
    local SearchFrom, GetFrom = 1, 1

    Chunk[First] = ReadChunk()
    if not Chunk[First] then return end

    while true do
      CheckDelimiter()
      local FoundFrom, FoundTo =
        string.find(Chunk[First], Delimiter, SearchFrom, true)
      if FoundFrom then
        -- delimiter found in first chunk
        Next(coroutine.yield(
          string.sub(Chunk[First], GetFrom, FoundFrom - 1), true))
        SearchFrom, GetFrom = FoundTo + 1, FoundTo + 1
      else
        Chunk[Second] = ReadChunk()
        if not Chunk[Second] then
          -- no delimiter found and no further input
          break
        end
        -- Join the end of the first chunk with the start of the second one
        -- so that a delimiter split across the boundary is found. The tail
        -- never starts before SearchFrom: bytes in front of it belong to
        -- text or a delimiter that has already been consumed.
        local DelimLen = string.len(Delimiter)
        local TailFrom = string.len(Chunk[First]) - DelimLen + 2
        if TailFrom < SearchFrom then TailFrom = SearchFrom end
        local Tail = string.sub(Chunk[First], TailFrom)
        FoundFrom, FoundTo = string.find(
          Tail .. string.sub(Chunk[Second], 1, DelimLen - 1),
          Delimiter, 1, true)
        if FoundFrom then
          -- delimiter is split between first and second chunk;
          -- it ends at byte FoundTo - #Tail of the second chunk
          local ResumeAt = FoundTo - string.len(Tail) + 1
          Next(coroutine.yield(
            string.sub(Chunk[First], GetFrom, TailFrom + FoundFrom - 2),
            true))
          First, Second = Second, First
          SearchFrom, GetFrom = ResumeAt, ResumeAt
        elseif Collect then
          -- keep only the text not yet yielded and append the new chunk;
          -- everything before the new chunk has been searched already
          local Rest = string.sub(Chunk[First], GetFrom)
          SearchFrom = string.len(Rest) + 1
          GetFrom = 1
          Chunk[First] = Rest .. Chunk[Second]
        else
          -- hand out what is left of the first chunk, continue with second
          if string.len(Chunk[First]) >= GetFrom then
            Next(coroutine.yield(string.sub(Chunk[First], GetFrom), false))
          end
          First, Second = Second, First
          SearchFrom, GetFrom = 1, 1
        end
      end
    end

    if string.len(Chunk[First]) >= GetFrom then
      -- return rest of first chunk
      coroutine.yield(string.sub(Chunk[First], GetFrom), false)
    end
  end)
  -- return (thread); a coroutine
end
-- MarkusHuber