Validate Unicode String

wiki

This code handles UTF-8 encoded Unicode strings. It provides the following API:

string.subutf8(string, start[,end]) substrings, UTF-8 aware
pos, char = string.nextutf8(string, orig_pos) returns the char at orig_pos and the next char's position in pos.
for i, char in str:nextutf8(orig_pos) iterates through the string, starting at orig_pos.
pos = string.seekutf8(string, orig_pos, n) returns the position orig_pos, N characters forward (or backwards, if N negative).
char = string.utf8char(code) returns the char the code of which is code.
code = string.utf8code(char) returns the code of char (UTF-8 character).
len = string.utf8len(string) returns the length of string in UTF-8 characters.

UTF-8 BOM has by convention a code of 0. Valid code ranges are: 0-0xD7FF, 0xE000-0x10FFFF.

The UTF-8 encoding

Unicode is a universal character set, widely used in XML documents.

The point of this is that Unicode codepoints are 0-21 bits in length. With UTF-8, ASCII characters are stored as one byte, others use from 2 to 4 bytes. See [RFC 3629].

The previous paragraph formerly linked to [RFC 2279], which has been obsoleted by RFC-3629 to bring it into alignment with the Unicode Standard [1]. A reasonably fast standards-compliant pure Lua library can be found at [2]. (link broken)

StephaneArnold 2007-11-13 - I deleted the posted code, which was not compliant with the latest UTF-8 standard. I have converted some functions of the 'pure Lua library' to C functions:

lua_utf8.c

/*==================================================================*/
/*			C program by sarnold@free.fr 2007, MIT license
			based on the work of Rici Lake rici@ricilake.net		*/
/*==================================================================*/

#include <memory.h>
#include <stdlib.h>
#include <string.h>
#include "lua.h"
#include "lauxlib.h"
#include "lualib.h"

#define INVALID_UTF8 "invalid utf-8 string"
/* Non-zero when the byte p points at is below 128 (7-bit ASCII, NUL included). */
#define POINTS_ASCII(p) (*((const unsigned char*)(p)) < 128)
/* Non-zero when min <= x <= max. Beware: x is evaluated twice. */
#define RANGE(x, min, max) ((x)>=(min) && (x)<=(max))
/* Non-zero when x is a UTF-8 continuation byte (0x80..0xBF). */
#define RANGE_SND(x) RANGE(x,128,191)
/* Non-zero when the three bytes at p are EF BB BF. */
#define UTF8_BOM(p) ((p)[0] == 0xEF && (p)[1] == 0xBB && (p)[2] == 0xBF)

/** Length in bytes of the UTF-8 sequence that starts at str.
	Returns 1 to 4 for a well-formed sequence and 0 when the bytes at str do
	not start one: stray continuation byte, overlong form, UTF-16 surrogate
	(ED A0..BF xx), value above 0x10FFFF, or a sequence cut short.
	A NUL byte counts as a 1-byte character.
	Trailing bytes are tested left to right with short-circuit evaluation, so
	on a NUL-terminated buffer no byte after the terminator is read.
 */
int sarn_utf8_next(const unsigned char* str)
{
	const unsigned char lead = *str;
	unsigned char lo = 128, hi = 191;	/* allowed range of the 2nd byte */

	if (lead < 128)
		return 1;
	/* 0x80..0xC1 are continuation bytes or overlong 2-byte leads,
	   0xF5 and above would encode values beyond 0x10FFFF */
	if (lead < 194 || lead > 244)
		return 0;

	if (lead < 224)
		return RANGE_SND(str[1]) ? 2 : 0;

	if (lead < 240) {
		if (lead == 224)
			lo = 160;	/* below that the form is overlong */
		else if (lead == 237)
			hi = 159;	/* above that lie the surrogates */
		return (RANGE(str[1], lo, hi) && RANGE_SND(str[2])) ? 3 : 0;
	}

	if (lead == 240)
		lo = 144;		/* below that the form is overlong */
	else if (lead == 244)
		hi = 143;		/* above that the value exceeds 0x10FFFF */
	return (RANGE(str[1], lo, hi)
		&& RANGE_SND(str[2]) && RANGE_SND(str[3])) ? 4 : 0;
}

/* Step str one byte backwards; makes the enclosing function return 0 when
   no byte is left in front of it. Wrapped in do/while so that it behaves
   as a single statement. */
#define BACK(str, remain) \
	do { if (--(remain) <= 0) return 0; (str)--; } while (0)

/** Length in bytes of the UTF-8 sequence that ends right before str.
	remain is the number of bytes between the start of the buffer and str,
	plus one (the caller in this file passes offset+1), so stepping back is
	refused once the start of the buffer is reached.
	Returns 1 to 4, or 0 when no well-formed sequence ends right before str
	or when the start of the buffer is hit first.
	The accepted byte ranges are the same as in sarn_utf8_next.
 */
int sarn_utf8_prev(unsigned char* str, int remain)
{
	BACK(str,remain);
	if (*str < 128)
		return 1;
	
	BACK(str,remain);
	/* 2-byte leads are 0xC2..0xDF: 0xC0/0xC1 are overlong, 0xE0 opens a
	   3-byte sequence */
	if (RANGE(*str,194,223) && RANGE_SND(str[1]))
		return 2;
	
	BACK(str,remain);
	if (UTF8_BOM(str))
		return 3;
	if (RANGE(*str, 225, 239) && *str != 237 
		&& RANGE_SND(str[1]) && RANGE_SND(str[2]))
		return 3;
	if (*str == 224 && RANGE(str[1],160,191) && RANGE_SND(str[2]))
		return 3;
	/* after 0xED only 0x80..0x9F is legal; 0xA0..0xBF are the surrogates */
	if (*str == 237 && RANGE(str[1],128,159) && RANGE_SND(str[2]))
		return 3;
	
	BACK(str,remain);
	if (RANGE(*str, 241, 243) && RANGE_SND(str[1]) 
		&& RANGE_SND(str[2]) && RANGE_SND(str[3]))
		return 4;
	if (*str == 240 && RANGE(str[1],144,191) 
		&& RANGE_SND(str[2]) && RANGE_SND(str[3]))
		return 4;
	if (*str == 244 && RANGE(str[1],128,143) 
		&& RANGE_SND(str[2]) && RANGE_SND(str[3]))
		return 4;
	/* fail back */
	return 0;
}


/** Realign index on an UTF-8 char boundary in str.
	Returns the offset (0 to 3) to be seeked backwards, or -1 if it fails.
	The bytes str[0], str[-1], ... are examined, at most index of them
	before str, so str has to point at the byte to realign rather than at
	the start of the buffer.
	NOTE(review): presumably callers pass base+index; no caller is visible
	in this file, confirm before relying on it.
 */
int sarn_utf8_realign(unsigned char* str, size_t index)
{
	size_t i;
	
	for (i = 0; i<4 && index>=i;i++) {
		/* a non-zero length means a well-formed sequence starts here */
		if (sarn_utf8_next(str-i)!=0)
			return (int)i;
	}
	return -1;
}
		

/** Lua: pos, char = string.nextutf8(str, orig_pos)
	Pushes the 1-based byte position that follows the character starting at
	orig_pos, then that character as a string.
	Pushes a single nil when orig_pos lies beyond the end of str, which is
	what ends a generic for loop driven by this function.
	Raises "bad index value : 0" for position 0 and INVALID_UTF8 when no
	well-formed sequence starts at orig_pos.
 */
int sarn_utf8_next_func(lua_State* L)
{
	const char *text = luaL_checkstring(L, 1);
	size_t at = luaL_checklong(L, 2);
	size_t width;
	
	if (at > strlen(text)) {
		lua_pushnil(L);
		return 1;
	}
	if (at == 0)
		return luaL_error(L, "bad index value : 0");
	
	width = sarn_utf8_next((const unsigned char *)text + at - 1);
	if (width == 0)
		return luaL_error(L, INVALID_UTF8);
	
	lua_pushnumber(L, at + width);
	/* the sequence was validated, so exactly width bytes belong to it */
	lua_pushlstring(L, text + at - 1, width);
	return 2;
}

/** Lua: len = string.utf8len(str)
	Pushes the number of UTF-8 characters found before the first NUL byte.
	Raises INVALID_UTF8 on the first malformed sequence.
 */
int sarn_utf8_len_func(lua_State *L)
{
	const unsigned char *cursor;
	size_t count;
	
	cursor = (const unsigned char *)luaL_checkstring(L, 1);
	
	for (count = 0; *cursor != '\0'; count++) {
		/* ASCII bytes come back as width 1, so no special case is needed */
		int width = sarn_utf8_next(cursor);
		if (width == 0)
			return luaL_error(L, INVALID_UTF8);
		cursor += width;
	}
	lua_pushnumber(L, count);
	return 1;
}

/** Lua: pos = string.seekutf8(str, orig_pos, n)
	Pushes the 1-based byte position reached by moving n characters forward
	from orig_pos (backwards when n is negative).
	- n == 0 pushes orig_pos unchanged, without validating it.
	- orig_pos outside 1..#str raises "invalid index (arg #2)".
	- Pushes nil when the move leaves the string or meets malformed UTF-8.
	Moving forward over the last character yields #str+1 (one past the
	end), whatever the byte width of that character; moving forward from
	there yields nil.
 */
int sarn_utf8_seek_func(lua_State *L)
{
	unsigned char* str;
	int pos, shift;
	int clen, len;

	str = (unsigned char*)luaL_checkstring(L, 1);
	pos = luaL_checklong(L, 2);
	shift = luaL_checklong(L, 3);
	len = (int)strlen((const char*)str);
	
	if (shift == 0) {
		lua_pushinteger(L, pos);
		return 1;
	}
	
	if (pos > len || pos < 1)
		return luaL_error(L, "invalid index (arg #2)");
	
	/* then, pos is 0-based */
	pos--;
	
	if (abs(shift) > len) {
		/* out of range: there cannot be more characters than bytes */
		lua_pushnil(L);
		return 1;
	}
	
	if (shift < 0) {
		while ((shift++) != 0) {
			/* pos+1 tells sarn_utf8_prev how far it may step back */
			clen = sarn_utf8_prev(str+pos, pos+1);
			if (clen == 0 || pos+1 < clen) {
				lua_pushnil(L);
				return 1;
			}
			pos -= clen;
		}
	} else {
		while ((shift--) != 0) {
			/* already on the terminator: stepping further would read
			   past the end of the buffer */
			if (pos >= len) {
				lua_pushnil(L);
				return 1;
			}
			clen = sarn_utf8_next(str+pos);
			if (clen == 0 || pos+clen > len) {
				lua_pushnil(L);
				return 1;
			}
			pos += clen;
		}
	}
	
	lua_pushinteger(L, pos+1);
	return 1;
}

/** Lua: char = string.utf8char(code)
	Pushes the UTF-8 encoding of code as a string.
	Code 0 is answered with the bytes EF BB BF (BOM convention of this
	library). Raises "invalid utf-8 code" for negative values, values of
	0x110000 and above, and the surrogate range 0xD800..0xDFFF.
 */
int sarn_utf8_char_func(lua_State *L)
{
	unsigned char out[5] = {0, 0, 0, 0, 0};
	long int value = luaL_checklong(L, 1);
	unsigned long int cp = value;
	
	if (value < 0 || value >= 0x110000L
		|| (value >= 0xD800 && value <= 0xDFFF))
		return luaL_error(L, "invalid utf-8 code");
	
	if (cp == 0) {
		/* UTF8 BOM */
		lua_pushstring(L, "\xEF\xBB\xBF");
		return 1;
	}
	
	if (cp < 0x80) {
		out[0] = (unsigned char)cp;
	} else if (cp < 0x800) {
		out[0] = (unsigned char)(0xC0 + (cp >> 6));
		out[1] = (unsigned char)(0x80 + (cp & 0x3f));
	} else if (cp < 0x10000UL) {
		out[0] = (unsigned char)(0xE0 + (cp >> 12));
		out[1] = (unsigned char)(0x80 + ((cp >> 6) & 0x3f));
		out[2] = (unsigned char)(0x80 + (cp & 0x3f));
	} else {
		out[0] = (unsigned char)(0xF0 + (cp >> 18));
		out[1] = (unsigned char)(0x80 + ((cp >> 12) & 0x3f));
		out[2] = (unsigned char)(0x80 + ((cp >> 6) & 0x3f));
		out[3] = (unsigned char)(0x80 + (cp & 0x3f));
	}
	/* every byte written above is non-zero, so out is a proper C string */
	lua_pushstring(L, (char*)out);
	return 1;
}
	
/** Lua: code = string.utf8code(char)
	Pushes the code of the single UTF-8 character held in the argument.
	The bytes EF BB BF are answered with 0 (BOM convention of this library).
	Raises INVALID_UTF8 unless the argument is exactly one well-formed
	sequence.
 */
int sarn_utf8_code_func(lua_State *L)
{
	/* payload bits carried by the lead byte of a 1, 2, 3 and 4 byte sequence */
	static const unsigned char lead_mask[] = {0x7F, 0x1F, 0x0F, 0x07};
	const unsigned char* bytes;
	size_t size, k;
	unsigned long int value;
	
	bytes = (const unsigned char*)luaL_checklstring(L, 1, &size);
	
	if (size != sarn_utf8_next(bytes))
		return luaL_error(L, INVALID_UTF8);
	
	if (UTF8_BOM(bytes)) {
		lua_pushinteger(L, 0);
		return 1;
	}
	
	value = bytes[0] & lead_mask[size-1];
	for (k = 1; k < size; k++)
		value = (value << 6) | (bytes[k] & 0x3F);
	lua_pushinteger(L, value);
	
	return 1;
}
	
	
/** Library entry point: adds the UTF-8 functions to the global "string"
	table, which is left on the stack. Returns no value to Lua.
 */
int luaopen_libluautf8 (lua_State *L)
{
	static const luaL_Reg additions[] = {
		{"nextutf8", sarn_utf8_next_func},
		{"utf8len",  sarn_utf8_len_func},
		{"seekutf8", sarn_utf8_seek_func},
		{"utf8code", sarn_utf8_code_func},
		{"utf8char", sarn_utf8_char_func},
		{NULL, NULL}
	};
	const luaL_Reg *entry;
	
	lua_getglobal(L, "string");
	for (entry = additions; entry->name != NULL; entry++) {
		lua_pushcfunction(L, entry->func);
		lua_setfield(L, -2, entry->name);
	}
	return 0;
}

Makefile

# Builds libluautf8.so from lua_utf8.c.
# all, compile and lua_utf8 are convenience names, not files: declaring them
# phony keeps a stray file of the same name from hiding the build, and the
# file rules below let make skip steps whose output is already up to date.
all: compile
LUA_CFLAGS=-O2 -fpic
LUA_LDFLAGS=-O -shared -fpic

.PHONY: all compile lua_utf8

compile: lua_utf8

lua_utf8: libluautf8.so

libluautf8.so: lua_utf8.o
	$(CC) $(CFLAGS) $(LUA_LDFLAGS) -o libluautf8.so lua_utf8.o

lua_utf8.o: lua_utf8.c
	$(CC) $(CFLAGS) $(LUA_CFLAGS) -c lua_utf8.c

utf8.lua

--- UTF-8 aware string objects, built on the functions that libluautf8
-- adds to the standard `string` table (utf8len, seekutf8, nextutf8).
-- Loading this module defines the global constructor `u`, replaces the
-- global `print` with a version that unwraps unicode strings, and returns `u`.
module(...,package.seeall)
require'libluautf8'

-- Lua 5.1 has a global unpack, later versions have table.unpack
local unpack = table.unpack or unpack

local mt = {}
local unistr = {}

--- Wraps a raw UTF-8 encoded string (default: '') in a unicode string object.
function unistr:new(str)
	return setmetatable({value = str or ''},mt)
end

-- redirects methods to unistr; `length` is computed on demand.
-- rawget is used so that an object without a `value` field cannot send
-- this function into endless recursion.
mt.__index = function(t,key) 
	if key == 'length' then return string.utf8len(rawget(t, 'value')) end
	return unistr[key]
end

-- string.sub style index normalisation, counted in characters:
-- negative indices count from the end, and never fall below 0
local function relative(idx, count)
	if idx < 0 then
		idx = idx + count + 1
		if idx < 0 then idx = 0 end
	end
	return idx
end

--- Substring by character positions, with the semantics of string.sub
-- (negative indices count from the end, out-of-range indices are clamped).
-- The whole string is scanned once to count its characters, then only
-- forward seeks are used to find the byte bounds.
-- @param first index of the first character
-- @param last index of the last character (default: -1, the last one)
-- @return the selected characters as a plain Lua string
function unistr:sub (first, last)
	local bytes = self.value
	local count = bytes:utf8len()
	local from = relative(first, count)
	local to = relative(last or -1, count)
	if from < 1 then from = 1 end
	if to > count then to = count end
	if from > to then return '' end

	-- byte position where character `from` starts
	local start = 1
	if from > 1 then
		start = bytes:seekutf8(1, from - 1) or #bytes + 1
	end
	if to == count then
		return bytes:sub(start)
	end
	-- the byte in front of character `to + 1` is the last byte of `to`
	local after = bytes:seekutf8(start, to - from + 1)
	if after == nil then
		return bytes:sub(start)
	end
	return bytes:sub(start, after - 1)
end

-- raw view of a value: unicode string objects are unwrapped,
-- anything else (strings, numbers, nil, ...) is passed through untouched
local u2s=function (str)
	if getmetatable(str) == mt then return str.value end
	return str
end
	
-- unicode strings concat; either operand may be a plain string or number
function mt.__concat(a,b) 
	return unistr:new(u2s(a)..u2s(b)) 
end
-- encoded string length with a metatable is not possible
-- so let's stick with a len() method
function unistr:len() 
	return self.value:utf8len() 
end
-- iterator: for pos, char in str:each() do ... end
function unistr:each(pos) return string.nextutf8, self.value, pos or 1 end
-- creates a global "u" function to be used like that: 
-- str = u"Hello" (it feels Python-like but is really a Lua function)
-- then, thanks to the metatable mechanism, concatenation and other funcs
-- can be invoked as if it was a simple scalar of type string
_G.u = function(str) return unistr:new(str) end

--- Wraps f so that unicode string objects among its arguments are replaced
-- by their raw strings; every argument is forwarded, nils included.
function unicodize(f)
	return function(...)
		local n = select('#', ...)
		local args = {...}
		for i = 1, n do
			args[i] = u2s(args[i])
		end
		return f(unpack(args, 1, n))
	end
end
_G.print = unicodize(print)
-- return this function
return _G.u

Test code

-- Smoke test for utf8.lua: nothing is printed for a passing check,
-- a failing one prints its name with the actual and the expected value.
require 'utf8'
local a = u'hello'
local b = "hello"

-- prints name[actual|expected] when the two values differ
local function assertEqual(name, actual, expected)
	if actual ~= expected then
		-- tostring keeps the report itself from failing on nil values
		print(name.."["..tostring(actual)..'|'..tostring(expected)..']')
	end
end

-- on ASCII text unicode substrings must agree with string.sub
for i = 0,10 do
	assertEqual("sub1."..i,a:sub(i),b:sub(i))
end

for i = 0,5 do
	for j = i,10 do
		assertEqual("sub2."..i.."-"..j,a:sub(i,j),b:sub(i,j))
	end
end

-- {text, expected number of characters}; the last entry holds a 2-byte char
local lentest = {{"h",1},{"",0},{"hel",3},{"hi Stéphane",11}}

for index,case in ipairs(lentest) do
	local str = u(case[1])
	assertEqual("len1."..index, str.length, case[2])
end

local firstName = u"Stéphane"
local lastName = u"Arnold"
print("hello "..firstName.." "..lastName)

RecentChanges · preferences
edit · history
Last edited February 22, 2018 12:25 am GMT (diff)