I might have optimized project() too much; a simple version would do nicely.
/*
 * Project the random value 'ran' into the interval [0, n] by rejection.
 *
 * Each candidate is shifted right by the number of leading zero bits of
 * 'n', so only as many high-order bits are kept as 'n' has significant
 * bits. A candidate that is still greater than 'n' is thrown away and
 * replaced by I2UInt(xorshift128plus(state->s)), and the check is repeated.
 *
 *   ran    first candidate value
 *   n      inclusive upper bound of the result
 *   state  generator state; state->s is handed to xorshift128plus() each
 *          time a candidate is rejected
 *
 * Returns a value in [0, n].
 *
 * NOTE(review): the shift count comes from __builtin_clzll(), which counts
 * in 'unsigned long long'. This assumes lua_Unsigned has the same width;
 * confirm against the build configuration.
 */
static lua_Unsigned project(lua_Unsigned ran, lua_Unsigned n, RanState *state)
{
    int bits;

    /*
     * __builtin_clzll(0) is undefined, and a shift by the full type width
     * would be undefined as well. The interval [0, 0] has one member, so
     * answer directly without drawing any further random values.
     */
    if (n == 0) {
        return 0;
    }

    bits = __builtin_clzll(n);  /* leading zero bits of the bound */

    /* Keep the high-order bits of the candidate; retry while out of range. */
    while ((ran >>= bits) > n) {
        ran = I2UInt( xorshift128plus(state->s) );
    }
    return ran;
}