diff --git a/build/linux.mk b/build/linux.mk
index b414909..fc161f9 100644
--- a/build/linux.mk
+++ b/build/linux.mk
@@ -10,7 +10,8 @@ SOURCES += \
 	src/kilo.o \
 	src/embed_libtcc1.a.o \
 	src/embed_include.o \
-	src/embed_contrib_headers.o
+	src/embed_contrib_headers.o \
+	src/embed_stb.o
 
 all: embed cjit
 
@@ -20,8 +21,12 @@ embed: lib/tinycc/libtcc1.a
 	bash build/embed-path.sh lib/tinycc/libtcc1.a
 	bash build/embed-path.sh lib/tinycc/include
 	bash build/embed-path.sh lib/contrib_headers
-	@echo "\nreturn(true);\n}\n" >> src/embedded.c
-	@echo "\n#endif\n" >> src/embedded.h
+	bash build/embed-path.sh lib/stb
+	@echo                 >> src/embedded.c
+	@echo "return(true);" >> src/embedded.c
+	@echo "}"             >> src/embedded.c
+	@echo          >> src/embedded.h
+	@echo "#endif" >> src/embedded.h
 
 tinycc_config += --with-libgcc
 ifeq ($(shell sestatus | awk -F': *' '/SELinux status:/ {print $2}'), enabled)
diff --git a/build/musl.mk b/build/musl.mk
index 96e9c57..b07fc5a 100644
--- a/build/musl.mk
+++ b/build/musl.mk
@@ -16,7 +16,8 @@ SOURCES += \
 	src/embed_include.o \
 	src/embed_contrib_headers.o \
 	src/embed_libc.so.o \
-	src/musl-symbols.o
+	src/musl-symbols.o \
+	src/embed_stb.o
 
 # SOURCES += src/embed-musl-libc.o src/musl-symbols.o src/kilo.o
 
@@ -32,6 +33,7 @@ embed: lib/tinycc/libtcc1.a
 	bash build/embed-path.sh lib/tinycc/include
 	bash build/embed-path.sh lib/contrib_headers
 	bash build/embed-path.sh /lib/x86_64-linux-musl/libc.so
+	bash build/embed-path.sh lib/stb
 	@echo                 >> src/embedded.c
 	@echo "return(true);" >> src/embedded.c
 	@echo "}"             >> src/embedded.c
diff --git a/build/osx.mk b/build/osx.mk
index a08de39..4f43830 100644
--- a/build/osx.mk
+++ b/build/osx.mk
@@ -7,7 +7,8 @@ SOURCES += \
 	src/kilo.o \
 	src/embed_libtcc1.a.o \
 	src/embed_include.o \
-	src/embed_contrib_headers.o
+	src/embed_contrib_headers.o \
+	src/embed_stb.o
 
 all: embed cjit.command
 
@@ -17,8 +18,12 @@ embed: lib/tinycc/libtcc1.a
 	bash build/embed-path.sh lib/tinycc/libtcc1.a
 	bash build/embed-path.sh lib/tinycc/include
 	bash build/embed-path.sh lib/contrib_headers
-	@echo "\nreturn(true);\n}\n" >> src/embedded.c
-	@echo "\n#endif\n" >> src/embedded.h
+	bash build/embed-path.sh lib/stb
+	@echo                 >> src/embedded.c
+	@echo "return(true);" >> src/embedded.c
+	@echo "}"             >> src/embedded.c
+	@echo          >> src/embedded.h
+	@echo "#endif" >> src/embedded.h
 
 cjit.command: ${SOURCES}
 	$(cc) $(cflags) -o $@ $(SOURCES) ${ldflags} ${ldadd}
diff --git a/build/update-libs.sh b/build/update-libs.sh
new file mode 100755
index 0000000..996edcc
--- /dev/null
+++ b/build/update-libs.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+odir=lib/contrib_headers
+
+function fetch() { # usage: fetch <out> <url> -- download <url> to ${odir}/<out>
+	[ -z "$odir" ] && { >&2 echo "Script error: \$odir not set"; exit 1; }
+	local out="$1" url="$2" bak=".${odir}"
+	mkdir -p "${odir}" "${bak}"
+	if [ -r "${odir}/${out}" ]; then
+		mv "${odir}/${out}" "${bak}/${out}" # set the old copy aside to diff/restore
+		>&2 echo "Update: ${odir}/${out}"
+	else
+		>&2 echo "Download: ${odir}/${out}"
+	fi
+	# -f: an HTTP error must fail the transfer, not be saved as the header file
+	if ! curl -fsL --output "${odir}/${out}" "${url}"; then
+		>&2 echo "Error: cannot download ${url}"
+		rm -f "${odir}/${out}"
+		[ -r "${bak}/${out}" ] && mv "${bak}/${out}" "${odir}/${out}"
+		rm -rf "${bak}"; return 1
+	fi
+	if [ -r "${bak}/${out}" ] && ! diff "${odir}/${out}" "${bak}/${out}" > /dev/null; then
+		>&2 echo "DIFF ${out}"
+		>&2 diff "${odir}/${out}" "${bak}/${out}"
+		>&2 echo "END DIFF"
+	fi
+	rm -rf "${bak}"
+}
+
+# contrib headers: saved under the initial $odir (lib/contrib_headers)
+fetch dmon.h    https://raw.githubusercontent.com/septag/dmon/master/dmon.h
+fetch nuklear.h https://raw.githubusercontent.com/Immediate-Mode-UI/Nuklear/master/nuklear.h
+fetch miniaudio.h https://raw.githubusercontent.com/mackron/miniaudio/master/miniaudio.h
+
+# win32ports: every later fetch writes into whatever $odir is set to here
+odir="lib/win32ports"
+fetch unistd.h  https://raw.githubusercontent.com/win32ports/unistd_h/refs/heads/master/unistd.h
+fetch strings.h https://raw.githubusercontent.com/win32ports/strings_h/refs/heads/master/strings.h
+fetch dirent.h  https://raw.githubusercontent.com/win32ports/dirent_h/refs/heads/master/dirent.h
+odir="lib/win32ports/sys"
+fetch time.h https://raw.githubusercontent.com/win32ports/sys_time_h/refs/heads/master/sys/time.h
+fetch wait.h https://raw.githubusercontent.com/win32ports/sys_wait_h/refs/heads/master/sys/wait.h
+
+# With "stb" as first argument, also clone/update nothings/stb and copy its headers.
+# A plain `if` keeps the exit status 0 when run without it (`[ ... ] && {` returned 1).
+if [ "$1" = "stb" ]; then
+	if [ -d stb ]; then (cd stb && git pull --rebase)
+	else git clone https://github.com/nothings/stb.git
+	fi
+	mkdir -p lib/stb && cp stb/*.h lib/stb/
+fi
diff --git a/build/win-native.mk b/build/win-native.mk
index 177d40f..f409d6e 100755
--- a/build/win-native.mk
+++ b/build/win-native.mk
@@ -23,9 +23,10 @@ ldadd := lib/tinycc/libtcc.a -lshlwapi
 SOURCES += src/win-compat.o  \
 	src/embed_libtcc1.a.o     \
 	src/embed_include.o \
-	src/embed_contrib_headers.o \
 	src/embed_tinycc_win32.o \
-	src/embed_win32ports.o
+	src/embed_win32ports.o \
+	src/embed_contrib_headers.o \
+	src/embed_stb.o
 
 all: embed cjit.exe
 
@@ -35,8 +36,9 @@ embed: lib/tinycc/libtcc1.a
 	bash build/embed-path.sh lib/tinycc/libtcc1.a
 	bash build/embed-path.sh lib/tinycc/include
 	bash build/embed-path.sh lib/tinycc/win32/include tinycc_win32
-	bash build/embed-path.sh lib/contrib_headers
 	bash build/embed-path.sh lib/win32ports
+	bash build/embed-path.sh lib/contrib_headers
+	bash build/embed-path.sh lib/stb
 	@echo                 >> src/embedded.c
 	@echo "return(true);" >> src/embedded.c
 	@echo "}"             >> src/embedded.c
diff --git a/build/win-wsl.mk b/build/win-wsl.mk
index 86ed3fe..22bf2ef 100644
--- a/build/win-wsl.mk
+++ b/build/win-wsl.mk
@@ -21,9 +21,10 @@ tinycc_config += --ar=${ar}
 SOURCES += src/win-compat.o  \
 	src/embed_libtcc1.a.o     \
 	src/embed_include.o \
-	src/embed_contrib_headers.o \
 	src/embed_tinycc_win32.o \
-	src/embed_win32ports.o
+	src/embed_win32ports.o \
+	src/embed_contrib_headers.o \
+	src/embed_stb.o
 
 all: deps embed cjit.exe
 
@@ -33,10 +34,14 @@ embed: lib/tinycc/libtcc1.a
 	bash build/embed-path.sh lib/tinycc/libtcc1.a
 	bash build/embed-path.sh lib/tinycc/include
 	bash build/embed-path.sh lib/tinycc/win32/include tinycc_win32
-	bash build/embed-path.sh lib/contrib_headers
 	bash build/embed-path.sh lib/win32ports
-	@echo "\nreturn(true);\n}\n" >> src/embedded.c
-	@echo "\n#endif\n" >> src/embedded.h
+	bash build/embed-path.sh lib/contrib_headers
+	bash build/embed-path.sh lib/stb
+	@echo                 >> src/embedded.c
+	@echo "return(true);" >> src/embedded.c
+	@echo "}"             >> src/embedded.c
+	@echo          >> src/embedded.h
+	@echo "#endif" >> src/embedded.h
 
 cjit.exe: ${SOURCES}
 	./build/stamp-exe.sh
diff --git a/lib/stb/stb_c_lexer.h b/lib/stb/stb_c_lexer.h
new file mode 100644
index 0000000..fd42f1c
--- /dev/null
+++ b/lib/stb/stb_c_lexer.h
@@ -0,0 +1,941 @@
+// stb_c_lexer.h - v0.12 - public domain Sean Barrett 2013
+// lexer for making little C-like languages with recursive-descent parsers
+//
+// This file provides both the interface and the implementation.
+// To instantiate the implementation,
+//      #define STB_C_LEXER_IMPLEMENTATION
+// in *ONE* source file, before #including this file.
+//
+// The default configuration is fairly close to a C lexer, although
+// suffixes on integer constants are not handled (you can override this).
+//
+// History:
+//     0.12 fix compilation bug for NUL support; better support separate inclusion
+//     0.11 fix clang static analysis warning
+//     0.10 fix warnings
+//     0.09 hex floats, no-stdlib fixes
+//     0.08 fix bad pointer comparison
+//     0.07 fix mishandling of hexadecimal constants parsed by strtol
+//     0.06 fix missing next character after ending quote mark (Andreas Fredriksson)
+//     0.05 refixed get_location because github version had lost the fix
+//     0.04 fix octal parsing bug
+//     0.03 added STB_C_LEX_DISCARD_PREPROCESSOR option
+//          refactor API to simplify (only one struct instead of two)
+//          change literal enum names to have 'lit' at the end
+//     0.02 first public release
+//
+// Status:
+//     - haven't tested compiling as C++
+//     - haven't tested the float parsing path
+//     - haven't tested the non-default-config paths (e.g. non-stdlib)
+//     - only tested default-config paths by eyeballing output of self-parse
+//
+//     - haven't implemented multiline strings
+//     - haven't implemented octal/hex character constants
+//     - haven't implemented support for unicode CLEX_char
+//     - need to expand error reporting so you don't just get "CLEX_parse_error"
+//
+// Contributors:
+//   Arpad Goretity (bugfix)
+//   Alan Hickman (hex floats)
+//   github:mundusnine (bugfix)
+//
+// LICENSE
+//
+//   See end of file for license information.
+
+#ifdef STB_C_LEXER_IMPLEMENTATION
+#ifndef STB_C_LEXER_DEFINITIONS
+// to change the default parsing rules, copy the following lines
+// into your C/C++ file *before* including this, and then replace
+// the Y's with N's for the ones you don't want. This needs to be
+// set to the same values for every place in your program where
+// stb_c_lexer.h is included.
+// --BEGIN--
+
+#if defined(Y) || defined(N)
+#error "Can only use stb_c_lexer in contexts where the preprocessor symbols 'Y' and 'N' are not defined"
+#endif
+
+#define STB_C_LEX_C_DECIMAL_INTS    Y   //  "0|[1-9][0-9]*"                        CLEX_intlit
+#define STB_C_LEX_C_HEX_INTS        Y   //  "0x[0-9a-fA-F]+"                       CLEX_intlit
+#define STB_C_LEX_C_OCTAL_INTS      Y   //  "[0-7]+"                               CLEX_intlit
+#define STB_C_LEX_C_DECIMAL_FLOATS  Y   //  "[0-9]*(.[0-9]*([eE][-+]?[0-9]+)?)     CLEX_floatlit
+#define STB_C_LEX_C99_HEX_FLOATS    N   //  "0x{hex}+(.{hex}*)?[pP][-+]?{hex}+     CLEX_floatlit
+#define STB_C_LEX_C_IDENTIFIERS     Y   //  "[_a-zA-Z][_a-zA-Z0-9]*"               CLEX_id
+#define STB_C_LEX_C_DQ_STRINGS      Y   //  double-quote-delimited strings with escapes  CLEX_dqstring
+#define STB_C_LEX_C_SQ_STRINGS      N   //  single-quote-delimited strings with escapes  CLEX_sqstring
+#define STB_C_LEX_C_CHARS           Y   //  single-quote-delimited character with escape CLEX_charlit
+#define STB_C_LEX_C_COMMENTS        Y   //  "/* comment */"
+#define STB_C_LEX_CPP_COMMENTS      Y   //  "// comment to end of line\n"
+#define STB_C_LEX_C_COMPARISONS     Y   //  "==" CLEX_eq  "!=" CLEX_noteq   "<=" CLEX_lesseq  ">=" CLEX_greatereq
+#define STB_C_LEX_C_LOGICAL         Y   //  "&&"  CLEX_andand   "||"  CLEX_oror
+#define STB_C_LEX_C_SHIFTS          Y   //  "<<"  CLEX_shl      ">>"  CLEX_shr
+#define STB_C_LEX_C_INCREMENTS      Y   //  "++"  CLEX_plusplus "--"  CLEX_minusminus
+#define STB_C_LEX_C_ARROW           Y   //  "->"  CLEX_arrow
+#define STB_C_LEX_EQUAL_ARROW       N   //  "=>"  CLEX_eqarrow
+#define STB_C_LEX_C_BITWISEEQ       Y   //  "&="  CLEX_andeq    "|="  CLEX_oreq     "^="  CLEX_xoreq
+#define STB_C_LEX_C_ARITHEQ         Y   //  "+="  CLEX_pluseq   "-="  CLEX_minuseq
+                                        //  "*="  CLEX_muleq    "/="  CLEX_diveq    "%=" CLEX_modeq
+                                        //  if both STB_C_LEX_C_SHIFTS & STB_C_LEX_C_ARITHEQ:
+                                        //                      "<<=" CLEX_shleq    ">>=" CLEX_shreq
+
+#define STB_C_LEX_PARSE_SUFFIXES    N   // letters after numbers are parsed as part of those numbers, and must be in suffix list below
+#define STB_C_LEX_DECIMAL_SUFFIXES  ""  // decimal integer suffixes e.g. "uUlL" -- these are returned as-is in string storage
+#define STB_C_LEX_HEX_SUFFIXES      ""  // e.g. "uUlL"
+#define STB_C_LEX_OCTAL_SUFFIXES    ""  // e.g. "uUlL"
+#define STB_C_LEX_FLOAT_SUFFIXES    ""  //
+
+#define STB_C_LEX_0_IS_EOF             N  // if Y, ends parsing at '\0'; if N, returns '\0' as token
+#define STB_C_LEX_INTEGERS_AS_DOUBLES  N  // parses integers as doubles so they can be larger than 'int', but only if STB_C_LEX_USE_STDLIB==N
+#define STB_C_LEX_MULTILINE_DSTRINGS   N  // allow newlines in double-quoted strings
+#define STB_C_LEX_MULTILINE_SSTRINGS   N  // allow newlines in single-quoted strings
+#define STB_C_LEX_USE_STDLIB           Y  // use strtod,strtol for parsing #s; otherwise inaccurate hack
+#define STB_C_LEX_DOLLAR_IDENTIFIER    Y  // allow $ as an identifier character
+#define STB_C_LEX_FLOAT_NO_DECIMAL     Y  // allow floats that have no decimal point if they have an exponent
+
+#define STB_C_LEX_DEFINE_ALL_TOKEN_NAMES  N   // if Y, all CLEX_ token names are defined, even if never returned
+                                              // leaving it as N should help you catch config bugs
+
+#define STB_C_LEX_DISCARD_PREPROCESSOR    Y   // discard C-preprocessor directives (e.g. after preprocessing you may
+                                              // still have #line, #pragma, etc)
+
+//#define STB_C_LEX_ISWHITE(str)    ... // return length in bytes of whitespace characters if first char is whitespace
+
+#define STB_C_LEXER_DEFINITIONS         // This line prevents the header file from replacing your definitions
+// --END--
+#endif
+#endif
+
+#ifndef INCLUDE_STB_C_LEXER_H
+#define INCLUDE_STB_C_LEXER_H
+
+typedef struct
+{
+   // lexer variables
+   char *input_stream;        // start of the text being lexed, as given to stb_c_lexer_init
+   char *eof;                 // end of the input as given to stb_c_lexer_init (input_stream_end)
+   char *parse_point;         // where the next scan starts; set to one past each token's last char
+   char *string_storage;      // caller-supplied buffer for parsed strings and identifiers
+   int   string_storage_len;  // size of string_storage
+
+   // lexer parse location for error messages
+   char *where_firstchar;     // first character of the most recent token
+   char *where_lastchar;      // last character of the most recent token
+
+   // lexer token variables
+   long token;                // id of the most recent token: a CLEX_ value, or the character itself
+   double real_number;        // value for CLEX_floatlit
+   long   int_number;         // value for CLEX_intlit and CLEX_charlit
+   char *string;              // text of a string or identifier token (points into string_storage)
+   int string_len;            // byte length of 'string'
+} stb_lexer;
+
+typedef struct
+{
+   int line_number;   // line of the location, counting from 1
+   int line_offset;   // character offset within that line, counting from 0
+} stb_lex_location;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void stb_c_lexer_init(stb_lexer *lexer, const char *input_stream, const char *input_stream_end, char *string_store, int store_length);
+// this function initializes the 'lexer' structure
+//   Input:
+//   - input_stream points to the file to parse, loaded into memory
+//   - input_stream_end points to the end of the file, or NULL if you use 0-for-EOF
+//   - string_store is storage the lexer can use for storing parsed strings and identifiers
+//   - store_length is the length of that storage
+
+extern int stb_c_lexer_get_token(stb_lexer *lexer);
+// this function returns non-zero if a token is parsed, or 0 if at EOF
+//   Output:
+//   - lexer->token is the token ID: the character's own value for a single-char token, or a CLEX_ value (>= 256) for a multichar token, eof or error
+//   - lexer->real_number is a double constant value for CLEX_floatlit, or CLEX_intlit if STB_C_LEX_INTEGERS_AS_DOUBLES
+//   - lexer->int_number is an integer constant for CLEX_intlit if !STB_C_LEX_INTEGERS_AS_DOUBLES, or character for CLEX_charlit
+//   - lexer->string is a 0-terminated string for CLEX_dqstring or CLEX_sqstring or CLEX_id
+//   - lexer->string_len is the byte length of lexer->string
+
+extern void stb_c_lexer_get_location(const stb_lexer *lexer, const char *where, stb_lex_location *loc);
+// this inefficient function returns the line number and character offset of a
+// given location in the file as reported by stb_c_lexer_get_token. Because it's inefficient,
+// you should only call it for errors, not for every token.
+// For error messages of invalid tokens, you typically want the location of the start
+// of the token (which caused the token to be invalid). For bugs involving legit
+// tokens, you can report the first or the range.
+//    Output:
+//    - loc->line_number is the line number in the file, counting from 1, of the location
+//    - loc->line_offset is the char-offset in the line, counting from 0, of the location
+
+
+#ifdef __cplusplus
+}
+#endif
+
+enum
+{
+   CLEX_eof = 256,      // named token ids start at 256; a single-character token uses the character's own value
+   CLEX_parse_error,
+   CLEX_intlit        ,  // number is stored in the lexer's int_number
+   CLEX_floatlit      ,  // number is stored in the lexer's real_number
+   CLEX_id            ,  // text is stored in the lexer's string / string_len
+   CLEX_dqstring      ,
+   CLEX_sqstring      ,
+   CLEX_charlit       ,  // character is stored in the lexer's int_number
+   CLEX_eq            ,
+   CLEX_noteq         ,
+   CLEX_lesseq        ,
+   CLEX_greatereq     ,
+   CLEX_andand        ,
+   CLEX_oror          ,
+   CLEX_shl           ,
+   CLEX_shr           ,
+   CLEX_plusplus      ,
+   CLEX_minusminus    ,
+   CLEX_pluseq        ,
+   CLEX_minuseq       ,
+   CLEX_muleq         ,
+   CLEX_diveq         ,
+   CLEX_modeq         ,
+   CLEX_andeq         ,
+   CLEX_oreq          ,
+   CLEX_xoreq         ,
+   CLEX_arrow         ,
+   CLEX_eqarrow       ,
+   CLEX_shleq, CLEX_shreq,
+
+   CLEX_first_unused_token   // one past the last token id defined in this list
+
+};
+#endif // INCLUDE_STB_C_LEXER_H
+
+#ifdef STB_C_LEXER_IMPLEMENTATION
+
+// Hacky definitions so we can easily #if on them
+#define Y(x) 1
+#define N(x) 0
+
+#if STB_C_LEX_INTEGERS_AS_DOUBLES(x)
+typedef double     stb__clex_int;
+#define intfield   real_number
+#define STB__clex_int_as_double
+#else
+typedef long       stb__clex_int;
+#define intfield   int_number
+#endif
+
+// Convert these config options to simple conditional #defines so we can more
+// easily test them once we've changed the meaning of Y/N
+
+#if STB_C_LEX_PARSE_SUFFIXES(x)
+#define STB__clex_parse_suffixes
+#endif
+
+#if STB_C_LEX_C99_HEX_FLOATS(x)
+#define STB__clex_hex_floats
+#endif
+
+#if STB_C_LEX_C_HEX_INTS(x)
+#define STB__clex_hex_ints
+#endif
+
+#if STB_C_LEX_C_DECIMAL_INTS(x)
+#define STB__clex_decimal_ints
+#endif
+
+#if STB_C_LEX_C_OCTAL_INTS(x)
+#define STB__clex_octal_ints
+#endif
+
+#if STB_C_LEX_C_DECIMAL_FLOATS(x)
+#define STB__clex_decimal_floats
+#endif
+
+#if STB_C_LEX_DISCARD_PREPROCESSOR(x)
+#define STB__clex_discard_preprocessor
+#endif
+
+#if STB_C_LEX_USE_STDLIB(x) && (!defined(STB__clex_hex_floats) || __STDC_VERSION__ >= 199901L)
+#define STB__CLEX_use_stdlib
+#include <stdlib.h>
+#endif
+
+// Now for the rest of the file we'll use the basic definition where
+// Y expands to its contents and N expands to nothing
+#undef  Y
+#define Y(a) a
+#undef N
+#define N(a)
+
+// API function: point 'lexer' at the text to scan and at the caller's string buffer
+void stb_c_lexer_init(stb_lexer *lexer, const char *input_stream, const char *input_stream_end, char *string_store, int store_length)
+{
+   char *text = (char *) input_stream;   // the struct keeps non-const pointers
+   lexer->string_storage_len = store_length;
+   lexer->string_storage     = string_store;
+   lexer->eof                = (char *) input_stream_end;
+   lexer->input_stream       = lexer->parse_point = text;
+}
+
+// API function: convert 'where' (a pointer into the input) to a 1-based line and 0-based column
+void stb_c_lexer_get_location(const stb_lexer *lexer, const char *where, stb_lex_location *loc)
+{
+   const char *p = lexer->input_stream;
+   int line_number = 1;
+   int char_offset = 0;
+   while (p < where && *p) { // bound test first, so nothing at or past 'where' is ever read
+      if (*p == '\n' || *p == '\r') {
+         // "\r\n" or "\n\r" counts as one line break; only peek at p[1] while it is before 'where'
+         p += (p+1 < where && p[0]+p[1] == '\r'+'\n' ? 2 : 1);
+         line_number += 1;
+         char_offset = 0;
+      } else {
+         ++p, ++char_offset;
+      }
+   }
+   loc->line_number = line_number;
+   loc->line_offset = char_offset;
+}
+
+// main helper: record a finished token spanning start..end and resume right after 'end'
+static int stb__clex_token(stb_lexer *lexer, int token, char *start, char *end)
+{
+   lexer->where_firstchar = start;
+   lexer->where_lastchar  = end;
+   lexer->parse_point     = end + 1;
+   lexer->token           = token;
+   return 1;   // non-zero tells the caller a token was produced
+}
+
+// helper function for returning eof: flags the token as CLEX_eof; the 0 result
+// is what stb_c_lexer_get_token() hands back when no token was parsed
+static int stb__clex_eof(stb_lexer *lexer)
+{
+   return (lexer->token = CLEX_eof, 0);
+}
+
+static int stb__clex_iswhite(int x)
+{  // whitespace here means space, tab, carriage return, newline or form feed
+   switch (x) { case ' ': case '\t': case '\r': case '\n': case '\f': return 1; default: return 0; }
+}
+
+// like strchr(), except that searching for ch == 0 yields a null pointer
+static const char *stb__strchr(const char *str, int ch)
+{
+   while (*str && *str != ch)
+      ++str;
+   return *str ? str : 0;
+}
+
+// parse suffixes at the end of a number: when suffix parsing is compiled in, every letter
+// after the digits must occur in 'suffixes' and is copied to string storage
+static int stb__clex_parse_suffixes(stb_lexer *lexer, long tokenid, char *start, char *cur, const char *suffixes)
+{
+   #ifdef STB__clex_parse_suffixes
+   lexer->string_len = 0;
+   lexer->string = lexer->string_storage;
+   for (;;) {
+      if (!((*cur >= 'a' && *cur <= 'z') || (*cur >= 'A' && *cur <= 'Z'))) break;
+      // reject a letter that is not an allowed suffix, or one that no longer fits in the storage
+      if (stb__strchr(suffixes, *cur) == 0 || lexer->string_len+1 >= lexer->string_storage_len)
+         return stb__clex_token(lexer, CLEX_parse_error, start, cur);
+      lexer->string[lexer->string_len++] = *cur++;
+   }
+   #else
+   suffixes = suffixes; // self-assignment only silences the unused-parameter warning
+   #endif
+   return stb__clex_token(lexer, tokenid, start, cur-1);
+}
+
+#ifndef STB__CLEX_use_stdlib
+static double stb__clex_pow(double base, unsigned int exponent)
+{
+   double result = 1;   // built up by square-and-multiply over the bits of 'exponent'
+   while (exponent != 0) {
+      if (exponent & 1) result *= base;
+      base *= base;
+      exponent >>= 1;
+   }
+   return result;
+}
+
+static double stb__clex_parse_float(char *p, char **q)
+{
+   char *s = p;           // remembered so a failed hex-float parse can hand back the start
+   double value=0;
+   int base=10;
+   int exponent=0;        // used as a flag here: non-zero when an exponent part follows
+   // parses a decimal (or, if enabled, hex) float starting at p; *q receives the end position
+#ifdef STB__clex_hex_floats
+   if (*p == '0') {
+      if (p[1] == 'x' || p[1] == 'X') {
+         base=16;
+         p += 2;
+      }
+   }
+#endif
+   // integer part
+   for (;;) {
+      if (*p >= '0' && *p <= '9')
+         value = value*base + (*p++ - '0');
+#ifdef STB__clex_hex_floats
+      else if (base == 16 && *p >= 'a' && *p <= 'f')
+         value = value*base + 10 + (*p++ - 'a');
+      else if (base == 16 && *p >= 'A' && *p <= 'F')
+         value = value*base + 10 + (*p++ - 'A');
+#endif
+      else
+         break;
+   }
+   // optional fraction: digits are accumulated as an integer in addend, then divided by pow
+   if (*p == '.') {
+      double pow, addend = 0;
+      ++p;
+      for (pow=1; ; pow*=base) {   // pow grows by one factor of base per digit consumed
+         if (*p >= '0' && *p <= '9')
+            addend = addend*base + (*p++ - '0');
+#ifdef STB__clex_hex_floats
+         else if (base == 16 && *p >= 'a' && *p <= 'f')
+            addend = addend*base + 10 + (*p++ - 'a');
+         else if (base == 16 && *p >= 'A' && *p <= 'F')
+            addend = addend*base + 10 + (*p++ - 'A');
+#endif
+         else
+            break;
+      }
+      value += addend / pow;
+   }
+#ifdef STB__clex_hex_floats
+   if (base == 16) {
+      // exponent required for hex float literal
+      if (*p != 'p' && *p != 'P') {
+         *q = s;   // *q == start of input signals "nothing parsed" to the caller
+         return 0;
+      }
+      exponent = 1;
+   } else
+#endif
+      exponent = (*p == 'e' || *p == 'E');
+   // exponent digits are always read as decimal; hex floats scale by 2^n, others by 10^n
+   if (exponent) {
+      int sign = p[1] == '-';        // p is still on the exponent letter here
+      unsigned int exponent=0;       // shadows the outer flag; this one holds the magnitude
+      double power=1;
+      ++p;
+      if (*p == '-' || *p == '+')
+         ++p;
+      while (*p >= '0' && *p <= '9')
+         exponent = exponent*10 + (*p++ - '0');
+
+#ifdef STB__clex_hex_floats
+      if (base == 16)
+         power = stb__clex_pow(2, exponent);
+      else
+#endif
+         power = stb__clex_pow(10, exponent);
+      if (sign)
+         value /= power;
+      else
+         value *= power;
+   }
+   *q = p;
+   return value;
+}
+#endif
+
+static int stb__clex_parse_char(char *p, char **q)
+{
+   // returns the decoded character (0..255) with *q just past it, or -1 for an
+   // escape this lexer cannot decode yet; an unknown escape yields the '\\' itself
+   if (*p == '\\') {
+      *q = p+2; // tentatively guess we'll parse two characters
+      switch(p[1]) {
+         case '\\': case '\'': case '"': case '?': return p[1]; // escapes that stand for themselves
+         case 'a': return '\a';   case 'b': return '\b';
+         case 't': return '\t';   case 'n': return '\n';
+         case 'v': return '\v';   case 'f': return '\f';
+         case 'r': return '\r';
+         case '0': return '\0'; // @TODO octal constants
+         case 'x': case 'X': return -1; // @TODO hex constants
+         case 'u': return -1; // @TODO unicode constants
+      }
+   }
+   *q = p+1;
+   return (unsigned char) *p;
+}
+
+// copy the quoted string at p (p[0] is the opening quote) into string storage, decoding escapes
+static int stb__clex_parse_string(stb_lexer *lexer, char *p, int type)
+{
+   char *start = p, *q;
+   char delim = *p++; // grab the " or ' for later matching
+   char *out = lexer->string_storage, *outend = out + lexer->string_storage_len;
+   for (;;) {
+      int n;
+      if (p == lexer->eof || (*p == '\\' && p+1 == lexer->eof) STB_C_LEX_0_IS_EOF(|| *p == 0)) // unterminated string
+         return stb__clex_token(lexer, CLEX_parse_error, start, p-1);
+      if (out >= outend) // one byte must stay free for the terminating 0, even for an empty string
+         return stb__clex_token(lexer, CLEX_parse_error, start, p);
+      if (*p == delim) break;
+      if (*p == '\\') {
+         n = stb__clex_parse_char(p, &q);
+         if (n < 0)
+            return stb__clex_token(lexer, CLEX_parse_error, start, q);
+         p = q;
+      } else {
+         n = (unsigned char) *p++; // @OPTIMIZE: could speed this up by looping-while-not-backslash
+      }
+      *out++ = (char) n; // @TODO expand unicode escapes to UTF8
+   }
+   *out = 0;
+   lexer->string = lexer->string_storage;
+   lexer->string_len = (int) (out - lexer->string_storage);
+   return stb__clex_token(lexer, type, start, p);
+}
+
+int stb_c_lexer_get_token(stb_lexer *lexer)
+{
+   char *p = lexer->parse_point;
+
+   // skip whitespace and comments
+   for (;;) {
+      #ifdef STB_C_LEX_ISWHITE
+      while (p != lexer->stream_end) {
+         int n;
+         n = STB_C_LEX_ISWHITE(p);
+         if (n == 0) break;
+         if (lexer->eof && lexer->eof - lexer->parse_point < n)
+            return stb__clex_token(tok, CLEX_parse_error, p,lexer->eof-1);
+         p += n;
+      }
+      #else
+      while (p != lexer->eof && stb__clex_iswhite(*p))
+         ++p;
+      #endif
+
+      STB_C_LEX_CPP_COMMENTS(
+         if (p != lexer->eof && p[0] == '/' && p[1] == '/') {
+            while (p != lexer->eof && *p != '\r' && *p != '\n')
+               ++p;
+            continue;
+         }
+      )
+
+      STB_C_LEX_C_COMMENTS(
+         if (p != lexer->eof && p[0] == '/' && p[1] == '*') {
+            char *start = p;
+            p += 2;
+            while (p != lexer->eof && (p[0] != '*' || p[1] != '/'))
+               ++p;
+            if (p == lexer->eof)
+               return stb__clex_token(lexer, CLEX_parse_error, start, p-1);
+            p += 2;
+            continue;
+         }
+      )
+
+      #ifdef STB__clex_discard_preprocessor
+         // @TODO this discards everything after a '#', regardless
+         // of where in the line the # is, rather than requiring it
+         // be at the start. (because this parser doesn't otherwise
+         // check for line breaks!)
+         if (p != lexer->eof && p[0] == '#') {
+            while (p != lexer->eof && *p != '\r' && *p != '\n')
+               ++p;
+            continue;
+         }
+      #endif
+
+      break;
+   }
+
+   if (p == lexer->eof)
+      return stb__clex_eof(lexer);
+
+   switch (*p) {
+      default:
+         if (   (*p >= 'a' && *p <= 'z')
+             || (*p >= 'A' && *p <= 'Z')
+             || *p == '_' || (unsigned char) *p >= 128    // >= 128 is UTF8 char
+             STB_C_LEX_DOLLAR_IDENTIFIER( || *p == '$' ) )
+         {
+            int n = 0;
+            lexer->string = lexer->string_storage;
+            do {
+               if (n+1 >= lexer->string_storage_len)
+                  return stb__clex_token(lexer, CLEX_parse_error, p, p+n);
+               lexer->string[n] = p[n];
+               ++n;
+            } while (
+                  (p[n] >= 'a' && p[n] <= 'z')
+               || (p[n] >= 'A' && p[n] <= 'Z')
+               || (p[n] >= '0' && p[n] <= '9') // allow digits in middle of identifier
+               || p[n] == '_' || (unsigned char) p[n] >= 128
+                STB_C_LEX_DOLLAR_IDENTIFIER( || p[n] == '$' )
+            );
+            lexer->string[n] = 0;
+            lexer->string_len = n;
+            return stb__clex_token(lexer, CLEX_id, p, p+n-1);
+         }
+
+         // check for EOF
+         STB_C_LEX_0_IS_EOF(
+            if (*p == 0)
+               return stb__clex_eof(lexer);
+         )
+
+      single_char:
+         // not an identifier, return the character as itself
+         return stb__clex_token(lexer, *p, p, p);
+
+      case '+':
+         if (p+1 != lexer->eof) {
+            STB_C_LEX_C_INCREMENTS(if (p[1] == '+') return stb__clex_token(lexer, CLEX_plusplus, p,p+1);)
+            STB_C_LEX_C_ARITHEQ(   if (p[1] == '=') return stb__clex_token(lexer, CLEX_pluseq  , p,p+1);)
+         }
+         goto single_char;
+      case '-':
+         if (p+1 != lexer->eof) {
+            STB_C_LEX_C_INCREMENTS(if (p[1] == '-') return stb__clex_token(lexer, CLEX_minusminus, p,p+1);)
+            STB_C_LEX_C_ARITHEQ(   if (p[1] == '=') return stb__clex_token(lexer, CLEX_minuseq   , p,p+1);)
+            STB_C_LEX_C_ARROW(     if (p[1] == '>') return stb__clex_token(lexer, CLEX_arrow     , p,p+1);)
+         }
+         goto single_char;
+      case '&':
+         if (p+1 != lexer->eof) {
+            STB_C_LEX_C_LOGICAL(  if (p[1] == '&') return stb__clex_token(lexer, CLEX_andand, p,p+1);)
+            STB_C_LEX_C_BITWISEEQ(if (p[1] == '=') return stb__clex_token(lexer, CLEX_andeq , p,p+1);)
+         }
+         goto single_char;
+      case '|':
+         if (p+1 != lexer->eof) {
+            STB_C_LEX_C_LOGICAL(  if (p[1] == '|') return stb__clex_token(lexer, CLEX_oror, p,p+1);)
+            STB_C_LEX_C_BITWISEEQ(if (p[1] == '=') return stb__clex_token(lexer, CLEX_oreq, p,p+1);)
+         }
+         goto single_char;
+      case '=':
+         if (p+1 != lexer->eof) {
+            STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_eq, p,p+1);)
+            STB_C_LEX_EQUAL_ARROW(  if (p[1] == '>') return stb__clex_token(lexer, CLEX_eqarrow, p,p+1);)
+         }
+         goto single_char;
+      case '!':
+         STB_C_LEX_C_COMPARISONS(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_noteq, p,p+1);)
+         goto single_char;
+      case '^':
+         STB_C_LEX_C_BITWISEEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_xoreq, p,p+1));
+         goto single_char;
+      case '%':
+         STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_modeq, p,p+1));
+         goto single_char;
+      case '*':
+         STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_muleq, p,p+1));
+         goto single_char;
+      case '/':
+         STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_diveq, p,p+1));
+         goto single_char;
+      case '<':
+         if (p+1 != lexer->eof) {
+            STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_lesseq, p,p+1);)
+            STB_C_LEX_C_SHIFTS(     if (p[1] == '<') {
+                                       STB_C_LEX_C_ARITHEQ(if (p+2 != lexer->eof && p[2] == '=')
+                                                              return stb__clex_token(lexer, CLEX_shleq, p,p+2);)
+                                       return stb__clex_token(lexer, CLEX_shl, p,p+1);
+                                    }
+                              )
+         }
+         goto single_char;
+      case '>':
+         if (p+1 != lexer->eof) {
+            STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_greatereq, p,p+1);)
+            STB_C_LEX_C_SHIFTS(     if (p[1] == '>') {
+                                       STB_C_LEX_C_ARITHEQ(if (p+2 != lexer->eof && p[2] == '=')
+                                                              return stb__clex_token(lexer, CLEX_shreq, p,p+2);)
+                                       return stb__clex_token(lexer, CLEX_shr, p,p+1);
+                                    }
+                              )
+         }
+         goto single_char;
+
+      case '"':
+         STB_C_LEX_C_DQ_STRINGS(return stb__clex_parse_string(lexer, p, CLEX_dqstring);)
+         goto single_char;
+      case '\'':
+         STB_C_LEX_C_SQ_STRINGS(return stb__clex_parse_string(lexer, p, CLEX_sqstring);)
+         STB_C_LEX_C_CHARS(
+         {
+            char *start = p;
+            lexer->int_number = stb__clex_parse_char(p+1, &p);
+            if (lexer->int_number < 0)
+               return stb__clex_token(lexer, CLEX_parse_error, start,start);
+            if (p == lexer->eof || *p != '\'')
+               return stb__clex_token(lexer, CLEX_parse_error, start,p);
+            return stb__clex_token(lexer, CLEX_charlit, start, p+1);
+         })
+         goto single_char;
+
+      case '0':
+         #if defined(STB__clex_hex_ints) || defined(STB__clex_hex_floats)
+            if (p+1 != lexer->eof) {
+               if (p[1] == 'x' || p[1] == 'X') {
+                  char *q;
+
+                  #ifdef STB__clex_hex_floats
+                  for (q=p+2;
+                       q != lexer->eof && ((*q >= '0' && *q <= '9') || (*q >= 'a' && *q <= 'f') || (*q >= 'A' && *q <= 'F'));
+                       ++q);
+                  if (q != lexer->eof) {
+                     if (*q == '.' STB_C_LEX_FLOAT_NO_DECIMAL(|| *q == 'p' || *q == 'P')) {
+                        #ifdef STB__CLEX_use_stdlib
+                        lexer->real_number = strtod((char *) p, (char**) &q);
+                        #else
+                        lexer->real_number = stb__clex_parse_float(p, &q);
+                        #endif
+
+                        if (p == q)
+                           return stb__clex_token(lexer, CLEX_parse_error, p,q);
+                        return stb__clex_parse_suffixes(lexer, CLEX_floatlit, p,q, STB_C_LEX_FLOAT_SUFFIXES);
+
+                     }
+                  }
+                  #endif   // STB__CLEX_hex_floats
+
+                  #ifdef STB__clex_hex_ints
+                  #ifdef STB__CLEX_use_stdlib
+                  lexer->int_number = strtol((char *) p, (char **) &q, 16);
+                  #else
+                  {
+                     stb__clex_int n=0;
+                     for (q=p+2; q != lexer->eof; ++q) {
+                        if (*q >= '0' && *q <= '9')
+                           n = n*16 + (*q - '0');
+                        else if (*q >= 'a' && *q <= 'f')
+                           n = n*16 + (*q - 'a') + 10;
+                        else if (*q >= 'A' && *q <= 'F')
+                           n = n*16 + (*q - 'A') + 10;
+                        else
+                           break;
+                     }
+                     lexer->int_number = n;
+                  }
+                  #endif
+                  if (q == p+2)
+                     return stb__clex_token(lexer, CLEX_parse_error, p-2,p-1);
+                  return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_HEX_SUFFIXES);
+                  #endif
+               }
+            }
+         #endif // defined(STB__clex_hex_ints) || defined(STB__clex_hex_floats)
+         // can't test for octal because we might parse '0.0' as float or as '0' '.' '0',
+         // so have to do float first
+
+         /* FALL THROUGH */
+      case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
+
+         #ifdef STB__clex_decimal_floats
+         {
+            char *q = p;
+            while (q != lexer->eof && (*q >= '0' && *q <= '9'))
+               ++q;
+            if (q != lexer->eof) {
+               if (*q == '.' STB_C_LEX_FLOAT_NO_DECIMAL(|| *q == 'e' || *q == 'E')) {
+                  #ifdef STB__CLEX_use_stdlib
+                  lexer->real_number = strtod((char *) p, (char**) &q);
+                  #else
+                  lexer->real_number = stb__clex_parse_float(p, &q);
+                  #endif
+
+                  return stb__clex_parse_suffixes(lexer, CLEX_floatlit, p,q, STB_C_LEX_FLOAT_SUFFIXES);
+
+               }
+            }
+         }
+         #endif // STB__clex_decimal_floats
+
+         #ifdef STB__clex_octal_ints
+         if (p[0] == '0') {
+            char *q = p;
+            #ifdef STB__CLEX_use_stdlib
+            lexer->int_number = strtol((char *) p, (char **) &q, 8);
+            #else
+            stb__clex_int n=0;
+            while (q != lexer->eof) {
+               if (*q >= '0' && *q <= '7')
+                  n = n*8 + (*q - '0');
+               else
+                  break;
+               ++q;
+            }
+            if (q != lexer->eof && (*q == '8' || *q=='9'))
+               return stb__clex_token(lexer, CLEX_parse_error, p, q);
+            lexer->int_number = n;
+            #endif
+            return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_OCTAL_SUFFIXES);
+         }
+         #endif // STB__clex_octal_ints
+
+         #ifdef STB__clex_decimal_ints
+         {
+            char *q = p;
+            #ifdef STB__CLEX_use_stdlib
+            lexer->int_number = strtol((char *) p, (char **) &q, 10);
+            #else
+            stb__clex_int n=0;
+            while (q != lexer->eof) {
+               if (*q >= '0' && *q <= '9')
+                  n = n*10 + (*q - '0');
+               else
+                  break;
+               ++q;
+            }
+            lexer->int_number = n;
+            #endif
+            return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_OCTAL_SUFFIXES);
+         }
+         #endif // STB__clex_decimal_ints
+         goto single_char;
+   }
+}
+#endif // STB_C_LEXER_IMPLEMENTATION
+
+#ifdef STB_C_LEXER_SELF_TEST
+#define _CRT_SECURE_NO_WARNINGS
+#include <stdio.h>
+#include <stdlib.h>
+
+// Self-test helper: print a readable rendering of the lexer's current token to stdout.
+static void print_token(stb_lexer *lexer)
+{
+   switch (lexer->token) {
+      case CLEX_id        : printf("_%s", lexer->string); break;
+      case CLEX_eq        : printf("=="); break;
+      case CLEX_noteq     : printf("!="); break;
+      case CLEX_lesseq    : printf("<="); break;
+      case CLEX_greatereq : printf(">="); break;
+      case CLEX_andand    : printf("&&"); break;
+      case CLEX_oror      : printf("||"); break;
+      case CLEX_shl       : printf("<<"); break;
+      case CLEX_shr       : printf(">>"); break;
+      case CLEX_plusplus  : printf("++"); break;
+      case CLEX_minusminus: printf("--"); break;
+      case CLEX_arrow     : printf("->"); break;
+      case CLEX_andeq     : printf("&="); break;
+      case CLEX_oreq      : printf("|="); break;
+      case CLEX_xoreq     : printf("^="); break;
+      case CLEX_pluseq    : printf("+="); break;
+      case CLEX_minuseq   : printf("-="); break;
+      case CLEX_muleq     : printf("*="); break;
+      case CLEX_diveq     : printf("/="); break;
+      case CLEX_modeq     : printf("%%="); break;
+      case CLEX_shleq     : printf("<<="); break;
+      case CLEX_shreq     : printf(">>="); break;
+      case CLEX_eqarrow   : printf("=>"); break;
+      case CLEX_dqstring  : printf("\"%s\"", lexer->string); break;
+      case CLEX_sqstring  : printf("'\"%s\"'", lexer->string); break;
+      case CLEX_charlit   : printf("'%c'", (int) lexer->int_number); break; // the lexer's '\'' case stores the char value in int_number
+      #if defined(STB__clex_int_as_double) && !defined(STB__CLEX_use_stdlib)
+      case CLEX_intlit    : printf("#%g", lexer->real_number); break;
+      #else
+      case CLEX_intlit    : printf("#%ld", lexer->int_number); break;
+      #endif
+      case CLEX_floatlit  : printf("%g", lexer->real_number); break;
+      default:
+         if (lexer->token >= 0 && lexer->token < 256)
+            printf("%c", (int) lexer->token);   // tokens below 256 are printed as that character
+         else
+            printf("<<<UNKNOWN TOKEN %ld >>>\n", lexer->token);
+         break;
+   }
+}
+
+/* Force a test
+of parsing
+multiline comments */
+
+/*/ comment /*/
+/**/ extern /**/
+
+void dummy(void)   // sample code (float literals, a printf call) for the self-test, whose main() lexes this header
+{
+   double some_floats[] = {
+      1.0501, -10.4e12, 5E+10,
+#if 0   // not supported in C++ or C-pre-99, so don't try to compile it, but let our parser test it
+      0x1.0p+24, 0xff.FP-8, 0x1p-23,
+#endif
+      4.
+   };
+   (void) sizeof(some_floats);   // reference the array so it is not reported as unused
+   (void) some_floats[1];
+
+   printf("test %d",1); // https://github.com/nothings/stb/issues/13
+}
+
+// Self-test driver: reads stb_c_lexer.h, lexes it and prints every token; returns 1 if it cannot be read.
+int main(int argc, char **argv)
+{
+   FILE *f = fopen("stb_c_lexer.h","rb");
+   char *text = (char *) malloc(1 << 20);
+   int len = (f && text) ? (int) fread(text, 1, 1<<20, f) : -1;   // never read into a failed allocation
+   stb_lexer lex;
+   if (f) fclose(f);   // f is NULL when fopen failed, and fclose(NULL) is undefined behavior
+   if (len < 0) {
+      fprintf(stderr, "Error opening file\n");
+      free(text);
+      return 1;
+   }
+
+   stb_c_lexer_init(&lex, text, text+len, (char *) malloc(0x10000), 0x10000);
+   while (stb_c_lexer_get_token(&lex)) {
+      if (lex.token == CLEX_parse_error) {
+         printf("\n<<<PARSE ERROR>>>\n");
+         break;
+      }
+      print_token(&lex);
+      printf("  ");
+   }
+   return 0;
+}
+#endif
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_connected_components.h b/lib/stb/stb_connected_components.h
new file mode 100644
index 0000000..f762f65
--- /dev/null
+++ b/lib/stb/stb_connected_components.h
@@ -0,0 +1,1049 @@
+// stb_connected_components - v0.96 - public domain connected components on grids
+//                                                 http://github.com/nothings/stb
+//
+// Finds connected components on 2D grids for testing reachability between
+// two points, with fast updates when changing reachability (e.g. on one machine
+// it was typically 0.2ms w/ 1024x1024 grid). Each grid square must be "open" or
+// "closed" (traversable or untraversable), and grid squares are only connected
+// to their orthogonal neighbors, not diagonally.
+//
+// In one source file, create the implementation by doing something like this:
+//
+//   #define STBCC_GRID_COUNT_X_LOG2    10
+//   #define STBCC_GRID_COUNT_Y_LOG2    10
+//   #define STB_CONNECTED_COMPONENTS_IMPLEMENTATION
+//   #include "stb_connected_components.h"
+//
+// The above creates an implementation that can run on maps up to 1024x1024.
+// Map sizes must be a multiple of (1<<(LOG2/2)) on each axis (e.g. 32 if LOG2=10,
+// 16 if LOG2=8, etc.) (You can just pad your map with untraversable space.)
+//
+// MEMORY USAGE
+//
+//   Uses about 6-7 bytes per grid square (e.g. 7MB for a 1024x1024 grid).
+//   Uses a single worst-case allocation which you pass in.
+//
+// PERFORMANCE
+//
+//   On a core i7-2700K at 3.5 Ghz, for a particular 1024x1024 map (map_03.png):
+//
+//       Creating map                   : 44.85 ms
+//       Making one square   traversable:  0.27 ms    (average over 29,448 calls)
+//       Making one square untraversable:  0.23 ms    (average over 30,123 calls)
+//       Reachability query:               0.00001 ms (average over 4,000,000 calls)
+//
+//   On non-degenerate maps update time is O(N^0.5), but on degenerate maps like
+//   checkerboards or 50% random, update time is O(N^0.75) (~2ms on above machine).
+//
+// CHANGELOG
+//
+//    0.96  (2019-03-04)  Fix warnings
+//    0.95  (2016-10-16)  Bugfix if multiple clumps in one cluster connect to same clump in another
+//    0.94  (2016-04-17)  Bugfix & optimize worst case (checkerboard & random)
+//    0.93  (2016-04-16)  Reduce memory by 10x for 1Kx1K map; small speedup
+//    0.92  (2016-04-16)  Compute sqrt(N) cluster size by default
+//    0.91  (2016-04-15)  Initial release
+//
+// TODO:
+//    - better API documentation
+//    - more comments
+//    - try re-integrating naive algorithm & compare performance
+//    - more optimized batching (current approach still recomputes local clumps many times)
+//    - function for setting a grid of squares at once (just use batching)
+//
+// LICENSE
+//
+//   See end of file for license information.
+//
+// ALGORITHM
+//
+//   The NxN grid map is split into sqrt(N) x sqrt(N) blocks called
+//  "clusters". Each cluster independently computes a set of connected
+//   components within that cluster (ignoring all connectivity out of
+//   that cluster) using a union-find disjoint set forest. This produces a bunch
+//   of locally connected components called "clumps". Each clump is (a) connected
+//   within its cluster, (b) does not directly connect to any other clumps in the
+//   cluster (though it may connect to them by paths that lead outside the cluster,
+//   but those are ignored at this step), and (c) maintains an adjacency list of
+//   all clumps in adjacent clusters that it _is_ connected to. Then a second
+//   union-find disjoint set forest is used to compute connected clumps
+//   globally, across the whole map. Reachability is then computed by
+//   finding which clump each input point belongs to, and checking whether
+//   those clumps are in the same "global" connected component.
+//
+//   The above data structure can be updated efficiently; on a change
+//   of a single grid square on the map, only one cluster changes its
+//   purely-local state, so only one cluster needs its clumps fully
+//   recomputed. Clumps in adjacent clusters need their adjacency lists
+//   updated: first to remove all references to the old clumps in the
+//   rebuilt cluster, then to add new references to the new clumps. Both
+//   of these operations can use the existing "find which clump each input
+//   point belongs to" query to compute that adjacency information rapidly.
+
+#ifndef INCLUDE_STB_CONNECTED_COMPONENTS_H
+#define INCLUDE_STB_CONNECTED_COMPONENTS_H
+
+#include <stdlib.h>
+
+typedef struct st_stbcc_grid stbcc_grid;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////////////////
+//
+//  initialization
+//
+
+// you allocate the grid data structure to this size (note that it will be very big!!!)
+extern size_t stbcc_grid_sizeof(void);
+
+// initialize the grid, value of map[] is 0 = traversable, non-0 is solid
+extern void stbcc_init_grid(stbcc_grid *g, unsigned char *map, int w, int h);
+
+
+//////////////////////////////////////////////////////////////////////////////////////////
+//
+//  main functionality
+//
+
+// update a grid square state, 0 = traversable, non-0 is solid
+// i can add a batch-update if it's needed
+extern void stbcc_update_grid(stbcc_grid *g, int x, int y, int solid);
+
+// query if two grid squares are reachable from each other
+extern int stbcc_query_grid_node_connection(stbcc_grid *g, int x1, int y1, int x2, int y2);
+
+
+//////////////////////////////////////////////////////////////////////////////////////////
+//
+//  bonus functions
+//
+
+// wrap multiple stbcc_update_grid calls in these functions to compute
+// multiple updates more efficiently; cannot make queries inside batch
+extern void stbcc_update_batch_begin(stbcc_grid *g);
+extern void stbcc_update_batch_end(stbcc_grid *g);
+
+// query the grid data structure for whether a given square is open or not
+extern int stbcc_query_grid_open(stbcc_grid *g, int x, int y);
+
+// get a unique id for the connected component this is in; it's not necessarily
+// small, so you'll need a hash table or something similar if you want to remap it
+extern unsigned int stbcc_get_unique_id(stbcc_grid *g, int x, int y);
+#define STBCC_NULL_UNIQUE_ID 0xffffffff // returned for closed map squares
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // INCLUDE_STB_CONNECTED_COMPONENTS_H
+
+#ifdef STB_CONNECTED_COMPONENTS_IMPLEMENTATION
+
+#include <assert.h>
+#include <string.h> // memset
+
+#if !defined(STBCC_GRID_COUNT_X_LOG2) || !defined(STBCC_GRID_COUNT_Y_LOG2)
+   #error "You must define STBCC_GRID_COUNT_X_LOG2 and STBCC_GRID_COUNT_Y_LOG2 to define the max grid supported."
+#endif
+
+#define STBCC__GRID_COUNT_X (1 << STBCC_GRID_COUNT_X_LOG2)
+#define STBCC__GRID_COUNT_Y (1 << STBCC_GRID_COUNT_Y_LOG2)
+
+#define STBCC__MAP_STRIDE   (1 << (STBCC_GRID_COUNT_X_LOG2-3))
+
+#ifndef STBCC_CLUSTER_SIZE_X_LOG2
+   #define STBCC_CLUSTER_SIZE_X_LOG2   (STBCC_GRID_COUNT_X_LOG2/2) // log2(sqrt(2^N)) = 1/2 * log2(2^N)) = 1/2 * N
+   #if STBCC_CLUSTER_SIZE_X_LOG2 > 6
+   #undef STBCC_CLUSTER_SIZE_X_LOG2
+   #define STBCC_CLUSTER_SIZE_X_LOG2 6
+   #endif
+#endif
+
+#ifndef STBCC_CLUSTER_SIZE_Y_LOG2
+   #define STBCC_CLUSTER_SIZE_Y_LOG2   (STBCC_GRID_COUNT_Y_LOG2/2)
+   #if STBCC_CLUSTER_SIZE_Y_LOG2 > 6
+   #undef STBCC_CLUSTER_SIZE_Y_LOG2
+   #define STBCC_CLUSTER_SIZE_Y_LOG2 6
+   #endif
+#endif
+
+#define STBCC__CLUSTER_SIZE_X   (1 << STBCC_CLUSTER_SIZE_X_LOG2)
+#define STBCC__CLUSTER_SIZE_Y   (1 << STBCC_CLUSTER_SIZE_Y_LOG2)
+
+#define STBCC__CLUSTER_COUNT_X_LOG2   (STBCC_GRID_COUNT_X_LOG2 - STBCC_CLUSTER_SIZE_X_LOG2)
+#define STBCC__CLUSTER_COUNT_Y_LOG2   (STBCC_GRID_COUNT_Y_LOG2 - STBCC_CLUSTER_SIZE_Y_LOG2)
+
+#define STBCC__CLUSTER_COUNT_X  (1 << STBCC__CLUSTER_COUNT_X_LOG2)
+#define STBCC__CLUSTER_COUNT_Y  (1 << STBCC__CLUSTER_COUNT_Y_LOG2)
+
+#if STBCC__CLUSTER_SIZE_X >= STBCC__GRID_COUNT_X || STBCC__CLUSTER_SIZE_Y >= STBCC__GRID_COUNT_Y
+   #error "STBCC_CLUSTER_SIZE_X/Y_LOG2 must be smaller than STBCC_GRID_COUNT_X/Y_LOG2"
+#endif
+
+// worst case # of clumps per cluster
+#define STBCC__MAX_CLUMPS_PER_CLUSTER_LOG2   (STBCC_CLUSTER_SIZE_X_LOG2 + STBCC_CLUSTER_SIZE_Y_LOG2-1)
+#define STBCC__MAX_CLUMPS_PER_CLUSTER        (1 << STBCC__MAX_CLUMPS_PER_CLUSTER_LOG2)
+#define STBCC__MAX_CLUMPS                    (STBCC__MAX_CLUMPS_PER_CLUSTER * STBCC__CLUSTER_COUNT_X * STBCC__CLUSTER_COUNT_Y)
+#define STBCC__NULL_CLUMPID                  STBCC__MAX_CLUMPS_PER_CLUSTER
+
+#define STBCC__CLUSTER_X_FOR_COORD_X(x)  ((x) >> STBCC_CLUSTER_SIZE_X_LOG2)
+#define STBCC__CLUSTER_Y_FOR_COORD_Y(y)  ((y) >> STBCC_CLUSTER_SIZE_Y_LOG2)
+
+#define STBCC__MAP_BYTE_MASK(x,y)       (1 << ((x) & 7))
+#define STBCC__MAP_BYTE(g,x,y)          ((g)->map[y][(x) >> 3])
+#define STBCC__MAP_OPEN(g,x,y)          (STBCC__MAP_BYTE(g,x,y) & STBCC__MAP_BYTE_MASK(x,y))
+
+typedef unsigned short stbcc__clumpid;
+typedef unsigned char stbcc__verify_max_clumps[STBCC__MAX_CLUMPS_PER_CLUSTER < (1 << (8*sizeof(stbcc__clumpid))) ? 1 : -1];
+
+#define STBCC__MAX_EXITS_PER_CLUSTER   (STBCC__CLUSTER_SIZE_X + STBCC__CLUSTER_SIZE_Y)   // 64 for 32x32
+#define STBCC__MAX_EXITS_PER_CLUMP     (STBCC__CLUSTER_SIZE_X + STBCC__CLUSTER_SIZE_Y)   // 64 for 32x32
+#define STBCC__MAX_EDGE_CLUMPS_PER_CLUSTER  (STBCC__MAX_EXITS_PER_CLUMP)
+
+// 2^19 * 2^6 => 2^25 exits => 2^26  => 64MB for 1024x1024
+
+// Logic for above on 4x4 grid:
+//
+// Many clumps:      One clump:
+//   + +               +  +
+//  +X.X.             +XX.X+
+//   .X.X+             .XXX
+//  +X.X.              XXX.
+//   .X.X+            +X.XX+
+//    + +              +  +
+//
+// 8 exits either way
+
+typedef unsigned char stbcc__verify_max_exits[STBCC__MAX_EXITS_PER_CLUMP <= 256 ? 1 : -1]; // compile-time check: a negative array size is always an error, size 0 is not
+
+typedef struct   // adjacency-list entry: names a clump in a neighboring cluster relative to the owning cluster
+{
+   unsigned short clump_index:12;   // index into the neighboring cluster's clump[] array
+     signed short cluster_dx:2;     // added to the owning cluster's x index to reach the neighbor
+     signed short cluster_dy:2;     // added to the owning cluster's y index to reach the neighbor
+} stbcc__relative_clumpid;
+
+typedef union   // absolute clump identifier, viewable as bitfields (f) or as one integer (c)
+{
+   struct {
+      unsigned int clump_index:12;   // index into cluster[cluster_y][cluster_x].clump[]
+      unsigned int cluster_x:10;     // cluster column
+      unsigned int cluster_y:10;     // cluster row
+   } f;
+   unsigned int c;                   // whole id as one value; used for equality tests and as the unique id
+} stbcc__global_clumpid;
+
+// rebuilt cluster 3,4
+
+// what changes in cluster 2,4
+
+typedef struct   // one locally-connected component inside a cluster
+{
+   stbcc__global_clumpid global_label;        // 4; union-find link: equals the clump's own id when it is a root
+   unsigned char num_adjacent;                // 1; adjacency entries currently in use
+   unsigned char max_adjacent;                // 1; adjacency entries reserved for this clump
+   unsigned char adjacent_clump_list_index;   // 1; offset of this clump's first entry in the cluster's adjacency_storage
+   unsigned char reserved;
+} stbcc__clump; // 8
+
+#define STBCC__CLUSTER_ADJACENCY_COUNT   (STBCC__MAX_EXITS_PER_CLUSTER*2)
+typedef struct   // per-cluster state: its clumps plus the shared pool holding their adjacency lists
+{
+   short num_clumps;
+   unsigned char num_edge_clumps;   // clump[0..num_edge_clumps) are the entries that get adjacency slots and global labels
+   unsigned char rebuild_adjacency; // when non-zero after adding connections, the adjacency lists are rebuilt from scratch
+   stbcc__clump clump[STBCC__MAX_CLUMPS_PER_CLUSTER];       // 8 * 2^9 = 4KB
+   stbcc__relative_clumpid adjacency_storage[STBCC__CLUSTER_ADJACENCY_COUNT]; // sliced per clump via adjacent_clump_list_index; 256 bytes
+} stbcc__cluster;
+
+struct st_stbcc_grid   // whole-map state; every array is sized for the compiled-in maximum grid
+{
+   int w,h,cw,ch;   // map size in squares (w,h) and in clusters (cw,ch)
+   int in_batched_update;   // non-zero between stbcc_update_batch_begin and stbcc_update_batch_end
+   //unsigned char cluster_dirty[STBCC__CLUSTER_COUNT_Y][STBCC__CLUSTER_COUNT_X]; // could bitpack, but: 1K x 1K => 1KB
+   unsigned char map[STBCC__GRID_COUNT_Y][STBCC__MAP_STRIDE]; // one bit per square, set = open; 1K x 1K => 1K x 128 => 128KB
+   stbcc__clumpid clump_for_node[STBCC__GRID_COUNT_Y][STBCC__GRID_COUNT_X];  // per-square clump index within its cluster, or STBCC__NULL_CLUMPID; 1K x 1K x 2 = 2MB
+   stbcc__cluster cluster[STBCC__CLUSTER_COUNT_Y][STBCC__CLUSTER_COUNT_X]; //  1K x 4.5KB = 4.5MB
+};
+
+// Reachability query: returns 1 when squares (x1,y1) and (x2,y2) carry the same
+// global component label, and 0 otherwise (including when either square has no
+// clump). Must not be called inside a batched update.
+int stbcc_query_grid_node_connection(stbcc_grid *g, int x1, int y1, int x2, int y2)
+{
+   stbcc__clumpid clump_a = g->clump_for_node[y1][x1];
+   stbcc__clumpid clump_b = g->clump_for_node[y2][x2];
+   stbcc__cluster *cluster_a, *cluster_b;
+
+   assert(!g->in_batched_update);
+   if (clump_a == STBCC__NULL_CLUMPID || clump_b == STBCC__NULL_CLUMPID)
+      return 0;
+
+   // find the cluster owning each square, then compare the packed labels
+   cluster_a = &g->cluster[STBCC__CLUSTER_Y_FOR_COORD_Y(y1)][STBCC__CLUSTER_X_FOR_COORD_X(x1)];
+   cluster_b = &g->cluster[STBCC__CLUSTER_Y_FOR_COORD_Y(y2)][STBCC__CLUSTER_X_FOR_COORD_X(x2)];
+   return cluster_a->clump[clump_a].global_label.c == cluster_b->clump[clump_b].global_label.c;
+}
+
+int stbcc_query_grid_open(stbcc_grid *g, int x, int y)
+{
+   return STBCC__MAP_OPEN(g, x, y) ? 1 : 0;   // 1 when the square's bit is set in the packed map (open), else 0
+}
+
+// Returns the packed global label of the component containing (x,y), or
+// STBCC_NULL_UNIQUE_ID when the square has no clump. Not valid inside a batch.
+unsigned int stbcc_get_unique_id(stbcc_grid *g, int x, int y)
+{
+   stbcc__clumpid clump = g->clump_for_node[y][x];
+   assert(!g->in_batched_update);
+   if (clump == STBCC__NULL_CLUMPID) return STBCC_NULL_UNIQUE_ID;
+   return g->cluster[STBCC__CLUSTER_Y_FOR_COORD_Y(y)][STBCC__CLUSTER_X_FOR_COORD_X(x)].clump[clump].global_label.c;
+}
+
+typedef struct
+{
+   unsigned char x,y;
+} stbcc__tinypoint;
+
+typedef struct
+{
+   stbcc__tinypoint parent[STBCC__CLUSTER_SIZE_Y][STBCC__CLUSTER_SIZE_X]; // 32x32 => 2KB
+   stbcc__clumpid   label[STBCC__CLUSTER_SIZE_Y][STBCC__CLUSTER_SIZE_X];
+} stbcc__cluster_build_info;
+
+static void stbcc__build_clumps_for_cluster(stbcc_grid *g, int cx, int cy);
+static void stbcc__remove_connections_to_adjacent_cluster(stbcc_grid *g, int cx, int cy, int dx, int dy);
+static void stbcc__add_connections_to_adjacent_cluster(stbcc_grid *g, int cx, int cy, int dx, int dy);
+
+// Union-find "find": returns the root label reachable from n, rewriting the
+// global_label of every clump visited on the way to point directly at that root.
+static stbcc__global_clumpid stbcc__clump_find(stbcc_grid *g, stbcc__global_clumpid n)
+{
+   stbcc__clump *node = &g->cluster[n.f.cluster_y][n.f.cluster_x].clump[n.f.clump_index];
+
+   // a clump labelled with its own id is a root; otherwise chase and compress
+   if (node->global_label.c != n.c)
+      node->global_label = stbcc__clump_find(g, node->global_label);
+
+   return node->global_label;
+}
+
+typedef struct
+{
+   unsigned int cluster_x;
+   unsigned int cluster_y;
+   unsigned int clump_index;
+} stbcc__unpacked_clumpid;
+
+// Union-find "union": joins the component of clump m with the component of clump
+// idx in cluster (x,y) by making m's root point at the other root.
+static void stbcc__clump_union(stbcc_grid *g, stbcc__unpacked_clumpid m, int x, int y, int idx)
+{
+   stbcc__global_clumpid root_m, root_n;
+
+   root_m = stbcc__clump_find(g, g->cluster[m.cluster_y][m.cluster_x].clump[m.clump_index].global_label);
+   root_n = stbcc__clump_find(g, g->cluster[y][x].clump[idx].global_label);
+
+   if (root_m.c != root_n.c)   // already in the same component otherwise
+      g->cluster[root_m.f.cluster_y][root_m.f.cluster_x].clump[root_m.f.clump_index].global_label = root_n;
+}
+
+// Rebuild the global union-find over edge clumps: (1) label every edge clump as its own
+// root, (2) union each edge clump with the clumps in its adjacency list, (3) run find on
+// every edge clump so that its global_label points directly at its root.
+static void stbcc__build_connected_components_for_clumps(stbcc_grid *g)
+{
+   int i,j,k,h;
+
+   // Visit only the g->cw x g->ch clusters that stbcc_init_grid builds; clusters past that
+   // (when the map is smaller than the compiled maximum) are never initialized.
+   for (j=0; j < g->ch; ++j) {
+      for (i=0; i < g->cw; ++i) {
+         stbcc__cluster *cluster = &g->cluster[j][i];
+         for (k=0; k < (int) cluster->num_edge_clumps; ++k) {
+            stbcc__global_clumpid m;
+            m.f.clump_index = k;
+            m.f.cluster_x = i;
+            m.f.cluster_y = j;
+            assert((int) m.f.clump_index == k && (int) m.f.cluster_x == i && (int) m.f.cluster_y == j);
+            cluster->clump[k].global_label = m;
+         }
+      }
+   }
+
+   for (j=0; j < g->ch; ++j) {
+      for (i=0; i < g->cw; ++i) {
+         stbcc__cluster *cluster = &g->cluster[j][i];
+         for (k=0; k < (int) cluster->num_edge_clumps; ++k) {
+            stbcc__clump *clump = &cluster->clump[k];
+            stbcc__relative_clumpid *adj = &cluster->adjacency_storage[clump->adjacent_clump_list_index];
+            stbcc__unpacked_clumpid m;
+            m.clump_index = k;
+            m.cluster_x = i;
+            m.cluster_y = j;
+            for (h=0; h < clump->num_adjacent; ++h)   // entries are relative to cluster (i,j)
+               stbcc__clump_union(g, m, adj[h].cluster_dx + i, adj[h].cluster_dy + j, adj[h].clump_index);
+         }
+      }
+   }
+
+   for (j=0; j < g->ch; ++j) {
+      for (i=0; i < g->cw; ++i) {
+         stbcc__cluster *cluster = &g->cluster[j][i];
+         for (k=0; k < (int) cluster->num_edge_clumps; ++k) {
+            stbcc__global_clumpid m;
+            m.f.clump_index = k;
+            m.f.cluster_x = i;
+            m.f.cluster_y = j;
+            stbcc__clump_find(g, m);
+         }
+      }
+   }
+}
+
+static void stbcc__build_all_connections_for_cluster(stbcc_grid *g, int cx, int cy)
+{
+   // Rebuild cluster (cx,cy)'s adjacency lists from scratch: count each clump's
+   // distinct neighbor clumps across the four borders, reserve that many slots
+   // (plus slack) in adjacency_storage, then re-add the connections. Being fully
+   // non-incremental, we can simply count the exact sizes before apportioning.
+   stbcc__cluster *cluster = &g->cluster[cy][cx];
+   unsigned char connected[STBCC__MAX_EDGE_CLUMPS_PER_CLUSTER][STBCC__MAX_EDGE_CLUMPS_PER_CLUSTER/8]; // bitmap of (src,dest) pairs already counted for the current side; 64 x 8 => 1KB
+   unsigned char num_adj[STBCC__MAX_CLUMPS_PER_CLUSTER] = { 0 };   // distinct connections counted per source clump
+   int x = cx * STBCC__CLUSTER_SIZE_X;   // grid coordinates of this cluster's first square
+   int y = cy * STBCC__CLUSTER_SIZE_Y;
+   int step_x, step_y=0, i, j, k, n, m, dx, dy, total;
+   int extra;
+
+   g->cluster[cy][cx].rebuild_adjacency = 0;
+
+   total = 0;
+   for (m=0; m < 4; ++m) {   // one pass per border of the cluster
+      switch (m) {
+         case 0:   // last column, facing the cluster at cx+1
+            dx = 1, dy = 0;
+            step_x = 0, step_y = 1;
+            i = STBCC__CLUSTER_SIZE_X-1;
+            j = 0;
+            n = STBCC__CLUSTER_SIZE_Y;
+            break;
+         case 1:   // first column, facing the cluster at cx-1
+            dx = -1, dy = 0;
+            i = 0;
+            j = 0;
+            step_x = 0;
+            step_y = 1;
+            n = STBCC__CLUSTER_SIZE_Y;
+            break;
+         case 2:   // first row, facing the cluster at cy-1
+            dy = -1, dx = 0;
+            i = 0;
+            j = 0;
+            step_x = 1;
+            step_y = 0;
+            n = STBCC__CLUSTER_SIZE_X;
+            break;
+         case 3:   // last row, facing the cluster at cy+1
+            dy = 1, dx = 0;
+            i = 0;
+            j = STBCC__CLUSTER_SIZE_Y-1;
+            step_x = 1;
+            step_y = 0;
+            n = STBCC__CLUSTER_SIZE_X;
+            break;
+      }
+
+      if (cx+dx < 0 || cx+dx >= g->cw || cy+dy < 0 || cy+dy >= g->ch)   // no cluster on this side of the map
+         continue;
+
+      memset(connected, 0, sizeof(connected));   // the pair bitmap is tracked per side
+      for (k=0; k < n; ++k) {
+         if (STBCC__MAP_OPEN(g, x+i, y+j) && STBCC__MAP_OPEN(g, x+i+dx, y+j+dy)) {   // both squares across the border are open
+            stbcc__clumpid src = g->clump_for_node[y+j][x+i];
+            stbcc__clumpid dest = g->clump_for_node[y+j+dy][x+i+dx];
+            if (0 == (connected[src][dest>>3] & (1 << (dest & 7)))) {   // count each (src,dest) pair once
+               connected[src][dest>>3] |= 1 << (dest & 7);
+               ++num_adj[src];
+               ++total;
+            }
+         }
+         i += step_x;
+         j += step_y;
+      }
+   }
+
+   assert(total <= STBCC__CLUSTER_ADJACENCY_COUNT);
+
+   // decide how to apportion unused adjacency slots; only clumps that lie
+   // on the edges of the cluster need adjacency slots, so divide them up
+   // evenly between those clumps
+
+   // we want:
+   //    extra = (STBCC__CLUSTER_ADJACENCY_COUNT - total) / cluster->num_edge_clumps;
+   // but we efficiently approximate this without a divide, because
+   // ignoring edge-vs-non-edge with 'num_adj[i]*2' was faster than
+   // 'num_adj[i]+extra' with the divide
+   if      (total + (cluster->num_edge_clumps<<2) <= STBCC__CLUSTER_ADJACENCY_COUNT)
+      extra = 4;
+   else if (total + (cluster->num_edge_clumps<<1) <= STBCC__CLUSTER_ADJACENCY_COUNT)
+      extra = 2;
+   else if (total + (cluster->num_edge_clumps<<0) <= STBCC__CLUSTER_ADJACENCY_COUNT)
+      extra = 1;
+   else
+      extra = 0;
+
+   total = 0;   // reused as the running offset into adjacency_storage
+   for (i=0; i < (int) cluster->num_edge_clumps; ++i) {
+      int alloc = num_adj[i]+extra;
+      if (alloc > STBCC__MAX_EXITS_PER_CLUSTER)
+         alloc = STBCC__MAX_EXITS_PER_CLUSTER;
+      assert(total < 256); // must fit in byte
+      cluster->clump[i].adjacent_clump_list_index = (unsigned char) total;
+      cluster->clump[i].max_adjacent = alloc;
+      cluster->clump[i].num_adjacent = 0;   // lists start empty and are filled by the calls below
+      total += alloc;
+   }
+   assert(total <= STBCC__CLUSTER_ADJACENCY_COUNT);
+
+   stbcc__add_connections_to_adjacent_cluster(g, cx, cy, -1, 0);
+   stbcc__add_connections_to_adjacent_cluster(g, cx, cy,  1, 0);
+   stbcc__add_connections_to_adjacent_cluster(g, cx, cy,  0,-1);
+   stbcc__add_connections_to_adjacent_cluster(g, cx, cy,  0, 1);
+   // make sure all of the above succeeded.
+   assert(g->cluster[cy][cx].rebuild_adjacency == 0);
+}
+
+// Bounds-checked wrapper: add cluster (cx,cy)'s connections toward (dx,dy), then rebuild
+// that cluster's adjacency lists from scratch if the add left rebuild_adjacency set.
+static void stbcc__add_connections_to_adjacent_cluster_with_rebuild(stbcc_grid *g, int cx, int cy, int dx, int dy)
+{
+   if (cx < 0 || cx >= g->cw || cy < 0 || cy >= g->ch) return;   // no such cluster on this map
+   stbcc__add_connections_to_adjacent_cluster(g, cx, cy, dx, dy);
+   if (g->cluster[cy][cx].rebuild_adjacency) stbcc__build_all_connections_for_cluster(g, cx, cy);
+}
+
+// Change square (x,y) to solid (solid != 0) or traversable (solid == 0) and bring the
+// clump/adjacency data up to date. Returns immediately when the square is already in
+// the requested state. The global component labels are rebuilt here unless a batched
+// update is in progress.
+void stbcc_update_grid(stbcc_grid *g, int x, int y, int solid)
+{
+   // offsets from this cluster to its four orthogonal neighbors, in visiting order
+   static const int offset[4][2] = { {-1,0}, {1,0}, {0,-1}, {0,1} };
+   int cx = STBCC__CLUSTER_X_FOR_COORD_X(x);
+   int cy = STBCC__CLUSTER_Y_FOR_COORD_Y(y);
+   int is_open = STBCC__MAP_OPEN(g,x,y) != 0;
+   int n;
+
+   if (is_open == !solid)
+      return;
+
+   // each neighbor drops its connections into this cluster; the direction passed is
+   // the neighbor-to-this-cluster direction, i.e. the negated offset
+   for (n=0; n < 4; ++n)
+      stbcc__remove_connections_to_adjacent_cluster(g, cx+offset[n][0], cy+offset[n][1], -offset[n][0], -offset[n][1]);
+
+   // flip the square's bit in the packed map (set bit = open)
+   if (solid)
+      STBCC__MAP_BYTE(g,x,y) &= ~STBCC__MAP_BYTE_MASK(x,y);
+   else
+      STBCC__MAP_BYTE(g,x,y) |= STBCC__MAP_BYTE_MASK(x,y);
+
+   // recompute this cluster's clumps, then its own adjacency lists
+   stbcc__build_clumps_for_cluster(g, cx, cy);
+   stbcc__build_all_connections_for_cluster(g, cx, cy);
+
+   // each neighbor re-adds its connections into the rebuilt cluster
+   for (n=0; n < 4; ++n)
+      stbcc__add_connections_to_adjacent_cluster_with_rebuild(g, cx+offset[n][0], cy+offset[n][1], -offset[n][0], -offset[n][1]);
+
+   // while batching, the global rebuild is left to stbcc_update_batch_end so it
+   // runs once for the whole batch
+   if (!g->in_batched_update)
+      stbcc__build_connected_components_for_clumps(g);
+}
+
+void stbcc_update_batch_begin(stbcc_grid *g)
+{
+   assert(!g->in_batched_update); // batches must not nest
+   g->in_batched_update = 1; // while set, stbcc_update_grid() skips the global component rebuild
+}
+
+void stbcc_update_batch_end(stbcc_grid *g)
+{
+   assert(g->in_batched_update); // must follow a matching stbcc_update_batch_begin()
+   g->in_batched_update =  0;
+   stbcc__build_connected_components_for_clumps(g); // @OPTIMIZE: only do this if update was non-empty
+}
+
+size_t stbcc_grid_sizeof(void)
+{
+   return sizeof(stbcc_grid); // size in bytes of one stbcc_grid object
+}
+
+void stbcc_init_grid(stbcc_grid *g, unsigned char *map, int w, int h)
+{
+   // Builds every derived structure of 'g' from a w*h byte map stored row by row,
+   // in which a zero byte is an open cell and any other value is solid.
+   int i,j,k;
+   assert(w % STBCC__CLUSTER_SIZE_X == 0);
+   assert(h % STBCC__CLUSTER_SIZE_Y == 0);
+   assert(w % 8 == 0);
+
+   g->in_batched_update = 0;
+   g->w = w;
+   g->h = h;
+   g->cw = w >> STBCC_CLUSTER_SIZE_X_LOG2;
+   g->ch = h >> STBCC_CLUSTER_SIZE_Y_LOG2;
+
+   // pack eight map bytes into one byte of g->map, lowest bit first; a set bit means open
+   for (j=0; j < h; ++j) {
+      const unsigned char *src_row = map + j*w;
+      for (i=0; i < w; i += 8) {
+         unsigned char packed = 0;
+         for (k=0; k < 8; ++k)
+            if (src_row[i+k] == 0)
+               packed |= (1 << k);
+         g->map[j][i>>3] = packed;
+      }
+   }
+
+   // pass 1: label the clumps inside each cluster
+   for (j=0; j < g->ch; ++j)
+      for (i=0; i < g->cw; ++i)
+         stbcc__build_clumps_for_cluster(g, i, j);
+
+   // pass 2: link clumps across cluster borders (reads the labels written by pass 1)
+   for (j=0; j < g->ch; ++j)
+      for (i=0; i < g->cw; ++i)
+         stbcc__build_all_connections_for_cluster(g, i, j);
+
+   stbcc__build_connected_components_for_clumps(g);
+
+   for (j=0; j < g->h; ++j) // sanity check: every node holds a clump id no larger than the null id
+      for (i=0; i < g->w; ++i)
+         assert(g->clump_for_node[j][i] <= STBCC__NULL_CLUMPID);
+}
+
+
+static void stbcc__add_clump_connection(stbcc_grid *g, int x1, int y1, int x2, int y2)
+{
+   // Records, in the clump that owns node (x1,y1), a link to the clump that owns
+   // node (x2,y2) in the adjacent cluster. If the source clump's adjacency list is
+   // already full, the source cluster is flagged for a full adjacency rebuild instead.
+   int cx1 = STBCC__CLUSTER_X_FOR_COORD_X(x1);
+   int cy1 = STBCC__CLUSTER_Y_FOR_COORD_Y(y1);
+   int cx2 = STBCC__CLUSTER_X_FOR_COORD_X(x2);
+   int cy2 = STBCC__CLUSTER_Y_FOR_COORD_Y(y2);
+
+   stbcc__cluster *cluster = &g->cluster[cy1][cx1];
+   stbcc__clump *clump = &cluster->clump[g->clump_for_node[y1][x1]];
+   stbcc__relative_clumpid *adj;
+   stbcc__relative_clumpid rc;
+
+   // the two nodes must sit in different, directly neighbouring clusters
+   assert(cx1 != cx2 || cy1 != cy2);
+   assert(abs(cx1-cx2) + abs(cy1-cy2) == 1);
+
+   // describe the destination: its clump id plus the node offset (x2-x1, y2-y1)
+   rc.clump_index = g->clump_for_node[y2][x2];
+   rc.cluster_dx = x2-x1;
+   rc.cluster_dy = y2-y1;
+
+   assert(clump->num_adjacent <= clump->max_adjacent);
+   if (clump->num_adjacent == clump->max_adjacent) {
+      // no free slot left: request a rebuild rather than overflow the list
+      cluster->rebuild_adjacency = 1;
+      return;
+   }
+
+   adj = &cluster->adjacency_storage[clump->adjacent_clump_list_index];
+   assert(clump->num_adjacent < STBCC__MAX_EXITS_PER_CLUMP);
+   assert(clump->adjacent_clump_list_index + clump->num_adjacent <= STBCC__CLUSTER_ADJACENCY_COUNT);
+   adj[clump->num_adjacent++] = rc;
+}
+
+static void stbcc__remove_clump_connection(stbcc_grid *g, int x1, int y1, int x2, int y2)
+{
+   stbcc__cluster *cluster;
+   stbcc__clump *clump;
+   stbcc__relative_clumpid *adj;
+   int i;
+
+   int cx1 = STBCC__CLUSTER_X_FOR_COORD_X(x1);
+   int cy1 = STBCC__CLUSTER_Y_FOR_COORD_Y(y1);
+   int cx2 = STBCC__CLUSTER_X_FOR_COORD_X(x2);
+   int cy2 = STBCC__CLUSTER_Y_FOR_COORD_Y(y2);
+
+   stbcc__clumpid c1 = g->clump_for_node[y1][x1];
+   stbcc__clumpid c2 = g->clump_for_node[y2][x2];
+
+   stbcc__relative_clumpid rc;
+
+   assert(cx1 != cx2 || cy1 != cy2);
+   assert(abs(cx1-cx2) + abs(cy1-cy2) == 1);
+
+   // remove the connection to c2 from c1's adjacency list
+
+   rc.clump_index = c2;
+   rc.cluster_dx = x2-x1;
+   rc.cluster_dy = y2-y1;
+
+   cluster = &g->cluster[cy1][cx1];
+   clump = &cluster->clump[c1];
+   adj = &cluster->adjacency_storage[clump->adjacent_clump_list_index];
+
+   for (i=0; i < clump->num_adjacent; ++i) // linear search for the entry equal to rc
+      if (rc.clump_index == adj[i].clump_index &&
+          rc.cluster_dx  == adj[i].cluster_dx  &&
+          rc.cluster_dy  == adj[i].cluster_dy)
+         break;
+
+   if (i < clump->num_adjacent)
+      adj[i] = adj[--clump->num_adjacent]; // unordered delete: move the last entry into the hole
+   else
+      assert(0); // the connection being removed is expected to exist
+}
+
+static void stbcc__add_connections_to_adjacent_cluster(stbcc_grid *g, int cx, int cy, int dx, int dy)
+{
+   unsigned char connected[STBCC__MAX_EDGE_CLUMPS_PER_CLUSTER][STBCC__MAX_EDGE_CLUMPS_PER_CLUSTER/8] = { { 0 } }; // bitset: (src,dest) clump pairs already linked
+   int x = cx * STBCC__CLUSTER_SIZE_X;
+   int y = cy * STBCC__CLUSTER_SIZE_Y;
+   int step_x, step_y=0, i, j, k, n;
+   // adds to cluster (cx,cy) one link per distinct clump pair that touches across the border with cluster (cx+dx,cy+dy)
+   if (cx < 0 || cx >= g->cw || cy < 0 || cy >= g->ch)
+      return;
+
+   if (cx+dx < 0 || cx+dx >= g->cw || cy+dy < 0 || cy+dy >= g->ch)
+      return;
+
+   if (g->cluster[cy][cx].rebuild_adjacency) // adjacency data is flagged as needing a full rebuild
+      return;
+
+   assert(abs(dx) + abs(dy) == 1);
+
+   if (dx == 1) { // neighbour at x+1: scan this cluster's last column
+      i = STBCC__CLUSTER_SIZE_X-1;
+      j = 0;
+      step_x = 0;
+      step_y = 1;
+      n = STBCC__CLUSTER_SIZE_Y;
+   } else if (dx == -1) { // neighbour at x-1: scan the first column
+      i = 0;
+      j = 0;
+      step_x = 0;
+      step_y = 1;
+      n = STBCC__CLUSTER_SIZE_Y;
+   } else if (dy == -1) { // neighbour at y-1: scan the first row
+      i = 0;
+      j = 0;
+      step_x = 1;
+      step_y = 0;
+      n = STBCC__CLUSTER_SIZE_X;
+   } else if (dy == 1) { // neighbour at y+1: scan the last row
+      i = 0;
+      j = STBCC__CLUSTER_SIZE_Y-1;
+      step_x = 1;
+      step_y = 0;
+      n = STBCC__CLUSTER_SIZE_X;
+   } else {
+      assert(0);
+      return;
+   }
+
+   for (k=0; k < n; ++k) {
+      if (STBCC__MAP_OPEN(g, x+i, y+j) && STBCC__MAP_OPEN(g, x+i+dx, y+j+dy)) {
+         stbcc__clumpid src = g->clump_for_node[y+j][x+i];
+         stbcc__clumpid dest = g->clump_for_node[y+j+dy][x+i+dx];
+         assert(src < STBCC__MAX_EDGE_CLUMPS_PER_CLUSTER && (dest>>3) < sizeof(connected[0])); // validate both indices before touching the bitset
+         if (0 == (connected[src][dest>>3] & (1 << (dest & 7)))) {
+            connected[src][dest>>3] |= 1 << (dest & 7);
+            stbcc__add_clump_connection(g, x+i, y+j, x+i+dx, y+j+dy);
+            if (g->cluster[cy][cx].rebuild_adjacency) // the add found the adjacency list full: stop here
+               break;
+         }
+      }
+      i += step_x;
+      j += step_y;
+   }
+}
+
+static void stbcc__remove_connections_to_adjacent_cluster(stbcc_grid *g, int cx, int cy, int dx, int dy)
+{
+   unsigned char disconnected[STBCC__MAX_EDGE_CLUMPS_PER_CLUSTER][STBCC__MAX_EDGE_CLUMPS_PER_CLUSTER/8] = { { 0 } }; // bitset: (src,dest) clump pairs already removed
+   int x = cx * STBCC__CLUSTER_SIZE_X;
+   int y = cy * STBCC__CLUSTER_SIZE_Y;
+   int step_x, step_y=0, i, j, k, n;
+   // removes from cluster (cx,cy) one link per distinct clump pair that touches across the border with cluster (cx+dx,cy+dy)
+   if (cx < 0 || cx >= g->cw || cy < 0 || cy >= g->ch)
+      return;
+
+   if (cx+dx < 0 || cx+dx >= g->cw || cy+dy < 0 || cy+dy >= g->ch)
+      return;
+
+   assert(abs(dx) + abs(dy) == 1);
+
+   if (dx == 1) { // neighbour at x+1: scan this cluster's last column
+      i = STBCC__CLUSTER_SIZE_X-1;
+      j = 0;
+      step_x = 0;
+      step_y = 1;
+      n = STBCC__CLUSTER_SIZE_Y;
+   } else if (dx == -1) { // neighbour at x-1: scan the first column
+      i = 0;
+      j = 0;
+      step_x = 0;
+      step_y = 1;
+      n = STBCC__CLUSTER_SIZE_Y;
+   } else if (dy == -1) { // neighbour at y-1: scan the first row
+      i = 0;
+      j = 0;
+      step_x = 1;
+      step_y = 0;
+      n = STBCC__CLUSTER_SIZE_X;
+   } else if (dy == 1) { // neighbour at y+1: scan the last row
+      i = 0;
+      j = STBCC__CLUSTER_SIZE_Y-1;
+      step_x = 1;
+      step_y = 0;
+      n = STBCC__CLUSTER_SIZE_X;
+   } else {
+      assert(0);
+      return;
+   }
+
+   for (k=0; k < n; ++k) {
+      if (STBCC__MAP_OPEN(g, x+i, y+j) && STBCC__MAP_OPEN(g, x+i+dx, y+j+dy)) {
+         stbcc__clumpid src = g->clump_for_node[y+j][x+i];
+         stbcc__clumpid dest = g->clump_for_node[y+j+dy][x+i+dx];
+         if (0 == (disconnected[src][dest>>3] & (1 << (dest & 7)))) { // first time this pair is seen on this border
+            disconnected[src][dest>>3] |= 1 << (dest & 7);
+            stbcc__remove_clump_connection(g, x+i, y+j, x+i+dx, y+j+dy);
+         }
+      }
+      i += step_x;
+      j += step_y;
+   }
+}
+
+static stbcc__tinypoint stbcc__incluster_find(stbcc__cluster_build_info *cbi, int x, int y)
+{
+   // disjoint-set lookup: returns the root of (x,y)'s set, compressing the path on the way back
+   stbcc__tinypoint root = cbi->parent[y][x];
+   if (root.x != x || root.y != y) {
+      root = stbcc__incluster_find(cbi, root.x, root.y);
+      cbi->parent[y][x] = root; // point this node directly at the root
+   }
+   return root;
+}
+
+static void stbcc__incluster_union(stbcc__cluster_build_info *cbi, int x1, int y1, int x2, int y2)
+{
+   // merge the set containing (x1,y1) into the set containing (x2,y2)
+   stbcc__tinypoint root1 = stbcc__incluster_find(cbi, x1,y1);
+   stbcc__tinypoint root2 = stbcc__incluster_find(cbi, x2,y2);
+
+   // distinct roots: hang the first root under the second
+   if (root1.x != root2.x || root1.y != root2.y)
+      cbi->parent[root1.y][root1.x] = root2;
+}
+
+static void stbcc__switch_root(stbcc__cluster_build_info *cbi, int x, int y, stbcc__tinypoint p)
+{
+   // make (x,y) the root of its set: re-parent node p to it, then make (x,y) its own parent
+   stbcc__tinypoint *old_root = &cbi->parent[p.y][p.x];
+   stbcc__tinypoint *new_root = &cbi->parent[y][x];
+   old_root->x = new_root->x = x;
+   old_root->y = new_root->y = y;
+}
+
+static void stbcc__build_clumps_for_cluster(stbcc_grid *g, int cx, int cy)
+{ // relabels the clumps (sets of connected open cells) of cluster (cx,cy); clumps touching the edge get the lowest ids
+   stbcc__cluster *c;
+   stbcc__cluster_build_info cbi;
+   int label=0;
+   int i,j;
+   int x = cx * STBCC__CLUSTER_SIZE_X;
+   int y = cy * STBCC__CLUSTER_SIZE_Y;
+   // (x,y) is the grid coordinate of the cluster's first cell; (i,j) below are offsets inside the cluster
+   // set initial disjoint set forest state
+   for (j=0; j < STBCC__CLUSTER_SIZE_Y; ++j) {
+      for (i=0; i < STBCC__CLUSTER_SIZE_X; ++i) {
+         cbi.parent[j][i].x = i;
+         cbi.parent[j][i].y = j;
+      }
+   }
+
+   // join all sets that are connected
+   for (j=0; j < STBCC__CLUSTER_SIZE_Y; ++j) {
+      // check down only if not on bottom row
+      if (j < STBCC__CLUSTER_SIZE_Y-1)
+         for (i=0; i < STBCC__CLUSTER_SIZE_X; ++i)
+            if (STBCC__MAP_OPEN(g,x+i,y+j) && STBCC__MAP_OPEN(g,x+i  ,y+j+1))
+               stbcc__incluster_union(&cbi, i,j, i,j+1);
+      // check right for everything but rightmost column
+      for (i=0; i < STBCC__CLUSTER_SIZE_X-1; ++i)
+         if (STBCC__MAP_OPEN(g,x+i,y+j) && STBCC__MAP_OPEN(g,x+i+1,y+j  ))
+            stbcc__incluster_union(&cbi, i,j, i+1,j);
+   }
+
+   // label all non-empty clumps along edges so that all edge clumps are first
+   // in list; this means in degenerate case we can skip traversing non-edge clumps.
+   // because in the first pass we only label leaders, we swap the leader to the
+   // edge first
+
+   // first put solid labels on all the edges; these will get overwritten if they're open
+   for (j=0; j < STBCC__CLUSTER_SIZE_Y; ++j)
+      cbi.label[j][0] = cbi.label[j][STBCC__CLUSTER_SIZE_X-1] = STBCC__NULL_CLUMPID;
+   for (i=0; i < STBCC__CLUSTER_SIZE_X; ++i)
+      cbi.label[0][i] = cbi.label[STBCC__CLUSTER_SIZE_Y-1][i] = STBCC__NULL_CLUMPID;
+
+   for (j=0; j < STBCC__CLUSTER_SIZE_Y; ++j) { // first and last columns, every row (covers the corners)
+      i = 0;
+      if (STBCC__MAP_OPEN(g, x+i, y+j)) {
+         stbcc__tinypoint p = stbcc__incluster_find(&cbi, i,j);
+         if (p.x == i && p.y == j)
+            // if this is the leader, give it a label
+            cbi.label[j][i] = label++;
+         else if (!(p.x == 0 || p.x == STBCC__CLUSTER_SIZE_X-1 || p.y == 0 || p.y == STBCC__CLUSTER_SIZE_Y-1)) {
+            // if leader is in interior, promote this edge node to leader and label
+            stbcc__switch_root(&cbi, i, j, p);
+            cbi.label[j][i] = label++;
+         }
+         // else if leader is on edge, do nothing (it'll get labelled when we reach it)
+      }
+      i = STBCC__CLUSTER_SIZE_X-1;
+      if (STBCC__MAP_OPEN(g, x+i, y+j)) {
+         stbcc__tinypoint p = stbcc__incluster_find(&cbi, i,j);
+         if (p.x == i && p.y == j)
+            cbi.label[j][i] = label++;
+         else if (!(p.x == 0 || p.x == STBCC__CLUSTER_SIZE_X-1 || p.y == 0 || p.y == STBCC__CLUSTER_SIZE_Y-1)) {
+            stbcc__switch_root(&cbi, i, j, p);
+            cbi.label[j][i] = label++;
+         }
+      }
+   }
+
+   for (i=1; i < STBCC__CLUSTER_SIZE_X-1; ++i) { // first and last rows run along x, so the bound is the cluster width (corners done above)
+      j = 0;
+      if (STBCC__MAP_OPEN(g, x+i, y+j)) {
+         stbcc__tinypoint p = stbcc__incluster_find(&cbi, i,j);
+         if (p.x == i && p.y == j)
+            cbi.label[j][i] = label++;
+         else if (!(p.x == 0 || p.x == STBCC__CLUSTER_SIZE_X-1 || p.y == 0 || p.y == STBCC__CLUSTER_SIZE_Y-1)) {
+            stbcc__switch_root(&cbi, i, j, p);
+            cbi.label[j][i] = label++;
+         }
+      }
+      j = STBCC__CLUSTER_SIZE_Y-1;
+      if (STBCC__MAP_OPEN(g, x+i, y+j)) {
+         stbcc__tinypoint p = stbcc__incluster_find(&cbi, i,j);
+         if (p.x == i && p.y == j)
+            cbi.label[j][i] = label++;
+         else if (!(p.x == 0 || p.x == STBCC__CLUSTER_SIZE_X-1 || p.y == 0 || p.y == STBCC__CLUSTER_SIZE_Y-1)) {
+            stbcc__switch_root(&cbi, i, j, p);
+            cbi.label[j][i] = label++;
+         }
+      }
+   }
+
+   c = &g->cluster[cy][cx];
+   c->num_edge_clumps = label;
+
+   // label any interior clumps (open leaders that are not on the cluster edge)
+   for (j=1; j < STBCC__CLUSTER_SIZE_Y-1; ++j) {
+      for (i=1; i < STBCC__CLUSTER_SIZE_X-1; ++i) {
+         stbcc__tinypoint p = cbi.parent[j][i];
+         if (p.x == i && p.y == j) {
+            if (STBCC__MAP_OPEN(g,x+i,y+j))
+               cbi.label[j][i] = label++;
+            else
+               cbi.label[j][i] = STBCC__NULL_CLUMPID;
+         }
+      }
+   }
+
+   // label all other nodes
+   for (j=0; j < STBCC__CLUSTER_SIZE_Y; ++j) {
+      for (i=0; i < STBCC__CLUSTER_SIZE_X; ++i) {
+         stbcc__tinypoint p = stbcc__incluster_find(&cbi, i,j);
+         if (p.x != i || p.y != j) {
+            if (STBCC__MAP_OPEN(g,x+i,y+j))
+               cbi.label[j][i] = cbi.label[p.y][p.x];
+         }
+         if (STBCC__MAP_OPEN(g,x+i,y+j))
+            assert(cbi.label[j][i] != STBCC__NULL_CLUMPID);
+      }
+   }
+
+   c->num_clumps = label;
+
+   for (i=0; i < label; ++i) {
+      c->clump[i].num_adjacent = 0;
+      c->clump[i].max_adjacent = 0;
+   }
+
+   for (j=0; j < STBCC__CLUSTER_SIZE_Y; ++j)
+      for (i=0; i < STBCC__CLUSTER_SIZE_X; ++i) {
+         g->clump_for_node[y+j][x+i] = cbi.label[j][i]; // @OPTIMIZE: remove cbi.label entirely
+         assert(g->clump_for_node[y+j][x+i] <= STBCC__NULL_CLUMPID);
+      }
+
+   // set the global label for all interior clumps since they can't have connections,
+   // so we don't have to do this on the global pass (brings from O(N) to O(N^0.75))
+   for (i=(int) c->num_edge_clumps; i < (int) c->num_clumps; ++i) {
+      stbcc__global_clumpid gc;
+      gc.f.cluster_x = cx;
+      gc.f.cluster_y = cy;
+      gc.f.clump_index = i;
+      c->clump[i].global_label = gc;
+   }
+
+   c->rebuild_adjacency = 1; // flag that it has no valid adjacency data
+}
+
+#endif // STB_CONNECTED_COMPONENTS_IMPLEMENTATION
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_divide.h b/lib/stb/stb_divide.h
new file mode 100644
index 0000000..6a51e3f
--- /dev/null
+++ b/lib/stb/stb_divide.h
@@ -0,0 +1,433 @@
+// stb_divide.h - v0.94 - public domain - Sean Barrett, Feb 2010
+// Three kinds of divide/modulus of signed integers.
+//
+// HISTORY
+//
+//   v0.94              Fix integer overflow issues
+//   v0.93  2020-02-02  Write useful exit() value from main()
+//   v0.92  2019-02-25  Fix warning
+//   v0.91  2010-02-27  Fix euclidean division by INT_MIN for non-truncating C
+//                      Check result with 64-bit math to catch such cases
+//   v0.90  2010-02-24  First public release
+//
+// USAGE
+//
+// In *ONE* source file, put:
+//
+//    #define STB_DIVIDE_IMPLEMENTATION
+//    // #define C_INTEGER_DIVISION_TRUNCATES  // see Note 1
+//    // #define C_INTEGER_DIVISION_FLOORS     // see Note 2
+//    #include "stb_divide.h"
+//
+// Other source files should just include stb_divide.h
+//
+// Note 1: On platforms/compilers that you know signed C division
+// truncates, you can #define C_INTEGER_DIVISION_TRUNCATES.
+//
+// Note 2: On platforms/compilers that you know signed C division
+// floors (rounds to negative infinity), you can #define
+// C_INTEGER_DIVISION_FLOORS.
+//
+// You can #define STB_DIVIDE_TEST in which case the implementation
+// will generate a main() and compiling the result will create a
+// program that tests the implementation. Run it with no arguments
+// and any output indicates an error; run it with any argument and
+// it will also print the test results. Define STB_DIVIDE_TEST_64
+// to a 64-bit integer type to avoid overflows in the result-checking
+// which give false negatives.
+//
+// ABOUT
+//
+// This file provides three different consistent divide/mod pairs
+// implemented on top of arbitrary C/C++ division, including correct
+// handling of overflow of intermediate calculations:
+//
+//     trunc:   a/b truncates to 0,           a%b has same sign as a
+//     floor:   a/b truncates to -inf,        a%b has same sign as b
+//     eucl:    a/b truncates to -sign(b)*inf, a%b is non-negative
+//
+// Not necessarily optimal; I tried to keep it generally efficient,
+// but there may be better ways.
+//
+// Briefly, for those who are not familiar with the problem, we note
+// the reason these divides exist and are interesting:
+//
+//     'trunc' is easy to implement in hardware (strip the signs,
+//          compute, reapply the signs), thus is commonly defined
+//          by many languages (including C99)
+//
+//     'floor' is simple to define and better behaved than trunc;
+//          for example it divides integers into fixed-size buckets
+//          without an extra-wide bucket at 0, and for a fixed
+//          divisor N there are only |N| possible moduli.
+//
+//     'eucl' guarantees fixed-sized buckets *and* a non-negative
+//          modulus and defines division to be whatever is needed
+//          to achieve that result.
+//
+// See "The Euclidean definition of the functions div and mod"
+// by Raymond Boute (1992), or "Division and Modulus for Computer
+// Scientists" by Daan Leijen (2001)
+//
+// We assume of the built-in C division:
+//     (a) modulus is the remainder for the corresponding division
+//     (b) a/b truncates if a and b are the same sign
+//
+// Property (a) requires (a/b)*b + (a%b)==a, and is required by C.
+// Property (b) seems to be true of all hardware but is *not* satisfied
+// by the euclidean division operator we define, so it's possibly not
+// always true. If any such platform turns up, we can add more cases.
+// (Possibly only stb_div_trunc currently relies on property (b).)
+//
+// LICENSE
+//
+//   See end of file for license information.
+
+
+#ifndef INCLUDE_STB_DIVIDE_H
+#define INCLUDE_STB_DIVIDE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int stb_div_trunc(int value_to_be_divided, int value_to_divide_by); // quotient rounded toward zero
+extern int stb_div_floor(int value_to_be_divided, int value_to_divide_by); // quotient rounded toward negative infinity
+extern int stb_div_eucl (int value_to_be_divided, int value_to_divide_by); // quotient that pairs with a non-negative remainder
+extern int stb_mod_trunc(int value_to_be_divided, int value_to_divide_by); // remainder with the sign of the dividend (or zero)
+extern int stb_mod_floor(int value_to_be_divided, int value_to_divide_by); // remainder with the sign of the divisor (or zero)
+extern int stb_mod_eucl (int value_to_be_divided, int value_to_divide_by); // remainder that is never negative
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef STB_DIVIDE_IMPLEMENTATION
+
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99 and later define signed division as truncating
+   #ifndef C_INTEGER_DIVISION_TRUNCATES
+      #define C_INTEGER_DIVISION_TRUNCATES
+   #endif
+#endif
+
+#ifndef INT_MIN
+#include <limits.h> // if you have no limits.h, #define INT_MIN yourself
+#endif
+
+// the following macros are designed to allow testing
+// other platforms by simulating them
+#ifndef STB_DIVIDE_TEST_FLOOR
+   // normal build: use the compiler's native operators directly
+   #define stb__div(a,b)  ((a)/(b))
+   #define stb__mod(a,b)  ((a)%(b))
+#else
+   // test build: simulate a platform whose / and % floor, on top of truncating division
+   #ifndef C_INTEGER_DIVISION_TRUNCATES
+   #error "floor test requires truncating division"
+   #endif
+   #undef C_INTEGER_DIVISION_TRUNCATES
+   int stb__div(int v1, int v2)
+   {
+      int quot = v1/v2;
+      int rem = v1%v2;
+      // a nonzero remainder whose sign differs from the divisor's means truncation rounded up
+      int opposite_signs = (rem > 0 && v2 < 0) || (rem < 0 && v2 > 0);
+      return opposite_signs ? quot-1 : quot;
+   }
+
+   int stb__mod(int v1, int v2)
+   {
+      int rem = v1%v2;
+      if ((rem < 0 && v2 > 0) || (rem > 0 && v2 < 0))
+         return rem+v2; // shift the remainder over to the divisor's sign
+      return rem;
+   }
+#endif
+
+int stb_div_trunc(int v1, int v2)
+{
+   #ifdef C_INTEGER_DIVISION_TRUNCATES
+   return v1/v2;
+   #else
+   // Mixed signs: negate one operand so the native divide sees two non-positive
+   // values (assumed to truncate), then negate the quotient back.
+   if (v1 >= 0 && v2 <= 0)
+      return -stb__div(-v1,v2);         // both negative to avoid overflow
+   if (v1 > 0 || v2 < 0)
+      return v1/v2;                     // same sign, so expect truncation
+   if (v1 == INT_MIN)
+      return -stb__div(v1+v2,-v2)-1;    // push v1 away from wrap point
+   return -stb__div(v1,-v2);            // both negative to avoid overflow
+   #endif
+}
+
+int stb_div_floor(int v1, int v2)
+{ // returns v1/v2 rounded toward negative infinity
+   #ifdef C_INTEGER_DIVISION_FLOORS
+   return v1/v2;
+   #else
+   if (v1 >= 0 && v2 < 0) { // mixed signs, negative divisor
+      if (v2 + 1 >= INT_MIN + v1) // check if increasing v1's magnitude overflows
+         return -stb__div((v2+1)-v1,v2); // nope, so just compute it
+      else
+         return -stb__div(-v1,v2) + ((-v1)%v2 ? -1 : 0); // an inexact quotient steps one further down
+   }
+   if (v1 < 0 && v2 >= 0) { // mixed signs, negative dividend
+      if (v1 != INT_MIN) {
+         if (v1 + 1 >= INT_MIN + v2) // check if increasing v1's magnitude overflows
+            return -stb__div((v1+1)-v2,-v2); // nope, so just compute it
+         else
+            return -stb__div(-v1,v2) + (stb__mod(v1,-v2) ? -1 : 0); // an inexact quotient steps one further down
+      } else // it must be possible to compute -(v1+v2) without overflowing
+         return -stb__div(-(v1+v2),v2) + (stb__mod(-(v1+v2),v2) ? -2 : -1);
+   } else
+      return v1/v2;           // same sign, so expect truncation
+   #endif
+}
+
+int stb_div_eucl(int v1, int v2)
+{ // returns the quotient q for which v1 - q*v2 is non-negative
+   int q,r;
+   #ifdef C_INTEGER_DIVISION_TRUNCATES
+   q = v1/v2;
+   r = v1%v2;
+   #else
+   // handle every quadrant separately, since we can't rely on how the native q and r round
+   if (v1 >= 0)
+      if (v2 >= 0)
+         return stb__div(v1,v2);
+      else if (v2 != INT_MIN)
+         q = -stb__div(v1,-v2), r = stb__mod(v1,-v2);
+      else
+         q = 0, r = v1;
+   else if (v1 != INT_MIN)
+      if (v2 >= 0)
+         q = -stb__div(-v1,v2), r = -stb__mod(-v1,v2);
+      else if (v2 != INT_MIN)
+         q = stb__div(-v1,-v2), r = -stb__mod(-v1,-v2);
+      else // if v2 is INT_MIN, then we can't use -v2, but we can't divide by v2
+         q = 1, r = v1-q*v2;
+   else // if v1 is INT_MIN, we have to move away from overflow place
+      if (v2 >= 0)
+         q = -stb__div(-(v1+v2),v2)-1, r = -stb__mod(-(v1+v2),v2);
+      else if (v2 != INT_MIN)
+         q = stb__div(-(v1-v2),-v2)+1, r = -stb__mod(-(v1-v2),-v2);
+      else // for INT_MIN / INT_MIN, we need to be extra-careful to avoid overflow
+         q = 1, r = 0;
+   #endif
+   if (r >= 0) // q and r are a truncating pair at this point; r >= 0 means q is already the answer
+      return q;
+   else
+      return q + (v2 > 0 ? -1 : 1); // negative remainder: step q one away from zero on v2's side
+}
+
+int stb_mod_trunc(int v1, int v2)
+{
+   #ifdef C_INTEGER_DIVISION_TRUNCATES
+   return v1%v2;
+   #else
+   // The truncating remainder takes the sign of the dividend v1. Start from the
+   // native remainder and, if its sign is wrong, shift it by |v2|. The shift is
+   // written in terms of -|v2| because that is computable even for v2 == INT_MIN.
+   int rem = stb__mod(v1,v2);
+   int neg_abs_v2 = (v2 < 0 ? v2 : -v2);
+
+   if (v1 >= 0 && rem < 0)
+      return rem - neg_abs_v2;   // modulus result should always be positive
+   if (v1 < 0 && rem > 0)
+      return rem + neg_abs_v2;   // modulus result should always be negative
+
+   // the native remainder already has the right sign (or is zero)
+   return rem;
+   #endif
+}
+
+int stb_mod_floor(int v1, int v2)
+{
+   #ifdef C_INTEGER_DIVISION_FLOORS
+   return v1%v2;
+   #else
+   // The flooring remainder takes the sign of the divisor v2. Start from the
+   // native remainder of v1 by v2; when it is nonzero and its sign is the
+   // opposite of v2's, adding v2 once moves it to the correct side of zero.
+   // Both divisor signs apply the same correction, only the test differs.
+   int rem = stb__mod(v1,v2);
+   int wrong_sign;
+
+   if (v2 >= 0)
+      wrong_sign = (rem < 0);   // result should always be positive
+   else
+      wrong_sign = (rem > 0);   // result should always be negative
+
+   return wrong_sign ? rem + v2 : rem;
+   #endif
+}
+
+int stb_mod_eucl(int v1, int v2)
+{
+   // The Euclidean remainder is never negative: lift a negative native remainder by |v2|.
+   int rem = stb__mod(v1,v2);
+   int neg_abs_v2 = (v2 < 0 ? v2 : -v2); // negative abs() [to avoid overflow]
+   if (rem < 0)
+      rem -= neg_abs_v2;
+   return rem;
+}
+
+#ifdef STB_DIVIDE_TEST
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+
+int show=0; // nonzero: print every computed result (set by main() when any argument is given)
+int err=0; // number of failed checks, incremented by stbdiv_check()
+
+void stbdiv_check(int q, int r, int a, int b, const char *type, int dir)
+{ // checks one quotient/remainder pair for a/b; each failure is printed to stderr and counted in 'err'
+   if ((dir > 0 && r < 0) || (dir < 0 && r > 0)) { // dir's sign is the only sign the remainder may have
+      fprintf(stderr, "FAILED: %s(%d,%d) remainder %d in wrong direction\n", type,a,b,r);
+      err++;
+   } else if (b != INT_MIN) { // can't compute abs(), but if b==INT_MIN all remainders are valid
+      if (r <= (b < 0 ? b : -b) || r >= (b < 0 ? -b : b)) { // |r| >= |b|, spelled without abs() since <stdlib.h> is not included
+         fprintf(stderr, "FAILED: %s(%d,%d) remainder %d out of range\n", type,a,b,r);
+         err++;
+      }
+   }
+   #ifdef STB_DIVIDE_TEST_64
+   {
+      STB_DIVIDE_TEST_64 q64 = q, r64=r, a64=a, b64=b;
+      if (q64*b64+r64 != a64) {
+         fprintf(stderr, "FAILED: %s(%d,%d) remainder %d doesn't match quotient %d\n", type,a,b,r,q);
+         err++;
+      }
+   }
+   #else
+   if (q*b+r != a) {
+      fprintf(stderr, "FAILED: %s(%d,%d) remainder %d doesn't match quotient %d\n", type,a,b,r,q);
+      err++;
+   }
+   #endif
+}
+
+void test(int a, int b)
+{
+   int q,r;
+   if (show) printf("(%+11d,%+d) |  ", a,b);
+   q = stb_div_trunc(a,b), r = stb_mod_trunc(a,b);
+   if (show) printf("(%+11d,%+2d)  ", q,r); stbdiv_check(q,r,a,b, "trunc",a); // the check runs even when show==0; remainder must follow a's sign
+   q = stb_div_floor(a,b), r = stb_mod_floor(a,b);
+   if (show) printf("(%+11d,%+2d)  ", q,r); stbdiv_check(q,r,a,b, "floor",b); // remainder must follow b's sign
+   q = stb_div_eucl (a,b), r = stb_mod_eucl (a,b);
+   if (show) printf("(%+11d,%+2d)\n", q,r); stbdiv_check(q,r,a,b, "euclidean",1); // remainder must be non-negative
+}
+
+void testh(int a, int b)
+{
+   int q,r;
+   if (show) printf("(%08x,%08x) |\n", a,b); // same checks as test(), but values are printed in hex
+   q = stb_div_trunc(a,b), r = stb_mod_trunc(a,b); stbdiv_check(q,r,a,b, "trunc",a); // remainder must follow a's sign
+   if (show) printf("             (%08x,%08x)", q,r);
+   q = stb_div_floor(a,b), r = stb_mod_floor(a,b); stbdiv_check(q,r,a,b, "floor",b); // remainder must follow b's sign
+   if (show) printf("   (%08x,%08x)", q,r);
+   q = stb_div_eucl (a,b), r = stb_mod_eucl (a,b); stbdiv_check(q,r,a,b, "euclidean",1); // remainder must be non-negative
+   if (show) printf("   (%08x,%08x)\n ", q,r);
+}
+
+int main(int argc, char **argv)
+{
+   if (argc > 1) show=1; // any command-line argument turns on printing of every result
+
+   test(8,3); // small operands: inexact, then 1/2, then exact quotients, each in all four sign combinations
+   test(8,-3);
+   test(-8,3);
+   test(-8,-3);
+   test(1,2);
+   test(1,-2);
+   test(-1,2);
+   test(-1,-2);
+   test(8,4);
+   test(8,-4);
+   test(-8,4);
+   test(-8,-4);
+
+   test(INT_MAX,1); // operands at or next to the int limits
+   test(INT_MIN,1);
+   test(INT_MIN+1,1);
+   test(INT_MAX,-1);
+   //test(INT_MIN,-1); // this traps in MSVC, so we leave it untested
+   test(INT_MIN+1,-1);
+   test(INT_MIN,-2);
+   test(INT_MIN+1,2);
+   test(INT_MIN+1,-2);
+   test(INT_MAX,2);
+   test(INT_MAX,-2);
+   test(INT_MIN+1,2);
+   test(INT_MIN+1,-2);
+   test(INT_MIN,2);
+   test(INT_MIN,-2);
+   test(INT_MIN,7);
+   test(INT_MIN,-7);
+   test(INT_MIN+1,4);
+   test(INT_MIN+1,-4);
+
+   testh(-7, INT_MIN); // divisor INT_MIN
+   testh(-1, INT_MIN);
+   testh(1, INT_MIN);
+   testh(7, INT_MIN);
+
+   testh(INT_MAX-1, INT_MIN);
+   testh(INT_MAX,   INT_MIN);
+   testh(INT_MIN,   INT_MIN);
+   testh(INT_MIN+1, INT_MIN);
+
+   testh(INT_MAX-1, INT_MAX); // divisor INT_MAX
+   testh(INT_MAX  , INT_MAX);
+   testh(INT_MIN  , INT_MAX);
+   testh(INT_MIN+1, INT_MAX);
+
+   return err > 0 ? 1 : 0; // exit status 1 if any check failed, 0 otherwise
+}
+#endif // STB_DIVIDE_TEST
+#endif // STB_DIVIDE_IMPLEMENTATION
+#endif // INCLUDE_STB_DIVIDE_H
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_ds.h b/lib/stb/stb_ds.h
new file mode 100644
index 0000000..e84c82d
--- /dev/null
+++ b/lib/stb/stb_ds.h
@@ -0,0 +1,1895 @@
+/* stb_ds.h - v0.67 - public domain data structures - Sean Barrett 2019
+
+   This is a single-header-file library that provides easy-to-use
+   dynamic arrays and hash tables for C (also works in C++).
+
+   For a gentle introduction:
+      http://nothings.org/stb_ds
+
+   To use this library, do this in *one* C or C++ file:
+      #define STB_DS_IMPLEMENTATION
+      #include "stb_ds.h"
+
+TABLE OF CONTENTS
+
+  Table of Contents
+  Compile-time options
+  License
+  Documentation
+  Notes
+  Notes - Dynamic arrays
+  Notes - Hash maps
+  Credits
+
+COMPILE-TIME OPTIONS
+
+  #define STBDS_NO_SHORT_NAMES
+
+     This flag needs to be set globally.
+
+     By default stb_ds exposes shorter function names that are not qualified
+     with the "stbds_" prefix. If these names conflict with the names in your
+     code, define this flag.
+
+  #define STBDS_SIPHASH_2_4
+
+     This flag only needs to be set in the file containing #define STB_DS_IMPLEMENTATION.
+
+     By default stb_ds.h hashes using a weaker variant of SipHash and a custom hash for
+     4- and 8-byte keys. On 64-bit platforms, you can define the above flag to force
+     stb_ds.h to use specification-compliant SipHash-2-4 for all keys. Doing so makes
+     hash table insertion about 20% slower on 4- and 8-byte keys, 5% slower on
+     64-byte keys, and 10% slower on 256-byte keys on my test computer.
+
+  #define STBDS_REALLOC(context,ptr,size) better_realloc
+  #define STBDS_FREE(context,ptr)         better_free
+
+     These defines only need to be set in the file containing #define STB_DS_IMPLEMENTATION.
+
+     By default stb_ds uses stdlib realloc() and free() for memory management. You can
+     substitute your own functions instead by defining these symbols. You must either
+     define both, or neither. Note that at the moment, 'context' will always be NULL.
+     @TODO add an array/hash initialization function that takes a memory context pointer.
+
+  #define STBDS_UNIT_TESTS
+
+     Defines a function stbds_unit_tests() that checks the functioning of the data structures.
+
+  Note that on older versions of gcc (e.g. 5.x.x) you may need to build with '-std=c++0x'
+     (or equivalently '-std=c++11') when using anonymous structures as seen on the web
+     page or in STBDS_UNIT_TESTS.
+
+LICENSE
+
+  Placed in the public domain and also MIT licensed.
+  See end of file for detailed license information.
+
+DOCUMENTATION
+
+  Dynamic Arrays
+
+    Non-function interface:
+
+      Declare an empty dynamic array of type T
+        T* foo = NULL;
+
+      Access the i'th item of a dynamic array 'foo' of type T, T* foo:
+        foo[i]
+
+    Functions (actually macros)
+
+      arrfree:
+        void arrfree(T*);
+          Frees the array and sets the pointer to NULL.
+
+      arrlen:
+        ptrdiff_t arrlen(T*);
+          Returns the number of elements in the array.
+
+      arrlenu:
+        size_t arrlenu(T*);
+          Returns the number of elements in the array as an unsigned type.
+
+      arrpop:
+        T arrpop(T* a)
+          Removes the final element of the array and returns it.
+
+      arrput:
+        T arrput(T* a, T b);
+          Appends the item b to the end of array a. Returns b.
+
+      arrins:
+        T arrins(T* a, int p, T b);
+          Inserts the item b into the middle of array a, into a[p],
+          moving the rest of the array over. Returns b.
+
+      arrinsn:
+        void arrinsn(T* a, int p, int n);
+          Inserts n uninitialized items into array a starting at a[p],
+          moving the rest of the array over.
+
+      arraddnptr:
+        T* arraddnptr(T* a, int n)
+          Appends n uninitialized items onto array at the end.
+          Returns a pointer to the first uninitialized item added.
+
+      arraddnindex:
+        size_t arraddnindex(T* a, int n)
+          Appends n uninitialized items onto array at the end.
+          Returns the index of the first uninitialized item added.
+
+      arrdel:
+        void arrdel(T* a, int p);
+          Deletes the element at a[p], moving the rest of the array over.
+
+      arrdeln:
+        void arrdeln(T* a, int p, int n);
+          Deletes n elements starting at a[p], moving the rest of the array over.
+
+      arrdelswap:
+        void arrdelswap(T* a, int p);
+          Deletes the element at a[p], replacing it with the element from
+          the end of the array. O(1) performance.
+
+      arrsetlen:
+        void arrsetlen(T* a, int n);
+          Changes the length of the array to n. Allocates uninitialized
+          slots at the end if necessary.
+
+      arrsetcap:
+        size_t arrsetcap(T* a, int n);
+          Sets the length of allocated storage to at least n. It will not
+          change the length of the array.
+
+      arrcap:
+        size_t arrcap(T* a);
+          Returns the number of total elements the array can contain without
+          needing to be reallocated.
+
+  Hash maps & String hash maps
+
+    Given T is a structure type: struct { TK key; TV value; }. Note that some
+    functions do not require TV value and can have other fields. For string
+    hash maps, TK must be 'char *'.
+
+    Special interface:
+
+      stbds_rand_seed:
+        void stbds_rand_seed(size_t seed);
+          For security against adversarially chosen data, you should seed the
+          library with a strong random number. Or at least seed it with time().
+
+      stbds_hash_string:
+        size_t stbds_hash_string(char *str, size_t seed);
+          Returns a hash value for a string.
+
+      stbds_hash_bytes:
+        size_t stbds_hash_bytes(void *p, size_t len, size_t seed);
+          These functions hash an arbitrary number of bytes. The function
+          uses a custom hash for 4- and 8-byte data, and a weakened version
+          of SipHash for everything else. On 64-bit platforms you can get
+          specification-compliant SipHash-2-4 on all data by defining
+          STBDS_SIPHASH_2_4, at a significant cost in speed.
+
+    Non-function interface:
+
+      Declare an empty hash map of type T
+        T* foo = NULL;
+
+      Access the i'th entry in a hash table T* foo:
+        foo[i]
+
+    Function interface (actually macros):
+
+      hmfree
+      shfree
+        void hmfree(T*);
+        void shfree(T*);
+          Frees the hashmap and sets the pointer to NULL.
+
+      hmlen
+      shlen
+        ptrdiff_t hmlen(T*)
+        ptrdiff_t shlen(T*)
+          Returns the number of elements in the hashmap.
+
+      hmlenu
+      shlenu
+        size_t hmlenu(T*)
+        size_t shlenu(T*)
+          Returns the number of elements in the hashmap as an unsigned type.
+
+      hmgeti
+      shgeti
+      hmgeti_ts
+        ptrdiff_t hmgeti(T*, TK key)
+        ptrdiff_t shgeti(T*, char* key)
+        ptrdiff_t hmgeti_ts(T*, TK key, ptrdiff_t tempvar)
+          Returns the index in the hashmap which has the key 'key', or -1
+          if the key is not present.
+
+      hmget
+      hmget_ts
+      shget
+        TV hmget(T*, TK key)
+        TV shget(T*, char* key)
+        TV hmget_ts(T*, TK key, ptrdiff_t tempvar)
+          Returns the value corresponding to 'key' in the hashmap.
+          The structure must have a 'value' field
+
+      hmgets
+      shgets
+        T hmgets(T*, TK key)
+        T shgets(T*, char* key)
+          Returns the structure corresponding to 'key' in the hashmap.
+
+      hmgetp
+      shgetp
+      hmgetp_ts
+      hmgetp_null
+      shgetp_null
+        T* hmgetp(T*, TK key)
+        T* shgetp(T*, char* key)
+        T* hmgetp_ts(T*, TK key, ptrdiff_t tempvar)
+        T* hmgetp_null(T*, TK key)
+        T* shgetp_null(T*, char *key)
+          Returns a pointer to the structure corresponding to 'key' in
+          the hashmap. Functions ending in "_null" return NULL if the key
+          is not present in the hashmap; the others return a pointer to a
+          structure holding the default value (but not the searched-for key).
+
+      hmdefault
+      shdefault
+        TV hmdefault(T*, TV value)
+        TV shdefault(T*, TV value)
+          Sets the default value for the hashmap, the value which will be
+          returned by hmget/shget if the key is not present.
+
+      hmdefaults
+      shdefaults
+        T hmdefaults(T*, T item)
+        T shdefaults(T*, T item)
+          Sets the default struct for the hashmap, the contents which will be
+          returned by hmgets/shgets if the key is not present.
+
+      hmput
+      shput
+        TV hmput(T*, TK key, TV value)
+        TV shput(T*, char* key, TV value)
+          Inserts a <key,value> pair into the hashmap. If the key is already
+          present in the hashmap, updates its value.
+
+      hmputs
+      shputs
+        T hmputs(T*, T item)
+        T shputs(T*, T item)
+          Inserts a struct with T.key into the hashmap. If the struct is already
+          present in the hashmap, updates it.
+
+      hmdel
+      shdel
+        int hmdel(T*, TK key)
+        int shdel(T*, char* key)
+          If 'key' is in the hashmap, deletes its entry and returns 1.
+          Otherwise returns 0.
+
+    Function interface (actually macros) for strings only:
+
+      sh_new_strdup
+        void sh_new_strdup(T*);
+          Overwrites the existing pointer with a newly allocated
+          string hashmap which will automatically allocate and free
+          each string key using realloc/free
+
+      sh_new_arena
+        void sh_new_arena(T*);
+          Overwrites the existing pointer with a newly allocated
+          string hashmap which will automatically allocate each string
+          key to a string arena. Every string key ever used by this
+          hash table remains in the arena until the arena is freed.
+          Additionally, any key which is deleted and reinserted will
+          be allocated multiple times in the string arena.
+
+NOTES
+
+  * These data structures are realloc'd when they grow, and the macro
+    "functions" write to the provided pointer. This means: (a) the pointer
+    must be an lvalue, and (b) the pointer to the data structure is not
+    stable, and you must maintain it the same as you would a realloc'd
+    pointer. For example, if you pass a pointer to a dynamic array to a
+    function which updates it, the function must return back the new
+    pointer to the caller. This is the price of trying to do this in C.
+
+  * The following are the only functions that are thread-safe on a single data
+    structure, i.e. can be run in multiple threads simultaneously on the same
+    data structure
+        hmlen        shlen
+        hmlenu       shlenu
+        hmget_ts
+        hmgeti_ts
+        hmgetp_ts
+
+  * You iterate over the contents of a dynamic array and a hashmap in exactly
+    the same way, using arrlen/hmlen/shlen:
+
+      for (i=0; i < arrlen(foo); ++i)
+         ... foo[i] ...
+
+  * All operations except arrins/arrdel are O(1) amortized, but individual
+    operations can be slow, so these data structures may not be suitable
+    for real time use. Dynamic arrays double in capacity as needed, so
+    elements are copied an average of once. Hash tables double/halve
+    their size as needed, with appropriate hysteresis to maintain O(1)
+    performance.
+
+NOTES - DYNAMIC ARRAY
+
+  * If you know how long a dynamic array is going to be in advance, you can avoid
+    extra memory allocations by using arrsetlen to allocate it to that length in
+    advance and use foo[n] while filling it out, or arrsetcap to allocate the memory
+    for that length and use arrput/arrpush as normal.
+
+  * Unlike some other versions of the dynamic array, this version should
+    be safe to use with strict-aliasing optimizations.
+
+NOTES - HASH MAP
+
+  * For compilers other than GCC and clang (e.g. Visual Studio), for hmput/hmget/hmdel
+    and variants, the key must be an lvalue (so the macro can take the address of it).
+    Extensions are used that eliminate this requirement if you're using C99 and later
+    in GCC or clang, or if you're using C++ in GCC. But note that this can make your
+    code less portable.
+
+  * To test for presence of a key in a hashmap, just do 'hmgeti(foo,key) >= 0'.
+
+  * The iteration order of your data in the hashmap is determined solely by the
+    order of insertions and deletions. In particular, if you never delete, new
+    keys are always added at the end of the array. This will be consistent
+    across all platforms and versions of the library. However, you should not
+    attempt to serialize the internal hash table, as the hash is not consistent
+    between different platforms, and may change with future versions of the library.
+
+  * Use sh_new_arena() for string hashmaps that you never delete from. Initialize
+    with NULL if you're managing the memory for your strings, or your strings are
+    never freed (at least until the hashmap is freed). Otherwise, use sh_new_strdup().
+    @TODO: make an arena variant that garbage collects the strings with a trivial
+    copy collector into a new arena whenever the table shrinks / rebuilds. Since
+    current arena recommendation is to only use arena if it never deletes, then
+    this can just replace current arena implementation.
+
+  * If adversarial input is a serious concern and you're on a 64-bit platform,
+    enable STBDS_SIPHASH_2_4 (see the 'Compile-time options' section), and pass
+    a strong random number to stbds_rand_seed.
+
+  * The default value for the hash table is stored in foo[-1], so if you
+    use code like 'hmgetp(foo,k)->value = 5' you can accidentally overwrite
+    the value stored by hmdefault if 'k' is not present.
+
+CREDITS
+
+  Sean Barrett -- library, idea for dynamic array API/implementation
+  Per Vognsen  -- idea for hash table API/implementation
+  Rafael Sachetto -- arrpop()
+  github:HeroicKatora -- arraddn() reworking
+
+  Bugfixes:
+    Andy Durdin
+    Shane Liesegang
+    Vinh Truong
+    Andreas Molzer
+    github:hashitaku
+    github:srdjanstipic
+    Macoy Madson
+    Andreas Vennstrom
+    Tobias Mansfield-Williams
+*/
+
+#ifdef STBDS_UNIT_TESTS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#ifndef INCLUDE_STB_DS_H
+#define INCLUDE_STB_DS_H
+
+#include <stddef.h>
+#include <string.h>
+
+#ifndef STBDS_NO_SHORT_NAMES
+#define arrlen      stbds_arrlen
+#define arrlenu     stbds_arrlenu
+#define arrput      stbds_arrput
+#define arrpush     stbds_arrput
+#define arrpop      stbds_arrpop
+#define arrfree     stbds_arrfree
+#define arraddn     stbds_arraddn // deprecated, use one of the following instead:
+#define arraddnptr  stbds_arraddnptr
+#define arraddnindex stbds_arraddnindex
+#define arrsetlen   stbds_arrsetlen
+#define arrlast     stbds_arrlast
+#define arrins      stbds_arrins
+#define arrinsn     stbds_arrinsn
+#define arrdel      stbds_arrdel
+#define arrdeln     stbds_arrdeln
+#define arrdelswap  stbds_arrdelswap
+#define arrcap      stbds_arrcap
+#define arrsetcap   stbds_arrsetcap
+
+#define hmput       stbds_hmput
+#define hmputs      stbds_hmputs
+#define hmget       stbds_hmget
+#define hmget_ts    stbds_hmget_ts
+#define hmgets      stbds_hmgets
+#define hmgetp      stbds_hmgetp
+#define hmgetp_ts   stbds_hmgetp_ts
+#define hmgetp_null stbds_hmgetp_null
+#define hmgeti      stbds_hmgeti
+#define hmgeti_ts   stbds_hmgeti_ts
+#define hmdel       stbds_hmdel
+#define hmlen       stbds_hmlen
+#define hmlenu      stbds_hmlenu
+#define hmfree      stbds_hmfree
+#define hmdefault   stbds_hmdefault
+#define hmdefaults  stbds_hmdefaults
+
+#define shput       stbds_shput
+#define shputi      stbds_shputi
+#define shputs      stbds_shputs
+#define shget       stbds_shget
+#define shgeti      stbds_shgeti
+#define shgets      stbds_shgets
+#define shgetp      stbds_shgetp
+#define shgetp_null stbds_shgetp_null
+#define shdel       stbds_shdel
+#define shlen       stbds_shlen
+#define shlenu      stbds_shlenu
+#define shfree      stbds_shfree
+#define shdefault   stbds_shdefault
+#define shdefaults  stbds_shdefaults
+#define sh_new_arena  stbds_sh_new_arena
+#define sh_new_strdup stbds_sh_new_strdup
+
+#define stralloc    stbds_stralloc
+#define strreset    stbds_strreset
+#endif
+
+#if defined(STBDS_REALLOC) && !defined(STBDS_FREE) || !defined(STBDS_REALLOC) && defined(STBDS_FREE)
+#error "You must define both STBDS_REALLOC and STBDS_FREE, or neither."
+#endif
+#if !defined(STBDS_REALLOC) && !defined(STBDS_FREE)
+#include <stdlib.h>
+#define STBDS_REALLOC(c,p,s) realloc(p,s)  // default: stdlib realloc; the context argument c is discarded
+#define STBDS_FREE(c,p)      free(p)       // default: stdlib free; the context argument c is discarded
+#endif
+// STBDS_NOTUSED(v) references v without using its value, to mark a deliberately unused variable
+#ifdef _MSC_VER
+#define STBDS_NOTUSED(v)  (void)(v)
+#else
+#define STBDS_NOTUSED(v)  (void)sizeof(v)
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// for security against attackers, seed the library with a random number, at least time() but stronger is better
+extern void stbds_rand_seed(size_t seed);
+
+// these are the hash functions used internally if you want to test them or use them for other purposes
+extern size_t stbds_hash_bytes(void *p, size_t len, size_t seed);
+extern size_t stbds_hash_string(char *str, size_t seed);
+
+// this is a simple string arena allocator, initialize with e.g. 'stbds_string_arena my_arena={0}'.
+typedef struct stbds_string_arena stbds_string_arena;
+extern char * stbds_stralloc(stbds_string_arena *a, char *str);
+extern void   stbds_strreset(stbds_string_arena *a);
+
+// have to #define STBDS_UNIT_TESTS to call this
+extern void stbds_unit_tests(void);
+
+///////////////
+//
+// Everything below here is implementation details
+//
+
+extern void * stbds_arrgrowf(void *a, size_t elemsize, size_t addlen, size_t min_cap);
+extern void   stbds_arrfreef(void *a);
+extern void   stbds_hmfree_func(void *p, size_t elemsize);
+extern void * stbds_hmget_key(void *a, size_t elemsize, void *key, size_t keysize, int mode);
+extern void * stbds_hmget_key_ts(void *a, size_t elemsize, void *key, size_t keysize, ptrdiff_t *temp, int mode);
+extern void * stbds_hmput_default(void *a, size_t elemsize);
+extern void * stbds_hmput_key(void *a, size_t elemsize, void *key, size_t keysize, int mode);
+extern void * stbds_hmdel_key(void *a, size_t elemsize, void *key, size_t keysize, size_t keyoffset, int mode);
+extern void * stbds_shmode_func(size_t elemsize, int mode);
+
+#ifdef __cplusplus
+}
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define STBDS_HAS_TYPEOF
+#ifdef __cplusplus
+//#define STBDS_HAS_LITERAL_ARRAY  // this is currently broken for clang
+#endif
+#endif
+// compound-literal arrays are enabled only for C (not C++) compilers reporting C99 or later
+#if !defined(__cplusplus)
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define STBDS_HAS_LITERAL_ARRAY
+#endif
+#endif
+
+// this macro takes the address of the argument, but on gcc/clang can accept rvalues
+#if defined(STBDS_HAS_LITERAL_ARRAY) && defined(STBDS_HAS_TYPEOF)
+  // Always spell the operator __typeof__: GCC only treats plain 'typeof' as a
+  // keyword in GNU dialects, so 'typeof' fails to compile with -std=c99/-std=c11,
+  // while __typeof__ is accepted by both GCC and clang in every language mode.
+  // The one-element array compound literal decays to a pointer to a copy of value.
+  #define STBDS_ADDRESSOF(typevar, value)     ((__typeof__(typevar)[1]){value}) // literal array decays to pointer to value
+#else
+#define STBDS_ADDRESSOF(typevar, value)     &(value)
+#endif
+
+#define STBDS_OFFSETOF(var,field)           ((char *) &(var)->field - (char *) (var))
+// the stbds_array_header is stored immediately before element 0, so step back one header from t
+#define stbds_header(t)  ((stbds_array_header *) (t) - 1)
+#define stbds_temp(t)    stbds_header(t)->temp
+#define stbds_temp_key(t) (*(char **) stbds_header(t)->hash_table)
+// dynamic array API: every macro that can grow the array assigns the (possibly moved) pointer back to 'a'
+#define stbds_arrsetcap(a,n)   (stbds_arrgrow(a,0,n))
+#define stbds_arrsetlen(a,n)   ((stbds_arrcap(a) < (size_t) (n) ? stbds_arrsetcap((a),(size_t)(n)),0 : 0), (a) ? stbds_header(a)->length = (size_t) (n) : 0)
+#define stbds_arrcap(a)        ((a) ? stbds_header(a)->capacity : 0)
+#define stbds_arrlen(a)        ((a) ? (ptrdiff_t) stbds_header(a)->length : 0)
+#define stbds_arrlenu(a)       ((a) ?             stbds_header(a)->length : 0)
+#define stbds_arrput(a,v)      (stbds_arrmaybegrow(a,1), (a)[stbds_header(a)->length++] = (v))
+#define stbds_arrpush          stbds_arrput  // synonym
+#define stbds_arrpop(a)        (stbds_header(a)->length--, (a)[stbds_header(a)->length])  // no check for an empty or NULL array
+#define stbds_arraddn(a,n)     ((void)(stbds_arraddnindex(a, n)))    // deprecated, use one of the following instead:
+#define stbds_arraddnptr(a,n)  (stbds_arrmaybegrow(a,n), (n) ? (stbds_header(a)->length += (n), &(a)[stbds_header(a)->length-(n)]) : (a))
+#define stbds_arraddnindex(a,n)(stbds_arrmaybegrow(a,n), (n) ? (stbds_header(a)->length += (n), stbds_header(a)->length-(n)) : stbds_arrlen(a))
+#define stbds_arraddnoff       stbds_arraddnindex
+#define stbds_arrlast(a)       ((a)[stbds_header(a)->length-1])  // no check for an empty or NULL array
+#define stbds_arrfree(a)       ((void) ((a) ? STBDS_FREE(NULL,stbds_header(a)) : (void)0), (a)=NULL)
+#define stbds_arrdel(a,i)      stbds_arrdeln(a,i,1)
+#define stbds_arrdeln(a,i,n)   (memmove(&(a)[i], &(a)[(i)+(n)], sizeof *(a) * (stbds_header(a)->length-(n)-(i))), stbds_header(a)->length -= (n))
+#define stbds_arrdelswap(a,i)  ((a)[i] = stbds_arrlast(a), stbds_header(a)->length -= 1)
+#define stbds_arrinsn(a,i,n)   (stbds_arraddn((a),(n)), memmove(&(a)[(i)+(n)], &(a)[i], sizeof *(a) * (stbds_header(a)->length-(n)-(i))))
+#define stbds_arrins(a,i,v)    (stbds_arrinsn((a),(i),1), (a)[i]=(v))
+// grow only when 'a' is NULL or when length+n would exceed the current capacity
+#define stbds_arrmaybegrow(a,n)  ((!(a) || stbds_header(a)->length + (n) > stbds_header(a)->capacity) \
+                                  ? (stbds_arrgrow(a,n,0),0) : 0)
+// stbds_arrgrow(a,addlen,min_cap): resize through stbds_arrgrowf and store the result back into 'a'
+#define stbds_arrgrow(a,b,c)   ((a) = stbds_arrgrowf_wrapper((a), sizeof *(a), (b), (c)))
+// hmput: call stbds_hmput_key with mode 0 (STBDS_HM_BINARY), then write k and v into (t)[temp], temp being read from the header in front of element -1
+#define stbds_hmput(t, k, v) \
+    ((t) = stbds_hmput_key_wrapper((t), sizeof *(t), (void*) STBDS_ADDRESSOF((t)->key, (k)), sizeof (t)->key, 0),   \
+     (t)[stbds_temp((t)-1)].key = (k),    \
+     (t)[stbds_temp((t)-1)].value = (v))
+// hmputs: like hmput, but the key bytes come from s.key and the whole struct s is stored
+#define stbds_hmputs(t, s) \
+    ((t) = stbds_hmput_key_wrapper((t), sizeof *(t), &(s).key, sizeof (s).key, STBDS_HM_BINARY), \
+     (t)[stbds_temp((t)-1)] = (s))
+// hmgeti: look up k and evaluate to the header's temp field (documented above as the index, or -1 if absent)
+#define stbds_hmgeti(t,k) \
+    ((t) = stbds_hmget_key_wrapper((t), sizeof *(t), (void*) STBDS_ADDRESSOF((t)->key, (k)), sizeof (t)->key, STBDS_HM_BINARY), \
+      stbds_temp((t)-1))
+// hmgeti_ts: like hmgeti, but the result travels through the caller-supplied 'temp' lvalue instead of the header
+#define stbds_hmgeti_ts(t,k,temp) \
+    ((t) = stbds_hmget_key_ts_wrapper((t), sizeof *(t), (void*) STBDS_ADDRESSOF((t)->key, (k)), sizeof (t)->key, &(temp), STBDS_HM_BINARY), \
+      (temp))
+// hmgetp: pointer to the entry at the index that hmgeti left in the header's temp field
+#define stbds_hmgetp(t, k) \
+    ((void) stbds_hmgeti(t,k), &(t)[stbds_temp((t)-1)])
+// hmgetp_ts: pointer to the entry at the index stored in the caller's 'temp'
+#define stbds_hmgetp_ts(t, k, temp) \
+    ((void) stbds_hmgeti_ts(t,k,temp), &(t)[temp])
+// hmdel: delete key k; evaluates to the header's temp field, or to 0 when the table pointer is NULL afterwards
+#define stbds_hmdel(t,k) \
+    (((t) = stbds_hmdel_key_wrapper((t),sizeof *(t), (void*) STBDS_ADDRESSOF((t)->key, (k)), sizeof (t)->key, STBDS_OFFSETOF((t),key), STBDS_HM_BINARY)),(t)?stbds_temp((t)-1):0)
+// hmdefault: set the 'value' field of the default entry stored at index -1
+#define stbds_hmdefault(t, v) \
+    ((t) = stbds_hmput_default_wrapper((t), sizeof *(t)), (t)[-1].value = (v))
+// hmdefaults: overwrite the whole default entry stored at index -1
+#define stbds_hmdefaults(t, s) \
+    ((t) = stbds_hmput_default_wrapper((t), sizeof *(t)), (t)[-1] = (s))
+// hmfree: release the table when the pointer is non-NULL, then reset the pointer to NULL
+#define stbds_hmfree(p)        \
+    ((void) ((p) != NULL ? stbds_hmfree_func((p)-1,sizeof*(p)),0 : 0),(p)=NULL)
+// hmlen/hmlenu subtract 1 from the stored length (presumably to exclude the default entry at index -1)
+#define stbds_hmgets(t, k)    (*stbds_hmgetp(t,k))
+#define stbds_hmget(t, k)     (stbds_hmgetp(t,k)->value)
+#define stbds_hmget_ts(t, k, temp)  (stbds_hmgetp_ts(t,k,temp)->value)
+#define stbds_hmlen(t)        ((t) ? (ptrdiff_t) stbds_header((t)-1)->length-1 : 0)
+#define stbds_hmlenu(t)       ((t) ?             stbds_header((t)-1)->length-1 : 0)
+#define stbds_hmgetp_null(t,k)  (stbds_hmgeti(t,k) == -1 ? NULL : &(t)[stbds_temp((t)-1)])
+// string-keyed variants: the key is passed as the char* itself (not its address) with mode STBDS_HM_STRING
+#define stbds_shput(t, k, v) \
+    ((t) = stbds_hmput_key_wrapper((t), sizeof *(t), (void*) (k), sizeof (t)->key, STBDS_HM_STRING),   \
+     (t)[stbds_temp((t)-1)].value = (v))
+// shputi: as shput, but evaluates to the header's temp field (the entry index) instead of the value
+#define stbds_shputi(t, k, v) \
+    ((t) = stbds_hmput_key_wrapper((t), sizeof *(t), (void*) (k), sizeof (t)->key, STBDS_HM_STRING),   \
+     (t)[stbds_temp((t)-1)].value = (v), stbds_temp((t)-1))
+
+#define stbds_shputs(t, s) \
+    ((t) = stbds_hmput_key_wrapper((t), sizeof *(t), (void*) (s).key, sizeof (s).key, STBDS_HM_STRING), \
+     (t)[stbds_temp((t)-1)] = (s), \
+     (t)[stbds_temp((t)-1)].key = stbds_temp_key((t)-1)) // above line overwrites whole structure, so must rewrite key here if it was allocated internally
+// NOTE(review): the stbds_psh* macros use STBDS_HM_PTR_TO_STRING, which is not defined in the part of this header reviewed here
+#define stbds_pshput(t, p) \
+    ((t) = stbds_hmput_key_wrapper((t), sizeof *(t), (void*) (p)->key, sizeof (p)->key, STBDS_HM_PTR_TO_STRING), \
+     (t)[stbds_temp((t)-1)] = (p))
+
+#define stbds_shgeti(t,k) \
+     ((t) = stbds_hmget_key_wrapper((t), sizeof *(t), (void*) (k), sizeof (t)->key, STBDS_HM_STRING), \
+      stbds_temp((t)-1))
+
+#define stbds_pshgeti(t,k) \
+     ((t) = stbds_hmget_key_wrapper((t), sizeof *(t), (void*) (k), sizeof (*(t))->key, STBDS_HM_PTR_TO_STRING), \
+      stbds_temp((t)-1))
+
+#define stbds_shgetp(t, k) \
+    ((void) stbds_shgeti(t,k), &(t)[stbds_temp((t)-1)])
+
+#define stbds_pshget(t, k) \
+    ((void) stbds_pshgeti(t,k), (t)[stbds_temp((t)-1)])
+
+#define stbds_shdel(t,k) \
+    (((t) = stbds_hmdel_key_wrapper((t),sizeof *(t), (void*) (k), sizeof (t)->key, STBDS_OFFSETOF((t),key), STBDS_HM_STRING)),(t)?stbds_temp((t)-1):0)
+#define stbds_pshdel(t,k) \
+    (((t) = stbds_hmdel_key_wrapper((t),sizeof *(t), (void*) (k), sizeof (*(t))->key, STBDS_OFFSETOF(*(t),key), STBDS_HM_PTR_TO_STRING)),(t)?stbds_temp((t)-1):0)
+// sh_new_arena / sh_new_strdup: replace t with the result of stbds_shmode_func; the previous value of t is not passed on
+#define stbds_sh_new_arena(t)  \
+    ((t) = stbds_shmode_func_wrapper(t, sizeof *(t), STBDS_SH_ARENA))
+#define stbds_sh_new_strdup(t) \
+    ((t) = stbds_shmode_func_wrapper(t, sizeof *(t), STBDS_SH_STRDUP))
+
+#define stbds_shdefault(t, v)  stbds_hmdefault(t,v)
+#define stbds_shdefaults(t, s) stbds_hmdefaults(t,s)
+
+#define stbds_shfree       stbds_hmfree
+#define stbds_shlenu       stbds_hmlenu
+
+#define stbds_shgets(t, k) (*stbds_shgetp(t,k))
+#define stbds_shget(t, k)  (stbds_shgetp(t,k)->value)
+#define stbds_shgetp_null(t,k)  (stbds_shgeti(t,k) == -1 ? NULL : &(t)[stbds_temp((t)-1)])
+#define stbds_shlen        stbds_hmlen
+
+typedef struct
+{
+  size_t      length;      // element count, as reported by stbds_arrlen()
+  size_t      capacity;    // element count the allocation can hold, as reported by stbds_arrcap()
+  void      * hash_table;  // initialised to 0 by stbds_arrgrowf when a new array is allocated
+  ptrdiff_t   temp;        // scratch slot read back by the hash-map macros through stbds_temp()
+} stbds_array_header;
+
+typedef struct stbds_string_block
+{
+  struct stbds_string_block *next;
+  char storage[8];
+} stbds_string_block;
+
+struct stbds_string_arena
+{
+  stbds_string_block *storage;
+  size_t remaining;
+  unsigned char block;
+  unsigned char mode;  // this isn't used by the string arena itself
+};
+
+#define STBDS_HM_BINARY         0  // mode passed by the hm* macros (key given by address and size)
+#define STBDS_HM_STRING         1  // mode passed by the sh* macros (key given as a char*)
+
+enum
+{
+   STBDS_SH_NONE,
+   STBDS_SH_DEFAULT,
+   STBDS_SH_STRDUP,  // mode requested by stbds_sh_new_strdup()
+   STBDS_SH_ARENA    // mode requested by stbds_sh_new_arena()
+};
+
+#ifdef __cplusplus
+// in C we use implicit assignment from these void*-returning functions to T*.
+// in C++ these templates make the same code work by casting the void* result back to T*
+template<class T> static T * stbds_arrgrowf_wrapper(T *a, size_t elemsize, size_t addlen, size_t min_cap) {
+  return (T*)stbds_arrgrowf((void *)a, elemsize, addlen, min_cap);
+}
+template<class T> static T * stbds_hmget_key_wrapper(T *a, size_t elemsize, void *key, size_t keysize, int mode) {
+  return (T*)stbds_hmget_key((void*)a, elemsize, key, keysize, mode);
+}
+template<class T> static T * stbds_hmget_key_ts_wrapper(T *a, size_t elemsize, void *key, size_t keysize, ptrdiff_t *temp, int mode) {
+  return (T*)stbds_hmget_key_ts((void*)a, elemsize, key, keysize, temp, mode);
+}
+template<class T> static T * stbds_hmput_default_wrapper(T *a, size_t elemsize) {
+  return (T*)stbds_hmput_default((void *)a, elemsize);
+}
+template<class T> static T * stbds_hmput_key_wrapper(T *a, size_t elemsize, void *key, size_t keysize, int mode) {
+  return (T*)stbds_hmput_key((void*)a, elemsize, key, keysize, mode);
+}
+template<class T> static T * stbds_hmdel_key_wrapper(T *a, size_t elemsize, void *key, size_t keysize, size_t keyoffset, int mode){
+  return (T*)stbds_hmdel_key((void*)a, elemsize, key, keysize, keyoffset, mode);
+}
+template<class T> static T * stbds_shmode_func_wrapper(T *, size_t elemsize, int mode) { // the unnamed T* only lets the compiler deduce T
+  return (T*)stbds_shmode_func(elemsize, mode);
+}
+#else // C: the wrappers are plain aliases for the void*-returning functions
+#define stbds_arrgrowf_wrapper            stbds_arrgrowf
+#define stbds_hmget_key_wrapper           stbds_hmget_key
+#define stbds_hmget_key_ts_wrapper        stbds_hmget_key_ts
+#define stbds_hmput_default_wrapper       stbds_hmput_default
+#define stbds_hmput_key_wrapper           stbds_hmput_key
+#define stbds_hmdel_key_wrapper           stbds_hmdel_key
+#define stbds_shmode_func_wrapper(t,e,m)  stbds_shmode_func(e,m)  // the first argument is dropped
+#endif
+
+#endif // INCLUDE_STB_DS_H
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//   IMPLEMENTATION
+//
+
+#ifdef STB_DS_IMPLEMENTATION
+#include <assert.h>
+#include <string.h>
+
+#ifndef STBDS_ASSERT
+#define STBDS_ASSERT_WAS_UNDEFINED
+#define STBDS_ASSERT(x)   ((void) 0)  // default when the user supplied no STBDS_ASSERT: the check compiles to nothing
+#endif
+
+#ifdef STBDS_STATISTICS
+#define STBDS_STATS(x)   x  // statistics enabled: the wrapped statement is emitted as-is
+size_t stbds_array_grow;  // incremented by stbds_arrgrowf each time an existing array is reallocated
+size_t stbds_hash_grow;
+size_t stbds_hash_shrink;
+size_t stbds_hash_rebuild;
+size_t stbds_hash_probes;
+size_t stbds_hash_alloc;
+size_t stbds_rehash_probes;
+size_t stbds_rehash_items;
+#else
+#define STBDS_STATS(x)  // statistics disabled: the wrapped statement is discarded
+#endif
+
+//
+// stbds_arr implementation
+//
+
+// Grow array 'a' (elements of 'elemsize' bytes) so it can hold at least max(length+addlen, min_cap)
+// elements. Returns 'a' unchanged when capacity already suffices, else the possibly moved element pointer.
+
+void *stbds_arrgrowf(void *a, size_t elemsize, size_t addlen, size_t min_cap)
+{
+  stbds_array_header temp={0}; // force debugging
+  void *b;
+  size_t min_len = stbds_arrlen(a) + addlen;
+  (void) sizeof(temp);
+
+  // compute the minimum capacity needed
+  if (min_len > min_cap)
+    min_cap = min_len;
+
+  if (min_cap <= stbds_arrcap(a))
+    return a;
+
+  // increase needed capacity to guarantee O(1) amortized
+  if (min_cap < 2 * stbds_arrcap(a))
+    min_cap = 2 * stbds_arrcap(a);
+  else if (min_cap < 4)
+    min_cap = 4;
+
+  // The allocation is a single block: one stbds_array_header followed by min_cap elements.
+  // For an existing array the block is resized starting from its header; for a NULL array
+  // 0 is passed as the old pointer, which the default realloc() treats as a fresh allocation.
+  b = STBDS_REALLOC(NULL, (a) ? stbds_header(a) : 0, elemsize * min_cap + sizeof(stbds_array_header));
+  // NOTE(review): the STBDS_REALLOC result is not checked for NULL before it is offset and written through
+  b = (char *) b + sizeof(stbds_array_header);
+  if (a == NULL) {
+    stbds_header(b)->length = 0;
+    stbds_header(b)->hash_table = 0;
+    stbds_header(b)->temp = 0;
+  } else {
+    STBDS_STATS(++stbds_array_grow);
+  }
+  stbds_header(b)->capacity = min_cap;
+
+  return b;
+}
+
+void stbds_arrfreef(void *a)
+{
+  STBDS_FREE(NULL, stbds_header(a)); // free from the header, where the allocation begins; 'a' is not checked for NULL here
+}
+
+//
+// stbds_hm hash table implementation
+//
+
+#ifdef STBDS_INTERNAL_SMALL_BUCKET
+#define STBDS_BUCKET_LENGTH      4
+#else
+#define STBDS_BUCKET_LENGTH      8
+#endif
+
+#define STBDS_BUCKET_SHIFT      (STBDS_BUCKET_LENGTH == 8 ? 3 : 2)  // log2(STBDS_BUCKET_LENGTH)
+#define STBDS_BUCKET_MASK       (STBDS_BUCKET_LENGTH-1)             // low bits of a slot position; cleared to round down to a bucket start
+#define STBDS_CACHE_LINE_SIZE   64
+// round n up to the next multiple of a; the mask arithmetic is only correct when a is a power of two
+#define STBDS_ALIGN_FWD(n,a)   (((n) + (a) - 1) & ~((a)-1))
+
+typedef struct
+{
+   size_t    hash [STBDS_BUCKET_LENGTH];
+   ptrdiff_t index[STBDS_BUCKET_LENGTH];
+} stbds_hash_bucket; // in 32-bit, this is one 64-byte cache line; in 64-bit, each array is one 64-byte cache line
+
+typedef struct
+{
+  char * temp_key; // this MUST be the first field of the hash table
+  size_t slot_count;                    // the slot_count passed to stbds_make_hash_index
+  size_t used_count;
+  size_t used_count_threshold;          // slot_count - slot_count/4 (grow at 12/16 occupancy)
+  size_t used_count_shrink_threshold;   // slot_count/4 (shrink when only 4/16 full)
+  size_t tombstone_count;
+  size_t tombstone_count_threshold;     // slot_count/8 + slot_count/16 (rebuild at 3/16 tombstones)
+  size_t seed;
+  size_t slot_count_log2;               // stbds_log2(slot_count)
+  stbds_string_arena string;
+  stbds_hash_bucket *storage; // not a separate allocation, just 64-byte aligned storage after this struct
+} stbds_hash_index;
+
+#define STBDS_INDEX_EMPTY    -1
+#define STBDS_INDEX_DELETED  -2
+#define STBDS_INDEX_IN_USE(x)  ((x) >= 0)  // EMPTY and DELETED are both negative, so only non-negative indices count as in use
+
+#define STBDS_HASH_EMPTY      0
+#define STBDS_HASH_DELETED    1
+
+static size_t stbds_hash_seed=0x31415926; // seed value in effect until stbds_rand_seed() replaces it
+
+void stbds_rand_seed(size_t seed)
+{
+  stbds_hash_seed = seed; // overwrites the file-scope seed; nothing else is touched here
+}
+// With 32-bit var/temp the paired 16-bit shifts clear the 64-bit inputs, leaving var = v32; with 64-bit ones v32 cancels out and var = (v64_hi << 32) ^ low 32 bits of v64_lo. NOTE(review): assumes var/temp are size_t and v32 fits in 32 bits -- confirm at call sites
+#define stbds_load_32_or_64(var, temp, v32, v64_hi, v64_lo)                                          \
+  temp = v64_lo ^ v32, temp <<= 16, temp <<= 16, temp >>= 16, temp >>= 16, /* discard if 32-bit */   \
+  var = v64_hi, var <<= 16, var <<= 16,                                    /* discard if 32-bit */   \
+  var ^= temp ^ v32
+
+#define STBDS_SIZE_T_BITS           ((sizeof (size_t)) * 8)  // width of size_t in bits, assuming 8-bit bytes
+
+static size_t stbds_probe_position(size_t hash, size_t slot_count, size_t slot_log2)
+{
+  // first slot to probe for a hash: its low bits, selected by masking with slot_count-1
+  size_t mask = slot_count-1;
+  STBDS_NOTUSED(slot_log2);
+  #ifdef STBDS_INTERNAL_BUCKET_START
+  mask &= ~STBDS_BUCKET_MASK; // always begin at the first slot of a bucket
+  #endif
+  return hash & mask;
+}
+
+static size_t stbds_log2(size_t slot_count)
+{
+  // floor(log2(slot_count)): counts the halvings needed to bring the value down to 1;
+  // inputs of 0 and 1 both give 0
+  size_t bits;
+  for (bits = 0; slot_count > 1; slot_count >>= 1)
+    ++bits;
+  return bits;
+}
+
+static stbds_hash_index *stbds_make_hash_index(size_t slot_count, stbds_hash_index *ot) // new index with slot_count slots; a non-NULL ot donates its seed, string arena and live entries (ot is not freed here)
+{
+  stbds_hash_index *t;
+  t = (stbds_hash_index *) STBDS_REALLOC(NULL,0,(slot_count >> STBDS_BUCKET_SHIFT) * sizeof(stbds_hash_bucket) + sizeof(stbds_hash_index) + STBDS_CACHE_LINE_SIZE-1);
+  t->storage = (stbds_hash_bucket *) STBDS_ALIGN_FWD((size_t) (t+1), STBDS_CACHE_LINE_SIZE); // buckets start at the first 64-byte boundary at or after the end of the struct
+  t->slot_count = slot_count;
+  t->slot_count_log2 = stbds_log2(slot_count);
+  t->tombstone_count = 0;
+  t->used_count = 0;
+
+  #if 0 // A1
+  t->used_count_threshold        = slot_count*12/16; // if 12/16th of table is occupied, grow
+  t->tombstone_count_threshold   = slot_count* 2/16; // if tombstones are 2/16th of table, rebuild
+  t->used_count_shrink_threshold = slot_count* 4/16; // if table is only 4/16th full, shrink
+  #elif 1 // A2
+  //t->used_count_threshold        = slot_count*12/16; // if 12/16th of table is occupied, grow
+  //t->tombstone_count_threshold   = slot_count* 3/16; // if tombstones are 3/16th of table, rebuild
+  //t->used_count_shrink_threshold = slot_count* 4/16; // if table is only 4/16th full, shrink
+
+  // compute without overflowing
+  t->used_count_threshold        = slot_count - (slot_count>>2);
+  t->tombstone_count_threshold   = (slot_count>>3) + (slot_count>>4);
+  t->used_count_shrink_threshold = slot_count >> 2;
+
+  #elif 0 // B1
+  t->used_count_threshold        = slot_count*13/16; // if 13/16th of table is occupied, grow
+  t->tombstone_count_threshold   = slot_count* 2/16; // if tombstones are 2/16th of table, rebuild
+  t->used_count_shrink_threshold = slot_count* 5/16; // if table is only 5/16th full, shrink
+  #else // C1
+  t->used_count_threshold        = slot_count*14/16; // if 14/16th of table is occupied, grow
+  t->tombstone_count_threshold   = slot_count* 2/16; // if tombstones are 2/16th of table, rebuild
+  t->used_count_shrink_threshold = slot_count* 6/16; // if table is only 6/16th full, shrink
+  #endif
+  // Following statistics were measured on a Core i7-6700 @ 4.00Ghz, compiled with clang 7.0.1 -O2
+    // Note that the larger tables have high variance as they were run fewer times
+  //     A1            A2          B1           C1
+  //    0.10ms :     0.10ms :     0.10ms :     0.11ms :      2,000 inserts creating 2K table
+  //    0.96ms :     0.95ms :     0.97ms :     1.04ms :     20,000 inserts creating 20K table
+  //   14.48ms :    14.46ms :    10.63ms :    11.00ms :    200,000 inserts creating 200K table
+  //  195.74ms :   196.35ms :   203.69ms :   214.92ms :  2,000,000 inserts creating 2M table
+  // 2193.88ms :  2209.22ms :  2285.54ms :  2437.17ms : 20,000,000 inserts creating 20M table
+  //   65.27ms :    53.77ms :    65.33ms :    65.47ms : 500,000 inserts & deletes in 2K table
+  //   72.78ms :    62.45ms :    71.95ms :    72.85ms : 500,000 inserts & deletes in 20K table
+  //   89.47ms :    77.72ms :    96.49ms :    96.75ms : 500,000 inserts & deletes in 200K table
+  //   97.58ms :    98.14ms :    97.18ms :    97.53ms : 500,000 inserts & deletes in 2M table
+  //  118.61ms :   119.62ms :   120.16ms :   118.86ms : 500,000 inserts & deletes in 20M table
+  //  192.11ms :   194.39ms :   196.38ms :   195.73ms : 500,000 inserts & deletes in 200M table
+
+  if (slot_count <= STBDS_BUCKET_LENGTH)
+    t->used_count_shrink_threshold = 0; // a single-bucket table is never shrunk
+  // to avoid infinite loop, we need to guarantee that at least one slot is empty and will terminate probes
+  STBDS_ASSERT(t->used_count_threshold + t->tombstone_count_threshold < t->slot_count);
+  STBDS_STATS(++stbds_hash_alloc);
+  if (ot) {
+    t->string = ot->string;
+    // reuse old seed so we can reuse old hashes so below "copy out old data" doesn't do any hashing
+    t->seed = ot->seed;
+  } else {
+    size_t a,b,temp;
+    memset(&t->string, 0, sizeof(t->string));
+    t->seed = stbds_hash_seed;
+    // LCG
+    // in 32-bit, a =          2147001325   b =  715136305
+    // in 64-bit, a = 2862933555777941757   b = 3037000493
+    stbds_load_32_or_64(a,temp, 2147001325, 0x27bb2ee6, 0x87b0b0fd);
+    stbds_load_32_or_64(b,temp,  715136305,          0, 0xb504f32d);
+    stbds_hash_seed = stbds_hash_seed  * a + b; // advance the global seed so the next fresh table gets a different one
+  }
+
+  {
+    size_t i,j;
+    for (i=0; i < slot_count >> STBDS_BUCKET_SHIFT; ++i) { // mark every slot of every bucket as never used
+      stbds_hash_bucket *b = &t->storage[i];
+      for (j=0; j < STBDS_BUCKET_LENGTH; ++j)
+        b->hash[j] = STBDS_HASH_EMPTY;
+      for (j=0; j < STBDS_BUCKET_LENGTH; ++j)
+        b->index[j] = STBDS_INDEX_EMPTY;
+    }
+  }
+
+  // copy out the old data, if any
+  if (ot) {
+    size_t i,j;
+    t->used_count = ot->used_count;
+    for (i=0; i < ot->slot_count >> STBDS_BUCKET_SHIFT; ++i) {
+      stbds_hash_bucket *ob = &ot->storage[i];
+      for (j=0; j < STBDS_BUCKET_LENGTH; ++j) {
+        if (STBDS_INDEX_IN_USE(ob->index[j])) { // tombstones and empty slots are not carried over
+          size_t hash = ob->hash[j];
+          size_t pos = stbds_probe_position(hash, t->slot_count, t->slot_count_log2);
+          size_t step = STBDS_BUCKET_LENGTH;
+          STBDS_STATS(++stbds_rehash_items);
+          for (;;) {
+            size_t limit,z;
+            stbds_hash_bucket *bucket;
+            bucket = &t->storage[pos >> STBDS_BUCKET_SHIFT];
+            STBDS_STATS(++stbds_rehash_probes);
+
+            for (z=pos & STBDS_BUCKET_MASK; z < STBDS_BUCKET_LENGTH; ++z) { // from pos to the end of its bucket
+              if (bucket->hash[z] == 0) { // 0 is STBDS_HASH_EMPTY; the new table has no tombstones
+                bucket->hash[z] = hash;
+                bucket->index[z] = ob->index[j];
+                goto done;
+              }
+            }
+
+            limit = pos & STBDS_BUCKET_MASK;
+            for (z = 0; z < limit; ++z) { // then wrap to the start of the same bucket
+              if (bucket->hash[z] == 0) {
+                bucket->hash[z] = hash;
+                bucket->index[z] = ob->index[j];
+                goto done;
+              }
+            }
+
+            pos += step;                  // quadratic probing
+            step += STBDS_BUCKET_LENGTH;
+            pos &= (t->slot_count-1);
+          }
+        }
+       done:
+        ;
+      }
+    }
+  }
+
+  return t;
+}
+
+#define STBDS_ROTATE_LEFT(val, n)   (((val) << (n)) | ((val) >> (STBDS_SIZE_T_BITS - (n))))   // rotate val by n bits; n must be 1..STBDS_SIZE_T_BITS-1
+#define STBDS_ROTATE_RIGHT(val, n)  (((val) >> (n)) | ((val) << (STBDS_SIZE_T_BITS - (n))))   // both macros evaluate val twice
+
+size_t stbds_hash_string(char *str, size_t seed)
+{
+  size_t h = seed;
+  for (; *str; ++str)
+     h = STBDS_ROTATE_LEFT(h, 9) + (unsigned char) *str;
+
+  // finalizer modelled on the Thomas Wang 64-to-32 bit mix function
+  h ^= seed;
+  h = (~h) + (h << 18);
+  h = STBDS_ROTATE_RIGHT(h,31);  // same value as "h ^= h ^ rot": the two h terms cancel
+  h *= 21;
+  h = STBDS_ROTATE_RIGHT(h,11);  // same cancellation as above
+  h += (h << 6);
+  h ^= STBDS_ROTATE_RIGHT(h,22);
+  return h+seed;
+}
+
+#ifdef STBDS_SIPHASH_2_4
+#define STBDS_SIPHASH_C_ROUNDS 2   // rounds applied per input word
+#define STBDS_SIPHASH_D_ROUNDS 4   // rounds applied at finalization
+typedef int STBDS_SIPHASH_2_4_can_only_be_used_in_64_bit_builds[sizeof(size_t) == 8 ? 1 : -1]; // negative array size is a compile error when size_t is not 8 bytes
+#endif
+
+#ifndef STBDS_SIPHASH_C_ROUNDS
+#define STBDS_SIPHASH_C_ROUNDS 1   // default when STBDS_SIPHASH_2_4 is not requested
+#endif
+#ifndef STBDS_SIPHASH_D_ROUNDS
+#define STBDS_SIPHASH_D_ROUNDS 1   // default when STBDS_SIPHASH_2_4 is not requested
+#endif
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4127) // conditional expression is constant, for do..while(0) and sizeof()==
+#endif
+
+static size_t stbds_siphash_bytes(void *p, size_t len, size_t seed) // hashes the len bytes at p with a SipHash-derived mix keyed by seed
+{
+  unsigned char *d = (unsigned char *) p;
+  size_t i,j;
+  size_t v0,v1,v2,v3, data;
+
+  // hash that works on 32- or 64-bit registers without knowing which we have
+  // (computes different results on 32-bit and 64-bit platform)
+  // derived from siphash, but on 32-bit platforms very different as it uses 4 32-bit state not 4 64-bit
+  v0 = ((((size_t) 0x736f6d65 << 16) << 16) + 0x70736575) ^  seed;
+  v1 = ((((size_t) 0x646f7261 << 16) << 16) + 0x6e646f6d) ^ ~seed;
+  v2 = ((((size_t) 0x6c796765 << 16) << 16) + 0x6e657261) ^  seed;
+  v3 = ((((size_t) 0x74656462 << 16) << 16) + 0x79746573) ^ ~seed;
+
+  #ifdef STBDS_TEST_SIPHASH_2_4
+  // hardcoded with key material in the siphash test vectors
+  v0 ^= 0x0706050403020100ull ^  seed;
+  v1 ^= 0x0f0e0d0c0b0a0908ull ^ ~seed;
+  v2 ^= 0x0706050403020100ull ^  seed;
+  v3 ^= 0x0f0e0d0c0b0a0908ull ^ ~seed;
+  #endif
+
+  #define STBDS_SIPROUND() \
+    do {                   \
+      v0 += v1; v1 = STBDS_ROTATE_LEFT(v1, 13);  v1 ^= v0; v0 = STBDS_ROTATE_LEFT(v0,STBDS_SIZE_T_BITS/2); \
+      v2 += v3; v3 = STBDS_ROTATE_LEFT(v3, 16);  v3 ^= v2;                                                 \
+      v2 += v1; v1 = STBDS_ROTATE_LEFT(v1, 17);  v1 ^= v2; v2 = STBDS_ROTATE_LEFT(v2,STBDS_SIZE_T_BITS/2); \
+      v0 += v3; v3 = STBDS_ROTATE_LEFT(v3, 21);  v3 ^= v0;                                                 \
+    } while (0)
+
+  for (i=0; i+sizeof(size_t) <= len; i += sizeof(size_t), d += sizeof(size_t)) { // consume whole size_t-sized little-endian words
+    data = d[0] | (d[1] << 8) | (d[2] << 16) | ((size_t) d[3] << 24); // widen before shifting into bit 31 so a byte >= 0x80 cannot sign-extend over the upper half
+    if (sizeof(size_t) == 8) data |= ((size_t) d[4] | ((size_t) d[5] << 8) | ((size_t) d[6] << 16) | ((size_t) d[7] << 24)) << 16 << 16; // a 32-bit word ends at d[3], so bytes 4..7 are only read when size_t is 64-bit
+
+    v3 ^= data;
+    for (j=0; j < STBDS_SIPHASH_C_ROUNDS; ++j)
+      STBDS_SIPROUND();
+    v0 ^= data;
+  }
+  data = len << (STBDS_SIZE_T_BITS-8); // final word: length in the top byte, leftover input bytes below it
+  switch (len - i) {
+    case 7: data |= ((size_t) d[6] << 24) << 24; // fall through
+    case 6: data |= ((size_t) d[5] << 20) << 20; // fall through
+    case 5: data |= ((size_t) d[4] << 16) << 16; // fall through
+    case 4: data |= ((size_t) d[3] << 24); // fall through (widened first: an int shift would sign-extend over the bytes above)
+    case 3: data |= (d[2] << 16); // fall through
+    case 2: data |= (d[1] << 8); // fall through
+    case 1: data |= d[0]; // fall through
+    case 0: break;
+  }
+  v3 ^= data;
+  for (j=0; j < STBDS_SIPHASH_C_ROUNDS; ++j)
+    STBDS_SIPROUND();
+  v0 ^= data;
+  v2 ^= 0xff;
+  for (j=0; j < STBDS_SIPHASH_D_ROUNDS; ++j)
+    STBDS_SIPROUND();
+
+#ifdef STBDS_SIPHASH_2_4
+  return v0^v1^v2^v3;
+#else
+  return v1^v2^v3; // slightly stronger since v0^v3 in above cancels out final round operation? I tweeted at the authors of SipHash about this but they didn't reply
+#endif
+}
+
+size_t stbds_hash_bytes(void *p, size_t len, size_t seed) // hashes len bytes at p with seed; 4-byte keys and (on 64-bit) 8-byte keys take dedicated fast paths
+{
+#ifdef STBDS_SIPHASH_2_4
+  return stbds_siphash_bytes(p,len,seed);
+#else
+  unsigned char *d = (unsigned char *) p;
+
+  if (len == 4) {
+    unsigned int hash = d[0] | (d[1] << 8) | (d[2] << 16) | ((unsigned int) d[3] << 24); // unsigned shift: a byte >= 0x80 must not overflow a signed int
+    #if 0
+    // HASH32-A  Bob Jenkin's hash function w/o large constants
+    hash ^= seed;
+    hash -= (hash<<6);
+    hash ^= (hash>>17);
+    hash -= (hash<<9);
+    hash ^= seed;
+    hash ^= (hash<<4);
+    hash -= (hash<<3);
+    hash ^= (hash<<10);
+    hash ^= (hash>>15);
+    #elif 1
+    // HASH32-BB  Bob Jenkin's presumably-accidental version of Thomas Wang hash with rotates turned into shifts.
+    // Note that converting these back to rotates makes it run a lot slower, presumably due to collisions, so I'm
+    // not really sure what's going on.
+    hash ^= seed;
+    hash = (hash ^ 61) ^ (hash >> 16);
+    hash = hash + (hash << 3);
+    hash = hash ^ (hash >> 4);
+    hash = hash * 0x27d4eb2d;
+    hash ^= seed;
+    hash = hash ^ (hash >> 15);
+    #else  // HASH32-C   -  Murmur3
+    hash ^= seed;
+    hash *= 0xcc9e2d51;
+    hash = (hash << 17) | (hash >> 15);
+    hash *= 0x1b873593;
+    hash ^= seed;
+    hash = (hash << 19) | (hash >> 13);
+    hash = hash*5 + 0xe6546b64;
+    hash ^= hash >> 16;
+    hash *= 0x85ebca6b;
+    hash ^= seed;
+    hash ^= hash >> 13;
+    hash *= 0xc2b2ae35;
+    hash ^= hash >> 16;
+    #endif
+    // Following statistics were measured on a Core i7-6700 @ 4.00Ghz, compiled with clang 7.0.1 -O2
+    // Note that the larger tables have high variance as they were run fewer times
+    //  HASH32-A   //  HASH32-BB  //  HASH32-C
+    //    0.10ms   //    0.10ms   //    0.10ms :      2,000 inserts creating 2K table
+    //    0.96ms   //    0.95ms   //    0.99ms :     20,000 inserts creating 20K table
+    //   14.69ms   //   14.43ms   //   14.97ms :    200,000 inserts creating 200K table
+    //  199.99ms   //  195.36ms   //  202.05ms :  2,000,000 inserts creating 2M table
+    // 2234.84ms   // 2187.74ms   // 2240.38ms : 20,000,000 inserts creating 20M table
+    //   55.68ms   //   53.72ms   //   57.31ms : 500,000 inserts & deletes in 2K table
+    //   63.43ms   //   61.99ms   //   65.73ms : 500,000 inserts & deletes in 20K table
+    //   80.04ms   //   77.96ms   //   81.83ms : 500,000 inserts & deletes in 200K table
+    //  100.42ms   //   97.40ms   //  102.39ms : 500,000 inserts & deletes in 2M table
+    //  119.71ms   //  120.59ms   //  121.63ms : 500,000 inserts & deletes in 20M table
+    //  185.28ms   //  195.15ms   //  187.74ms : 500,000 inserts & deletes in 200M table
+    //   15.58ms   //   14.79ms   //   15.52ms : 200,000 inserts creating 200K table with varying key spacing
+
+    return (((size_t) hash << 16 << 16) | hash) ^ seed;
+  } else if (len == 8 && sizeof(size_t) == 8) {
+    size_t hash = d[0] | (d[1] << 8) | (d[2] << 16) | ((size_t) d[3] << 24); // widen before shifting into bit 31: as an int, a byte >= 0x80 would sign-extend and swamp d[4..7]
+    hash |= ((size_t) d[4] | ((size_t) d[5] << 8) | ((size_t) d[6] << 16) | ((size_t) d[7] << 24)) << 16 << 16; // avoid warning if size_t == 4
+    hash ^= seed;
+    hash = (~hash) + (hash << 21);
+    hash ^= STBDS_ROTATE_RIGHT(hash,24);
+    hash *= 265;
+    hash ^= STBDS_ROTATE_RIGHT(hash,14);
+    hash ^= seed;
+    hash *= 21;
+    hash ^= STBDS_ROTATE_RIGHT(hash,28);
+    hash += (hash << 31);
+    hash = (~hash) + (hash << 18);
+    return hash;
+  } else {
+    return stbds_siphash_bytes(p,len,seed);
+  }
+#endif
+}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+
+static int stbds_is_key_equal(void *a, size_t elemsize, void *key, size_t keysize, size_t keyoffset, int mode, size_t i)
+{
+  char *field = (char *) a + elemsize*i + keyoffset;   // address of element i's key field
+  if (mode < STBDS_HM_STRING)
+    return 0==memcmp(key, field, keysize);             // binary keys: compare the raw key bytes
+  return 0==strcmp((char *) key, * (char **) field);   // string keys: the field holds a char *
+}
+
+#define STBDS_HASH_TO_ARR(x,elemsize) ((char*) (x) - (elemsize))   // user-visible map pointer -> raw array, whose element 0 is the default value
+#define STBDS_ARR_TO_HASH(x,elemsize) ((char*) (x) + (elemsize))   // raw array -> user-visible map pointer (just past the default element)
+
+#define stbds_hash_table(a)  ((stbds_hash_index *) stbds_header(a)->hash_table)   // a must be the raw array pointer
+
+void stbds_hmfree_func(void *a, size_t elemsize)
+{
+  stbds_hash_index *table;
+  if (a == NULL) return;
+  table = stbds_hash_table(a);
+  if (table != NULL) {
+    size_t n;
+    // in STBDS_SH_STRDUP mode each key is its own allocation; element 0 is the default and is skipped
+    for (n=1; table->string.mode == STBDS_SH_STRDUP && n < stbds_header(a)->length; ++n)
+      STBDS_FREE(NULL, *(char**) ((char *) a + elemsize*n));
+    stbds_strreset(&table->string);
+  }
+  STBDS_FREE(NULL, table);
+  STBDS_FREE(NULL, stbds_header(a));
+}
+
+static ptrdiff_t stbds_hm_find_slot(void *a, size_t elemsize, void *key, size_t keysize, size_t keyoffset, int mode) // returns the slot number holding key, or -1 if key is absent; a is the user-visible map pointer and must already have a hash table
+{
+  void *raw_a = STBDS_HASH_TO_ARR(a,elemsize);
+  stbds_hash_index *table = stbds_hash_table(raw_a);
+  size_t hash = mode >= STBDS_HM_STRING ? stbds_hash_string((char*)key,table->seed) : stbds_hash_bytes(key, keysize,table->seed);
+  size_t step = STBDS_BUCKET_LENGTH;
+  size_t limit,i;
+  size_t pos;
+  stbds_hash_bucket *bucket;
+
+  if (hash < 2) hash += 2; // stored hash values are forbidden from being 0, so we can detect empty slots
+
+  pos = stbds_probe_position(hash, table->slot_count, table->slot_count_log2);
+
+  for (;;) {
+    STBDS_STATS(++stbds_hash_probes);
+    bucket = &table->storage[pos >> STBDS_BUCKET_SHIFT];
+
+    // start searching from pos to end of bucket, this should help performance on small hash tables that fit in cache
+    for (i=pos & STBDS_BUCKET_MASK; i < STBDS_BUCKET_LENGTH; ++i) {
+      if (bucket->hash[i] == hash) {
+        if (stbds_is_key_equal(a, elemsize, key, keysize, keyoffset, mode, bucket->index[i])) { // equal hashes can still be different keys
+          return (pos & ~STBDS_BUCKET_MASK)+i;
+        }
+      } else if (bucket->hash[i] == STBDS_HASH_EMPTY) {
+        return -1; // a never-used slot ends the probe sequence; tombstones do not
+      }
+    }
+
+    // search from beginning of bucket to pos
+    limit = pos & STBDS_BUCKET_MASK;
+    for (i = 0; i < limit; ++i) {
+      if (bucket->hash[i] == hash) {
+        if (stbds_is_key_equal(a, elemsize, key, keysize, keyoffset, mode, bucket->index[i])) {
+          return (pos & ~STBDS_BUCKET_MASK)+i;
+        }
+      } else if (bucket->hash[i] == STBDS_HASH_EMPTY) {
+        return -1;
+      }
+    }
+
+    // quadratic probing
+    pos += step;
+    step += STBDS_BUCKET_LENGTH;
+    pos &= (table->slot_count-1); // wrap around the table
+  }
+  /* NOTREACHED */
+}
+
+void * stbds_hmget_key_ts(void *a, size_t elemsize, void *key, size_t keysize, ptrdiff_t *temp, int mode)
+{
+  // Looks up key and reports the result through *temp: the element index of the
+  // match, or STBDS_INDEX_EMPTY when key is absent.
+  // Returns the map pointer; a NULL map is replaced by one holding only the default element.
+  stbds_hash_index *index;
+  ptrdiff_t slot;
+  if (a == NULL) {
+    // allocate just the zeroed default element so the caller has something to read
+    void *raw = stbds_arrgrowf(0, elemsize, 0, 1);
+    stbds_header(raw)->length += 1;
+    memset(raw, 0, elemsize);
+    *temp = STBDS_INDEX_EMPTY;
+    return STBDS_ARR_TO_HASH(raw,elemsize); // callers hold the address just past the default element
+  }
+
+  index = stbds_hash_table(STBDS_HASH_TO_ARR(a,elemsize));
+  if (index == NULL) {
+    // no hash index exists, so there is nothing to search
+    *temp = -1;
+    return a;
+  }
+
+  slot = stbds_hm_find_slot(a, elemsize, key, keysize, 0, mode);
+  if (slot < 0)
+    *temp = STBDS_INDEX_EMPTY;
+  else
+    *temp = index->storage[slot >> STBDS_BUCKET_SHIFT].index[slot & STBDS_BUCKET_MASK];
+  return a;
+}
+
+void * stbds_hmget_key(void *a, size_t elemsize, void *key, size_t keysize, int mode)
+{
+  ptrdiff_t found;
+  void *map = stbds_hmget_key_ts(a, elemsize, key, keysize, &found, mode);
+  stbds_temp(STBDS_HASH_TO_ARR(map,elemsize)) = found; // the lookup result is stored on the array itself via stbds_temp()
+  return map;
+}
+
+void * stbds_hmput_default(void *a, size_t elemsize)
+{
+  // Makes sure the default element (raw element 0) exists. Three situations:
+  //   a is NULL                                              -> allocate the array
+  //   a has a hash table but no entries, because of shmode   -> grow it
+  //   a has entries                                          -> return it untouched
+  void *raw = a ? STBDS_HASH_TO_ARR(a,elemsize) : NULL;
+  if (a != NULL && stbds_header(raw)->length != 0)
+    return a;
+  raw = stbds_arrgrowf(raw, elemsize, 0, 1);
+  stbds_header(raw)->length += 1;
+  memset(raw, 0, elemsize);
+  return STBDS_ARR_TO_HASH(raw,elemsize);
+}
+
+static char *stbds_strdup(char *str);
+
+void *stbds_hmput_key(void *a, size_t elemsize, void *key, size_t keysize, int mode)
+{
+  size_t keyoffset=0;
+  void *raw_a;
+  stbds_hash_index *table;
+  // Finds key or appends a new element for it, and returns the (possibly reallocated) map pointer.
+  // The element's index is left in stbds_temp(); for string maps the stored key pointer is left in stbds_temp_key().
+  if (a == NULL) {
+    a = stbds_arrgrowf(0, elemsize, 0, 1);
+    memset(a, 0, elemsize);
+    stbds_header(a)->length += 1;
+    // adjust a to point AFTER the default element
+    a = STBDS_ARR_TO_HASH(a,elemsize);
+  }
+
+  // adjust a to point to the default element
+  raw_a = a; // NOTE: from here on raw_a is the user-visible pointer and a is the raw array (the reverse of the naming used elsewhere)
+  a = STBDS_HASH_TO_ARR(a,elemsize);
+
+  table = (stbds_hash_index *) stbds_header(a)->hash_table;
+  if (table == NULL || table->used_count >= table->used_count_threshold) {
+    stbds_hash_index *nt;
+    size_t slot_count;
+
+    slot_count = (table == NULL) ? STBDS_BUCKET_LENGTH : table->slot_count*2;
+    nt = stbds_make_hash_index(slot_count, table);
+    if (table)
+      STBDS_FREE(NULL, table);
+    else
+      nt->string.mode = mode >= STBDS_HM_STRING ? STBDS_SH_DEFAULT : 0;
+    stbds_header(a)->hash_table = table = nt;
+    STBDS_STATS(++stbds_hash_grow);
+  }
+
+  // we iterate hash table explicitly because we want to track if we saw a tombstone
+  {
+    size_t hash = mode >= STBDS_HM_STRING ? stbds_hash_string((char*)key,table->seed) : stbds_hash_bytes(key, keysize,table->seed);
+    size_t step = STBDS_BUCKET_LENGTH;
+    size_t pos;
+    ptrdiff_t tombstone = -1;
+    stbds_hash_bucket *bucket;
+
+    // stored hash values are forbidden from being 0, so we can detect empty slots to early out quickly
+    if (hash < 2) hash += 2;
+    pos = stbds_probe_position(hash, table->slot_count, table->slot_count_log2);
+    for (;;) {
+      size_t limit, i;
+      STBDS_STATS(++stbds_hash_probes);
+      bucket = &table->storage[pos >> STBDS_BUCKET_SHIFT];
+
+      // start searching from pos to end of bucket
+      for (i=pos & STBDS_BUCKET_MASK; i < STBDS_BUCKET_LENGTH; ++i) {
+        if (bucket->hash[i] == hash) {
+          if (stbds_is_key_equal(raw_a, elemsize, key, keysize, keyoffset, mode, bucket->index[i])) {
+            stbds_temp(a) = bucket->index[i];
+            if (mode >= STBDS_HM_STRING)
+              stbds_temp_key(a) = * (char **) ((char *) raw_a + elemsize*bucket->index[i] + keyoffset);
+            return STBDS_ARR_TO_HASH(a,elemsize);
+          }
+        } else if (bucket->hash[i] == 0) {
+          pos = (pos & ~STBDS_BUCKET_MASK) + i;
+          goto found_empty_slot;
+        } else if (tombstone < 0) {
+          if (bucket->index[i] == STBDS_INDEX_DELETED)
+            tombstone = (ptrdiff_t) ((pos & ~STBDS_BUCKET_MASK) + i);
+        }
+      }
+
+      // search from beginning of bucket to pos; a match here must report exactly what a match in the loop above reports
+      limit = pos & STBDS_BUCKET_MASK;
+      for (i = 0; i < limit; ++i) {
+        if (bucket->hash[i] == hash) {
+          if (stbds_is_key_equal(raw_a, elemsize, key, keysize, keyoffset, mode, bucket->index[i])) {
+            stbds_temp(a) = bucket->index[i];
+            if (mode >= STBDS_HM_STRING) // without this, callers reading stbds_temp_key() see a stale pointer
+              stbds_temp_key(a) = * (char **) ((char *) raw_a + elemsize*bucket->index[i] + keyoffset);
+            return STBDS_ARR_TO_HASH(a,elemsize);
+          }
+        } else if (bucket->hash[i] == 0) {
+          pos = (pos & ~STBDS_BUCKET_MASK) + i;
+          goto found_empty_slot;
+        } else if (tombstone < 0) {
+          if (bucket->index[i] == STBDS_INDEX_DELETED)
+            tombstone = (ptrdiff_t) ((pos & ~STBDS_BUCKET_MASK) + i);
+        }
+      }
+
+      // quadratic probing
+      pos += step;
+      step += STBDS_BUCKET_LENGTH;
+      pos &= (table->slot_count-1);
+    }
+   found_empty_slot:
+    if (tombstone >= 0) { // prefer recycling the first tombstone seen over consuming the empty slot
+      pos = tombstone;
+      --table->tombstone_count;
+    }
+    ++table->used_count;
+
+    {
+      ptrdiff_t i = (ptrdiff_t) stbds_arrlen(a);
+      // we want to do stbds_arraddn(1), but we can't use the macros since we don't have something of the right type
+      if ((size_t) i+1 > stbds_arrcap(a))
+        *(void **) &a = stbds_arrgrowf(a, elemsize, 1, 0);
+      raw_a = STBDS_ARR_TO_HASH(a,elemsize);
+
+      STBDS_ASSERT((size_t) i+1 <= stbds_arrcap(a));
+      stbds_header(a)->length = i+1;
+      bucket = &table->storage[pos >> STBDS_BUCKET_SHIFT];
+      bucket->hash[pos & STBDS_BUCKET_MASK] = hash;
+      bucket->index[pos & STBDS_BUCKET_MASK] = i-1; // i counts the default element; stored indices are relative to the user-visible pointer
+      stbds_temp(a) = i-1;
+
+      switch (table->string.mode) {
+         case STBDS_SH_STRDUP:  stbds_temp_key(a) = *(char **) ((char *) a + elemsize*i) = stbds_strdup((char*) key); break;
+         case STBDS_SH_ARENA:   stbds_temp_key(a) = *(char **) ((char *) a + elemsize*i) = stbds_stralloc(&table->string, (char*)key); break;
+         case STBDS_SH_DEFAULT: stbds_temp_key(a) = *(char **) ((char *) a + elemsize*i) = (char *) key; break;
+         default:                memcpy((char *) a + elemsize*i, key, keysize); break;
+      }
+    }
+    return STBDS_ARR_TO_HASH(a,elemsize);
+  }
+}
+
+void * stbds_shmode_func(size_t elemsize, int mode)
+{
+  void *raw = stbds_arrgrowf(0, elemsize, 0, 1);
+  stbds_hash_index *index = stbds_make_hash_index(STBDS_BUCKET_LENGTH, NULL); // smallest possible table: one bucket
+  memset(raw, 0, elemsize);                                                    // zeroed default element
+  stbds_header(raw)->length = 1;
+  index->string.mode = (unsigned char) mode;                                   // key storage policy is fixed before any insert
+  stbds_header(raw)->hash_table = index;
+  return STBDS_ARR_TO_HASH(raw,elemsize);
+}
+
+void * stbds_hmdel_key(void *a, size_t elemsize, void *key, size_t keysize, size_t keyoffset, int mode) // removes key if present and returns the map pointer (0 for a NULL map); stbds_temp() is left at 1 if something was deleted, else 0
+{
+  if (a == NULL) {
+    return 0;
+  } else {
+    stbds_hash_index *table;
+    void *raw_a = STBDS_HASH_TO_ARR(a,elemsize);
+    table = (stbds_hash_index *) stbds_header(raw_a)->hash_table;
+    stbds_temp(raw_a) = 0;
+    if (table == 0) {
+      return a;
+    } else {
+      ptrdiff_t slot;
+      slot = stbds_hm_find_slot(a, elemsize, key, keysize, keyoffset, mode);
+      if (slot < 0)
+        return a;
+      else {
+        stbds_hash_bucket *b = &table->storage[slot >> STBDS_BUCKET_SHIFT];
+        int i = slot & STBDS_BUCKET_MASK;
+        ptrdiff_t old_index = b->index[i];
+        ptrdiff_t final_index = (ptrdiff_t) stbds_arrlen(raw_a)-1-1; // minus one for the raw_a vs a, and minus one for 'last'
+        STBDS_ASSERT(slot < (ptrdiff_t) table->slot_count);
+        --table->used_count;
+        ++table->tombstone_count;
+        stbds_temp(raw_a) = 1;
+        STBDS_ASSERT(table->used_count >= 0);
+        //STBDS_ASSERT(table->tombstone_count < table->slot_count/4);
+        b->hash[i] = STBDS_HASH_DELETED; // leave a tombstone so probe sequences passing through this slot stay intact
+        b->index[i] = STBDS_INDEX_DELETED;
+
+        if (mode == STBDS_HM_STRING && table->string.mode == STBDS_SH_STRDUP)
+          STBDS_FREE(NULL, *(char**) ((char *) a+elemsize*old_index));
+
+        // if indices are the same, memcpy is a no-op, but back-pointer-fixup will fail, so skip
+        if (old_index != final_index) {
+          // swap delete
+          memmove((char*) a + elemsize*old_index, (char*) a + elemsize*final_index, elemsize);
+
+          // now find the slot for the last element
+          if (mode == STBDS_HM_STRING)
+            slot = stbds_hm_find_slot(a, elemsize, *(char**) ((char *) a+elemsize*old_index + keyoffset), keysize, keyoffset, mode);
+          else
+            slot = stbds_hm_find_slot(a, elemsize,  (char* ) a+elemsize*old_index + keyoffset, keysize, keyoffset, mode);
+          STBDS_ASSERT(slot >= 0);
+          b = &table->storage[slot >> STBDS_BUCKET_SHIFT];
+          i = slot & STBDS_BUCKET_MASK;
+          STBDS_ASSERT(b->index[i] == final_index);
+          b->index[i] = old_index; // the moved element now lives at old_index
+        }
+        stbds_header(raw_a)->length -= 1;
+
+        if (table->used_count < table->used_count_shrink_threshold && table->slot_count > STBDS_BUCKET_LENGTH) {
+          stbds_header(raw_a)->hash_table = stbds_make_hash_index(table->slot_count>>1, table); // halve the table
+          STBDS_FREE(NULL, table);
+          STBDS_STATS(++stbds_hash_shrink);
+        } else if (table->tombstone_count > table->tombstone_count_threshold) {
+          stbds_header(raw_a)->hash_table = stbds_make_hash_index(table->slot_count   , table); // same size; the copy drops the tombstones
+          STBDS_FREE(NULL, table);
+          STBDS_STATS(++stbds_hash_rebuild);
+        }
+
+        return a;
+      }
+    }
+  }
+  /* NOTREACHED */
+}
+
+static char *stbds_strdup(char *str)
+{
+  // hand-rolled instead of calling strdup: keeps the replaceable allocator simple
+  // and sidesteps the strdup vs _strdup naming problem
+  size_t nbytes = strlen(str)+1;   // include the terminating NUL
+  char *copy = (char*) STBDS_REALLOC(NULL, 0, nbytes);
+  memmove(copy, str, nbytes);
+  return copy;
+}
+
+#ifndef STBDS_STRING_ARENA_BLOCKSIZE_MIN
+#define STBDS_STRING_ARENA_BLOCKSIZE_MIN  512u   // size in bytes of the first arena block; may be predefined by the includer
+#endif
+#ifndef STBDS_STRING_ARENA_BLOCKSIZE_MAX
+#define STBDS_STRING_ARENA_BLOCKSIZE_MAX  (1u<<20)   // block size at which the arena stops advancing to larger blocks; may be predefined by the includer
+#endif
+
+char *stbds_stralloc(stbds_string_arena *a, char *str) // copies str (including its NUL) into arena a and returns the copy
+{
+  char *p;
+  size_t len = strlen(str)+1;
+  if (len > a->remaining) {
+    // compute the next blocksize
+    size_t blocksize = a->block;
+
+    // size is 512, 512, 1024, 1024, 2048, 2048, 4096, 4096, etc., so that
+    // there are log(SIZE) allocations to free when we destroy the table
+    blocksize = (size_t) (STBDS_STRING_ARENA_BLOCKSIZE_MIN) << (blocksize>>1);
+
+    // if size is under 1M, advance to next blocktype
+    if (blocksize < (size_t)(STBDS_STRING_ARENA_BLOCKSIZE_MAX))
+      ++a->block;
+
+    if (len > blocksize) {
+      // if string is larger than blocksize, then just allocate the full size.
+      // note that we still advance string_block so block size will continue
+      // increasing, so e.g. if somebody only calls this with 1000-long strings,
+      // eventually the arena will start doubling and handling those as well
+      stbds_string_block *sb = (stbds_string_block *) STBDS_REALLOC(NULL, 0, sizeof(*sb)-8 + len); // NOTE(review): the -8 presumably discounts a storage[8] member declared outside this view - confirm
+      memmove(sb->storage, str, len);
+      if (a->storage) {
+        // insert it after the first element, so that we don't waste the space there
+        sb->next = a->storage->next;
+        a->storage->next = sb;
+      } else {
+        sb->next = 0;
+        a->storage = sb;
+        a->remaining = 0; // this is redundant, but good for clarity
+      }
+      return sb->storage;
+    } else {
+      stbds_string_block *sb = (stbds_string_block *) STBDS_REALLOC(NULL, 0, sizeof(*sb)-8 + blocksize);
+      sb->next = a->storage; // the new block becomes the head of the list and the one allocated from
+      a->storage = sb;
+      a->remaining = blocksize;
+    }
+  }
+
+  STBDS_ASSERT(len <= a->remaining);
+  p = a->storage->storage + a->remaining - len; // strings are carved from the end of the head block's free space
+  a->remaining -= len;
+  memmove(p, str, len);
+  return p;
+}
+
+void stbds_strreset(stbds_string_arena *a)
+{
+  // free every block on the arena's list, then zero the arena itself
+  stbds_string_block *blk = a->storage;
+  while (blk != NULL) {
+    stbds_string_block *following = blk->next;   // read the link before blk is released
+    STBDS_FREE(NULL, blk);
+    blk = following;
+  }
+  memset(a, 0, sizeof(*a));
+}
+
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//   UNIT TESTS
+//
+
+#ifdef STBDS_UNIT_TESTS
+#include <stdio.h>
+#ifdef STBDS_ASSERT_WAS_UNDEFINED
+#undef STBDS_ASSERT
+#endif
+#ifndef STBDS_ASSERT
+#define STBDS_ASSERT assert
+#include <assert.h>
+#endif
+
+typedef struct { int key,b,c,d; } stbds_struct;       // unit-test record with a single int key field
+typedef struct { int key[2],b,c,d; } stbds_struct2;   // unit-test record whose key field is an int[2]
+
+static char buffer[256]; // scratch buffer shared by every strkey() call
+char *strkey(int n) // formats n as "test_<n>" into buffer and returns buffer, so each call overwrites the previous result
+{
+#if defined(_WIN32) && defined(__STDC_WANT_SECURE_LIB__)
+   sprintf_s(buffer, sizeof(buffer), "test_%d", n);
+#else
+   sprintf(buffer, "test_%d", n);
+#endif
+   return buffer;
+}
+
+void stbds_unit_tests(void)
+{
+#if defined(_MSC_VER) && _MSC_VER <= 1200 && defined(__cplusplus)
+  // VC6 C++ doesn't like the template<> trick on unnamed structures, so do nothing!
+  STBDS_ASSERT(0);
+#else
+  const int testsize = 100000;
+  const int testsize2 = testsize/20;
+  int *arr=NULL;
+  struct { int   key;        int value; }  *intmap  = NULL;
+  struct { char *key;        int value; }  *strmap  = NULL, s;
+  struct { stbds_struct key; int value; }  *map     = NULL;
+  stbds_struct                             *map2    = NULL;
+  stbds_struct2                            *map3    = NULL;
+  stbds_string_arena                        sa      = { 0 };
+  int key3[2] = { 1,2 };
+  ptrdiff_t temp;
+
+  int i,j;
+
+  STBDS_ASSERT(arrlen(arr)==0);
+  for (i=0; i < 20000; i += 50) {
+    for (j=0; j < i; ++j)
+      arrpush(arr,j);
+    arrfree(arr);
+  }
+
+  for (i=0; i < 4; ++i) {
+    arrpush(arr,1); arrpush(arr,2); arrpush(arr,3); arrpush(arr,4);
+    arrdel(arr,i);
+    arrfree(arr);
+    arrpush(arr,1); arrpush(arr,2); arrpush(arr,3); arrpush(arr,4);
+    arrdelswap(arr,i);
+    arrfree(arr);
+  }
+
+  for (i=0; i < 5; ++i) {
+    arrpush(arr,1); arrpush(arr,2); arrpush(arr,3); arrpush(arr,4);
+    stbds_arrins(arr,i,5);
+    STBDS_ASSERT(arr[i] == 5);
+    if (i < 4)
+      STBDS_ASSERT(arr[4] == 4);
+    arrfree(arr);
+  }
+
+  i = 1;
+  STBDS_ASSERT(hmgeti(intmap,i) == -1);
+  hmdefault(intmap, -2);
+  STBDS_ASSERT(hmgeti(intmap, i) == -1);
+  STBDS_ASSERT(hmget (intmap, i) == -2);
+  for (i=0; i < testsize; i+=2)
+    hmput(intmap, i, i*5);
+  for (i=0; i < testsize; i+=1) {
+    if (i & 1) STBDS_ASSERT(hmget(intmap, i) == -2 );
+    else       STBDS_ASSERT(hmget(intmap, i) == i*5);
+    if (i & 1) STBDS_ASSERT(hmget_ts(intmap, i, temp) == -2 );
+    else       STBDS_ASSERT(hmget_ts(intmap, i, temp) == i*5);
+  }
+  for (i=0; i < testsize; i+=2)
+    hmput(intmap, i, i*3);
+  for (i=0; i < testsize; i+=1)
+    if (i & 1) STBDS_ASSERT(hmget(intmap, i) == -2 );
+    else       STBDS_ASSERT(hmget(intmap, i) == i*3);
+  for (i=2; i < testsize; i+=4)
+    hmdel(intmap, i); // delete half the entries
+  for (i=0; i < testsize; i+=1)
+    if (i & 3) STBDS_ASSERT(hmget(intmap, i) == -2 );
+    else       STBDS_ASSERT(hmget(intmap, i) == i*3);
+  for (i=0; i < testsize; i+=1)
+    hmdel(intmap, i); // delete the rest of the entries
+  for (i=0; i < testsize; i+=1)
+    STBDS_ASSERT(hmget(intmap, i) == -2 );
+  hmfree(intmap);
+  for (i=0; i < testsize; i+=2)
+    hmput(intmap, i, i*3);
+  hmfree(intmap);
+
+  #if defined(__clang__) || defined(__GNUC__)
+  #ifndef __cplusplus
+  intmap = NULL;
+  hmput(intmap, 15, 7);
+  hmput(intmap, 11, 3);
+  hmput(intmap,  9, 5);
+  STBDS_ASSERT(hmget(intmap, 9) == 5);
+  STBDS_ASSERT(hmget(intmap, 11) == 3);
+  STBDS_ASSERT(hmget(intmap, 15) == 7);
+  #endif
+  #endif
+
+  for (i=0; i < testsize; ++i)
+    stralloc(&sa, strkey(i));
+  strreset(&sa);
+
+  {
+    s.key = "a", s.value = 1;
+    shputs(strmap, s);
+    STBDS_ASSERT(*strmap[0].key == 'a');
+    STBDS_ASSERT(strmap[0].key == s.key);
+    STBDS_ASSERT(strmap[0].value == s.value);
+    shfree(strmap);
+  }
+
+  {
+    s.key = "a", s.value = 1;
+    sh_new_strdup(strmap);
+    shputs(strmap, s);
+    STBDS_ASSERT(*strmap[0].key == 'a');
+    STBDS_ASSERT(strmap[0].key != s.key);
+    STBDS_ASSERT(strmap[0].value == s.value);
+    shfree(strmap);
+  }
+
+  {
+    s.key = "a", s.value = 1;
+    sh_new_arena(strmap);
+    shputs(strmap, s);
+    STBDS_ASSERT(*strmap[0].key == 'a');
+    STBDS_ASSERT(strmap[0].key != s.key);
+    STBDS_ASSERT(strmap[0].value == s.value);
+    shfree(strmap);
+  }
+
+  for (j=0; j < 2; ++j) {
+    STBDS_ASSERT(shgeti(strmap,"foo") == -1);
+    if (j == 0)
+      sh_new_strdup(strmap);
+    else
+      sh_new_arena(strmap);
+    STBDS_ASSERT(shgeti(strmap,"foo") == -1);
+    shdefault(strmap, -2);
+    STBDS_ASSERT(shgeti(strmap,"foo") == -1);
+    for (i=0; i < testsize; i+=2)
+      shput(strmap, strkey(i), i*3);
+    for (i=0; i < testsize; i+=1)
+      if (i & 1) STBDS_ASSERT(shget(strmap, strkey(i)) == -2 );
+      else       STBDS_ASSERT(shget(strmap, strkey(i)) == i*3);
+    for (i=2; i < testsize; i+=4)
+      shdel(strmap, strkey(i)); // delete half the entries
+    for (i=0; i < testsize; i+=1)
+      if (i & 3) STBDS_ASSERT(shget(strmap, strkey(i)) == -2 );
+      else       STBDS_ASSERT(shget(strmap, strkey(i)) == i*3);
+    for (i=0; i < testsize; i+=1)
+      shdel(strmap, strkey(i)); // delete the rest of the entries
+    for (i=0; i < testsize; i+=1)
+      STBDS_ASSERT(shget(strmap, strkey(i)) == -2 );
+    shfree(strmap);
+  }
+
+  {
+    struct { char *key; char value; } *hash = NULL;
+    char name[4] = "jen";
+    shput(hash, "bob"   , 'h');
+    shput(hash, "sally" , 'e');
+    shput(hash, "fred"  , 'l');
+    shput(hash, "jen"   , 'x');
+    shput(hash, "doug"  , 'o');
+
+    shput(hash, name    , 'l');
+    shfree(hash);
+  }
+
+  for (i=0; i < testsize; i += 2) {
+    stbds_struct s = { i,i*2,i*3,i*4 };
+    hmput(map, s, i*5);
+  }
+
+  for (i=0; i < testsize; i += 1) {
+    stbds_struct s = { i,i*2,i*3  ,i*4 };
+    stbds_struct t = { i,i*2,i*3+1,i*4 };
+    if (i & 1) STBDS_ASSERT(hmget(map, s) == 0);
+    else       STBDS_ASSERT(hmget(map, s) == i*5);
+    if (i & 1) STBDS_ASSERT(hmget_ts(map, s, temp) == 0);
+    else       STBDS_ASSERT(hmget_ts(map, s, temp) == i*5);
+    //STBDS_ASSERT(hmget(map, t.key) == 0);
+  }
+
+  for (i=0; i < testsize; i += 2) {
+    stbds_struct s = { i,i*2,i*3,i*4 };
+    hmputs(map2, s);
+  }
+  hmfree(map);
+
+  for (i=0; i < testsize; i += 1) {
+    stbds_struct s = { i,i*2,i*3,i*4 };
+    stbds_struct t = { i,i*2,i*3+1,i*4 };
+    if (i & 1) STBDS_ASSERT(hmgets(map2, s.key).d == 0);
+    else       STBDS_ASSERT(hmgets(map2, s.key).d == i*4);
+    //STBDS_ASSERT(hmgetp(map2, t.key) == 0);
+  }
+  hmfree(map2);
+
+  for (i=0; i < testsize; i += 2) {
+    stbds_struct2 s = { { i,i*2 }, i*3,i*4, i*5 };
+    hmputs(map3, s);
+  }
+  for (i=0; i < testsize; i += 1) {
+    stbds_struct2 s = { { i,i*2}, i*3, i*4, i*5 };
+    stbds_struct2 t = { { i,i*2}, i*3+1, i*4, i*5 };
+    if (i & 1) STBDS_ASSERT(hmgets(map3, s.key).d == 0);
+    else       STBDS_ASSERT(hmgets(map3, s.key).d == i*5);
+    //STBDS_ASSERT(hmgetp(map3, t.key) == 0);
+  }
+#endif
+}
+#endif
+
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2019 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_dxt.h b/lib/stb/stb_dxt.h
new file mode 100644
index 0000000..6150a87
--- /dev/null
+++ b/lib/stb/stb_dxt.h
@@ -0,0 +1,719 @@
+// stb_dxt.h - v1.12 - DXT1/DXT5 compressor - public domain
+// original by fabian "ryg" giesen - ported to C by stb
+// use '#define STB_DXT_IMPLEMENTATION' before including to create the implementation
+//
+// USAGE:
+//   call stb_compress_dxt_block() for every block (you must pad)
+//     source should be a 4x4 block of RGBA data in row-major order;
+//     Alpha channel is not stored if you specify alpha=0 (but you
+//     must supply some constant alpha in the alpha channel).
+//     You can turn on dithering and "high quality" using mode.
+//
+// version history:
+//   v1.12  - (ryg) fix bug in single-color table generator
+//   v1.11  - (ryg) avoid racy global init, better single-color tables, remove dither
+//   v1.10  - (i.c) various small quality improvements
+//   v1.09  - (stb) update documentation re: surprising alpha channel requirement
+//   v1.08  - (stb) fix bug in dxt-with-alpha block
+//   v1.07  - (stb) bc4; allow not using libc; add STB_DXT_STATIC
+//   v1.06  - (stb) fix to known-broken 1.05
+//   v1.05  - (stb) support bc5/3dc (Arvids Kokins), use extern "C" in C++ (Pavel Krajcevski)
+//   v1.04  - (ryg) default to no rounding bias for lerped colors (as per S3TC/DX10 spec);
+//            single color match fix (allow for inexact color interpolation);
+//            optimal DXT5 index finder; "high quality" mode that runs multiple refinement steps.
+//   v1.03  - (stb) endianness support
+//   v1.02  - (stb) fix alpha encoding bug
+//   v1.01  - (stb) fix bug converting to RGB that messed up quality, thanks ryg & cbloom
+//   v1.00  - (stb) first release
+//
+// contributors:
+//   Rich Geldreich (more accurate index selection)
+//   Kevin Schmidt (#defines for "freestanding" compilation)
+//   github:ppiastucki (BC4 support)
+//   Ignacio Castano - improve DXT endpoint quantization
+//   Alan Hickman - static table initialization
+//
+// LICENSE
+//
+//   See end of file for license information.
+
+#ifndef STB_INCLUDE_STB_DXT_H
+#define STB_INCLUDE_STB_DXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef STB_DXT_STATIC
+#define STBDDEF static
+#else
+#define STBDDEF extern
+#endif
+
+// compression mode (bitflags)
+#define STB_DXT_NORMAL    0
+#define STB_DXT_DITHER    1   // use dithering. was always dubious, now deprecated. does nothing!
+#define STB_DXT_HIGHQUAL  2   // high quality mode, does two refinement steps instead of 1. ~30-40% slower.
+
+STBDDEF void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src_rgba_four_bytes_per_pixel, int alpha, int mode);
+STBDDEF void stb_compress_bc4_block(unsigned char *dest, const unsigned char *src_r_one_byte_per_pixel);
+STBDDEF void stb_compress_bc5_block(unsigned char *dest, const unsigned char *src_rg_two_byte_per_pixel);
+
+#define STB_COMPRESS_DXT_BLOCK
+
+#ifdef __cplusplus
+}
+#endif
+#endif // STB_INCLUDE_STB_DXT_H
+
+#ifdef STB_DXT_IMPLEMENTATION
+
+// configuration options for DXT encoder. set them in the project/makefile or just define
+// them at the top.
+
+// STB_DXT_USE_ROUNDING_BIAS
+//     use a rounding bias during color interpolation. this is closer to what "ideal"
+//     interpolation would do but doesn't match the S3TC/DX10 spec. old versions (pre-1.03)
+//     implicitly had this turned on.
+//
+//     in case you're targeting a specific type of hardware (e.g. console programmers):
+//     NVidia and Intel GPUs (as of 2010) as well as DX9 ref use DXT decoders that are closer
+//     to STB_DXT_USE_ROUNDING_BIAS. AMD/ATI, S3 and DX10 ref are closer to rounding with no bias.
+//     you also see "(a*5 + b*3) / 8" on some old GPU designs.
+// #define STB_DXT_USE_ROUNDING_BIAS
+
+#include <stdlib.h>
+
+#if !defined(STBD_FABS)
+#include <math.h>
+#endif
+
+#ifndef STBD_FABS
+#define STBD_FABS(x)          fabs(x)
+#endif
+
+static const unsigned char stb__OMatch5[256][2] = {
+   {  0,  0 }, {  0,  0 }, {  0,  1 }, {  0,  1 }, {  1,  0 }, {  1,  0 }, {  1,  0 }, {  1,  1 },
+   {  1,  1 }, {  1,  1 }, {  1,  2 }, {  0,  4 }, {  2,  1 }, {  2,  1 }, {  2,  1 }, {  2,  2 },
+   {  2,  2 }, {  2,  2 }, {  2,  3 }, {  1,  5 }, {  3,  2 }, {  3,  2 }, {  4,  0 }, {  3,  3 },
+   {  3,  3 }, {  3,  3 }, {  3,  4 }, {  3,  4 }, {  3,  4 }, {  3,  5 }, {  4,  3 }, {  4,  3 },
+   {  5,  2 }, {  4,  4 }, {  4,  4 }, {  4,  5 }, {  4,  5 }, {  5,  4 }, {  5,  4 }, {  5,  4 },
+   {  6,  3 }, {  5,  5 }, {  5,  5 }, {  5,  6 }, {  4,  8 }, {  6,  5 }, {  6,  5 }, {  6,  5 },
+   {  6,  6 }, {  6,  6 }, {  6,  6 }, {  6,  7 }, {  5,  9 }, {  7,  6 }, {  7,  6 }, {  8,  4 },
+   {  7,  7 }, {  7,  7 }, {  7,  7 }, {  7,  8 }, {  7,  8 }, {  7,  8 }, {  7,  9 }, {  8,  7 },
+   {  8,  7 }, {  9,  6 }, {  8,  8 }, {  8,  8 }, {  8,  9 }, {  8,  9 }, {  9,  8 }, {  9,  8 },
+   {  9,  8 }, { 10,  7 }, {  9,  9 }, {  9,  9 }, {  9, 10 }, {  8, 12 }, { 10,  9 }, { 10,  9 },
+   { 10,  9 }, { 10, 10 }, { 10, 10 }, { 10, 10 }, { 10, 11 }, {  9, 13 }, { 11, 10 }, { 11, 10 },
+   { 12,  8 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 11, 12 }, { 11, 12 }, { 11, 12 }, { 11, 13 },
+   { 12, 11 }, { 12, 11 }, { 13, 10 }, { 12, 12 }, { 12, 12 }, { 12, 13 }, { 12, 13 }, { 13, 12 },
+   { 13, 12 }, { 13, 12 }, { 14, 11 }, { 13, 13 }, { 13, 13 }, { 13, 14 }, { 12, 16 }, { 14, 13 },
+   { 14, 13 }, { 14, 13 }, { 14, 14 }, { 14, 14 }, { 14, 14 }, { 14, 15 }, { 13, 17 }, { 15, 14 },
+   { 15, 14 }, { 16, 12 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 16 }, { 15, 16 }, { 15, 16 },
+   { 15, 17 }, { 16, 15 }, { 16, 15 }, { 17, 14 }, { 16, 16 }, { 16, 16 }, { 16, 17 }, { 16, 17 },
+   { 17, 16 }, { 17, 16 }, { 17, 16 }, { 18, 15 }, { 17, 17 }, { 17, 17 }, { 17, 18 }, { 16, 20 },
+   { 18, 17 }, { 18, 17 }, { 18, 17 }, { 18, 18 }, { 18, 18 }, { 18, 18 }, { 18, 19 }, { 17, 21 },
+   { 19, 18 }, { 19, 18 }, { 20, 16 }, { 19, 19 }, { 19, 19 }, { 19, 19 }, { 19, 20 }, { 19, 20 },
+   { 19, 20 }, { 19, 21 }, { 20, 19 }, { 20, 19 }, { 21, 18 }, { 20, 20 }, { 20, 20 }, { 20, 21 },
+   { 20, 21 }, { 21, 20 }, { 21, 20 }, { 21, 20 }, { 22, 19 }, { 21, 21 }, { 21, 21 }, { 21, 22 },
+   { 20, 24 }, { 22, 21 }, { 22, 21 }, { 22, 21 }, { 22, 22 }, { 22, 22 }, { 22, 22 }, { 22, 23 },
+   { 21, 25 }, { 23, 22 }, { 23, 22 }, { 24, 20 }, { 23, 23 }, { 23, 23 }, { 23, 23 }, { 23, 24 },
+   { 23, 24 }, { 23, 24 }, { 23, 25 }, { 24, 23 }, { 24, 23 }, { 25, 22 }, { 24, 24 }, { 24, 24 },
+   { 24, 25 }, { 24, 25 }, { 25, 24 }, { 25, 24 }, { 25, 24 }, { 26, 23 }, { 25, 25 }, { 25, 25 },
+   { 25, 26 }, { 24, 28 }, { 26, 25 }, { 26, 25 }, { 26, 25 }, { 26, 26 }, { 26, 26 }, { 26, 26 },
+   { 26, 27 }, { 25, 29 }, { 27, 26 }, { 27, 26 }, { 28, 24 }, { 27, 27 }, { 27, 27 }, { 27, 27 },
+   { 27, 28 }, { 27, 28 }, { 27, 28 }, { 27, 29 }, { 28, 27 }, { 28, 27 }, { 29, 26 }, { 28, 28 },
+   { 28, 28 }, { 28, 29 }, { 28, 29 }, { 29, 28 }, { 29, 28 }, { 29, 28 }, { 30, 27 }, { 29, 29 },
+   { 29, 29 }, { 29, 30 }, { 29, 30 }, { 30, 29 }, { 30, 29 }, { 30, 29 }, { 30, 30 }, { 30, 30 },
+   { 30, 30 }, { 30, 31 }, { 30, 31 }, { 31, 30 }, { 31, 30 }, { 31, 30 }, { 31, 31 }, { 31, 31 },
+};
+static const unsigned char stb__OMatch6[256][2] = {
+   {  0,  0 }, {  0,  1 }, {  1,  0 }, {  1,  1 }, {  1,  1 }, {  1,  2 }, {  2,  1 }, {  2,  2 },
+   {  2,  2 }, {  2,  3 }, {  3,  2 }, {  3,  3 }, {  3,  3 }, {  3,  4 }, {  4,  3 }, {  4,  4 },
+   {  4,  4 }, {  4,  5 }, {  5,  4 }, {  5,  5 }, {  5,  5 }, {  5,  6 }, {  6,  5 }, {  6,  6 },
+   {  6,  6 }, {  6,  7 }, {  7,  6 }, {  7,  7 }, {  7,  7 }, {  7,  8 }, {  8,  7 }, {  8,  8 },
+   {  8,  8 }, {  8,  9 }, {  9,  8 }, {  9,  9 }, {  9,  9 }, {  9, 10 }, { 10,  9 }, { 10, 10 },
+   { 10, 10 }, { 10, 11 }, { 11, 10 }, {  8, 16 }, { 11, 11 }, { 11, 12 }, { 12, 11 }, {  9, 17 },
+   { 12, 12 }, { 12, 13 }, { 13, 12 }, { 11, 16 }, { 13, 13 }, { 13, 14 }, { 14, 13 }, { 12, 17 },
+   { 14, 14 }, { 14, 15 }, { 15, 14 }, { 14, 16 }, { 15, 15 }, { 15, 16 }, { 16, 14 }, { 16, 15 },
+   { 17, 14 }, { 16, 16 }, { 16, 17 }, { 17, 16 }, { 18, 15 }, { 17, 17 }, { 17, 18 }, { 18, 17 },
+   { 20, 14 }, { 18, 18 }, { 18, 19 }, { 19, 18 }, { 21, 15 }, { 19, 19 }, { 19, 20 }, { 20, 19 },
+   { 20, 20 }, { 20, 20 }, { 20, 21 }, { 21, 20 }, { 21, 21 }, { 21, 21 }, { 21, 22 }, { 22, 21 },
+   { 22, 22 }, { 22, 22 }, { 22, 23 }, { 23, 22 }, { 23, 23 }, { 23, 23 }, { 23, 24 }, { 24, 23 },
+   { 24, 24 }, { 24, 24 }, { 24, 25 }, { 25, 24 }, { 25, 25 }, { 25, 25 }, { 25, 26 }, { 26, 25 },
+   { 26, 26 }, { 26, 26 }, { 26, 27 }, { 27, 26 }, { 24, 32 }, { 27, 27 }, { 27, 28 }, { 28, 27 },
+   { 25, 33 }, { 28, 28 }, { 28, 29 }, { 29, 28 }, { 27, 32 }, { 29, 29 }, { 29, 30 }, { 30, 29 },
+   { 28, 33 }, { 30, 30 }, { 30, 31 }, { 31, 30 }, { 30, 32 }, { 31, 31 }, { 31, 32 }, { 32, 30 },
+   { 32, 31 }, { 33, 30 }, { 32, 32 }, { 32, 33 }, { 33, 32 }, { 34, 31 }, { 33, 33 }, { 33, 34 },
+   { 34, 33 }, { 36, 30 }, { 34, 34 }, { 34, 35 }, { 35, 34 }, { 37, 31 }, { 35, 35 }, { 35, 36 },
+   { 36, 35 }, { 36, 36 }, { 36, 36 }, { 36, 37 }, { 37, 36 }, { 37, 37 }, { 37, 37 }, { 37, 38 },
+   { 38, 37 }, { 38, 38 }, { 38, 38 }, { 38, 39 }, { 39, 38 }, { 39, 39 }, { 39, 39 }, { 39, 40 },
+   { 40, 39 }, { 40, 40 }, { 40, 40 }, { 40, 41 }, { 41, 40 }, { 41, 41 }, { 41, 41 }, { 41, 42 },
+   { 42, 41 }, { 42, 42 }, { 42, 42 }, { 42, 43 }, { 43, 42 }, { 40, 48 }, { 43, 43 }, { 43, 44 },
+   { 44, 43 }, { 41, 49 }, { 44, 44 }, { 44, 45 }, { 45, 44 }, { 43, 48 }, { 45, 45 }, { 45, 46 },
+   { 46, 45 }, { 44, 49 }, { 46, 46 }, { 46, 47 }, { 47, 46 }, { 46, 48 }, { 47, 47 }, { 47, 48 },
+   { 48, 46 }, { 48, 47 }, { 49, 46 }, { 48, 48 }, { 48, 49 }, { 49, 48 }, { 50, 47 }, { 49, 49 },
+   { 49, 50 }, { 50, 49 }, { 52, 46 }, { 50, 50 }, { 50, 51 }, { 51, 50 }, { 53, 47 }, { 51, 51 },
+   { 51, 52 }, { 52, 51 }, { 52, 52 }, { 52, 52 }, { 52, 53 }, { 53, 52 }, { 53, 53 }, { 53, 53 },
+   { 53, 54 }, { 54, 53 }, { 54, 54 }, { 54, 54 }, { 54, 55 }, { 55, 54 }, { 55, 55 }, { 55, 55 },
+   { 55, 56 }, { 56, 55 }, { 56, 56 }, { 56, 56 }, { 56, 57 }, { 57, 56 }, { 57, 57 }, { 57, 57 },
+   { 57, 58 }, { 58, 57 }, { 58, 58 }, { 58, 58 }, { 58, 59 }, { 59, 58 }, { 59, 59 }, { 59, 59 },
+   { 59, 60 }, { 60, 59 }, { 60, 60 }, { 60, 60 }, { 60, 61 }, { 61, 60 }, { 61, 61 }, { 61, 61 },
+   { 61, 62 }, { 62, 61 }, { 62, 62 }, { 62, 62 }, { 62, 63 }, { 63, 62 }, { 63, 63 }, { 63, 63 },
+};
+
+static int stb__Mul8Bit(int a, int b)
+{
+  int biased = 128 + a*b;                 // product with a rounding offset of 128 added
+  return ((biased >> 8) + biased) >> 8;   // shift-only scaling of the product back to 8-bit range
+}
+
+static void stb__From16Bit(unsigned char *out, unsigned short v)
+{
+   // split the packed 5:6:5 value into its three fields
+   int red5   = (v >> 11) & 0x1f;
+   int green6 = (v >>  5) & 0x3f;
+   int blue5  =  v        & 0x1f;
+   // widen each field to 8 bits via bit replication; the fourth output byte is cleared
+   out[0] = (red5   * 33) >> 2;
+   out[1] = (green6 * 65) >> 4;
+   out[2] = (blue5  * 33) >> 2;
+   out[3] = 0;
+}
+
+static unsigned short stb__As16Bit(int r, int g, int b)
+{  // scale r and b by 31 and g by 63 through stb__Mul8Bit, then pack with r shifted by 11 and g by 5
+   return (unsigned short)(stb__Mul8Bit(b,31) + (stb__Mul8Bit(g,63) << 5) + (stb__Mul8Bit(r,31) << 11));
+}
+
+// returns the value one third of the way from a to b; the rounding flavour is picked at compile time
+static int stb__Lerp13(int a, int b)
+{
+#ifndef STB_DXT_USE_ROUNDING_BIAS
+   // default: plain integer division, no rounding bias
+   // ("/ 3" can be replaced by "* 0xaaab) >> 17" if the divide turns out to be too slow)
+   return (a + a + b) / 3;
+#else
+   // biased variant: scale the difference by 0x55 through stb__Mul8Bit
+   return stb__Mul8Bit(b-a, 0x55) + a;
+#endif
+}
+
+// lerp the first three bytes of p1 towards p2 with stb__Lerp13; only out[0..2] are written
+static void stb__Lerp13RGB(unsigned char *out, unsigned char *p1, unsigned char *p2)
+{
+   int ch;
+   for (ch = 0; ch < 3; ++ch)
+      out[ch] = (unsigned char)stb__Lerp13(p1[ch], p2[ch]);
+}
+
+/****************************************************************************/
+
+static void stb__EvalColors(unsigned char *color,unsigned short c0,unsigned short c1)
+{
+   unsigned char *first = color, *second = color + 4; // entries 0 and 1 hold the two decoded endpoints
+   stb__From16Bit(first, c0); stb__From16Bit(second, c1);
+   stb__Lerp13RGB(color +  8, first, second);  // entry 2: one third of the way from c0 to c1
+   stb__Lerp13RGB(color + 12, second, first);  // entry 3: one third of the way from c1 to c0
+}
+
+// The color matching function: picks one of the 4 entries of color for each of the 16 pixels in block and returns the 2-bit choices packed into one word (pixel 0 in the lowest bits)
+static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *color)
+{
+   unsigned int packed = 0;
+   int axis[3];              // per-channel difference: palette entry 0 minus palette entry 1
+   int pixelDot[16];         // every pixel projected onto axis
+   int paletteDot[4];        // every palette entry projected onto axis
+   int lowCut, midCut, highCut, px, ch;
+
+   for (ch = 0; ch < 3; ++ch)
+      axis[ch] = color[0*4+ch] - color[1*4+ch];
+
+   for (px = 0; px < 16; ++px)
+      pixelDot[px] = block[px*4+0]*axis[0] + block[px*4+1]*axis[1] + block[px*4+2]*axis[2];
+
+   for (px = 0; px < 4; ++px)
+      paletteDot[px] = color[px*4+0]*axis[0] + color[px*4+1]*axis[1] + color[px*4+2]*axis[2];
+
+   // picture the palette laid out along a line: every pixel is projected onto that line and
+   // given the nearest palette entry. one crossover separates the two halves of the line, and
+   // one further crossover inside each half separates the two entries that live there.
+   //
+   // this 1d approximation is not always optimal in terms of euclidean distance,
+   // but it's very close and a lot faster.
+   // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html
+   // (each crossover is the sum of two palette projections, so pixels are compared at twice their projection)
+   lowCut  = paletteDot[1] + paletteDot[3];
+   midCut  = paletteDot[3] + paletteDot[2];
+   highCut = paletteDot[2] + paletteDot[0];
+
+   for (px = 15; px >= 0; --px) {
+      int doubled = pixelDot[px]*2;
+      unsigned int code;
+      if (doubled >= midCut)
+         code = (doubled < highCut) ? 2 : 0;
+      else
+         code = (doubled < lowCut) ? 1 : 3;
+      packed = (packed << 2) | code;
+   }
+
+   return packed;
+}
+
+// The color optimization function. (Clever code, part 1) Picks two pixels of the 16-pixel RGBA block as endpoints and stores them, packed by stb__As16Bit, in *pmax16 and *pmin16.
+static void stb__OptimizeColorsBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16)
+{
+  int mind,maxd;
+  unsigned char *minp, *maxp;
+  double magn;
+  int v_r,v_g,v_b;
+  static const int nIterPower = 4; // number of power-iteration steps run below
+  float covf[6],vfr,vfg,vfb;
+
+  // determine color distribution: rounded mean, minimum and maximum of each of the three channels
+  int cov[6];
+  int mu[3],min[3],max[3];
+  int ch,i,iter;
+
+  for(ch=0;ch<3;ch++)
+  {
+    const unsigned char *bp = ((const unsigned char *) block) + ch;
+    int muv,minv,maxv;
+
+    muv = minv = maxv = bp[0];
+    for(i=4;i<64;i+=4) // step of 4 bytes: same channel of the next pixel
+    {
+      muv += bp[i];
+      if (bp[i] < minv) minv = bp[i];
+      else if (bp[i] > maxv) maxv = bp[i];
+    }
+
+    mu[ch] = (muv + 8) >> 4; // sum of 16 samples divided by 16, rounded
+    min[ch] = minv;
+    max[ch] = maxv;
+  }
+
+  // determine covariance matrix (symmetric, so only six sums are kept: rr, rg, rb, gg, gb, bb)
+  for (i=0;i<6;i++)
+     cov[i] = 0;
+
+  for (i=0;i<16;i++)
+  {
+    int r = block[i*4+0] - mu[0];
+    int g = block[i*4+1] - mu[1];
+    int b = block[i*4+2] - mu[2];
+
+    cov[0] += r*r;
+    cov[1] += r*g;
+    cov[2] += r*b;
+    cov[3] += g*g;
+    cov[4] += g*b;
+    cov[5] += b*b;
+  }
+
+  // convert covariance matrix to float, find principal axis via power iter
+  for(i=0;i<6;i++)
+    covf[i] = cov[i] / 255.0f;
+
+  vfr = (float) (max[0] - min[0]); // starting vector for the iteration: the per-channel range
+  vfg = (float) (max[1] - min[1]);
+  vfb = (float) (max[2] - min[2]);
+
+  for(iter=0;iter<nIterPower;iter++)
+  {
+    float r = vfr*covf[0] + vfg*covf[1] + vfb*covf[2]; // one matrix-vector product per step
+    float g = vfr*covf[1] + vfg*covf[3] + vfb*covf[4];
+    float b = vfr*covf[2] + vfg*covf[4] + vfb*covf[5];
+
+    vfr = r;
+    vfg = g;
+    vfb = b;
+  }
+
+  magn = STBD_FABS(vfr); // magn becomes the largest absolute component of the axis
+  if (STBD_FABS(vfg) > magn) magn = STBD_FABS(vfg);
+  if (STBD_FABS(vfb) > magn) magn = STBD_FABS(vfb);
+
+   if(magn < 4.0f) { // too small, default to luminance
+      v_r = 299; // JPEG YCbCr luma coefs, scaled by 1000.
+      v_g = 587;
+      v_b = 114;
+   } else {
+      magn = 512.0 / magn; // rescale so the largest component has magnitude 512 before truncating to int
+      v_r = (int) (vfr * magn);
+      v_g = (int) (vfg * magn);
+      v_b = (int) (vfb * magn);
+   }
+
+   minp = maxp = block;
+   mind = maxd = block[0]*v_r + block[1]*v_g + block[2]*v_b;
+   // Pick colors at extreme points: the pixels with the smallest and largest projection onto (v_r,v_g,v_b)
+   for(i=1;i<16;i++)
+   {
+      int dot = block[i*4+0]*v_r + block[i*4+1]*v_g + block[i*4+2]*v_b;
+
+      if (dot < mind) {
+         mind = dot;
+         minp = block+i*4;
+      }
+
+      if (dot > maxd) {
+         maxd = dot;
+         maxp = block+i*4;
+      }
+   }
+
+   *pmax16 = stb__As16Bit(maxp[0],maxp[1],maxp[2]);
+   *pmin16 = stb__As16Bit(minp[0],minp[1],minp[2]);
+}
+
+static const float stb__midpoints5[32] = { // round-up thresholds for stb__Quantize5: level q is bumped to q+1 when the clamped input exceeds entry q; the final 1.0f can never be exceeded
+   0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
+   0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f
+};
+
+static const float stb__midpoints6[64] = { // round-up thresholds for stb__Quantize6: level q is bumped to q+1 when the clamped input exceeds entry q; the final 1.0f can never be exceeded
+   0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f,
+   0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f,
+   0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f,
+   0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, 1.0f
+};
+
+static unsigned short stb__Quantize5(float x)
+{
+   unsigned short level;
+   if (x < 0) x = 0; else if (x > 1) x = 1;   // clamp to [0,1]
+   level = (unsigned short)(x * 31);          // truncate to a 5-bit level first
+   if (x > stb__midpoints5[level]) ++level;   // then round up when x is past that level's threshold
+   return level;
+}
+
+static unsigned short stb__Quantize6(float x)
+{
+   unsigned short level;
+   if (x < 0) x = 0; else if (x > 1) x = 1;   // clamp to [0,1]
+   level = (unsigned short)(x * 63);          // truncate to a 6-bit level first
+   if (x > stb__midpoints6[level]) ++level;   // then round up when x is past that level's threshold
+   return level;
+}
+
+// The refinement function. (Clever code, part 2)
+// Tries to optimize colors to suit block contents better, given the current per-pixel indices in mask.
+// (By solving a least squares system via normal equations+Cramer's rule.) Updates *pmax16 / *pmin16; returns nonzero when either endpoint changed.
+static int stb__RefineBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16, unsigned int mask)
+{
+   static const int w1Tab[4] = { 3,0,2,1 }; // weight w1 applied to each pixel for its 2-bit index; the other weight is 3 - w1
+   static const int prods[4] = { 0x090000,0x000900,0x040102,0x010402 }; // per index: w1*w1 in bits 16+, w2*w2 in bits 8-15, w1*w2 in bits 0-7
+   // ^some magic to save a lot of multiplies in the accumulating loop...
+   // (precomputed products of weights for least squares system, accumulated inside one 32-bit register)
+
+   float f;
+   unsigned short oldMin, oldMax, min16, max16;
+   int i, akku = 0, xx,xy,yy;
+   int At1_r,At1_g,At1_b;
+   int At2_r,At2_g,At2_b;
+   unsigned int cm = mask;
+
+   oldMin = *pmin16; // remembered only to report whether anything changed
+   oldMax = *pmax16;
+
+   if((mask ^ (mask<<2)) < 4) // all pixels have the same index?
+   {
+      // yes, linear system would be singular; solve using optimal
+      // single-color match on average color
+      int r = 8, g = 8, b = 8; // start at 8 so the >> 4 below rounds the 16-pixel average
+      for (i=0;i<16;++i) {
+         r += block[i*4+0];
+         g += block[i*4+1];
+         b += block[i*4+2];
+      }
+
+      r >>= 4; g >>= 4; b >>= 4;
+
+      max16 = (stb__OMatch5[r][0]<<11) | (stb__OMatch6[g][0]<<5) | stb__OMatch5[b][0];
+      min16 = (stb__OMatch5[r][1]<<11) | (stb__OMatch6[g][1]<<5) | stb__OMatch5[b][1];
+   } else {
+      At1_r = At1_g = At1_b = 0;
+      At2_r = At2_g = At2_b = 0;
+      for (i=0;i<16;++i,cm>>=2) { // cm is shifted so its low two bits are always pixel i's index
+         int step = cm&3;
+         int w1 = w1Tab[step];
+         int r = block[i*4+0];
+         int g = block[i*4+1];
+         int b = block[i*4+2];
+
+         akku    += prods[step];
+         At1_r   += w1*r;
+         At1_g   += w1*g;
+         At1_b   += w1*b;
+         At2_r   += r;
+         At2_g   += g;
+         At2_b   += b;
+      }
+
+      At2_r = 3*At2_r - At1_r; // At2 held the plain channel sums; this turns them into sums weighted by (3 - w1)
+      At2_g = 3*At2_g - At1_g;
+      At2_b = 3*At2_b - At1_b;
+
+      // extract the three packed sums from akku; no solvability test is made here (the equal-index case was handled above)
+      xx = akku >> 16;
+      yy = (akku >> 8) & 0xff;
+      xy = (akku >> 0) & 0xff;
+
+      f = 3.0f / 255.0f / (xx*yy - xy*xy); // common scale factor, divided by the determinant xx*yy - xy*xy
+
+      max16 =  stb__Quantize5((At1_r*yy - At2_r * xy) * f) << 11;
+      max16 |= stb__Quantize6((At1_g*yy - At2_g * xy) * f) << 5;
+      max16 |= stb__Quantize5((At1_b*yy - At2_b * xy) * f) << 0;
+
+      min16 =  stb__Quantize5((At2_r*xx - At1_r * xy) * f) << 11;
+      min16 |= stb__Quantize6((At2_g*xx - At1_g * xy) * f) << 5;
+      min16 |= stb__Quantize5((At2_b*xx - At1_b * xy) * f) << 0;
+   }
+
+   *pmin16 = min16;
+   *pmax16 = max16;
+   return oldMin != min16 || oldMax != max16;
+}
+
+// Color block compression: encodes the 16 RGBA pixels at block into 8 bytes at dest (two 16-bit endpoints, low byte first, then 32 bits of 2-bit indices)
+static void stb__CompressColorBlock(unsigned char *dest, unsigned char *block, int mode)
+{
+   unsigned int mask;
+   int i;
+   int refinecount;
+   unsigned short max16, min16;
+   unsigned char color[4*4];
+
+   refinecount = (mode & STB_DXT_HIGHQUAL) ? 2 : 1;
+
+   // check if block is constant; all four bytes of each pixel are compared one by one, so block needs no unsigned int alignment
+   for (i=1;i<16;i++)
+      if (block[i*4+0] != block[0] || block[i*4+1] != block[1] || block[i*4+2] != block[2] || block[i*4+3] != block[3])
+         break;
+
+   if(i == 16) { // constant color
+      int r = block[0], g = block[1], b = block[2];
+      mask  = 0xaaaaaaaa; // every pixel gets index 2
+      max16 = (stb__OMatch5[r][0]<<11) | (stb__OMatch6[g][0]<<5) | stb__OMatch5[b][0];
+      min16 = (stb__OMatch5[r][1]<<11) | (stb__OMatch6[g][1]<<5) | stb__OMatch5[b][1];
+   } else {
+      // first step: PCA+map along principal axis
+      stb__OptimizeColorsBlock(block,&max16,&min16);
+      if (max16 != min16) {
+         stb__EvalColors(color,max16,min16);
+         mask = stb__MatchColorsBlock(block,color);
+      } else
+         mask = 0;
+
+      // second step: refine (multiple times if requested), stopping early once the indices stop changing
+      for (i=0;i<refinecount;i++) {
+         unsigned int lastmask = mask;
+
+         if (stb__RefineBlock(block,&max16,&min16,mask)) {
+            if (max16 != min16) {
+               stb__EvalColors(color,max16,min16);
+               mask = stb__MatchColorsBlock(block,color);
+            } else {
+               mask = 0;
+               break;
+            }
+         }
+
+         if(mask == lastmask)
+            break;
+      }
+   }
+
+   // write the color block, larger endpoint value first
+   if(max16 < min16)
+   {
+      unsigned short t = min16;
+      min16 = max16;
+      max16 = t;
+      mask ^= 0x55555555; // endpoints were swapped, so flip the low bit of every index (0<->1, 2<->3)
+   }
+
+   dest[0] = (unsigned char) (max16);
+   dest[1] = (unsigned char) (max16 >> 8);
+   dest[2] = (unsigned char) (min16);
+   dest[3] = (unsigned char) (min16 >> 8);
+   dest[4] = (unsigned char) (mask);
+   dest[5] = (unsigned char) (mask >> 8);
+   dest[6] = (unsigned char) (mask >> 16);
+   dest[7] = (unsigned char) (mask >> 24);
+}
+
+// Alpha block compression (this is easy for a change)
+// Reads 16 samples at src[i*stride]; writes 8 bytes to dest: largest sample, smallest sample, then sixteen 3-bit indices.
+static void stb__CompressAlphaBlock(unsigned char *dest,unsigned char *src, int stride)
+{
+   int px, lo, hi, range, range2, range4, offset;
+   int acc = 0, pending = 0;        // bit accumulator and the number of bits currently held in it
+   unsigned char *out = dest + 2;   // index bytes follow the two endpoint bytes
+
+   // scan for the smallest and largest sample
+   lo = hi = src[0];
+   for (px = 1; px < 16; ++px) {
+      int v = src[px*stride];
+      if (v < lo)      lo = v;
+      else if (v > hi) hi = v;
+   }
+
+   // the endpoints go out first, larger one in byte 0
+   dest[0] = (unsigned char)hi;
+   dest[1] = (unsigned char)lo;
+
+   // determine bias and emit indices
+   // given the choice of hi/lo, these indices are optimal:
+   // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
+   range  = hi - lo;
+   range2 = range * 2;
+   range4 = range * 4;
+   offset = (range < 8) ? (range - 1) : (range/2 + 2);
+   offset -= lo * 7;
+
+   for (px = 0; px < 16; ++px) {
+      int scaled = src[px*stride]*7 + offset;
+      int idx = 0;
+
+      // build the "linear scale" step, 0 (val=min) .. 7 (val=max), one binary digit at a time
+      if (scaled >= range4) { idx  = 4; scaled -= range4; }
+      if (scaled >= range2) { idx += 2; scaled -= range2; }
+      if (scaled >= range)    idx += 1;
+
+      // turn linear scale into DXT index (0/1 are extremal pts)
+      idx = -idx & 7;
+      if (idx < 2) idx ^= 1;
+
+      // append the 3 index bits; flush one byte whenever 8 or more bits are pending
+      acc |= idx << pending;
+      pending += 3;
+      if (pending >= 8) {
+         *out++ = (unsigned char)acc;
+         acc >>= 8;
+         pending -= 8;
+      }
+   }
+}
+
+void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src, int alpha, int mode)
+{
+   // Compresses one 4x4 RGBA block: 8 bytes of color data, preceded by 8 bytes of alpha data when alpha is non-zero.
+   unsigned char data[16][4];
+   if (alpha) {
+      int i;
+      stb__CompressAlphaBlock(dest,(unsigned char*) src+3, 4); // alpha samples: every 4th byte starting at src[3]
+      dest += 8;
+      // make a new copy of the data in which alpha is opaque, because the color stage's constancy test looks at all
+      // four bytes of each pixel. Copied with a plain loop: memcpy would need <string.h>, which this file never includes.
+      for (i=0; i < 64; ++i)
+         data[i >> 2][i & 3] = ((i & 3) == 3) ? 255 : src[i];
+      src = &data[0][0];
+   }
+
+   stb__CompressColorBlock(dest,(unsigned char*) src,mode);
+}
+
+void stb_compress_bc4_block(unsigned char *dest, const unsigned char *src)
+{
+   stb__CompressAlphaBlock(dest,(unsigned char*) src, 1); // stride 1: the 16 samples are consecutive bytes of src
+}
+
+void stb_compress_bc5_block(unsigned char *dest, const unsigned char *src)
+{
+   stb__CompressAlphaBlock(dest,(unsigned char*) src,2); // first channel: even bytes of src, written at dest
+   stb__CompressAlphaBlock(dest + 8,(unsigned char*) src+1,2); // second channel: odd bytes of src, written at dest + 8
+}
+#endif // STB_DXT_IMPLEMENTATION
+
+// Compile with STB_DXT_IMPLEMENTATION and STB_DXT_GENERATE_TABLES
+// defined to generate the tables above.
+#ifdef STB_DXT_GENERATE_TABLES
+#include <stdio.h>
+
+int main(void)
+{
+   int i, j;
+   const char *omatch_names[] = { "stb__OMatch5", "stb__OMatch6" };
+   int dequant_mults[2] = { 33*4, 65 }; // .4 fixed-point dequant multipliers
+
+   // optimal endpoint tables: for every 8-bit target j, try all (mn, mx) endpoint pairs and print the cheapest one
+   for (i = 0; i < 2; ++i) {
+      int dequant = dequant_mults[i];
+      int size = i ? 64 : 32; // number of endpoint levels: 32 for the first table, 64 for the second
+      printf("static const unsigned char %s[256][2] = {\n", omatch_names[i]);
+      for (j = 0; j < 256; ++j) {
+         int mn, mx;
+         int best_mn = 0, best_mx = 0;
+         int best_err = 256 * 100;
+         for (mn=0;mn<size;mn++) {
+            for (mx=0;mx<size;mx++) {
+               int mine = (mn * dequant) >> 4;
+               int maxe = (mx * dequant) >> 4;
+               int err = abs(stb__Lerp13(maxe, mine) - j) * 100;
+
+               // DX10 spec says that interpolation must be within 3% of "correct" result,
+               // add this as error term. Normally we'd expect a random distribution of
+               // +-1.5% error, but nowhere in the spec does it say that the error has to be
+               // unbiased - better safe than sorry.
+               err += abs(maxe - mine) * 3;
+
+               if(err < best_err) {
+                  best_mn = mn;
+                  best_mx = mx;
+                  best_err = err;
+               }
+            }
+         }
+         if ((j % 8) == 0) printf("  "); // 2 spaces, third is done below
+         printf(" { %2d, %2d },", best_mx, best_mn);
+         if ((j % 8) == 7) printf("\n");
+      }
+      printf("};\n");
+   }
+
+   return 0;
+}
+#endif
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_easy_font.h b/lib/stb/stb_easy_font.h
new file mode 100644
index 0000000..b663258
--- /dev/null
+++ b/lib/stb/stb_easy_font.h
@@ -0,0 +1,305 @@
+// stb_easy_font.h - v1.1 - bitmap font for 3D rendering - public domain
+// Sean Barrett, Feb 2015
+//
+//    Easy-to-deploy,
+//    reasonably compact,
+//    extremely inefficient performance-wise,
+//    crappy-looking,
+//    ASCII-only,
+//    bitmap font for use in 3D APIs.
+//
+// Intended for when you just want to get some text displaying
+// in a 3D app as quickly as possible.
+//
+// Doesn't use any textures, instead builds characters out of quads.
+//
+// DOCUMENTATION:
+//
+//   int stb_easy_font_width(char *text)
+//   int stb_easy_font_height(char *text)
+//
+//      Takes a string and returns the horizontal size and the
+//      vertical size (which can vary if 'text' has newlines).
+//
+//   int stb_easy_font_print(float x, float y,
+//                           char *text, unsigned char color[4],
+//                           void *vertex_buffer, int vbuf_size)
+//
+//      Takes a string (which can contain '\n') and fills out a
+//      vertex buffer with renderable data to draw the string.
+//      Output data assumes increasing x is rightwards, increasing y
+//      is downwards.
+//
+//      The vertex data is divided into quads, i.e. there are four
+//      vertices in the vertex buffer for each quad.
+//
+//      The vertices are stored in an interleaved format:
+//
+//         x:float
+//         y:float
+//         z:float
+//         color:uint8[4]
+//
+//      You can ignore z and color if you get them from elsewhere
+//      This format was chosen in the hopes it would make it
+//      easier for you to reuse existing vertex-buffer-drawing code.
+//
+//      If you pass in NULL for color, it becomes 255,255,255,255.
+//
+//      Returns the number of quads.
+//
+//      If the buffer isn't large enough, it will truncate.
+//      Expect it to use an average of ~270 bytes per character.
+//
+//      If your API doesn't draw quads, build a reusable index
+//      list that allows you to render quads as indexed triangles.
+//
+//   void stb_easy_font_spacing(float spacing)
+//
+//      Use positive values to expand the space between characters,
+//      and small negative values (no smaller than -1.5) to contract
+//      the space between characters.
+//
+//      E.g. spacing = 1 adds one "pixel" of spacing between the
+//      characters. spacing = -1 is reasonable but feels a bit too
+//      compact to me; -0.5 is a reasonable compromise as long as
+//      you're scaling the font up.
+//
+// LICENSE
+//
+//   See end of file for license information.
+//
+// VERSION HISTORY
+//
+//   (2020-02-02)  1.1   make everything static so can compile it in more than one src file
+//   (2017-01-15)  1.0   space character takes same space as numbers; fix bad spacing of 'f'
+//   (2016-01-22)  0.7   width() supports multiline text; add height()
+//   (2015-09-13)  0.6   #include <math.h>; updated license
+//   (2015-02-01)  0.5   First release
+//
+// CONTRIBUTORS
+//
+//   github:vassvik    --  bug report
+//   github:podsvirov  --  fix multiple definition errors
+
+#if 0
+// SAMPLE CODE:
+//
+//    Here's sample code for old OpenGL; it's a lot more complicated
+//    to make work on modern APIs, and that's your problem.
+//
+void print_string(float x, float y, char *text, float r, float g, float b)
+{
+  static char buffer[99999]; // ~500 chars
+  int num_quads;
+
+  num_quads = stb_easy_font_print(x, y, text, NULL, buffer, sizeof(buffer));
+
+  glColor3f(r,g,b);
+  glEnableClientState(GL_VERTEX_ARRAY);
+  glVertexPointer(2, GL_FLOAT, 16, buffer);
+  glDrawArrays(GL_QUADS, 0, num_quads*4);
+  glDisableClientState(GL_VERTEX_ARRAY);
+}
+#endif
+
+#ifndef INCLUDE_STB_EASY_FONT_H
+#define INCLUDE_STB_EASY_FONT_H
+
+#include <stdlib.h>
+#include <math.h>
+
+static struct stb_easy_font_info_struct {
+    unsigned char advance;
+    unsigned char h_seg;
+    unsigned char v_seg;
+} stb_easy_font_charinfo[96] = {
+    {  6,  0,  0 },  {  3,  0,  0 },  {  5,  1,  1 },  {  7,  1,  4 },
+    {  7,  3,  7 },  {  7,  6, 12 },  {  7,  8, 19 },  {  4, 16, 21 },
+    {  4, 17, 22 },  {  4, 19, 23 },  { 23, 21, 24 },  { 23, 22, 31 },
+    { 20, 23, 34 },  { 22, 23, 36 },  { 19, 24, 36 },  { 21, 25, 36 },
+    {  6, 25, 39 },  {  6, 27, 43 },  {  6, 28, 45 },  {  6, 30, 49 },
+    {  6, 33, 53 },  {  6, 34, 57 },  {  6, 40, 58 },  {  6, 46, 59 },
+    {  6, 47, 62 },  {  6, 55, 64 },  { 19, 57, 68 },  { 20, 59, 68 },
+    { 21, 61, 69 },  { 22, 66, 69 },  { 21, 68, 69 },  {  7, 73, 69 },
+    {  9, 75, 74 },  {  6, 78, 81 },  {  6, 80, 85 },  {  6, 83, 90 },
+    {  6, 85, 91 },  {  6, 87, 95 },  {  6, 90, 96 },  {  7, 92, 97 },
+    {  6, 96,102 },  {  5, 97,106 },  {  6, 99,107 },  {  6,100,110 },
+    {  6,100,115 },  {  7,101,116 },  {  6,101,121 },  {  6,101,125 },
+    {  6,102,129 },  {  7,103,133 },  {  6,104,140 },  {  6,105,145 },
+    {  7,107,149 },  {  6,108,151 },  {  7,109,155 },  {  7,109,160 },
+    {  7,109,165 },  {  7,118,167 },  {  6,118,172 },  {  4,120,176 },
+    {  6,122,177 },  {  4,122,181 },  { 23,124,182 },  { 22,129,182 },
+    {  4,130,182 },  { 22,131,183 },  {  6,133,187 },  { 22,135,191 },
+    {  6,137,192 },  { 22,139,196 },  {  6,144,197 },  { 22,147,198 },
+    {  6,150,202 },  { 19,151,206 },  { 21,152,207 },  {  6,155,209 },
+    {  3,160,210 },  { 23,160,211 },  { 22,164,216 },  { 22,165,220 },
+    { 22,167,224 },  { 22,169,228 },  { 21,171,232 },  { 21,173,233 },
+    {  5,178,233 },  { 22,179,234 },  { 23,180,238 },  { 23,180,243 },
+    { 23,180,248 },  { 22,189,248 },  { 22,191,252 },  {  5,196,252 },
+    {  3,203,252 },  {  5,203,253 },  { 22,210,253 },  {  0,214,253 },
+};
+
+static unsigned char stb_easy_font_hseg[214] = {
+   97,37,69,84,28,51,2,18,10,49,98,41,65,25,81,105,33,9,97,1,97,37,37,36,
+    81,10,98,107,3,100,3,99,58,51,4,99,58,8,73,81,10,50,98,8,73,81,4,10,50,
+    98,8,25,33,65,81,10,50,17,65,97,25,33,25,49,9,65,20,68,1,65,25,49,41,
+    11,105,13,101,76,10,50,10,50,98,11,99,10,98,11,50,99,11,50,11,99,8,57,
+    58,3,99,99,107,10,10,11,10,99,11,5,100,41,65,57,41,65,9,17,81,97,3,107,
+    9,97,1,97,33,25,9,25,41,100,41,26,82,42,98,27,83,42,98,26,51,82,8,41,
+    35,8,10,26,82,114,42,1,114,8,9,73,57,81,41,97,18,8,8,25,26,26,82,26,82,
+    26,82,41,25,33,82,26,49,73,35,90,17,81,41,65,57,41,65,25,81,90,114,20,
+    84,73,57,41,49,25,33,65,81,9,97,1,97,25,33,65,81,57,33,25,41,25,
+};
+
+static unsigned char stb_easy_font_vseg[253] = {
+   4,2,8,10,15,8,15,33,8,15,8,73,82,73,57,41,82,10,82,18,66,10,21,29,1,65,
+    27,8,27,9,65,8,10,50,97,74,66,42,10,21,57,41,29,25,14,81,73,57,26,8,8,
+    26,66,3,8,8,15,19,21,90,58,26,18,66,18,105,89,28,74,17,8,73,57,26,21,
+    8,42,41,42,8,28,22,8,8,30,7,8,8,26,66,21,7,8,8,29,7,7,21,8,8,8,59,7,8,
+    8,15,29,8,8,14,7,57,43,10,82,7,7,25,42,25,15,7,25,41,15,21,105,105,29,
+    7,57,57,26,21,105,73,97,89,28,97,7,57,58,26,82,18,57,57,74,8,30,6,8,8,
+    14,3,58,90,58,11,7,74,43,74,15,2,82,2,42,75,42,10,67,57,41,10,7,2,42,
+    74,106,15,2,35,8,8,29,7,8,8,59,35,51,8,8,15,35,30,35,8,8,30,7,8,8,60,
+    36,8,45,7,7,36,8,43,8,44,21,8,8,44,35,8,8,43,23,8,8,43,35,8,8,31,21,15,
+    20,8,8,28,18,58,89,58,26,21,89,73,89,29,20,8,8,30,7,
+};
+
+typedef struct
+{
+   unsigned char c[4];
+} stb_easy_font_color;
+
+static int stb_easy_font_draw_segs(float x, float y, unsigned char *segs, int num_segs, int vertical, stb_easy_font_color c, char *vbuf, int vbuf_size, int offset)
+{   // Appends one quad per non-empty segment to vbuf, starting at byte 'offset'; returns the updated offset.
+    int i,j;
+    for (i=0; i < num_segs; ++i) {
+        int len = segs[i] & 7;                   // low 3 bits: segment length; 0 draws nothing
+        x += (float) ((segs[i] >> 3) & 1);       // bit 3: step x by one before this segment (applied even when len is 0)
+        if (len && offset+64 <= vbuf_size) {     // one quad = 4 vertices * 16 bytes; dropped if it does not fit
+            float y0 = y + (float) (segs[i]>>4); // bits 4 and up: y offset of this segment
+            for (j=0; j < 4; ++j) {              // corner order: (0,0) (w,0) (w,h) (0,h); w,h are 1,len if vertical else len,1
+                * (float *) (vbuf+offset+0) = x  + (j==1 || j==2 ? (vertical ? 1 : len) : 0);
+                * (float *) (vbuf+offset+4) = y0 + (    j >= 2   ? (vertical ? len : 1) : 0);
+                * (float *) (vbuf+offset+8) = 0.f; // z is always written as 0
+                * (stb_easy_font_color *) (vbuf+offset+12) = c;
+                offset += 16;
+            }
+        }
+    }
+    return offset;
+}
+
+static float stb_easy_font_spacing_val = 0;
+static void stb_easy_font_spacing(float spacing)
+{
+   stb_easy_font_spacing_val = spacing;
+}
+
+// Fills vertex_buffer with quads (4 vertices of 16 bytes each) for 'text' and returns the number of quads written.
+// Bytes other than '\n' that fall outside printable ASCII (32..126) are skipped: they have no entry in the glyph tables.
+static int stb_easy_font_print(float x, float y, char *text, unsigned char color[4], void *vertex_buffer, int vbuf_size)
+{
+    char *vbuf = (char *) vertex_buffer;
+    float start_x = x;
+    int offset = 0;
+    stb_easy_font_color c = { 255,255,255,255 }; // default color; structure copying avoids depending on memcpy()
+    if (color) { c.c[0] = color[0]; c.c[1] = color[1]; c.c[2] = color[2]; c.c[3] = color[3]; }
+    while (*text && offset < vbuf_size) {
+        if (*text == '\n') {
+            y += 12; // next line starts 12 units lower, back at the starting x
+            x = start_x;
+        } else if (*text >= 32 && *text < 127) { // glyph+1 below must stay inside the 96-entry table (also rejects negative chars)
+            int glyph = *text - 32;
+            unsigned char advance = stb_easy_font_charinfo[glyph].advance;
+            float y_ch = advance & 16 ? y+1 : y; // bit 4 of 'advance' draws the glyph one unit lower
+            int h_seg = stb_easy_font_charinfo[glyph  ].h_seg;
+            int v_seg = stb_easy_font_charinfo[glyph  ].v_seg;
+            int num_h = stb_easy_font_charinfo[glyph+1].h_seg - h_seg; // a glyph's segments end where the next entry's begin
+            int num_v = stb_easy_font_charinfo[glyph+1].v_seg - v_seg;
+            offset = stb_easy_font_draw_segs(x, y_ch, &stb_easy_font_hseg[h_seg], num_h, 0, c, vbuf, vbuf_size, offset);
+            offset = stb_easy_font_draw_segs(x, y_ch, &stb_easy_font_vseg[v_seg], num_v, 1, c, vbuf, vbuf_size, offset);
+            x += advance & 15; // low 4 bits: horizontal advance
+            x += stb_easy_font_spacing_val;
+        }
+        ++text;
+    }
+    return (unsigned) offset/64; // 64 bytes per quad
+}
+
+// Returns the width, rounded up, of the widest '\n'-separated line of 'text' in font units.
+// Bytes outside printable ASCII (32..126) add nothing; index 95 of the table is a terminator entry, not a glyph.
+static int stb_easy_font_width(char *text)
+{
+    float len = 0, max_len = 0;
+    for (; *text; ++text) {
+        if (*text == '\n') {
+            if (len > max_len) max_len = len;
+            len = 0;
+        } else if (*text >= 32 && *text < 127) { // any other value would index outside stb_easy_font_charinfo
+            len += stb_easy_font_charinfo[*text-32].advance & 15;
+            len += stb_easy_font_spacing_val;
+        }
+    }
+    if (len > max_len) max_len = len; // the last line has no trailing '\n' to trigger the comparison
+    return (int) ceil(max_len);
+}
+
+// Height of 'text' in font units: 12 per '\n', plus 12 more when the last line holds at least one character.
+static int stb_easy_font_height(char *text)
+{
+    float total = 0;
+    int pending = 0; // non-zero once the current line has seen a non-newline character
+    for (; *text; ++text) {
+        if (*text != '\n') {
+            pending = 1;
+            continue;
+        }
+        total += 12;
+        pending = 0;
+    }
+    return (int) ceil(total + (pending ? 12 : 0));
+}
+#endif
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_herringbone_wang_tile.h b/lib/stb/stb_herringbone_wang_tile.h
new file mode 100644
index 0000000..5517941
--- /dev/null
+++ b/lib/stb/stb_herringbone_wang_tile.h
@@ -0,0 +1,1221 @@
+/* stbhw - v0.7 -  http://nothings.org/gamedev/herringbone
+   Herringbone Wang Tile Generator - Sean Barrett 2014 - public domain
+
+== LICENSE ==============================
+
+This software is dual-licensed to the public domain and under the following
+license: you are granted a perpetual, irrevocable license to copy, modify,
+publish, and distribute this file as you see fit.
+
+== WHAT IT IS ===========================
+
+ This library is an SDK for Herringbone Wang Tile generation:
+
+      http://nothings.org/gamedev/herringbone
+
+ The core design is that you use this library offline to generate a
+ "template" of the tiles you'll create. You then edit those tiles, then
+ load the created tile image file back into this library and use it at
+ runtime to generate "maps".
+
+ You cannot load arbitrary tile image files with this library; it is
+ only designed to load image files made from the template it created.
+ It stores a binary description of the tile sizes & constraints in a
+ few pixels, and uses those to recover the rules, rather than trying
+ to parse the tiles themselves.
+
+ You *can* use this library to generate from arbitrary tile sets, but
+ only by loading the tile set and specifying the constraints explicitly
+ yourself.
+
+== COMPILING ============================
+
+ 1. #define STB_HERRINGBONE_WANG_TILE_IMPLEMENTATION before including this
+    header file in *one* source file to create the implementation
+    in that source file.
+
+ 2. optionally #define STB_HBWANG_RAND() to be a random number
+    generator. if you don't define it, it will use rand(),
+    and you need to seed srand() yourself.
+
+ 3. optionally #define STB_HBWANG_ASSERT(x), otherwise
+    it will use assert()
+
+ 4. optionally #define STB_HBWANG_STATIC to force all symbols to be
+    static instead of public, so they are only accessible
+    in the source file that creates the implementation
+
+ 5. optionally #define STB_HBWANG_NO_REPITITION_REDUCTION to disable
+    the code that tries to reduce having the same tile appear
+    adjacent to itself in wang-corner-tile mode (e.g. imagine
+    if you were doing something where 90% of things should be
+    the same grass tile, you need to disable this system)
+
+ 6. optionally define STB_HBWANG_MAX_X and STB_HBWANG_MAX_Y
+    to be the max dimensions of the generated map in multiples
+    of the wang tile's short side's length (e.g. if you
+    have 20x10 wang tiles, so short_side_len=10, and you
+    have MAX_X is 17, then the largest map you can generate
+    is 170 pixels wide). The defaults are 100x100. This
+    is used to define static arrays which affect memory
+    usage.
+
+== USING ================================
+
+  To use the map generator, you need a tileset. You can download
+  some sample tilesets from http://nothings.org/gamedev/herringbone
+
+  Then see the "sample application" below.
+
+  You can also use this file to generate templates for
+  tilesets which you then hand-edit to create the data.
+
+
+== MEMORY MANAGEMENT ====================
+
+  The tileset loader allocates memory with malloc(). The map
+  generator does no memory allocation, so e.g. you can load
+  tilesets at startup and never free them and never do any
+  further allocation.
+
+
+== SAMPLE APPLICATION ===================
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb_image.h"        // http://nothings.org/stb_image.c
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include "stb_image_write.h"  // http://nothings.org/stb/stb_image_write.h
+
+#define STB_HERRINGBONE_WANG_TILE_IMPLEMENTATION
+#include "stb_herringbone_wang_tile.h"
+
+int main(int argc, char **argv)
+{
+   unsigned char *data;
+   int xs,ys, w,h;
+   stbhw_tileset ts;
+
+   if (argc != 4) {
+      fprintf(stderr, "Usage: mapgen {tile-file} {xsize} {ysize}\n"
+                      "generates file named 'test_map.png'\n");
+      exit(1);
+   }
+   data = stbi_load(argv[1], &w, &h, NULL, 3);
+   xs = atoi(argv[2]);
+   ys = atoi(argv[3]);
+   if (data == NULL) {
+      fprintf(stderr, "Error opening or parsing '%s' as an image file\n", argv[1]);
+      exit(1);
+   }
+   if (xs < 1 || xs > 1000) {
+      fprintf(stderr, "xsize invalid or out of range\n");
+      exit(1);
+   }
+   if (ys < 1 || ys > 1000) {
+      fprintf(stderr, "ysize invalid or out of range\n");
+      exit(1);
+   }
+
+   stbhw_build_tileset_from_image(&ts, data, w*3, w, h);
+   free(data);
+
+   // allocate a buffer to create the final image to
+   data = malloc(3 * xs * ys);
+
+   srand(time(NULL));
+   stbhw_generate_image(&ts, NULL, data, xs*3, xs, ys);
+
+   stbi_write_png("test_map.png", xs, ys, 3, data, xs*3);
+
+   stbhw_free_tileset(&ts);
+   free(data);
+
+   return 0;
+}
+
+== VERSION HISTORY ===================
+
+   0.7   2019-03-04   - fix warnings
+	0.6   2014-08-17   - fix broken map-maker
+	0.5   2014-07-07   - initial release
+
+*/
+
+//////////////////////////////////////////////////////////////////////////////
+//                                                                          //
+//                         HEADER FILE SECTION                              //
+//                                                                          //
+
+#ifndef INCLUDE_STB_HWANG_H
+#define INCLUDE_STB_HWANG_H
+
+#ifdef STB_HBWANG_STATIC
+#define STBHW_EXTERN static
+#else
+#ifdef __cplusplus
+#define STBHW_EXTERN extern "C"
+#else
+#define STBHW_EXTERN extern
+#endif
+#endif
+
+typedef struct stbhw_tileset stbhw_tileset;
+
+// returns description of last error produced by any function (not thread-safe)
+STBHW_EXTERN const char *stbhw_get_last_error(void);
+
+// build a tileset from an image that conforms to a template created by this
+// library. (you allocate storage for stbhw_tileset and function fills it out;
+// memory for individual tiles is malloc()ed).
+// returns non-zero on success, 0 on error
+STBHW_EXTERN int stbhw_build_tileset_from_image(stbhw_tileset *ts,
+                     unsigned char *pixels, int stride_in_bytes, int w, int h);
+
+// free a tileset built by stbhw_build_tileset_from_image
+STBHW_EXTERN void stbhw_free_tileset(stbhw_tileset *ts);
+
+// generate a map that is w * h pixels (3-bytes each)
+// returns non-zero on success, 0 on error
+// not thread-safe (uses a global data structure to avoid memory management)
+// weighting should be NULL, as non-NULL weighting is currently untested
+STBHW_EXTERN int stbhw_generate_image(stbhw_tileset *ts, int **weighting,
+                     unsigned char *pixels, int stride_in_bytes, int w, int h);
+
+//////////////////////////////////////
+//
+// TILESET DATA STRUCTURE
+//
+// if you use the image-to-tileset system from this file, you
+// don't need to worry about these data structures. but if you
+// want to build/load a tileset yourself, you'll need to fill
+// these out.
+
+typedef struct
+{
+   // the edge or vertex constraints, according to diagram below
+   signed char a,b,c,d,e,f;
+
+   // The herringbone wang tile data; it is a bitmap which is either
+   // w=2*short_sidelen,h=short_sidelen, or w=short_sidelen,h=2*short_sidelen.
+   // it is always RGB, stored row-major, with no padding between rows.
+   // (allocate stbhw_tile structure to be large enough for the pixel data)
+   unsigned char pixels[1];
+} stbhw_tile;
+
+struct stbhw_tileset
+{
+   int is_corner;
+   int num_color[6];  // number of colors for each of 6 edge types or 4 corner types
+   int short_side_len;
+   stbhw_tile **h_tiles;
+   stbhw_tile **v_tiles;
+   int num_h_tiles, max_h_tiles;
+   int num_v_tiles, max_v_tiles;
+};
+
+///////////////  TEMPLATE GENERATOR  //////////////////////////
+
+// when requesting a template, you fill out this data
+typedef struct
+{
+   int is_corner;      // using corner colors or edge colors?
+   int short_side_len; // rectangles is 2n x n, n = short_side_len
+   int num_color[6];   // see below diagram for meaning of the index to this;
+                       // 6 values if edge (!is_corner), 4 values if is_corner
+                       // legal numbers: 1..8 if edge, 1..4 if is_corner
+   int num_vary_x;     // additional number of variations along x axis in the template
+   int num_vary_y;     // additional number of variations along y axis in the template
+   int corner_type_color_template[4][4];
+      // if corner_type_color_template[s][t] is non-zero, then any
+      // corner of type s generated as color t will get a little
+      // corner sample markup in the template image data
+
+} stbhw_config;
+
+// computes the size needed for the template image
+STBHW_EXTERN void stbhw_get_template_size(stbhw_config *c, int *w, int *h);
+
+// generates a template image, assuming data is 3*w*h bytes long, RGB format
+STBHW_EXTERN int stbhw_make_template(stbhw_config *c, unsigned char *data, int w, int h, int stride_in_bytes);
+
+#endif//INCLUDE_STB_HWANG_H
+
+
+// TILE CONSTRAINT TYPES
+//
+// there are 4 "types" of corners and 6 types of edges.
+// you can configure the tileset to have different numbers
+// of colors for each type of corner or edge.
+//
+// corner types:
+//
+//                     0---*---1---*---2---*---3
+//                     |       |               |
+//                     *       *               *
+//                     |       |               |
+//     1---*---2---*---3       0---*---1---*---2
+//     |               |       |
+//     *               *       *
+//     |               |       |
+//     0---*---1---*---2---*---3
+//
+//
+//  edge types:
+//
+//     *---2---*---3---*      *---0---*
+//     |               |      |       |
+//     1               4      5       1
+//     |               |      |       |
+//     *---0---*---2---*      *       *
+//                            |       |
+//                            4       5
+//                            |       |
+//                            *---3---*
+//
+// TILE CONSTRAINTS
+//
+// each corner/edge has a color; this shows the name
+// of the variable containing the color
+//
+// corner constraints:
+//
+//                        a---*---d
+//                        |       |
+//                        *       *
+//                        |       |
+//     a---*---b---*---c  b       e
+//     |               |  |       |
+//     *               *  *       *
+//     |               |  |       |
+//     d---*---e---*---f  c---*---f
+//
+//
+//  edge constraints:
+//
+//     *---a---*---b---*      *---a---*
+//     |               |      |       |
+//     c               d      b       c
+//     |               |      |       |
+//     *---e---*---f---*      *       *
+//                            |       |
+//                            d       e
+//                            |       |
+//                            *---f---*
+//
+
+
+//////////////////////////////////////////////////////////////////////////////
+//                                                                          //
+//                       IMPLEMENTATION SECTION                             //
+//                                                                          //
+
+#ifdef STB_HERRINGBONE_WANG_TILE_IMPLEMENTATION
+
+
+#include <string.h> // memcpy
+#include <stdlib.h> // malloc
+
+#ifndef STB_HBWANG_RAND
+#include <stdlib.h>
+#define STB_HBWANG_RAND()  (rand() >> 4)
+#endif
+
+#ifndef STB_HBWANG_ASSERT
+#include <assert.h>
+#define STB_HBWANG_ASSERT(x)  assert(x)
+#endif
+
+// map size
+#ifndef STB_HBWANG_MAX_X
+#define STB_HBWANG_MAX_X  100
+#endif
+
+#ifndef STB_HBWANG_MAX_Y
+#define STB_HBWANG_MAX_Y  100
+#endif
+
+// global variables for color assignments
+// @MEMORY change these to just store last two/three rows
+//         and keep them on the stack
+static signed char c_color[STB_HBWANG_MAX_Y+6][STB_HBWANG_MAX_X+6];
+static signed char v_color[STB_HBWANG_MAX_Y+6][STB_HBWANG_MAX_X+5];
+static signed char h_color[STB_HBWANG_MAX_Y+5][STB_HBWANG_MAX_X+6];
+
+static const char *stbhw_error;
+STBHW_EXTERN const char *stbhw_get_last_error(void)
+{
+   const char *temp = stbhw_error;
+   stbhw_error = 0;
+   return temp;
+}
+
+
+
+
+/////////////////////////////////////////////////////////////
+//
+//  SHARED TEMPLATE-DESCRIPTION CODE
+//
+//  Used by both template generator and tileset parser; by
+//  using the same code, they are locked in sync and we don't
+//  need to try to do more sophisticated parsing of edge color
+//  markup or something.
+
+struct stbhw__process; // declared up front so the prototype below refers to the file-scope struct, not a prototype-scoped tag
+typedef void stbhw__process_rect(struct stbhw__process *p, int xpos, int ypos,
+                                 int a, int b, int c, int d, int e, int f);
+typedef struct stbhw__process // state handed to stbhw__process_h_row / stbhw__process_v_row and on to their callbacks
+{
+   stbhw_tileset *ts;
+   stbhw_config *c;                     // supplies short_side_len for the per-rectangle x step
+   stbhw__process_rect *process_h_rect; // called once per horizontal rectangle
+   stbhw__process_rect *process_v_rect; // called once per vertical rectangle
+   unsigned char *data;                 // NOTE(review): data/stride/w/h presumably describe the image buffer - confirm in the callbacks
+   int stride,w,h;
+} stbhw__process;
+
+static void stbhw__process_h_row(stbhw__process *p,
+                           int xpos, int ypos,
+                           int a0, int a1,
+                           int b0, int b1,
+                           int c0, int c1,
+                           int d0, int d1,
+                           int e0, int e1,
+                           int f0, int f1,
+                           int variants)
+{  // Calls p->process_h_rect for every (a..f) combination in the inclusive ranges, 'variants' times over.
+   int a,b,c,d,e,f,v;
+
+   for (v=0; v < variants; ++v)
+      for (f=f0; f <= f1; ++f)
+         for (e=e0; e <= e1; ++e)
+            for (d=d0; d <= d1; ++d)
+               for (c=c0; c <= c1; ++c)
+                  for (b=b0; b <= b1; ++b)
+                     for (a=a0; a <= a1; ++a) { // 'a' varies fastest, 'v' slowest
+                        p->process_h_rect(p, xpos, ypos, a,b,c,d,e,f);
+                        xpos += 2*p->c->short_side_len + 3; // step right by the long side plus 3; ypos never changes
+                     }
+}
+
+static void stbhw__process_v_row(stbhw__process *p,
+                           int xpos, int ypos,
+                           int a0, int a1,
+                           int b0, int b1,
+                           int c0, int c1,
+                           int d0, int d1,
+                           int e0, int e1,
+                           int f0, int f1,
+                           int variants)
+{  // Calls p->process_v_rect for every (a..f) combination in the inclusive ranges, 'variants' times over.
+   int a,b,c,d,e,f,v;
+
+   for (v=0; v < variants; ++v)
+      for (f=f0; f <= f1; ++f)
+         for (e=e0; e <= e1; ++e)
+            for (d=d0; d <= d1; ++d)
+               for (c=c0; c <= c1; ++c)
+                  for (b=b0; b <= b1; ++b)
+                     for (a=a0; a <= a1; ++a) { // 'a' varies fastest, 'v' slowest
+                        p->process_v_rect(p, xpos, ypos, a,b,c,d,e,f);
+                        xpos += p->c->short_side_len+3; // step right by the short side plus 3; ypos never changes
+                     }
+}
+
+// Work out the layout of the template image described by config c: its pixel
+// width/height and the number of horizontal and vertical tiles it holds.
+// Every output pointer is optional; pass NULL for values that are not needed.
+static void stbhw__get_template_info(stbhw_config *c, int *w, int *h, int *h_count, int *v_count)
+{
+   // tile-grid dimensions, counted in tiles
+   int horz_w, horz_h;
+   int vert_w, vert_h;
+   // tile-grid dimensions, counted in pixels
+   int horz_x, horz_y;
+   int vert_x, vert_y;
+   int size_x, size_y;
+   int horz_count, vert_count;
+
+   // only the grid dimensions depend on the tile type
+   if (c->is_corner) {
+      horz_w = c->num_color[1] * c->num_color[2] * c->num_color[3] * c->num_vary_x;
+      horz_h = c->num_color[0] * c->num_color[1] * c->num_color[2] * c->num_vary_y;
+      vert_w = c->num_color[0] * c->num_color[3] * c->num_color[2] * c->num_vary_y;
+      vert_h = c->num_color[1] * c->num_color[0] * c->num_color[3] * c->num_vary_x;
+   } else {
+      horz_w = c->num_color[0] * c->num_color[1] * c->num_color[2] * c->num_vary_x;
+      horz_h = c->num_color[3] * c->num_color[4] * c->num_color[2] * c->num_vary_y;
+      vert_w = c->num_color[0] * c->num_color[5] * c->num_color[1] * c->num_vary_y;
+      vert_h = c->num_color[3] * c->num_color[4] * c->num_color[5] * c->num_vary_x;
+   }
+
+   // each slot is the tile itself plus 3 extra pixels along both axes;
+   // horizontal tiles are 2x1 short sides, vertical tiles are 1x2
+   horz_x = horz_w * (2*c->short_side_len + 3);
+   horz_y = horz_h * (  c->short_side_len + 3);
+   vert_x = vert_w * (  c->short_side_len + 3);
+   vert_y = vert_h * (2*c->short_side_len + 3);
+
+   horz_count = horz_w * horz_h;
+   vert_count = vert_w * vert_h;
+
+   // the image is as wide as the wider grid; its height is the sum of both
+   // grid heights plus 2 extra pixel rows for each grid
+   size_x = horz_x > vert_x ? horz_x : vert_x;
+   size_y = 2 + horz_y + 2 + vert_y;
+
+   if (w) *w = size_x;
+   if (h) *h = size_y;
+   if (h_count) *h_count = horz_count;
+   if (v_count) *v_count = vert_count;
+}
+
+STBHW_EXTERN void stbhw_get_template_size(stbhw_config *c, int *w, int *h)   // public: template image size in pixels for config c
+{
+   stbhw__get_template_info(c, w, h, NULL, NULL);   // the tile counts are not requested
+}
+
+// Visit every tile slot of the template laid out for p->c, handing each one to
+// p->process_h_rect or p->process_v_rect. Returns 1, or 0 if the image is too small.
+static int stbhw__process_template(stbhw__process *p)
+{
+   int i,j,k,q, ypos;
+   int size_x, size_y;
+   stbhw_config *c = p->c;
+   stbhw__get_template_info(c, &size_x, &size_y, NULL, NULL);
+   if (p->w < size_x || p->h < size_y) {
+      stbhw_error = "image too small for configuration";
+      return 0;
+   }
+   // rows of horizontal tiles come first, then rows of vertical tiles; 2 pixel rows are skipped before each group
+   if (c->is_corner) {
+      ypos = 2;
+      for (k=0; k < c->num_color[2]; ++k) {
+         for (j=0; j < c->num_color[1]; ++j) {
+            for (i=0; i < c->num_color[0]; ++i) {
+               for (q=0; q < c->num_vary_y; ++q) {
+                  stbhw__process_h_row(p, 0,ypos,
+                     0,c->num_color[1]-1, 0,c->num_color[2]-1, 0,c->num_color[3]-1,
+                     i,i, j,j, k,k,
+                     c->num_vary_x);
+                  ypos += c->short_side_len + 3;
+               }
+            }
+         }
+      }
+      ypos += 2;
+      for (k=0; k < c->num_color[3]; ++k) {
+         for (j=0; j < c->num_color[0]; ++j) {
+            for (i=0; i < c->num_color[1]; ++i) {
+               for (q=0; q < c->num_vary_x; ++q) {
+                  stbhw__process_v_row(p, 0,ypos,
+                     0,c->num_color[0]-1, 0,c->num_color[3]-1, 0,c->num_color[2]-1,
+                     i,i, j,j, k,k,
+                     c->num_vary_y);
+                  ypos += (c->short_side_len*2) + 3;
+               }
+            }
+         }
+      }
+      STB_HBWANG_ASSERT(ypos == size_y);   // the library's overridable assert, not bare assert()
+   } else {
+      ypos = 2;
+      for (k=0; k < c->num_color[3]; ++k) {
+         for (j=0; j < c->num_color[4]; ++j) {
+            for (i=0; i < c->num_color[2]; ++i) {
+               for (q=0; q < c->num_vary_y; ++q) {
+                  stbhw__process_h_row(p, 0,ypos,
+                     0,c->num_color[2]-1, k,k,
+                     0,c->num_color[1]-1, j,j,
+                     0,c->num_color[0]-1, i,i,
+                     c->num_vary_x);
+                  ypos += c->short_side_len + 3;
+               }
+            }
+         }
+      }
+      ypos += 2;
+      for (k=0; k < c->num_color[3]; ++k) {
+         for (j=0; j < c->num_color[4]; ++j) {
+            for (i=0; i < c->num_color[5]; ++i) {
+               for (q=0; q < c->num_vary_x; ++q) {
+                  stbhw__process_v_row(p, 0,ypos,
+                     0,c->num_color[0]-1, i,i,
+                     0,c->num_color[1]-1, j,j,
+                     0,c->num_color[5]-1, k,k,
+                     c->num_vary_y);
+                  ypos += (c->short_side_len*2) + 3;
+               }
+            }
+         }
+      }
+      STB_HBWANG_ASSERT(ypos == size_y);   // the library's overridable assert, not bare assert()
+   }
+   return 1;
+}
+
+
+/////////////////////////////////////////////////////////////
+//
+//  MAP GENERATOR
+//
+
+static void stbhw__draw_pixel(unsigned char *output, int stride, int x, int y, unsigned char c[3])   // store the 3-byte color c at pixel (x,y)
+{
+   memcpy(output + y*stride + x*3, c, 3);   // rows are `stride` bytes apart, pixels are 3 bytes each
+}
+
+// Copy horizontal tile h (sz*2 wide, sz tall) to (x,y), skipping pixels outside [0,xmax) x [0,ymax).
+static void stbhw__draw_h_tile(unsigned char *output, int stride, int xmax, int ymax, int x, int y, stbhw_tile *h, int sz)
+{
+   int col,row;
+   for (row=0; row < sz; ++row)
+      for (col=0; col < sz*2; ++col)
+         if (y+row >= 0 && y+row < ymax &&
+             x+col >= 0 && x+col < xmax)
+            stbhw__draw_pixel(output,stride, x+col,y+row, &h->pixels[(row*sz*2 + col)*3]);
+}
+
+// Copy vertical tile h (sz wide, sz*2 tall) to (x,y), skipping pixels outside [0,xmax) x [0,ymax).
+static void stbhw__draw_v_tile(unsigned char *output, int stride, int xmax, int ymax, int x, int y, stbhw_tile *h, int sz)
+{
+   int col,row;
+   for (row=0; row < sz*2; ++row)
+      for (col=0; col < sz; ++col)
+         if (y+row >= 0 && y+row < ymax &&
+             x+col >= 0 && x+col < xmax)
+            stbhw__draw_pixel(output,stride, x+col,y+row, &h->pixels[(row*sz + col)*3]);
+}
+
+
+// Randomly pick a tile from list whose six colors agree with the constraints
+// *a..*f (a negative constraint matches anything), then write the chosen
+// tile's colors back through a..f. If weighting is non-NULL, tile i is drawn
+// with relative weight weighting[0][i]; otherwise all matches are equally
+// likely. Returns NULL and sets stbhw_error when nothing matches.
+static stbhw_tile * stbhw__choose_tile(stbhw_tile **list, int numlist,
+                                      signed char *a, signed char *b, signed char *c,
+                                      signed char *d, signed char *e, signed char *f,
+                                      int **weighting)
+{
+   int idx, pass;
+   int seen;              // running weight of the matches seen in the current pass
+   int target = 1<<30;    // pass 0: huge sentinel; pass 1: the random draw
+
+   for (pass=0; pass < 2; ++pass) {
+      // pass 0 only totals the weight of all matching tiles;
+      // pass 1 stops at the match that pushes the total past the draw
+      seen = 0;
+      for (idx=0; idx < numlist; ++idx) {
+         stbhw_tile *t = list[idx];
+         if (*a >= 0 && *a != t->a) continue;
+         if (*b >= 0 && *b != t->b) continue;
+         if (*c >= 0 && *c != t->c) continue;
+         if (*d >= 0 && *d != t->d) continue;
+         if (*e >= 0 && *e != t->e) continue;
+         if (*f >= 0 && *f != t->f) continue;
+         seen += weighting ? weighting[0][idx] : 1;
+         if (seen <= target)
+            continue;
+         // selected: update constraints to reflect what we placed
+         *a = t->a;
+         *b = t->b;
+         *c = t->c;
+         *d = t->d;
+         *e = t->e;
+         *f = t->f;
+         return t;
+      }
+      if (seen == 0) {
+         stbhw_error = "couldn't find tile matching constraints";
+         return NULL;
+      }
+      target = STB_HBWANG_RAND() % seen;
+   }
+   STB_HBWANG_ASSERT(0);
+   return NULL;
+}
+
+static int stbhw__match(int x, int y)   // nonzero if c_color at (x,y) equals its diagonal neighbor at (x+1,y+1)
+{
+   return c_color[y][x] == c_color[y+1][x+1];
+}
+
+// Pick an index in [0,num_options) at random, index k being chosen with
+// probability proportional to weights[k].
+static int stbhw__weighted(int num_options, int *weights)
+{
+   int k, total = 0, choice;
+   for (k=0; k < num_options; ++k)
+      total += weights[k];
+   if (total <= 0)   // no positive weight: '% total' would divide by zero, so pick uniformly
+      return num_options > 0 ? STB_HBWANG_RAND() % num_options : 0;
+   choice = STB_HBWANG_RAND() % total;
+   for (k=0; k < num_options; ++k)
+      if ((choice -= weights[k]) < 0)   // same as: choice < running sum of weights[0..k]
+         break;
+   STB_HBWANG_ASSERT(k < num_options);
+   return k;
+}
+
+// Choose a color in [0,num_options) other than old_color, weighted by weights[]
+// when given and uniformly otherwise. Returns old_color unchanged when there is
+// nothing else to pick (fewer than two options, or no positively weighted alternative).
+static int stbhw__change_color(int old_color, int num_options, int *weights)
+{
+   int k, total = 0, choice;
+   if (num_options < 2)   // guards the '% (num_options-1)' below against a zero divisor
+      return old_color;
+   if (!weights) {
+      int offset = 1+STB_HBWANG_RAND() % (num_options-1);
+      return (old_color+offset) % num_options;
+   }
+   for (k=0; k < num_options; ++k)
+      if (k != old_color)
+         total += weights[k];
+   if (total <= 0)   // '% total' would divide by zero
+      return old_color;
+   choice = STB_HBWANG_RAND() % total;
+   for (k=0; k < num_options; ++k)
+      if (k != old_color && (choice -= weights[k]) < 0)   // same as: choice < running sum of the alternatives' weights
+         break;
+   STB_HBWANG_ASSERT(k < num_options);
+   return k;
+}
+
+
+
+// generate a map that is w * h pixels (3-bytes each), rows `stride` bytes apart
+// returns 1 on success, 0 on error
+STBHW_EXTERN int stbhw_generate_image(stbhw_tileset *ts, int **weighting, unsigned char *output, int stride, int w, int h)
+{
+   int sidelen = ts->short_side_len, xmax, ymax;
+   if (sidelen <= 0) { stbhw_error = "tileset has invalid short_side_len"; return 0; }   // would divide by zero below
+   xmax = (w / sidelen) + 6;
+   ymax = (h / sidelen) + 6;
+   if (xmax > STB_HBWANG_MAX_X+6 || ymax > STB_HBWANG_MAX_Y+6) {
+      stbhw_error = "increase STB_HBWANG_MAX_X/Y";
+      return 0;
+   }
+   if (ts->is_corner) {
+      int i,j, ypos;
+      int *cc = ts->num_color;
+
+      for (j=0; j < ymax; ++j) {
+         for (i=0; i < xmax; ++i) {
+            int p = (i-j+1)&3; // corner type
+            if (weighting==NULL || weighting[p]==0 || cc[p] == 1)
+               c_color[j][i] = STB_HBWANG_RAND() % cc[p];
+            else
+               c_color[j][i] = stbhw__weighted(cc[p], weighting[p]);
+         }
+      }
+      #ifndef STB_HBWANG_NO_REPITITION_REDUCTION
+      // now go back through and make sure we don't have adjacent 3x2 vertices that are identical,
+      // to avoid really obvious repetition (which happens easily with extreme weights)
+      for (j=0; j < ymax-3; ++j) {
+         for (i=0; i < xmax-3; ++i) {
+            //int p = (i-j+1) & 3; // corner type   // unused, not sure what the intent was so commenting it out
+            STB_HBWANG_ASSERT(i+3 < STB_HBWANG_MAX_X+6);
+            STB_HBWANG_ASSERT(j+3 < STB_HBWANG_MAX_Y+6);
+            if (stbhw__match(i,j) && stbhw__match(i,j+1) && stbhw__match(i,j+2)
+                && stbhw__match(i+1,j) && stbhw__match(i+1,j+1) && stbhw__match(i+1,j+2)) {
+               int p = ((i+1)-(j+1)+1) & 3;
+               if (cc[p] > 1)
+                  c_color[j+1][i+1] = stbhw__change_color(c_color[j+1][i+1], cc[p], weighting ? weighting[p] : NULL);
+            }
+            if (stbhw__match(i,j) && stbhw__match(i+1,j) && stbhw__match(i+2,j)
+                && stbhw__match(i,j+1) && stbhw__match(i+1,j+1) && stbhw__match(i+2,j+1)) {
+               int p = ((i+2)-(j+1)+1) & 3;
+               if (cc[p] > 1)
+                  c_color[j+1][i+2] = stbhw__change_color(c_color[j+1][i+2], cc[p], weighting ? weighting[p] : NULL);
+            }
+         }
+      }
+      #endif
+
+      ypos = -1 * sidelen;
+      for (j = -1; ypos < h; ++j) {
+         // a general herringbone row consists of:
+         //    horizontal left block, the bottom of a previous vertical, the top of a new vertical
+         int phase = (j & 3);
+         // displace horizontally according to pattern
+         if (phase == 0) {
+            i = 0;
+         } else {
+            i = phase-4;
+         }
+         for (;; i += 4) {
+            int xpos = i * sidelen;
+            if (xpos >= w)
+               break;
+            // horizontal left-block
+            if (xpos + sidelen*2 >= 0 && ypos >= 0) {
+               stbhw_tile *t = stbhw__choose_tile(
+                  ts->h_tiles, ts->num_h_tiles,
+                  &c_color[j+2][i+2], &c_color[j+2][i+3], &c_color[j+2][i+4],
+                  &c_color[j+3][i+2], &c_color[j+3][i+3], &c_color[j+3][i+4],
+                  weighting
+               );
+               if (t == NULL)
+                  return 0;
+               stbhw__draw_h_tile(output,stride,w,h, xpos, ypos, t, sidelen);
+            }
+            xpos += sidelen * 2;
+            // now we're at the end of a previous vertical one
+            xpos += sidelen;
+            // now we're at the start of a new vertical one
+            if (xpos < w) {
+               stbhw_tile *t = stbhw__choose_tile(
+                  ts->v_tiles, ts->num_v_tiles,
+                  &c_color[j+2][i+5], &c_color[j+3][i+5], &c_color[j+4][i+5],
+                  &c_color[j+2][i+6], &c_color[j+3][i+6], &c_color[j+4][i+6],
+                  weighting
+               );
+               if (t == NULL)
+                  return 0;
+               stbhw__draw_v_tile(output,stride,w,h, xpos, ypos,  t, sidelen);
+            }
+         }
+         ypos += sidelen;
+      }
+   } else {
+      // @TODO edge-color repetition reduction
+      int i,j, ypos;
+      memset(v_color, -1, sizeof(v_color));
+      memset(h_color, -1, sizeof(h_color));
+
+      ypos = -1 * sidelen;
+      for (j = -1; ypos<h; ++j) {
+         // a general herringbone row consists of:
+         //    horizontal left block, the bottom of a previous vertical, the top of a new vertical
+         int phase = (j & 3);
+         // displace horizontally according to pattern
+         if (phase == 0) {
+            i = 0;
+         } else {
+            i = phase-4;
+         }
+         for (;; i += 4) {
+            int xpos = i * sidelen;
+            if (xpos >= w)
+               break;
+            // horizontal left-block
+            if (xpos + sidelen*2 >= 0 && ypos >= 0) {
+               stbhw_tile *t = stbhw__choose_tile(
+                  ts->h_tiles, ts->num_h_tiles,
+                  &h_color[j+2][i+2], &h_color[j+2][i+3],
+                  &v_color[j+2][i+2], &v_color[j+2][i+4],
+                  &h_color[j+3][i+2], &h_color[j+3][i+3],
+                  weighting
+               );
+               if (t == NULL) return 0;
+               stbhw__draw_h_tile(output,stride,w,h, xpos, ypos, t, sidelen);
+            }
+            xpos += sidelen * 2;
+            // now we're at the end of a previous vertical one
+            xpos += sidelen;
+            // now we're at the start of a new vertical one
+            if (xpos < w) {
+               stbhw_tile *t = stbhw__choose_tile(
+                  ts->v_tiles, ts->num_v_tiles,
+                  &h_color[j+2][i+5],
+                  &v_color[j+2][i+5], &v_color[j+2][i+6],
+                  &v_color[j+3][i+5], &v_color[j+3][i+6],
+                  &h_color[j+4][i+5],
+                  weighting
+               );
+               if (t == NULL) return 0;
+               stbhw__draw_v_tile(output,stride,w,h, xpos, ypos,  t, sidelen);
+            }
+         }
+         ypos += sidelen;
+      }
+   }
+   return 1;
+}
+
+// Copy the horizontal tile whose template slot starts at (xpos,ypos) into a new stbhw_tile and append it to p->ts->h_tiles.
+static void stbhw__parse_h_rect(stbhw__process *p, int xpos, int ypos,
+                            int a, int b, int c, int d, int e, int f)
+{
+   int i,j, len = p->c->short_side_len;
+   stbhw_tile *h = (stbhw_tile *) malloc(sizeof(*h)-1 + 3 * (len*2) * len);
+   if (h == NULL) { stbhw_error = "out of memory"; return; }   // leave the tileset short rather than dereference NULL
+   ++xpos, ++ypos;   // the tile's pixels start one pixel in from the slot origin
+   h->a = a, h->b = b, h->c = c, h->d = d, h->e = e, h->f = f;
+   for (j=0; j < len; ++j)
+      for (i=0; i < len*2; ++i)
+         memcpy(h->pixels + j*(3*len*2) + i*3, p->data+(ypos+j)*p->stride+(xpos+i)*3, 3);
+   STB_HBWANG_ASSERT(p->ts->num_h_tiles < p->ts->max_h_tiles);
+   p->ts->h_tiles[p->ts->num_h_tiles++] = h;
+}
+
+// Copy the vertical tile whose template slot starts at (xpos,ypos) into a new stbhw_tile and append it to p->ts->v_tiles.
+static void stbhw__parse_v_rect(stbhw__process *p, int xpos, int ypos,
+                            int a, int b, int c, int d, int e, int f)
+{
+   int i,j, len = p->c->short_side_len;
+   stbhw_tile *h = (stbhw_tile *) malloc(sizeof(*h)-1 + 3 * (len*2) * len);
+   if (h == NULL) { stbhw_error = "out of memory"; return; }   // leave the tileset short rather than dereference NULL
+   ++xpos, ++ypos;   // the tile's pixels start one pixel in from the slot origin
+   h->a = a, h->b = b, h->c = c, h->d = d, h->e = e, h->f = f;
+   for (j=0; j < len*2; ++j)
+      for (i=0; i < len; ++i)
+         memcpy(h->pixels + j*(3*len) + i*3, p->data+(ypos+j)*p->stride+(xpos+i)*3, 3);
+   STB_HBWANG_ASSERT(p->ts->num_v_tiles < p->ts->max_v_tiles);
+   p->ts->v_tiles[p->ts->num_v_tiles++] = h;
+}
+
+// Build tileset *ts from a template image whose first pixel row ends with the
+// 9-byte configuration header written by stbhw_make_template().
+// Returns 1 on success; on failure returns 0 and releases anything it allocated.
+STBHW_EXTERN int stbhw_build_tileset_from_image(stbhw_tileset *ts, unsigned char *data, int stride, int w, int h)
+{
+   int i, h_count, v_count, num_types;
+   unsigned char header[9];
+   stbhw_config c = { 0 };
+   stbhw__process p = { 0 };
+
+   // the header is read from the last 9 bytes (3 pixels) of the first row
+   if (w < 3)
+      return 0;
+   // remove encoding that makes it more visually obvious it encodes actual data
+   for (i=0; i < 9; ++i)
+      header[i] = data[w*3 - 1 - i] ^ (i*55);
+   // extract header info
+   if (header[7] == 0xc0) {
+      // corner-type
+      c.is_corner = 1;
+      for (i=0; i < 4; ++i)
+         c.num_color[i] = header[i];
+      c.num_vary_x = header[4];
+      c.num_vary_y = header[5];
+      c.short_side_len = header[6];
+   } else {
+      c.is_corner = 0;
+      // edge-type
+      for (i=0; i < 6; ++i)
+         c.num_color[i] = header[i];
+      c.num_vary_x = header[6];
+      c.num_vary_y = header[7];
+      c.short_side_len = header[8];
+   }
+
+   if (c.num_vary_x < 0 || c.num_vary_x > 64 || c.num_vary_y < 0 || c.num_vary_y > 64)
+      return 0;
+   if (c.short_side_len == 0)
+      return 0;
+   // validate every color count this tile type uses (4 for corners, 6 for edges)
+   num_types = c.is_corner ? 4 : 6;
+   for (i=0; i < num_types; ++i)
+      if (c.num_color[i] < 1 || c.num_color[i] > 32)
+         return 0;
+   stbhw__get_template_info(&c, NULL, NULL, &h_count, &v_count);
+   ts->is_corner = c.is_corner;
+   ts->short_side_len = c.short_side_len;
+   memcpy(ts->num_color, c.num_color, sizeof(ts->num_color));
+   ts->max_h_tiles = h_count;
+   ts->max_v_tiles = v_count;
+   ts->num_h_tiles = ts->num_v_tiles = 0;
+   ts->h_tiles = (stbhw_tile **) malloc(sizeof(*ts->h_tiles) * h_count);
+   ts->v_tiles = (stbhw_tile **) malloc(sizeof(*ts->v_tiles) * v_count);
+   p.ts = ts; p.c = &c;
+   p.data = data; p.stride = stride; p.w = w; p.h = h;
+   p.process_h_rect = stbhw__parse_h_rect;
+   p.process_v_rect = stbhw__parse_v_rect;
+   // load all the tiles out of the image; succeed only if every expected tile was stored
+   if ((h_count > 0 && ts->h_tiles == NULL) || (v_count > 0 && ts->v_tiles == NULL))
+      stbhw_error = "out of memory";
+   else if (stbhw__process_template(&p) && ts->num_h_tiles == h_count && ts->num_v_tiles == v_count)
+      return 1;
+   stbhw_free_tileset(ts);   // failure: free the tile arrays and any tiles already loaded
+   return 0;
+}
+
+// Release every tile and both tile arrays owned by ts, then reset its
+// pointers and counters so the tileset is left empty.
+STBHW_EXTERN void stbhw_free_tileset(stbhw_tileset *ts)
+{
+   int n;
+   for (n=0; n < ts->num_h_tiles; ++n) free(ts->h_tiles[n]);
+   for (n=0; n < ts->num_v_tiles; ++n) free(ts->v_tiles[n]);
+   free(ts->h_tiles);
+   ts->h_tiles = NULL;
+   ts->num_h_tiles = ts->max_h_tiles = 0;
+   free(ts->v_tiles);
+   ts->v_tiles = NULL;
+   ts->num_v_tiles = ts->max_v_tiles = 0;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//               GENERATOR
+//
+//
+
+
+// shared code
+
+static void stbhw__set_pixel(unsigned char *data, int stride, int xpos, int ypos, unsigned char color[3])   // store the 3-byte color at pixel (xpos,ypos)
+{
+   memcpy(data + ypos*stride + xpos*3, color, 3);   // rows are `stride` bytes apart, pixels are 3 bytes each
+}
+
+static void stbhw__stbhw__set_pixel_whiten(unsigned char *data, int stride, int xpos, int ypos, unsigned char color[3])   // store color blended 2:1 with white at (xpos,ypos)
+{
+   unsigned char tint[3], *dst = data + ypos*stride + xpos*3;
+   int ch;
+   for (ch=0; ch < 3; ++ch)
+      tint[ch] = (color[ch]*2 + 255)/3;
+   memcpy(dst, tint, 3);
+}
+
+
+static unsigned char stbhw__black[3] = { 0,0,0 };   // all-zero color used as the base of every template edge line
+
+// each edge set gets its own unique color variants
+// used http://phrogz.net/css/distinct-colors.html to generate this set,
+// but it's not very good and needs to be revised
+
+static unsigned char stbhw__color[7][8][3] =   // looked up as stbhw__color[slot][color] by the line-drawing helpers; 3 bytes per entry
+{
+   { {255,51,51}  , {143,143,29}, {0,199,199}, {159,119,199},     {0,149,199}  , {143, 0,143}, {255,128,0}, {64,255,0},  },
+   { {235,255,30 }, {255,0,255},  {199,139,119},  {29,143, 57},    {143,0,71}   , { 0,143,143}, {0,99,199}, {143,71,0},  },
+   { {0,149,199}  , {143, 0,143}, {255,128,0}, {64,255,0},        {255,191,0}  , {51,255,153}, {0,0,143}, {199,119,159},},
+   { {143,0,71}   , { 0,143,143}, {0,99,199}, {143,71,0},         {255,190,153}, { 0,255,255}, {128,0,255}, {255,51,102},},
+   { {255,191,0}  , {51,255,153}, {0,0,143}, {199,119,159},       {255,51,51}  , {143,143,29}, {0,199,199}, {159,119,199},},
+   { {255,190,153}, { 0,255,255}, {128,0,255}, {255,51,102},      {235,255,30 }, {255,0,255}, {199,139,119},  {29,143, 57}, },
+
+   { {40,40,40 },  { 90,90,90 }, { 150,150,150 }, { 200,200,200 },
+     { 255,90,90 }, { 160,160,80}, { 50,150,150 }, { 200,50,200 } },
+};
+
+// Draw a black line of len pixels rightwards from (xpos,ypos), then recolor its
+// middle section with the whitened stbhw__color[slot][color].
+static void stbhw__draw_hline(unsigned char *data, int stride, int xpos, int ypos, int color, int len, int slot)
+{
+   int n;
+   int first = len * 6 / 16;    // colored section is [first,last)
+   int last  = len * 10 / 16;
+   if (last-first < 2) {        // section too short: use 2 pixels around the midpoint (3 if len is odd)
+      first = len/2 - 1;
+      last = first + 2 + (len & 1);
+   }
+   for (n=0; n < len; ++n)
+      stbhw__set_pixel(data, stride, xpos+n, ypos, stbhw__black);
+   for (n=first; n < last; ++n)
+      stbhw__stbhw__set_pixel_whiten(data, stride, xpos+n, ypos, stbhw__color[slot][color]);
+}
+
+// Draw a black line of len pixels downwards from (xpos,ypos), then recolor its
+// middle section with the whitened stbhw__color[slot][color].
+static void stbhw__draw_vline(unsigned char *data, int stride, int xpos, int ypos, int color, int len, int slot)
+{
+   int n;
+   int first = len * 6 / 16;    // colored section is [first,last)
+   int last  = len * 10 / 16;
+   if (last-first < 2) {        // section too short: use 2 pixels around the midpoint (3 if len is odd)
+      first = len/2 - 1;
+      last = first + 2 + (len & 1);
+   }
+   for (n=0; n < len; ++n)
+      stbhw__set_pixel(data, stride, xpos, ypos+n, stbhw__black);
+   for (n=first; n < last; ++n)
+      stbhw__stbhw__set_pixel_whiten(data, stride, xpos, ypos+n, stbhw__color[slot][color]);
+}
+
+//                 0--*--1--*--2--*--3
+//                 |     |           |
+//                 *     *           *
+//                 |     |           |
+//     1--*--2--*--3     0--*--1--*--2
+//     |           |     |
+//     *           *     *
+//     |           |     |
+//     0--*--1--*--2--*--3
+//
+// variables while enumerating (no correspondence between corners
+// of the types is implied by these variables)
+//
+//     a-----b-----c      a-----d
+//     |           |      |     |
+//     |           |      |     |
+//     |           |      |     |
+//     d-----e-----f      b     e
+//                        |     |
+//                        |     |
+//                        |     |
+//                        c-----f
+//
+
+static unsigned char stbhw__corner_colors[4][4][3] =   // looked up as [corner type][color]; file-local like the other stbhw__ tables
+{
+   { { 255,0,0 }, { 200,200,200 }, { 100,100,200 }, { 255,200,150 }, },
+   { { 0,0,255 }, { 255,255,0 },   { 100,200,100 }, { 150,255,200 }, },
+   { { 255,0,255 }, { 80,80,80 },  { 200,100,100 }, { 200,150,255 }, },
+   { { 0,255,255 }, { 0,255,0 },   { 200,120,200 }, { 255,200,200 }, },
+};
+
+static int stbhw__corner_colors_to_edge_color[4][4] =   // edge color index for a pair of corner colors; file-local like the other stbhw__ tables
+{
+   // 0   1   2   3
+   {  0,  1,  4,  9, }, // 0
+   {  2,  3,  5, 10, }, // 1
+   {  6,  7,  8, 11, }, // 2
+   { 12, 13, 14, 15, }, // 3
+};
+
+#define stbhw__c2e stbhw__corner_colors_to_edge_color
+
+// Paint a 4x4 block with its four corner pixels left out, covering offsets
+// -2..1 around (x,y) relative to the slot origin (xpos,ypos), in the template
+// marker color; pixels outside columns 1..w or rows 1..h are skipped.
+static void stbhw__draw_clipped_corner(unsigned char *data, int stride, int xpos, int ypos, int w, int h, int x, int y)
+{
+   static unsigned char template_color[3] = { 167,204,204 };
+   int dx,dy;
+   for (dy = -2; dy <= 1; ++dy) {
+      for (dx = -2; dx <= 1; ++dx) {
+         int at_corner = (dx == -2 || dx == 1) && (dy == -2 || dy == 1);
+         int in_x = (x+dx >= 1 && x+dx <= w);
+         int in_y = (y+dy >= 1 && y+dy <= h);
+         if (!at_corner && in_x && in_y)
+            stbhw__set_pixel(data, stride, xpos+x+dx, ypos+y+dy, template_color);
+      }
+   }
+}
+
+static void stbhw__edge_process_h_rect(stbhw__process *p, int xpos, int ypos,   // outline one horizontal edge-type tile slot; a..f are its six edge colors
+                            int a, int b, int c, int d, int e, int f)
+{
+   int len = p->c->short_side_len;
+   stbhw__draw_hline(p->data, p->stride, xpos+1        , ypos        , a, len, 2);   // top edge, left half
+   stbhw__draw_hline(p->data, p->stride, xpos+  len+1  , ypos        , b, len, 3);   // top edge, right half
+   stbhw__draw_vline(p->data, p->stride, xpos          , ypos+1      , c, len, 1);   // left edge
+   stbhw__draw_vline(p->data, p->stride, xpos+2*len+1  , ypos+1      , d, len, 4);   // right edge
+   stbhw__draw_hline(p->data, p->stride, xpos+1        , ypos + len+1, e, len, 0);   // bottom edge, left half
+   stbhw__draw_hline(p->data, p->stride, xpos + len+1  , ypos + len+1, f, len, 2);   // bottom edge, right half
+}
+
+static void stbhw__edge_process_v_rect(stbhw__process *p, int xpos, int ypos,   // outline one vertical edge-type tile slot; a..f are its six edge colors
+                            int a, int b, int c, int d, int e, int f)
+{
+   int len = p->c->short_side_len;
+   stbhw__draw_hline(p->data, p->stride, xpos+1      , ypos          , a, len, 0);   // top edge
+   stbhw__draw_vline(p->data, p->stride, xpos        , ypos+1        , b, len, 5);   // left edge, upper half
+   stbhw__draw_vline(p->data, p->stride, xpos + len+1, ypos+1        , c, len, 1);   // right edge, upper half
+   stbhw__draw_vline(p->data, p->stride, xpos        , ypos +   len+1, d, len, 4);   // left edge, lower half
+   stbhw__draw_vline(p->data, p->stride, xpos + len+1, ypos +   len+1, e, len, 5);   // right edge, lower half
+   stbhw__draw_hline(p->data, p->stride, xpos+1      , ypos + 2*len+1, f, len, 3);   // bottom edge
+}
+
+static void stbhw__corner_process_h_rect(stbhw__process *p, int xpos, int ypos,   // outline one horizontal corner-type tile slot; a..f are its six corner colors
+                            int a, int b, int c, int d, int e, int f)
+{
+   int len = p->c->short_side_len;
+   // edge lines: each edge's color index is looked up from the corner colors at its two ends
+   stbhw__draw_hline(p->data, p->stride, xpos+1        , ypos        , stbhw__c2e[a][b], len, 2);
+   stbhw__draw_hline(p->data, p->stride, xpos+  len+1  , ypos        , stbhw__c2e[b][c], len, 3);
+   stbhw__draw_vline(p->data, p->stride, xpos          , ypos+1      , stbhw__c2e[a][d], len, 1);
+   stbhw__draw_vline(p->data, p->stride, xpos+2*len+1  , ypos+1      , stbhw__c2e[c][f], len, 4);
+   stbhw__draw_hline(p->data, p->stride, xpos+1        , ypos + len+1, stbhw__c2e[d][e], len, 0);
+   stbhw__draw_hline(p->data, p->stride, xpos + len+1  , ypos + len+1, stbhw__c2e[e][f], len, 2);
+   // corner markers, drawn only where corner_type_color_template enables them for that corner type and color
+   if (p->c->corner_type_color_template[1][a]) stbhw__draw_clipped_corner(p->data,p->stride, xpos,ypos, len*2,len, 1,1);
+   if (p->c->corner_type_color_template[2][b]) stbhw__draw_clipped_corner(p->data,p->stride, xpos,ypos, len*2,len, len+1,1);
+   if (p->c->corner_type_color_template[3][c]) stbhw__draw_clipped_corner(p->data,p->stride, xpos,ypos, len*2,len, len*2+1,1);
+
+   if (p->c->corner_type_color_template[0][d]) stbhw__draw_clipped_corner(p->data,p->stride, xpos,ypos, len*2,len, 1,len+1);
+   if (p->c->corner_type_color_template[1][e]) stbhw__draw_clipped_corner(p->data,p->stride, xpos,ypos, len*2,len, len+1,len+1);
+   if (p->c->corner_type_color_template[2][f]) stbhw__draw_clipped_corner(p->data,p->stride, xpos,ypos, len*2,len, len*2+1,len+1);
+   // finally one pixel per corner, colored by corner type and color (drawn last, so it overwrites the lines above)
+   stbhw__set_pixel(p->data, p->stride, xpos        , ypos, stbhw__corner_colors[1][a]);
+   stbhw__set_pixel(p->data, p->stride, xpos+len    , ypos, stbhw__corner_colors[2][b]);
+   stbhw__set_pixel(p->data, p->stride, xpos+2*len+1, ypos, stbhw__corner_colors[3][c]);
+   stbhw__set_pixel(p->data, p->stride, xpos        , ypos+len+1, stbhw__corner_colors[0][d]);
+   stbhw__set_pixel(p->data, p->stride, xpos+len    , ypos+len+1, stbhw__corner_colors[1][e]);
+   stbhw__set_pixel(p->data, p->stride, xpos+2*len+1, ypos+len+1, stbhw__corner_colors[2][f]);
+}
+
+static void stbhw__corner_process_v_rect(stbhw__process *p, int xpos, int ypos,   // outline one vertical corner-type tile slot; a..f are its six corner colors
+                            int a, int b, int c, int d, int e, int f)
+{
+   int len = p->c->short_side_len;
+   // edge lines: each edge's color index is looked up from the corner colors at its two ends
+   stbhw__draw_hline(p->data, p->stride, xpos+1      , ypos          , stbhw__c2e[a][d], len, 0);
+   stbhw__draw_vline(p->data, p->stride, xpos        , ypos+1        , stbhw__c2e[a][b], len, 5);
+   stbhw__draw_vline(p->data, p->stride, xpos + len+1, ypos+1        , stbhw__c2e[d][e], len, 1);
+   stbhw__draw_vline(p->data, p->stride, xpos        , ypos +   len+1, stbhw__c2e[b][c], len, 4);
+   stbhw__draw_vline(p->data, p->stride, xpos + len+1, ypos +   len+1, stbhw__c2e[e][f], len, 5);
+   stbhw__draw_hline(p->data, p->stride, xpos+1      , ypos + 2*len+1, stbhw__c2e[c][f], len, 3);
+   // corner markers, drawn only where corner_type_color_template enables them for that corner type and color
+   if (p->c->corner_type_color_template[0][a]) stbhw__draw_clipped_corner(p->data,p->stride, xpos,ypos, len,len*2, 1,1);
+   if (p->c->corner_type_color_template[3][b]) stbhw__draw_clipped_corner(p->data,p->stride, xpos,ypos, len,len*2, 1,len+1);
+   if (p->c->corner_type_color_template[2][c]) stbhw__draw_clipped_corner(p->data,p->stride, xpos,ypos, len,len*2, 1,len*2+1);
+
+   if (p->c->corner_type_color_template[1][d]) stbhw__draw_clipped_corner(p->data,p->stride, xpos,ypos, len,len*2, len+1,1);
+   if (p->c->corner_type_color_template[0][e]) stbhw__draw_clipped_corner(p->data,p->stride, xpos,ypos, len,len*2, len+1,len+1);
+   if (p->c->corner_type_color_template[3][f]) stbhw__draw_clipped_corner(p->data,p->stride, xpos,ypos, len,len*2, len+1,len*2+1);
+   // finally one pixel per corner, colored by corner type and color (drawn last, so it overwrites the lines above)
+   stbhw__set_pixel(p->data, p->stride, xpos      , ypos        , stbhw__corner_colors[0][a]);
+   stbhw__set_pixel(p->data, p->stride, xpos      , ypos+len    , stbhw__corner_colors[3][b]);
+   stbhw__set_pixel(p->data, p->stride, xpos      , ypos+2*len+1, stbhw__corner_colors[2][c]);
+   stbhw__set_pixel(p->data, p->stride, xpos+len+1, ypos        , stbhw__corner_colors[1][d]);
+   stbhw__set_pixel(p->data, p->stride, xpos+len+1, ypos+len    , stbhw__corner_colors[0][e]);
+   stbhw__set_pixel(p->data, p->stride, xpos+len+1, ypos+2*len+1, stbhw__corner_colors[3][f]);
+}
+
+// Draws the template image for config c into data (RGB, 3 bytes per pixel, rows
+// stride_in_bytes apart) and stores the configuration in the first pixel row.
+// Returns 1 on success, 0 (with stbhw_error set) if the image is too small.
+STBHW_EXTERN int stbhw_make_template(stbhw_config *c, unsigned char *data, int w, int h, int stride_in_bytes)
+{
+   stbhw__process p;
+   int i, n;
+   p.data = data;
+   p.w = w;
+   p.h = h;
+   p.stride = stride_in_bytes;
+   p.ts = 0;
+   p.c = c;
+   if (c->is_corner) {
+      p.process_h_rect = stbhw__corner_process_h_rect;
+      p.process_v_rect = stbhw__corner_process_v_rect;
+   } else {
+      p.process_h_rect = stbhw__edge_process_h_rect;
+      p.process_v_rect = stbhw__edge_process_v_rect;
+   }
+
+   for (i=0; i < p.h; ++i)
+      memset(p.data + i*p.stride, 255, 3*p.w);
+
+   if (!stbhw__process_template(&p))
+      return 0;
+   // the header below needs the last 9 bytes (3 pixels) of row 0; a config with no tiles passes the check above for any w
+   if (w < 3) {
+      stbhw_error = "image too small for configuration";
+      return 0;
+   }
+
+   // write out binary information in first line of image, back to front: the color counts
+   // (4 for corner type, 6 for edge type), num_vary_x, num_vary_y, short_side_len, then 0xc0 for corner type
+   n = c->is_corner ? 4 : 6;
+   for (i=0; i < n; ++i)
+      data[w*3-1-i] = c->num_color[i];
+   data[w*3-1-n] = c->num_vary_x;
+   data[w*3-2-n] = c->num_vary_y;
+   data[w*3-3-n] = c->short_side_len;
+   if (c->is_corner)
+      data[w*3-4-n] = 0xc0;
+
+   // make it more obvious it encodes actual data
+   for (i=0; i < 9; ++i)
+      p.data[p.w*3 - 1 - i] ^= i*55;
+
+   return 1;
+}
+#endif // STB_HBWANG_IMPLEMENTATION
diff --git a/lib/stb/stb_hexwave.h b/lib/stb/stb_hexwave.h
new file mode 100644
index 0000000..480ab1b
--- /dev/null
+++ b/lib/stb/stb_hexwave.h
@@ -0,0 +1,680 @@
+// stb_hexwave - v0.5 - public domain, initial release 2021-04-01
+//
+// A flexible anti-aliased (bandlimited) digital audio oscillator.
+//
+// This library generates waveforms of a variety of shapes made of
+// line segments. It does not do envelopes, LFO effects, etc.; it
+// merely tries to solve the problem of generating an artifact-free
+// morphable digital waveform with a variety of spectra, and leaves
+// it to the user to rescale the waveform and mix multiple voices, etc.
+//
+// Compiling:
+//
+//   In one C/C++ file that #includes this file, do
+//
+//      #define STB_HEXWAVE_IMPLEMENTATION
+//      #include "stb_hexwave.h"
+//
+//   Optionally, #define STB_HEXWAVE_STATIC before including
+//   the header to cause the definitions to be private to the
+//   implementation file (i.e. to be "static" instead of "extern").
+//
+// Notes:
+//
+//   Optionally performs memory allocation during initialization,
+//   never allocates otherwise.
+//
+// License:
+//
+//   See end of file for license information.
+//
+// Usage:
+//
+//   Initialization:
+//
+//     hexwave_init(32,16,NULL); // read "header section" for alternatives
+//
+//   Create oscillator:
+//
+//     HexWave *osc = malloc(sizeof(*osc)); // or "new HexWave", or declare globally or on stack
+//     hexwave_create(osc, reflect_flag, peak_time, half_height, zero_wait);
+//       see "Waveform shapes" below for the meaning of these parameters
+//
+//   Generate audio:
+//
+//     hexwave_generate_samples(output, number_of_samples, osc, oscillator_freq)
+//       where:
+//         output is a buffer where the library will store floating point audio samples
+//         number_of_samples is the number of audio samples to generate
+//         osc is a pointer to a HexWave
+//         oscillator_freq is the frequency of the oscillator divided by the sample rate
+//
+//       The output samples will continue from where the samples generated by the
+//       previous hexwave_generate_samples() on this oscillator ended.
+//
+//   Change oscillator waveform:
+//
+//     hexwave_change(osc, reflect_flag, peak_time, half_height, zero_wait);
+//       can call in between calls to hexwave_generate_samples
+//
+// Waveform shapes:
+//
+//   All waveforms generated by hexwave are constructed from six line segments
+//   characterized by 3 parameters.
+//
+//   See demonstration: https://www.youtube.com/watch?v=hsUCrAsDN-M
+//
+//                 reflect=0                          reflect=1
+//
+//           0-----P---1                        0-----P---1    peak_time = P
+//                 .     1                            .     1
+//                /\_    :                           /\_    :
+//               /   \_  :                          /   \_  :
+//              /      \.H                         /      \.H  half_height = H
+//             /       | :                        /       | :
+//       _____/        |_:___               _____/        | :       _____
+//           .           :   \        |         .         | :      /
+//           .           :    \       |         .         | :     /
+//           .           :     \     _/         .         \_:    /
+//           .           :      \  _/           .           :_  /
+//           .          -1       \/             .          -1 \/
+//       0 - Z - - - - 1                    0 - Z - - - - 1   zero_wait = Z
+//
+//    Classic waveforms:
+//                               peak    half    zero
+//                     reflect   time   height   wait
+//      Sawtooth          1       0       0       0
+//      Square            1       0       1       0
+//      Triangle          1       0.5     0       0
+//
+//    Some waveforms can be produced in multiple ways, which is useful when morphing
+//    into other waveforms, and there are a few more notable shapes:
+//
+//                               peak    half    zero
+//                     reflect   time   height   wait
+//      Sawtooth          1       1      any      0
+//      Sawtooth (8va)    1       0      -1       0
+//      Triangle          1       0.5     0       0
+//      Square            1       0       1       0
+//      Square            0       0       1       0
+//      Triangle          0       0.5     0       0
+//      Triangle          0       0      -1       0
+//      AlternatingSaw    0       0       0       0
+//      AlternatingSaw    0       1      any      0
+//      Stairs            0       0       1       0.5
+//
+//    The "Sawtooth (8va)" waveform is identical to a sawtooth wave with 2x the
+//    frequency, but when morphed with other values, it becomes an overtone of
+//    the base frequency.
+//
+//  Morphing waveforms:
+//
+//    Sweeping peak_time morphs the waveform while producing various spectra.
+//    Sweeping half_height effectively crossfades between two waveforms; useful, but less exciting.
+//    Sweeping zero_wait produces a similar effect no matter the rest of the waveform,
+//        a sort of high-pass/PWM effect where the wave becomes silent at zero_wait=1.
+//
+//    You can trivially morph between any two waveforms from the above table
+//    which only differ in one column.
+//
+//    Crossfade between classic waveforms:
+//                                            peak     half    zero
+//        Start         End         reflect   time    height   wait
+//        -----         ---         -------   ----    ------   ----
+//        Triangle      Square         0       0      -1..1    0
+//        Saw           Square         1       0       0..1    0
+//        Triangle      Saw            1       0.5     0..2    0
+//
+//    The last morph uses half_height values larger than 1, which means it will
+//    be louder and the output should be scaled down by half to compensate, or better
+//    by dynamically tracking the morph: volume_scale = 1 - half_height/4
+//
+//    Non-crossfade morph between classic waveforms, most require changing
+//    two parameters at the same time:
+//                                           peak     half    zero
+//      Start         End         reflect    time    height   wait
+//      -----         ---         -------    ----    ------   ----
+//      Square        Triangle      any      0..0.5   1..0     0
+//      Square        Saw            1       0..1     1..any   0
+//      Triangle      Saw            1     0.5..1     0..-1    0
+//
+//    Other noteworthy morphs between simple shapes:
+//                                                            peak     half    zero
+//      Start           Halfway       End          reflect    time    height   wait
+//      -----           ---------     ---          -------    ----    ------   ----
+//      Saw (8va,neg)                Saw (pos)        1       0..1      -1      0
+//      Saw (neg)                    Saw (pos)        1       0..1       0      0
+//      Triangle                     AlternatingSaw   0       0..1      -1      0
+//      AlternatingSaw  Triangle     AlternatingSaw   0       0..1       0      0
+//      Square                       AlternatingSaw   0       0..1       1      0
+//      Triangle        Triangle     AlternatingSaw   0       0..1    -1..1     0
+//      Square                       AlternatingSaw   0       0..1     1..0     0
+//      Saw (8va)       Triangle     Saw              1       0..1    -1..1     0
+//      Saw (neg)                    Saw (pos)        1       0..1     0..1     0
+//      AlternatingSaw               AlternatingSaw   0       0..1     0..any   0
+//
+//   The last entry is noteworthy because the morph from the halfway point to either
+//   endpoint sounds very different. For example, an LFO sweeping back and forth over
+//   the whole range will morph between the middle timbre and the AlternatingSaw
+//   timbre in two different ways, alternating.
+//
+//   Entries with "any" for half_height are whole families of morphs, as you can pick
+//   any value you want as the endpoint for half_height.
+//
+//   You can always morph between any two waveforms with the same value of 'reflect'
+//   by just sweeping the parameters simultaneously. There will never be artifacts
+//   and the result will always be useful, if not necessarily what you want.
+//
+//   You can vary the sound of two-parameter morphs by ramping them differently,
+//   e.g. if the morph goes from t=0..1, then square-to-triangle looks like:
+//        peak_time   = lerp(t, 0, 0.5)
+//        half_height = lerp(t, 1, 0  )
+//   but you can also do things like:
+//        peak_time   = lerp(smoothstep(t), 0, 0.5)
+//        half_height = cos(PI/2 * t)
+//
+// How it works:
+//
+//   hexwave uses BLEP to bandlimit discontinuities and BLAMP
+//   to bandlimit C1 discontinuities. This is not polyBLEP
+//   (polynomial BLEP), it is table-driven BLEP. It is
+//   also not minBLEP (minimum-phase BLEP), as that complicates
+//   things for little benefit once BLAMP is involved.
+//
+//   The previous oscillator frequency is remembered, and when
+//   the frequency changes, a BLAMP is generated to remove the
+//   C1 discontinuity, which reduces artifacts for sweeps/LFO.
+//
+//   Changes to an oscillator timbre using hexwave_change() actually
+//   wait until the oscillator finishes its current cycle. All
+//   waveforms with non-zero "zero_wait" settings pass through 0
+//   and have 0-slope at the start of a cycle, which means changing
+//   the settings is artifact free at that time. (If zero_wait is 0,
+//   the code still treats it as passing through 0 with 0-slope; it'll
+//   apply the necessary fixups to make it artifact free as if it does
+//   transition to 0 with 0-slope vs. the waveform at the end of
+//   the cycle, then adds the fixups for a non-0 and non-0 slope
+//   at the start of the cycle, which cancels out if zero_wait is 0,
+//   and still does the right thing if zero_wait is 0 when the
+//   settings are updated.)
+//
+//   BLEP/BLAMP normally requires overlapping buffers, but this
+//   is hidden from the user by generating the waveform to a
+//   temporary buffer and saving the overlap regions internally
+//   between calls. (It is slightly more complicated; see code.)
+//
+//   By design all shapes have 0 DC offset; this is one reason
+//   hexwave uses zero_wait instead of standard PWM.
+//
+//   The internals of hexwave could support any arbitrary shape
+//   made of line segments, but I chose not to expose this
+//   generality in favor of a simple, easy-to-use API.
+
+#ifndef STB_INCLUDE_STB_HEXWAVE_H
+#define STB_INCLUDE_STB_HEXWAVE_H
+
+#ifndef STB_HEXWAVE_MAX_BLEP_LENGTH
+#define STB_HEXWAVE_MAX_BLEP_LENGTH   64 // good enough for anybody
+#endif
+
+#ifdef STB_HEXWAVE_STATIC
+#define STB_HEXWAVE_DEF static
+#else
+#define STB_HEXWAVE_DEF extern
+#endif
+
+typedef struct HexWave HexWave;
+
+STB_HEXWAVE_DEF void hexwave_init(int width, int oversample, float *user_buffer);
+//         width: size of BLEP, from 4..64, larger is slower & more memory but less aliasing
+//    oversample: 2+, number of subsample positions, larger uses more memory but less noise
+//   user_buffer: optional, if provided the library will perform no allocations.
+//                16*width*(oversample+1) bytes, must stay allocated as long as library is used
+//                technically it only needs:   8*( width * (oversample  + 1))
+//                                           + 8*((width *  oversample) + 1)  bytes
+//
+// width can be larger than 64 if you define STB_HEXWAVE_MAX_BLEP_LENGTH to a larger value
+
+STB_HEXWAVE_DEF void hexwave_shutdown(float *user_buffer);
+//       user_buffer: pass in same parameter as passed to hexwave_init
+
+STB_HEXWAVE_DEF void hexwave_create(HexWave *hex, int reflect, float peak_time, float half_height, float zero_wait);
+// see docs above for description
+//
+//   reflect is tested as 0 or non-zero
+//   peak_time is clamped to 0..1
+//   half_height is not clamped
+//   zero_wait is clamped to 0..1
+
+STB_HEXWAVE_DEF void hexwave_change(HexWave *hex, int reflect, float peak_time, float half_height, float zero_wait);
+// see docs
+
+STB_HEXWAVE_DEF void hexwave_generate_samples(float *output, int num_samples, HexWave *hex, float freq);
+//        output: buffer where the library will store generated floating point audio samples
+//   num_samples: the number of audio samples to generate
+//           hex: pointer to a HexWave initialized with 'hexwave_create'
+//          freq: frequency of the oscillator divided by the sample rate
+
+// private: one complete waveform shape, as stored by hexwave_change
+typedef struct
+{
+   int   reflect;      // stored as passed in; only tested as zero / non-zero
+   float peak_time;    // clamped to 0..1 by hexwave_change
+   float zero_wait;    // clamped to 0..1 by hexwave_change
+   float half_height;  // stored unclamped
+} HexWaveParameters;
+
+struct HexWave
+{
+   float t, prev_dt;                          // phase within the current cycle; |freq| seen by the previous generate call
+   HexWaveParameters current, pending;        // shape being rendered; shape staged by hexwave_change
+   int have_pending;                          // set by hexwave_change, cleared once pending has been adopted
+   float buffer[STB_HEXWAVE_MAX_BLEP_LENGTH]; // overlap samples carried between hexwave_generate_samples calls
+};
+#endif
+
+#ifdef STB_HEXWAVE_IMPLEMENTATION
+
+#ifndef STB_HEXWAVE_NO_ALLOCATION
+#include <stdlib.h> // malloc,free
+#endif
+
+#include <string.h> // memset,memcpy,memmove
+#include <math.h>   // sin,cos,fabs
+
+#define hexwave_clamp(v,a,b)   ((v) < (a) ? (a) : (v) > (b) ? (b) : (v)) // limits v to a..b; arguments may be evaluated more than once
+
+STB_HEXWAVE_DEF void hexwave_change(HexWave *hex, int reflect, float peak_time, float half_height, float zero_wait)
+{
+   HexWaveParameters *next = &hex->pending;   // staged shape; the generator adopts it later
+   next->zero_wait   = hexwave_clamp(zero_wait,0,1);
+   next->half_height = half_height;           // intentionally not clamped
+   next->peak_time   = hexwave_clamp(peak_time,0,1);
+   next->reflect     = reflect;
+   hex->have_pending = 1;  // published last; put a barrier before this store to allow changing from another thread
+}
+
+STB_HEXWAVE_DEF void hexwave_create(HexWave *hex, int reflect, float peak_time, float half_height, float zero_wait)
+{
+   memset(hex, 0, sizeof(HexWave));    // zero everything, including the overlap buffer
+   hex->prev_dt = 0;
+   hex->t = 0;
+   hexwave_change(hex, reflect, peak_time, half_height, zero_wait); // clamp and stage the shape
+   hex->have_pending = 0;              // nothing stays queued ...
+   hex->current = hex->pending;        // ... because the staged shape becomes active right away
+}
+
+static struct
+{
+   int width;       // width of fixup in samples
+   int oversample;  // number of oversampled versions (there's actually one more to allow lerping)
+   float *blep;     // step-correction table: (oversample+1) rows of width floats, set by hexwave_init
+   float *blamp;    // ramp-correction table, same layout as blep
+} hexblep;
+
+static void hex_add_oversampled_bleplike(float *output, float time_since_transition, float scale, float *data)
+{
+   const float *lo, *hi;   // the two adjacent table rows that get blended
+   float frac;             // blend weight applied to (hi - lo)
+   int k, len = hexblep.width;
+
+   int row = (int) (time_since_transition * hexblep.oversample);
+   if (row > hexblep.oversample-1)
+      row = hexblep.oversample-1; // clamp in case the floats overshoot
+
+   lo = data + row*len;
+   hi = lo + len;
+
+   frac = time_since_transition * hexblep.oversample - row;
+   for (k=0; k < len; ++k)
+      output[k] += scale * (lo[k] + (hi[k]-lo[k])*frac);
+}
+
+static void hex_blep (float *output, float time_since_transition, float scale)
+{  // adds hexblep.width samples of the interpolated hexblep.blep table, times scale, into output
+   hex_add_oversampled_bleplike(output, time_since_transition, scale, hexblep.blep);
+}
+
+static void hex_blamp(float *output, float time_since_transition, float scale)
+{  // adds hexblep.width samples of the interpolated hexblep.blamp table, times scale, into output
+   hex_add_oversampled_bleplike(output, time_since_transition, scale, hexblep.blamp);
+}
+
+typedef struct   // one vertex of the piecewise-linear waveform
+{
+   float t,v,s; // time, value, slope (s is the slope of the segment that starts at this vertex)
+} hexvert;
+
+// fills vert[0..8] from hex->current: each half of the waveform needs 4 vertices to represent
+// 3 line segments, plus 1 more (vert[8]) for wraparound; dt is the phase advance per sample
+static void hexwave_generate_linesegs(hexvert vert[9], HexWave *hex, float dt)
+{
+   int j;
+   float min_len = dt / 256.0f; // segments no longer than 1/256 of a sample's phase advance get collapsed below
+
+   vert[0].t = 0;
+   vert[0].v = 0;
+   vert[1].t = hex->current.zero_wait*0.5f; // the wave stays at 0 from vert[0] to here
+   vert[1].v = 0;
+   vert[2].t = 0.5f*hex->current.peak_time + vert[1].t*(1-hex->current.peak_time); // lerp from vert[1].t to 0.5 by peak_time
+   vert[2].v = 1;
+   vert[3].t = 0.5f;
+   vert[3].v = hex->current.half_height;
+
+   if (hex->current.reflect) {
+      for (j=4; j <= 7; ++j) { // second half: first half reversed in time (t -> 1-t) and negated
+         vert[j].t = 1 -  vert[7-j].t;
+         vert[j].v =    - vert[7-j].v;
+      }
+   } else {
+      for (j=4; j <= 7; ++j) { // second half: first half shifted by half a cycle and negated
+         vert[j].t =  0.5f +  vert[j-4].t;
+         vert[j].v =        - vert[j-4].v;
+      }
+   }
+   vert[8].t = 1;
+   vert[8].v = 0;
+
+   for (j=0; j < 8; ++j) {
+      if (vert[j+1].t <= vert[j].t + min_len) {
+          // if change takes place over less than a fraction of a sample treat as discontinuity
+          //
+          // otherwise the slope computation can blow up to arbitrarily large and we
+          // try to generate a huge BLAMP and the result is wrong.
+          //
+          // why does this happen if the math is right? i believe if done perfectly,
+          // the two BLAMPs on either side of the slope would cancel out, but our
+          // BLAMPs have only limited sub-sample precision and limited integration
+          // accuracy. or maybe it's just the math blowing up w/ floating point precision
+          // limits as we try to make x * (1/x) cancel out
+          //
+          // min_len verified artifact-free even near nyquist with only oversample=4
+         vert[j+1].t = vert[j].t; // snap the later vertex onto the earlier one
+      }
+   }
+
+   if (vert[8].t != 1.0f) {
+      // if the above fixup moved the endpoint away from 1.0, move it back,
+      // along with any other vertices that got moved to the same time
+      float t = vert[8].t;
+      for (j=5; j <= 8; ++j)
+         if (vert[j].t == t)
+            vert[j].t = 1.0f;
+   }
+
+   // compute the exact slopes from the final fixed-up positions
+   for (j=0; j < 8; ++j)
+      if (vert[j+1].t == vert[j].t)
+         vert[j].s = 0; // zero-length segment: avoid dividing by zero, callers treat it as a jump
+      else
+         vert[j].s = (vert[j+1].v - vert[j].v) / (vert[j+1].t - vert[j].t);
+
+   // wraparound at end: vert[8] repeats vert[0]'s value and slope at t=1
+   vert[8].t = 1;
+   vert[8].v = vert[0].v;
+   vert[8].s = vert[0].s;
+}
+
+STB_HEXWAVE_DEF void hexwave_generate_samples(float *output, int num_samples, HexWave *hex, float freq)
+{  // writes num_samples samples to output, resuming at phase hex->t; only the magnitude of freq is used
+   hexvert vert[9];
+   int pass,i,j;
+   float t = hex->t;
+   float temp_output[2*STB_HEXWAVE_MAX_BLEP_LENGTH];
+   int buffered_length = sizeof(float)*hexblep.width; // size in BYTES of the overlap kept in hex->buffer
+   float dt = (float) fabs(freq);                      // phase advance per output sample
+   float recip_dt = (dt == 0.0f) ? 0.0f : 1.0f / dt;   // 0 when dt is 0 so no division by zero occurs
+
+   int halfw = hexblep.width/2;
+   // all sample times are biased by halfw to leave room for BLEP/BLAMP to go back in time
+
+   if (num_samples <= 0)
+      return;
+
+   // convert parameters to times and slopes
+   hexwave_generate_linesegs(vert, hex, dt);
+
+   if (hex->prev_dt != dt) {
+      // if frequency changes, add a fixup at the derivative discontinuity starting at now
+      float slope;
+      for (j=1; j < 6; ++j)
+         if (t < vert[j].t)
+            break;
+      slope = vert[j].s;
+      if (slope != 0)
+         hex_blamp(output, 0, (dt - hex->prev_dt)*slope); // NOTE(review): output is cleared by the memset just below, so this fixup appears to be discarded; it also touches hexblep.width floats even if num_samples is smaller -- confirm intent
+      hex->prev_dt = dt;
+   }
+
+   // copy the buffered data from last call and clear the rest of the output array
+   memset(output, 0, sizeof(float)*num_samples);
+   memset(temp_output, 0, 2*hexblep.width*sizeof(float));
+
+   if (num_samples >= hexblep.width) {
+      memcpy(output, hex->buffer, buffered_length);
+   } else {
+      // if the output is shorter than hexblep.width, we do all synthesis to temp_output
+      memcpy(temp_output, hex->buffer, buffered_length);
+   }
+
+   for (pass=0; pass < 2; ++pass) {
+      int i0,i1;
+      float *out;
+
+      // we want to simulate having one buffer that is num_samples + hexblep.width
+      // samples long, without putting that requirement on the user, and without
+      // allocating a temp buffer that's as long as the whole thing. so we use two
+      // overlapping buffers, one the user's buffer and one a fixed-length temp
+      // buffer.
+
+      if (pass == 0) {
+         if (num_samples < hexblep.width)
+            continue;
+         // run as far as we can without overwriting the end of the user's buffer
+         out = output;
+         i0 = 0;
+         i1 = num_samples - hexblep.width; // the remaining hexblep.width samples are produced in pass 1
+      } else {
+         // generate the rest into a temp buffer
+         out = temp_output;
+         i0 = 0;
+         if (num_samples >= hexblep.width)
+            i1 = hexblep.width;
+         else
+            i1 = num_samples;
+      }
+
+      // determine current segment
+      for (j=0; j < 8; ++j)
+         if (t < vert[j+1].t)
+            break;
+
+      i = i0;
+      for(;;) {
+         while (t < vert[j+1].t) {
+            if (i == i1)
+               goto done;
+            out[i+halfw] += vert[j].v + vert[j].s*(t - vert[j].t); // uncorrected line-segment value, stored halfw samples late
+            t += dt;
+            ++i;
+         }
+         // transition from lineseg starting at j to lineseg starting at j+1
+
+         if (vert[j].t == vert[j+1].t) // zero-length segment: a jump in value, corrected with a BLEP
+            hex_blep(out+i, recip_dt*(t-vert[j+1].t), (vert[j+1].v - vert[j].v));
+         hex_blamp(out+i, recip_dt*(t-vert[j+1].t), dt*(vert[j+1].s - vert[j].s)); // change of slope, corrected with a BLAMP
+         ++j;
+
+         if (j == 8) {
+            // change to different waveform if there's a change pending
+            j = 0;
+            t -= 1.0; // t was >= 1.f if j==8
+            if (hex->have_pending) {
+               float prev_s0 = vert[j].s;
+               float prev_v0 = vert[j].v;
+               hex->current = hex->pending;
+               hex->have_pending = 0;
+               hexwave_generate_linesegs(vert, hex, dt);
+               // the following never occurs with this oscillator, but it makes
+               // the code work in more general cases
+               if (vert[j].v != prev_v0)
+                  hex_blep (out+i, recip_dt*t,    (vert[j].v - prev_v0));
+               if (vert[j].s != prev_s0)
+                  hex_blamp(out+i, recip_dt*t, dt*(vert[j].s - prev_s0));
+            }
+         }
+      }
+     done:
+      ;
+   }
+
+   // at this point, we've written output[] and temp_output[]
+   if (num_samples >= hexblep.width) {
+      // the first half of temp[] overlaps the end of output, the second half will be the new start overlap
+      for (i=0; i < hexblep.width; ++i)
+         output[num_samples-hexblep.width + i] += temp_output[i];
+      memcpy(hex->buffer, temp_output+hexblep.width, buffered_length);
+   } else {
+      for (i=0; i < num_samples; ++i)
+         output[i] = temp_output[i];
+      memcpy(hex->buffer, temp_output+num_samples, buffered_length); // what follows the emitted samples becomes the next overlap
+   }
+
+   hex->t = t; // remember the phase so the next call continues from here
+}
+
+STB_HEXWAVE_DEF void hexwave_shutdown(float *user_buffer)
+{  // user_buffer: the same pointer that was passed to hexwave_init (NULL if the library allocated)
+   #ifndef STB_HEXWAVE_NO_ALLOCATION
+   if (user_buffer == 0) {   // hexwave_init malloc'ed the tables only when no buffer was supplied; never free into a caller's buffer
+      free(hexblep.blep);
+      free(hexblep.blamp);
+   }
+   #endif
+}
+
+// user_buffer: NULL (not allowed with STB_HEXWAVE_NO_ALLOCATION) or at least 16*width*(oversample+1) bytes
+STB_HEXWAVE_DEF void hexwave_init(int width, int oversample, float *user_buffer)
+{
+   int halfwidth, half, blep_buffer_count, n, i, j;
+   float *buffers, *step, *ramp, *blep_buffer, *blamp_buffer;
+   double integrate_impulse=0, integrate_step=0;
+
+   if (width > STB_HEXWAVE_MAX_BLEP_LENGTH)   // clamp FIRST: every size/offset below must match the width that gets stored
+      width = STB_HEXWAVE_MAX_BLEP_LENGTH;
+   halfwidth = width/2;                       // sample index at which the naive step/ramp is subtracted
+   half = halfwidth*oversample;               // the same position measured in oversampled steps
+   blep_buffer_count = width*(oversample+1);  // floats per output table: oversample+1 rows of width
+   n = 2*half+1;                              // length of the raw integrated step/ramp arrays
+#ifdef STB_HEXWAVE_NO_ALLOCATION
+   buffers = user_buffer;
+#else
+   buffers = user_buffer ? user_buffer : (float *) malloc(sizeof(float) * n * 2);
+#endif
+   step    = buffers+0*n;
+   ramp    = buffers+1*n;
+
+   if (user_buffer == 0) {
+      #ifndef STB_HEXWAVE_NO_ALLOCATION
+      blep_buffer  = (float *) malloc(sizeof(float)*blep_buffer_count);
+      blamp_buffer = (float *) malloc(sizeof(float)*blep_buffer_count);
+      #endif
+   } else {
+      blep_buffer  = ramp+n;
+      blamp_buffer = blep_buffer + blep_buffer_count;
+   }
+
+   // compute BLEP and BLAMP by integrating windowed sinc
+   for (i=0; i < n; ++i) {
+      for (j=0; j < 16; ++j) {
+         float sinc_t = 3.141592f* (i-half) / oversample;
+         float sinc   = (i==half) ? 1.0f : (float) sin(sinc_t) / (sinc_t);
+         float wt     = 2.0f*3.1415926f * i / (n-1);
+         float window = (float) (0.355768 - 0.487396*cos(wt) + 0.144232*cos(2*wt) - 0.012604*cos(3*wt)); // Nuttall
+         double value       =         window * sinc;
+         integrate_impulse +=         value/16;
+         integrate_step    +=         integrate_impulse/16;
+      }
+      step[i]            = (float) integrate_impulse;
+      ramp[i]            = (float) integrate_step;
+   }
+
+   // renormalize
+   for (i=0; i < n; ++i) {
+      step[i] = step[i] * (float) (1.0       / step[n-1]); // step needs to reach to 1.0
+      ramp[i] = ramp[i] * (float) (halfwidth / ramp[n-1]); // ramp needs to become a slope of 1.0 after oversampling
+   }
+
+   // deinterleave to allow efficient interpolation e.g. w/SIMD
+   for (j=0; j <= oversample; ++j) {
+      for (i=0; i < width; ++i) {
+         blep_buffer [j*width+i] = step[j+i*oversample];
+         blamp_buffer[j*width+i] = ramp[j+i*oversample];
+      }
+   }
+
+   // subtract out the naive waveform; note we can't do this to the raw data
+   // above, because we want the discontinuity to be in a different locations
+   // for j=0 and j=oversample (which exists to provide something to interpolate against)
+   for (j=0; j <= oversample; ++j) {
+      // subtract step
+      for (i=halfwidth; i < width; ++i)
+         blep_buffer [j*width+i] -= 1.0f;
+      // subtract ramp
+      for (i=halfwidth; i < width; ++i)
+         blamp_buffer[j*width+i] -= (j+i*oversample-half)*(1.0f/oversample);
+   }
+
+   hexblep.blep  = blep_buffer;
+   hexblep.blamp = blamp_buffer;
+   hexblep.width = width;
+   hexblep.oversample = oversample;
+
+   #ifndef STB_HEXWAVE_NO_ALLOCATION
+   if (user_buffer == 0)
+      free(buffers); // only the scratch step/ramp arrays; the blep/blamp tables stay allocated
+   #endif
+}
+#endif // STB_HEXWAVE_IMPLEMENTATION
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_image.h b/lib/stb/stb_image.h
new file mode 100644
index 0000000..9eedabe
--- /dev/null
+++ b/lib/stb/stb_image.h
@@ -0,0 +1,7988 @@
+/* stb_image - v2.30 - public domain image loader - http://nothings.org/stb
+                                  no warranty implied; use at your own risk
+
+   Do this:
+      #define STB_IMAGE_IMPLEMENTATION
+   before you include this file in *one* C or C++ file to create the implementation.
+
+   // i.e. it should look like this:
+   #include ...
+   #include ...
+   #include ...
+   #define STB_IMAGE_IMPLEMENTATION
+   #include "stb_image.h"
+
+   You can #define STBI_ASSERT(x) before the #include to avoid using assert.h.
+   And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free
+
+
+   QUICK NOTES:
+      Primarily of interest to game developers and other people who can
+          avoid problematic images and only need the trivial interface
+
+      JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
+      PNG 1/2/4/8/16-bit-per-channel
+
+      TGA (not sure what subset, if a subset)
+      BMP non-1bpp, non-RLE
+      PSD (composited view only, no extra channels, 8/16 bit-per-channel)
+
+      GIF (*comp always reports as 4-channel)
+      HDR (radiance rgbE format)
+      PIC (Softimage PIC)
+      PNM (PPM and PGM binary only)
+
+      Animated GIF still needs a proper API, but here's one way to do it:
+          http://gist.github.com/urraka/685d9a6340b26b830d49
+
+      - decode from memory or through FILE (define STBI_NO_STDIO to remove code)
+      - decode from arbitrary I/O callbacks
+      - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON)
+
+   Full documentation under "DOCUMENTATION" below.
+
+
+LICENSE
+
+  See end of file for license information.
+
+RECENT REVISION HISTORY:
+
+      2.30  (2024-05-31) avoid erroneous gcc warning
+      2.29  (2023-05-xx) optimizations
+      2.28  (2023-01-29) many error fixes, security errors, just tons of stuff
+      2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
+      2.26  (2020-07-13) many minor fixes
+      2.25  (2020-02-02) fix warnings
+      2.24  (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically
+      2.23  (2019-08-11) fix clang static analysis warning
+      2.22  (2019-03-04) gif fixes, fix warnings
+      2.21  (2019-02-25) fix typo in comment
+      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings
+      2.16  (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes
+      2.15  (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-12-04) experimental 16-bit API, only for PNG so far; fixes
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
+                         RGB-format JPEG; remove white matting in PSD;
+                         allocate large structures on the stack;
+                         correct channel count for PNG & BMP
+      2.10  (2016-01-22) avoid warning introduced in 2.09
+      2.09  (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED
+
+   See end of file for full revision history.
+
+
+ ============================    Contributors    =========================
+
+ Image formats                          Extensions, features
+    Sean Barrett (jpeg, png, bmp)          Jetro Lauha (stbi_info)
+    Nicolas Schulz (hdr, psd)              Martin "SpartanJ" Golini (stbi_info)
+    Jonathan Dummer (tga)                  James "moose2000" Brown (iPhone PNG)
+    Jean-Marc Lienher (gif)                Ben "Disch" Wenger (io callbacks)
+    Tom Seddon (pic)                       Omar Cornut (1/2/4-bit PNG)
+    Thatcher Ulrich (psd)                  Nicolas Guillemot (vertical flip)
+    Ken Miller (pgm, ppm)                  Richard Mitton (16-bit PSD)
+    github:urraka (animated gif)           Junggon Kim (PNM comments)
+    Christopher Forseth (animated gif)     Daniel Gibson (16-bit TGA)
+                                           socks-the-fox (16-bit PNG)
+                                           Jeremy Sawicki (handle all ImageNet JPGs)
+ Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
+    Fabian "ryg" Giesen                    Anael Seghezzi (is-16-bit query)
+    Arseny Kapoulkine                      Simon Breuss (16-bit PNM)
+    John-Mark Allen
+    Carmelo J Fdez-Aguera
+
+ Bug & warning fixes
+    Marc LeBlanc            David Woo          Guillaume George     Martins Mozeiko
+    Christpher Lloyd        Jerry Jansson      Joseph Thomson       Blazej Dariusz Roszkowski
+    Phil Jordan                                Dave Moore           Roy Eltham
+    Hayaki Saito            Nathan Reed        Won Chun
+    Luke Graham             Johan Duparc       Nick Verigakis       the Horde3D community
+    Thomas Ruf              Ronny Chevalier                         github:rlyeh
+    Janez Zemva             John Bartholomew   Michal Cichon        github:romigrou
+    Jonathan Blow           Ken Hamada         Tero Hanninen        github:svdijk
+    Eugene Golushkov        Laurent Gomila     Cort Stratton        github:snagar
+    Aruelien Pocheville     Sergio Gonzalez    Thibault Reuille     github:Zelex
+    Cass Everitt            Ryamond Barbiero                        github:grim210
+    Paul Du Bois            Engin Manap        Aldo Culquicondor    github:sammyhw
+    Philipp Wiesemann       Dale Weiler        Oriol Ferrer Mesia   github:phprus
+    Josh Tobin              Neil Bickford      Matthew Gregan       github:poppolopoppo
+    Julian Raschke          Gregory Mullen     Christian Floisand   github:darealshinji
+    Baldur Karlsson         Kevin Schmidt      JR Smith             github:Michaelangel007
+                            Brad Weinberger    Matvey Cherevko      github:mosra
+    Luca Sas                Alexander Veselov  Zack Middleton       [reserved]
+    Ryan C. Gordon          [reserved]                              [reserved]
+                     DO NOT ADD YOUR NAME HERE
+
+                     Jacko Dirks
+
+  To add your name to the credits, pick a random blank space in the middle and fill it.
+  80% of merge conflicts on stb PRs are due to people adding their name at the end
+  of the credits.
+*/
+
+#ifndef STBI_INCLUDE_STB_IMAGE_H
+#define STBI_INCLUDE_STB_IMAGE_H
+
+// DOCUMENTATION
+//
+// Limitations:
+//    - no 12-bit-per-channel JPEG
+//    - no JPEGs with arithmetic coding
+//    - GIF always returns *comp=4
+//
+// Basic usage (see HDR discussion below for HDR usage):
+//    int x,y,n;
+//    unsigned char *data = stbi_load(filename, &x, &y, &n, 0);
+//    // ... process data if not NULL ...
+//    // ... x = width, y = height, n = # 8-bit components per pixel ...
+//    // ... replace '0' with '1'..'4' to force that many components per pixel
+//    // ... but 'n' will always be the number that it would have been if you said 0
+//    stbi_image_free(data);
+//
+// Standard parameters:
+//    int *x                 -- outputs image width in pixels
+//    int *y                 -- outputs image height in pixels
+//    int *channels_in_file  -- outputs # of image components in image file
+//    int desired_channels   -- if non-zero, # of image components requested in result
+//
+// The return value from an image loader is an 'unsigned char *' which points
+// to the pixel data, or NULL on an allocation failure or if the image is
+// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels,
+// with each pixel consisting of N interleaved 8-bit components; the first
+// pixel pointed to is top-left-most in the image. There is no padding between
+// image scanlines or between pixels, regardless of format. The number of
+// components N is 'desired_channels' if desired_channels is non-zero, or
+// *channels_in_file otherwise. If desired_channels is non-zero,
+// *channels_in_file has the number of components that _would_ have been
+// output otherwise. E.g. if you set desired_channels to 4, you will always
+// get RGBA output, but you can check *channels_in_file to see if it's trivially
+// opaque because e.g. there were only 3 channels in the source image.
+//
+// An output image with N components has the following components interleaved
+// in this order in each pixel:
+//
+//     N=#comp     components
+//       1           grey
+//       2           grey, alpha
+//       3           red, green, blue
+//       4           red, green, blue, alpha
+//
+// If image loading fails for any reason, the return value will be NULL,
+// and *x, *y, *channels_in_file will be unchanged. The function
+// stbi_failure_reason() can be queried for an extremely brief, end-user
+// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS
+// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
+// more user-friendly ones.
+//
+// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
+//
+// To query the width, height and component count of an image without having to
+// decode the full file, you can use the stbi_info family of functions:
+//
+//   int x,y,n,ok;
+//   ok = stbi_info(filename, &x, &y, &n);
+//   // returns ok=1 and sets x, y, n if image is a supported format,
+//   // 0 otherwise.
+//
+// Note that stb_image pervasively uses ints in its public API for sizes,
+// including sizes of memory buffers. This is now part of the API and thus
+// hard to change without causing breakage. As a result, the various image
+// loaders all have certain limits on image size; these differ somewhat
+// by format but generally boil down to either just under 2GB or just under
+// 1GB. When the decoded image would be larger than this, stb_image decoding
+// will fail.
+//
+// Additionally, stb_image will reject image files that have any of their
+// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS,
+// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit,
+// the only way to have an image with such dimensions load correctly
+// is for it to have a rather extreme aspect ratio. Either way, the
+// assumption here is that such larger images are likely to be malformed
+// or malicious. If you do need to load an image with individual dimensions
+// larger than that, and it still fits in the overall size limit, you can
+// #define STBI_MAX_DIMENSIONS on your own to be something larger.
+//
+// ===========================================================================
+//
+// UNICODE:
+//
+//   If compiling for Windows and you wish to use Unicode filenames, compile
+//   with
+//       #define STBI_WINDOWS_UTF8
+//   and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert
+//   Windows wchar_t filenames to utf8.
+//
+// ===========================================================================
+//
+// Philosophy
+//
+// stb libraries are designed with the following priorities:
+//
+//    1. easy to use
+//    2. easy to maintain
+//    3. good performance
+//
+// Sometimes I let "good performance" creep up in priority over "easy to maintain",
+// and for best performance I may provide less-easy-to-use APIs that give higher
+// performance, in addition to the easy-to-use ones. Nevertheless, it's important
+// to keep in mind that from the standpoint of you, a client of this library,
+// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all.
+//
+// Some secondary priorities arise directly from the first two, some of which
+// provide more explicit reasons why performance can't be emphasized.
+//
+//    - Portable ("ease of use")
+//    - Small source code footprint ("easy to maintain")
+//    - No dependencies ("ease of use")
+//
+// ===========================================================================
+//
+// I/O callbacks
+//
+// I/O callbacks allow you to read from arbitrary sources, like packaged
+// files or some other source. Data read from callbacks are processed
+// through a small internal buffer (currently 128 bytes) to try to reduce
+// overhead.
+//
+// The three functions you must define are "read" (reads some bytes of data),
+// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end).
+//
+// ===========================================================================
+//
+// SIMD support
+//
+// The JPEG decoder will try to automatically use SIMD kernels on x86 when
+// supported by the compiler. For ARM Neon support, you must explicitly
+// request it.
+//
+// (The old do-it-yourself SIMD API is no longer supported in the current
+// code.)
+//
+// On x86, SSE2 is used automatically: MSVC builds decide with a run-time CPUID
+// test, while GCC-style builds use it whenever SSE2 is enabled at compile time;
+// otherwise the generic C versions are used as a fall-back. On ARM targets,
+// the typical path is separate builds for NEON and non-NEON devices (at least
+// for iOS and Android), so NEON is toggled by a build flag: define STBI_NEON.
+//
+// If for some reason you do not want to use any of SIMD code, or if
+// you have issues compiling it, you can disable it entirely by
+// defining STBI_NO_SIMD.
+//
+// ===========================================================================
+//
+// HDR image support   (disable by defining STBI_NO_HDR)
+//
+// stb_image supports loading HDR images in general, and currently the Radiance
+// .HDR file format specifically. You can still load any file through the existing
+// interface; if you attempt to load an HDR file, it will be automatically remapped
+// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1;
+// both of these constants can be reconfigured through this interface:
+//
+//     stbi_hdr_to_ldr_gamma(2.2f);
+//     stbi_hdr_to_ldr_scale(1.0f);
+//
+// (note, do not use _inverse_ constants; stb_image will invert them
+// appropriately).
+//
+// Additionally, there is a new, parallel interface for loading files as
+// (linear) floats to preserve the full dynamic range:
+//
+//    float *data = stbi_loadf(filename, &x, &y, &n, 0);
+//
+// If you load LDR images through this interface, those images will
+// be promoted to floating point values, run through the inverse of
+// constants corresponding to the above:
+//
+//     stbi_ldr_to_hdr_scale(1.0f);
+//     stbi_ldr_to_hdr_gamma(2.2f);
+//
+// Finally, given a filename (or an open file or memory block--see header
+// file for details) containing image data, you can query for the "most
+// appropriate" interface to use (that is, whether the image is HDR or
+// not), using:
+//
+//     stbi_is_hdr(char *filename);
+//
+// ===========================================================================
+//
+// iPhone PNG support:
+//
+// We optionally support converting iPhone-formatted PNGs (which store
+// premultiplied BGRA) back to RGB, even though they're internally encoded
+// differently. To enable this conversion, call
+// stbi_convert_iphone_png_to_rgb(1).
+//
+// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
+// pixel to remove any premultiplied alpha *only* if the image file explicitly
+// says there's premultiplied data (currently only happens in iPhone images,
+// and only if iPhone convert-to-rgb processing is on).
+//
+// ===========================================================================
+//
+// ADDITIONAL CONFIGURATION
+//
+//  - You can suppress implementation of any of the decoders to reduce
+//    your code footprint by #defining one or more of the following
+//    symbols before creating the implementation.
+//
+//        STBI_NO_JPEG
+//        STBI_NO_PNG
+//        STBI_NO_BMP
+//        STBI_NO_PSD
+//        STBI_NO_TGA
+//        STBI_NO_GIF
+//        STBI_NO_HDR
+//        STBI_NO_PIC
+//        STBI_NO_PNM   (.ppm and .pgm)
+//
+//  - You can request *only* certain decoders and suppress all other ones
+//    (this will be more forward-compatible, as addition of new decoders
+//    doesn't require you to disable them explicitly):
+//
+//        STBI_ONLY_JPEG
+//        STBI_ONLY_PNG
+//        STBI_ONLY_BMP
+//        STBI_ONLY_PSD
+//        STBI_ONLY_TGA
+//        STBI_ONLY_GIF
+//        STBI_ONLY_HDR
+//        STBI_ONLY_PIC
+//        STBI_ONLY_PNM   (.ppm and .pgm)
+//
+//   - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
+//     want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
+//
+//  - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater
+//    than that size (in either width or height) without further processing.
+//    This is to let programs in the wild set an upper bound to prevent
+//    denial-of-service attacks on untrusted data, as one could generate a
+//    valid image of gigantic dimensions and force stb_image to allocate a
+//    huge block of memory and spend disproportionate time decoding it. By
+//    default this is set to (1 << 24), which is 16777216, but that's still
+//    very big.
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif // STBI_NO_STDIO
+
+#define STBI_VERSION 1
+
+enum
+{
+   STBI_default = 0, // only used for desired_channels: 0 keeps the channel count found in the file
+
+   STBI_grey       = 1, // 1 component per pixel: grey
+   STBI_grey_alpha = 2, // 2 components per pixel: grey, alpha
+   STBI_rgb        = 3, // 3 components per pixel: red, green, blue
+   STBI_rgb_alpha  = 4  // 4 components per pixel: red, green, blue, alpha
+};
+
+#include <stdlib.h>
+typedef unsigned char stbi_uc;
+typedef unsigned short stbi_us;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef STBIDEF
+#ifdef STB_IMAGE_STATIC
+#define STBIDEF static
+#else
+#define STBIDEF extern
+#endif
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// PRIMARY API - works on images of any type
+//
+
+//
+// load image by filename, open file, or memory buffer
+//
+
+typedef struct   // I/O hooks for decoding from an arbitrary source; all three functions must be provided
+{
+   int      (*read)  (void *user,char *data,int size);   // fill 'data' with up to 'size' bytes; return the number of bytes actually read
+   void     (*skip)  (void *user,int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if 'n' is negative
+   int      (*eof)   (void *user);                       // return nonzero if we are at end of file/data
+} stbi_io_callbacks;
+
+////////////////////////////////////
+//
+// 8-bits-per-channel interface
+//
+
+STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_uc *stbi_load            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+// for stbi_load_from_file, file pointer is left pointing immediately after image
+#endif
+
+#ifndef STBI_NO_GIF
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
+#endif
+
+#ifdef STBI_WINDOWS_UTF8
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+#endif
+
+////////////////////////////////////
+//
+// 16-bits-per-channel interface
+//
+
+STBIDEF stbi_us *stbi_load_16_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_us *stbi_load_16          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+#endif
+
+////////////////////////////////////
+//
+// float-per-channel interface
+//
+#ifndef STBI_NO_LINEAR
+   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y,  int *channels_in_file, int desired_channels);
+
+   #ifndef STBI_NO_STDIO
+   STBIDEF float *stbi_loadf            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+   #endif
+#endif
+
+#ifndef STBI_NO_HDR
+   STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
+   STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
+#endif // STBI_NO_HDR
+
+#ifndef STBI_NO_LINEAR
+   STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
+   STBIDEF void   stbi_ldr_to_hdr_scale(float scale);
+#endif // STBI_NO_LINEAR
+
+// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
+STBIDEF int    stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
+STBIDEF int    stbi_is_hdr_from_memory(stbi_uc const *buffer, int len);
+#ifndef STBI_NO_STDIO
+STBIDEF int      stbi_is_hdr          (char const *filename);
+STBIDEF int      stbi_is_hdr_from_file(FILE *f);
+#endif // STBI_NO_STDIO
+
+
+// get a VERY brief reason for failure
+// on most compilers (and ALL modern mainstream compilers) this is threadsafe
+STBIDEF const char *stbi_failure_reason  (void);
+
+// free the loaded image -- this is just free()
+STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
+
+// get image dimensions & components without fully decoding
+STBIDEF int      stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp);
+STBIDEF int      stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len);
+STBIDEF int      stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user);
+
+#ifndef STBI_NO_STDIO
+STBIDEF int      stbi_info               (char const *filename,     int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_file     (FILE *f,                  int *x, int *y, int *comp);
+STBIDEF int      stbi_is_16_bit          (char const *filename);
+STBIDEF int      stbi_is_16_bit_from_file(FILE *f);
+#endif
+
+
+
+// for image formats that explicitly notate that they have premultiplied alpha,
+// we just return the colors as stored in the file. set this flag to force
+// unpremultiplication. results are undefined if the unpremultiply overflows.
+STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply);
+
+// indicate whether we should process iphone images back to canonical format,
+// or just pass them through "as-is"
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert);
+
+// flip the image vertically, so the first pixel in the output array is the bottom left
+STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
+
+// as above, but only applies to images loaded on the thread that calls the function
+// this function is only available if your compiler supports thread-local variables;
+// calling it will fail to link if your compiler doesn't
+STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply);
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert);
+STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip);
+
+// ZLIB client - used by PNG, available for other purposes
+
+STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen);
+STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header);
+STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen);
+STBIDEF int   stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
+
+STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen);
+STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+//
+//
+////   end header file   /////////////////////////////////////////////////////
+#endif // STBI_INCLUDE_STB_IMAGE_H
+
+#ifdef STB_IMAGE_IMPLEMENTATION
+
+#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \
+  || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
+  || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
+  || defined(STBI_ONLY_ZLIB)
+   #ifndef STBI_ONLY_JPEG
+   #define STBI_NO_JPEG
+   #endif
+   #ifndef STBI_ONLY_PNG
+   #define STBI_NO_PNG
+   #endif
+   #ifndef STBI_ONLY_BMP
+   #define STBI_NO_BMP
+   #endif
+   #ifndef STBI_ONLY_PSD
+   #define STBI_NO_PSD
+   #endif
+   #ifndef STBI_ONLY_TGA
+   #define STBI_NO_TGA
+   #endif
+   #ifndef STBI_ONLY_GIF
+   #define STBI_NO_GIF
+   #endif
+   #ifndef STBI_ONLY_HDR
+   #define STBI_NO_HDR
+   #endif
+   #ifndef STBI_ONLY_PIC
+   #define STBI_NO_PIC
+   #endif
+   #ifndef STBI_ONLY_PNM
+   #define STBI_NO_PNM
+   #endif
+#endif
+
+#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
+#define STBI_NO_ZLIB
+#endif
+
+
+#include <stdarg.h>
+#include <stddef.h> // ptrdiff_t on osx
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#include <math.h>  // ldexp, pow
+#endif
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif
+
+#ifndef STBI_ASSERT
+#include <assert.h>
+#define STBI_ASSERT(x) assert(x)
+#endif
+
+#ifdef __cplusplus
+#define STBI_EXTERN extern "C"
+#else
+#define STBI_EXTERN extern
+#endif
+
+
+#ifndef _MSC_VER
+   #ifdef __cplusplus
+   #define stbi_inline inline
+   #else
+   #define stbi_inline
+   #endif
+#else
+   #define stbi_inline __forceinline
+#endif
+
+#ifndef STBI_NO_THREAD_LOCALS   // pick a thread-local storage keyword unless the user opted out
+   #if defined(__cplusplus) &&  __cplusplus >= 201103L   // C++11 or later: standard keyword
+      #define STBI_THREAD_LOCAL       thread_local
+   #elif defined(__GNUC__) && __GNUC__ < 5   // GNU-compatible compiler reporting major version < 5: GNU extension
+      #define STBI_THREAD_LOCAL       __thread
+   #elif defined(_MSC_VER)   // MSVC: declspec form
+      #define STBI_THREAD_LOCAL       __declspec(thread)
+   #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)   // C11 with threads support
+      #define STBI_THREAD_LOCAL       _Thread_local
+   #endif
+
+   #ifndef STBI_THREAD_LOCAL   // nothing matched above: last-chance fallback
+      #if defined(__GNUC__)   // any remaining GNU-compatible compiler
+        #define STBI_THREAD_LOCAL       __thread
+      #endif
+   #endif   // if still undefined here, STBI_THREAD_LOCAL is simply left undefined
+#endif
+
+#if defined(_MSC_VER) || defined(__SYMBIAN32__)
+typedef unsigned short stbi__uint16;
+typedef   signed short stbi__int16;
+typedef unsigned int   stbi__uint32;
+typedef   signed int   stbi__int32;
+#else
+#include <stdint.h>
+typedef uint16_t stbi__uint16;
+typedef int16_t  stbi__int16;
+typedef uint32_t stbi__uint32;
+typedef int32_t  stbi__int32;
+#endif
+
+// should produce compiler error if size is wrong
+typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
+
+#ifdef _MSC_VER
+#define STBI_NOTUSED(v)  (void)(v)
+#else
+#define STBI_NOTUSED(v)  (void)sizeof(v)
+#endif
+
+#ifdef _MSC_VER
+#define STBI_HAS_LROTL
+#endif
+
+#ifdef STBI_HAS_LROTL
+   #define stbi_lrot(x,y)  _lrotl(x,y)
+#else
+   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (-(y) & 31)))
+#endif
+
+#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
+// ok
+#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
+// ok
+#else
+#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
+#endif
+
+#ifndef STBI_MALLOC
+#define STBI_MALLOC(sz)           malloc(sz)
+#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
+#define STBI_FREE(p)              free(p)
+#endif
+
+#ifndef STBI_REALLOC_SIZED
+#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
+#endif
+
+// x86/x64 detection
+#if defined(__x86_64__) || defined(_M_X64)
+#define STBI__X64_TARGET
+#elif defined(__i386) || defined(_M_IX86)
+#define STBI__X86_TARGET
+#endif
+
+#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
+// gcc doesn't support sse2 intrinsics unless you compile with -msse2,
+// which in turn means it gets to use SSE2 everywhere. This is unfortunate,
+// but previous attempts to provide the SSE2 functions with runtime
+// detection caused numerous issues. The way architecture extensions are
+// exposed in GCC/Clang is, sadly, not really suited for one-file libs.
+// New behavior: if compiled with -msse2, we use SSE2 without any
+// detection; if not, we don't use it at all.
+#define STBI_NO_SIMD
+#endif
+
+#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
+// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
+//
+// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
+// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
+// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
+// simultaneously enabling "-mstackrealign".
+//
+// See https://github.com/nothings/stb/issues/81 for more information.
+//
+// So default to no SSE2 on 32-bit MinGW. If you've read this far and added
+// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
+#define STBI_NO_SIMD
+#endif
+
+#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
+#define STBI_SSE2
+#include <emmintrin.h>
+
+#ifdef _MSC_VER
+
+#if _MSC_VER >= 1400  // not VC6
+#include <intrin.h> // __cpuid
+static int stbi__cpuid3(void)   // MSVC with _MSC_VER >= 1400: CPUID query through the compiler intrinsic
+{
+   int info[4];       // receives the four result words of the query
+   __cpuid(info,1);   // issue CPUID with function id 1
+   return info[3];    // fourth result word; the inline-asm variant returns EDX for the same query
+}
+#else
+static int stbi__cpuid3(void)   // older MSVC (_MSC_VER < 1400): same CPUID query via inline assembly
+{
+   int res;   // receives EDX after CPUID has been executed with EAX = 1
+   __asm {
+      mov  eax,1
+      cpuid
+      mov  res,edx
+   }
+   return res;   // callers test individual bits of this value
+}
+#endif
+
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+
+#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
+static int stbi__sse2_available(void)
+{
+   // MSVC path: report 1 when bit 26 of the stbi__cpuid3() result is set, 0 otherwise
+   return ((stbi__cpuid3() >> 26) & 1) != 0;
+}
+#endif
+
+#else // assume GCC-style if not VC++
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+
+#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
+static int stbi__sse2_available(void)
+{
+   // GCC-style path: this function is only compiled when STBI_SSE2 is defined.
+   // On GCC/Clang that means SSE2 code generation is enabled (e.g. -msse2), so
+   // the compiler may already emit SSE2 anywhere and no run-time probe is done.
+   return 1;
+}
+#endif
+
+#endif
+#endif
+
+// ARM NEON
+#if defined(STBI_NO_SIMD) && defined(STBI_NEON)
+#undef STBI_NEON
+#endif
+
+#ifdef STBI_NEON
+#include <arm_neon.h>
+#ifdef _MSC_VER
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+#else
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+#endif
+#endif
+
+#ifndef STBI_SIMD_ALIGN
+#define STBI_SIMD_ALIGN(type, name) type name
+#endif
+
+#ifndef STBI_MAX_DIMENSIONS
+#define STBI_MAX_DIMENSIONS (1 << 24)
+#endif
+
+///////////////////////////////////////////////
+//
+//  stbi__context struct and start_xxx functions
+
+// stbi__context structure is our basic context used by all images, so it
+// contains all the IO context, plus some basic image information
+typedef struct
+{
+   stbi__uint32 img_x, img_y;   // image dimensions -- presumably width and height in pixels; filled in elsewhere (TODO confirm)
+   int img_n, img_out_n;        // component counts -- presumably in-file vs. output; NOTE(review): confirm against the decoders
+
+   stbi_io_callbacks io;        // by-value copy of the caller's callbacks (callback mode); io.read is NULL in memory mode
+   void *io_user_data;          // opaque pointer supplied together with the callbacks
+
+   int read_from_callbacks;     // 1 when the context was started on callbacks, 0 when started on a memory block
+   int buflen;                  // set to sizeof(buffer_start) when starting in callback mode
+   stbi_uc buffer_start[128];   // internal buffer that img_buffer initially points at in callback mode
+   int callback_already_read;   // reset to 0 by both start functions; NOTE(review): exact meaning not visible here
+
+   stbi_uc *img_buffer, *img_buffer_end;                     // start/end of the data currently available (presumably img_buffer is the read cursor)
+   stbi_uc *img_buffer_original, *img_buffer_original_end;   // initial start/end as recorded by the start functions
+} stbi__context;
+
+
+static void stbi__refill_buffer(stbi__context *s);
+
+// initialize a memory-decode context
+// Prepare a context that reads directly from the 'len' bytes at 'buffer';
+// no IO callbacks are installed.
+static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
+{
+   stbi_uc *first = (stbi_uc *) buffer;
+   stbi_uc *last  = (stbi_uc *) buffer+len;
+
+   s->io.read = NULL;
+   s->read_from_callbacks = 0;
+   s->callback_already_read = 0;
+
+   s->img_buffer_original = first;
+   s->img_buffer          = first;
+   s->img_buffer_original_end = last;
+   s->img_buffer_end          = last;
+}
+
+// initialize a callback-based context
+// Prepare a context that pulls its data through the callbacks in 'c'
+// (copied into the context) with 'user' as their opaque argument, then
+// perform the first buffer fill.
+static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
+{
+   s->io = *c;
+   s->io_user_data = user;
+   s->read_from_callbacks = 1;
+   s->callback_already_read = 0;
+   s->buflen = sizeof(s->buffer_start);
+   s->img_buffer_original = s->buffer_start;
+   s->img_buffer = s->img_buffer_original;
+   // the first fill establishes img_buffer_end, which is then remembered
+   // as the rewind limit
+   stbi__refill_buffer(s);
+   s->img_buffer_original_end = s->img_buffer_end;
+}
+
+#ifndef STBI_NO_STDIO
+
+// stdio 'read' callback: reads up to 'size' bytes from the FILE* in 'user'
+// into 'data' and returns how many bytes fread delivered.
+static int stbi__stdio_read(void *user, char *data, int size)
+{
+   FILE *fp = (FILE *) user;
+   size_t got = fread(data, 1, size, fp);
+   return (int) got;
+}
+
+// stdio 'skip' callback: moves the FILE* in 'user' forward by 'n' bytes.
+static void stbi__stdio_skip(void *user, int n)
+{
+   FILE *fp = (FILE *) user;
+   int probe;
+
+   fseek(fp, n, SEEK_CUR);
+   // a byte has to be read so that feof() reflects the new position
+   probe = fgetc(fp);
+   if (probe == EOF)
+      return;
+   // a real byte was read: hand it back to the stream
+   ungetc(probe, fp);
+}
+
+// stdio 'eof' callback: returns 1 when the FILE* in 'user' has hit end of
+// file or is in an error state, 0 otherwise.
+static int stbi__stdio_eof(void *user)
+{
+   FILE *fp = (FILE *) user;
+   if (feof(fp))
+      return 1;
+   return ferror(fp) != 0;
+}
+
+// Callback table that routes the read/skip/eof operations to the stdio
+// helpers above; used by stbi__start_file.
+static stbi_io_callbacks stbi__stdio_callbacks =
+{
+   stbi__stdio_read,
+   stbi__stdio_skip,
+   stbi__stdio_eof,
+};
+
+// Initialize a context that reads from the stdio stream 'f' by installing
+// the stdio callback table with 'f' as the callback user data.
+static void stbi__start_file(stbi__context *s, FILE *f)
+{
+   stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f);
+}
+
+//static void stop_file(stbi__context *s) { }
+
+#endif // !STBI_NO_STDIO
+
+// Reset the read cursor and end pointer to the values recorded when the
+// context was started.
+static void stbi__rewind(stbi__context *s)
+{
+   // A true rewind would go back to the start of the stream. Only the
+   // initial buffer is restored here, which is enough because rewind is
+   // only used after a format 'test', and those never look at more than
+   // 92 bytes.
+   s->img_buffer_end = s->img_buffer_original_end;
+   s->img_buffer     = s->img_buffer_original;
+}
+
+// Channel orderings that can be recorded in stbi__result_info.channel_order.
+// stbi__load_main always sets STBI_ORDER_RGB.
+enum
+{
+   STBI_ORDER_RGB,
+   STBI_ORDER_BGR
+};
+
+// Describes the pixel data returned by a loader; filled in by
+// stbi__load_main and the format-specific loaders.
+typedef struct
+{
+   int bits_per_channel;   // 8 or 16 (asserted by the post-processing functions)
+   int num_channels;       // initialized to 0 by stbi__load_main
+   int channel_order;      // one of the STBI_ORDER_* values
+} stbi__result_info;
+
+#ifndef STBI_NO_JPEG
+static int      stbi__jpeg_test(stbi__context *s);
+static void    *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PNG
+static int      stbi__png_test(stbi__context *s);
+static void    *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__png_is16(stbi__context *s);
+#endif
+
+#ifndef STBI_NO_BMP
+static int      stbi__bmp_test(stbi__context *s);
+static void    *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_TGA
+static int      stbi__tga_test(stbi__context *s);
+static void    *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PSD
+static int      stbi__psd_test(stbi__context *s);
+static void    *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc);
+static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__psd_is16(stbi__context *s);
+#endif
+
+#ifndef STBI_NO_HDR
+static int      stbi__hdr_test(stbi__context *s);
+static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PIC
+static int      stbi__pic_test(stbi__context *s);
+static void    *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_GIF
+static int      stbi__gif_test(stbi__context *s);
+static void    *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static void    *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
+static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PNM
+static int      stbi__pnm_test(stbi__context *s);
+static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__pnm_is16(stbi__context *s);
+#endif
+
+// Most recent failure message recorded by stbi__err; declared with
+// STBI_THREAD_LOCAL when that macro is defined.
+static
+#ifdef STBI_THREAD_LOCAL
+STBI_THREAD_LOCAL
+#endif
+const char *stbi__g_failure_reason;
+
+// Returns the message stored by the most recent stbi__err call (the
+// pointer is whatever was last assigned to stbi__g_failure_reason).
+STBIDEF const char *stbi_failure_reason(void)
+{
+   return stbi__g_failure_reason;
+}
+
+#ifndef STBI_NO_FAILURE_STRINGS
+// Records 'str' as the current failure reason and returns 0, so callers can
+// write "return stbi__err(...)" to report an error and fail in one step.
+static int stbi__err(const char *str)
+{
+   stbi__g_failure_reason = str;
+   return 0;
+}
+#endif
+
+// Allocates 'size' bytes through the STBI_MALLOC macro.
+static void *stbi__malloc(size_t size)
+{
+    return STBI_MALLOC(size);
+}
+
+// stb_image uses ints pervasively, including for offset calculations.
+// therefore the largest decoded image size we can support with the
+// current code, even on 64-bit targets, is INT_MAX. this is not a
+// significant limitation for the intended use case.
+//
+// we do, however, need to make sure our size calculations don't
+// overflow. hence a few helper functions for size calculations that
+// multiply integers together, making sure that they're non-negative
+// and no overflow occurs.
+
+// return 1 if the sum is valid, 0 on overflow.
+// negative terms are considered invalid.
+static int stbi__addsizes_valid(int a, int b)
+{
+   int headroom;
+   if (b < 0) return 0;
+   // Here 0 <= b <= INT_MAX, so INT_MAX - b is itself within
+   // [0, INT_MAX] and cannot overflow. Comparing a against that headroom
+   // is equivalent to testing "a + b <= INT_MAX" without ever forming
+   // the (possibly overflowing) sum.
+   headroom = INT_MAX - b;
+   return a <= headroom;
+}
+
+// returns 1 if the product is valid, 0 on overflow.
+// negative factors are considered invalid.
+static int stbi__mul2sizes_valid(int a, int b)
+{
+   if (a < 0) return 0;
+   if (b < 0) return 0;
+   // multiplying by zero can never overflow; otherwise compare against the
+   // quotient, which is a portable way to test a*b <= INT_MAX
+   return (b == 0) ? 1 : (a <= INT_MAX/b);
+}
+
+#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
+// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
+static int stbi__mad2sizes_valid(int a, int b, int add)
+{
+   // the product must be known-safe before it is formed for the addition check
+   if (!stbi__mul2sizes_valid(a, b))
+      return 0;
+   return stbi__addsizes_valid(a*b, add);
+}
+#endif
+
+// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
+static int stbi__mad3sizes_valid(int a, int b, int c, int add)
+{
+   // each partial product is validated before it is used in the next step
+   if (!stbi__mul2sizes_valid(a, b))
+      return 0;
+   if (!stbi__mul2sizes_valid(a*b, c))
+      return 0;
+   return stbi__addsizes_valid(a*b*c, add);
+}
+
+// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
+static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
+{
+   // each partial product is validated before it is used in the next step
+   if (!stbi__mul2sizes_valid(a, b))
+      return 0;
+   if (!stbi__mul2sizes_valid(a*b, c))
+      return 0;
+   if (!stbi__mul2sizes_valid(a*b*c, d))
+      return 0;
+   return stbi__addsizes_valid(a*b*c*d, add);
+}
+#endif
+
+#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
+// mallocs with size overflow checking
+// Allocates a*b + add bytes, or returns NULL if that size is invalid
+// (negative term or overflow).
+static void *stbi__malloc_mad2(int a, int b, int add)
+{
+   return stbi__mad2sizes_valid(a, b, add) ? stbi__malloc(a*b + add) : NULL;
+}
+#endif
+
+// Allocates a*b*c + add bytes, or returns NULL if that size is invalid
+// (negative term or overflow).
+static void *stbi__malloc_mad3(int a, int b, int c, int add)
+{
+   return stbi__mad3sizes_valid(a, b, c, add) ? stbi__malloc(a*b*c + add) : NULL;
+}
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
+// Allocates a*b*c*d + add bytes, or returns NULL if that size is invalid
+// (negative term or overflow).
+static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
+{
+   return stbi__mad4sizes_valid(a, b, c, d, add) ? stbi__malloc(a*b*c*d + add) : NULL;
+}
+#endif
+
+// returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 inclusive), 0 on overflow.
+static int stbi__addints_valid(int a, int b)
+{
+   int a_nonneg = (a >= 0);
+   int b_nonneg = (b >= 0);
+
+   // operands of opposite sign can never overflow when added
+   if (a_nonneg != b_nonneg) return 1;
+   // both negative: a + b >= INT_MIN, rewritten so nothing overflows
+   // (INT_MIN - b is safe because b < 0)
+   if (!a_nonneg) return a >= INT_MIN - b;
+   // both non-negative: a + b <= INT_MAX, rewritten the same way
+   return a <= INT_MAX - b;
+}
+
+// returns 1 if the product of two ints fits in a signed short, 0 on overflow.
+// Returns 1 if a*b lies within [SHRT_MIN, SHRT_MAX], 0 otherwise. The
+// product itself is never formed; every case compares 'a' against a
+// quotient instead. Integer division truncates toward zero, which rounds
+// each bound in the safe direction.
+// Assumes int is wider than short, so SHRT_MIN / -1 is representable.
+static int stbi__mul2shorts_valid(int a, int b)
+{
+   if (a == 0 || b == 0) return 1; // product is 0
+   if ((a > 0) == (b > 0)) {
+      // product is positive: need a*b <= SHRT_MAX
+      if (b > 0) return a <= SHRT_MAX / b;
+      return a >= SHRT_MAX / b;    // dividing by a negative b flips the comparison
+   }
+   // product is negative: need a*b >= SHRT_MIN
+   if (b < 0) return a <= SHRT_MIN / b; // a > 0, b < 0
+   return a >= SHRT_MIN / b;            // a < 0, b > 0
+}
+
+// stbi__err - error
+// stbi__errpf - error returning pointer to float
+// stbi__errpuc - error returning pointer to unsigned char
+
+#ifdef STBI_NO_FAILURE_STRINGS
+   #define stbi__err(x,y)  0
+#elif defined(STBI_FAILURE_USERMSG)
+   #define stbi__err(x,y)  stbi__err(y)
+#else
+   #define stbi__err(x,y)  stbi__err(x)
+#endif
+
+#define stbi__errpf(x,y)   ((float *)(size_t) (stbi__err(x,y)?NULL:NULL))
+#define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL))
+
+// Releases a pixel buffer by passing it to the STBI_FREE macro.
+STBIDEF void stbi_image_free(void *retval_from_stbi_load)
+{
+   STBI_FREE(retval_from_stbi_load);
+}
+
+#ifndef STBI_NO_LINEAR
+static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
+#endif
+
+#ifndef STBI_NO_HDR
+static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp);
+#endif
+
+static int stbi__vertically_flip_on_load_global = 0;
+
+// Sets the global flag that makes the post-processing functions flip
+// loaded images vertically (nonzero = flip).
+STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
+{
+   stbi__vertically_flip_on_load_global = flag_true_if_should_flip;
+}
+
+#ifndef STBI_THREAD_LOCAL
+#define stbi__vertically_flip_on_load  stbi__vertically_flip_on_load_global
+#else
+static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set;
+
+// Sets the thread-local flip flag and marks it as set, so that the
+// stbi__vertically_flip_on_load macro uses it in preference to the global
+// flag from then on.
+STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip)
+{
+   stbi__vertically_flip_on_load_local = flag_true_if_should_flip;
+   stbi__vertically_flip_on_load_set = 1;
+}
+
+#define stbi__vertically_flip_on_load  (stbi__vertically_flip_on_load_set       \
+                                         ? stbi__vertically_flip_on_load_local  \
+                                         : stbi__vertically_flip_on_load_global)
+#endif // STBI_THREAD_LOCAL
+
+// Tries each compiled-in format in turn and returns the pixels produced by
+// the first loader whose test accepts the stream. 'ri' is reset to the
+// defaults (8 bits per channel, RGB order) before any loader runs. 'bpc'
+// is only forwarded to the PSD loader. Returns NULL with the failure
+// reason set to "unknown image type" when no test matches.
+static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
+{
+   memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
+   ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
+   ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
+   ri->num_channels = 0;
+
+   // test the formats with a very explicit header first (at least a FOURCC
+   // or distinctive magic number first)
+   #ifndef STBI_NO_PNG
+   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_BMP
+   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_GIF
+   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc);
+   #else
+   STBI_NOTUSED(bpc);
+   #endif
+   #ifndef STBI_NO_PIC
+   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
+   #endif
+
+   // then the formats that can end up attempting to load with just 1 or 2
+   // bytes matching expectations; these are prone to false positives, so
+   // try them later
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
+   #endif
+
+   #ifndef STBI_NO_HDR
+   // HDR data is loaded as floats and then converted down to LDR bytes
+   if (stbi__hdr_test(s)) {
+      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri);
+      return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
+   }
+   #endif
+
+   #ifndef STBI_NO_TGA
+   // test tga last because it's a crappy test!
+   if (stbi__tga_test(s))
+      return stbi__tga_load(s,x,y,comp,req_comp, ri);
+   #endif
+
+   return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
+}
+
+// Converts a 16-bit-per-channel image of w*h*channels samples into a newly
+// allocated 8-bit image by keeping the high byte of every sample.
+// Takes ownership of 'orig': it is freed on success and on failure alike.
+// Returns NULL (failure reason "outofmem") if the allocation fails.
+static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels)
+{
+   int i;
+   int img_len = w * h * channels;
+   stbi_uc *reduced;
+
+   reduced = (stbi_uc *) stbi__malloc(img_len);
+   if (reduced == NULL) {
+      // callers overwrite their only pointer to 'orig' with our return
+      // value, so it has to be released here or it leaks
+      STBI_FREE(orig);
+      return stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (i = 0; i < img_len; ++i)
+      reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
+
+   STBI_FREE(orig);
+   return reduced;
+}
+
+// Converts an 8-bit-per-channel image of w*h*channels samples into a newly
+// allocated 16-bit image by replicating each byte into both halves of the
+// 16-bit sample.
+// Takes ownership of 'orig': it is freed on success and on failure alike.
+// Returns NULL (failure reason "outofmem") if the allocation fails.
+static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels)
+{
+   int i;
+   int img_len = w * h * channels;
+   stbi__uint16 *enlarged;
+
+   enlarged = (stbi__uint16 *) stbi__malloc(img_len*2);
+   if (enlarged == NULL) {
+      // callers overwrite their only pointer to 'orig' with our return
+      // value, so it has to be released here or it leaks
+      STBI_FREE(orig);
+      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (i = 0; i < img_len; ++i)
+      enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
+
+   STBI_FREE(orig);
+   return enlarged;
+}
+
+// Flips an image of h rows, each w*bytes_per_pixel bytes long, upside down
+// in place by swapping row k with row h-1-k through a fixed-size scratch
+// buffer.
+static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel)
+{
+   stbi_uc scratch[2048];
+   stbi_uc *base = (stbi_uc *)image;
+   size_t stride = (size_t)w * bytes_per_pixel;
+   int half = h>>1;   // the middle row of an odd-height image stays put
+   int top;
+
+   for (top = 0; top < half; top++) {
+      stbi_uc *upper = base + top*stride;
+      stbi_uc *lower = base + (h - top - 1)*stride;
+      size_t remaining;
+
+      // exchange the two rows one scratch-sized chunk at a time
+      for (remaining = stride; remaining != 0; ) {
+         size_t chunk = remaining;
+         if (chunk > sizeof(scratch))
+            chunk = sizeof(scratch);
+         memcpy(scratch, upper, chunk);
+         memcpy(upper, lower, chunk);
+         memcpy(lower, scratch, chunk);
+         upper += chunk;
+         lower += chunk;
+         remaining -= chunk;
+      }
+   }
+}
+
+#ifndef STBI_NO_GIF
+// Vertically flips each of the 'z' consecutive w-by-h slices stored back
+// to back in 'image'.
+static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel)
+{
+   int slice;
+   // computed in size_t (as stbi__vertical_flip does for its row size) so
+   // the per-slice stride cannot overflow int for large frames
+   size_t slice_size = (size_t)w * h * bytes_per_pixel;
+
+   stbi_uc *bytes = (stbi_uc *)image;
+   for (slice = 0; slice < z; ++slice) {
+      stbi__vertical_flip(bytes, w, h, bytes_per_pixel);
+      bytes += slice_size;
+   }
+}
+#endif
+
+// Loads an image via stbi__load_main and post-processes it into 8 bits per
+// channel: 16-bit results are narrowed, and the image is flipped vertically
+// when the flip-on-load flag is set. Returns NULL if loading or the depth
+// conversion fails.
+static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
+
+   if (result == NULL)
+      return NULL;
+
+   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
+   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
+
+   if (ri.bits_per_channel != 8) {
+      result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
+      // the conversion allocates; on failure there is nothing to flip and
+      // the failure reason has already been recorded
+      if (result == NULL)
+         return NULL;
+      ri.bits_per_channel = 8;
+   }
+
+   // @TODO: move stbi__convert_format to here
+
+   if (stbi__vertically_flip_on_load) {
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
+   }
+
+   return (unsigned char *) result;
+}
+
+// Loads an image via stbi__load_main and post-processes it into 16 bits per
+// channel: 8-bit results are widened, and the image is flipped vertically
+// when the flip-on-load flag is set. Returns NULL if loading or the depth
+// conversion fails.
+static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
+
+   if (result == NULL)
+      return NULL;
+
+   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
+   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
+
+   if (ri.bits_per_channel != 16) {
+      result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
+      // the conversion allocates; on failure there is nothing to flip and
+      // the failure reason has already been recorded
+      if (result == NULL)
+         return NULL;
+      ri.bits_per_channel = 16;
+   }
+
+   // @TODO: move stbi__convert_format16 to here
+   // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
+
+   if (stbi__vertically_flip_on_load) {
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
+   }
+
+   return (stbi__uint16 *) result;
+}
+
+#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR)
+// Flips a float image vertically when the flip-on-load flag is set; does
+// nothing for a NULL image.
+static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
+{
+   int channels;
+
+   if (result == NULL)
+      return;
+   if (!(stbi__vertically_flip_on_load))
+      return;
+
+   channels = req_comp ? req_comp : *comp;
+   stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
+}
+#endif
+
+#ifndef STBI_NO_STDIO
+
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
+STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
+STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
+#endif
+
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
+// Converts the NUL-terminated wide string 'input' to UTF-8 (code page
+// 65001) in 'buffer', which holds 'bufferlen' bytes, and returns the
+// WideCharToMultiByte result unchanged.
+STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
+{
+	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
+}
+#endif
+
+// Opens 'filename' with the fopen-style 'mode' and returns the stream, or
+// 0 on failure. One of three variants is selected at compile time:
+//   - Windows with STBI_WINDOWS_UTF8: both strings are treated as UTF-8,
+//     widened, and opened with the wide-character API;
+//   - MSVC 1400 or newer: fopen_s;
+//   - everything else: plain fopen.
+static FILE *stbi__fopen(char const *filename, char const *mode)
+{
+   FILE *f;
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
+   wchar_t wMode[64];
+   wchar_t wFilename[1024];
+	// a zero return from the conversion is treated as failure
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
+      return 0;
+
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
+      return 0;
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+	if (0 != _wfopen_s(&f, wFilename, wMode))
+		f = 0;
+#else
+   f = _wfopen(wFilename, wMode);
+#endif
+
+#elif defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != fopen_s(&f, filename, mode))
+      f=0;
+#else
+   f = fopen(filename, mode);
+#endif
+   return f;
+}
+
+
+// Opens 'filename' for binary reading, decodes it as an 8-bit image with
+// stbi_load_from_file, and closes the file again. Returns NULL with the
+// failure reason "can't fopen" if the file cannot be opened.
+STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char *pixels;
+   FILE *fp = stbi__fopen(filename, "rb");
+
+   if (fp == NULL)
+      return stbi__errpuc("can't fopen", "Unable to open file");
+
+   pixels = stbi_load_from_file(fp,x,y,comp,req_comp);
+   fclose(fp);
+   return pixels;
+}
+
+// Decodes an 8-bit image from the open stream 'f'. After a successful
+// load the stream is moved back by the number of bytes that were buffered
+// but not consumed.
+STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   unsigned char *pixels;
+
+   stbi__start_file(&ctx,f);
+   pixels = stbi__load_and_postprocess_8bit(&ctx,x,y,comp,req_comp);
+   if (pixels != NULL) {
+      // 'unget' whatever is still sitting unread in the IO buffer
+      int unread = (int) (ctx.img_buffer_end - ctx.img_buffer);
+      fseek(f, - unread, SEEK_CUR);
+   }
+   return pixels;
+}
+
+// Decodes a 16-bit image from the open stream 'f'. After a successful
+// load the stream is moved back by the number of bytes that were buffered
+// but not consumed.
+STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   stbi__uint16 *pixels;
+
+   stbi__start_file(&ctx,f);
+   pixels = stbi__load_and_postprocess_16bit(&ctx,x,y,comp,req_comp);
+   if (pixels != NULL) {
+      // 'unget' whatever is still sitting unread in the IO buffer
+      int unread = (int) (ctx.img_buffer_end - ctx.img_buffer);
+      fseek(f, - unread, SEEK_CUR);
+   }
+   return pixels;
+}
+
+// Opens 'filename' for binary reading, decodes it as a 16-bit image with
+// stbi_load_from_file_16, and closes the file again. Returns NULL with the
+// failure reason "can't fopen" if the file cannot be opened.
+STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__uint16 *pixels;
+   FILE *fp = stbi__fopen(filename, "rb");
+
+   if (fp == NULL)
+      return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file");
+
+   pixels = stbi_load_from_file_16(fp,x,y,comp,req_comp);
+   fclose(fp);
+   return pixels;
+}
+
+
+#endif //!STBI_NO_STDIO
+
+// Decodes a 16-bit-per-channel image from the 'len' bytes at 'buffer'.
+STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
+}
+
+// Decodes a 16-bit-per-channel image from data supplied by the callbacks
+// in 'clbk', which are invoked with 'user' as their opaque argument.
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
+   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
+}
+
+// Decodes an 8-bit-per-channel image from the 'len' bytes at 'buffer'.
+STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
+}
+
+// Decodes an 8-bit-per-channel image from data supplied by the callbacks
+// in 'clbk', which are invoked with 'user' as their opaque argument.
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
+}
+
+#ifndef STBI_NO_GIF
+// Decodes every frame of a GIF held in the 'len' bytes at 'buffer' via
+// stbi__load_gif_main, then flips each frame vertically when the
+// flip-on-load flag is set. Returns NULL if loading fails.
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{
+   unsigned char *result;
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+
+   result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
+   // a failed load leaves nothing to flip (and the output dimensions may
+   // not have been written), so only flip a real result
+   if (result != NULL && stbi__vertically_flip_on_load) {
+      // pixel size follows the same convention as the other post-processing
+      // functions: the requested channel count wins when one was given
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip_slices( result, *x, *y, *z, channels );
+   }
+
+   return result;
+}
+#endif
+
+#ifndef STBI_NO_LINEAR
+// Loads an image as floats. When HDR support is compiled in and the stream
+// passes the HDR test, the HDR loader's output is returned directly (after
+// the optional vertical flip). Otherwise the image is loaded as 8-bit and
+// converted with stbi__ldr_to_hdr. Returns NULL with the failure reason
+// "unknown image type" if the 8-bit load fails.
+static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   unsigned char *data;
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_test(s)) {
+      stbi__result_info ri;
+      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri);
+      if (hdr_data)
+         stbi__float_postprocess(hdr_data,x,y,comp,req_comp);
+      return hdr_data;
+   }
+   #endif
+   data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
+   if (data)
+      return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
+   // NOTE(review): this replaces whatever failure reason the 8-bit loader recorded
+   return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
+}
+
+// Loads a float image from the 'len' bytes at 'buffer'.
+STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
+}
+
+// Loads a float image from data supplied by the callbacks in 'clbk', which
+// are invoked with 'user' as their opaque argument.
+STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
+}
+
+#ifndef STBI_NO_STDIO
+// Opens 'filename' for binary reading, loads it as a float image with
+// stbi_loadf_from_file, and closes the file again. Returns NULL with the
+// failure reason "can't fopen" if the file cannot be opened.
+STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   float *pixels;
+   FILE *fp = stbi__fopen(filename, "rb");
+
+   if (fp == NULL)
+      return stbi__errpf("can't fopen", "Unable to open file");
+
+   pixels = stbi_loadf_from_file(fp,x,y,comp,req_comp);
+   fclose(fp);
+   return pixels;
+}
+
+// Loads a float image from the open stream 'f'.
+STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context s;
+   stbi__start_file(&s,f);
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
+}
+#endif // !STBI_NO_STDIO
+
+#endif // !STBI_NO_LINEAR
+
+// these is-hdr-or-not functions are defined regardless of whether
+// STBI_NO_HDR is defined, for API simplicity; if STBI_NO_HDR is defined,
+// they always report false!
+
+// Returns the result of the HDR format test on the 'len' bytes at
+// 'buffer'; always returns 0 when STBI_NO_HDR is defined.
+STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len)
+{
+   #ifndef STBI_NO_HDR
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__hdr_test(&s);
+   #else
+   STBI_NOTUSED(buffer);
+   STBI_NOTUSED(len);
+   return 0;
+   #endif
+}
+
+#ifndef STBI_NO_STDIO
+// Opens 'filename' and returns the result of stbi_is_hdr_from_file on it;
+// returns 0 if the file cannot be opened.
+STBIDEF int      stbi_is_hdr          (char const *filename)
+{
+   int is_hdr;
+   FILE *fp = stbi__fopen(filename, "rb");
+
+   if (fp == NULL)
+      return 0;
+
+   is_hdr = stbi_is_hdr_from_file(fp);
+   fclose(fp);
+   return is_hdr;
+}
+
+// Returns the result of the HDR format test on the open stream 'f' and
+// restores the stream position afterwards; always returns 0 when
+// STBI_NO_HDR is defined.
+STBIDEF int stbi_is_hdr_from_file(FILE *f)
+{
+   #ifndef STBI_NO_HDR
+   long pos = ftell(f);   // remembered so the probe leaves the stream where it was
+   int res;
+   stbi__context s;
+   stbi__start_file(&s,f);
+   res = stbi__hdr_test(&s);
+   fseek(f, pos, SEEK_SET);
+   return res;
+   #else
+   STBI_NOTUSED(f);
+   return 0;
+   #endif
+}
+#endif // !STBI_NO_STDIO
+
+// Returns the result of the HDR format test on data supplied by the
+// callbacks in 'clbk'; always returns 0 when STBI_NO_HDR is defined.
+STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user)
+{
+   #ifndef STBI_NO_HDR
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
+   return stbi__hdr_test(&s);
+   #else
+   STBI_NOTUSED(clbk);
+   STBI_NOTUSED(user);
+   return 0;
+   #endif
+}
+
+#ifndef STBI_NO_LINEAR
+// Gamma and scale settings for LDR-to-HDR conversion (defaults 2.2 and
+// 1.0), with setters that store the given value as-is.
+static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f;
+
+STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
+STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
+#endif
+
+// Gamma and scale settings for HDR-to-LDR conversion, stored as
+// reciprocals (defaults 1/2.2 and 1.0). The setters store 1/value, so the
+// caller passes the plain gamma or scale.
+static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
+
+STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
+STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Common code used by all image loaders
+//
+
+// Scan modes; not referenced in this part of the file.
+// NOTE(review): presumably selects how much of a file a loader parses
+// (full load vs. type check vs. header only) - confirm against the loaders.
+enum
+{
+   STBI__SCAN_load=0,
+   STBI__SCAN_type,
+   STBI__SCAN_header
+};
+
+// Asks the read callback for up to buflen bytes into buffer_start and
+// points the read cursor at the fresh data. The bytes consumed from the
+// previous fill are added to callback_already_read first.
+static void stbi__refill_buffer(stbi__context *s)
+{
+   int got = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen);
+   s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original);
+   s->img_buffer = s->buffer_start;
+
+   if (got != 0) {
+      s->img_buffer_end = s->buffer_start + got;
+      return;
+   }
+
+   // End of file: from here on behave like a memory source. The cursor
+   // must still point at safe memory (think of a 0-byte file), so expose a
+   // single zero byte.
+   s->read_from_callbacks = 0;
+   s->img_buffer_end = s->buffer_start+1;
+   *s->img_buffer = 0;
+}
+
+// Returns the next byte of input, refilling the buffer from the callbacks
+// when it runs dry. Returns 0 once the data is exhausted.
+stbi_inline static stbi_uc stbi__get8(stbi__context *s)
+{
+   if (s->img_buffer >= s->img_buffer_end) {
+      // buffer is empty: either pull more data or report exhaustion
+      if (!s->read_from_callbacks)
+         return 0;
+      stbi__refill_buffer(s);
+   }
+   return *s->img_buffer++;
+}
+
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
+// nothing
+#else
+// Returns nonzero when no more input is available.
+stbi_inline static int stbi__at_eof(stbi__context *s)
+{
+   if (s->io.read != NULL) {
+      int source_done = (s->io.eof)(s->io_user_data);
+      if (!source_done)
+         return 0;
+      // The source reports EOF. If the refill logic has already switched
+      // callbacks off, only the artificial 0 byte is left in the buffer.
+      if (s->read_from_callbacks == 0)
+         return 1;
+   }
+
+   // otherwise EOF means the buffer has been fully consumed
+   return s->img_buffer >= s->img_buffer_end;
+}
+#endif
+
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC)
+// nothing
+#else
+// Advances the input by 'n' bytes. A negative 'n' moves the cursor to the
+// end of the buffered data. With callbacks installed, whatever part of the
+// skip is not covered by buffered data is delegated to the skip callback.
+static void stbi__skip(stbi__context *s, int n)
+{
+   int buffered;
+
+   if (n == 0)
+      return;  // nothing to do
+
+   if (n < 0) {
+      s->img_buffer = s->img_buffer_end;
+      return;
+   }
+
+   if (s->io.read == NULL) {
+      s->img_buffer += n;
+      return;
+   }
+
+   buffered = (int) (s->img_buffer_end - s->img_buffer);
+   if (buffered >= n) {
+      s->img_buffer += n;
+      return;
+   }
+
+   // drop everything buffered and let the callback skip the remainder
+   s->img_buffer = s->img_buffer_end;
+   (s->io.skip)(s->io_user_data, n - buffered);
+}
+#endif
+
+#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM)
+// nothing
+#else
+// Copies 'n' bytes of input into 'buffer'. Returns 1 if all 'n' bytes were
+// obtained, 0 otherwise.
+static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
+{
+   if (s->io.read != NULL) {
+      int have = (int) (s->img_buffer_end - s->img_buffer);
+      if (have < n) {
+         // use up what is buffered, then read the remainder straight into
+         // the caller's buffer
+         int wanted = n - have;
+         int got;
+
+         memcpy(buffer, s->img_buffer, have);
+         got = (s->io.read)(s->io_user_data, (char*) buffer + have, wanted);
+         s->img_buffer = s->img_buffer_end;
+         return got == wanted;
+      }
+   }
+
+   if (s->img_buffer+n > s->img_buffer_end)
+      return 0;
+
+   memcpy(buffer, s->img_buffer, n);
+   s->img_buffer += n;
+   return 1;
+}
+#endif
+
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
+// nothing
+#else
+// Reads two bytes and combines them as a big-endian 16-bit value.
+static int stbi__get16be(stbi__context *s)
+{
+   int hi = stbi__get8(s);
+   int lo = stbi__get8(s);
+   return (hi << 8) + lo;
+}
+#endif
+
+#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
+// nothing
+#else
+// Reads four bytes and combines them as a big-endian 32-bit value.
+static stbi__uint32 stbi__get32be(stbi__context *s)
+{
+   stbi__uint32 hi = stbi__get16be(s);
+   stbi__uint32 lo = stbi__get16be(s);
+   return (hi << 16) + lo;
+}
+#endif
+
+#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
+// nothing
+#else
+// Reads two bytes and combines them as a little-endian 16-bit value.
+static int stbi__get16le(stbi__context *s)
+{
+   int lo = stbi__get8(s);
+   int hi = stbi__get8(s);
+   return lo + (hi << 8);
+}
+#endif
+
+#ifndef STBI_NO_BMP
+// Reads four bytes and combines them as a little-endian 32-bit value.
+static stbi__uint32 stbi__get32le(stbi__context *s)
+{
+   stbi__uint32 lo = stbi__get16le(s);
+   stbi__uint32 hi = (stbi__uint32)stbi__get16le(s);
+   return lo + (hi << 16);
+}
+#endif
+
+#define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
+
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
+// nothing
+#else
+//////////////////////////////////////////////////////////////////////////////
+//
+//  generic converter from built-in img_n to req_comp
+//    individual types do this automatically as much as possible (e.g. jpeg
+//    does all cases internally since it needs to colorspace convert anyway,
+//    and it never has alpha, so very few cases ). png can automatically
+//    interleave an alpha=255 channel, but falls back to this for other cases
+//
+//  assume data buffer is malloced, so malloc a new one and free that one
+//  only failure mode is malloc failing
+
+// Collapses an RGB triple to one luma byte using the integer weights
+// 77/150/29 (which sum to 256) followed by a shift by 8.
+static stbi_uc stbi__compute_y(int r, int g, int b)
+{
+   int weighted = (r*77) + (g*150) + (29*b);
+   return (stbi_uc) (weighted >> 8);
+}
+#endif
+
+#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
+// nothing
+#else
+// Converts an 8-bit image of x by y pixels from img_n channels per pixel
+// to req_comp channels per pixel.
+// Returns 'data' itself when no conversion is needed. Otherwise returns a
+// newly allocated buffer and frees 'data'. On failure (out of memory or an
+// unsupported channel combination) 'data' is freed and NULL is returned
+// with the failure reason set.
+static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y)
+{
+   int i,j;
+   unsigned char *good;
+
+   if (req_comp == img_n) return data;
+   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+   // size-checked allocation of req_comp*x*y bytes
+   good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0);
+   if (good == NULL) {
+      STBI_FREE(data);
+      return stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (j=0; j < (int) y; ++j) {
+      unsigned char *src  = data + j * x * img_n   ;
+      unsigned char *dest = good + j * x * req_comp;
+
+      // STBI__COMBO packs a (source, destination) channel pair into one
+      // switch value; STBI__CASE expands to the case label plus a loop
+      // that walks one scanline, advancing src by a and dest by b
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      // convert source image with img_n components to one with req_comp components;
+      // avoid switch per pixel, so use switch per scanline and massive macros
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                  } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                  } break;
+         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255;    } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                    } break;
+         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion");
+      }
+      #undef STBI__CASE
+   }
+
+   STBI_FREE(data);
+   return good;
+}
+#endif
+
+#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
+// nothing
+#else
+static stbi__uint16 stbi__compute_y_16(int r, int g, int b)
+{
+   return (stbi__uint16) ((77*r + 150*g + 29*b) >> 8); // weights 77+150+29 sum to 256, hence the >> 8
+}
+#endif
+
+#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
+// nothing
+#else
+static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y)
+{  // Re-packs a 16-bit, x-by-y image from img_n to req_comp channels per pixel.
+   int i,j;
+   stbi__uint16 *good; // newly allocated output; returned in place of 'data' on success
+
+   if (req_comp == img_n) return data; // nothing to convert: hand back the input buffer itself
+   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+   // 2 bytes per sample: go through stbi__malloc_mad3 so a wrapping size fails as "outofmem" instead of under-allocating
+   good = (stbi__uint16 *) stbi__malloc_mad3(req_comp * 2, x, y, 0);
+   if (good == NULL) {
+      STBI_FREE(data); // 'data' is released on every failure path as well
+      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (j=0; j < (int) y; ++j) {
+      stbi__uint16 *src  = data + j * x * img_n   ;
+      stbi__uint16 *dest = good + j * x * req_comp;
+
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      // convert source image with img_n components to one with req_comp components;
+      // avoid switch per pixel, so use switch per scanline and massive macros
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                     } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                     } break;
+         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                       } break;
+         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion");
+      }
+      #undef STBI__CASE
+   }
+
+   STBI_FREE(data); // conversion finished: the input buffer is no longer needed
+   return good;
+}
+#endif
+
+#ifndef STBI_NO_LINEAR
+static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
+{  // 8-bit -> float: color channels go through the l2h gamma/scale curve, alpha is only divided by 255
+   int px, ch;
+   int npixels, ncolor;
+   float *result;
+   if (data == NULL) return NULL;
+   result = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
+   if (!result) {
+      STBI_FREE(data); // the input buffer is consumed even on failure
+      return stbi__errpf("outofmem", "Out of memory");
+   }
+   npixels = x*y;
+   ncolor = (comp & 1) ? comp : comp-1; // number of non-alpha components
+   for (px=0; px < npixels; ++px)
+      for (ch=0; ch < ncolor; ++ch)
+         result[px*comp + ch] = (float) (pow(data[px*comp+ch]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
+   if (ncolor < comp)
+      for (px=0; px < npixels; ++px)
+         result[px*comp + ncolor] = data[px*comp + ncolor]/255.0f;
+   STBI_FREE(data);
+   return result;
+}
+#endif
+
+#ifndef STBI_NO_HDR
+#define stbi__float2int(x)   ((int) (x))
+static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
+{  // float -> 8-bit: color channels are scaled and gamma-mapped, the leftover channel is linear; both clamp to 0..255
+   int px, ch, ncolor;
+   stbi_uc *result;
+   if (data == NULL) return NULL;
+   result = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0);
+   if (!result) {
+      STBI_FREE(data); // the input buffer is consumed even on failure
+      return stbi__errpuc("outofmem", "Out of memory");
+   }
+   ncolor = (comp & 1) ? comp : comp-1; // number of non-alpha components
+   for (px=0; px < x*y; ++px) {
+      for (ch=0; ch < ncolor; ++ch) {
+         float z = (float) pow(data[px*comp+ch]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
+         if (z < 0) z = 0; else if (z > 255) z = 255;
+         result[px*comp + ch] = (stbi_uc) stbi__float2int(z);
+      }
+      if (ch < comp) { // a channel is left over after the color loop (alpha): no gamma curve
+         float z = data[px*comp+ch] * 255 + 0.5f;
+         if (z < 0) z = 0; else if (z > 255) z = 255;
+         result[px*comp + ch] = (stbi_uc) stbi__float2int(z);
+      }
+   }
+   STBI_FREE(data);
+   return result;
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//  "baseline" JPEG/JFIF decoder
+//
+//    simple implementation
+//      - doesn't support delayed output of y-dimension
+//      - simple interface (only one output format: 8-bit interleaved RGB)
+//      - doesn't try to recover corrupt jpegs
+//      - doesn't allow partial loading, loading multiple at once
+//      - still fast on x86 (copying globals into locals doesn't help x86)
+//      - allocates lots of intermediate memory (full size of all components)
+//        - non-interleaved case requires this anyway
+//        - allows good upsampling (see next)
+//    high-quality
+//      - upsampled channels are bilinearly interpolated, even across blocks
+//      - quality integer IDCT derived from IJG's 'slow'
+//    performance
+//      - fast huffman; reasonable integer IDCT
+//      - some SIMD kernels for common paths on targets with SSE2/NEON
+//      - uses a lot of intermediate memory, could cache poorly
+
+#ifndef STBI_NO_JPEG
+
+// huffman decoding acceleration
+#define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
+
+typedef struct
+{
+   stbi_uc  fast[1 << FAST_BITS];  // indexed by the next FAST_BITS bits: symbol index, or 255 when not accelerated
+   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
+   stbi__uint16 code[256];  // huffman code assigned to each symbol index
+   stbi_uc  values[256];  // value handed back by the decoder for each symbol index
+   stbi_uc  size[257];  // code length in bits per symbol index; a 0 entry terminates the list
+   unsigned int maxcode[18];  // per code length: largest code + 1, preshifted to 16 bits; [17] is an all-ones sentinel
+   int    delta[17];   // old 'firstsymbol' - old 'firstcode'
+} stbi__huffman;
+
+typedef struct
+{
+   stbi__context *s;  // input stream; entropy-coded bytes are pulled from it with stbi__get8
+   stbi__huffman huff_dc[4];
+   stbi__huffman huff_ac[4];
+   stbi__uint16 dequant[4][64];
+   stbi__int16 fast_ac[4][1 << FAST_BITS];  // presumably filled by stbi__build_fast_ac, one table per AC huffman table -- TODO confirm
+
+// sizes for components, interleaved MCUs
+   int img_h_max, img_v_max;
+   int img_mcu_x, img_mcu_y;
+   int img_mcu_w, img_mcu_h;
+
+// definition of jpeg image component
+   struct
+   {
+      int id;
+      int h,v;
+      int tq;
+      int hd,ha;
+      int dc_pred;  // running DC predictor: each decoded DC difference is added to it
+
+      int x,y,w2,h2;
+      stbi_uc *data;
+      void *raw_data, *raw_coeff;
+      stbi_uc *linebuf;
+      short   *coeff;   // progressive only
+      int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
+   } img_comp[4];
+
+   stbi__uint32   code_buffer; // jpeg entropy-coded buffer; the valid bits sit at the most-significant end
+   int            code_bits;   // number of valid bits
+   unsigned char  marker;      // marker seen while filling entropy buffer
+   int            nomore;      // flag if we saw a marker so must stop
+
+   int            progressive;
+   int            spec_start;  // first zigzag index handled by a progressive AC scan
+   int            spec_end;    // last zigzag index handled by a progressive AC scan
+   int            succ_high;   // 0 selects the first-pass decode path, nonzero the refinement path
+   int            succ_low;    // bit position (1 << succ_low) used to scale values in progressive scans
+   int            eob_run;     // pending end-of-band run: blocks still to be handled without decoding new coefficients
+   int            jfif;
+   int            app14_color_transform; // Adobe APP14 tag
+   int            rgb;
+
+   int scan_n, order[4];
+   int restart_interval, todo;
+
+// kernels
+   void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
+   void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step);
+   stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs);
+} stbi__jpeg;
+
+static int stbi__build_huffman(stbi__huffman *h, int *count)
+{  // Fills h->size/code/delta/maxcode/fast from count[0..15] (codes per length 1..16); returns 1, or the stbi__err result.
+   int i,j,k=0;
+   unsigned int code;
+   // build size list for each symbol (from JPEG spec)
+   for (i=0; i < 16; ++i) {
+      for (j=0; j < count[i]; ++j) {
+         h->size[k++] = (stbi_uc) (i+1);
+         if(k >= 257) return stbi__err("bad size list","Corrupt JPEG"); // keeps the terminator write below inside size[257]
+      }
+   }
+   h->size[k] = 0; // terminator: no code length is 0, so the loops below stop here
+
+   // compute actual symbols (from jpeg spec)
+   code = 0;
+   k = 0;
+   for(j=1; j <= 16; ++j) {
+      // compute delta to add to code to compute symbol id
+      h->delta[j] = k - code;
+      if (h->size[k] == j) {
+         while (h->size[k] == j)
+            h->code[k++] = (stbi__uint16) (code++);
+         if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG"); // last code must fit in j bits
+      }
+      // compute largest code + 1 for this size, preshifted as needed later
+      h->maxcode[j] = code << (16-j);
+      code <<= 1;
+   }
+   h->maxcode[j] = 0xffffffff; // j == 17 here: sentinel entry that every 16-bit value compares below
+
+   // build non-spec acceleration table; 255 is flag for not-accelerated
+   memset(h->fast, 255, 1 << FAST_BITS);
+   for (i=0; i < k; ++i) {
+      int s = h->size[i];
+      if (s <= FAST_BITS) {
+         int c = h->code[i] << (FAST_BITS-s); // code left-aligned within FAST_BITS bits
+         int m = 1 << (FAST_BITS-s); // number of FAST_BITS-bit values that start with this code
+         for (j=0; j < m; ++j) {
+            h->fast[c+j] = (stbi_uc) i;
+         }
+      }
+   }
+   return 1;
+}
+
+// Fill fast_ac[] (one entry per FAST_BITS-bit prefix) so that a short AC code plus its
+// magnitude bits can be decoded with a single lookup; entries left at 0 mean "no shortcut".
+static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
+{
+   int prefix;
+   for (prefix=0; prefix < (1 << FAST_BITS); ++prefix) {
+      stbi_uc sym = h->fast[prefix];
+      int rs, run, magbits, len, k, m;
+      fast_ac[prefix] = 0;
+      if (sym >= 255) continue; // prefix is not covered by the fast huffman table
+
+      rs = h->values[sym];
+      run = (rs >> 4) & 15;
+      magbits = rs & 15;
+      len = h->size[sym];
+      if (!magbits || len + magbits > FAST_BITS) continue; // no magnitude bits, or code + magnitude exceed the prefix
+
+      // magnitude code followed by receive_extend code
+      k = ((prefix << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
+      m = 1 << (magbits - 1);
+      if (k < m) k += (~0U << magbits) + 1;
+      // if the result is small enough, we can fit it in fast_ac table
+      if (k >= -128 && k <= 127)
+         fast_ac[prefix] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits));
+   }
+}
+
+static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
+{  // Refills j->code_buffer a byte at a time until more than 24 bits are valid, or a marker is hit.
+   do {
+      unsigned int b = j->nomore ? 0 : stbi__get8(j->s); // once a marker was seen, feed zero bytes instead of reading
+      if (b == 0xff) {
+         int c = stbi__get8(j->s);
+         while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes
+         if (c != 0) { // 0xff + nonzero byte is a marker; 0xff 0x00 falls through as a literal 0xff data byte
+            j->marker = (unsigned char) c;
+            j->nomore = 1;
+            return;
+         }
+      }
+      j->code_buffer |= b << (24 - j->code_bits); // place the byte just below the code_bits already-valid top bits
+      j->code_bits += 8;
+   } while (j->code_bits <= 24);
+}
+
+// stbi__bmask[n] == (1 << n) - 1, i.e. a mask selecting the low n bits, for n = 0..16
+static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
+
+// decode a jpeg huffman value from the bitstream; returns h->values[] of the matched code, or -1 on failure
+stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
+{
+   unsigned int temp;
+   int c,k;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+
+   // look at the top FAST_BITS and determine what symbol ID it is,
+   // if the code is <= FAST_BITS
+   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+   k = h->fast[c];
+   if (k < 255) {
+      int s = h->size[k];
+      if (s > j->code_bits) // the matched code is longer than the bits actually buffered
+         return -1;
+      j->code_buffer <<= s;
+      j->code_bits -= s;
+      return h->values[k];
+   }
+
+   // naive test is to shift the code_buffer down so k bits are
+   // valid, then test against maxcode. To speed this up, we've
+   // preshifted maxcode left so that it has (16-k) 0s at the
+   // end; in other words, regardless of the number of bits, it
+   // wants to be compared against something shifted to have 16;
+   // that way we don't need to shift inside the loop.
+   temp = j->code_buffer >> 16;
+   for (k=FAST_BITS+1 ; ; ++k) // always terminates: stbi__build_huffman stores 0xffffffff in maxcode[17]
+      if (temp < h->maxcode[k])
+         break;
+   if (k == 17) {
+      // error! code not found
+      j->code_bits -= 16;
+      return -1;
+   }
+
+   if (k > j->code_bits) // the code would need more bits than are buffered
+      return -1;
+
+   // convert the huffman code to the symbol id
+   c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
+   if(c < 0 || c >= 256) // symbol id out of bounds!
+       return -1;
+   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
+
+   // convert the id to a symbol
+   j->code_bits -= k;
+   j->code_buffer <<= k;
+   return h->values[c];
+}
+
+// stbi__jbias[n] == (-1<<n) + 1 for n = 0..15: the offset stbi__extend_receive adds when the leading bit it reads is 0
+static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+
+// combined JPEG 'receive' and JPEG 'extend', since baseline
+// always extends everything it receives.
+stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   int sgn;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s instead of continuing
+   // n indexes stbi__bmask[17] and stbi__jbias[16] below, so callers must keep it inside those tables
+   sgn = j->code_buffer >> 31; // leading bit of the field: 1 -> value is used as read, 0 -> stbi__jbias[n] is added (negative result)
+   k = stbi_lrot(j->code_buffer, n); // presumably a left-rotate bringing the top n bits to the bottom -- TODO confirm
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   return k + (stbi__jbias[n] & (sgn - 1)); // (sgn - 1) is all ones when sgn == 0 and zero when sgn == 1
+}
+
+// read the next n bits of the entropy-coded stream as an unsigned value
+stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
+{
+   unsigned int rotated;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s instead of continuing
+   // after the rotate, the low n bits are the requested field and the rest stays buffered
+   rotated = stbi_lrot(j->code_buffer, n);
+   j->code_bits -= n;
+   j->code_buffer = rotated & ~stbi__bmask[n];
+   return rotated & stbi__bmask[n];
+}
+
+stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
+{  // pops one bit; the result is zero/nonzero (the raw top bit), not 0/1
+   unsigned int top;
+   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s instead of continuing
+   top = j->code_buffer & 0x80000000;
+   j->code_bits -= 1;
+   j->code_buffer <<= 1;
+   return top;
+}
+
+// given a value that's at position X in the zigzag stream,
+// where does it appear in the 8x8 matrix coded as row-major?
+static const stbi_uc stbi__jpeg_dezigzag[64+15] =
+{
+    0,  1,  8, 16,  9,  2,  3, 10,
+   17, 24, 32, 25, 18, 11,  4,  5,
+   12, 19, 26, 33, 40, 48, 41, 34,
+   27, 20, 13,  6,  7, 14, 21, 28,
+   35, 42, 49, 56, 57, 50, 43, 36,
+   29, 22, 15, 23, 30, 37, 44, 51,
+   58, 59, 52, 45, 38, 31, 39, 46,
+   53, 60, 61, 54, 47, 55, 62, 63,
+   // let corrupt input sample past end: a run of up to 15 added to an index <= 63 still lands in these 15 entries
+   63, 63, 63, 63, 63, 63, 63, 63,
+   63, 63, 63, 63, 63, 63, 63
+};
+
+// decode one 64-entry block: DC via hdc and component b's predictor, AC via fac/hac; values are dequantized and stored de-zigzagged. Returns 1, or the stbi__err result.
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
+{
+   int diff,dc,k;
+   int t;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+   t = stbi__jpeg_huff_decode(j, hdc);
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG"); // t is the bit count of the DC difference
+
+   // 0 all the ac values now so we can do it 32-bits at a time
+   memset(data,0,64*sizeof(data[0]));
+
+   diff = t ? stbi__extend_receive(j, t) : 0;
+   if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG");
+   dc = j->img_comp[b].dc_pred + diff;
+   j->img_comp[b].dc_pred = dc; // the predictor carries over to this component's next block
+   if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+   data[0] = (short) (dc * dequant[0]);
+
+   // decode AC components, see JPEG spec
+   k = 1;
+   do {
+      unsigned int zig;
+      int c,r,s;
+      if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+      r = fac[c]; // packed as value*256 + run*16 + total bit length; 0 means no fast entry
+      if (r) { // fast-AC path
+         k += (r >> 4) & 15; // run
+         s = r & 15; // combined length
+         if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
+         j->code_buffer <<= s;
+         j->code_bits -= s;
+         // decode into unzigzag'd location
+         zig = stbi__jpeg_dezigzag[k++];
+         data[zig] = (short) ((r >> 8) * dequant[zig]);
+      } else {
+         int rs = stbi__jpeg_huff_decode(j, hac);
+         if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+         s = rs & 15; // number of magnitude bits that follow
+         r = rs >> 4; // run of zero coefficients before the value
+         if (s == 0) {
+            if (rs != 0xf0) break; // end block
+            k += 16; // rs == 0xf0: skip 16 coefficients, which stay 0 from the memset
+         } else {
+            k += r;
+            // decode into unzigzag'd location
+            zig = stbi__jpeg_dezigzag[k++];
+            data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]);
+         }
+      }
+   } while (k < 64);
+   return 1;
+}
+
+static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b)
+{  // progressive DC scan for one block: the first pass decodes data[0], refinement passes may add one bit to it
+   int t, diff, dc;
+   if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+
+   if (j->succ_high != 0) {
+      // refinement scan for DC coefficient
+      if (stbi__jpeg_get_bit(j))
+         data[0] += (short) (1 << j->succ_low);
+      return 1;
+   }
+
+   // first scan for DC coefficient, must be first
+   memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
+   t = stbi__jpeg_huff_decode(j, hdc);
+   if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+   diff = (t == 0) ? 0 : stbi__extend_receive(j, t);
+
+   if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG");
+   dc = j->img_comp[b].dc_pred + diff;
+   j->img_comp[b].dc_pred = dc;
+   if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+   data[0] = (short) (dc * (1 << j->succ_low));
+   return 1;
+}
+
+// @OPTIMIZE: store non-zigzagged during the decode passes,
+// and only de-zigzag when dequantizing
+static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac)
+{  // progressive AC scan for one block over zigzag indices spec_start..spec_end; returns 1, or the stbi__err result
+   int k;
+   if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+   if (j->succ_high == 0) { // first pass for this band: decode new coefficients
+      int shift = j->succ_low;
+
+      if (j->eob_run) { // an earlier end-of-band run still covers this block: nothing to decode
+         --j->eob_run;
+         return 1;
+      }
+
+      k = j->spec_start;
+      do {
+         unsigned int zig;
+         int c,r,s;
+         if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+         r = fac[c]; // packed as value*256 + run*16 + total bit length; 0 means no fast entry
+         if (r) { // fast-AC path
+            k += (r >> 4) & 15; // run
+            s = r & 15; // combined length
+            if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
+            j->code_buffer <<= s;
+            j->code_bits -= s;
+            zig = stbi__jpeg_dezigzag[k++];
+            data[zig] = (short) ((r >> 8) * (1 << shift));
+         } else {
+            int rs = stbi__jpeg_huff_decode(j, hac);
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) { // end-of-band: run length is (1 << r) plus r extra bits, counting this block
+                  j->eob_run = (1 << r);
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  --j->eob_run; // this block uses up one unit of the run
+                  break;
+               }
+               k += 16; // r == 15, s == 0: skip 16 coefficients
+            } else {
+               k += r;
+               zig = stbi__jpeg_dezigzag[k++];
+               data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift));
+            }
+         }
+      } while (k <= j->spec_end);
+   } else {
+      // refinement scan for these AC coefficients
+
+      short bit = (short) (1 << j->succ_low);
+
+      if (j->eob_run) { // inside an end-of-band run: only already-nonzero coefficients can receive a correction bit
+         --j->eob_run;
+         for (k = j->spec_start; k <= j->spec_end; ++k) {
+            short *p = &data[stbi__jpeg_dezigzag[k]];
+            if (*p != 0)
+               if (stbi__jpeg_get_bit(j))
+                  if ((*p & bit)==0) {
+                     if (*p > 0)
+                        *p += bit;
+                     else
+                        *p -= bit;
+                  }
+         }
+      } else {
+         k = j->spec_start;
+         do {
+            int r,s;
+            int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  j->eob_run = (1 << r) - 1;
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  r = 64; // force end of block
+               } else {
+                  // r=15 s=0 should write 16 0s, so we just do
+                  // a run of 15 0s and then write s (which is 0),
+                  // so we don't have to do anything special here
+               }
+            } else {
+               if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG");
+               // sign bit
+               if (stbi__jpeg_get_bit(j))
+                  s = bit;
+               else
+                  s = -bit;
+            }
+
+            // advance by r
+            while (k <= j->spec_end) {
+               short *p = &data[stbi__jpeg_dezigzag[k++]];
+               if (*p != 0) { // nonzero coefficients are not counted in the run but may get a correction bit
+                  if (stbi__jpeg_get_bit(j))
+                     if ((*p & bit)==0) {
+                        if (*p > 0)
+                           *p += bit;
+                        else
+                           *p -= bit;
+                     }
+               } else {
+                  if (r == 0) { // run exhausted: this zero slot receives the new value s
+                     *p = (short) s;
+                     break;
+                  }
+                  --r;
+               }
+            }
+         } while (k <= j->spec_end);
+      }
+   }
+   return 1;
+}
+
+// saturate an int to the 0..255 range of stbi_uc: negatives become 0,
+// values above 255 become 255, anything else passes through unchanged
+stbi_inline static stbi_uc stbi__clamp(int x)
+{
+   // one unsigned compare accepts every in-range value (negatives wrap to huge values)
+   if ((unsigned int) x <= 255)
+      return (stbi_uc) x;
+   // out of range: pick the bound on the side x fell off
+   return (stbi_uc) (x < 0 ? 0 : 255);
+}
+
+#define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))  // float constant scaled by 4096 (1<<12) and converted to int
+#define stbi__fsh(x)  ((x) * 4096)  // scale an integer by the same 4096 factor
+
+// derived from jidctint -- DCT_ISLOW; one 8-point 1D pass: declares its own locals and leaves x0..x3 / t0..t3 for the caller to combine
+#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
+   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
+   p2 = s2;                                    \
+   p3 = s6;                                    \
+   p1 = (p2+p3) * stbi__f2f(0.5411961f);       \
+   t2 = p1 + p3*stbi__f2f(-1.847759065f);      \
+   t3 = p1 + p2*stbi__f2f( 0.765366865f);      \
+   p2 = s0;                                    \
+   p3 = s4;                                    \
+   t0 = stbi__fsh(p2+p3);                      \
+   t1 = stbi__fsh(p2-p3);                      \
+   x0 = t0+t3;                                 \
+   x3 = t0-t3;                                 \
+   x1 = t1+t2;                                 \
+   x2 = t1-t2;                                 \
+   t0 = s7;                                    \
+   t1 = s5;                                    \
+   t2 = s3;                                    \
+   t3 = s1;                                    \
+   p3 = t0+t2;                                 \
+   p4 = t1+t3;                                 \
+   p1 = t0+t3;                                 \
+   p2 = t1+t2;                                 \
+   p5 = (p3+p4)*stbi__f2f( 1.175875602f);      \
+   t0 = t0*stbi__f2f( 0.298631336f);           \
+   t1 = t1*stbi__f2f( 2.053119869f);           \
+   t2 = t2*stbi__f2f( 3.072711026f);           \
+   t3 = t3*stbi__f2f( 1.501321110f);           \
+   p1 = p5 + p1*stbi__f2f(-0.899976223f);      \
+   p2 = p5 + p2*stbi__f2f(-2.562915447f);      \
+   p3 = p3*stbi__f2f(-1.961570560f);           \
+   p4 = p4*stbi__f2f(-0.390180644f);           \
+   t3 += p1+p4;                                \
+   t2 += p2+p3;                                \
+   t1 += p2+p4;                                \
+   t0 += p1+p3;
+
+static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
+{  // 2D 8x8 inverse DCT of data[], written as clamped bytes to 8 rows of 'out' that are out_stride apart
+   int i,val[64],*v=val; // val holds the column-pass results that the row pass consumes
+   stbi_uc *o;
+   short *d = data;
+
+   // columns
+   for (i=0; i < 8; ++i,++d, ++v) {
+      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
+      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
+           && d[40]==0 && d[48]==0 && d[56]==0) {
+         //    no shortcut                 0     seconds
+         //    (1|2|3|4|5|6|7)==0          0     seconds
+         //    all separate               -0.047 seconds
+         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
+         int dcterm = d[0]*4; // *4 matches the 2 extra precision bits the general path keeps
+         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
+      } else {
+         STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
+         // constants scaled things up by 1<<12; let's bring them back
+         // down, but keep 2 extra bits of precision
+         x0 += 512; x1 += 512; x2 += 512; x3 += 512; // 512 is half of 1<<10: rounds the >> 10 below
+         v[ 0] = (x0+t3) >> 10;
+         v[56] = (x0-t3) >> 10;
+         v[ 8] = (x1+t2) >> 10;
+         v[48] = (x1-t2) >> 10;
+         v[16] = (x2+t1) >> 10;
+         v[40] = (x2-t1) >> 10;
+         v[24] = (x3+t0) >> 10;
+         v[32] = (x3-t0) >> 10;
+      }
+   }
+
+   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) { // rows: one output row of 8 bytes per iteration
+      // no fast case since the first 1D IDCT spread components out
+      STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
+      // constants scaled things up by 1<<12, plus we had 1<<2 from first
+      // loop, plus horizontal and vertical each scale by sqrt(8) so together
+      // we've got an extra 1<<3, so 1<<17 total we need to remove.
+      // so we want to round that, which means adding 0.5 * 1<<17,
+      // aka 65536. Also, we'll end up with -128 to 127 that we want
+      // to encode as 0..255 by adding 128, so we'll add that before the shift
+      x0 += 65536 + (128<<17);
+      x1 += 65536 + (128<<17);
+      x2 += 65536 + (128<<17);
+      x3 += 65536 + (128<<17);
+      // tried computing the shifts into temps, or'ing the temps to see
+      // if any were out of range, but that was slower
+      o[0] = stbi__clamp((x0+t3) >> 17);
+      o[7] = stbi__clamp((x0-t3) >> 17);
+      o[1] = stbi__clamp((x1+t2) >> 17);
+      o[6] = stbi__clamp((x1-t2) >> 17);
+      o[2] = stbi__clamp((x2+t1) >> 17);
+      o[5] = stbi__clamp((x2-t1) >> 17);
+      o[3] = stbi__clamp((x3+t0) >> 17);
+      o[4] = stbi__clamp((x3-t0) >> 17);
+   }
+}
+
+#ifdef STBI_SSE2
+// sse2 integer IDCT. not the fastest possible implementation but it
+// produces bit-identical results to the generic C version so it's
+// fully "transparent".
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   // This is constructed to match our regular (generic) integer IDCT exactly.
+   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+   __m128i tmp;
+
+   // dot product constant: even elems=x, odd elems=y
+   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
+
+   // out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
+   // out(1) = c1[even]*x + c1[odd]*y
+   #define dct_rot(out0,out1, x,y,c0,c1) \
+      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
+      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
+      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
+      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
+      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
+      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
+
+   // out = in << 12  (in 16-bit, out 32-bit)
+   #define dct_widen(out, in) \
+      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
+      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
+
+   // wide add
+   #define dct_wadd(out, a, b) \
+      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
+
+   // wide sub
+   #define dct_wsub(out, a, b) \
+      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
+
+   // butterfly a/b, add bias, then shift by "s" and pack
+   #define dct_bfly32o(out0, out1, a,b,bias,s) \
+      { \
+         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
+         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
+         dct_wadd(sum, abiased, b); \
+         dct_wsub(dif, abiased, b); \
+         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
+         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
+      }
+
+   // 8-bit interleave step (for transposes)
+   #define dct_interleave8(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi8(a, b); \
+      b = _mm_unpackhi_epi8(tmp, b)
+
+   // 16-bit interleave step (for transposes)
+   #define dct_interleave16(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi16(a, b); \
+      b = _mm_unpackhi_epi16(tmp, b)
+
+   #define dct_pass(bias,shift) \
+      { \
+         /* even part */ \
+         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
+         __m128i sum04 = _mm_add_epi16(row0, row4); \
+         __m128i dif04 = _mm_sub_epi16(row0, row4); \
+         dct_widen(t0e, sum04); \
+         dct_widen(t1e, dif04); \
+         dct_wadd(x0, t0e, t3e); \
+         dct_wsub(x3, t0e, t3e); \
+         dct_wadd(x1, t1e, t2e); \
+         dct_wsub(x2, t1e, t2e); \
+         /* odd part */ \
+         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
+         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
+         __m128i sum17 = _mm_add_epi16(row1, row7); \
+         __m128i sum35 = _mm_add_epi16(row3, row5); \
+         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
+         dct_wadd(x4, y0o, y4o); \
+         dct_wadd(x5, y1o, y5o); \
+         dct_wadd(x6, y2o, y5o); \
+         dct_wadd(x7, y3o, y4o); \
+         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
+         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
+         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
+         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
+      }
+
+   __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
+   __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
+   __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
+   __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
+   __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
+   __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
+   __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
+   __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));
+
+   // rounding biases in column/row passes, see stbi__idct_block for explanation.
+   __m128i bias_0 = _mm_set1_epi32(512);
+   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
+
+   // load
+   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
+   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
+   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
+   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
+   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
+   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
+   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
+   row7 = _mm_load_si128((const __m128i *) (data + 7*8));
+
+   // column pass
+   dct_pass(bias_0, 10);
+
+   {
+      // 16bit 8x8 transpose pass 1
+      dct_interleave16(row0, row4);
+      dct_interleave16(row1, row5);
+      dct_interleave16(row2, row6);
+      dct_interleave16(row3, row7);
+
+      // transpose pass 2
+      dct_interleave16(row0, row2);
+      dct_interleave16(row1, row3);
+      dct_interleave16(row4, row6);
+      dct_interleave16(row5, row7);
+
+      // transpose pass 3
+      dct_interleave16(row0, row1);
+      dct_interleave16(row2, row3);
+      dct_interleave16(row4, row5);
+      dct_interleave16(row6, row7);
+   }
+
+   // row pass
+   dct_pass(bias_1, 17);
+
+   {
+      // pack
+      __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
+      __m128i p1 = _mm_packus_epi16(row2, row3);
+      __m128i p2 = _mm_packus_epi16(row4, row5);
+      __m128i p3 = _mm_packus_epi16(row6, row7);
+
+      // 8bit 8x8 transpose pass 1
+      dct_interleave8(p0, p2); // a0e0a1e1...
+      dct_interleave8(p1, p3); // c0g0c1g1...
+
+      // transpose pass 2
+      dct_interleave8(p0, p1); // a0c0e0g0...
+      dct_interleave8(p2, p3); // b0d0f0h0...
+
+      // transpose pass 3
+      dct_interleave8(p0, p2); // a0b0c0d0...
+      dct_interleave8(p1, p3); // a4b4c4d4...
+
+      // store
+      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
+   }
+
+#undef dct_const
+#undef dct_rot
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_interleave8
+#undef dct_interleave16
+#undef dct_pass
+}
+
+#endif // STBI_SSE2
+
+#ifdef STBI_NEON
+
+// NEON integer IDCT of one 8x8 block: reads 64 coefficients from data and writes 8 rows of
+// 8 bytes to out, out_stride apart. should produce bit-identical results to the generic C version.
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; // one vector of eight 16-bit lanes per row of the block
+
+   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f)); // each rotation constant is broadcast to all four lanes
+   int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
+   int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
+   int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
+   int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
+   int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
+   int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
+   int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
+   int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
+   int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
+   int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
+   int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));
+
+#define dct_long_mul(out, inq, coeff) /* declares out_l/out_h = 32-bit products of the low/high halves of inq with coeff */ \
+   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
+
+#define dct_long_mac(out, acc, inq, coeff) /* declares out_l/out_h = acc_l/acc_h plus the 32-bit products of inq with coeff */ \
+   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
+
+#define dct_widen(out, inq) /* declares out_l/out_h = inq widened to 32 bits and shifted left by 12 */ \
+   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
+   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
+
+// wide add
+#define dct_wadd(out, a, b) \
+   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
+
+// wide sub
+#define dct_wsub(out, a, b) \
+   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
+
+// butterfly a/b, then shift using "shiftop" by "s" and pack
+#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
+   { \
+      dct_wadd(sum, a, b); \
+      dct_wsub(dif, a, b); \
+      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
+      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
+   }
+
+#define dct_pass(shiftop, shift) /* one 1-D IDCT pass over row0..row7 in place; shiftop/shift narrow the 32-bit results back to 16 bits */ \
+   { \
+      /* even part */ \
+      int16x8_t sum26 = vaddq_s16(row2, row6); \
+      dct_long_mul(p1e, sum26, rot0_0); \
+      dct_long_mac(t2e, p1e, row6, rot0_1); \
+      dct_long_mac(t3e, p1e, row2, rot0_2); \
+      int16x8_t sum04 = vaddq_s16(row0, row4); \
+      int16x8_t dif04 = vsubq_s16(row0, row4); \
+      dct_widen(t0e, sum04); \
+      dct_widen(t1e, dif04); \
+      dct_wadd(x0, t0e, t3e); \
+      dct_wsub(x3, t0e, t3e); \
+      dct_wadd(x1, t1e, t2e); \
+      dct_wsub(x2, t1e, t2e); \
+      /* odd part */ \
+      int16x8_t sum15 = vaddq_s16(row1, row5); \
+      int16x8_t sum17 = vaddq_s16(row1, row7); \
+      int16x8_t sum35 = vaddq_s16(row3, row5); \
+      int16x8_t sum37 = vaddq_s16(row3, row7); \
+      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
+      dct_long_mul(p5o, sumodd, rot1_0); \
+      dct_long_mac(p1o, p5o, sum17, rot1_1); \
+      dct_long_mac(p2o, p5o, sum35, rot1_2); \
+      dct_long_mul(p3o, sum37, rot2_0); \
+      dct_long_mul(p4o, sum15, rot2_1); \
+      dct_wadd(sump13o, p1o, p3o); \
+      dct_wadd(sump24o, p2o, p4o); \
+      dct_wadd(sump23o, p2o, p3o); \
+      dct_wadd(sump14o, p1o, p4o); \
+      dct_long_mac(x4, sump13o, row7, rot3_0); \
+      dct_long_mac(x5, sump24o, row5, rot3_1); \
+      dct_long_mac(x6, sump23o, row3, rot3_2); \
+      dct_long_mac(x7, sump14o, row1, rot3_3); \
+      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); /* row0 = x0+x7, row7 = x0-x7 (even/odd outputs recombined) */ \
+      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
+      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
+      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
+   }
+
+   // load
+   row0 = vld1q_s16(data + 0*8);
+   row1 = vld1q_s16(data + 1*8);
+   row2 = vld1q_s16(data + 2*8);
+   row3 = vld1q_s16(data + 3*8);
+   row4 = vld1q_s16(data + 4*8);
+   row5 = vld1q_s16(data + 5*8);
+   row6 = vld1q_s16(data + 6*8);
+   row7 = vld1q_s16(data + 7*8);
+
+   // add DC bias
+   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); // adds 1024 to lane 0 of row0 only, i.e. to data[0]
+
+   // column pass
+   dct_pass(vrshrn_n_s32, 10);
+
+   // 16bit 8x8 transpose
+   {
+// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
+// whether compilers actually get this is another story, sadly.
+#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
+#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
+
+      // pass 1
+      dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
+      dct_trn16(row2, row3);
+      dct_trn16(row4, row5);
+      dct_trn16(row6, row7);
+
+      // pass 2
+      dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
+      dct_trn32(row1, row3);
+      dct_trn32(row4, row6);
+      dct_trn32(row5, row7);
+
+      // pass 3
+      dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
+      dct_trn64(row1, row5);
+      dct_trn64(row2, row6);
+      dct_trn64(row3, row7);
+
+#undef dct_trn16
+#undef dct_trn32
+#undef dct_trn64
+   }
+
+   // row pass
+   // vrshrn_n_s32 only supports shifts up to 16, we need
+   // 17. so do a non-rounding shift of 16 first then follow
+   // up with a rounding shift by 1.
+   dct_pass(vshrn_n_s32, 16);
+
+   {
+      // pack and round: the remaining rounding shift by 1, narrowing each row to eight unsigned bytes
+      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
+      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
+      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
+      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
+      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
+      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
+      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
+      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
+
+      // again, these can translate into one instruction, but often don't.
+#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
+#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
+
+      // sadly can't use interleaved stores here since we only write
+      // 8 bytes to each scan line!
+
+      // 8x8 8-bit transpose pass 1
+      dct_trn8_8(p0, p1);
+      dct_trn8_8(p2, p3);
+      dct_trn8_8(p4, p5);
+      dct_trn8_8(p6, p7);
+
+      // pass 2
+      dct_trn8_16(p0, p2);
+      dct_trn8_16(p1, p3);
+      dct_trn8_16(p4, p6);
+      dct_trn8_16(p5, p7);
+
+      // pass 3
+      dct_trn8_32(p0, p4);
+      dct_trn8_32(p1, p5);
+      dct_trn8_32(p2, p6);
+      dct_trn8_32(p3, p7);
+
+      // store: one 8-byte row per output scan line, advancing out by out_stride between rows
+      vst1_u8(out, p0); out += out_stride;
+      vst1_u8(out, p1); out += out_stride;
+      vst1_u8(out, p2); out += out_stride;
+      vst1_u8(out, p3); out += out_stride;
+      vst1_u8(out, p4); out += out_stride;
+      vst1_u8(out, p5); out += out_stride;
+      vst1_u8(out, p6); out += out_stride;
+      vst1_u8(out, p7);
+
+#undef dct_trn8_8
+#undef dct_trn8_16
+#undef dct_trn8_32
+   }
+
+#undef dct_long_mul
+#undef dct_long_mac
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_pass
+}
+
+#endif // STBI_NEON
+
+#define STBI__MARKER_none  0xff // sentinel meaning "no marker"; also the value stored in j->marker when nothing is pending
+// if there's a pending marker from the entropy stream, return that
+// otherwise, fetch from the stream and get a marker. if there's no
+// marker, return 0xff, which is never a valid marker value
+static stbi_uc stbi__get_marker(stbi__jpeg *j)
+{
+   stbi_uc x;
+   if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; } // hand out the cached marker once, then clear it
+   x = stbi__get8(j->s);
+   if (x != 0xff) return STBI__MARKER_none; // next byte is not a 0xff marker prefix; note that byte has been consumed
+   while (x == 0xff)
+      x = stbi__get8(j->s); // consume repeated 0xff fill bytes
+   return x; // first non-0xff byte after the prefix is returned as the marker code
+}
+
+// in each scan, we'll have scan_n components, and the order
+// of the components is specified by order[]
+#define STBI__RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7) // true when x is one of the eight restart marker codes 0xd0..0xd7
+
+// after a restart interval (and at the start of every scan), reset the
+// entropy decoder and the dc prediction
+static void stbi__jpeg_reset(stbi__jpeg *j)
+{
+   j->code_bits = 0;
+   j->code_buffer = 0;
+   j->nomore = 0;
+   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
+   j->marker = STBI__MARKER_none;
+   j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff; // MCUs left before the next restart check; effectively unlimited when no interval was set
+   j->eob_run = 0;
+   // no more than 1<<31 MCUs if no restart_interval? that's plenty safe,
+   // since we don't even allow 1<<30 pixels
+}
+
+static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
+{ // decodes the blocks of one scan; returns 0 when a block fails to decode, 1 otherwise (including the early bail-outs below)
+   stbi__jpeg_reset(z);
+   if (!z->progressive) { // baseline: each block is decoded and immediately run through the IDCT into the component plane
+      if (z->scan_n == 1) {
+         int i,j;
+         STBI_SIMD_ALIGN(short, data[64]);
+         int n = z->order[0];
+         // non-interleaved data, we just need to process one block at a time,
+         // in trivial scanline order
+         // number of blocks to do just depends on how many actual "pixels" this
+         // component has, independent of interleaved MCU blocking and such
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               int ha = z->img_comp[n].ha;
+               if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
+               // every data block is an MCU, so countdown the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  // if it's NOT a restart, then just bail, so we get corrupt data
+                  // rather than no data
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      } else { // interleaved
+         int i,j,k,x,y;
+         STBI_SIMD_ALIGN(short, data[64]);
+         for (j=0; j < z->img_mcu_y; ++j) {
+            for (i=0; i < z->img_mcu_x; ++i) {
+               // scan an interleaved mcu... process scan_n components in order
+               for (k=0; k < z->scan_n; ++k) {
+                  int n = z->order[k];
+                  // scan out an mcu's worth of this component; that's just determined
+                  // by the basic H and V specified for the component
+                  for (y=0; y < z->img_comp[n].v; ++y) {
+                     for (x=0; x < z->img_comp[n].h; ++x) {
+                        int x2 = (i*z->img_comp[n].h + x)*8; // pixel offsets of this block within the component plane
+                        int y2 = (j*z->img_comp[n].v + y)*8;
+                        int ha = z->img_comp[n].ha;
+                        if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+                        z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data);
+                     }
+                  }
+               }
+               // after all interleaved components, that's an interleaved MCU,
+               // so now count down the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1; // same bail-out as the non-interleaved case: keep what was decoded so far
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      }
+   } else { // progressive: coefficients are accumulated in img_comp[n].coeff; no IDCT is performed in this function
+      if (z->scan_n == 1) {
+         int i,j;
+         int n = z->order[0];
+         // non-interleaved data, we just need to process one block at a time,
+         // in trivial scanline order
+         // number of blocks to do just depends on how many actual "pixels" this
+         // component has, independent of interleaved MCU blocking and such
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); // this block's 64 coefficients in the coefficient buffer
+               if (z->spec_start == 0) { // a scan starting at coefficient 0 is decoded as a DC scan, anything else as an AC scan
+                  if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                     return 0;
+               } else {
+                  int ha = z->img_comp[n].ha;
+                  if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
+                     return 0;
+               }
+               // every data block is an MCU, so countdown the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      } else { // interleaved
+         int i,j,k,x,y;
+         for (j=0; j < z->img_mcu_y; ++j) {
+            for (i=0; i < z->img_mcu_x; ++i) {
+               // scan an interleaved mcu... process scan_n components in order
+               for (k=0; k < z->scan_n; ++k) {
+                  int n = z->order[k];
+                  // scan out an mcu's worth of this component; that's just determined
+                  // by the basic H and V specified for the component
+                  for (y=0; y < z->img_comp[n].v; ++y) {
+                     for (x=0; x < z->img_comp[n].h; ++x) {
+                        int x2 = (i*z->img_comp[n].h + x); // block (not pixel) coordinates within the coefficient buffer
+                        int y2 = (j*z->img_comp[n].v + y);
+                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
+                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) // only the DC decoder is invoked for interleaved progressive scans
+                           return 0;
+                     }
+                  }
+               }
+               // after all interleaved components, that's an interleaved MCU,
+               // so now count down the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      }
+   }
+}
+
+static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant)
+{ // scales the 64 coefficients of one block in place by the matching entries of dequant
+   int i;
+   for (i=0; i < 64; ++i)
+      data[i] *= dequant[i]; // product is stored back into a short
+}
+
+static void stbi__jpeg_finish(stbi__jpeg *z)
+{ // for progressive images, turns the accumulated coefficient blocks into pixels; does nothing otherwise
+   if (z->progressive) {
+      // dequantize and idct the data
+      int i,j,n;
+      for (n=0; n < z->s->img_n; ++n) {
+         int w = (z->img_comp[n].x+7) >> 3; // component size in 8x8 blocks, rounded up
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
+               stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); // modifies the stored coefficients in place
+               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
+            }
+         }
+      }
+   }
+}
+
+static int stbi__process_marker(stbi__jpeg *z, int m)
+{ // handles the DRI, DQT, DHT, APPn and COM segments for marker m; returns nonzero on success, and fails for any other marker
+   int L;
+   switch (m) {
+      case STBI__MARKER_none: // no marker found
+         return stbi__err("expected marker","Corrupt JPEG");
+
+      case 0xDD: // DRI - specify restart interval
+         if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG");
+         z->restart_interval = stbi__get16be(z->s);
+         return 1;
+
+      case 0xDB: // DQT - define quantization table
+         L = stbi__get16be(z->s)-2; // remaining payload bytes after the 2-byte length field
+         while (L > 0) {
+            int q = stbi__get8(z->s);
+            int p = q >> 4, sixteen = (p != 0); // high nibble: 0 = 8-bit entries, 1 = 16-bit entries
+            int t = q & 15,i; // low nibble: destination table index
+            if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG");
+            if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
+
+            for (i=0; i < 64; ++i)
+               z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
+            L -= (sixteen ? 129 : 65); // one header byte plus 64 entries of 2 or 1 bytes each
+         }
+         return L==0; // fails (without setting an error string) if the tables did not exactly fill the segment
+
+      case 0xC4: // DHT - define huffman table
+         L = stbi__get16be(z->s)-2;
+         while (L > 0) {
+            stbi_uc *v;
+            int sizes[16],i,n=0;
+            int q = stbi__get8(z->s);
+            int tc = q >> 4; // table class: 0 selects huff_dc, otherwise huff_ac
+            int th = q & 15; // table index
+            if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG");
+            for (i=0; i < 16; ++i) {
+               sizes[i] = stbi__get8(z->s);
+               n += sizes[i];
+            }
+            if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values!
+            L -= 17; // header byte plus the 16 size counts
+            if (tc == 0) {
+               if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
+               v = z->huff_dc[th].values;
+            } else {
+               if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0;
+               v = z->huff_ac[th].values;
+            }
+            for (i=0; i < n; ++i)
+               v[i] = stbi__get8(z->s);
+            if (tc != 0)
+               stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
+            L -= n;
+         }
+         return L==0; // same exact-length check as for DQT
+   }
+
+   // check for comment block or APP blocks
+   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
+      L = stbi__get16be(z->s);
+      if (L < 2) {
+         if (m == 0xFE)
+            return stbi__err("bad COM len","Corrupt JPEG");
+         else
+            return stbi__err("bad APP len","Corrupt JPEG");
+      }
+      L -= 2;
+
+      if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
+         static const unsigned char tag[5] = {'J','F','I','F','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 5; ++i) // always reads all 5 tag bytes, even after a mismatch, so L stays in sync
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 5;
+         if (ok)
+            z->jfif = 1;
+      } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
+         static const unsigned char tag[6] = {'A','d','o','b','e','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 6; ++i)
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 6;
+         if (ok) {
+            stbi__get8(z->s); // version
+            stbi__get16be(z->s); // flags0
+            stbi__get16be(z->s); // flags1
+            z->app14_color_transform = stbi__get8(z->s); // color transform
+            L -= 6; // the 6 bytes just read: 1 + 2 + 2 + 1
+         }
+      }
+
+      stbi__skip(z->s, L); // skip whatever remains of the segment
+      return 1;
+   }
+
+   return stbi__err("unknown marker","Corrupt JPEG");
+}
+
+// after we see SOS
+static int stbi__process_scan_header(stbi__jpeg *z)
+{ // parses the scan header into scan_n, order[], per-component hd/ha and spec_*/succ_*; returns 1 on success, otherwise the stbi__err result
+   int i;
+   int Ls = stbi__get16be(z->s);
+   z->scan_n = stbi__get8(z->s);
+   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG");
+   if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG");
+   for (i=0; i < z->scan_n; ++i) {
+      int id = stbi__get8(z->s), which;
+      int q = stbi__get8(z->s);
+      for (which = 0; which < z->s->img_n; ++which) // find the frame component whose id matches this scan entry
+         if (z->img_comp[which].id == id)
+            break;
+      if (which == z->s->img_n) return stbi__err("bad SOS component id","Corrupt JPEG"); // no match; report a reason like every other failure path here
+      z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG");
+      z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG");
+      z->order[i] = which;
+   }
+
+   {
+      int aa;
+      z->spec_start = stbi__get8(z->s);
+      z->spec_end   = stbi__get8(z->s); // should be 63, but might be 0
+      aa = stbi__get8(z->s);
+      z->succ_high = (aa >> 4);
+      z->succ_low  = (aa & 15);
+      if (z->progressive) {
+         if (z->spec_start > 63 || z->spec_end > 63  || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
+            return stbi__err("bad SOS", "Corrupt JPEG");
+      } else { // baseline scans must cover the full spectrum with no successive approximation
+         if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG");
+         if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG");
+         z->spec_end = 63; // forced to 63 regardless of the value read above
+      }
+   }
+
+   return 1;
+}
+
+static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why)
+{ // frees the buffers of the first ncomp components and returns why unchanged, so callers can write "return free(..., err)"
+   int i;
+   for (i=0; i < ncomp; ++i) {
+      if (z->img_comp[i].raw_data) {
+         STBI_FREE(z->img_comp[i].raw_data);
+         z->img_comp[i].raw_data = NULL;
+         z->img_comp[i].data = NULL; // data pointed into raw_data, so it is cleared along with it
+      }
+      if (z->img_comp[i].raw_coeff) {
+         STBI_FREE(z->img_comp[i].raw_coeff);
+         z->img_comp[i].raw_coeff = 0;
+         z->img_comp[i].coeff = 0; // likewise an aligned pointer into raw_coeff
+      }
+      if (z->img_comp[i].linebuf) {
+         STBI_FREE(z->img_comp[i].linebuf);
+         z->img_comp[i].linebuf = NULL;
+      }
+   }
+   return why;
+}
+
+static int stbi__process_frame_header(stbi__jpeg *z, int scan)
+{ // parses the frame header; when scan == STBI__SCAN_load it also derives the MCU layout and allocates the per-component buffers
+   stbi__context *s = z->s;
+   int Lf,p,i,q, h_max=1,v_max=1,c;
+   Lf = stbi__get16be(s);         if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG
+   p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
+   s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
+   s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+   c = stbi__get8(s);
+   if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG");
+   s->img_n = c;
+   for (i=0; i < c; ++i) {
+      z->img_comp[i].data = NULL;
+      z->img_comp[i].linebuf = NULL;
+   }
+
+   if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
+
+   z->rgb = 0; // counts components whose ids spell 'R','G','B' in order (only checked for 3-component images)
+   for (i=0; i < s->img_n; ++i) {
+      static const unsigned char rgb[3] = { 'R', 'G', 'B' };
+      z->img_comp[i].id = stbi__get8(s);
+      if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
+         ++z->rgb;
+      q = stbi__get8(s);
+      z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
+      z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
+      z->img_comp[i].tq = stbi__get8(s);  if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG");
+   }
+
+   if (scan != STBI__SCAN_load) return 1; // header-only callers stop here, before anything is allocated
+
+   if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode");
+
+   for (i=0; i < s->img_n; ++i) {
+      if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
+      if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
+   }
+
+   // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios
+   // and I've never seen a non-corrupted JPEG file actually use them
+   for (i=0; i < s->img_n; ++i) {
+      if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG");
+      if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG");
+   }
+
+   // compute interleaved mcu info
+   z->img_h_max = h_max;
+   z->img_v_max = v_max;
+   z->img_mcu_w = h_max * 8;
+   z->img_mcu_h = v_max * 8;
+   // these sizes can't be more than 17 bits
+   z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w; // MCU counts across and down, rounded up
+   z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
+
+   for (i=0; i < s->img_n; ++i) {
+      // number of effective pixels (e.g. for non-interleaved MCU)
+      z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
+      z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
+      // to simplify generation, we'll allocate enough memory to decode
+      // the bogus oversized data from using interleaved MCUs and their
+      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
+      // discard the extra data until colorspace conversion
+      //
+      // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
+      // so these muls can't overflow with 32-bit ints (which we require)
+      z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
+      z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
+      z->img_comp[i].coeff = 0;
+      z->img_comp[i].raw_coeff = 0;
+      z->img_comp[i].linebuf = NULL;
+      z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); // presumably w2*h2 plus 15 bytes of slack for the alignment below
+      if (z->img_comp[i].raw_data == NULL)
+         return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); // releases components 0..i before failing
+      // align blocks for idct using mmx/sse
+      z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15); // round the address up to a multiple of 16
+      if (z->progressive) {
+         // w2, h2 are multiples of 8 (see above)
+         z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
+         z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
+         z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
+         if (z->img_comp[i].raw_coeff == NULL)
+            return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
+         z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
+      }
+   }
+
+   return 1;
+}
+
+// use comparisons since in some cases we handle more than one case (e.g. SOF)
+#define stbi__DNL(x)         ((x) == 0xdc) // segment whose line count the decoder checks against img_y
+#define stbi__SOI(x)         ((x) == 0xd8) // must be the first marker in the stream
+#define stbi__EOI(x)         ((x) == 0xd9) // ends the marker loop in stbi__decode_jpeg_image
+#define stbi__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) // the three frame-header variants that are accepted
+#define stbi__SOS(x)         ((x) == 0xda) // scan header, followed by entropy-coded data
+
+#define stbi__SOF_progressive(x)   ((x) == 0xc2) // the one SOF variant that selects progressive decoding
+
+static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
+{ // reads SOI, processes every marker up to the frame header, then parses the frame header; returns 0 on failure
+   int m;
+   z->jfif = 0;
+   z->app14_color_transform = -1; // valid values are 0,1,2
+   z->marker = STBI__MARKER_none; // initialize cached marker to empty
+   m = stbi__get_marker(z);
+   if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG");
+   if (scan == STBI__SCAN_type) return 1; // type probing only needs the SOI check
+   m = stbi__get_marker(z);
+   while (!stbi__SOF(m)) {
+      if (!stbi__process_marker(z,m)) return 0;
+      m = stbi__get_marker(z);
+      while (m == STBI__MARKER_none) {
+         // some files have extra padding after their blocks, so ok, we'll scan
+         if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG");
+         m = stbi__get_marker(z);
+      }
+   }
+   z->progressive = stbi__SOF_progressive(m);
+   if (!stbi__process_frame_header(z, scan)) return 0;
+   return 1;
+}
+
+static stbi_uc stbi__skip_jpeg_junk_at_end(stbi__jpeg *j)
+{ // returns the next marker code found in the stream, or STBI__MARKER_none if end of input is reached first
+   // some JPEGs have junk at end, skip over it but if we find what looks
+   // like a valid marker, resume there
+   while (!stbi__at_eof(j->s)) {
+      stbi_uc x = stbi__get8(j->s);
+      while (x == 0xff) { // might be a marker
+         if (stbi__at_eof(j->s)) return STBI__MARKER_none;
+         x = stbi__get8(j->s);
+         if (x != 0x00 && x != 0xff) {
+            // not a stuffed zero or lead-in to another marker, looks
+            // like an actual marker, return it
+            return x;
+         }
+         // stuffed zero has x=0 now which ends the loop, meaning we go
+         // back to regular scan loop.
+         // repeated 0xff keeps trying to read the next byte of the marker.
+      }
+   }
+   return STBI__MARKER_none;
+}
+
+// decode image to YCbCr format
+static int stbi__decode_jpeg_image(stbi__jpeg *j)
+{ // returns 0 on a hard error; 1 once EOI is reached, or when marker parsing stops early and the data decoded so far is kept
+   int m;
+   for (m = 0; m < 4; m++) {
+      j->img_comp[m].raw_data = NULL;
+      j->img_comp[m].raw_coeff = NULL;
+   }
+   j->restart_interval = 0;
+   if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0;
+   m = stbi__get_marker(j);
+   while (!stbi__EOI(m)) {
+      if (stbi__SOS(m)) {
+         if (!stbi__process_scan_header(j)) return 0;
+         if (!stbi__parse_entropy_coded_data(j)) return 0;
+         if (j->marker == STBI__MARKER_none ) {
+            j->marker = stbi__skip_jpeg_junk_at_end(j);
+            // if we reach eof without hitting a marker, stbi__get_marker() below yields STBI__MARKER_none and stbi__process_marker() then ends the loop
+         }
+         m = stbi__get_marker(j);
+         if (STBI__RESTART(m))
+            m = stbi__get_marker(j);
+      } else if (stbi__DNL(m)) {
+         int Ld = stbi__get16be(j->s);
+         stbi__uint32 NL = stbi__get16be(j->s);
+         if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
+         if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
+         m = stbi__get_marker(j);
+      } else {
+         if (!stbi__process_marker(j, m)) break; // bad marker or truncated stream: keep what was decoded, but still fall through to the progressive finish below
+         m = stbi__get_marker(j);
+      }
+   }
+   if (j->progressive)
+      stbi__jpeg_finish(j); // progressive planes hold no pixels until this dequantize + IDCT step has run
+   return 1;
+}
+
+// static jfif-centered resampling (across block boundaries)
+// resample_row_func: common signature of the row kernels below; each returns the row to use (out, or an input row unchanged)
+typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1,
+                                    int w, int hs);
+// stbi__div4: shift right by 2 and narrow to stbi_uc; callers add the rounding bias themselves
+#define stbi__div4(x) ((stbi_uc) ((x) >> 2))
+
+static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // pass-through kernel: the near input row is returned untouched
+   STBI_NOTUSED(hs);
+   STBI_NOTUSED(w);
+   STBI_NOTUSED(in_far); STBI_NOTUSED(out);
+   return in_near;
+}
+
+static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // vertical 2x: every output sample is a rounded 3:1 blend of near and far rows
+   int x;
+   STBI_NOTUSED(hs);
+   for (x=0; x < w; ++x)
+      out[x] = stbi__div4(in_far[x] + 3*in_near[x] + 2);
+   return out;
+}
+
+static stbi_uc*  stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // horizontal 2x upsample: every input sample yields two outputs, each a
+   // rounded 3:1 blend of the sample with its left or right neighbour
+   int i;
+   stbi_uc *src = in_near;
+
+   STBI_NOTUSED(in_far);
+   STBI_NOTUSED(hs);
+   if (w == 1) {
+      // a lone sample has no neighbour to blend with, so just duplicate it
+      out[0] = out[1] = src[0];
+      return out;
+   }
+   // left edge: the first output is a straight copy
+   out[0] = src[0];
+   out[1] = stbi__div4(src[0]*3 + src[1] + 2);
+   for (i=1; i < w-1; ++i) {
+      int center = 3*src[i]+2; // term shared by both outputs, rounding bias included
+      out[i*2+0] = stbi__div4(center+src[i-1]);
+      out[i*2+1] = stbi__div4(center+src[i+1]);
+   }
+   // right edge (i equals w-1 here whenever w >= 2); last output is a straight copy
+   out[i*2+0] = stbi__div4(src[w-2]*3 + src[w-1] + 2);
+   out[i*2+1] = src[w-1];
+   return out;
+}
+
+#define stbi__div16(x) ((stbi_uc) ((x) >> 4))
+
+static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // 2x upsample in both directions: blend the two rows per column (3:1),
+   // then blend neighbouring column sums (3:1) into two outputs per input
+   int x, prev, cur;
+   STBI_NOTUSED(hs);
+
+   cur = 3*in_near[0] + in_far[0]; // column 0 after the vertical blend, still scaled by 4
+   if (w == 1) {
+      out[0] = out[1] = stbi__div4(cur + 2);
+      return out;
+   }
+
+   out[0] = stbi__div4(cur+2);
+   for (x=1; x < w; ++x) {
+      prev = cur;
+      cur = 3*in_near[x]+in_far[x];
+      out[x*2-1] = stbi__div16(3*prev + cur + 8);
+      out[x*2  ] = stbi__div16(3*cur + prev + 8);
+   }
+   out[w*2-1] = stbi__div4(cur+2);
+   return out;
+}
+
+#if defined(STBI_SSE2) || defined(STBI_NEON)
+static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // need to generate 2x2 samples for every one in input (SIMD for groups of 8 input pixels, scalar code for the rest)
+   int i=0,t0,t1;
+
+   if (w == 1) {
+      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
+      return out;
+   }
+
+   t1 = 3*in_near[0] + in_far[0];
+   // process groups of 8 pixels for as long as we can.
+   // note we can't handle the last pixel in a row in this loop
+   // because we need to handle the filter boundary conditions.
+   for (; i < ((w-1) & ~7); i += 8) { // bound keeps i+8 <= w-1, so the in_near[i+8]/in_far[i+8] reads below stay inside the row
+#if defined(STBI_SSE2)
+      // load and perform the vertical filtering pass
+      // this uses 3*x + y = 4*x + (y - x)
+      __m128i zero  = _mm_setzero_si128();
+      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
+      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
+      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
+      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
+      __m128i diff  = _mm_sub_epi16(farw, nearw);
+      __m128i nears = _mm_slli_epi16(nearw, 2);
+      __m128i curr  = _mm_add_epi16(nears, diff); // current row
+
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of 8 pixels added in.
+      __m128i prv0 = _mm_slli_si128(curr, 2);
+      __m128i nxt0 = _mm_srli_si128(curr, 2);
+      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
+      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
+
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+      // note the shared term.
+      __m128i bias  = _mm_set1_epi16(8);
+      __m128i curs = _mm_slli_epi16(curr, 2);
+      __m128i prvd = _mm_sub_epi16(prev, curr);
+      __m128i nxtd = _mm_sub_epi16(next, curr);
+      __m128i curb = _mm_add_epi16(curs, bias);
+      __m128i even = _mm_add_epi16(prvd, curb);
+      __m128i odd  = _mm_add_epi16(nxtd, curb);
+
+      // interleave even and odd pixels, then undo scaling.
+      __m128i int0 = _mm_unpacklo_epi16(even, odd);
+      __m128i int1 = _mm_unpackhi_epi16(even, odd);
+      __m128i de0  = _mm_srli_epi16(int0, 4);
+      __m128i de1  = _mm_srli_epi16(int1, 4);
+
+      // pack and write output
+      __m128i outv = _mm_packus_epi16(de0, de1);
+      _mm_storeu_si128((__m128i *) (out + i*2), outv);
+#elif defined(STBI_NEON)
+      // load and perform the vertical filtering pass
+      // this uses 3*x + y = 4*x + (y - x)
+      uint8x8_t farb  = vld1_u8(in_far + i);
+      uint8x8_t nearb = vld1_u8(in_near + i);
+      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
+      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
+      int16x8_t curr  = vaddq_s16(nears, diff); // current row
+
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of 8 pixels added in.
+      int16x8_t prv0 = vextq_s16(curr, curr, 7);
+      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
+      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
+      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
+
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+      // note the shared term.
+      int16x8_t curs = vshlq_n_s16(curr, 2);
+      int16x8_t prvd = vsubq_s16(prev, curr);
+      int16x8_t nxtd = vsubq_s16(next, curr);
+      int16x8_t even = vaddq_s16(curs, prvd);
+      int16x8_t odd  = vaddq_s16(curs, nxtd);
+
+      // undo scaling and round, then store with even/odd phases interleaved
+      uint8x8x2_t o;
+      o.val[0] = vqrshrun_n_s16(even, 4);
+      o.val[1] = vqrshrun_n_s16(odd,  4);
+      vst2_u8(out + i*2, o);
+#endif
+
+      // "previous" value for next iter
+      t1 = 3*in_near[i+7] + in_far[i+7];
+   }
+
+   t0 = t1; // scalar tail starts here; t1 still holds the column sum left of position i (or column 0 when no SIMD group ran)
+   t1 = 3*in_near[i] + in_far[i];
+   out[i*2] = stbi__div16(3*t1 + t0 + 8); // only the even output of pixel i is written here, not out[i*2-1]
+
+   for (++i; i < w; ++i) { // remaining pixels use the same formulas as the scalar stbi__resample_row_hv_2 loop
+      t0 = t1;
+      t1 = 3*in_near[i]+in_far[i];
+      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
+      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
+   }
+   out[w*2-1] = stbi__div4(t1+2);
+
+   STBI_NOTUSED(hs);
+
+   return out;
+}
+#endif
+
+static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // nearest-neighbour expansion: each input sample is repeated hs times
+   int src, rep;
+   stbi_uc *dst = out; // running write position, i.e. out + src*hs + rep
+   STBI_NOTUSED(in_far);
+   for (src=0; src < w; ++src)
+      for (rep=0; rep < hs; ++rep) *dst++ = in_near[src];
+   return out;
+}
+
+// reduced-precision fixed-point YCbCr-to-RGB conversion, introduced so that
+// the scalar code and the SIMD code produce the same results
+#define stbi__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
+static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
+{
+   int i;
+   for (i=0; i < count; ++i, out += step) {
+      int luma = (y[i] << 20) + (1<<19); // luma in 20-bit fixed point plus the rounding half
+      int cr = pcr[i] - 128;
+      int cb = pcb[i] - 128;
+      int r = luma +  cr* stbi__float2fixed(1.40200f);
+      int g = luma + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      int b = luma                                     +   cb* stbi__float2fixed(1.77200f);
+      // drop the 20 fractional bits
+      r >>= 20;
+      g >>= 20;
+      b >>= 20;
+      // clamp to 0..255; the unsigned compare catches both out-of-range directions
+      if ((unsigned) r > 255) r = (r < 0) ? 0 : 255;
+      if ((unsigned) g > 255) g = (g < 0) ? 0 : 255;
+      if ((unsigned) b > 255) b = (b < 0) ? 0 : 255;
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255;
+   }
+}
+
+#if defined(STBI_SSE2) || defined(STBI_NEON)
+static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step) // same contract as stbi__YCbCr_to_RGB_row
+{
+   int i = 0; // shared by the SIMD loops and the scalar loop at the end, which resumes where SIMD stopped
+
+#ifdef STBI_SSE2
+   // step == 3 is pretty ugly on the final interleave, and i'm not convinced
+   // it's useful in practice (you wouldn't use it for textures, for example).
+   // so just accelerate step == 4 case.
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      __m128i signflip  = _mm_set1_epi8(-0x80);
+      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
+      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
+      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
+      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
+      __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
+      __m128i xw = _mm_set1_epi16(255); // alpha channel
+
+      for (; i+7 < count; i += 8) { // 8 pixels per iteration; fewer than 8 left over go to the scalar loop
+         // load
+         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
+         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
+         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
+         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
+         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
+
+         // unpack to short (and left-shift cr, cb by 8)
+         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
+         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
+         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
+
+         // color transform
+         __m128i yws = _mm_srli_epi16(yw, 4);
+         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
+         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
+         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
+         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
+         __m128i rws = _mm_add_epi16(cr0, yws);
+         __m128i gwt = _mm_add_epi16(cb0, yws);
+         __m128i bws = _mm_add_epi16(yws, cb1);
+         __m128i gws = _mm_add_epi16(gwt, cr1);
+
+         // descale
+         __m128i rw = _mm_srai_epi16(rws, 4);
+         __m128i bw = _mm_srai_epi16(bws, 4);
+         __m128i gw = _mm_srai_epi16(gws, 4);
+
+         // back to byte, set up for transpose
+         __m128i brb = _mm_packus_epi16(rw, bw);
+         __m128i gxb = _mm_packus_epi16(gw, xw);
+
+         // transpose to interleave channels
+         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
+         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
+         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
+         __m128i o1 = _mm_unpackhi_epi16(t0, t1);
+
+         // store
+         _mm_storeu_si128((__m128i *) (out + 0), o0);
+         _mm_storeu_si128((__m128i *) (out + 16), o1);
+         out += 32; // 8 pixels * 4 bytes each
+      }
+   }
+#endif
+
+#ifdef STBI_NEON
+   // in this version, step=3 support would be easy to add. but is there demand?
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      uint8x8_t signflip = vdup_n_u8(0x80);
+      int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
+      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
+      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
+      int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));
+
+      for (; i+7 < count; i += 8) { // 8 pixels per iteration; fewer than 8 left over go to the scalar loop
+         // load
+         uint8x8_t y_bytes  = vld1_u8(y + i);
+         uint8x8_t cr_bytes = vld1_u8(pcr + i);
+         uint8x8_t cb_bytes = vld1_u8(pcb + i);
+         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
+         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
+
+         // expand to s16
+         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
+         int16x8_t crw = vshll_n_s8(cr_biased, 7);
+         int16x8_t cbw = vshll_n_s8(cb_biased, 7);
+
+         // color transform
+         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
+         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
+         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
+         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
+         int16x8_t rws = vaddq_s16(yws, cr0);
+         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
+         int16x8_t bws = vaddq_s16(yws, cb1);
+
+         // undo scaling, round, convert to byte
+         uint8x8x4_t o;
+         o.val[0] = vqrshrun_n_s16(rws, 4);
+         o.val[1] = vqrshrun_n_s16(gws, 4);
+         o.val[2] = vqrshrun_n_s16(bws, 4);
+         o.val[3] = vdup_n_u8(255);
+
+         // store, interleaving r/g/b/a
+         vst4_u8(out, o);
+         out += 8*4;
+      }
+   }
+#endif
+
+   for (; i < count; ++i) { // scalar path: the leftovers after SIMD, or every pixel when step != 4; same arithmetic as stbi__YCbCr_to_RGB_row
+      int y_fixed = (y[i] << 20) + (1<<19); // rounding
+      int r,g,b;
+      int cr = pcr[i] - 128;
+      int cb = pcb[i] - 128;
+      r = y_fixed + cr* stbi__float2fixed(1.40200f);
+      g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                                   +   cb* stbi__float2fixed(1.77200f);
+      r >>= 20;
+      g >>= 20;
+      b >>= 20;
+      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
+      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
+      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255;
+      out += step;
+   }
+}
+#endif
+
+// install the kernel function pointers: scalar defaults, SIMD where enabled
+static void stbi__setup_jpeg(stbi__jpeg *j)
+{
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; // plain C baseline
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
+   j->idct_block_kernel = stbi__idct_block;
+#if defined(STBI_SSE2)
+   // SSE2 build: switch over only when the runtime check passes
+   if (stbi__sse2_available()) {
+      j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+      j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+      j->idct_block_kernel = stbi__idct_simd;
+   }
+#endif
+#if defined(STBI_NEON)
+   // NEON build: the SIMD kernels are used unconditionally
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+   j->idct_block_kernel = stbi__idct_simd;
+#endif
+}
+
+// clean up the temporary buffers of all j->s->img_n components
+static void stbi__cleanup_jpeg(stbi__jpeg *j) {
+   int ncomp = j->s->img_n;
+   stbi__free_jpeg_components(j, ncomp, 0);
+}
+
+typedef struct // per-component upsampling state used by load_jpeg_image
+{
+   resample_row_func resample; // row kernel selected from the hs/vs pair
+   stbi_uc *line0,*line1; // the two source rows handed to the kernel; line1 advances and line0 takes its old value
+   int hs,vs;   // expansion factor in each axis
+   int w_lores; // horizontal pixels pre-expansion
+   int ystep;   // how far through vertical expansion we are
+   int ypos;    // which pre-expansion row we're on
+} stbi__resample;
+
+// multiply two 0..255 values into a rounded 0..255 result using shifts only
+static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
+{
+   unsigned int scaled = x*y + 128, carry = scaled >> 8;
+   return (stbi_uc) ((scaled + carry) >> 8);
+}
+
+static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp) // decode the whole image and return a newly allocated buffer of interleaved 8-bit pixels; failure paths return NULL or the stbi__errpuc result
+{
+   int n, decode_n, is_rgb;
+   z->s->img_n = 0; // make stbi__cleanup_jpeg safe
+
+   // validate req_comp
+   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
+
+   // load a jpeg image from whichever source, but leave in YCbCr format
+   if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
+
+   // determine actual number of components to generate
+   n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
+
+   is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif)); // 3-component data that is copied through below without YCbCr conversion
+
+   if (z->s->img_n == 3 && n < 3 && !is_rgb)
+      decode_n = 1; // 1- or 2-channel output from YCbCr data only reads the first component
+   else
+      decode_n = z->s->img_n;
+
+   // nothing to do if no components requested; check this now to avoid
+   // accessing uninitialized coutput[0] later
+   if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; }
+
+   // resample and color-convert
+   {
+      int k;
+      unsigned int i,j;
+      stbi_uc *output;
+      stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL }; // per-component row returned by the resampler for the current output row
+
+      stbi__resample res_comp[4];
+
+      for (k=0; k < decode_n; ++k) {
+         stbi__resample *r = &res_comp[k];
+
+         // allocate line buffer big enough for upsampling off the edges
+         // with upsample factor of 4
+         z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3);
+         if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
+
+         r->hs      = z->img_h_max / z->img_comp[k].h;
+         r->vs      = z->img_v_max / z->img_comp[k].v;
+         r->ystep   = r->vs >> 1;
+         r->w_lores = (z->s->img_x + r->hs-1) / r->hs; // image width divided by hs, rounded up
+         r->ypos    = 0;
+         r->line0   = r->line1 = z->img_comp[k].data;
+
+         if      (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
+         else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
+         else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
+         else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
+         else                               r->resample = stbi__resample_row_generic;
+      }
+
+      // can't error after this so, this is safe
+      output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); // NOTE(review): the trailing 1 looks like slack for the out[3] stores below when n == 3 - confirm against stbi__malloc_mad3
+      if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
+
+      // now go ahead and resample
+      for (j=0; j < z->s->img_y; ++j) {
+         stbi_uc *out = output + n * z->s->img_x * j; // start of output row j
+         for (k=0; k < decode_n; ++k) {
+            stbi__resample *r = &res_comp[k];
+            int y_bot = r->ystep >= (r->vs >> 1); // selects which of line0/line1 is passed as the "near" row
+            coutput[k] = r->resample(z->img_comp[k].linebuf,
+                                     y_bot ? r->line1 : r->line0,
+                                     y_bot ? r->line0 : r->line1,
+                                     r->w_lores, r->hs);
+            if (++r->ystep >= r->vs) { // finished vs output rows: move on to the next source row
+               r->ystep = 0;
+               r->line0 = r->line1;
+               if (++r->ypos < z->img_comp[k].y) // line1 stops advancing once ypos reaches the component's y
+                  r->line1 += z->img_comp[k].w2;
+            }
+         }
+         if (n >= 3) { // 3- or 4-channel output
+            stbi_uc *y = coutput[0];
+            if (z->s->img_n == 3) {
+               if (is_rgb) {
+                  for (i=0; i < z->s->img_x; ++i) {
+                     out[0] = y[i];
+                     out[1] = coutput[1][i];
+                     out[2] = coutput[2][i];
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else {
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
+            } else if (z->s->img_n == 4) {
+               if (z->app14_color_transform == 0) { // CMYK
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc m = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(coutput[0][i], m);
+                     out[1] = stbi__blinn_8x8(coutput[1][i], m);
+                     out[2] = stbi__blinn_8x8(coutput[2][i], m);
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else if (z->app14_color_transform == 2) { // YCCK
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc m = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(255 - out[0], m);
+                     out[1] = stbi__blinn_8x8(255 - out[1], m);
+                     out[2] = stbi__blinn_8x8(255 - out[2], m);
+                     out += n;
+                  }
+               } else { // YCbCr + alpha?  Ignore the fourth channel for now
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
+            } else
+               for (i=0; i < z->s->img_x; ++i) {
+                  out[0] = out[1] = out[2] = y[i];
+                  out[3] = 255; // not used if n==3
+                  out += n;
+               }
+         } else { // 1- or 2-channel output
+            if (is_rgb) {
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i)
+                     *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+               else {
+                  for (i=0; i < z->s->img_x; ++i, out += 2) {
+                     out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+                     out[1] = 255;
+                  }
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
+               for (i=0; i < z->s->img_x; ++i) {
+                  stbi_uc m = coutput[3][i];
+                  stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
+                  stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
+                  stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
+                  out[0] = stbi__compute_y(r, g, b);
+                  out[1] = 255; // NOTE(review): stored even when n == 1, then overwritten by the next pixel's out[0] - confirm the last pixel of the image stays inside the allocation
+                  out += n;
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
+               for (i=0; i < z->s->img_x; ++i) {
+                  out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
+                  out[1] = 255;
+                  out += n;
+               }
+            } else {
+               stbi_uc *y = coutput[0];
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
+               else
+                  for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; }
+            }
+         }
+      }
+      stbi__cleanup_jpeg(z);
+      *out_x = z->s->img_x;
+      *out_y = z->s->img_y;
+      if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
+      return output;
+   }
+}
+
+static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi__jpeg *dec = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg)); // decoder state lives only for this call
+   unsigned char *pixels;
+   STBI_NOTUSED(ri);
+   if (dec == NULL) return stbi__errpuc("outofmem", "Out of memory");
+   memset(dec, 0, sizeof(*dec));
+   dec->s = s;
+   stbi__setup_jpeg(dec);
+   pixels = load_jpeg_image(dec, x, y, comp, req_comp);
+   STBI_FREE(dec);
+   return pixels;
+}
+
+static int stbi__jpeg_test(stbi__context *s)
+{
+   int is_jpeg;
+   stbi__jpeg *probe = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
+   if (probe == NULL) return stbi__err("outofmem", "Out of memory");
+   memset(probe, 0, sizeof(*probe));
+   probe->s = s;
+   stbi__setup_jpeg(probe);
+   is_jpeg = stbi__decode_jpeg_header(probe, STBI__SCAN_type); // header scan in "type" mode only
+   stbi__rewind(s); // rewind whatever the outcome
+   STBI_FREE(probe);
+   return is_jpeg;
+}
+
+static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
+{
+   if (stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
+      if (x != NULL) *x = j->s->img_x; // each output pointer is optional
+      if (y != NULL) *y = j->s->img_y;
+      if (comp != NULL) *comp = (j->s->img_n < 3) ? 1 : 3;
+      return 1;
+   }
+   stbi__rewind( j->s ); // header scan failed: rewind and report 0
+   return 0;
+}
+
+static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int ok;
+   stbi__jpeg *hdr = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg))); // scratch decoder, freed before returning
+   if (hdr == NULL) return stbi__err("outofmem", "Out of memory");
+   memset(hdr, 0, sizeof(*hdr));
+   hdr->s = s;
+   ok = stbi__jpeg_info_raw(hdr, x, y, comp);
+   STBI_FREE(hdr);
+   return ok;
+}
+#endif
+
+// public domain zlib decode    v0.2  Sean Barrett 2006-11-18
+//    simple implementation
+//      - all input must be provided in an upfront buffer
+//      - all output is written to a single output buffer (can malloc/realloc)
+//    performance
+//      - fast huffman
+
+#ifndef STBI_NO_ZLIB
+
+// fast-way is faster to check than jpeg huffman, but slow way is slower
+#define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
+#define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1) // selects the low STBI__ZFAST_BITS bits of the bit buffer
+#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet
+// decode tables for one Huffman alphabet, filled in by stbi__zbuild_huffman
+// zlib-style huffman encoding
+// (jpegs packs from left, zlib from right, so can't share code)
+typedef struct
+{
+   stbi__uint16 fast[1 << STBI__ZFAST_BITS]; // indexed by the next STBI__ZFAST_BITS input bits; entry is (length << 9) | symbol, 0 if not resolvable here
+   stbi__uint16 firstcode[16]; // first code of each length
+   int maxcode[17]; // per length: one past the last code, preshifted to 16 bits; [16] is a sentinel
+   stbi__uint16 firstsymbol[16]; // index into size[]/value[] of the first symbol of each length
+   stbi_uc  size[STBI__ZNSYMS]; // code length of each entry, in code order
+   stbi__uint16 value[STBI__ZNSYMS]; // symbol of each entry, in code order
+} stbi__zhuffman;
+
+stbi_inline static int stbi__bitreverse16(int n)
+{
+  int r = ((n & 0x5555) << 1) | ((n & 0xAAAA) >>  1); // swap adjacent bits
+  r     = ((r & 0x3333) << 2) | ((r & 0xCCCC) >>  2); // swap bit pairs
+  r     = ((r & 0x0F0F) << 4) | ((r & 0xF0F0) >>  4); // swap nibbles
+  r     = ((r & 0x00FF) << 8) | ((r & 0xFF00) >>  8); // swap bytes
+  return r;
+}
+
+stbi_inline static int stbi__bit_reverse(int v, int bits)
+{
+   // reverse the full 16-bit word, then shift down so only `bits` bits remain (11 bits -> shift by 5)
+   int reversed16 = stbi__bitreverse16(v);
+   STBI_ASSERT(bits <= 16);
+   return reversed16 >> (16-bits);
+}
+
+static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num) // fill *z from num code lengths in sizelist; returns 1 on success, the stbi__err result on bad lengths
+{
+   int i,k=0;
+   int code, next_code[16], sizes[17];
+
+   // DEFLATE spec for generating codes
+   memset(sizes, 0, sizeof(sizes));
+   memset(z->fast, 0, sizeof(z->fast));
+   for (i=0; i < num; ++i)
+      ++sizes[sizelist[i]]; // histogram of code lengths; NOTE(review): assumes every entry is <= 16 (sizes has 17 slots) - confirm callers guarantee this
+   sizes[0] = 0; // length 0 entries get no code (they are skipped in the assignment loop below)
+   for (i=1; i < 16; ++i)
+      if (sizes[i] > (1 << i)) // more codes of length i than i bits can express
+         return stbi__err("bad sizes", "Corrupt PNG");
+   code = 0;
+   for (i=1; i < 16; ++i) {
+      next_code[i] = code;
+      z->firstcode[i] = (stbi__uint16) code;
+      z->firstsymbol[i] = (stbi__uint16) k;
+      code = (code + sizes[i]);
+      if (sizes[i])
+         if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG"); // last code of this length does not fit in i bits
+      z->maxcode[i] = code << (16-i); // preshift for inner loop
+      code <<= 1;
+      k += sizes[i];
+   }
+   z->maxcode[16] = 0x10000; // sentinel
+   for (i=0; i < num; ++i) { // hand out codes to symbols in index order
+      int s = sizelist[i];
+      if (s) {
+         int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; // slot of this symbol in size[]/value[]
+         stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i); // fast-table entry: length above bit 9, symbol below
+         z->size [c] = (stbi_uc     ) s;
+         z->value[c] = (stbi__uint16) i;
+         if (s <= STBI__ZFAST_BITS) {
+            int j = stbi__bit_reverse(next_code[s],s);
+            while (j < (1 << STBI__ZFAST_BITS)) { // fill every fast-table slot whose low s bits equal the reversed code
+               z->fast[j] = fastv;
+               j += (1 << s);
+            }
+         }
+         ++next_code[s];
+      }
+   }
+   return 1;
+}
+
+// zlib-from-memory implementation for PNG reading
+//    because PNG allows splitting the zlib stream arbitrarily,
+//    and it's annoying structurally to have PNG call ZLIB call PNG,
+//    we require PNG read all the IDATs and combine them into a single
+//    memory buffer
+// stbi__zbuf: decoder state - input cursor, bit buffer, output buffer and the two Huffman tables
+typedef struct
+{
+   stbi_uc *zbuffer, *zbuffer_end; // next input byte to read and end of input
+   int num_bits; // count of bits currently held in code_buffer
+   int hit_zeof_once; // set once stbi__zhuffman_decode has added its 16 padding bits at end of input
+   stbi__uint32 code_buffer; // bit buffer; new bytes are OR-ed in above the existing bits, bits are taken from the low end
+   // output buffer: zout is the write position within [zout_start, zout_end)
+   char *zout;
+   char *zout_start;
+   char *zout_end;
+   int   z_expandable; // nonzero if stbi__zexpand may grow the output buffer
+   // Huffman tables: z_length decodes literal/length symbols, z_distance decodes distance symbols
+   stbi__zhuffman z_length, z_distance;
+} stbi__zbuf;
+
+stbi_inline static int stbi__zeof(stbi__zbuf *z)
+{
+   return !(z->zbuffer < z->zbuffer_end); // nonzero once the read pointer has reached the end
+}
+
+stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z)
+{
+   if (stbi__zeof(z)) return 0; else return *z->zbuffer++; // past the end reads as 0 and does not advance
+}
+
+static void stbi__fill_bits(stbi__zbuf *z)
+{
+   for (;;) { // append whole bytes until more than 24 bits are buffered
+      if (z->code_buffer >= (1U << z->num_bits)) { // bits set above num_bits: state is inconsistent
+        z->zbuffer = z->zbuffer_end;  /* treat this as EOF so we fail. */
+        return;
+      }
+      z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits;
+      if ((z->num_bits += 8) > 24) return;
+   }
+}
+
+stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n)
+{
+   unsigned int mask = (1 << n) - 1, bits; // mask keeps the low n bits
+   if (z->num_bits < n) stbi__fill_bits(z);
+   bits = z->code_buffer & mask;
+   z->num_bits -= n;
+   z->code_buffer >>= n; // consume what was just read
+   return bits;
+}
+
+static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) // decode one symbol whose code is longer than STBI__ZFAST_BITS; returns the symbol or -1 on invalid data
+{
+   int b,s,k;
+   // not resolved by fast table, so compute it the slow way
+   // use jpeg approach, which requires MSbits at top
+   k = stbi__bit_reverse(a->code_buffer, 16);
+   for (s=STBI__ZFAST_BITS+1; ; ++s) // shortest length whose preshifted maxcode exceeds k; the maxcode[16] sentinel ends the search
+      if (k < z->maxcode[s])
+         break;
+   if (s >= 16) return -1; // invalid code!
+   // code size is s, so:
+   b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s]; // position of the code within size[]/value[]
+   if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere!
+   if (z->size[b] != s) return -1;  // was originally an assert, but report failure instead.
+   a->code_buffer >>= s; // consume the s bits of the code
+   a->num_bits -= s;
+   return z->value[b];
+}
+
+stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) // decode the next symbol using table z; returns the symbol, or -1 on bad or truncated input
+{
+   int b,s;
+   if (a->num_bits < 16) {
+      if (stbi__zeof(a)) {
+         if (!a->hit_zeof_once) {
+            // This is the first time we hit eof, insert 16 extra padding bits
+            // to allow us to keep going; if we actually consume any of them
+            // though, that is invalid data. This is caught later.
+            a->hit_zeof_once = 1;
+            a->num_bits += 16; // add 16 implicit zero bits
+         } else {
+            // We already inserted our extra 16 padding bits and are again
+            // out, this stream is actually prematurely terminated.
+            return -1;
+         }
+      } else {
+         stbi__fill_bits(a);
+      }
+   }
+   b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; // fast path: table lookup on the low STBI__ZFAST_BITS bits
+   if (b) { // nonzero entry packs (length << 9) | symbol
+      s = b >> 9;
+      a->code_buffer >>= s;
+      a->num_bits -= s;
+      return b & 511;
+   }
+   return stbi__zhuffman_decode_slowpath(a, z);
+}
+
+static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes; returns 1 on success, the stbi__err result on failure
+{
+   char *q;
+   unsigned int cur, limit, old_limit;
+   z->zout = zout; // store the caller's current write position back first
+   if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
+   cur   = (unsigned int) (z->zout - z->zout_start); // bytes written so far
+   limit = old_limit = (unsigned) (z->zout_end - z->zout_start); // current capacity
+   if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory"); // cur + n would wrap around
+   while (cur + n > limit) { // double the capacity until the request fits
+      if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory");
+      limit *= 2;
+   }
+   q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
+   STBI_NOTUSED(old_limit);
+   if (q == NULL) return stbi__err("outofmem", "Out of memory");
+   z->zout_start = q; // re-base all three output pointers on the reallocated block
+   z->zout       = q + cur;
+   z->zout_end   = q + limit;
+   return 1;
+}
+
+static const int stbi__zlength_base[31] = { // base match length, indexed by (length symbol - 257)
+   3,4,5,6,7,8,9,10,11,13,
+   15,17,19,23,27,31,35,43,51,59,
+   67,83,99,115,131,163,195,227,258,0,0 }; // the two trailing zeros are never reached: symbols >= 286 are rejected before the lookup
+// number of extra bits read and added to the base length, same indexing
+static const int stbi__zlength_extra[31]=
+{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
+// base match distance, indexed by distance symbol
+static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
+257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
+// number of extra bits read and added to the base distance, same indexing
+static const int stbi__zdist_extra[32] =
+{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13}; // 30 initializers; the last two elements are implicitly 0 and symbols >= 30 are rejected before the lookup
+
+static int stbi__parse_huffman_block(stbi__zbuf *a) // decode one Huffman-coded block into the output buffer; returns 1 on success, 0 on error
+{
+   char *zout = a->zout; // local copy of the write cursor; written back to a->zout at end of block
+   for(;;) {
+      int z = stbi__zhuffman_decode(a, &a->z_length);
+      if (z < 256) {
+         if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes
+         if (zout >= a->zout_end) {
+            if (!stbi__zexpand(a, zout, 1)) return 0;
+            zout = a->zout; // stbi__zexpand may have moved the buffer
+         }
+         *zout++ = (char) z; // symbols below 256 are literal bytes
+      } else {
+         stbi_uc *p;
+         int len,dist;
+         if (z == 256) { // symbol 256 ends the block
+            a->zout = zout;
+            if (a->hit_zeof_once && a->num_bits < 16) {
+               // The first time we hit zeof, we inserted 16 extra zero bits into our bit
+               // buffer so the decoder can just do its speculative decoding. But if we
+               // actually consumed any of those bits (which is the case when num_bits < 16),
+               // the stream actually read past the end so it is malformed.
+               return stbi__err("unexpected end","Corrupt PNG");
+            }
+            return 1;
+         }
+         if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data
+         z -= 257; // turn the length symbol into an index for the length tables
+         len = stbi__zlength_base[z];
+         if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
+         z = stbi__zhuffman_decode(a, &a->z_distance);
+         if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data
+         dist = stbi__zdist_base[z];
+         if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
+         if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG"); // match would start before the beginning of the output
+         if (len > a->zout_end - zout) {
+            if (!stbi__zexpand(a, zout, len)) return 0;
+            zout = a->zout;
+         }
+         p = (stbi_uc *) (zout - dist); // source of the copy, dist bytes behind the write cursor
+         if (dist == 1) { // run of one byte; common in images.
+            stbi_uc v = *p;
+            if (len) { do *zout++ = v; while (--len); }
+         } else {
+            if (len) { do *zout++ = *p++; while (--len); } // byte-by-byte so an overlapping source repeats as it is written
+         }
+      }
+   }
+}
+
+static int stbi__compute_huffman_codes(stbi__zbuf *a) // read the code-length tables sent in the stream and build a->z_length / a->z_distance; returns 1 on success, 0 on error
+{
+   static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; // slot that the i-th transmitted code-length size belongs to
+   stbi__zhuffman z_codelength;
+   stbi_uc lencodes[286+32+137];//padding for maximum single op
+   stbi_uc codelength_sizes[19];
+   int i,n;
+
+   int hlit  = stbi__zreceive(a,5) + 257; // number of literal/length code lengths
+   int hdist = stbi__zreceive(a,5) + 1; // number of distance code lengths
+   int hclen = stbi__zreceive(a,4) + 4; // number of 3-bit code-length sizes that follow
+   int ntot  = hlit + hdist;
+
+   memset(codelength_sizes, 0, sizeof(codelength_sizes)); // sizes that are not transmitted stay 0
+   for (i=0; i < hclen; ++i) {
+      int s = stbi__zreceive(a,3);
+      codelength_sizes[length_dezigzag[i]] = (stbi_uc) s;
+   }
+   if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
+
+   n = 0;
+   while (n < ntot) {
+      int c = stbi__zhuffman_decode(a, &z_codelength);
+      if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
+      if (c < 16)
+         lencodes[n++] = (stbi_uc) c; // symbols 0..15 are a code length stored directly
+      else {
+         stbi_uc fill = 0;
+         if (c == 16) { // repeat the previous length 3..6 times
+            c = stbi__zreceive(a,2)+3;
+            if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
+            fill = lencodes[n-1];
+         } else if (c == 17) { // 3..10 zero lengths
+            c = stbi__zreceive(a,3)+3;
+         } else if (c == 18) { // 11..138 zero lengths
+            c = stbi__zreceive(a,7)+11;
+         } else {
+            return stbi__err("bad codelengths", "Corrupt PNG");
+         }
+         if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG"); // the run must not extend past the last expected length
+         memset(lencodes+n, fill, c);
+         n += c;
+      }
+   }
+   if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG");
+   if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0; // first hlit entries describe the literal/length code
+   if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0; // the remaining hdist entries describe the distance code
+   return 1;
+}
+
+static int stbi__parse_uncompressed_block(stbi__zbuf *a) // copy one stored (type 0) block to the output; returns 1 on success, 0 on error
+{
+   stbi_uc header[4];
+   int len,nlen,k;
+   if (a->num_bits & 7)
+      stbi__zreceive(a, a->num_bits & 7); // discard
+   // drain the bit-packed data into header
+   k = 0;
+   while (a->num_bits > 0) {
+      header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check
+      a->code_buffer >>= 8;
+      a->num_bits -= 8;
+   }
+   if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG");
+   // now fill header the normal way
+   while (k < 4)
+      header[k++] = stbi__zget8(a);
+   len  = header[1] * 256 + header[0]; // little-endian 16-bit payload length
+   nlen = header[3] * 256 + header[2]; // must be the one's complement of len
+   if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG");
+   if (len > a->zbuffer_end - a->zbuffer) return stbi__err("read past buffer","Corrupt PNG"); // compare sizes; never form a pointer past the end of the input
+   if (len > a->zout_end - a->zout) // same size-based form as the check in stbi__parse_huffman_block
+      if (!stbi__zexpand(a, a->zout, len)) return 0;
+   memcpy(a->zout, a->zbuffer, len);
+   a->zbuffer += len;
+   a->zout += len;
+   return 1;
+}
+
+static int stbi__parse_zlib_header(stbi__zbuf *a) // validate the 2-byte zlib stream header; returns 1 if acceptable, 0 (via stbi__err) otherwise
+{
+   int method_info = stbi__zget8(a);   // first header byte
+   int flag_byte   = stbi__zget8(a);   // second header byte
+   int method      = method_info & 15; // low nibble selects the compression method
+   // the high nibble (method_info >> 4) gives the window size, 1 << (8 + value); unused since output is fully buffered
+   if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
+   if (((method_info << 8) + flag_byte) % 31) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
+   if ((flag_byte & 32) != 0) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png
+   if (method != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png
+   // every check passed
+   return 1;
+}
+
+static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] = // code lengths of the fixed literal/length code used for block type 1
+{
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
+};
+static const stbi_uc stbi__zdefault_distance[32] = // code lengths of the fixed distance code used for block type 1
+{
+   5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+};
+/*
+Init algorithm (how the two tables above were generated; kept for reference only):
+{
+   int i;   // use <= to match clearly with spec
+   for (i=0; i <= 143; ++i)     stbi__zdefault_length[i]   = 8;
+   for (   ; i <= 255; ++i)     stbi__zdefault_length[i]   = 9;
+   for (   ; i <= 279; ++i)     stbi__zdefault_length[i]   = 7;
+   for (   ; i <= 287; ++i)     stbi__zdefault_length[i]   = 8;
+
+   for (i=0; i <=  31; ++i)     stbi__zdefault_distance[i] = 5;
+}
+*/
+
+static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) // decode every block of the stream (optionally after a zlib header); returns 1 on success, 0 on error
+{
+   int final, type;
+   if (parse_header)
+      if (!stbi__parse_zlib_header(a)) return 0;
+   a->num_bits = 0;
+   a->code_buffer = 0;
+   a->hit_zeof_once = 0;
+   do {
+      final = stbi__zreceive(a,1); // non-zero on the last block
+      type = stbi__zreceive(a,2); // 0 = stored, 1 = fixed codes, 2 = codes sent in the stream, 3 = invalid
+      if (type == 0) {
+         if (!stbi__parse_uncompressed_block(a)) return 0;
+      } else if (type == 3) {
+         return stbi__err("bad block type","Corrupt PNG"); // record a failure reason instead of leaving a stale one
+      } else {
+         if (type == 1) {
+            // use fixed code lengths
+            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , STBI__ZNSYMS)) return 0;
+            if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
+         } else {
+            if (!stbi__compute_huffman_codes(a)) return 0;
+         }
+         if (!stbi__parse_huffman_block(a)) return 0;
+      }
+   } while (!final);
+   return 1;
+}
+
+static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header)
+{
+   // Attach the caller's output buffer (olen bytes at obuf) to the decoder
+   // state, record whether it may be grown, then decode the whole stream.
+   a->z_expandable = exp;
+   a->zout_end     = obuf + olen;
+   a->zout_start   = a->zout = obuf;
+   return stbi__parse_zlib(a, parse_header);
+}
+
+STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen)
+{
+   // Inflate a zlib-wrapped stream into a growable heap buffer of initial_size bytes; NULL on failure.
+   stbi__zbuf state;
+   char *out = (char *) stbi__malloc(initial_size);
+   if (!out) return NULL;
+   state.zbuffer     = (stbi_uc *) buffer;
+   state.zbuffer_end = state.zbuffer + len;
+   if (!stbi__do_zlib(&state, out, initial_size, 1, 1)) {
+      STBI_FREE(state.zout_start); // not 'out': the buffer may have been reallocated while growing
+      return NULL;
+   }
+   if (outlen != NULL) *outlen = (int) (state.zout - state.zout_start);
+   return state.zout_start;
+}
+
+STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen) // convenience wrapper around stbi_zlib_decode_malloc_guesssize
+{
+   return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen); // 16384 is the initial output size guess
+}
+
+STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header)
+{
+   // Same as stbi_zlib_decode_malloc_guesssize, but the zlib header is only parsed when parse_header is non-zero.
+   stbi__zbuf state;
+   char *out = (char *) stbi__malloc(initial_size);
+   if (!out) return NULL;
+   state.zbuffer     = (stbi_uc *) buffer;
+   state.zbuffer_end = state.zbuffer + len;
+   if (!stbi__do_zlib(&state, out, initial_size, 1, parse_header)) {
+      STBI_FREE(state.zout_start); // not 'out': the buffer may have been reallocated while growing
+      return NULL;
+   }
+   if (outlen != NULL) *outlen = (int) (state.zout - state.zout_start);
+   return state.zout_start;
+}
+
+STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen)
+{
+   // Inflate a zlib-wrapped stream into the caller's fixed-size buffer (never grown).
+   // Returns the number of bytes produced, or -1 if decoding fails.
+   stbi__zbuf state;
+   state.zbuffer     = (stbi_uc *) ibuffer;
+   state.zbuffer_end = state.zbuffer + ilen;
+   if (!stbi__do_zlib(&state, obuffer, olen, 0, 1)) return -1;
+   return (int) (state.zout - state.zout_start);
+}
+
+STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen)
+{
+   // Inflate a stream that has no zlib header into a growable heap buffer (16384 bytes to start); NULL on failure.
+   stbi__zbuf state;
+   char *out = (char *) stbi__malloc(16384);
+   if (!out) return NULL;
+   state.zbuffer     = (stbi_uc *) buffer;
+   state.zbuffer_end = state.zbuffer + len;
+   if (!stbi__do_zlib(&state, out, 16384, 1, 0)) {
+      STBI_FREE(state.zout_start); // not 'out': the buffer may have been reallocated while growing
+      return NULL;
+   }
+   if (outlen != NULL) *outlen = (int) (state.zout - state.zout_start);
+   return state.zout_start;
+}
+
+STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen)
+{
+   // Inflate a stream that has no zlib header into the caller's fixed-size buffer.
+   // Returns the number of bytes produced, or -1 if decoding fails.
+   stbi__zbuf state;
+   state.zbuffer     = (stbi_uc *) ibuffer;
+   state.zbuffer_end = state.zbuffer + ilen;
+   if (!stbi__do_zlib(&state, obuffer, olen, 0, 0)) return -1;
+   return (int) (state.zout - state.zout_start);
+}
+#endif
+
+// public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
+//    simple implementation
+//      - only 8-bit samples
+//      - no CRC checking
+//      - allocates lots of intermediate memory
+//        - avoids problem of streaming data between subsystems
+//        - avoids explicit window management
+//    performance
+//      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
+
+#ifndef STBI_NO_PNG
+typedef struct
+{
+   stbi__uint32 length; // first big-endian 32-bit word of a chunk header
+   stbi__uint32 type;   // second big-endian 32-bit word; compared against STBI__PNG_TYPE tags
+} stbi__pngchunk;
+
+static stbi__pngchunk stbi__get_chunk_header(stbi__context *s) // read two big-endian 32-bit words from s: length first, then type
+{
+   stbi__pngchunk hdr;
+   hdr.length = stbi__get32be(s);
+   hdr.type   = stbi__get32be(s);
+   return hdr;
+}
+
+static int stbi__check_png_header(stbi__context *s) // read up to 8 bytes and require the PNG signature; returns 1 on match, 0 (via stbi__err) at the first mismatch
+{
+   static const stbi_uc signature[8] = { 0x89,0x50,0x4E,0x47,0x0D,0x0A,0x1A,0x0A };
+   const stbi_uc *expect = signature;
+   while (expect != signature + 8)
+      if (stbi__get8(s) != *expect++) return stbi__err("bad png sig","Not a PNG");
+   return 1;
+}
+
+typedef struct
+{
+   stbi__context *s; // input stream and image dimensions/channel counts
+   stbi_uc *idata, *expanded, *out; // accumulated IDAT bytes, inflated image data, decoded pixels
+   int depth; // bits per sample, read from IHDR
+} stbi__png;
+
+
+enum { // row filter types; values 0..4 are the filter byte stored at the start of each row
+   STBI__F_none=0, // row bytes are copied unchanged
+   STBI__F_sub=1, // adds the byte one pixel to the left
+   STBI__F_up=2, // adds the byte at the same position in the previous row
+   STBI__F_avg=3, // adds the average of the left byte and the previous-row byte
+   STBI__F_paeth=4, // adds the stbi__paeth() prediction from left, above and upper-left
+   // synthetic filter used for first scanline to avoid needing a dummy row of 0s
+   STBI__F_avg_first
+};
+
+static const stbi_uc first_row_filter[5] = // filter to apply on row 0 for each stored filter type, so the missing previous row is never read; read-only, hence const
+{
+   STBI__F_none,
+   STBI__F_sub,
+   STBI__F_none, // "up" with an all-zero previous row adds nothing
+   STBI__F_avg_first,
+   STBI__F_sub // Paeth with b=c=0 turns out to be equivalent to sub
+};
+
+static int stbi__paeth(int a, int b, int c)
+{
+   // Paeth predictor. Written differently from the reference in the PNG spec
+   // but equivalent to it: the smaller and larger of a and b are compared
+   // against a single pivot value, which keeps the data dependencies short
+   // and lets compilers emit branch-free code.
+   int lesser  = (b > a) ? a : b;
+   int greater = (b > a) ? b : a;
+   int pivot   = c*3 - (a + b);
+   if (pivot <= lesser) return greater;
+   return (greater <= pivot) ? lesser : c;
+}
+
+static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; // indexed by bit depth: multiplier that scales a sample of that depth to the 0..255 range
+
+// adds an extra all-255 alpha channel
+// dest == src is legal
+// img_n must be 1 or 3
+static void stbi__create_png_alpha_expand8(stbi_uc *dest, stbi_uc *src, stbi__uint32 x, int img_n)
+{
+   int px = x-1;
+   // iterate from the last pixel down to the first: dest may be the same
+   // buffer as src, and a forward pass would overwrite unread input
+   if (img_n != 1) {
+      STBI_ASSERT(img_n == 3);
+      for (; px >= 0; --px) {
+         stbi_uc *o = dest + px*4;
+         const stbi_uc *in = src + px*3;
+         o[3] = 255; o[2] = in[2]; o[1] = in[1]; o[0] = in[0];
+      }
+   } else {
+      for (; px >= 0; --px) {
+         dest[px*2+1] = 255;
+         dest[px*2]   = src[px];
+      }
+   }
+}
+
+// create the png data from post-deflated data: unfilter each row of raw into a->out, expanding sub-8-bit and 16-bit samples and adding an alpha channel when out_n == img_n+1; returns 1 on success, 0 on error
+static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
+{
+   int bytes = (depth == 16 ? 2 : 1); // bytes per output sample
+   stbi__context *s = a->s;
+   stbi__uint32 i,j,stride = x*out_n*bytes; // stride = bytes per output row
+   stbi__uint32 img_len, img_width_bytes;
+   stbi_uc *filter_buf;
+   int all_ok = 1;
+   int k;
+   int img_n = s->img_n; // copy it into a local for later
+
+   int output_bytes = out_n*bytes; // bytes per output pixel
+   int filter_bytes = img_n*bytes; // bytes per input pixel; the distance to the "left" byte used by the filters
+   int width = x;
+
+   STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
+   a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
+   if (!a->out) return stbi__err("outofmem", "Out of memory");
+
+   // note: error exits here don't need to clean up a->out individually,
+   // stbi__do_png always does on error.
+   if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG");
+   img_width_bytes = (((img_n * x * depth) + 7) >> 3); // bytes per input row, not counting the filter byte
+   if (!stbi__mad2sizes_valid(img_width_bytes, y, img_width_bytes)) return stbi__err("too large", "Corrupt PNG");
+   img_len = (img_width_bytes + 1) * y; // +1: every row is preceded by one filter-type byte
+
+   // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
+   // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
+   // so just check for raw_len < img_len always.
+   if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
+
+   // Allocate two scan lines worth of filter workspace buffer.
+   filter_buf = (stbi_uc *) stbi__malloc_mad2(img_width_bytes, 2, 0);
+   if (!filter_buf) return stbi__err("outofmem", "Out of memory");
+
+   // Filtering for low-bit-depth images
+   if (depth < 8) {
+      filter_bytes = 1;
+      width = img_width_bytes;
+   }
+
+   for (j=0; j < y; ++j) {
+      // cur/prior filter buffers alternate
+      stbi_uc *cur = filter_buf + (j & 1)*img_width_bytes;
+      stbi_uc *prior = filter_buf + (~j & 1)*img_width_bytes;
+      stbi_uc *dest = a->out + stride*j;
+      int nk = width * filter_bytes; // number of bytes to unfilter in this row
+      int filter = *raw++; // each row starts with its filter type
+
+      // check filter type
+      if (filter > 4) {
+         all_ok = stbi__err("invalid filter","Corrupt PNG");
+         break;
+      }
+
+      // if first row, use special filter that doesn't sample previous row
+      if (j == 0) filter = first_row_filter[filter];
+
+      // perform actual filtering
+      switch (filter) {
+      case STBI__F_none:
+         memcpy(cur, raw, nk);
+         break;
+      case STBI__F_sub:
+         memcpy(cur, raw, filter_bytes); // the first pixel has no left neighbour
+         for (k = filter_bytes; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]);
+         break;
+      case STBI__F_up:
+         for (k = 0; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
+         break;
+      case STBI__F_avg:
+         for (k = 0; k < filter_bytes; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1));
+         for (k = filter_bytes; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1));
+         break;
+      case STBI__F_paeth:
+         for (k = 0; k < filter_bytes; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + prior[k]); // prior[k] == stbi__paeth(0,prior[k],0)
+         for (k = filter_bytes; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes], prior[k], prior[k-filter_bytes]));
+         break;
+      case STBI__F_avg_first:
+         memcpy(cur, raw, filter_bytes);
+         for (k = filter_bytes; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1));
+         break;
+      }
+
+      raw += nk; // advance to the next row's filter byte
+
+      // expand decoded bits in cur to dest, also adding an extra alpha channel if desired
+      if (depth < 8) {
+         stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
+         stbi_uc *in = cur;
+         stbi_uc *out = dest;
+         stbi_uc inb = 0;
+         stbi__uint32 nsmp = x*img_n; // samples in this row
+
+         // expand bits to bytes first
+         if (depth == 4) {
+            for (i=0; i < nsmp; ++i) {
+               if ((i & 1) == 0) inb = *in++; // fetch a new input byte every 2 samples
+               *out++ = scale * (inb >> 4);
+               inb <<= 4;
+            }
+         } else if (depth == 2) {
+            for (i=0; i < nsmp; ++i) {
+               if ((i & 3) == 0) inb = *in++; // fetch a new input byte every 4 samples
+               *out++ = scale * (inb >> 6);
+               inb <<= 2;
+            }
+         } else {
+            STBI_ASSERT(depth == 1);
+            for (i=0; i < nsmp; ++i) {
+               if ((i & 7) == 0) inb = *in++; // fetch a new input byte every 8 samples
+               *out++ = scale * (inb >> 7);
+               inb <<= 1;
+            }
+         }
+
+         // insert alpha=255 values if desired
+         if (img_n != out_n)
+            stbi__create_png_alpha_expand8(dest, dest, x, img_n);
+      } else if (depth == 8) {
+         if (img_n == out_n)
+            memcpy(dest, cur, x*img_n);
+         else
+            stbi__create_png_alpha_expand8(dest, cur, x, img_n);
+      } else if (depth == 16) {
+         // convert the image data from big-endian to platform-native
+         stbi__uint16 *dest16 = (stbi__uint16*)dest;
+         stbi__uint32 nsmp = x*img_n;
+
+         if (img_n == out_n) {
+            for (i = 0; i < nsmp; ++i, ++dest16, cur += 2)
+               *dest16 = (cur[0] << 8) | cur[1];
+         } else {
+            STBI_ASSERT(img_n+1 == out_n);
+            if (img_n == 1) {
+               for (i = 0; i < x; ++i, dest16 += 2, cur += 2) {
+                  dest16[0] = (cur[0] << 8) | cur[1];
+                  dest16[1] = 0xffff; // added alpha sample, fully opaque
+               }
+            } else {
+               STBI_ASSERT(img_n == 3);
+               for (i = 0; i < x; ++i, dest16 += 4, cur += 6) {
+                  dest16[0] = (cur[0] << 8) | cur[1];
+                  dest16[1] = (cur[2] << 8) | cur[3];
+                  dest16[2] = (cur[4] << 8) | cur[5];
+                  dest16[3] = 0xffff; // added alpha sample, fully opaque
+               }
+            }
+         }
+      }
+   }
+
+   STBI_FREE(filter_buf);
+   if (!all_ok) return 0;
+
+   return 1;
+}
+
+static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced) // build a->out from inflated data, de-interlacing if requested; returns 1 on success, 0 on error
+{
+   int bytes = (depth == 16 ? 2 : 1);
+   int out_bytes = out_n * bytes; // bytes per output pixel
+   stbi_uc *final;
+   int p;
+   if (!interlaced)
+      return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
+
+   // de-interlacing
+   final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+   if (!final) return stbi__err("outofmem", "Out of memory");
+   for (p=0; p < 7; ++p) { // one iteration per interlace pass
+      int xorig[] = { 0,4,0,2,0,1,0 }; // first column covered by each pass
+      int yorig[] = { 0,0,4,0,2,0,1 }; // first row covered by each pass
+      int xspc[]  = { 8,8,4,4,2,2,1 }; // column step of each pass
+      int yspc[]  = { 8,8,8,4,4,2,2 }; // row step of each pass
+      int i,j,x,y;
+      // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
+      x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p]; // width of this pass's sub-image
+      y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p]; // height of this pass's sub-image
+      if (x && y) {
+         stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; // input bytes used by this pass, including one filter byte per row
+         if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
+            STBI_FREE(final);
+            return 0;
+         }
+         for (j=0; j < y; ++j) {
+            for (i=0; i < x; ++i) {
+               int out_y = j*yspc[p]+yorig[p];
+               int out_x = i*xspc[p]+xorig[p];
+               memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
+                      a->out + (j*x+i)*out_bytes, out_bytes);
+            }
+         }
+         STBI_FREE(a->out); // the sub-image has been copied into final
+         image_data += img_len;
+         image_data_len -= img_len;
+      }
+   }
+   a->out = final;
+
+   return 1;
+}
+
+static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
+{
+   stbi_uc *px = z->out;
+   stbi__uint32 remaining = z->s->img_x * z->s->img_y;
+
+   // Apply the colour key tc to 8-bit output that already has an alpha
+   // channel: pixels equal to the key get alpha 0. Always returns 1.
+   STBI_ASSERT(out_n == 2 || out_n == 4);
+
+   if (out_n != 2) {
+      // 4 channels: alpha is assumed to hold 255 already, only matches are touched
+      for (; remaining; --remaining, px += 4) {
+         if (px[0] != tc[0] || px[1] != tc[1] || px[2] != tc[2]) continue;
+         px[3] = 0;
+      }
+   } else {
+      // 2 channels: alpha is rewritten for every pixel
+      for (; remaining; --remaining, px += 2) {
+         if (px[0] == tc[0]) px[1] = 0;
+         else                px[1] = 255;
+      }
+   }
+   return 1;
+}
+
+static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
+{
+   stbi__uint16 *px = (stbi__uint16*) z->out;
+   stbi__uint32 remaining = z->s->img_x * z->s->img_y;
+
+   // Apply the colour key tc to 16-bit output that already has an alpha
+   // channel: pixels equal to the key get alpha 0. Always returns 1.
+   STBI_ASSERT(out_n == 2 || out_n == 4);
+
+   if (out_n != 2) {
+      // 4 channels: alpha is assumed to hold 65535 already, only matches are touched
+      for (; remaining; --remaining, px += 4) {
+         if (px[0] != tc[0] || px[1] != tc[1] || px[2] != tc[2]) continue;
+         px[3] = 0;
+      }
+   } else {
+      // 2 channels: alpha is rewritten for every pixel
+      for (; remaining; --remaining, px += 2) {
+         if (px[0] == tc[0]) px[1] = 0;
+         else                px[1] = 65535;
+      }
+   }
+   return 1;
+}
+
+static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
+{
+   // Replace the one-byte-per-pixel index image in a->out with a newly
+   // allocated 3-channel (pal_img_n == 3) or 4-channel (otherwise) image
+   // looked up from palette, which stores 4 bytes per entry. Returns 1 on
+   // success; on allocation failure returns 0 and leaves a->out untouched.
+   stbi__uint32 remaining = a->s->img_x * a->s->img_y;
+   const stbi_uc *index = a->out;
+   stbi_uc *expanded, *dst;
+
+   STBI_NOTUSED(len);
+   expanded = (stbi_uc *) stbi__malloc_mad2(remaining, pal_img_n, 0);
+   if (expanded == NULL) return stbi__err("outofmem", "Out of memory");
+   dst = expanded;
+
+   // no early exits from here on: returning before the swap below would leak expanded
+   if (pal_img_n != 3) {
+      for (; remaining; --remaining, dst += 4) {
+         const stbi_uc *entry = palette + *index++ * 4;
+         dst[0] = entry[0];
+         dst[1] = entry[1];
+         dst[2] = entry[2];
+         dst[3] = entry[3];
+      }
+   } else {
+      for (; remaining; --remaining, dst += 3) {
+         const stbi_uc *entry = palette + *index++ * 4;
+         dst[0] = entry[0];
+         dst[1] = entry[1];
+         dst[2] = entry[2];
+      }
+   }
+   STBI_FREE(a->out);
+   a->out = expanded;
+   return 1;
+}
+
+static int stbi__unpremultiply_on_load_global = 0; // process-wide setting; off by default
+static int stbi__de_iphone_flag_global = 0; // process-wide setting; off by default
+
+STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) // sets the process-wide unpremultiply flag
+{
+   stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply;
+}
+
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) // sets the process-wide de-iphone flag
+{
+   stbi__de_iphone_flag_global = flag_true_if_should_convert;
+}
+
+#ifndef STBI_THREAD_LOCAL // no thread-local storage: each setting is simply its global
+#define stbi__unpremultiply_on_load  stbi__unpremultiply_on_load_global
+#define stbi__de_iphone_flag  stbi__de_iphone_flag_global
+#else // per-thread values override the globals once they have been set
+static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
+static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
+
+STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply) // sets the calling thread's value and marks it as set
+{
+   stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
+   stbi__unpremultiply_on_load_set = 1;
+}
+
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert) // sets the calling thread's value and marks it as set
+{
+   stbi__de_iphone_flag_local = flag_true_if_should_convert;
+   stbi__de_iphone_flag_set = 1;
+}
+
+#define stbi__unpremultiply_on_load  (stbi__unpremultiply_on_load_set           \
+                                       ? stbi__unpremultiply_on_load_local      \
+                                       : stbi__unpremultiply_on_load_global)
+#define stbi__de_iphone_flag  (stbi__de_iphone_flag_set                         \
+                                ? stbi__de_iphone_flag_local                    \
+                                : stbi__de_iphone_flag_global)
+#endif // STBI_THREAD_LOCAL
+
+static void stbi__de_iphone(stbi__png *z)
+{
+   // Convert z->out from bgr to rgb in place by swapping the first and third
+   // channel of every pixel. For 4-channel output the colour channels are
+   // also divided by alpha when stbi__unpremultiply_on_load is enabled.
+   stbi__context *s = z->s;
+   stbi_uc *px = z->out;
+   stbi__uint32 left = s->img_x * s->img_y;
+   if (s->img_out_n == 3) {
+      for (; left; --left, px += 3) {
+         stbi_uc b = px[0];
+         px[0] = px[2];
+         px[2] = b;
+      }
+      return;
+   }
+
+   STBI_ASSERT(s->img_out_n == 4);
+   if (!stbi__unpremultiply_on_load) {
+      // swap only
+      for (; left; --left, px += 4) {
+         stbi_uc b = px[0];
+         px[0] = px[2];
+         px[2] = b;
+      }
+      return;
+   }
+
+   // swap and unpremultiply, rounding to nearest
+   for (; left; --left, px += 4) {
+      stbi_uc alpha = px[3];
+      stbi_uc b = px[0];
+      if (alpha == 0) { // nothing to divide by: swap only
+         px[0] = px[2];
+         px[2] = b;
+         continue;
+      }
+      px[0] = (px[2] * 255 + alpha / 2) / alpha;
+      px[1] = (px[1] * 255 + alpha / 2) / alpha;
+      px[2] = ( b    * 255 + alpha / 2) / alpha;
+   }
+}
+
+#define STBI__PNG_TYPE(a,b,c,d)  (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d)) // packs four chunk-name characters into one 32-bit tag, first character in the top byte
+
+static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
+{
+   stbi_uc palette[1024], pal_img_n=0;
+   stbi_uc has_trans=0, tc[3]={0};
+   stbi__uint16 tc16[3];
+   stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;
+   int first=1,k,interlace=0, color=0, is_iphone=0;
+   stbi__context *s = z->s;
+
+   z->expanded = NULL;
+   z->idata = NULL;
+   z->out = NULL;
+
+   if (!stbi__check_png_header(s)) return 0;
+
+   if (scan == STBI__SCAN_type) return 1;
+
+   for (;;) {
+      stbi__pngchunk c = stbi__get_chunk_header(s);
+      switch (c.type) {
+         case STBI__PNG_TYPE('C','g','B','I'):
+            is_iphone = 1;
+            stbi__skip(s, c.length);
+            break;
+         case STBI__PNG_TYPE('I','H','D','R'): {
+            int comp,filter;
+            if (!first) return stbi__err("multiple IHDR","Corrupt PNG");
+            first = 0;
+            if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
+            s->img_x = stbi__get32be(s);
+            s->img_y = stbi__get32be(s);
+            if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+            if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+            z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
+            color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
+            if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
+            if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
+            comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
+            filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
+            interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG");
+            if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG");
+            if (!pal_img_n) {
+               s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
+               if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
+            } else {
+               // if paletted, then pal_n is our final components, and
+               // img_n is # components to decompress/filter.
+               s->img_n = 1;
+               if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
+            }
+            // even with SCAN_header, have to scan to see if we have a tRNS
+            break;
+         }
+
+         case STBI__PNG_TYPE('P','L','T','E'):  {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG");
+            pal_len = c.length / 3;
+            if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG");
+            for (i=0; i < pal_len; ++i) {
+               palette[i*4+0] = stbi__get8(s);
+               palette[i*4+1] = stbi__get8(s);
+               palette[i*4+2] = stbi__get8(s);
+               palette[i*4+3] = 255;
+            }
+            break;
+         }
+
+         case STBI__PNG_TYPE('t','R','N','S'): {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG");
+            if (pal_img_n) {
+               if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; }
+               if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG");
+               if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG");
+               pal_img_n = 4;
+               for (i=0; i < c.length; ++i)
+                  palette[i*4+3] = stbi__get8(s);
+            } else {
+               if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
+               if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
+               has_trans = 1;
+               // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now.
+               if (scan == STBI__SCAN_header) { ++s->img_n; return 1; }
+               if (z->depth == 16) {
+                  for (k = 0; k < s->img_n && k < 3; ++k) // extra loop test to suppress false GCC warning
+                     tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
+               } else {
+                  for (k = 0; k < s->img_n && k < 3; ++k)
+                     tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
+               }
+            }
+            break;
+         }
+
+         case STBI__PNG_TYPE('I','D','A','T'): {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
+            if (scan == STBI__SCAN_header) {
+               // header scan definitely stops at first IDAT
+               if (pal_img_n)
+                  s->img_n = pal_img_n;
+               return 1;
+            }
+            if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes");
+            if ((int)(ioff + c.length) < (int)ioff) return 0;
+            if (ioff + c.length > idata_limit) {
+               stbi__uint32 idata_limit_old = idata_limit;
+               stbi_uc *p;
+               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
+               while (ioff + c.length > idata_limit)
+                  idata_limit *= 2;
+               STBI_NOTUSED(idata_limit_old);
+               p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
+               z->idata = p;
+            }
+            if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
+            ioff += c.length;
+            break;
+         }
+
+         case STBI__PNG_TYPE('I','E','N','D'): {
+            stbi__uint32 raw_len, bpl;
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (scan != STBI__SCAN_load) return 1;
+            if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
+            // initial guess for decoded data size to avoid unnecessary reallocs
+            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
+            raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
+            z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone);
+            if (z->expanded == NULL) return 0; // zlib should set error
+            STBI_FREE(z->idata); z->idata = NULL;
+            if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
+               s->img_out_n = s->img_n+1;
+            else
+               s->img_out_n = s->img_n;
+            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
+            if (has_trans) {
+               if (z->depth == 16) {
+                  if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0;
+               } else {
+                  if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
+               }
+            }
+            if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
+               stbi__de_iphone(z);
+            if (pal_img_n) {
+               // pal_img_n == 3 or 4
+               s->img_n = pal_img_n; // record the actual colors we had
+               s->img_out_n = pal_img_n;
+               if (req_comp >= 3) s->img_out_n = req_comp;
+               if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
+                  return 0;
+            } else if (has_trans) {
+               // non-paletted image with tRNS -> source image has (constant) alpha
+               ++s->img_n;
+            }
+            STBI_FREE(z->expanded); z->expanded = NULL;
+            // end of PNG chunk, read and skip CRC
+            stbi__get32be(s);
+            return 1;
+         }
+
+         default:
+            // if critical, fail
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if ((c.type & (1 << 29)) == 0) {
+               #ifndef STBI_NO_FAILURE_STRINGS
+               // not threadsafe
+               static char invalid_chunk[] = "XXXX PNG chunk not known";
+               invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
+               invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
+               invalid_chunk[2] = STBI__BYTECAST(c.type >>  8);
+               invalid_chunk[3] = STBI__BYTECAST(c.type >>  0);
+               #endif
+               return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
+            }
+            stbi__skip(s, c.length);
+            break;
+      }
+      // end of PNG chunk, read and skip CRC
+      stbi__get32be(s);
+   }
+}
+
+static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri)
+{
+   // decodes the whole PNG in p; returns the pixel buffer or NULL, writing size/components to x, y and (optionally) n
+   void *result=NULL;
+   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
+   if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
+      if (p->depth <= 8)
+         ri->bits_per_channel = 8;
+      else if (p->depth == 16)
+         ri->bits_per_channel = 16;
+      else {
+         STBI_FREE(p->out); p->out = NULL; // this path bypasses the cleanup below; free here or the image leaks
+         return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth");
+      }
+      result = p->out; p->out = NULL; // hand the buffer to result so the cleanup below does not free it
+      if (req_comp && req_comp != p->s->img_out_n) {
+         if (ri->bits_per_channel == 8)
+            result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         else
+            result = stbi__convert_format16((stbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         p->s->img_out_n = req_comp;
+         if (result == NULL) return result;
+      }
+      *x = p->s->img_x; *y = p->s->img_y;
+      if (n) *n = p->s->img_n;
+   }
+   STBI_FREE(p->out);      p->out      = NULL;
+   STBI_FREE(p->expanded); p->expanded = NULL;
+   STBI_FREE(p->idata);    p->idata    = NULL;
+   return result;
+}
+
+static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi__png png; // temporary decoder state; only its stream pointer is set here
+   png.s = s;
+   return stbi__do_png(&png, x, y, comp, req_comp, ri);
+}
+
+// format probe: returns the result of stbi__check_png_header and always rewinds the stream
+static int stbi__png_test(stbi__context *s)
+{
+   int is_png = stbi__check_png_header(s);
+   stbi__rewind(s); // the check consumed bytes; put the stream back before returning
+   return is_png;
+}
+
+static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
+{
+   if (stbi__parse_png_file(p, STBI__SCAN_header, 0)) { // header-only scan succeeded
+      if (x)    *x    = p->s->img_x; // each output pointer is optional
+      if (y)    *y    = p->s->img_y;
+      if (comp) *comp = p->s->img_n;
+      return 1;
+   }
+   stbi__rewind( p->s ); // scan failed: restore the stream before reporting 0
+   return 0;
+}
+
+static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__png png; // temporary decoder state; only its stream pointer is set here
+   png.s = s;
+   return stbi__png_info_raw(&png, x, y, comp);
+}
+
+// reports whether the PNG on s has a depth of 16, using a header-only scan
+static int stbi__png_is16(stbi__context *s)
+{
+   stbi__png png;
+   png.s = s;
+   // stbi__png_info_raw rewinds the stream itself when the header scan fails
+   if (!stbi__png_info_raw(&png, NULL, NULL, NULL))
+      return 0;
+   if (png.depth == 16) return 1; // note: the stream is not rewound on this path
+   stbi__rewind(png.s);
+   return 0;
+}
+#endif
+
+// Microsoft/Windows BMP image
+
+#ifndef STBI_NO_BMP
+// checks for the 'B','M' magic and an accepted info-header size; does not rewind the stream
+static int stbi__bmp_test_raw(stbi__context *s)
+{
+   int hdr_size;
+   if (stbi__get8(s) != 'B') return 0;
+   if (stbi__get8(s) != 'M') return 0;
+   stbi__get32le(s); // file size, ignored
+   stbi__get16le(s); // reserved, ignored
+   stbi__get16le(s); // reserved, ignored
+   stbi__get32le(s); // data offset, ignored
+   hdr_size = stbi__get32le(s);
+   // only these header sizes are accepted
+   return (hdr_size == 12 || hdr_size == 40 || hdr_size == 56 || hdr_size == 108 || hdr_size == 124);
+}
+
+static int stbi__bmp_test(stbi__context *s)
+{
+   int is_bmp = stbi__bmp_test_raw(s);
+   stbi__rewind(s); // the probe consumed header bytes; restore the stream, pass or fail
+   return is_bmp;
+}
+
+
+// returns the index (0..31) of the highest set bit of z, or -1 when z is 0
+static int stbi__high_bit(unsigned int z)
+{
+   int n, step;
+   if (z == 0) return -1;
+   n = 0;
+   // binary search: test the upper part of what remains, halving the step each pass (16, 8, 4, 2, 1)
+   for (step = 16; step > 0; step >>= 1) {
+      if (z >> step) { n += step; z >>= step; }
+   }
+   return n;
+}
+
+static int stbi__bitcount(unsigned int a)
+{
+   unsigned int pairs   = (a & 0x55555555) + ((a >>  1) & 0x55555555);         // each 2-bit field holds 0..2
+   unsigned int nibbles = (pairs & 0x33333333) + ((pairs >>  2) & 0x33333333); // each 4-bit field holds 0..4
+   unsigned int bytes   = (nibbles + (nibbles >> 4)) & 0x0f0f0f0f;             // each byte holds 0..8
+   bytes += bytes >> 8;  // low byte of each 16-bit half now sums two bytes
+   bytes += bytes >> 16; // low byte now sums all four bytes (at most 32)
+   return bytes & 0xff;
+}
+
+// extract an arbitrarily-aligned N-bit value (N=bits)
+// from v, and then make it 8-bits long and fractionally
+// extend it to the full 8-bit range.
+static int stbi__shiftsigned(unsigned int v, int shift, int bits)
+{
+   static const unsigned int mul_table[9] = { // per bit count: multiplier that repeats the N-bit value across a wider field
+      0,
+      0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/,
+      0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/,
+   };
+   static const unsigned int shift_table[9] = { // per bit count: right shift that trims the product back to 8 bits
+      0, 0,0,1,0,2,4,6,0,
+   };
+   STBI_ASSERT(bits >= 0 && bits <= 8); // check first: bits is used below as a shift count and as a table index
+   if (shift < 0)
+      v <<= -shift;
+   else
+      v >>= shift;
+   STBI_ASSERT(v < 256);
+   v >>= (8-bits);
+   return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits];
+}
+
+typedef struct
+{
+   int bpp, offset, hsz; // bits-per-pixel field, offset field from the file header, info-header size
+   unsigned int mr,mg,mb,ma, all_a; // r/g/b/a channel masks; all_a seeds the loader's running OR of alpha values
+   int extra_read; // header bytes consumed besides hsz: starts at 14, +12 when BITFIELDS masks follow a 40/56-byte header
+} stbi__bmp_data;
+
+static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress)
+{
+   if (compress == 3) return 1; // BI_BITFIELDS specifies masks explicitly, don't override
+   if (compress != 0) return 0; // error: no defaults exist for any other mode
+   // uncompressed: choose the masks from the pixel size
+   switch (info->bpp) {
+      case 16:
+         info->mr = 31u << 10;
+         info->mg = 31u <<  5;
+         info->mb = 31u <<  0;
+         break;
+      case 32:
+         info->mr = 0xffu << 16;
+         info->mg = 0xffu <<  8;
+         info->mb = 0xffu <<  0;
+         info->ma = 0xffu << 24;
+         info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
+         break;
+      default:
+         // every other depth gets the all-0 defaults
+         info->mr = info->mg = info->mb = info->ma = 0;
+         break;
+   }
+   return 1;
+}
+
+static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
+{ // reads the BMP file header and info header into *info and s->img_x/img_y; returns (void *) 1 on success
+   int hsz;
+   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP");
+   stbi__get32le(s); // discard filesize
+   stbi__get16le(s); // discard reserved
+   stbi__get16le(s); // discard reserved
+   info->offset = stbi__get32le(s); // offset field; stbi__bmp_load uses it to size the palette and skip to pixel data
+   info->hsz = hsz = stbi__get32le(s); // info-header size, which selects the header variant parsed below
+   info->mr = info->mg = info->mb = info->ma = 0;
+   info->extra_read = 14; // bytes read before the size field: 2 magic + 4 + 2 + 2 + 4
+
+   if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP");
+
+   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
+   if (hsz == 12) { // the 12-byte header stores 16-bit dimensions
+      s->img_x = stbi__get16le(s);
+      s->img_y = stbi__get16le(s);
+   } else { // all larger headers store 32-bit dimensions
+      s->img_x = stbi__get32le(s);
+      s->img_y = stbi__get32le(s);
+   }
+   if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP"); // this 16-bit field must be 1 (presumably the plane count)
+   info->bpp = stbi__get16le(s);
+   if (hsz != 12) {
+      int compress = stbi__get32le(s);
+      if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
+      if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes
+      if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel
+      stbi__get32le(s); // discard sizeof
+      stbi__get32le(s); // discard hres
+      stbi__get32le(s); // discard vres
+      stbi__get32le(s); // discard colorsused
+      stbi__get32le(s); // discard max important
+      if (hsz == 40 || hsz == 56) {
+         if (hsz == 56) { // the 56-byte variant carries four more 32-bit words, discarded here
+            stbi__get32le(s);
+            stbi__get32le(s);
+            stbi__get32le(s);
+            stbi__get32le(s);
+         }
+         if (info->bpp == 16 || info->bpp == 32) {
+            if (compress == 0) {
+               stbi__bmp_set_mask_defaults(info, compress);
+            } else if (compress == 3) {
+               info->mr = stbi__get32le(s);
+               info->mg = stbi__get32le(s);
+               info->mb = stbi__get32le(s);
+               info->extra_read += 12; // the three mask words just read are not counted in hsz
+               // not documented, but generated by photoshop and handled by mspaint
+               if (info->mr == info->mg && info->mg == info->mb) {
+                  // ?!?!?
+                  return stbi__errpuc("bad BMP", "bad BMP");
+               }
+            } else
+               return stbi__errpuc("bad BMP", "bad BMP");
+         }
+      } else {
+         // V4/V5 header
+         int i;
+         if (hsz != 108 && hsz != 124)
+            return stbi__errpuc("bad BMP", "bad BMP");
+         info->mr = stbi__get32le(s);
+         info->mg = stbi__get32le(s);
+         info->mb = stbi__get32le(s);
+         info->ma = stbi__get32le(s);
+         if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs
+            stbi__bmp_set_mask_defaults(info, compress); // note: the return value is not checked here
+         stbi__get32le(s); // discard color space
+         for (i=0; i < 12; ++i)
+            stbi__get32le(s); // discard color space parameters
+         if (hsz == 124) {
+            stbi__get32le(s); // discard rendering intent
+            stbi__get32le(s); // discard offset of profile data
+            stbi__get32le(s); // discard size of profile data
+            stbi__get32le(s); // discard reserved
+         }
+      }
+   }
+   return (void *) 1; // non-NULL sentinel: the caller only tests the result against NULL
+}
+
+// decodes a BMP (1/4/8-bit paletted or 16/24/32-bit) into 3- or 4-channel pixels; returns the buffer, or NULL on failure
+static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *out;
+   unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
+   stbi_uc pal[256][4] = {{0}}; // zeroed: pixel indices past the stored palette must not read stale stack bytes
+   int psize=0,i,j,width;
+   int flip_vertically, pad, target;
+   stbi__bmp_data info;
+   STBI_NOTUSED(ri);
+
+   info.all_a = 255;
+   if (stbi__bmp_parse_header(s, &info) == NULL)
+      return NULL; // error code already set
+
+   flip_vertically = ((int) s->img_y) > 0; // a positive stored height means the rows get flipped after decoding
+   s->img_y = abs((int) s->img_y);
+
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   mr = info.mr;
+   mg = info.mg;
+   mb = info.mb;
+   ma = info.ma;
+   all_a = info.all_a;
+
+   // palette entry count, derived from the gap between the headers and the pixel data offset
+   if (info.hsz == 12) {
+      if (info.bpp < 24)
+         psize = (info.offset - info.extra_read - 24) / 3;
+   } else {
+      if (info.bpp < 16)
+         psize = (info.offset - info.extra_read - info.hsz) >> 2;
+   }
+   if (psize == 0) {
+      // accept some number of extra bytes after the header, but if the offset points either to before
+      // the header ends or implies a large amount of extra data, reject the file as malformed
+      int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original);
+      int header_limit = 1024; // max we actually read is below 256 bytes currently.
+      int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size.
+      if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
+         return stbi__errpuc("bad header", "Corrupt BMP");
+      }
+      // we established that bytes_read_so_far is positive and sensible.
+      // the first half of this test rejects offsets that are either too small positives, or
+      // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn
+      // ensures the number computed in the second half of the test can't overflow.
+      if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) {
+         return stbi__errpuc("bad offset", "Corrupt BMP");
+      } else {
+         stbi__skip(s, info.offset - bytes_read_so_far);
+      }
+   }
+
+   if (info.bpp == 24 && ma == 0xff000000)
+      s->img_n = 3;
+   else
+      s->img_n = ma ? 4 : 3;
+   if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
+      target = req_comp;
+   else
+      target = s->img_n; // if they want monochrome, we'll post-convert
+
+   // sanity-check size
+   if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
+      return stbi__errpuc("too large", "Corrupt BMP");
+
+   out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   if (info.bpp < 16) {
+      int z=0;
+      // psize can come out negative when the data offset lies inside the headers; "== 0" alone let that through
+      if (psize <= 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
+      for (i=0; i < psize; ++i) {
+         pal[i][2] = stbi__get8(s);
+         pal[i][1] = stbi__get8(s);
+         pal[i][0] = stbi__get8(s);
+         if (info.hsz != 12) stbi__get8(s);
+         pal[i][3] = 255;
+      }
+      stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
+      if (info.bpp == 1) width = (s->img_x + 7) >> 3;
+      else if (info.bpp == 4) width = (s->img_x + 1) >> 1;
+      else if (info.bpp == 8) width = s->img_x;
+      else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
+      pad = (-width)&3; // bytes needed to round each row up to a multiple of 4
+      if (info.bpp == 1) {
+         for (j=0; j < (int) s->img_y; ++j) {
+            int bit_offset = 7, v = stbi__get8(s);
+            for (i=0; i < (int) s->img_x; ++i) {
+               int color = (v>>bit_offset)&0x1;
+               out[z++] = pal[color][0];
+               out[z++] = pal[color][1];
+               out[z++] = pal[color][2];
+               if (target == 4) out[z++] = 255;
+               if (i+1 == (int) s->img_x) break;
+               if((--bit_offset) < 0) {
+                  bit_offset = 7;
+                  v = stbi__get8(s);
+               }
+            }
+            stbi__skip(s, pad);
+         }
+      } else {
+         for (j=0; j < (int) s->img_y; ++j) {
+            for (i=0; i < (int) s->img_x; i += 2) { // two pixels per pass: one byte holds both at 4 bpp
+               int v=stbi__get8(s),v2=0;
+               if (info.bpp == 4) {
+                  v2 = v & 15;
+                  v >>= 4;
+               }
+               out[z++] = pal[v][0];
+               out[z++] = pal[v][1];
+               out[z++] = pal[v][2];
+               if (target == 4) out[z++] = 255;
+               if (i+1 == (int) s->img_x) break;
+               v = (info.bpp == 8) ? stbi__get8(s) : v2;
+               out[z++] = pal[v][0];
+               out[z++] = pal[v][1];
+               out[z++] = pal[v][2];
+               if (target == 4) out[z++] = 255;
+            }
+            stbi__skip(s, pad);
+         }
+      }
+   } else {
+      int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
+      int z = 0;
+      int easy=0; // 1: 24-bit BGR, 2: 32-bit with the standard byte masks, 0: generic mask extraction
+      stbi__skip(s, info.offset - info.extra_read - info.hsz);
+      if (info.bpp == 24) width = 3 * s->img_x;
+      else if (info.bpp == 16) width = 2*s->img_x;
+      else /* bpp = 32 and pad = 0 */ width=0;
+      pad = (-width) & 3;
+      if (info.bpp == 24) {
+         easy = 1;
+      } else if (info.bpp == 32) {
+         if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
+            easy = 2;
+      }
+      if (!easy) {
+         if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
+         // right shift amt to put high bit in position #7
+         rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr);
+         gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg);
+         bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb);
+         ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma);
+         if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
+      }
+      for (j=0; j < (int) s->img_y; ++j) {
+         if (easy) {
+            for (i=0; i < (int) s->img_x; ++i) {
+               unsigned char a;
+               out[z+2] = stbi__get8(s);
+               out[z+1] = stbi__get8(s);
+               out[z+0] = stbi__get8(s);
+               z += 3;
+               a = (easy == 2 ? stbi__get8(s) : 255);
+               all_a |= a;
+               if (target == 4) out[z++] = a;
+            }
+         } else {
+            int bpp = info.bpp;
+            for (i=0; i < (int) s->img_x; ++i) {
+               stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
+               unsigned int a;
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
+               a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
+               all_a |= a;
+               if (target == 4) out[z++] = STBI__BYTECAST(a);
+            }
+         }
+         stbi__skip(s, pad);
+      }
+   }
+
+   // if alpha channel is all 0s, replace with all 255s
+   if (target == 4 && all_a == 0)
+      for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)
+         out[i] = 255;
+
+   if (flip_vertically) {
+      stbi_uc t;
+      for (j=0; j < (int) s->img_y>>1; ++j) { // swap row j with its mirror row, byte by byte
+         stbi_uc *p1 = out +      j     *s->img_x*target;
+         stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target;
+         for (i=0; i < (int) s->img_x*target; ++i) {
+            t = p1[i]; p1[i] = p2[i]; p2[i] = t;
+         }
+      }
+   }
+
+   if (req_comp && req_comp != target) {
+      out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+
+   *x = s->img_x;
+   *y = s->img_y;
+   if (comp) *comp = s->img_n;
+   return out;
+}
+#endif
+
+// Targa Truevision - TGA
+// by Jonathan Dummer
+#ifndef STBI_NO_TGA
+// returns STBI_rgb or whatever, 0 on error
+static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
+{
+   // accepted: 8-bit grey, 16-bit grey+alpha, 15/16-bit RGB, 24- and 32-bit (bits/8 components)
+   if (is_rgb16) *is_rgb16 = 0;
+   if (bits_per_pixel == 8) return STBI_grey;
+   if (bits_per_pixel == 16 && is_grey) return STBI_grey_alpha;
+   if (bits_per_pixel == 15 || bits_per_pixel == 16) {
+      // 15/16-bit colour: flag it for the caller (when asked) and report RGB
+      if (is_rgb16) *is_rgb16 = 1;
+      return STBI_rgb;
+   }
+   if (bits_per_pixel == 24 || bits_per_pixel == 32)
+      return bits_per_pixel/8; // 3 or 4 components
+   return 0; // anything else is an error
+}
+
+// header-only probe: validates the TGA header fields and reports width/height/components.
+// returns 1 on success; on any rejected field the stream is rewound and 0 is returned.
+static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
+{
+    int width, height, ncomp, image_type, pixel_bits, cmap_bits;
+    int cmap_type, is_grey;
+
+    stbi__get8(s); // offset byte, unused here
+    cmap_type = stbi__get8(s);
+    if (cmap_type > 1)
+        goto not_tga; // only RGB or indexed allowed
+    image_type = stbi__get8(s);
+
+    if (cmap_type == 1) {
+        // colormapped (paletted) image: only image types 1 and 9 are accepted
+        if (image_type != 1 && image_type != 9)
+            goto not_tga;
+        stbi__skip(s,4); // index of first colormap entry and number of entries
+        cmap_bits = stbi__get8(s); // bits per palette color entry
+        if (cmap_bits != 8 && cmap_bits != 15 && cmap_bits != 16 && cmap_bits != 24 && cmap_bits != 32)
+            goto not_tga;
+        stbi__skip(s,4); // image x and y origin
+    } else {
+        // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
+        if (image_type != 2 && image_type != 3 && image_type != 10 && image_type != 11)
+            goto not_tga;
+        stbi__skip(s,9); // colormap specification and image x/y origin
+        cmap_bits = 0;
+    }
+
+    // image dimensions: both must be at least 1
+    width = stbi__get16le(s);
+    if (width < 1)
+        goto not_tga;
+    height = stbi__get16le(s);
+    if (height < 1)
+        goto not_tga;
+    pixel_bits = stbi__get8(s);
+    stbi__get8(s); // alpha-bits byte, ignored
+
+    if (cmap_bits != 0) {
+        // with a colormap, pixel_bits is the size of each index; only 8 or 16 is accepted
+        if (pixel_bits != 8 && pixel_bits != 16)
+            goto not_tga;
+        ncomp = stbi__tga_get_comp(cmap_bits, 0, NULL);
+    } else {
+        // no colormap: the component count follows from the pixel size and the greyscale flag
+        is_grey = (image_type == 3) || (image_type == 11);
+        ncomp = stbi__tga_get_comp(pixel_bits, is_grey, NULL);
+    }
+    if (!ncomp)
+        goto not_tga;
+
+    // each output pointer is optional
+    if (x) *x = width;
+    if (y) *y = height;
+    if (comp) *comp = ncomp;
+    return 1; // passed every check; the stream is not rewound on this path
+
+not_tga:
+    // common rejection path
+    stbi__rewind(s);
+    return 0;
+}
+
+// format probe: runs the TGA header checks below and always rewinds the stream before returning
+static int stbi__tga_test(stbi__context *s)
+{
+   int looks_like_tga = 0;
+   int cmap_type, image_type, bits;
+   do { // single pass; any failed check breaks out with looks_like_tga still 0
+      stbi__get8(s); // offset byte, unused here
+      cmap_type = stbi__get8(s);
+      if (cmap_type > 1) break; // only RGB or indexed allowed
+      image_type = stbi__get8(s);
+      if (cmap_type == 1) { // colormapped (paletted) image
+         if (image_type != 1 && image_type != 9) break; // a colormap demands image type 1 or 9
+         stbi__skip(s,4); // index of first colormap entry and number of entries
+         bits = stbi__get8(s); // bits per palette color entry
+         if (bits != 8 && bits != 15 && bits != 16 && bits != 24 && bits != 32) break;
+         stbi__skip(s,4); // image x and y origin
+      } else { // "normal" image w/o colormap
+         if (image_type != 2 && image_type != 3 && image_type != 10 && image_type != 11) break; // only RGB or grey allowed, +/- RLE
+         stbi__skip(s,9); // colormap specification and image x/y origin
+      }
+      if (stbi__get16le(s) < 1) break; // width
+      if (stbi__get16le(s) < 1) break; // height
+      bits = stbi__get8(s); // bits per pixel
+      if (cmap_type == 1 && bits != 8 && bits != 16) break; // for colormapped images, bpp is size of an index
+      if (bits != 8 && bits != 15 && bits != 16 && bits != 24 && bits != 32) break;
+      looks_like_tga = 1; // every check passed
+   } while (0);
+   stbi__rewind(s); // restore the stream, pass or fail
+   return looks_like_tga;
+}
+
+// read 16bit value and convert to 24bit RGB
+static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
+{
+   stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
+   int shift;
+   int k = 0;
+   // three 5-bit channels: red in bits 10..14, green in 5..9, blue in 0..4;
+   // writing them in that order stores RGB directly, so no swap is needed later
+   for (shift = 10; shift >= 0; shift -= 5) {
+      int c = (px >> shift) & 31;
+      // rescale 0..31 to 0..255
+      out[k++] = (stbi_uc)((c * 255)/31);
+   }
+
+   // the most significant bit is sometimes said to carry alpha (possibly when
+   // an alpha bit is set in the "image descriptor byte"), but honouring it
+   // only made 16bit test images completely translucent,
+   // so all 15 and 16bit TGAs are treated as RGB with no alpha.
+}
+
+static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   //   decodes a TGA into a pixel buffer (NULL on failure); starts by reading the fixed header fields in file order
+   int tga_offset = stbi__get8(s);
+   int tga_indexed = stbi__get8(s);
+   int tga_image_type = stbi__get8(s);
+   int tga_is_RLE = 0;
+   int tga_palette_start = stbi__get16le(s);
+   int tga_palette_len = stbi__get16le(s);
+   int tga_palette_bits = stbi__get8(s);
+   int tga_x_origin = stbi__get16le(s);
+   int tga_y_origin = stbi__get16le(s);
+   int tga_width = stbi__get16le(s);
+   int tga_height = stbi__get16le(s);
+   int tga_bits_per_pixel = stbi__get8(s);
+   int tga_comp, tga_rgb16=0;
+   int tga_inverted = stbi__get8(s);
+   // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
+   //   image data
+   unsigned char *tga_data;
+   unsigned char *tga_palette = NULL;
+   int i, j;
+   unsigned char raw_data[4] = {0};
+   int RLE_count = 0;
+   int RLE_repeating = 0;
+   int read_next_pixel = 1;
+   STBI_NOTUSED(ri);
+   STBI_NOTUSED(tga_x_origin); // @TODO
+   STBI_NOTUSED(tga_y_origin); // @TODO
+
+   if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   //   do a tiny bit of preprocessing
+   if ( tga_image_type >= 8 )
+   {
+      tga_image_type -= 8; // types 8 and up are handled as the same base type plus RLE
+      tga_is_RLE = 1;
+   }
+   tga_inverted = 1 - ((tga_inverted >> 5) & 1); // bit 5 clear means the rows get flipped after loading
+
+   //   If I'm paletted, then I'll use the number of bits from the palette
+   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
+   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
+
+   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
+      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
+
+   //   tga info
+   *x = tga_width;
+   *y = tga_height;
+   if (comp) *comp = tga_comp;
+
+   if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
+      return stbi__errpuc("too large", "Corrupt TGA");
+
+   tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
+   if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
+
+   // skip to the data's starting position (offset usually = 0)
+   stbi__skip(s, tga_offset );
+
+   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
+      for (i=0; i < tga_height; ++i) {
+         int row = tga_inverted ? tga_height -i - 1 : i;
+         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
+         if (!stbi__getn(s, tga_row, tga_width * tga_comp)) { STBI_FREE(tga_data); return stbi__errpuc("outofdata", "Corrupt TGA"); } // a short read must not return unfilled rows
+      }
+   } else  {
+      //   do I need to load a palette?
+      if ( tga_indexed)
+      {
+         if (tga_palette_len == 0) {  /* you have to have at least one entry! */
+            STBI_FREE(tga_data);
+            return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+
+         //   any data to skip? (offset usually = 0)
+         stbi__skip(s, tga_palette_start );
+         //   load the palette
+         tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
+         if (!tga_palette) {
+            STBI_FREE(tga_data);
+            return stbi__errpuc("outofmem", "Out of memory");
+         }
+         if (tga_rgb16) {
+            stbi_uc *pal_entry = tga_palette;
+            STBI_ASSERT(tga_comp == STBI_rgb);
+            for (i=0; i < tga_palette_len; ++i) {
+               stbi__tga_read_rgb16(s, pal_entry);
+               pal_entry += tga_comp;
+            }
+         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
+               STBI_FREE(tga_data);
+               STBI_FREE(tga_palette);
+               return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+      }
+      //   load the data
+      for (i=0; i < tga_width * tga_height; ++i)
+      {
+         //   if I'm in RLE mode, do I need to get a RLE stbi__pngchunk?
+         if ( tga_is_RLE )
+         {
+            if ( RLE_count == 0 )
+            {
+               //   yep, get the next byte as a RLE command
+               int RLE_cmd = stbi__get8(s);
+               RLE_count = 1 + (RLE_cmd & 127); // low 7 bits: run length minus one
+               RLE_repeating = RLE_cmd >> 7; // top bit: one pixel repeated, versus a run of literal pixels
+               read_next_pixel = 1;
+            } else if ( !RLE_repeating )
+            {
+               read_next_pixel = 1;
+            }
+         } else
+         {
+            read_next_pixel = 1;
+         }
+         //   OK, if I need to read a pixel, do it now
+         if ( read_next_pixel )
+         {
+            //   load however much data we did have
+            if ( tga_indexed )
+            {
+               // read in index, then perform the lookup
+               int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
+               if ( pal_idx >= tga_palette_len ) {
+                  // invalid index
+                  pal_idx = 0;
+               }
+               pal_idx *= tga_comp;
+               for (j = 0; j < tga_comp; ++j) {
+                  raw_data[j] = tga_palette[pal_idx+j];
+               }
+            } else if(tga_rgb16) {
+               STBI_ASSERT(tga_comp == STBI_rgb);
+               stbi__tga_read_rgb16(s, raw_data);
+            } else {
+               //   read in the data raw
+               for (j = 0; j < tga_comp; ++j) {
+                  raw_data[j] = stbi__get8(s);
+               }
+            }
+            //   clear the reading flag for the next pixel
+            read_next_pixel = 0;
+         } // end of reading a pixel
+
+         // copy data
+         for (j = 0; j < tga_comp; ++j)
+           tga_data[i*tga_comp+j] = raw_data[j];
+
+         //   in case we're in RLE mode, keep counting down
+         --RLE_count;
+      }
+      //   do I need to invert the image?
+      if ( tga_inverted )
+      {
+         for (j = 0; j*2 < tga_height; ++j)
+         {
+            int index1 = j * tga_width * tga_comp;
+            int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
+            for (i = tga_width * tga_comp; i > 0; --i)
+            {
+               unsigned char temp = tga_data[index1];
+               tga_data[index1] = tga_data[index2];
+               tga_data[index2] = temp;
+               ++index1;
+               ++index2;
+            }
+         }
+      }
+      //   clear my palette, if I had one
+      if ( tga_palette != NULL )
+      {
+         STBI_FREE( tga_palette );
+      }
+   }
+
+   // swap RGB - if the source data was RGB16, it already is in the right order
+   if (tga_comp >= 3 && !tga_rgb16)
+   {
+      unsigned char* tga_pixel = tga_data;
+      for (i=0; i < tga_width * tga_height; ++i)
+      {
+         unsigned char temp = tga_pixel[0];
+         tga_pixel[0] = tga_pixel[2];
+         tga_pixel[2] = temp;
+         tga_pixel += tga_comp;
+      }
+   }
+
+   // convert to target component count
+   if (req_comp && req_comp != tga_comp)
+      tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height);
+
+   //   the things I do to get rid of an error message, and yet keep
+   //   Microsoft's C compilers happy... [8^(
+   tga_palette_start = tga_palette_len = tga_palette_bits =
+         tga_x_origin = tga_y_origin = 0;
+   STBI_NOTUSED(tga_palette_start);
+   //   OK, done
+   return tga_data;
+}
+#endif
+
+// *************************************************************************************************
+// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB
+
+#ifndef STBI_NO_PSD
+static int stbi__psd_test(stbi__context *s)
+{
+   int signature_ok = (0x38425053 == stbi__get32be(s)); // same big-endian tag stbi__psd_load checks ("8BPS")
+   stbi__rewind(s);                                     // probing must leave the stream where it started
+   return signature_ok;
+}
+
+static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
+{
+   // Decodes one RLE-compressed channel of pixelCount samples into p, writing
+   // every 4th byte (p advances by 4 per sample). Returns 1 on success, 0 if a
+   // run would produce more samples than remain.
+   int produced = 0;
+   int remaining, run;
+
+   for (;;) {
+      remaining = pixelCount - produced;
+      if (remaining <= 0) break;
+
+      run = stbi__get8(s);
+      if (run == 128) continue;   // control byte 128 is a no-op
+
+      if (run < 128) {
+         // Literal run: the next run+1 source bytes are copied as-is.
+         run += 1;
+         if (run > remaining) return 0; // corrupt data
+         produced += run;
+         for (; run != 0; --run, p += 4)
+            *p = stbi__get8(s);
+      } else {
+         // Replicated run: one source byte repeated 257-run times
+         // (i.e. -n+1 with the control byte read as a negative 8-bit int).
+         stbi_uc fill;
+         run = 257 - run;
+         if (run > remaining) return 0; // corrupt data
+         fill = stbi__get8(s);
+         produced += run;
+         for (; run != 0; --run, p += 4)
+            *p = fill;
+      }
+   }
+
+   return 1;
+}
+
+static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
+{
+   int pixelCount;                 // w*h; assigned after the output buffer is allocated
+   int channelCount, compression;  // header fields: channel count (0..16 accepted) and compression (0 raw, 1 RLE)
+   int channel, i;
+   int bitdepth;                   // bits per sample in the file; only 8 and 16 are accepted
+   int w,h;
+   stbi_uc *out;                   // 4-channel interleaved result buffer
+   STBI_NOTUSED(ri);
+   // Decodes an RGB-mode PSD read from s into a 4-channel image. On success writes the size to *x/*y and 4 to *comp (when comp is non-NULL) and returns the pixel buffer.
+   // Check identifier
+   if (stbi__get32be(s) != 0x38425053)   // "8BPS"
+      return stbi__errpuc("not PSD", "Corrupt PSD image");
+
+   // Check file type version.
+   if (stbi__get16be(s) != 1)
+      return stbi__errpuc("wrong version", "Unsupported version of PSD image");
+
+   // Skip 6 reserved bytes.
+   stbi__skip(s, 6 );
+
+   // Read the number of channels (R, G, B, A, etc).
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16)
+      return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image");
+
+   // Read the rows and columns of the image.
+   h = stbi__get32be(s);
+   w = stbi__get32be(s);
+
+   if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   // Make sure the depth is 8 or 16 bits.
+   bitdepth = stbi__get16be(s);
+   if (bitdepth != 8 && bitdepth != 16)
+      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
+
+   // Make sure the color mode is RGB.
+   // Valid options are:
+   //   0: Bitmap
+   //   1: Grayscale
+   //   2: Indexed color
+   //   3: RGB color
+   //   4: CMYK color
+   //   7: Multichannel
+   //   8: Duotone
+   //   9: Lab color
+   if (stbi__get16be(s) != 3)
+      return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
+
+   // Skip the Mode Data.  (It's the palette for indexed color; other info for other modes.)
+   stbi__skip(s,stbi__get32be(s) );
+
+   // Skip the image resources.  (resolution, pen tool paths, etc)
+   stbi__skip(s, stbi__get32be(s) );
+
+   // Skip the reserved data.
+   stbi__skip(s, stbi__get32be(s) );
+
+   // Find out if the data is compressed.
+   // Known values:
+   //   0: no compression
+   //   1: RLE compressed
+   compression = stbi__get16be(s);
+   if (compression > 1)
+      return stbi__errpuc("bad compression", "PSD has an unknown compression format");
+
+   // Check size: 4*w*h is validated here, before any allocation or w*h arithmetic below.
+   if (!stbi__mad3sizes_valid(4, w, h, 0))
+      return stbi__errpuc("too large", "Corrupt PSD");
+
+   // Create the destination image.
+   // 16-bit output is produced only for uncompressed 16-bit files when the caller asked for bpc == 16.
+   if (!compression && bitdepth == 16 && bpc == 16) {
+      out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0);
+      ri->bits_per_channel = 16;
+   } else
+      out = (stbi_uc *) stbi__malloc(4 * w*h);
+
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   pixelCount = w*h;
+
+   // Initialize the data to zero.
+   //memset( out, 0, pixelCount * 4 );
+
+   // Finally, the image data.
+   if (compression) {
+      // RLE as used by .PSD and .TIFF
+      // Loop until you get the number of unpacked bytes you are expecting:
+      //     Read the next source byte into n.
+      //     If n is between 0 and 127 inclusive, copy the next n+1 bytes literally.
+      //     Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times.
+      //     Else if n is 128, noop.
+      // Endloop
+
+      // The RLE-compressed data is preceded by a 2-byte data count for each row in the data,
+      // which we're going to just skip.
+      stbi__skip(s, h * channelCount * 2 );
+
+      // Read the RLE data by channel. Only the first 4 channels are decoded; each one is written to every 4th byte of out.
+      for (channel = 0; channel < 4; channel++) {
+         stbi_uc *p;
+
+         p = out+channel;
+         if (channel >= channelCount) {
+            // Fill this channel with default data (255 for channel 3, otherwise 0).
+            for (i = 0; i < pixelCount; i++, p += 4)
+               *p = (channel == 3 ? 255 : 0);
+         } else {
+            // Read the RLE data.
+            if (!stbi__psd_decode_rle(s, p, pixelCount)) {
+               STBI_FREE(out);
+               return stbi__errpuc("corrupt", "bad RLE data");
+            }
+         }
+      }
+
+   } else {
+      // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
+      // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
+
+      // Read the data by channel.
+      for (channel = 0; channel < 4; channel++) {
+         if (channel >= channelCount) {
+            // Fill this channel with default data (full value for channel 3, otherwise 0).
+            if (bitdepth == 16 && bpc == 16) {
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               stbi__uint16 val = channel == 3 ? 65535 : 0;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = val;
+            } else {
+               stbi_uc *p = out+channel;
+               stbi_uc val = channel == 3 ? 255 : 0;
+               for (i = 0; i < pixelCount; i++, p += 4)
+                  *p = val;
+            }
+         } else {
+            if (ri->bits_per_channel == 16) {    // output bpc
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = (stbi__uint16) stbi__get16be(s);
+            } else {
+               stbi_uc *p = out+channel;
+               if (bitdepth == 16) {  // input bpc: keep only the high byte of each 16-bit sample
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = (stbi_uc) (stbi__get16be(s) >> 8);
+               } else {
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = stbi__get8(s);
+               }
+            }
+         }
+      }
+   }
+
+   // remove weird white matte from PSD: for partially transparent pixels each colour c becomes c*ra + max*(1 - ra), ra = 1/(alpha/max)
+   if (channelCount >= 4) {
+      if (ri->bits_per_channel == 16) {
+         for (i=0; i < w*h; ++i) {
+            stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 65535) {
+               float a = pixel[3] / 65535.0f;
+               float ra = 1.0f / a;
+               float inv_a = 65535.0f * (1 - ra);
+               pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a);
+               pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a);
+               pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a);
+            }
+         }
+      } else {
+         for (i=0; i < w*h; ++i) {
+            unsigned char *pixel = out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 255) {
+               float a = pixel[3] / 255.0f;
+               float ra = 1.0f / a;
+               float inv_a = 255.0f * (1 - ra);
+               pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
+               pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
+               pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
+            }
+         }
+      }
+   }
+
+   // convert to desired output format
+   if (req_comp && req_comp != 4) {
+      if (ri->bits_per_channel == 16)
+         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h);
+      else
+         out = stbi__convert_format(out, 4, req_comp, w, h);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+
+   if (comp) *comp = 4; // always reported as 4, independent of channelCount
+   *y = h;
+   *x = w;
+
+   return out;
+}
+#endif
+
+// *************************************************************************************************
+// Softimage PIC loader
+// by Tom Seddon
+//
+// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format
+// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/
+
+#ifndef STBI_NO_PIC
+static int stbi__pic_is4(stbi__context *s,const char *str)
+{
+   // Consume up to 4 bytes, stopping at the first one that differs from str.
+   const char *end = str + 4;
+   while (str != end) {
+      if ((stbi_uc) *str++ != stbi__get8(s)) return 0;
+   }
+   return 1;
+}
+
+static int stbi__pic_test_core(stbi__context *s)
+{
+   // Layout probed here: a 4-byte magic, 84 bytes that are ignored, then the
+   // 4-byte tag "PICT". Returns 1 only if both markers match; the stream is
+   // left wherever reading stopped (stbi__pic_test rewinds it).
+   int skipped = 0;
+
+   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34"))
+      return 0;
+
+   while (skipped++ < 84)
+      stbi__get8(s);
+
+   return stbi__pic_is4(s,"PICT") ? 1 : 0;
+}
+
+typedef struct
+{  // One channel packet header as read by stbi__pic_load_core (one stream byte per field).
+   stbi_uc size,type,channel;   // size: must be 8; type: 0 raw, 1 pure RLE, 2 mixed RLE; channel: bit mask of components carried
+} stbi__pic_packet;
+
+static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest)
+{
+   // Bits 0x80,0x40,0x20,0x10 of channel select dest[0..3]; each selected slot
+   // is filled with the next stream byte. Returns dest, or the stbi__errpuc
+   // result if the stream is already at EOF when a byte is needed.
+   int slot;
+   for (slot = 0; slot < 4; ++slot) {
+      if ((channel & (0x80 >> slot)) == 0) continue;
+      if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short");
+      dest[slot] = stbi__get8(s);
+   }
+   return dest;
+}
+
+static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src)
+{
+   // Copy src[k] to dest[k] for each k in 0..3 whose bit (0x80 >> k) is set in channel.
+   int k;
+   for (k = 0; k < 4; ++k) {
+      if (channel & (0x80 >> k)) dest[k] = src[k];
+   }
+}
+
+static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result)
+{
+   int act_comp=0,num_packets=0,y,chained;   // act_comp: OR of all packet channel masks
+   stbi__pic_packet packets[10];             // at most 10 packet headers are accepted
+   // Reads the packet headers, then decodes height rows into result (4 bytes per pixel). Returns result, or 0 / the stbi__errpuc result on error; result is not freed here.
+   // this will (should...) cater for even some bizarre stuff like having data
+   // for the same channel in multiple packets.
+   do {
+      stbi__pic_packet *packet;
+
+      if (num_packets==sizeof(packets)/sizeof(packets[0]))
+         return stbi__errpuc("bad format","too many packets");
+
+      packet = &packets[num_packets++];
+
+      chained = stbi__get8(s);   // non-zero means another packet header follows
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+
+      act_comp |= packet->channel;
+
+      if (stbi__at_eof(s))          return stbi__errpuc("bad file","file too short (reading packets)");
+      if (packet->size != 8)  return stbi__errpuc("bad format","packet isn't 8bpp");
+   } while (chained);
+
+   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
+
+   for(y=0; y<height; ++y) {
+      int packet_idx;
+      // every packet is applied to every row, each starting again at the row's first pixel
+      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
+         stbi__pic_packet *packet = &packets[packet_idx];
+         stbi_uc *dest = result+y*width*4;
+
+         switch (packet->type) {
+            default:
+               return stbi__errpuc("bad format","packet has bad compression type");
+
+            case 0: {//uncompressed: one stbi__readval per pixel
+               int x;
+
+               for(x=0;x<width;++x, dest+=4)
+                  if (!stbi__readval(s,packet->channel,dest))
+                     return 0;
+               break;
+            }
+
+            case 1://Pure RLE: a count byte, then one value repeated count times
+               {
+                  int left=width, i;
+
+                  while (left>0) {
+                     stbi_uc count,value[4];
+
+                     count=stbi__get8(s);
+                     if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
+
+                     if (count > left)   // an over-long run is clamped to the row rather than rejected
+                        count = (stbi_uc) left;
+
+                     if (!stbi__readval(s,packet->channel,value))  return 0;
+
+                     for(i=0; i<count; ++i,dest+=4)
+                        stbi__copyval(packet->channel,dest,value);
+                     left -= count;
+                  }
+               }
+               break;
+
+            case 2: {//Mixed RLE: count >= 128 is a repeated value, count < 128 is count+1 raw pixels
+               int left=width;
+               while (left>0) {
+                  int count = stbi__get8(s), i;
+                  if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
+
+                  if (count >= 128) { // Repeated
+                     stbi_uc value[4];
+
+                     if (count==128)   // 128 means the real count follows as a 16-bit big-endian value
+                        count = stbi__get16be(s);
+                     else
+                        count -= 127;
+                     if (count > left)
+                        return stbi__errpuc("bad file","scanline overrun");
+
+                     if (!stbi__readval(s,packet->channel,value))
+                        return 0;
+
+                     for(i=0;i<count;++i, dest += 4)
+                        stbi__copyval(packet->channel,dest,value);
+                  } else { // Raw
+                     ++count;
+                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
+
+                     for(i=0;i<count;++i, dest+=4)
+                        if (!stbi__readval(s,packet->channel,dest))
+                           return 0;
+                  }
+                  left-=count;
+               }
+               break;
+            }
+         }
+      }
+   }
+
+   return result;
+}
+
+static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
+{
+   // Softimage PIC loader: decodes into an RGBA scratch buffer, then converts
+   // it to req_comp components (the count detected in the file if req_comp is
+   // 0). On success stores the size in *px/*py and returns the pixels; on a
+   // failure returns 0 or the stbi__errpuc result.
+   stbi_uc *result;
+   int i, x,y, internal_comp;
+   STBI_NOTUSED(ri);
+
+   if (!comp) comp = &internal_comp; // comp may be NULL; give stbi__pic_load_core somewhere to write
+   for (i=0; i<92; ++i)              // read and discard the 92 bytes before width/height
+      stbi__get8(s);
+
+   x = stbi__get16be(s);
+   y = stbi__get16be(s);
+   if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
+   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
+
+   stbi__get32be(s); //skip `ratio'
+   stbi__get16be(s); //skip `fields'
+   stbi__get16be(s); //skip `pad'
+
+   // intermediate buffer is RGBA, pre-filled with 0xff for bytes the file never writes
+   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
+   memset(result, 0xff, x*y*4);
+   // A failed decode must stop here: *comp may not have been written yet, and
+   // the freed buffer must not be handed on to stbi__convert_format.
+   if (!stbi__pic_load_core(s,x,y,comp, result)) {
+      STBI_FREE(result);
+      return 0;
+   }
+   *px = x;
+   *py = y;
+   if (req_comp == 0) req_comp = *comp;
+   return stbi__convert_format(result,4,req_comp,x,y);
+}
+
+static int stbi__pic_test(stbi__context *s)
+{
+   int looks_like_pic = stbi__pic_test_core(s); // consumes header bytes while probing
+   stbi__rewind(s);                             // so put the stream back afterwards
+   return looks_like_pic;
+}
+#endif
+
+// *************************************************************************************************
+// GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
+
+#ifndef STBI_NO_GIF
+typedef struct
+{  // One entry of the LZW code table used by stbi__process_gif_raster.
+   stbi__int16 prefix;   // code that precedes this one in the chain, or -1 for a root code
+   stbi_uc first;        // inherited from the prefix code's first; equals the code itself for root codes
+   stbi_uc suffix;       // colour-table index emitted for this code by stbi__out_gif_code
+} stbi__gif_lzw;
+
+typedef struct
+{  // Decoder state shared by the GIF functions; zero-initialised by the loaders before the first frame.
+   int w,h;                      // image size read by stbi__gif_header
+   stbi_uc *out;                 // output buffer (always 4 components)
+   stbi_uc *background;          // The current "background" as far as a gif is concerned
+   stbi_uc *history;             // one byte per pixel, set to 1 for pixels written during the current frame
+   int flags, bgindex, ratio, transparent, eflags;   // header bytes; transparent is a palette index or -1; eflags comes from the graphic control extension
+   stbi_uc  pal[256][4];         // global colour table
+   stbi_uc lpal[256][4];         // local colour table of the current image descriptor
+   stbi__gif_lzw codes[8192];    // LZW code table
+   stbi_uc *color_table;         // points at pal or lpal, whichever applies to the current frame
+   int parse, step;              // interlace state: passes still to start, and current row step in bytes
+   int lflags;                   // flags byte of the current image descriptor
+   int start_x, start_y;         // frame rectangle as byte offsets into out (x*4 and y*line_size)
+   int max_x, max_y;             // exclusive end of the rectangle, same units
+   int cur_x, cur_y;             // write position, same units
+   int line_size;                // bytes per output row (w * 4)
+   int delay;                    // frame delay, stored as 10 * the 16-bit value from the stream
+} stbi__gif;
+
+static int stbi__gif_test_raw(stbi__context *s)
+{
+   // Accepts the signatures "GIF87a" and "GIF89a"; reading stops at the first mismatching byte.
+   static const char prefix[4] = { 'G', 'I', 'F', '8' };
+   int k, ver;
+   for (k = 0; k < 4; ++k) if (stbi__get8(s) != prefix[k]) return 0;
+   ver = stbi__get8(s);
+   return (ver == '7' || ver == '9') && stbi__get8(s) == 'a';
+}
+
+static int stbi__gif_test(stbi__context *s)
+{
+   int is_gif = stbi__gif_test_raw(s); // reads up to 6 signature bytes
+   stbi__rewind(s);                    // and the stream is put back afterwards
+   return is_gif;
+}
+
+static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp)
+{
+   // Each entry is 3 stream bytes stored in reverse order (slots 2,1,0); slot 3 is 0 for index transp, else 255.
+   int n, ch;
+   for (n = 0; n < num_entries; ++n) {
+      stbi_uc *entry = pal[n];
+      for (ch = 2; ch >= 0; --ch) entry[ch] = stbi__get8(s);
+      entry[3] = (stbi_uc) ((n == transp) ? 0 : 255);
+   }
+}
+
+static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info)
+{
+   // Parses the 6-byte signature and the next 7 bytes (width, height, flags,
+   // background index, ratio) into g. Returns 1 on success, else the stbi__err
+   // result. Unless is_info is set, a flagged global colour table is read too.
+   int ver;
+
+   if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
+      return stbi__err("not GIF", "Corrupt GIF");
+   ver = stbi__get8(s);
+   if (!(ver == '7' || ver == '9'))         return stbi__err("not GIF", "Corrupt GIF");
+   if (stbi__get8(s) != 'a')                return stbi__err("not GIF", "Corrupt GIF");
+
+   stbi__g_failure_reason = "";
+   g->w = stbi__get16le(s);
+   g->h = stbi__get16le(s);
+   g->flags = stbi__get8(s);
+   g->bgindex = stbi__get8(s);
+   g->ratio = stbi__get8(s);
+   g->transparent = -1;
+
+   if (g->w > STBI_MAX_DIMENSIONS || g->h > STBI_MAX_DIMENSIONS)
+      return stbi__err("too large","Very large image (corrupt?)");
+
+   if (comp) *comp = 4;  // can't actually tell whether it's 3 or 4 until we parse the comments
+
+   if (!is_info && (g->flags & 0x80))
+      stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1);
+   return 1;
+}
+
+static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
+{
+   // Header-only probe: reports width/height (and comp via stbi__gif_header)
+   // without decoding. The stream is rewound here only when the header is bad.
+   int ok;
+   stbi__gif *hdr = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
+   if (hdr == NULL) return stbi__err("outofmem", "Out of memory");
+
+   ok = stbi__gif_header(s, hdr, comp, 1) ? 1 : 0;
+   if (ok) { if (x) *x = hdr->w; if (y) *y = hdr->h; }
+   STBI_FREE(hdr);
+   if (!ok) stbi__rewind( s );
+   return ok;
+}
+
+static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
+{
+   const stbi__gif_lzw *entry = &g->codes[code];
+   const stbi_uc *rgba;
+   stbi_uc *dst;
+   int offset;
+
+   // The prefix chain is stored last-to-first, so emit the prefix
+   // recursively before this code's own pixel.
+   if (entry->prefix >= 0)
+      stbi__out_gif_code(g, entry->prefix);
+
+   // Past the bottom of the frame rectangle: drop the pixel.
+   if (g->cur_y >= g->max_y) return;
+
+   // cur_x and cur_y are byte offsets into g->out (4 bytes per pixel).
+   offset = g->cur_x + g->cur_y;
+   dst = g->out + offset;
+   g->history[offset / 4] = 1;
+
+   rgba = g->color_table + entry->suffix * 4;
+   if (rgba[3] > 128) { // don't render transparent pixels;
+      dst[0] = rgba[2]; dst[1] = rgba[1]; dst[2] = rgba[0]; dst[3] = rgba[3];
+   }
+
+   g->cur_x += 4;
+   if (g->cur_x < g->max_x) return;
+
+   // Row finished: wrap to the start column and step down; while the position
+   // is past the bottom and parse > 0, start the next pass from the top.
+   g->cur_x = g->start_x;
+   for (g->cur_y += g->step; g->cur_y >= g->max_y && g->parse > 0; --g->parse) {
+      g->step = (1 << g->parse) * g->line_size;
+      g->cur_y = g->start_y + (g->step >> 1);
+   }
+}
+
+static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
+{
+   stbi_uc lzw_cs;                   // code-size byte from the stream; the initial code size is lzw_cs + 1
+   stbi__int32 len, init_code;       // len: bytes left in the current data sub-block
+   stbi__uint32 first;               // stays 1 until a clear code has been seen
+   stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
+   stbi__gif_lzw *p;
+   // Decodes the LZW data of one frame, emitting pixels through stbi__out_gif_code. Returns g->out on success; NULL or the stbi__errpuc result on corrupt data.
+   lzw_cs = stbi__get8(s);
+   if (lzw_cs > 12) return NULL;
+   clear = 1 << lzw_cs;              // value of the clear code; clear + 1 is the end-of-stream code
+   first = 1;
+   codesize = lzw_cs + 1;
+   codemask = (1 << codesize) - 1;
+   bits = 0;
+   valid_bits = 0;
+   for (init_code = 0; init_code < clear; init_code++) {   // root codes: each expands to its own value
+      g->codes[init_code].prefix = -1;
+      g->codes[init_code].first = (stbi_uc) init_code;
+      g->codes[init_code].suffix = (stbi_uc) init_code;
+   }
+
+   // support no starting clear code
+   avail = clear+2;                  // next free table slot (clear and clear + 1 are reserved)
+   oldcode = -1;
+
+   len = 0;
+   for(;;) {
+      if (valid_bits < codesize) {   // not enough bits buffered for a whole code: pull in one more byte
+         if (len == 0) {
+            len = stbi__get8(s); // start new block
+            if (len == 0)        // a zero-length block ends the data
+               return g->out;
+         }
+         --len;
+         bits |= (stbi__int32) stbi__get8(s) << valid_bits;
+         valid_bits += 8;
+      } else {
+         stbi__int32 code = bits & codemask;
+         bits >>= codesize;
+         valid_bits -= codesize;
+         // @OPTIMIZE: is there some way we can accelerate the non-clear path?
+         if (code == clear) {  // clear code
+            codesize = lzw_cs + 1;
+            codemask = (1 << codesize) - 1;
+            avail = clear + 2;
+            oldcode = -1;
+            first = 0;
+         } else if (code == clear + 1) { // end of stream code
+            stbi__skip(s, len);          // discard the rest of this block and every following block up to the zero-length one
+            while ((len = stbi__get8(s)) > 0)
+               stbi__skip(s,len);
+            return g->out;
+         } else if (code <= avail) {
+            if (first) {
+               return stbi__errpuc("no clear code", "Corrupt GIF");
+            }
+
+            if (oldcode >= 0) {
+               p = &g->codes[avail++];   // new entry = previous string + one byte; p is only written after the bounds check below
+               if (avail > 8192) {
+                  return stbi__errpuc("too many codes", "Corrupt GIF");
+               }
+
+               p->prefix = (stbi__int16) oldcode;
+               p->first = g->codes[oldcode].first;
+               p->suffix = (code == avail) ? p->first : g->codes[code].first;
+            } else if (code == avail)    // a not-yet-defined code cannot come directly after a clear
+               return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+
+            stbi__out_gif_code(g, (stbi__uint16) code);
+
+            if ((avail & codemask) == 0 && avail <= 0x0FFF) {   // table filled the current width: widen codes by one bit
+               codesize++;
+               codemask = (1 << codesize) - 1;
+            }
+
+            oldcode = code;
+         } else {
+            return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+         }
+      }
+   }
+}
+
+// Decodes the next frame of a (possibly animated) GIF into g->out. Returns g->out for a decoded frame, (stbi_uc *) s at the
+// stream terminator, and 0/NULL on error. two_back is the image from two frames ago, used only for disposal method 3.
+static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back)
+{
+   int dispose;       // disposal method taken from bits 2..4 of g->eflags
+   int first_frame;   // 1 when this call allocated the buffers
+   int pi;            // pixel index
+   int pcount;        // g->w * g->h
+   STBI_NOTUSED(req_comp);
+
+   // on first frame, any non-written pixels get the background colour (non-transparent)
+   first_frame = 0;
+   if (g->out == 0) {
+      if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header
+      if (!stbi__mad3sizes_valid(4, g->w, g->h, 0))
+         return stbi__errpuc("too large", "GIF image is too large");
+      pcount = g->w * g->h;
+      g->out = (stbi_uc *) stbi__malloc(4 * pcount);
+      g->background = (stbi_uc *) stbi__malloc(4 * pcount);
+      g->history = (stbi_uc *) stbi__malloc(pcount);
+      if (!g->out || !g->background || !g->history)   // nothing is freed here; the buffers stay in g
+         return stbi__errpuc("outofmem", "Out of memory");
+
+      // image is treated as "transparent" at the start - ie, nothing overwrites the current background;
+      // background colour is only used for pixels that are not rendered first frame, after that "background"
+      // color refers to the color that was there the previous frame.
+      memset(g->out, 0x00, 4 * pcount);
+      memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent)
+      memset(g->history, 0x00, pcount);        // pixels that were affected previous frame
+      first_frame = 1;
+   } else {
+      // second frame - how do we dispose of the previous one?
+      dispose = (g->eflags & 0x1C) >> 2;
+      pcount = g->w * g->h;
+
+      if ((dispose == 3) && (two_back == 0)) {
+         dispose = 2; // if I don't have an image to revert back to, default to the old background
+      }
+
+      if (dispose == 3) { // use previous graphic
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 );
+            }
+         }
+      } else if (dispose == 2) {
+         // restore what was changed last frame to background before that frame;
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 );
+            }
+         }
+      } else {
+         // This is a non-disposal case either way, so just
+         // leave the pixels as is, and they will become the new background
+         // 1: do not dispose
+         // 0:  not specified.
+      }
+
+      // background is what out is after the undoing of the previous frame;
+      memcpy( g->background, g->out, 4 * g->w * g->h );
+   }
+
+   // clear my history;
+   memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
+
+   for (;;) {
+      int tag = stbi__get8(s);   // block introducer: 0x2C image, 0x21 extension, 0x3B terminator
+      switch (tag) {
+         case 0x2C: /* Image Descriptor */
+         {
+            stbi__int32 x, y, w, h;
+            stbi_uc *o;
+
+            x = stbi__get16le(s);
+            y = stbi__get16le(s);
+            w = stbi__get16le(s);
+            h = stbi__get16le(s);
+            if (((x + w) > (g->w)) || ((y + h) > (g->h)))   // frame rectangle must lie inside the image
+               return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
+
+            g->line_size = g->w * 4;
+            g->start_x = x * 4;
+            g->start_y = y * g->line_size;
+            g->max_x   = g->start_x + w * 4;
+            g->max_y   = g->start_y + h * g->line_size;
+            g->cur_x   = g->start_x;
+            g->cur_y   = g->start_y;
+
+            // if the width of the specified rectangle is 0, that means
+            // we may not see *any* pixels or the image is malformed;
+            // to make sure this is caught, move the current y down to
+            // max_y (which is what out_gif_code checks).
+            if (w == 0)
+               g->cur_y = g->max_y;
+
+            g->lflags = stbi__get8(s);
+
+            if (g->lflags & 0x40) {   // interlaced
+               g->step = 8 * g->line_size; // first interlaced spacing
+               g->parse = 3;
+            } else {
+               g->step = g->line_size;
+               g->parse = 0;
+            }
+
+            if (g->lflags & 0x80) {   // a local colour table follows and takes precedence over the global one
+               stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
+               g->color_table = (stbi_uc *) g->lpal;
+            } else if (g->flags & 0x80) {
+               g->color_table = (stbi_uc *) g->pal;
+            } else
+               return stbi__errpuc("missing color table", "Corrupt GIF");
+
+            o = stbi__process_gif_raster(s, g);
+            if (!o) return NULL;
+
+            // if this was the first frame,
+            pcount = g->w * g->h;
+            if (first_frame && (g->bgindex > 0)) {
+               // if first frame, any pixel not drawn to gets the background color
+               for (pi = 0; pi < pcount; ++pi) {
+                  if (g->history[pi] == 0) {
+                     g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be;
+                     memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 );
+                  }
+               }
+            }
+
+            return o;
+         }
+
+         case 0x21: // Extension block: only the Graphic Control Extension (0xF9) is interpreted, anything else is skipped.
+         {
+            int len;
+            int ext = stbi__get8(s);
+            if (ext == 0xF9) { // Graphic Control Extension.
+               len = stbi__get8(s);
+               if (len == 4) {
+                  g->eflags = stbi__get8(s);
+                  g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
+
+                  // unset old transparent
+                  if (g->transparent >= 0) {
+                     g->pal[g->transparent][3] = 255;
+                  }
+                  if (g->eflags & 0x01) {
+                     g->transparent = stbi__get8(s);
+                     if (g->transparent >= 0) {
+                        g->pal[g->transparent][3] = 0;
+                     }
+                  } else {
+                     // don't need transparent
+                     stbi__skip(s, 1);
+                     g->transparent = -1;
+                  }
+               } else {
+                  stbi__skip(s, len);
+                  break;
+               }
+            }
+            while ((len = stbi__get8(s)) != 0) {   // skip the remaining sub-blocks up to the zero-length one
+               stbi__skip(s, len);
+            }
+            break;
+         }
+
+         case 0x3B: // gif stream termination code
+            return (stbi_uc *) s; // using '1' causes warning on some compilers
+
+         default:
+            return stbi__errpuc("unknown code", "Corrupt GIF");
+      }
+   }
+}
+
+static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays)
+{
+   // Out-of-memory exit for stbi__load_gif_main: frees the decoder buffers, the frames and the delay array, then reports "outofmem".
+   STBI_FREE(g->out);
+   STBI_FREE(g->history);
+   STBI_FREE(g->background);
+   if (out) STBI_FREE(out);
+   if (delays && *delays) { STBI_FREE(*delays); *delays = 0; } // clear it so the caller is not left with a dangling pointer
+   return stbi__errpuc("outofmem", "Out of memory");
+}
+
+static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{
+   if (stbi__gif_test(s)) {
+      int layers = 0;          // number of frames stored in out so far
+      stbi_uc *u = 0;          // frame returned by the last stbi__gif_load_next call
+      stbi_uc *out = 0;        // all decoded frames back to back, stride bytes each
+      stbi_uc *two_back = 0;   // frame two before the one decoded next, inside out (disposal method 3)
+      stbi__gif g;
+      int stride;              // bytes per frame: w * h * 4
+      int out_size = 0;
+      int delays_size = 0;
+      // Loads every frame of a GIF. Returns the frames (converted to req_comp if requested) and stores the frame count in *z; when delays is non-NULL, *delays receives one delay per frame.
+      STBI_NOTUSED(out_size);
+      STBI_NOTUSED(delays_size);
+
+      memset(&g, 0, sizeof(g));
+      if (delays) {
+         *delays = 0;
+      }
+
+      do {
+         u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
+         if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
+
+         if (u) {
+            *x = g.w;
+            *y = g.h;
+            ++layers;
+            stride = g.w * g.h * 4;
+
+            if (out) {
+               void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride );
+               if (!tmp)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
+               else {
+                   out = (stbi_uc*) tmp;
+                   out_size = layers * stride;
+               }
+
+               if (delays) {
+                  int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers );
+                  if (!new_delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
+                  *delays = new_delays;
+                  delays_size = layers * sizeof(int);
+               }
+            } else {
+               out = (stbi_uc*)stbi__malloc( layers * stride );
+               if (!out)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
+               out_size = layers * stride;
+               if (delays) {
+                  *delays = (int*) stbi__malloc( layers * sizeof(int) );
+                  if (!*delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
+                  delays_size = layers * sizeof(int);
+               }
+            }
+            memcpy( out + ((layers - 1) * stride), u, stride );
+            if (layers >= 2) { // recomputed every frame because the realloc above may have moved out
+               two_back = out + (layers - 2) * stride; // frame layers-2 lies inside out; an offset below out would be read out of bounds
+            }
+
+            if (delays) {
+               (*delays)[layers - 1U] = g.delay;
+            }
+         }
+      } while (u != 0);   // stops at the terminator or at the first frame that fails to decode
+
+      // free temp buffer;
+      STBI_FREE(g.out);
+      STBI_FREE(g.history);
+      STBI_FREE(g.background);
+
+      // do the final conversion after loading everything;
+      if (req_comp && req_comp != 4)
+         out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
+
+      *z = layers;
+      return out;
+   } else {
+      return stbi__errpuc("not GIF", "Image was not as a gif type.");
+   }
+}
+
+// Single-frame GIF entry point: decodes one frame with stbi__gif_load_next and returns its pixels.
+static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi__gif gif;
+   stbi_uc *frame;
+   STBI_NOTUSED(ri);
+   memset(&gif, 0, sizeof(gif));
+
+   frame = stbi__gif_load_next(s, &gif, comp, req_comp, 0);
+   if (frame == (stbi_uc *) s)
+      frame = 0;  // the context pointer doubles as the end-of-animation marker
+
+   if (!frame) {
+      // nothing usable came back: drop the image buffer if one was allocated
+      if (gif.out) STBI_FREE(gif.out);
+   } else {
+      *x = gif.w;
+      *y = gif.h;
+      // convert only after a successful load, from 4 channels to the requested count
+      if (req_comp != 0 && req_comp != 4)
+         frame = stbi__convert_format(frame, 4, req_comp, gif.w, gif.h);
+   }
+
+   // the history/background buffers are released on every path
+   STBI_FREE(gif.history);
+   STBI_FREE(gif.background);
+   return frame;
+}
+
+static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
+{  // thin wrapper: forwards all four arguments to stbi__gif_info_raw and returns its result unchanged
+   return stbi__gif_info_raw(s,x,y,comp);
+}
+#endif
+
+// *************************************************************************************************
+// Radiance RGBE HDR loader
+// originally by Nicolas Schulz
+#ifndef STBI_NO_HDR
+// Returns 1 (after rewinding) if the next bytes of s spell `signature`; returns 0 at the first mismatch.
+static int stbi__hdr_test_core(stbi__context *s, const char *signature)
+{
+   const char *p = signature;
+   while (*p != '\0')
+      if (stbi__get8(s) != *p++) return 0;   // mismatch: no rewind happens on this path
+   stbi__rewind(s);
+   return 1;
+}
+
+// Probes for either accepted HDR signature line; the stream is rewound after every probe.
+static int stbi__hdr_test(stbi__context* s)
+{
+   int matched = stbi__hdr_test_core(s, "#?RADIANCE\n");
+   stbi__rewind(s);
+   if (matched) return matched;
+   matched = stbi__hdr_test_core(s, "#?RGBE\n");   // alternate signature
+   stbi__rewind(s);
+   return matched;
+}
+
+#define STBI__HDR_BUFLEN  1024
+// Reads one '\n'-terminated line from z into buffer (at most STBI__HDR_BUFLEN-1 chars, NUL-terminated).
+static char *stbi__hdr_gettoken(stbi__context *z, char *buffer)
+{
+   int used = 0;
+   char ch = (char) stbi__get8(z);
+
+   for (;;) {
+      if (stbi__at_eof(z) || ch == '\n')
+         break;
+      buffer[used++] = ch;
+      if (used == STBI__HDR_BUFLEN-1) {
+         // line is longer than the buffer: discard the rest of it
+         while (!stbi__at_eof(z) && stbi__get8(z) != '\n') { }
+         break;
+      }
+      ch = (char) stbi__get8(z);
+   }
+
+   buffer[used] = 0;
+   return buffer;   // always the caller's buffer
+}
+
+// Expands one 4-byte RGBE pixel into req_comp floats: averaged grey (1-2) or RGB (3-4), extra channel = 1.
+static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
+{
+   float scale;
+   if (input[3] == 0) {
+      // zero exponent byte: colour channels become 0, the extra channel (2 or 4 comps) is still 1
+      if (req_comp == 4 || req_comp == 3) {
+         if (req_comp == 4) output[3] = 1;
+         output[0] = output[1] = output[2] = 0;
+      } else if (req_comp == 2 || req_comp == 1) {
+         if (req_comp == 2) output[1] = 1;
+         output[0] = 0;
+      }
+      return;
+   }
+   scale = (float) ldexp(1.0f, input[3] - (int)(128 + 8));   // 2^(exponent byte - 136), shared by all channels
+   if (req_comp > 2) {
+      output[0] = input[0] * scale;
+      output[1] = input[1] * scale;
+      output[2] = input[2] * scale;
+   } else {
+      output[0] = (input[0] + input[1] + input[2]) * scale / 3;
+   }
+   if (req_comp == 2) output[1] = 1;
+   if (req_comp == 4) output[3] = 1;
+}
+
+static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{  // Decodes a Radiance RGBE stream into a width*height*req_comp float buffer; errors return stbi__errpf(...).
+   char buffer[STBI__HDR_BUFLEN];
+   char *token;
+   int valid = 0;
+   int width, height;
+   stbi_uc *scanline;
+   float *hdr_data;
+   int len;
+   unsigned char count, value;
+   int i, j, k, c1,c2, z;
+   const char *headerToken;
+   STBI_NOTUSED(ri);
+
+   // Check identifier
+   headerToken = stbi__hdr_gettoken(s,buffer);
+   if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
+      return stbi__errpf("not HDR", "Corrupt HDR image");
+
+   // Parse header
+   for(;;) {
+      token = stbi__hdr_gettoken(s,buffer);
+      if (token[0] == 0) break; // an empty line ends the header
+      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
+   }
+
+   if (!valid)    return stbi__errpf("unsupported format", "Unsupported HDR format");
+
+   // Parse width and height
+   // can't use sscanf() if we're not using stdio!
+   token = stbi__hdr_gettoken(s,buffer);
+   if (strncmp(token, "-Y ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   height = (int) strtol(token, &token, 10);
+   while (*token == ' ') ++token;
+   if (strncmp(token, "+X ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   width = (int) strtol(token, NULL, 10);
+
+   if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+   if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+
+   *x = width;
+   *y = height;
+
+   if (comp) *comp = 3; // the file's own channel count is always reported as 3
+   if (req_comp == 0) req_comp = 3;
+
+   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
+      return stbi__errpf("too large", "HDR image is too large");
+
+   // Read data
+   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
+   if (!hdr_data)
+      return stbi__errpf("outofmem", "Out of memory");
+
+   // Load image data
+   // image data is stored as some number of scanlines (flat RGBE pixels, or run-length encoded per scanline)
+   if ( width < 8 || width >= 32768) {
+      // Read flat data
+      for (j=0; j < height; ++j) {
+         for (i=0; i < width; ++i) {
+            stbi_uc rgbe[4];
+           main_decode_loop: // also entered by goto from the RLE branch below, with i and j preset
+            stbi__getn(s, rgbe, 4);
+            stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
+         }
+      }
+   } else {
+      // Read RLE-encoded data
+      scanline = NULL;
+
+      for (j = 0; j < height; ++j) {
+         c1 = stbi__get8(s);
+         c2 = stbi__get8(s);
+         len = stbi__get8(s);
+         if (c1 != 2 || c2 != 2 || (len & 0x80)) {
+            // not run-length encoded, so we have to actually use THIS data as a decoded
+            // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
+            stbi_uc rgbe[4];
+            rgbe[0] = (stbi_uc) c1;
+            rgbe[1] = (stbi_uc) c2;
+            rgbe[2] = (stbi_uc) len;
+            rgbe[3] = (stbi_uc) stbi__get8(s);
+            stbi__hdr_convert(hdr_data, rgbe, req_comp); // always written to pixel 0, whatever j currently is
+            i = 1;
+            j = 0;
+            STBI_FREE(scanline);
+            goto main_decode_loop; // yes, this makes no sense
+         }
+         len <<= 8;
+         len |= stbi__get8(s); // 16-bit big-endian scanline length; must equal width
+         if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
+         if (scanline == NULL) {
+            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
+            if (!scanline) {
+               STBI_FREE(hdr_data);
+               return stbi__errpf("outofmem", "Out of memory");
+            }
+         }
+
+         for (k = 0; k < 4; ++k) { // each of the 4 bytes of a pixel is stored as its own run-length stream
+            int nleft;
+            i = 0;
+            while ((nleft = width - i) > 0) {
+               count = stbi__get8(s);
+               if (count > 128) {
+                  // Run
+                  value = stbi__get8(s);
+                  count -= 128;
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = value;
+               } else {
+                  // Dump
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = stbi__get8(s);
+               }
+            }
+         }
+         for (i=0; i < width; ++i)
+            stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
+      }
+      if (scanline)
+         STBI_FREE(scanline);
+   }
+
+   return hdr_data;
+}
+
+// Header-only HDR probe: fills *x, *y and *comp (always 3) and returns 1, or rewinds and returns 0.
+static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   char line[STBI__HDR_BUFLEN];
+   char *cursor;
+   int have_format = 0;
+   int scratch;   // sink for any output pointer the caller passed as NULL
+
+   if (x == NULL) x = &scratch;
+   if (y == NULL) y = &scratch;
+   if (comp == NULL) comp = &scratch;
+
+   if (!stbi__hdr_test(s))
+      goto not_hdr;
+
+   // scan header lines up to the first empty one, looking for the only supported FORMAT
+   while ((cursor = stbi__hdr_gettoken(s,line))[0] != 0) {
+      if (strcmp(cursor, "FORMAT=32-bit_rle_rgbe") == 0)
+         have_format = 1;
+   }
+   if (!have_format)
+      goto not_hdr;
+
+   // resolution line: "-Y <height> +X <width>"
+   cursor = stbi__hdr_gettoken(s,line);
+   if (strncmp(cursor, "-Y ", 3) != 0)
+      goto not_hdr;
+   cursor += 3;
+   *y = (int) strtol(cursor, &cursor, 10);
+   while (*cursor == ' ')
+      ++cursor;
+   if (strncmp(cursor, "+X ", 3) != 0)
+      goto not_hdr;
+   cursor += 3;
+   *x = (int) strtol(cursor, NULL, 10);
+   *comp = 3;
+   return 1;
+
+   // every rejection path restores the stream position before reporting failure
+not_hdr:
+   stbi__rewind( s );
+   return 0;
+}
+#endif // STBI_NO_HDR
+
+#ifndef STBI_NO_BMP
+// Header-only BMP probe: reports the size and a channel count of 3 or 4; rewinds and returns 0 on failure.
+static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__bmp_data hdr;
+
+   hdr.all_a = 255;
+   // a NULL result from the header parser is treated as failure: restore the stream position
+   if (stbi__bmp_parse_header(s, &hdr) == NULL) {
+      stbi__rewind( s );
+      return 0;
+   }
+
+   if (x) *x = s->img_x;
+   if (y) *y = s->img_y;
+   if (comp) {
+      // a 24-bit file carrying the 0xff000000 mask is still reported as 3 channels
+      int opaque24 = (hdr.bpp == 24 && hdr.ma == 0xff000000);
+      *comp = (opaque24 || !hdr.ma) ? 3 : 4;
+   }
+   return 1;
+}
+#endif
+
+#ifndef STBI_NO_PSD
+// Header-only PSD probe: validates the fixed header fields, then reports the size and 4 channels.
+static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int channels, bits, unused;
+   if (x == 0) x = &unused;
+   if (y == 0) y = &unused;
+   if (comp == 0) comp = &unused;
+
+   if (stbi__get32be(s) != 0x38425053)   // the bytes '8','B','P','S'
+      goto reject;
+   if (stbi__get16be(s) != 1)            // only the value 1 is accepted in this field
+      goto reject;
+   stbi__skip(s, 6);
+
+   channels = stbi__get16be(s);
+   if (channels < 0 || channels > 16)
+      goto reject;
+
+   *y = stbi__get32be(s);                // height is read before width
+   *x = stbi__get32be(s);
+
+   bits = stbi__get16be(s);
+   if (!(bits == 8 || bits == 16))
+      goto reject;
+   if (stbi__get16be(s) != 3)            // only the value 3 is accepted in this field
+      goto reject;
+
+   *comp = 4;
+   return 1;
+
+reject:
+   stbi__rewind( s );
+   return 0;
+}
+
+// Returns 1 only for a PSD header whose depth field is 16; otherwise rewinds the stream and returns 0.
+static int stbi__psd_is16(stbi__context *s)
+{
+   int channelCount, depth;
+
+   if (stbi__get32be(s) != 0x38425053)
+      goto reject;
+   if (stbi__get16be(s) != 1)
+      goto reject;
+   stbi__skip(s, 6);
+
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16)
+      goto reject;
+
+   (void) stbi__get32be(s);   // skip the two 32-bit size fields (stbi__psd_info reads them as *y, *x); plain casts,
+   (void) stbi__get32be(s);   // because STBI_NOTUSED may expand to an unevaluated sizeof (TODO confirm macro)
+   depth = stbi__get16be(s);
+   if (depth != 16)
+      goto reject;
+   return 1;
+
+reject:
+   stbi__rewind( s );
+   return 0;
+}
+#endif
+
+#ifndef STBI_NO_PIC
+// Header-only PIC probe: checks the magic, reads the size, then walks the packet chain for the channel count.
+static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int act_comp=0,num_packets=0,chained,dummy;
+   stbi__pic_packet packets[10];
+
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
+      stbi__rewind(s);
+      return 0;
+   }
+
+   stbi__skip(s, 88);
+
+   *x = stbi__get16be(s);
+   *y = stbi__get16be(s);
+   if (stbi__at_eof(s)) {
+      stbi__rewind( s);
+      return 0;
+   }
+   if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) {   // size guard: height may not exceed (1<<28)/width
+      stbi__rewind( s );
+      return 0;
+   }
+
+   stbi__skip(s, 8);
+
+   do {
+      stbi__pic_packet *packet;
+      if (num_packets==sizeof(packets)/sizeof(packets[0])) {
+         stbi__rewind( s );   // packet table is full: rewind like every other rejection path here
+         return 0;
+      }
+      packet = &packets[num_packets++];
+      chained = stbi__get8(s);
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+      act_comp |= packet->channel;
+
+      if (stbi__at_eof(s)) {
+          stbi__rewind( s );
+          return 0;
+      }
+      if (packet->size != 8) {
+          stbi__rewind( s );
+          return 0;
+      }
+   } while (chained);
+
+   *comp = (act_comp & 0x10 ? 4 : 3);   // bit 0x10 in the OR of all packet channel masks selects 4 components
+   return 1;
+}
+#endif
+
+// *************************************************************************************************
+// Portable Gray Map and Portable Pixel Map loader
+// by Ken Miller
+//
+// PGM: http://netpbm.sourceforge.net/doc/pgm.html
+// PPM: http://netpbm.sourceforge.net/doc/ppm.html
+//
+// Known limitations:
+//    Header comments ('#' to end of line) are skipped only between header fields
+//    Does not support ASCII image data (formats P2 and P3)
+
+#ifndef STBI_NO_PNM
+
+// Accepts only the two-byte magics "P5" and "P6"; rewinds the stream when the magic does not match.
+static int      stbi__pnm_test(stbi__context *s)
+{
+   char magic = (char) stbi__get8(s);
+   char kind  = (char) stbi__get8(s);
+
+   if (magic == 'P' && (kind == '5' || kind == '6'))
+      return 1;   // note: no rewind on the success path
+   stbi__rewind( s );
+   return 0;
+}
+
+// Loads a P5/P6 image: parses the header via stbi__pnm_info, reads the raster, then converts channels.
+static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *pixels;
+   int bytes_per_chan;
+   STBI_NOTUSED(ri);
+
+   ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n);
+   if (ri->bits_per_channel == 0)
+      return 0;
+   bytes_per_chan = ri->bits_per_channel / 8;
+
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   *x = s->img_x;
+   *y = s->img_y;
+   if (comp) *comp = s->img_n;
+
+   if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, bytes_per_chan, 0))
+      return stbi__errpuc("too large", "PNM too large");
+
+   pixels = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, bytes_per_chan, 0);
+   if (pixels == NULL) return stbi__errpuc("outofmem", "Out of memory");
+   if (!stbi__getn(s, pixels, s->img_n * s->img_x * s->img_y * bytes_per_chan)) {
+      STBI_FREE(pixels);
+      return stbi__errpuc("bad PNM", "PNM file truncated");
+   }
+
+   if (req_comp == 0 || req_comp == s->img_n)
+      return pixels;   // caller wants the file's own channel count
+   // stbi__convert_format frees input on failure, so the converter result is returned as-is
+   if (ri->bits_per_channel == 16)
+      return stbi__convert_format16((stbi__uint16 *) pixels, s->img_n, req_comp, s->img_x, s->img_y);
+   return stbi__convert_format(pixels, s->img_n, req_comp, s->img_x, s->img_y);
+}
+
+static int      stbi__pnm_isspace(char c)
+{  // 1 for space, \t, \n, \v, \f and \r; 0 for every other character
+   switch (c) { case ' ': case '\t': case '\n': case '\v': case '\f': case '\r': return 1; default: return 0; }
+}
+
+// Advances *c past whitespace and '#' comments (each running up to '\n' or '\r'), stopping at EOF.
+static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
+{
+   for (;;) {
+      while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
+         *c = (char) stbi__get8(s);
+      if (stbi__at_eof(s) || *c != '#')
+         return;
+      // inside a comment: consume up to (not past) the line terminator, then loop to skip it as whitespace
+      while (!(stbi__at_eof(s) || *c == '\n' || *c == '\r'))
+         *c = (char) stbi__get8(s);
+   }
+}
+
+static int      stbi__pnm_isdigit(char c)
+{  // 1 when c lies in '0'..'9' inclusive, 0 otherwise
+   return !(c < '0' || c > '9');
+}
+
+// Parses the decimal digits starting at *c; bails out through stbi__err before the next digit could overflow.
+static int      stbi__pnm_getinteger(stbi__context *s, char *c)
+{
+   int acc = 0;
+   for (;;) {
+      if (stbi__at_eof(s) || !stbi__pnm_isdigit(*c))
+         return acc;
+      acc = acc*10 + (*c - '0');
+      *c = (char) stbi__get8(s);   // look ahead one character; 214748364 is INT_MAX/10 for a 32-bit int
+      if (acc > 214748364 || (acc == 214748364 && *c > '7'))
+          return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int");
+   }
+}
+
+// Parses a P5/P6 header: fills *x, *y, *comp and returns the bit depth (8 or 16); callers treat 0 as failure.
+static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int maxv, dummy;
+   char c, p, t;
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   stbi__rewind(s);
+
+   // Get identifier
+   p = (char) stbi__get8(s);
+   t = (char) stbi__get8(s);
+   if (p != 'P' || (t != '5' && t != '6')) {
+       stbi__rewind(s);
+       return 0;
+   }
+
+   *comp = (t == '6') ? 3 : 1;  // '5' is 1-component .pgm; '6' is 3-component .ppm
+
+   c = (char) stbi__get8(s);
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *x = stbi__pnm_getinteger(s, &c); // read width
+   if(*x == 0)
+       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *y = stbi__pnm_getinteger(s, &c); // read height
+   if (*y == 0)
+       return stbi__err("invalid height", "PPM image header had zero or overflowing height");
+   stbi__pnm_skip_whitespace(s, &c);
+
+   maxv = stbi__pnm_getinteger(s, &c);  // read max value
+   if (maxv > 65535)
+      return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images");
+   else if (maxv > 255)
+      return 16;
+   else
+      return 8;
+}
+
+// Returns 1 when stbi__pnm_info reports 16 bits per channel, 0 for any other result.
+static int stbi__pnm_is16(stbi__context *s)
+{
+   int bits = stbi__pnm_info(s, NULL, NULL, NULL);
+   return bits == 16;
+}
+#endif
+
+static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
+{  // Tries each compiled-in format's info probe in the order below; returns 1 on the first that succeeds.
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_info(s, x, y, comp)) return 1;
+   #endif
+
+   #ifndef STBI_NO_PNG
+   if (stbi__png_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_GIF
+   if (stbi__gif_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_BMP
+   if (stbi__bmp_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PIC
+   if (stbi__pic_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_info(s, x, y, comp))  return 1; // returns a bit depth (8/16) on success, used here only as non-zero
+   #endif
+
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_info(s, x, y, comp))  return 1;
+   #endif
+
+   // test tga last because it's a crappy test!
+   #ifndef STBI_NO_TGA
+   if (stbi__tga_info(s, x, y, comp))
+       return 1;
+   #endif
+   return stbi__err("unknown image type", "Image not of any known type, or corrupt"); // no probe matched
+}
+
+static int stbi__is_16_main(stbi__context *s)
+{  // Returns 1 if any compiled-in 16-bit probe (PNG, PSD, PNM, tried in that order) succeeds, else 0.
+   #ifndef STBI_NO_PNG
+   if (stbi__png_is16(s))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_is16(s))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_is16(s))  return 1;
+   #endif
+   return 0;
+}
+
+#ifndef STBI_NO_STDIO
+STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
+{  // convenience wrapper: open the file, probe its header via stbi_info_from_file, close it again
+    int ok;
+    FILE *fp = stbi__fopen(filename, "rb");
+    if (!fp) return stbi__err("can't fopen", "Unable to open file");
+    ok = stbi_info_from_file(fp, x, y, comp);
+    fclose(fp);
+    return ok;
+}
+
+STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
+{  // probes the image header from f's current position, then seeks f back to that position
+   stbi__context ctx;
+   int found;
+   long start = ftell(f);
+   stbi__start_file(&ctx, f);
+   found = stbi__info_main(&ctx, x, y, comp);
+   fseek(f, start, SEEK_SET);
+   return found;
+}
+
+STBIDEF int stbi_is_16_bit(char const *filename)
+{  // convenience wrapper: open the file, run stbi_is_16_bit_from_file on it, close it again
+    int is16;
+    FILE *fp = stbi__fopen(filename, "rb");
+    if (!fp) return stbi__err("can't fopen", "Unable to open file");
+    is16 = stbi_is_16_bit_from_file(fp);
+    fclose(fp);
+    return is16;
+}
+
+STBIDEF int stbi_is_16_bit_from_file(FILE *f)
+{  // runs the 16-bit probe from f's current position, then seeks f back to that position
+   stbi__context ctx;
+   int is16;
+   long start = ftell(f);
+   stbi__start_file(&ctx, f);
+   is16 = stbi__is_16_main(&ctx);
+   fseek(f, start, SEEK_SET);
+   return is16;
+}
+#endif // !STBI_NO_STDIO
+
+STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
+{  // header probe over the len bytes at buffer
+   stbi__context ctx;
+   stbi__start_mem(&ctx, buffer, len);
+   return stbi__info_main(&ctx, x, y, comp);
+}
+
+STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp)
+{  // header probe over a stream driven by the callbacks in c, with user passed through to them
+   stbi__context ctx;
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *) c, user);
+   return stbi__info_main(&ctx, x, y, comp);
+}
+
+STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len)
+{  // 16-bit probe over the len bytes at buffer
+   stbi__context ctx;
+   stbi__start_mem(&ctx, buffer, len);
+   return stbi__is_16_main(&ctx);
+}
+
+STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user)
+{  // 16-bit probe over a stream driven by the callbacks in c, with user passed through to them
+   stbi__context ctx;
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *) c, user);
+   return stbi__is_16_main(&ctx);
+}
+
+#endif // STB_IMAGE_IMPLEMENTATION
+
+/*
+   revision history:
+      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) change stbi__shiftsigned to avoid clang -O2 bug
+                         1-bit BMP
+                         *_is_16_bit api
+                         avoid warnings
+      2.16  (2017-07-23) all functions have 16-bit variants;
+                         STBI_NO_STDIO works again;
+                         compilation fixes;
+                         fix rounding in unpremultiply;
+                         optimize vertical flip;
+                         disable raw_len validation;
+                         documentation fixes
+      2.15  (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode;
+                         warning fixes; disable run-time SSE detection on gcc;
+                         uniform handling of optional "return" values;
+                         thread-safe initialization of zlib tables
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) allocate large structures on the stack
+                         remove white matting for transparent PSD
+                         fix reported channel count for PNG & BMP
+                         re-enable SSE2 in non-gcc 64-bit
+                         support RGB-formatted JPEG
+                         read 16-bit PNGs (only as 8-bit)
+      2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
+      2.09  (2016-01-16) allow comments in PNM files
+                         16-bit-per-pixel TGA (not bit-per-component)
+                         info() for TGA could break due to .hdr handling
+                         info() for BMP now shares code instead of sloppy parse
+                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
+                         code cleanup
+      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
+      2.07  (2015-09-13) fix compiler warnings
+                         partial animated GIF support
+                         limited 16-bpc PSD support
+                         #ifdef unused functions
+                         bug with < 92 byte PIC,PNM,HDR,TGA
+      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
+      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
+      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
+      2.03  (2015-04-12) extra corruption checking (mmozeiko)
+                         stbi_set_flip_vertically_on_load (nguillemot)
+                         fix NEON support; fix mingw support
+      2.02  (2015-01-19) fix incorrect assert, fix warning
+      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2
+      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
+      2.00  (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg)
+                         progressive JPEG (stb)
+                         PGM/PPM support (Ken Miller)
+                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
+                         GIF bugfix -- seemingly never worked
+                         STBI_NO_*, STBI_ONLY_*
+      1.48  (2014-12-14) fix incorrectly-named assert()
+      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb)
+                         optimize PNG (ryg)
+                         fix bug in interlaced PNG with user-specified channel count (stb)
+      1.46  (2014-08-26)
+              fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG
+      1.45  (2014-08-16)
+              fix MSVC-ARM internal compiler error by wrapping malloc
+      1.44  (2014-08-07)
+              various warning fixes from Ronny Chevalier
+      1.43  (2014-07-15)
+              fix MSVC-only compiler problem in code changed in 1.42
+      1.42  (2014-07-09)
+              don't define _CRT_SECURE_NO_WARNINGS (affects user code)
+              fixes to stbi__cleanup_jpeg path
+              added STBI_ASSERT to avoid requiring assert.h
+      1.41  (2014-06-25)
+              fix search&replace from 1.36 that messed up comments/error messages
+      1.40  (2014-06-22)
+              fix gcc struct-initialization warning
+      1.39  (2014-06-15)
+              fix to TGA optimization when req_comp != number of components in TGA;
+              fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
+              add support for BMP version 5 (more ignored fields)
+      1.38  (2014-06-06)
+              suppress MSVC warnings on integer casts truncating values
+              fix accidental rename of 'skip' field of I/O
+      1.37  (2014-06-04)
+              remove duplicate typedef
+      1.36  (2014-06-03)
+              convert to header file single-file library
+              if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
+      1.35  (2014-05-27)
+              various warnings
+              fix broken STBI_SIMD path
+              fix bug where stbi_load_from_file no longer left file pointer in correct place
+              fix broken non-easy path for 32-bit BMP (possibly never used)
+              TGA optimization by Arseny Kapoulkine
+      1.34  (unknown)
+              use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
+      1.33  (2011-07-14)
+              make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
+      1.32  (2011-07-13)
+              support for "info" function for all supported filetypes (SpartanJ)
+      1.31  (2011-06-20)
+              a few more leak fixes, bug in PNG handling (SpartanJ)
+      1.30  (2011-06-11)
+              added ability to load files via callbacks to accommodate custom input streams (Ben Wenger)
+              removed deprecated format-specific test/load functions
+              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
+              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
+              fix inefficiency in decoding 32-bit BMP (David Woo)
+      1.29  (2010-08-16)
+              various warning fixes from Aurelien Pocheville
+      1.28  (2010-08-01)
+              fix bug in GIF palette transparency (SpartanJ)
+      1.27  (2010-08-01)
+              cast-to-stbi_uc to fix warnings
+      1.26  (2010-07-24)
+              fix bug in file buffering for PNG reported by SpartanJ
+      1.25  (2010-07-17)
+              refix trans_data warning (Won Chun)
+      1.24  (2010-07-12)
+              perf improvements reading from files on platforms with lock-heavy fgetc()
+              minor perf improvements for jpeg
+              deprecated type-specific functions so we'll get feedback if they're needed
+              attempt to fix trans_data warning (Won Chun)
+      1.23    fixed bug in iPhone support
+      1.22  (2010-07-10)
+              removed image *writing* support
+              stbi_info support from Jetro Lauha
+              GIF support from Jean-Marc Lienher
+              iPhone PNG-extensions from James Brown
+              warning-fixes from Nicolas Schulz and Janez Zemva (i.e. Janez (U+017D)emva)
+      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
+      1.20    added support for Softimage PIC, by Tom Seddon
+      1.19    bug in interlaced PNG corruption check (found by ryg)
+      1.18  (2008-08-02)
+              fix a threading bug (local mutable static)
+      1.17    support interlaced PNG
+      1.16    major bugfix - stbi__convert_format converted one too many pixels
+      1.15    initialize some fields for thread safety
+      1.14    fix threadsafe conversion bug
+              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
+      1.13    threadsafe
+      1.12    const qualifiers in the API
+      1.11    Support installable IDCT, colorspace conversion routines
+      1.10    Fixes for 64-bit (don't use "unsigned long")
+              optimized upsampling by Fabian "ryg" Giesen
+      1.09    Fix format-conversion for PSD code (bad global variables!)
+      1.08    Thatcher Ulrich's PSD code integrated by Nicolas Schulz
+      1.07    attempt to fix C++ warning/errors again
+      1.06    attempt to fix C++ warning/errors again
+      1.05    fix TGA loading to return correct *comp and use good luminance calc
+      1.04    default float alpha is 1, not 255; use 'void *' for stbi_image_free
+      1.03    bugfixes to STBI_NO_STDIO, STBI_NO_HDR
+      1.02    support for (subset of) HDR files, float interface for preferred access to them
+      1.01    fix bug: possible bug in handling right-side up bmps... not sure
+              fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all
+      1.00    interface to zlib that skips zlib header
+      0.99    correct handling of alpha in palette
+      0.98    TGA loader by lonesock; dynamically add loaders (untested)
+      0.97    jpeg errors on too large a file; also catch another malloc failure
+      0.96    fix detection of invalid v value - particleman@mollyrocket forum
+      0.95    during header scan, seek to markers in case of padding
+      0.94    STBI_NO_STDIO to disable stdio usage; rename all #defines the same
+      0.93    handle jpegtran output; verbose errors
+      0.92    read 4,8,16,24,32-bit BMP files of several formats
+      0.91    output 24-bit Windows 3.0 BMP files
+      0.90    fix a few more warnings; bump version number to approach 1.0
+      0.61    bugfixes due to Marc LeBlanc, Christopher Lloyd
+      0.60    fix compiling as c++
+      0.59    fix warnings: merge Dave Moore's -Wall fixes
+      0.58    fix bug: zlib uncompressed mode len/nlen was wrong endian
+      0.57    fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available
+      0.56    fix bug: zlib uncompressed mode len vs. nlen
+      0.55    fix bug: restart_interval not initialized to 0
+      0.54    allow NULL for 'int *comp'
+      0.53    fix bug in png 3->4; speedup png decoding
+      0.52    png handles req_comp=3,4 directly; minor cleanup; jpeg comments
+      0.51    obey req_comp requests, 1-component jpegs return as 1-component,
+              on 'test' only check type, not whether we support this variant
+      0.50  (2006-11-19)
+              first released version
+*/
+
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_image_resize2.h b/lib/stb/stb_image_resize2.h
new file mode 100644
index 0000000..2f26274
--- /dev/null
+++ b/lib/stb/stb_image_resize2.h
@@ -0,0 +1,10601 @@
+/* stb_image_resize2 - v2.12 - public domain image resizing
+
+   by Jeff Roberts (v2) and Jorge L Rodriguez
+   http://github.com/nothings/stb
+
+   Can be threaded with the extended API. SSE2, AVX, Neon and WASM SIMD support. Only
+   scaling and translation is supported, no rotations or shears.
+
+   COMPILING & LINKING
+      In one C/C++ file that #includes this file, do this:
+         #define STB_IMAGE_RESIZE_IMPLEMENTATION
+      before the #include. That will create the implementation in that file.
+
+   EASY API CALLS:
+     Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation, clamps to edge.
+
+     stbir_resize_uint8_srgb( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                              output_pixels, output_w, output_h, output_stride_in_bytes,
+                              pixel_layout_enum )
+
+     stbir_resize_uint8_linear( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                                output_pixels, output_w, output_h, output_stride_in_bytes,
+                                pixel_layout_enum )
+
+     stbir_resize_float_linear( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                                output_pixels, output_w, output_h, output_stride_in_bytes,
+                                pixel_layout_enum )
+
+     If you pass NULL or zero for the output_pixels, we will allocate the output buffer
+     for you and return it from the function (free with free() or STBIR_FREE).
+     As a special case, XX_stride_in_bytes of 0 means packed contiguously in memory.
+
+   API LEVELS
+      There are three levels of API - easy-to-use, medium-complexity and extended-complexity.
+
+      See the "header file" section of the source for API documentation.
+
+   ADDITIONAL DOCUMENTATION
+
+      MEMORY ALLOCATION
+         By default, we use malloc and free for memory allocation.  To override the
+         memory allocation, before the implementation #include, add a:
+
+            #define STBIR_MALLOC(size,user_data) ...
+            #define STBIR_FREE(ptr,user_data)   ...
+
+         Each resize makes exactly one call to malloc/free (unless you use the
+         extended API where you can do one allocation for many resizes). Under
+         address sanitizer, we do separate allocations to find overread/writes.
+
+      PERFORMANCE
+         This library was written with an emphasis on performance. When testing
+         stb_image_resize with RGBA, the fastest mode is STBIR_4CHANNEL with
+         STBIR_TYPE_UINT8 pixels and CLAMPed edges (which is what many other resize
+         libs do by default). Also, make sure SIMD is turned on of course (default
+         for 64-bit targets). Avoid WRAP edge mode if you want the fastest speed.
+
+         This library also comes with profiling built-in. If you define STBIR_PROFILE,
+         you can use the advanced API and get low-level profiling information by
+         calling stbir_resize_extended_profile_info() or stbir_resize_split_profile_info()
+         after a resize.
+
+      SIMD
+         Most of the routines have optimized SSE2, AVX, NEON and WASM versions.
+
+         On Microsoft compilers, we automatically turn on SIMD for 64-bit x64 and
+         ARM; for 32-bit x86 and ARM, you select SIMD mode by defining STBIR_SSE2 or
+         STBIR_NEON. For AVX and AVX2, we auto-select it by detecting the /arch:AVX
+         or /arch:AVX2 switches. You can also always manually turn SSE2, AVX or AVX2
+         support on by defining STBIR_SSE2, STBIR_AVX or STBIR_AVX2.
+
+         On Linux, SSE2 and Neon are on by default for 64-bit x64 or ARM64. For 32-bit,
+         we select x86 SIMD mode by whether you have -msse2, -mavx or -mavx2 enabled
+         on the command line. For 32-bit ARM, you must pass -mfpu=neon-vfpv4 for both
+         clang and GCC, but GCC also requires an additional -mfp16-format=ieee to
+         automatically enable NEON.
+
+         On x86 platforms, you can also define STBIR_FP16C to turn on FP16C instructions
+         for converting back and forth to half-floats. This is autoselected when we
+         are using AVX2. Clang and GCC also require the -mf16c switch. ARM always uses
+         the built-in half float hardware NEON instructions.
+
+         You can also tell us to use multiply-add instructions with STBIR_USE_FMA.
+         Because x86 doesn't always have fma, we turn it off by default to maintain
+         determinism across all platforms. If you don't care about non-FMA determinism
+         and are willing to restrict yourself to more recent x86 CPUs (around the AVX
+         timeframe), then fma will give you around a 15% speedup.
+
+         You can force off SIMD in all cases by defining STBIR_NO_SIMD. You can turn
+         off AVX or AVX2 specifically with STBIR_NO_AVX or STBIR_NO_AVX2. AVX is 10%
+         to 40% faster, and AVX2 is generally another 12%.
+
+      ALPHA CHANNEL
+         Most of the resizing functions provide the ability to control how the alpha
+         channel of an image is processed.
+
+         When alpha represents transparency, it is important that when combining
+         colors with filtering, the pixels should not be treated equally; they
+         should use a weighted average based on their alpha values. For example,
+         if a pixel is 1% opaque bright green and another pixel is 99% opaque
+         black and you average them, the average will be 50% opaque, but the
+         unweighted average will be a middling green color, while the weighted
+         average will be nearly black. This means the unweighted version introduced
+         green energy that didn't exist in the source image.
+
+         (If you want to know why this makes sense, you can work out the math for
+         the following: consider what happens if you alpha composite a source image
+         over a fixed color and then average the output, vs. if you average the
+         source image pixels and then composite that over the same fixed color.
+         Only the weighted average produces the same result as the ground truth
+         composite-then-average result.)
+
+         Therefore, it is in general best to "alpha weight" the pixels when applying
+         filters to them. This essentially means multiplying the colors by the alpha
+         values before combining them, and then dividing by the alpha value at the
+         end.
+
+         The computer graphics industry introduced a technique called "premultiplied
+         alpha" or "associated alpha" in which image colors are stored in image files
+         already multiplied by their alpha. This saves some math when compositing,
+         and also avoids the need to divide by the alpha at the end (which is quite
+         inefficient). However, while premultiplied alpha is common in the movie CGI
+         industry, it is not commonplace in other industries like videogames, and most
+         consumer file formats are generally expected to contain not-premultiplied
+         colors. For example, Photoshop saves PNG files "unpremultiplied", and web
+         browsers like Chrome and Firefox expect PNG images to be unpremultiplied.
+
+         Note that there are three possibilities that might describe your image
+         and resize expectation:
+
+             1. images are not premultiplied, alpha weighting is desired
+             2. images are not premultiplied, alpha weighting is not desired
+             3. images are premultiplied
+
+         Both case #2 and case #3 require the exact same math: no alpha weighting
+         should be applied or removed. Only case 1 requires extra math operations;
+         the other two cases can be handled identically.
+
+         stb_image_resize expects case #1 by default, applying alpha weighting to
+         images, expecting the input images to be unpremultiplied. This is what the
+         COLOR+ALPHA buffer types tell the resizer to do.
+
+         When you use the pixel layouts STBIR_RGBA, STBIR_BGRA, STBIR_ARGB,
+         STBIR_ABGR, STBIR_RA, or STBIR_AR you are telling us that the pixels are
+         non-premultiplied. In these cases, the resizer will alpha weight the colors
+         (effectively creating the premultiplied image), do the filtering, and then
+         convert back to non-premult on exit.
+
+         When you use the pixel layouts STBIR_RGBA_PM, STBIR_BGRA_PM, STBIR_ARGB_PM,
+         STBIR_ABGR_PM, STBIR_RA_PM or STBIR_AR_PM, you are telling us that the pixels
+         ARE premultiplied. In this case, the resizer doesn't have to do the
+         premultiplying - it can filter directly on the input. This is about twice as
+         fast as the non-premultiplied case, so it's the right option if your data is
+         already setup correctly.
+
+         When you use the pixel layout STBIR_4CHANNEL or STBIR_2CHANNEL, you are
+         telling us that there is no channel that represents transparency; it may be
+         RGB and some unrelated fourth channel that has been stored in the alpha
+         channel, but it is actually not alpha. No special processing will be
+         performed.
+
+         The difference between the generic 4 or 2 channel layouts, and the
+         specialized _PM versions is with the _PM versions you are telling us that
+         the data *is* alpha, just don't premultiply it. That's important when
+         using SRGB pixel formats, we need to know where the alpha is, because
+         it is converted linearly (rather than with the SRGB converters).
+
+         Because alpha weighting produces the same effect as premultiplying, you
+         even have the option with non-premultiplied inputs to let the resizer
+         produce a premultiplied output. Because the initially computed alpha-weighted
+         output image is effectively premultiplied, this is actually more performant
+         than the normal path which un-premultiplies the output image as a final step.
+
+         Finally, when converting both in and out of non-premultiplied space (for
+         example, when using STBIR_RGBA), we go to somewhat heroic measures to
+         ensure that areas with zero alpha value pixels get something reasonable
+         in the RGB values. If you don't care about the RGB values of zero alpha
+         pixels, you can call the stbir_set_non_pm_alpha_speed_over_quality()
+         function - this runs a non-premultiplied resize about 25% faster. That said,
+         when you really care about speed, using premultiplied pixels for both in
+         and out (STBIR_RGBA_PM, etc) is much faster than both of these non-premultiplied
+         options.
+
+      PIXEL LAYOUT CONVERSION
+         The resizer can convert from some pixel layouts to others. When using the
+         stbir_set_pixel_layouts(), you can, for example, specify STBIR_RGBA
+         on input, and STBIR_ARGB on output, and it will re-organize the channels
+         during the resize. Currently, you can only convert between two pixel
+         layouts with the same number of channels.
+
+      DETERMINISM
+         We commit to being deterministic (from x64 to ARM to scalar to SIMD, etc).
+         This requires compiling with fast-math off (using at least /fp:precise).
+         Also, you must turn off fp-contracting (which turns mult+adds into fmas)!
+         We attempt to do this with pragmas, but with Clang, you usually want to add
+         -ffp-contract=off to the command line as well.
+
+         For 32-bit x86, you must use SSE and SSE2 codegen for determinism. That is,
+         if the scalar x87 unit gets used at all, we immediately lose determinism.
+         On Microsoft Visual Studio 2008 and earlier, from what we can tell there is
+         no way to be deterministic in 32-bit x86 (some x87 always leaks in, even
+         with fp:strict). On 32-bit x86 GCC, determinism requires both -msse2 and
+         -mfpmath=sse.
+
+         Note that we will not be deterministic with float data containing NaNs -
+         the NaNs will propagate differently on different SIMD and platforms.
+
+         If you turn on STBIR_USE_FMA, then we will be deterministic with other
+         fma targets, but we will differ from non-fma targets (this is unavoidable,
+         because a fma isn't simply an add with a mult - it also introduces a
+         rounding difference compared to non-fma instruction sequences).
+
+      FLOAT PIXEL FORMAT RANGE
+         Any range of values can be used for the non-alpha float data that you pass
+         in (0 to 1, -1 to 1, whatever). However, if you are inputting float values
+         but *outputting* bytes or shorts, you must use a range of 0 to 1 so that we
+         scale back properly. The alpha channel must also be 0 to 1 for any format
+         that does premultiplication prior to resizing.
+
+         Note also that with float output, using filters with negative lobes, the
+         output filtered values might go slightly out of range. You can define
+         STBIR_FLOAT_LOW_CLAMP and/or STBIR_FLOAT_HIGH_CLAMP to specify the range
+         to clamp to on output, if that's important.
+
+      MAX/MIN SCALE FACTORS
+         The input pixel resolutions are in integers, and we do the internal pointer
+         resolution in size_t sized integers. However, the scale ratio from input
+         resolution to output resolution is calculated in float form. This means
+         the effective possible scale ratio is limited to 24 bits (or 16 million
+         to 1). As you get close to the size of the float resolution (again, 16
+         million pixels wide or high), you might start seeing float inaccuracy
+         issues in general in the pipeline. If you have to do extreme resizes,
+         you can usually do this in multiple stages (using float intermediate
+         buffers).
+
+      FLIPPED IMAGES
+         Stride is just the delta from one scanline to the next. This means you can
+         use a negative stride to handle inverted images (point to the final
+         scanline and use a negative stride). You can invert the input or output,
+         using negative strides.
+
+      DEFAULT FILTERS
+         For functions which don't provide explicit control over what filters to
+         use, you can change the compile-time defaults with:
+
+            #define STBIR_DEFAULT_FILTER_UPSAMPLE     STBIR_FILTER_something
+            #define STBIR_DEFAULT_FILTER_DOWNSAMPLE   STBIR_FILTER_something
+
+         See stbir_filter in the header-file section for the list of filters.
+
+      NEW FILTERS
+         A number of 1D filter kernels are supplied. For a list of supported
+         filters, see the stbir_filter enum. You can install your own filters by
+         using the stbir_set_filter_callbacks function.
+
+      PROGRESS
+         For interactive use with slow resize operations, you can use the
+         scanline callbacks in the extended API. It would have to be a *very* large
+         image resample to need progress though - we're very fast.
+
+      CEIL and FLOOR
+         In scalar mode, the only functions we use from math.h are ceilf and floorf,
+         but if you have your own versions, you can define the STBIR_CEILF(v) and
+         STBIR_FLOORF(v) macros and we'll use them instead. In SIMD, we just use
+         our own versions.
+
+      ASSERT
+         Define STBIR_ASSERT(boolval) to override assert() and not use assert.h
+
+     PORTING FROM VERSION 1
+        The API has changed. You can continue to use the old version of stb_image_resize.h,
+        which is available in the "deprecated/" directory.
+
+        If you're using the old simple-to-use API, porting is straightforward.
+        (For more advanced APIs, read the documentation.)
+
+          stbir_resize_uint8():
+            - call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout`
+
+          stbir_resize_float():
+            - call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout`
+
+          stbir_resize_uint8_srgb():
+            - function name is unchanged
+            - cast channel count to `stbir_pixel_layout`
+            - above is sufficient unless your image has alpha and it's not RGBA/BGRA
+              - in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode
+
+          stbir_resize_uint8_srgb_edgemode()
+            - switch to the "medium complexity" API
+            - stbir_resize(), very similar API but a few more parameters:
+              - pixel_layout: cast channel count to `stbir_pixel_layout`
+              - data_type:    STBIR_TYPE_UINT8_SRGB
+              - edge:         unchanged (STBIR_EDGE_WRAP, etc.)
+              - filter:       STBIR_FILTER_DEFAULT
+            - which channel is alpha is specified in stbir_pixel_layout, see enum for details
+
+      FUTURE TODOS
+        *  For polyphase integral filters, we just memcpy the coeffs to dupe
+           them, but we should indirect and use the same coeff memory.
+        *  Add pixel layout conversions for sensible different channel counts
+           (maybe, 1->3/4, 3->4, 4->1, 3->1).
+         * For SIMD encode and decode scanline routines, do any pre-aligning
+           for bad input/output buffer alignments and pitch?
+         * For very wide scanlines, we should do vertical strips to stay within
+           L2 cache. Maybe do chunks of 1K pixels at a time. There would be
+           some pixel reconversion, but probably dwarfed by things falling out
+           of cache. Probably also something possible with alternating between
+           scattering and gathering at high resize scales?
+         * Rewrite the coefficient generator to do many at once.
+         * AVX-512 vertical kernels - worried about downclocking here.
+         * Convert the reincludes to macros when we know they aren't changing.
+         * Experiment with pivoting the horizontal and always using the
+           vertical filters (which are faster, but perhaps not enough to overcome
+           the pivot cost and the extra memory touches). Need to buffer the whole
+           image so have to balance memory use.
+         * Most of our code is internally function pointers, should we compile
+           all the SIMD stuff always and dynamically dispatch?
+
+   CONTRIBUTORS
+      Jeff Roberts: 2.0 implementation, optimizations, SIMD
+      Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer
+      Fabian Giesen: half float and srgb converters
+      Sean Barrett: API design, optimizations
+      Jorge L Rodriguez: Original 1.0 implementation
+      Aras Pranckevicius: bugfixes
+      Nathan Reed: warning fixes for 1.0
+
+   REVISIONS
+      2.12 (2024-10-18) fix incorrect use of user_data with STBIR_FREE
+      2.11 (2024-09-08) fix harmless asan warnings in 2-channel and 3-channel mode
+                          with AVX-2, fix some weird scaling edge conditions with
+                          point sample mode.
+      2.10 (2024-07-27) fix the defines GCC and mingw for loop unroll control,
+                          fix MSVC 32-bit arm half float routines.
+      2.09 (2024-06-19) fix the defines for 32-bit ARM GCC builds (was selecting
+                          hardware half floats).
+      2.08 (2024-06-10) fix for RGB->BGR three channel flips and add SIMD (thanks
+                          to Ryan Salsbury), fix for sub-rect resizes, use the
+                          pragmas to control unrolling when they are available.
+      2.07 (2024-05-24) fix for slow final split during threaded conversions of very 
+                          wide scanlines when downsampling (caused by extra input 
+                          converting), fix for wide scanline resamples with many 
+                          splits (int overflow), fix GCC warning.
+      2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling 
+                          undersampling a single row on rare resize ratios (about 1%).
+      2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras),
+                        fix for output callback (thanks Julien Koenen).
+      2.04 (2023-11-17) fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic).
+      2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor tweaks.
+      2.00 (2023-10-10) mostly new source: new api, optimizations, simd, vertical-first, etc
+                          2x-5x faster without simd, 4x-12x faster with simd,
+                          in some cases, 20x to 40x faster esp resizing large to very small.
+      0.96 (2019-03-04) fixed warnings
+      0.95 (2017-07-23) fixed warnings
+      0.94 (2017-03-18) fixed warnings
+      0.93 (2017-03-03) fixed bug with certain combinations of heights
+      0.92 (2017-01-02) fix integer overflow on large (>2GB) images
+      0.91 (2016-04-02) fix warnings; fix handling of subpixel regions
+      0.90 (2014-09-17) first released version
+
+   LICENSE
+     See end of file for license information.
+*/
+
+#if !defined(STB_IMAGE_RESIZE_DO_HORIZONTALS) && !defined(STB_IMAGE_RESIZE_DO_VERTICALS) && !defined(STB_IMAGE_RESIZE_DO_CODERS)   // for internal re-includes
+
+#ifndef STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
+#define STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
+
+#include <stddef.h>
+#ifdef _MSC_VER  // this branch declares the fixed-width types without including <stdint.h>
+typedef unsigned char    stbir_uint8;
+typedef unsigned short   stbir_uint16;
+typedef unsigned int     stbir_uint32;
+typedef unsigned __int64 stbir_uint64;
+#else  // all other compilers: alias the <stdint.h> fixed-width types
+#include <stdint.h>
+typedef uint8_t  stbir_uint8;
+typedef uint16_t stbir_uint16;
+typedef uint32_t stbir_uint32;
+typedef uint64_t stbir_uint64;
+#endif  // _MSC_VER
+
+#ifdef _M_IX86_FP
+#if ( _M_IX86_FP >= 1 )
+#ifndef STBIR_SSE
+#define STBIR_SSE
+#endif
+#endif
+#endif
+
+#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2)
+  #ifndef STBIR_SSE2
+    #define STBIR_SSE2
+  #endif
+  #if defined(__AVX__) || defined(STBIR_AVX2)  // requesting AVX2 also turns on AVX
+    #ifndef STBIR_AVX
+      #ifndef STBIR_NO_AVX  // user opt-out of AVX
+        #define STBIR_AVX
+      #endif
+    #endif
+  #endif
+  #if defined(__AVX2__) || defined(STBIR_AVX2)
+    #ifndef STBIR_NO_AVX2  // user opt-out of AVX2
+      #ifndef STBIR_AVX2
+        #define STBIR_AVX2
+      #endif
+      #if defined( _MSC_VER ) && !defined(__clang__)
+        #ifndef STBIR_FP16C  // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -mf16c
+          #define STBIR_FP16C
+        #endif
+      #endif
+    #endif
+  #endif
+  #ifdef __F16C__
+    #ifndef STBIR_FP16C  // turn on FP16C instructions if the define is set (for clang and gcc)
+      #define STBIR_FP16C
+    #endif
+  #endif
+#endif  // x86 SSE2 / AVX / AVX2 / FP16C selection
+
+#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__)
+#ifndef STBIR_NEON
+#define STBIR_NEON
+#endif
+#endif
+
+#if defined(_M_ARM) || defined(__arm__)
+#ifdef STBIR_USE_FMA
+#undef STBIR_USE_FMA // no FMA on 32-bit ARM (this test matches both _M_ARM and __arm__, not only MSVC)
+#endif
+#endif
+
+#if defined(__wasm__) && defined(__wasm_simd128__)
+#ifndef STBIR_WASM
+#define STBIR_WASM
+#endif
+#endif
+
+#ifndef STBIRDEF  // only supply a default when the includer has not already defined STBIRDEF
+#ifdef STB_IMAGE_RESIZE_STATIC
+#define STBIRDEF static
+#else
+#ifdef __cplusplus
+#define STBIRDEF extern "C"
+#else
+#define STBIRDEF extern
+#endif  // __cplusplus
+#endif  // STB_IMAGE_RESIZE_STATIC
+#endif  // STBIRDEF
+
+//////////////////////////////////////////////////////////////////////////////
+////   start "header file" ///////////////////////////////////////////////////
+//
+// Easy-to-use API:
+//
+//     * stride is the offset between successive rows of image data
+//        in memory, in bytes. specify 0 for packed contiguously in memory
+//     * colorspace is linear or sRGB as specified by function name
+//     * Uses the default filters
+//     * Uses edge mode clamped
+//     * returned result is 1 for success or 0 in case of an error.
+
+
+// stbir_pixel_layout specifies:
+//   number of channels
+//   order of channels
+//   whether color is premultiplied by alpha
+// for back compatibility, you can cast the old channel count to an stbir_pixel_layout
+typedef enum
+{
+  STBIR_1CHANNEL = 1,
+  STBIR_2CHANNEL = 2,               // generic 2-chan, no channel is treated as alpha (no special processing)
+  STBIR_RGB      = 3,               // 3-chan, with order specified (for channel flipping)
+  STBIR_BGR      = 0,               // 3-chan, with order specified (for channel flipping)
+  STBIR_4CHANNEL = 5,               // generic 4-chan, no channel is treated as alpha (no special processing)
+
+  STBIR_RGBA = 4,                   // alpha formats, where alpha is NOT premultiplied into color channels
+  STBIR_BGRA = 6,
+  STBIR_ARGB = 7,
+  STBIR_ABGR = 8,
+  STBIR_RA   = 9,
+  STBIR_AR   = 10,
+
+  STBIR_RGBA_PM = 11,               // alpha formats, where alpha is premultiplied into color channels
+  STBIR_BGRA_PM = 12,
+  STBIR_ARGB_PM = 13,
+  STBIR_ABGR_PM = 14,
+  STBIR_RA_PM   = 15,
+  STBIR_AR_PM   = 16,
+
+  STBIR_RGBA_NO_AW = 11,            // alpha formats, where NO alpha weighting is applied at all!
+  STBIR_BGRA_NO_AW = 12,            //   these are just synonyms for the _PM flags (which also do
+  STBIR_ARGB_NO_AW = 13,            //   no alpha weighting). These names just make it more clear
+  STBIR_ABGR_NO_AW = 14,            //   for some folks).
+  STBIR_RA_NO_AW   = 15,
+  STBIR_AR_NO_AW   = 16,
+
+} stbir_pixel_layout;
+
+//===============================================================
+//  Simple-complexity API
+//
+//    If output_pixels is NULL (0), then we will allocate the buffer and return it to you.
+//--------------------------------
+
+STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                        unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                        stbir_pixel_layout pixel_type );
+
+STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                          unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                          stbir_pixel_layout pixel_type );
+
+STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                  float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                  stbir_pixel_layout pixel_type );
+//===============================================================
+
+//===============================================================
+// Medium-complexity API
+//
+// This extends the easy-to-use API as follows:
+//
+//     * Can specify the datatype - U8, U8_SRGB, U16, FLOAT, HALF_FLOAT
+//     * Edge wrap can be selected explicitly
+//     * Filter can be selected explicitly
+//--------------------------------
+
+typedef enum
+{
+  STBIR_EDGE_CLAMP   = 0,  // the edge mode used by the easy-to-use API
+  STBIR_EDGE_REFLECT = 1,
+  STBIR_EDGE_WRAP    = 2,  // this edge mode is slower and uses more memory
+  STBIR_EDGE_ZERO    = 3,
+} stbir_edge;
+
+typedef enum
+{
+  STBIR_FILTER_DEFAULT      = 0,  // use same filter type that easy-to-use API chooses
+  STBIR_FILTER_BOX          = 1,  // A trapezoid w/1-pixel wide ramps, same result as box for integer scale ratios
+  STBIR_FILTER_TRIANGLE     = 2,  // On upsampling, produces same results as bilinear texture filtering
+  STBIR_FILTER_CUBICBSPLINE = 3,  // The cubic b-spline (aka Mitchell-Netravali with B=1,C=0), gaussian-esque
+  STBIR_FILTER_CATMULLROM   = 4,  // An interpolating cubic spline
+  STBIR_FILTER_MITCHELL     = 5,  // Mitchell-Netravali filter with B=1/3, C=1/3
+  STBIR_FILTER_POINT_SAMPLE = 6,  // Simple point sampling
+  STBIR_FILTER_OTHER        = 7,  // User callback specified
+} stbir_filter;
+
+typedef enum
+{
+  STBIR_TYPE_UINT8            = 0,
+  STBIR_TYPE_UINT8_SRGB       = 1,
+  STBIR_TYPE_UINT8_SRGB_ALPHA = 2,  // alpha channel, when present, should also be SRGB (this is very unusual)
+  STBIR_TYPE_UINT16           = 3,
+  STBIR_TYPE_FLOAT            = 4,
+  STBIR_TYPE_HALF_FLOAT       = 5
+} stbir_datatype;
+
+// medium api
+STBIRDEF void *  stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                     void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                               stbir_pixel_layout pixel_layout, stbir_datatype data_type,
+                               stbir_edge edge, stbir_filter filter );
+//===============================================================
+
+
+
+//===============================================================
+// Extended-complexity API
+//
+// This API exposes all resize functionality.
+//
+//     * Separate filter types for each axis
+//     * Separate edge modes for each axis
+//     * Separate input and output data types
+//     * Can specify regions with subpixel correctness
+//     * Can specify alpha flags
+//     * Can specify a memory callback
+//     * Can specify a callback data type for pixel input and output
+//     * Can be threaded for a single resize
+//     * Can be used to resize many frames without recalculating the sampler info
+//
+//  Use this API as follows:
+//     1) Call the stbir_resize_init function on a local STBIR_RESIZE structure
+//     2) Call any of the stbir_set functions
+//     3) Optionally call stbir_build_samplers() if you are going to resample multiple times
+//        with the same input and output dimensions (like resizing video frames)
+//     4) Resample by calling stbir_resize_extended().
+//     5) Call stbir_free_samplers() if you called stbir_build_samplers()
+//--------------------------------
+
+
+// Types:
+
+// INPUT CALLBACK: this callback is used for input scanlines
+typedef void const * stbir_input_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context );
+
+// OUTPUT CALLBACK: this callback is used for output scanlines
+typedef void stbir_output_callback( void const * output_ptr, int num_pixels, int y, void * context );
+
+// callbacks for user installed filters
+typedef float stbir__kernel_callback( float x, float scale, void * user_data ); // centered at zero
+typedef float stbir__support_callback( float scale, void * user_data );
+
+// internal structure with precomputed scaling
+typedef struct stbir__info stbir__info;
+
+typedef struct STBIR_RESIZE  // use stbir_resize_init and the stbir_set_* functions to set these values for future compatibility
+{
+  void * user_data;
+  void const * input_pixels;
+  int input_w, input_h;
+  double input_s0, input_t0, input_s1, input_t1;
+  stbir_input_callback * input_cb;
+  void * output_pixels;
+  int output_w, output_h;
+  int output_subx, output_suby, output_subw, output_subh;
+  stbir_output_callback * output_cb;
+  int input_stride_in_bytes;
+  int output_stride_in_bytes;
+  int splits;
+  int fast_alpha;
+  int needs_rebuild;
+  int called_alloc;
+  stbir_pixel_layout input_pixel_layout_public;
+  stbir_pixel_layout output_pixel_layout_public;
+  stbir_datatype input_data_type;
+  stbir_datatype output_data_type;
+  stbir_filter horizontal_filter, vertical_filter;
+  stbir_edge horizontal_edge, vertical_edge;
+  stbir__kernel_callback * horizontal_filter_kernel; stbir__support_callback * horizontal_filter_support;
+  stbir__kernel_callback * vertical_filter_kernel; stbir__support_callback * vertical_filter_support;
+  stbir__info * samplers;  // internal structure with precomputed scaling
+} STBIR_RESIZE;
+
+// extended complexity api
+
+
+// First off, you must ALWAYS call stbir_resize_init on your resize structure before any of the other calls!
+STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize,
+                                 const void *input_pixels,  int input_w,  int input_h, int input_stride_in_bytes, // stride can be zero
+                                       void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
+                                 stbir_pixel_layout pixel_layout, stbir_datatype data_type );
+
+//===============================================================
+// You can update these parameters any time after resize_init and there is no cost
+//--------------------------------
+
+STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type );
+STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb );   // no callbacks by default
+STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data );                                               // pass back STBIR_RESIZE* by default
+STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes );
+
+//===============================================================
+
+
+//===============================================================
+// If you call any of these functions, you will trigger a sampler rebuild!
+//--------------------------------
+
+STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout );  // sets new buffer layouts
+STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge );       // CLAMP by default
+
+STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ); // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
+STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support );
+
+STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh );        // sets both sub-regions (full regions by default)
+STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 );    // sets input sub-region (full region by default)
+STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ); // sets output sub-region (full region by default)
+
+// when inputting AND outputting non-premultiplied alpha pixels, we use a slower but higher quality technique
+//   that fills the zero alpha pixel's RGB values with something plausible.  If you don't care about areas of
+//   zero alpha, you can call this function to get about a 25% speed improvement for STBIR_RGBA to STBIR_RGBA
+//   types of resizes.
+STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality );
+//===============================================================
+
+
+//===============================================================
+// You can call build_samplers to prebuild all the internal data we need to resample.
+//   Then, if you call resize_extended many times with the same resize, you only pay the
+//   cost once.
+// If you do call build_samplers, you MUST call free_samplers eventually.
+//--------------------------------
+
+// This builds the samplers and does one allocation
+STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize );
+
+// You MUST call this, if you call stbir_build_samplers or stbir_build_samplers_with_splits
+STBIRDEF void stbir_free_samplers( STBIR_RESIZE * resize );
+//===============================================================
+
+
+// And this is the main function to perform the resize synchronously on one thread.
+STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize );
+
+
+//===============================================================
+// Use these functions for multithreading.
+//   1) You call stbir_build_samplers_with_splits first on the main thread
+//   2) Then stbir_resize_extended_split on each thread
+//   3) stbir_free_samplers when done on the main thread
+//--------------------------------
+
+// This will build samplers for threading.
+//   You can pass in the number of threads you'd like to use (try_splits).
+//   It returns the number of splits (threads) that you can then use with stbir_resize_extended_split.
+//   It might be less than try_splits if the image resize can't be split up that many ways.
+
+STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_splits );
+
+// This function does a split of the resizing (you call this function for each
+// split, on multiple threads). A split is a piece of the output resize pixel space.
+
+// Note that you MUST call stbir_build_samplers_with_splits before stbir_resize_extended_split!
+
+// Usually, you will always call stbir_resize_extended_split with split_start as the thread_index
+//   and "1" for the split_count.
+// But, if you have a weird situation where you MIGHT want 8 threads, but sometimes
+//   only 4 threads, you can use 0,2,4,6 for the split_start's and use "2" for the
+//   split_count each time to turn it into a 4 thread resize. (This is unusual).
+
+STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count );
+//===============================================================
+
+
+//===============================================================
+// Pixel Callbacks info:
+//--------------------------------
+
+//   The input callback is super flexible - it calls you with the input address
+//   (based on the stride and base pointer), it gives you an optional_output
+//   pointer that you can fill, or you can just return your own pointer into
+//   your own data.
+//
+//   You can also do conversion from non-supported data types if necessary - in
+//   this case, you ignore the input_ptr and just use the x and y parameters to
+//   calculate your own input_ptr based on the size of each non-supported pixel.
+//   (Something like the third example below.)
+//
+//   You can also install just an input or just an output callback by setting the
+//   callback that you don't want to zero.
+//
+//     First example, progress: (getting a callback that you can monitor the progress):
+//        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
+//        {
+//           percentage_done = y / input_height;
+//           return input_ptr;  // use buffer from call
+//        }
+//
+//     Next example, copying: (copy from some other buffer or stream):
+//        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
+//        {
+//           CopyOrStreamData( optional_output, other_data_src, num_pixels * pixel_width_in_bytes );
+//           return optional_output;  // return the optional buffer that we filled
+//        }
+//
+//     Third example, input another buffer without copying: (zero-copy from other buffer):
+//        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
+//        {
+//           void * pixels = ( (char*) other_image_base ) + ( y * other_image_stride ) + ( x * other_pixel_width_in_bytes );
+//           return pixels;       // return pointer to your data without copying
+//        }
+//
+//
+//   The output callback is considerably simpler - it just calls you so that you can dump
+//   out each scanline. You could even directly copy out to disk if you have a simple format
+//   like TGA or BMP. You can also convert to other output types here if you want.
+//
+//   Simple example:
+//        void my_output( void const * output_ptr, int num_pixels, int y, void * context )
+//        {
+//           percentage_done = y / output_height;
+//           fwrite( output_ptr, pixel_width_in_bytes, num_pixels, output_file );
+//        }
+//===============================================================
+
+
+
+
+//===============================================================
+// optional built-in profiling API
+//--------------------------------
+
+#ifdef STBIR_PROFILE
+
+typedef struct STBIR_PROFILE_INFO  // filled in by the stbir_resize_*_profile_info functions declared below
+{
+  stbir_uint64 total_clocks;   // overall total that the per-zone clocks[] entries are a breakdown of
+
+  // how many clocks spent (of total_clocks) in the various resize routines, along with a string description
+  //    there are "count" number of zones (see the count field below)
+  stbir_uint64 clocks[ 8 ];    // fixed capacity of 8; only the first "count" entries are described
+  char const ** descriptions;  // one string per zone, parallel to clocks[]
+
+  // count of clocks and descriptions
+  stbir_uint32 count;
+} STBIR_PROFILE_INFO;
+
+// use after calling stbir_resize_extended (or stbir_build_samplers or stbir_build_samplers_with_splits)
+STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );
+
+// use after calling stbir_resize_extended
+STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );
+
+// use after calling stbir_resize_extended_split
+STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize, int split_start, int split_num );
+
+//===============================================================
+
+#endif
+
+
+////   end header file   /////////////////////////////////////////////////////
+#endif // STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
+
+#if defined(STB_IMAGE_RESIZE_IMPLEMENTATION) || defined(STB_IMAGE_RESIZE2_IMPLEMENTATION)
+
+#ifndef STBIR_ASSERT
+#include <assert.h>
+#define STBIR_ASSERT(x) assert(x)
+#endif
+
+#ifndef STBIR_MALLOC
+#include <stdlib.h>
+#define STBIR_MALLOC(size,user_data) ((void)(user_data), malloc(size))
+#define STBIR_FREE(ptr,user_data)    ((void)(user_data), free(ptr))
+// (we used the comma operator to evaluate user_data, to avoid "unused parameter" warnings)
+#endif
+
+#ifdef _MSC_VER
+
+#define stbir__inline __forceinline
+
+#else
+
+#define stbir__inline __inline__
+
+// Clang address sanitizer
+#if defined(__has_feature)
+  #if __has_feature(address_sanitizer) || __has_feature(memory_sanitizer)
+    #ifndef STBIR__SEPARATE_ALLOCATIONS
+      #define STBIR__SEPARATE_ALLOCATIONS
+    #endif
+  #endif
+#endif
+
+#endif
+
+// GCC and MSVC
+#if defined(__SANITIZE_ADDRESS__)
+  #ifndef STBIR__SEPARATE_ALLOCATIONS
+    #define STBIR__SEPARATE_ALLOCATIONS
+  #endif
+#endif
+
+// Always turn off automatic FMA use - use STBIR_USE_FMA if you want.
+// Otherwise, this is a determinism disaster.
+#ifndef STBIR_DONT_CHANGE_FP_CONTRACT  // override in case you don't want this behavior
+#if defined(_MSC_VER) && !defined(__clang__)
+#if _MSC_VER > 1200
+#pragma fp_contract(off)
+#endif
+#elif defined(__GNUC__) &&  !defined(__clang__)
+#pragma GCC optimize("fp-contract=off")
+#else
+#pragma STDC FP_CONTRACT OFF
+#endif
+#endif
+
+#ifdef _MSC_VER
+#define STBIR__UNUSED(v)  (void)(v)
+#else
+#define STBIR__UNUSED(v)  (void)sizeof(v)
+#endif
+
+#define STBIR__ARRAY_SIZE(a) (sizeof((a))/sizeof((a)[0]))
+
+
+#ifndef STBIR_DEFAULT_FILTER_UPSAMPLE
+#define STBIR_DEFAULT_FILTER_UPSAMPLE    STBIR_FILTER_CATMULLROM
+#endif
+
+#ifndef STBIR_DEFAULT_FILTER_DOWNSAMPLE
+#define STBIR_DEFAULT_FILTER_DOWNSAMPLE  STBIR_FILTER_MITCHELL
+#endif
+
+
+#ifndef STBIR__HEADER_FILENAME
+#define STBIR__HEADER_FILENAME "stb_image_resize2.h"
+#endif
+
+// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
+//   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible
+typedef enum
+{
+  STBIRI_1CHANNEL = 0,
+  STBIRI_2CHANNEL = 1,
+  STBIRI_RGB      = 2,
+  STBIRI_BGR      = 3,
+  STBIRI_4CHANNEL = 4,
+
+  STBIRI_RGBA = 5,
+  STBIRI_BGRA = 6,
+  STBIRI_ARGB = 7,
+  STBIRI_ABGR = 8,
+  STBIRI_RA   = 9,
+  STBIRI_AR   = 10,
+
+  STBIRI_RGBA_PM = 11,
+  STBIRI_BGRA_PM = 12,
+  STBIRI_ARGB_PM = 13,
+  STBIRI_ABGR_PM = 14,
+  STBIRI_RA_PM   = 15,
+  STBIRI_AR_PM   = 16,
+} stbir_internal_pixel_layout;
+
+// define the public pixel layouts to not compile inside the implementation (to avoid accidental use)
+#define STBIR_BGR bad_dont_use_in_implementation
+#define STBIR_1CHANNEL STBIR_BGR
+#define STBIR_2CHANNEL STBIR_BGR
+#define STBIR_RGB STBIR_BGR
+#define STBIR_RGBA STBIR_BGR
+#define STBIR_4CHANNEL STBIR_BGR
+#define STBIR_BGRA STBIR_BGR
+#define STBIR_ARGB STBIR_BGR
+#define STBIR_ABGR STBIR_BGR
+#define STBIR_RA STBIR_BGR
+#define STBIR_AR STBIR_BGR
+#define STBIR_RGBA_PM STBIR_BGR
+#define STBIR_BGRA_PM STBIR_BGR
+#define STBIR_ARGB_PM STBIR_BGR
+#define STBIR_ABGR_PM STBIR_BGR
+#define STBIR_RA_PM STBIR_BGR
+#define STBIR_AR_PM STBIR_BGR
+
+// must match stbir_datatype
+static unsigned char stbir__type_size[] = {
+  1,1,1,2,4,2 // STBIR_TYPE_UINT8,STBIR_TYPE_UINT8_SRGB,STBIR_TYPE_UINT8_SRGB_ALPHA,STBIR_TYPE_UINT16,STBIR_TYPE_FLOAT,STBIR_TYPE_HALF_FLOAT
+};
+
+// When gathering, the contributors are which source pixels contribute.
+// When scattering, the contributors are which destination pixels are contributed to.
+typedef struct
+{
+  int n0; // First contributing pixel
+  int n1; // Last contributing pixel
+} stbir__contributors;
+
+typedef struct
+{
+  int lowest;    // First sample index for whole filter
+  int highest;   // Last sample index for whole filter
+  int widest;    // widest single set of samples for an output
+} stbir__filter_extent_info;
+
+typedef struct
+{
+  int n0; // First pixel of decode buffer to write to
+  int n1; // Last pixel of decode that will be written to
+  int pixel_offset_for_input;  // Pixel offset into input_scanline
+} stbir__span;
+
+typedef struct stbir__scale_info
+{
+  int input_full_size;
+  int output_sub_size;
+  float scale;
+  float inv_scale;
+  float pixel_shift; // starting shift in output pixel space (in pixels)
+  int scale_is_rational;
+  stbir_uint32 scale_numerator, scale_denominator;
+} stbir__scale_info;
+
+typedef struct
+{
+  stbir__contributors * contributors;
+  float* coefficients;
+  stbir__contributors * gather_prescatter_contributors;
+  float * gather_prescatter_coefficients;
+  stbir__scale_info scale_info;
+  float support;
+  stbir_filter filter_enum;
+  stbir__kernel_callback * filter_kernel;
+  stbir__support_callback * filter_support;
+  stbir_edge edge;
+  int coefficient_width;
+  int filter_pixel_width;
+  int filter_pixel_margin;
+  int num_contributors;
+  int contributors_size;
+  int coefficients_size;
+  stbir__filter_extent_info extent_info;
+  int is_gather;  // 0 = scatter, 1 = gather with scale >= 1, 2 = gather with scale < 1
+  int gather_prescatter_num_contributors;
+  int gather_prescatter_coefficient_width;
+  int gather_prescatter_contributors_size;
+  int gather_prescatter_coefficients_size;
+} stbir__sampler;
+
+typedef struct
+{
+  stbir__contributors conservative;
+  int edge_sizes[2];    // this can be less than filter_pixel_margin, if the filter and scaling falls off
+  stbir__span spans[2]; // can be two spans, if doing input subrect with clamp mode WRAP
+} stbir__extents;
+
+typedef struct  // working state for one split; stbir__info.split_info points at these
+{
+#ifdef STBIR_PROFILE
+  union
+  {
+    struct { stbir_uint64 total, looping, vertical, horizontal, decode, encode, alpha, unalpha; } named;
+    stbir_uint64 array[8];  // the same eight counters as "named", addressable by index
+  } profile;
+  stbir_uint64 * current_zone_excluded_ptr;
+#endif
+  float* decode_buffer;
+
+  int ring_buffer_first_scanline;
+  int ring_buffer_last_scanline;
+  int ring_buffer_begin_index;    // first_scanline is at this index in the ring buffer
+  int start_output_y, end_output_y;
+  int start_input_y, end_input_y;  // used in scatter only
+
+  #ifdef STBIR__SEPARATE_ALLOCATIONS
+    float** ring_buffers; // one pointer for each ring buffer
+  #else
+    float* ring_buffer;  // one big buffer that we index into
+  #endif
+
+  float* vertical_buffer;
+
+  char no_cache_straddle[64];  // NOTE(review): presumably padding so neighbouring splits do not share a cache line - confirm
+} stbir__per_split_info;
+
+typedef void stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input );
+typedef void stbir__alpha_weight_func( float * decode_buffer, int width_times_channels );
+typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer,
+  stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width );
+typedef void stbir__alpha_unweight_func(float * encode_buffer, int width_times_channels );
+typedef void stbir__encode_pixels_func( void * output, int width_times_channels, float const * encode );
+
+struct stbir__info
+{
+#ifdef STBIR_PROFILE
+  union
+  {
+    struct { stbir_uint64 total, build, alloc, horizontal, vertical, cleanup, pivot; } named;
+    stbir_uint64 array[7];
+  } profile;
+  stbir_uint64 * current_zone_excluded_ptr;
+#endif
+  stbir__sampler horizontal;
+  stbir__sampler vertical;
+
+  void const * input_data;
+  void * output_data;
+
+  int input_stride_bytes;
+  int output_stride_bytes;
+  int ring_buffer_length_bytes;   // The length of an individual entry in the ring buffer. The total number of ring buffers is stbir__get_filter_pixel_width(filter)
+  int ring_buffer_num_entries;    // Total number of entries in the ring buffer.
+
+  stbir_datatype input_type;
+  stbir_datatype output_type;
+
+  stbir_input_callback * in_pixels_cb;
+  void * user_data;
+  stbir_output_callback * out_pixels_cb;
+
+  stbir__extents scanline_extents;
+
+  void * alloced_mem;
+  stbir__per_split_info * split_info;  // by default 1, but there will be N of these allocated based on the thread init you did
+
+  stbir__decode_pixels_func * decode_pixels;
+  stbir__alpha_weight_func * alpha_weight;
+  stbir__horizontal_gather_channels_func * horizontal_gather_channels;
+  stbir__alpha_unweight_func * alpha_unweight;
+  stbir__encode_pixels_func * encode_pixels;
+
+  int alloc_ring_buffer_num_entries;    // Number of entries in the ring buffer that will be allocated
+  int splits; // count of splits
+
+  stbir_internal_pixel_layout input_pixel_layout_internal;
+  stbir_internal_pixel_layout output_pixel_layout_internal;
+
+  int input_color_and_type;
+  int offset_x, offset_y; // offset within output_data
+  int vertical_first;
+  int channels;
+  int effective_channels; // same as channels, except on RGBA/ARGB (7), or XA/AX (3)
+  size_t alloced_total;
+};
+
+
+#define stbir__max_uint8_as_float             255.0f
+#define stbir__max_uint16_as_float            65535.0f
+#define stbir__max_uint8_as_float_inverted    (1.0f/255.0f)
+#define stbir__max_uint16_as_float_inverted   (1.0f/65535.0f)
+#define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20))
+
+// min/max friendly: clamps x in place to [xmin,xmax]. x must be assignable and each argument is evaluated more than once, so pass side-effect-free expressions. The for(;;) ... break wrapper runs the body exactly once.
+#define STBIR_CLAMP(x, xmin, xmax) for(;;) { \
+  if ( (x) < (xmin) ) (x) = (xmin);     \
+  if ( (x) > (xmax) ) (x) = (xmax);     \
+  break;                                \
+}
+
+static stbir__inline int stbir__min(int a, int b)  // returns the smaller of the two ints
+{
+  if ( a < b ) return a; else return b;
+}
+
+static stbir__inline int stbir__max(int a, int b)  // returns the larger of the two ints
+{
+  if ( a > b ) return a; else return b;
+}
+
+static float stbir__srgb_uchar_to_linear_float[256] = {
+  0.000000f, 0.000304f, 0.000607f, 0.000911f, 0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f,
+  0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f, 0.006049f, 0.006512f, 0.006995f, 0.007499f,
+  0.008023f, 0.008568f, 0.009134f, 0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f, 0.014444f,
+  0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f, 0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f,
+  0.025187f, 0.026241f, 0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f, 0.035601f, 0.036889f,
+  0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f, 0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f,
+  0.054480f, 0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f, 0.068478f, 0.070360f, 0.072272f,
+  0.074214f, 0.076185f, 0.078187f, 0.080220f, 0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f,
+  0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f, 0.114435f, 0.116971f, 0.119538f, 0.122139f,
+  0.124772f, 0.127438f, 0.130136f, 0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f, 0.152926f,
+  0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f, 0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f,
+  0.191202f, 0.194618f, 0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f, 0.223228f, 0.226966f,
+  0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f, 0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f,
+  0.274677f, 0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f, 0.309469f, 0.313989f, 0.318547f,
+  0.323143f, 0.327778f, 0.332452f, 0.337164f, 0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f,
+  0.376262f, 0.381326f, 0.386430f, 0.391573f, 0.396755f, 0.401978f, 0.407240f, 0.412543f, 0.417885f, 0.423268f, 0.428691f,
+  0.434154f, 0.439657f, 0.445201f, 0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473532f, 0.479320f, 0.485150f, 0.491021f,
+  0.496933f, 0.502887f, 0.508881f, 0.514918f, 0.520996f, 0.527115f, 0.533276f, 0.539480f, 0.545725f, 0.552011f, 0.558340f,
+  0.564712f, 0.571125f, 0.577581f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f, 0.623960f, 0.630757f,
+  0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f, 0.672443f, 0.679543f, 0.686685f, 0.693872f, 0.701102f, 0.708376f,
+  0.715694f, 0.723055f, 0.730461f, 0.737911f, 0.745404f, 0.752942f, 0.760525f, 0.768151f, 0.775822f, 0.783538f, 0.791298f,
+  0.799103f, 0.806952f, 0.814847f, 0.822786f, 0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f,
+  0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f, 0.947307f, 0.955974f, 0.964686f, 0.973445f,
+  0.982251f, 0.991102f, 1.0f
+};
+
+typedef union  // one 32-bit value that can be accessed either as a float or as its unsigned integer view
+{
+  unsigned int u;  // integer view of the storage
+  float f;         // float view of the same storage
+} stbir__FP32;
+
+// From https://gist.github.com/rygorous/2203834
+
+static const stbir_uint32 fp32_to_srgb8_tab4[104] = {
+  0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
+  0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
+  0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
+  0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
+  0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
+  0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
+  0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
+  0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
+  0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
+  0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
+  0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
+  0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
+  0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
+};
+
+static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
+{
+  static const stbir__FP32 lower_limit = { (127-13) << 23 }; // 2^(-13)
+  static const stbir__FP32 upper_limit = { 0x3f7fffff };     // 1-eps
+  stbir__FP32 bits;
+  stbir_uint32 entry, offset, slope, frac;
+
+  // The usable input range is (2^(-13), 1-eps]: anything not above the lower
+  // limit yields 0 and anything above the upper limit yields 255. The lower
+  // test is a negated ">" on purpose, so that a NaN input also yields 0.
+  if (!(in > lower_limit.f))
+      return 0;
+  if (in > upper_limit.f)
+      return 255;
+
+  // Table index = bit-pattern distance above the lower limit, >> 20; entry packs bias (high 16 bits) and scale (low 16 bits)
+  bits.f = in;
+  entry = fp32_to_srgb8_tab4[(bits.u - lower_limit.u) >> 20];
+  slope = entry & 0xffff;
+  offset = (entry >> 16) << 9;
+
+  // Linear interpolation driven by the 8 mantissa bits just below the table index bits
+  frac = (bits.u >> 12) & 0xff;
+  return (unsigned char) ((offset + slope*frac) >> 16);
+}
+
+#ifndef STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT
+#define STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT 32 // when downsampling and <= 32 scanlines of buffering, use gather. gather used down to 1/8th scaling for 25% win.
+#endif
+
+#ifndef STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS
+#define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split?
+#endif
+
+// restrict pointers for the output pointers, other loop and unroll control
+#if defined( _MSC_VER ) && !defined(__clang__)
+  #define STBIR_STREAMOUT_PTR( star ) star __restrict
+  #define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop
+  #if _MSC_VER >= 1900
+    #define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector )) 
+  #else
+    #define STBIR_NO_UNROLL_LOOP_START 
+  #endif
+#elif defined( __clang__ )
+  #define STBIR_STREAMOUT_PTR( star ) star __restrict__
+  #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr)) 
+  #if ( __clang_major__ >= 4 ) || ( ( __clang_major__ >= 3 ) && ( __clang_minor__ >= 5 ) )
+    #define STBIR_NO_UNROLL_LOOP_START _Pragma("clang loop unroll(disable)") _Pragma("clang loop vectorize(disable)")
+  #else
+    #define STBIR_NO_UNROLL_LOOP_START
+  #endif 
+#elif defined( __GNUC__ )
+  #define STBIR_STREAMOUT_PTR( star ) star __restrict__
+  #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
+  #if __GNUC__ >= 14
+    #define STBIR_NO_UNROLL_LOOP_START _Pragma("GCC unroll 0") _Pragma("GCC novector")
+  #else
+    #define STBIR_NO_UNROLL_LOOP_START
+  #endif
+  #define STBIR_NO_UNROLL_LOOP_START_INF_FOR
+#else
+  #define STBIR_STREAMOUT_PTR( star ) star
+  #define STBIR_NO_UNROLL( ptr )
+  #define STBIR_NO_UNROLL_LOOP_START
+#endif
+
+#ifndef STBIR_NO_UNROLL_LOOP_START_INF_FOR
+#define STBIR_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START
+#endif
+
+#ifdef STBIR_NO_SIMD // force simd off for whatever reason
+
+// force simd off overrides everything else, so clear it all
+
+#ifdef STBIR_SSE2
+#undef STBIR_SSE2
+#endif
+
+#ifdef STBIR_AVX
+#undef STBIR_AVX
+#endif
+
+#ifdef STBIR_NEON
+#undef STBIR_NEON
+#endif
+
+#ifdef STBIR_AVX2
+#undef STBIR_AVX2
+#endif
+
+#ifdef STBIR_FP16C
+#undef STBIR_FP16C
+#endif
+
+#ifdef STBIR_WASM
+#undef STBIR_WASM
+#endif
+
+#ifdef STBIR_SIMD
+#undef STBIR_SIMD
+#endif
+
+#else // STBIR_SIMD
+
+#ifdef STBIR_SSE2
+  #include <emmintrin.h>
+
+  #define stbir__simdf __m128
+  #define stbir__simdi __m128i
+
+  #define stbir_simdi_castf( reg ) _mm_castps_si128(reg)
+  #define stbir_simdf_casti( reg ) _mm_castsi128_ps(reg)
+
+  #define stbir__simdf_load( reg, ptr ) (reg) = _mm_loadu_ps( (float const*)(ptr) )
+  #define stbir__simdi_load( reg, ptr ) (reg) = _mm_loadu_si128 ( (stbir__simdi const*)(ptr) )
+  #define stbir__simdf_load1( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) )  // top values can be random (not denormal or nan for perf)
+  #define stbir__simdi_load1( out, ptr ) (out) = _mm_castps_si128( _mm_load_ss( (float const*)(ptr) ))
+  #define stbir__simdf_load1z( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) )  // top values must be zero
+  #define stbir__simdf_frep4( fvar ) _mm_set_ps1( fvar )
+  #define stbir__simdf_load1frep4( out, fvar ) (out) = _mm_set_ps1( fvar )
+  #define stbir__simdf_load2( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdf_load2z( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values must be zero
+  #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = _mm_castpd_ps(_mm_loadh_pd( _mm_castps_pd(reg), (double*)(ptr) ))
+
+  #define stbir__simdf_zeroP() _mm_setzero_ps()
+  #define stbir__simdf_zero( reg ) (reg) = _mm_setzero_ps()
+
+  #define stbir__simdf_store( ptr, reg )  _mm_storeu_ps( (float*)(ptr), reg )
+  #define stbir__simdf_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), reg )
+  #define stbir__simdf_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), _mm_castps_si128(reg) )
+  #define stbir__simdf_store2h( ptr, reg ) _mm_storeh_pd( (double*)(ptr), _mm_castps_pd(reg) )
+
+  #define stbir__simdi_store( ptr, reg )  _mm_storeu_si128( (__m128i*)(ptr), reg )
+  #define stbir__simdi_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), _mm_castsi128_ps(reg) )
+  #define stbir__simdi_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), (reg) )
+
+  #define stbir__prefetch( ptr ) _mm_prefetch((char*)(ptr), _MM_HINT_T0 )
+
+  #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
+  { \
+    stbir__simdi zero = _mm_setzero_si128(); \
+    out2 = _mm_unpacklo_epi8( ireg, zero ); \
+    out3 = _mm_unpackhi_epi8( ireg, zero ); \
+    out0 = _mm_unpacklo_epi16( out2, zero ); \
+    out1 = _mm_unpackhi_epi16( out2, zero ); \
+    out2 = _mm_unpacklo_epi16( out3, zero ); \
+    out3 = _mm_unpackhi_epi16( out3, zero ); \
+  }
+
+#define stbir__simdi_expand_u8_to_1u32(out,ireg) \
+  { \
+    stbir__simdi zero = _mm_setzero_si128(); \
+    out = _mm_unpacklo_epi8( ireg, zero ); \
+    out = _mm_unpacklo_epi16( out, zero ); \
+  }
+
+  #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
+  { \
+    stbir__simdi zero = _mm_setzero_si128(); \
+    out0 = _mm_unpacklo_epi16( ireg, zero ); \
+    out1 = _mm_unpackhi_epi16( ireg, zero ); \
+  }
+
+  #define stbir__simdf_convert_float_to_i32( i, f ) (i) = _mm_cvttps_epi32(f)
+  #define stbir__simdf_convert_float_to_int( f ) _mm_cvtt_ss2si(f)
+  #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),_mm_setzero_ps()))))
+  #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps()))))
+
+  #define stbir__simdi_to_int( i ) _mm_cvtsi128_si32(i)
+  #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = _mm_cvtepi32_ps( ireg )
+  #define stbir__simdf_add( out, reg0, reg1 ) (out) = _mm_add_ps( reg0, reg1 )
+  #define stbir__simdf_mult( out, reg0, reg1 ) (out) = _mm_mul_ps( reg0, reg1 )
+  #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = _mm_mul_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
+  #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = _mm_mul_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
+  #define stbir__simdf_add_mem( out, reg, ptr ) (out) = _mm_add_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
+  #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = _mm_add_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
+  // multiply-add, out = add + mul1*mul2: fused _mm_fmadd_* when STBIR_USE_FMA is defined, otherwise a separate multiply followed by an add
+  #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd
+  #include <immintrin.h>
+  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_fmadd_ps( mul1, mul2, add )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_fmadd_ss( mul1, mul2, add )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ps( mul, _mm_loadu_ps( (float const*)(ptr) ), add )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ss( mul, _mm_load_ss( (float const*)(ptr) ), add )
+  #else
+  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_add_ps( add, _mm_mul_ps( mul1, mul2 ) )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_add_ss( add, _mm_mul_ss( mul1, mul2 ) )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_add_ps( add, _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ) )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_add_ss( add, _mm_mul_ss( mul, _mm_load_ss( (float const*)(ptr) ) ) )
+  #endif
+  // lowest-lane (*_ss) add and multiply, bitwise and/or on float registers, and 4-wide / lowest-lane min and max
+  #define stbir__simdf_add1( out, reg0, reg1 ) (out) = _mm_add_ss( reg0, reg1 )
+  #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = _mm_mul_ss( reg0, reg1 )
+
+  #define stbir__simdf_and( out, reg0, reg1 ) (out) = _mm_and_ps( reg0, reg1 )
+  #define stbir__simdf_or( out, reg0, reg1 ) (out) = _mm_or_ps( reg0, reg1 )
+
+  #define stbir__simdf_min( out, reg0, reg1 ) (out) = _mm_min_ps( reg0, reg1 )
+  #define stbir__simdf_max( out, reg0, reg1 ) (out) = _mm_max_ps( reg0, reg1 )
+  #define stbir__simdf_min1( out, reg0, reg1 ) (out) = _mm_min_ss( reg0, reg1 )
+  #define stbir__simdf_max1( out, reg0, reg1 ) (out) = _mm_max_ss( reg0, reg1 )
+  // two-register lane shuffles (reg0 holds lanes 0123, reg1 holds lanes ABCD): the results are {3,A,B,2} and {2,3,A,B}; an 'x' in a macro name marks a don't-care lane
+  #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (3<<0) + (0<<2) + (1<<4) + (2<<6) ) )
+  #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (2<<0) + (3<<2) + (0<<4) + (1<<6) ) )
+  // alpha/one mixes: aaa1 = {alp[3],alp[3],alp[3],ones[2]}, 1aaa = {ones[0],alp[0],alp[0],alp[0]}; a1a1 = {alp[1],1,alp[3],1} and 1a1a = {1,alp[0],1,alp[2]} ignore 'ones' and OR in the two constants below
+  static const stbir__simdf STBIR_zeroones = { 0.0f,1.0f,0.0f,1.0f };
+  static const stbir__simdf STBIR_onezeros = { 1.0f,0.0f,1.0f,0.0f };
+  #define stbir__simdf_aaa1( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movehl_ps( ones, alp ) ), (1<<0) + (1<<2) + (1<<4) + (2<<6) ) )
+  #define stbir__simdf_1aaa( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movelh_ps( ones, alp ) ), (0<<0) + (2<<2) + (2<<4) + (2<<6) ) )
+  #define stbir__simdf_a1a1( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_srli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_zeroones )
+  #define stbir__simdf_1a1a( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_slli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_onezeros )
+  // generic 4-lane permute: result lanes 0..3 take source lanes one, two, three, four (each selector must be a constant in 0..3)
+  #define stbir__simdf_swiz( reg, one, two, three, four ) _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( reg ), (one<<0) + (two<<2) + (three<<4) + (four<<6) ) )
+  // integer bitwise and/or, and _mm_madd_epi16 (multiply signed 16-bit lanes, add adjacent products into 32-bit lanes)
+  #define stbir__simdi_and( out, reg0, reg1 ) (out) = _mm_and_si128( reg0, reg1 )
+  #define stbir__simdi_or( out, reg0, reg1 ) (out) = _mm_or_si128( reg0, reg1 )
+  #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = _mm_madd_epi16( reg0, reg1 )
+  // clamp aa and bb to [0, STBIR_max_uint8_as_float], truncate to int, and narrow to bytes: the eight results (aa's four, then bb's four) land in the low 8 bytes of out and are repeated in the high 8 bytes
+  #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
+  { \
+    stbir__simdf af,bf; \
+    stbir__simdi a,b; \
+    af = _mm_min_ps( aa, STBIR_max_uint8_as_float ); \
+    bf = _mm_min_ps( bb, STBIR_max_uint8_as_float ); \
+    af = _mm_max_ps( af, _mm_setzero_ps() ); \
+    bf = _mm_max_ps( bf, _mm_setzero_ps() ); \
+    a = _mm_cvttps_epi32( af ); \
+    b = _mm_cvttps_epi32( bf ); \
+    a = _mm_packs_epi32( a, b ); \
+    out = _mm_packus_epi16( a, a ); \
+  }
+  // load four vectors from ptr, ptr+4, ptr+8, ptr+12 (assumes ptr is a float pointer - confirm at call sites) and transpose the 4x4 block in place; the four loads sit outside the braces, so this expands to several statements
+  #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
+      stbir__simdf_load( o0, (ptr) );    \
+      stbir__simdf_load( o1, (ptr)+4 );  \
+      stbir__simdf_load( o2, (ptr)+8 );  \
+      stbir__simdf_load( o3, (ptr)+12 ); \
+      {                                  \
+        __m128 tmp0, tmp1, tmp2, tmp3;   \
+        tmp0 = _mm_unpacklo_ps(o0, o1);  \
+        tmp2 = _mm_unpacklo_ps(o2, o3);  \
+        tmp1 = _mm_unpackhi_ps(o0, o1);  \
+        tmp3 = _mm_unpackhi_ps(o2, o3);  \
+        o0 = _mm_movelh_ps(tmp0, tmp2);  \
+        o1 = _mm_movehl_ps(tmp2, tmp0);  \
+        o2 = _mm_movelh_ps(tmp1, tmp3);  \
+        o3 = _mm_movehl_ps(tmp3, tmp1);  \
+      }
+  // narrow four int32 vectors to bytes (signed saturation to 16-bit, then unsigned saturation to 8-bit) and store 16 bytes at ptr ordered r0[0],r1[0],r2[0],r3[0],r0[1],...; r0..r3 are overwritten and the expansion is several statements
+  #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
+      r0 = _mm_packs_epi32( r0, r1 ); \
+      r2 = _mm_packs_epi32( r2, r3 ); \
+      r1 = _mm_unpacklo_epi16( r0, r2 ); \
+      r3 = _mm_unpackhi_epi16( r0, r2 ); \
+      r0 = _mm_unpacklo_epi16( r1, r3 ); \
+      r2 = _mm_unpackhi_epi16( r1, r3 ); \
+      r0 = _mm_packus_epi16( r0, r2 ); \
+      stbir__simdi_store( ptr, r0 );
+
+  #define stbir__simdi_32shr( out, reg, imm ) out = _mm_srli_epi32( reg, imm ) // logical right shift of every 32-bit lane by imm
+  // initializer fragments for 128-bit integer constants: STBIR__CONST_4_32i repeats v in all four 32-bit lanes, STBIR__CONST_4d_32i takes four distinct lane values (v0 lowest)
+  #if defined(_MSC_VER) && !defined(__clang__)
+    // msvc initializes the vector byte by byte (four chars per 32-bit value, low byte first)
+    #define STBIR__CONST_32_TO_8( v ) (char)(unsigned char)((v)&255),(char)(unsigned char)(((v)>>8)&255),(char)(unsigned char)(((v)>>16)&255),(char)(unsigned char)(((v)>>24)&255)
+    #define STBIR__CONST_4_32i( v ) STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v )
+    #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) STBIR__CONST_32_TO_8( v0 ), STBIR__CONST_32_TO_8( v1 ), STBIR__CONST_32_TO_8( v2 ), STBIR__CONST_32_TO_8( v3 )
+  #else
+    // everything else inits with long long's (two per vector, each holding two 32-bit values, the even-numbered one in the low half)
+    #define STBIR__CONST_4_32i( v ) (long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v))),(long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v)))
+    #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) (long long)((((stbir_uint64)(stbir_uint32)(v1))<<32)|((stbir_uint64)(stbir_uint32)(v0))),(long long)((((stbir_uint64)(stbir_uint32)(v3))<<32)|((stbir_uint64)(stbir_uint32)(v2)))
+  #endif
+  // declare a vector constant 'var' with x in every lane; STBIR__CONSTF / STBIR__CONSTI read such a constant back (plain identity on this path)
+  #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
+  #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { STBIR__CONST_4_32i(x) }
+  #define STBIR__CONSTF(var) (var)
+  #define STBIR__CONSTI(var) (var)
+  // stbir__simdf_pack_to_8words: clamp reg0/reg1 to [0, STBIR_max_uint16_as_float], truncate, and pack the eight results into unsigned 16-bit lanes of out (reg0's four first)
+  #if defined(STBIR_AVX) || defined(__SSE4_1__) // NOTE(review): the floorf/ceilf helpers further down also accept STBIR_SSE41 - confirm whether it should be honored here too
+    #include <smmintrin.h>
+    #define stbir__simdf_pack_to_8words(out,reg0,reg1) out = _mm_packus_epi32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())), _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())))
+  #else
+    STBIR__SIMDI_CONST(stbir__s32_32768, 32768);
+    STBIR__SIMDI_CONST(stbir__s16_32768, ((32768u<<16)|32768u)); // unsigned literals: 32768<<16 overflows a signed 32-bit int; the bit pattern (0x8000 in every 16-bit lane) is unchanged
+    // no unsigned 32->16 pack without SSE4.1: bias by -32768, pack with signed saturation, then remove the bias in each 16-bit lane
+    #define stbir__simdf_pack_to_8words(out,reg0,reg1) \
+      { \
+        stbir__simdi tmp0,tmp1; \
+        tmp0 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
+        tmp1 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
+        tmp0 = _mm_sub_epi32( tmp0, stbir__s32_32768 ); \
+        tmp1 = _mm_sub_epi32( tmp1, stbir__s32_32768 ); \
+        out = _mm_packs_epi32( tmp0, tmp1 ); \
+        out = _mm_sub_epi16( out, stbir__s16_32768 ); \
+      }
+
+  #endif
+  // STBIR_SIMD is defined (with no value) on this SIMD path
+  #define STBIR_SIMD
+
+  // if we detect AVX, set the simd8 defines
+  #ifdef STBIR_AVX
+    #include <immintrin.h>
+    #define STBIR_SIMD8
+    #define stbir__simdf8 __m256
+    #define stbir__simdi8 __m256i
+    #define stbir__simdf8_load( out, ptr ) (out) = _mm256_loadu_ps( (float const *)(ptr) )
+    #define stbir__simdi8_load( out, ptr ) (out) = _mm256_loadu_si256( (__m256i const *)(ptr) )
+    #define stbir__simdf8_mult( out, a, b ) (out) = _mm256_mul_ps( (a), (b) )
+    #define stbir__simdf8_store( ptr, out ) _mm256_storeu_ps( (float*)(ptr), out )
+    #define stbir__simdi8_store( ptr, reg )  _mm256_storeu_si256( (__m256i*)(ptr), reg )
+    #define stbir__simdf8_frep8( fval ) _mm256_set1_ps( fval )
+
+    #define stbir__simdf8_min( out, reg0, reg1 ) (out) = _mm256_min_ps( reg0, reg1 )
+    #define stbir__simdf8_max( out, reg0, reg1 ) (out) = _mm256_max_ps( reg0, reg1 )
+    // add4halves: 4-wide sum of bot4 and the upper 128 bits of top8; the *_mem forms use unaligned loads; load1b / load1rep4 broadcast one float to 8 / 4 lanes
+    #define stbir__simdf8_add4halves( out, bot4, top8 ) (out) = _mm_add_ps( bot4, _mm256_extractf128_ps( top8, 1 ) )
+    #define stbir__simdf8_mult_mem( out, reg, ptr ) (out) = _mm256_mul_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
+    #define stbir__simdf8_add_mem( out, reg, ptr ) (out) = _mm256_add_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
+    #define stbir__simdf8_add( out, a, b ) (out) = _mm256_add_ps( a, b )
+    #define stbir__simdf8_load1b( out, ptr ) (out) = _mm256_broadcast_ss( ptr )
+    #define stbir__simdf_load1rep4( out, ptr ) (out) = _mm_broadcast_ss( ptr )  // avx load instruction
+
+    #define stbir__simdi8_convert_i32_to_float(out, ireg) (out) = _mm256_cvtepi32_ps( ireg )
+    #define stbir__simdf8_convert_float_to_i32( i, f ) (i) = _mm256_cvttps_epi32(f)
+    // bot4s / top4s: combine the low (resp. high) 128-bit halves of a and b, with a's half in the low position
+    #define stbir__simdf8_bot4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (0<<0)+(2<<4) )
+    #define stbir__simdf8_top4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (1<<0)+(3<<4) )
+    // upper four floats of an 8-wide register
+    #define stbir__simdf8_gettop4( reg ) _mm256_extractf128_ps(reg,1)
+    // 8-wide widen/narrow helpers: the AVX2 branch uses 256-bit integer intrinsics, the #else branch builds the same helpers from 128-bit integer intrinsics
+    #ifdef STBIR_AVX2
+    // zero-extend the 16 bytes of ireg to 32-bit lanes: out0 receives bytes 0..7, out1 receives bytes 8..15
+    #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \
+    { \
+      stbir__simdi8 a, zero  =_mm256_setzero_si256();\
+      a = _mm256_permute4x64_epi64( _mm256_unpacklo_epi8( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), zero ),(0<<0)+(2<<2)+(1<<4)+(3<<6)); \
+      out0 = _mm256_unpacklo_epi16( a, zero ); \
+      out1 = _mm256_unpackhi_epi16( a, zero ); \
+    }
+    // clamp aa/bb to [0, STBIR_max_uint8_as_floatX], truncate, and narrow to 16 bytes in the 128-bit out (aa's eight, then bb's eight)
+    #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
+    { \
+      stbir__simdi8 t; \
+      stbir__simdf8 af,bf; \
+      stbir__simdi8 a,b; \
+      af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
+      bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
+      af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
+      bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
+      a = _mm256_cvttps_epi32( af ); \
+      b = _mm256_cvttps_epi32( bf ); \
+      t = _mm256_permute4x64_epi64( _mm256_packs_epi32( a, b ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \
+      out = _mm256_castsi256_si128( _mm256_permute4x64_epi64( _mm256_packus_epi16( t, t ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ) ); \
+    }
+    // zero-extend the eight 16-bit values of ireg to 32-bit lanes (note: this macro body already ends with its own semicolon)
+    #define stbir__simdi8_expand_u16_to_u32(out,ireg) out = _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), _mm256_setzero_si256() );
+    // clamp aa/bb to [0, STBIR_max_uint16_as_floatX], truncate, and pack to sixteen unsigned 16-bit lanes in out (aa's eight, then bb's eight)
+    #define stbir__simdf8_pack_to_16words(out,aa,bb) \
+      { \
+        stbir__simdf8 af,bf; \
+        stbir__simdi8 a,b; \
+        af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
+        bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
+        af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
+        bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
+        a = _mm256_cvttps_epi32( af ); \
+        b = _mm256_cvttps_epi32( bf ); \
+        (out) = _mm256_permute4x64_epi64( _mm256_packus_epi32(a, b), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \
+      }
+
+    #else // STBIR_AVX2 not defined: same helpers built from 128-bit integer operations
+    // zero-extend the 16 bytes of ireg to 32-bit lanes: out0 receives bytes 0..7, out1 receives bytes 8..15
+    #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \
+    { \
+      stbir__simdi a,zero = _mm_setzero_si128(); \
+      a = _mm_unpacklo_epi8( ireg, zero ); \
+      out0 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
+      a = _mm_unpackhi_epi8( ireg, zero ); \
+      out1 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
+    }
+    // clamp aa/bb to [0, STBIR_max_uint8_as_floatX], truncate, narrow each to eight bytes, then join them with a float-typed shuffle (aa's eight bytes first)
+    #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
+    { \
+      stbir__simdi t; \
+      stbir__simdf8 af,bf; \
+      stbir__simdi8 a,b; \
+      af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
+      bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
+      af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
+      bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
+      a = _mm256_cvttps_epi32( af ); \
+      b = _mm256_cvttps_epi32( bf ); \
+      out = _mm_packs_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \
+      out = _mm_packus_epi16( out, out ); \
+      t = _mm_packs_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \
+      t = _mm_packus_epi16( t, t ); \
+      out = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps(out), _mm_castsi128_ps(t), (0<<0)+(1<<2)+(0<<4)+(1<<6) ) ); \
+    }
+    // zero-extend the eight 16-bit values of ireg to 32-bit lanes: low four go to the low half of out, high four to the high half
+    #define stbir__simdi8_expand_u16_to_u32(out,ireg) \
+    { \
+      stbir__simdi a,b,zero = _mm_setzero_si128(); \
+      a = _mm_unpacklo_epi16( ireg, zero ); \
+      b = _mm_unpackhi_epi16( ireg, zero ); \
+      out = _mm256_insertf128_si256( _mm256_castsi128_si256( a ), b, 1 ); \
+    }
+    // clamp aa/bb to [0, STBIR_max_uint16_as_floatX], truncate, and pack each to eight unsigned 16-bit values: aa's fill the low half of out, bb's the high half
+    #define stbir__simdf8_pack_to_16words(out,aa,bb) \
+      { \
+        stbir__simdi t0,t1; \
+        stbir__simdf8 af,bf; \
+        stbir__simdi8 a,b; \
+        af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
+        bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
+        af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
+        bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
+        a = _mm256_cvttps_epi32( af ); \
+        b = _mm256_cvttps_epi32( bf ); \
+        t0 = _mm_packus_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \
+        t1 = _mm_packus_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \
+        out = _mm256_setr_m128i( t0, t1 ); \
+      }
+
+    #endif // STBIR_AVX2
+    // index-vector permutes (_mm256_permutevar_ps selects within each 128-bit half): here the low half replicates its element 0 and the high half its element 1
+    static __m256i stbir_00001111 = { STBIR__CONST_4d_32i( 0, 0, 0, 0 ), STBIR__CONST_4d_32i( 1, 1, 1, 1 ) };
+    #define stbir__simdf8_0123to00001111( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00001111 )
+    // same idea, replicating element 2 in the low half and element 3 in the high half
+    static __m256i stbir_22223333 = { STBIR__CONST_4d_32i( 2, 2, 2, 2 ), STBIR__CONST_4d_32i( 3, 3, 3, 3 ) };
+    #define stbir__simdf8_0123to22223333( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_22223333 )
+    // 4-wide result: element 2 of the low half of 'in' in every lane
+    #define stbir__simdf8_0123to2222( out, in ) (out) = stbir__simdf_swiz(_mm256_castps256_ps128(in), 2,2,2,2 )
+    // load 128 bits from ptr and repeat them in both halves
+    #define stbir__simdf8_load4b( out, ptr ) (out) = _mm256_broadcast_ps( (__m128 const *)(ptr) )
+    // low half becomes elements {0,0,1,1} and high half {2,2,3,3} of its own half; NOTE(review): add4 casts b with _mm256_castps128_ps256, whose upper four lanes are undefined - confirm callers only rely on the low four
+    static __m256i stbir_00112233 = { STBIR__CONST_4d_32i( 0, 0, 1, 1 ), STBIR__CONST_4d_32i( 2, 2, 3, 3 ) };
+    #define stbir__simdf8_0123to00112233( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00112233 )
+    #define stbir__simdf8_add4( out, a8, b ) (out) = _mm256_add_ps( a8,  _mm256_castps128_ps256( b ) )
+    // mask with the sign bit set in lanes 0..5 only: the masked load reads six floats from ptr and zeroes lanes 6 and 7
+    static __m256i stbir_load6 = { STBIR__CONST_4_32i( 0x80000000 ), STBIR__CONST_4d_32i(  0x80000000,  0x80000000, 0, 0 ) };
+    #define stbir__simdf8_load6z( out, ptr ) (out) = _mm256_maskload_ps( ptr, stbir_load6 )
+    // fixed permutes applied to each 128-bit half independently: the digits in each name give the source element for result lanes 0..3, repeated for the upper half
+    #define stbir__simdf8_0123to00000000( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(0<<4)+(0<<6) )
+    #define stbir__simdf8_0123to11111111( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(1<<4)+(1<<6) )
+    #define stbir__simdf8_0123to22222222( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (2<<0)+(2<<2)+(2<<4)+(2<<6) )
+    #define stbir__simdf8_0123to33333333( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(3<<2)+(3<<4)+(3<<6) )
+    #define stbir__simdf8_0123to21032103( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (2<<0)+(1<<2)+(0<<4)+(3<<6) )
+    #define stbir__simdf8_0123to32103210( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(2<<2)+(1<<4)+(0<<6) )
+    #define stbir__simdf8_0123to12301230( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(2<<2)+(3<<4)+(0<<6) )
+    #define stbir__simdf8_0123to10321032( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(0<<2)+(3<<4)+(2<<6) )
+    #define stbir__simdf8_0123to30123012( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(0<<2)+(1<<4)+(2<<6) )
+
+    #define stbir__simdf8_0123to11331133( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(3<<4)+(3<<6) )
+    #define stbir__simdf8_0123to00220022( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(2<<4)+(2<<6) )
+    // per-half alpha/one mixes (blend alp with ones, then permute), e.g. aaa1 = {alp[3],alp[3],alp[3],ones[0]}; each expands to TWO statements, so brace any use under an unbraced if/else
+    #define stbir__simdf8_aaa1( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(1<<1)+(1<<2)+(0<<3)+(1<<4)+(1<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (3<<0) + (3<<2) + (3<<4) + (0<<6) )
+    #define stbir__simdf8_1aaa( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(1<<2)+(1<<3)+(0<<4)+(1<<5)+(1<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (0<<4) + (0<<6) )
+    #define stbir__simdf8_a1a1( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(0<<1)+(1<<2)+(0<<3)+(1<<4)+(0<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) )
+    #define stbir__simdf8_1a1a( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(0<<2)+(1<<3)+(0<<4)+(1<<5)+(0<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) )
+
+    #define stbir__simdf8_zero( reg ) (reg) = _mm256_setzero_ps()
+    // 8-wide multiply-add, out = add + mul1*mul2 (fused only with STBIR_USE_FMA); madd_mem4 multiplies a 4-wide 'mul' by four floats at ptr, adds the product into the low four lanes of add, and adds zero to the upper four
+    #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd
+    #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_fmadd_ps( mul1, mul2, add )
+    #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ), add )
+    #define stbir__simdf8_madd_mem4( out, add, mul, ptr )(out) = _mm256_fmadd_ps( _mm256_setr_m128( mul, _mm_setzero_ps() ), _mm256_setr_m128( _mm_loadu_ps( (float const*)(ptr) ), _mm_setzero_ps() ), add )
+    #else
+    #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul1, mul2 ) )
+    #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ) ) )
+    #define stbir__simdf8_madd_mem4( out, add, mul, ptr )  (out) = _mm256_add_ps( add, _mm256_setr_m128( _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ), _mm_setzero_ps() ) )
+    #endif
+    #define stbir__if_simdf8_cast_to_simdf4( val ) _mm256_castps256_ps128( val ) // low four lanes of an 8-wide register
+
+  #endif // STBIR_AVX
+  // floor for this SIMD path: any earlier STBIR_FLOORF definition is replaced by stbir_simd_floorf
+  #ifdef STBIR_FLOORF
+  #undef STBIR_FLOORF
+  #endif
+  #define STBIR_FLOORF stbir_simd_floorf
+  static stbir__inline float stbir_simd_floorf(float x)  // martins floorf
+  {
+    __m128 value = _mm_set_ss(x);  // x in lane 0, remaining lanes zero
+    #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
+    return _mm_cvtss_f32( _mm_floor_ss(value, value) );
+    #else
+    __m128 whole = _mm_cvtepi32_ps(_mm_cvttps_epi32(value));  // truncate toward zero via int32
+    __m128 mask = _mm_cmplt_ss(value, whole);  // set when truncation landed above x
+    __m128 step = _mm_and_ps(mask, _mm_set_ss(-1.0f));  // -1.0f in that case, else 0
+    return _mm_cvtss_f32(_mm_add_ss(whole, step));
+    #endif
+  }
+  // ceil for this SIMD path: any earlier STBIR_CEILF definition is replaced by stbir_simd_ceilf
+  #ifdef STBIR_CEILF
+  #undef STBIR_CEILF
+  #endif
+  #define STBIR_CEILF stbir_simd_ceilf
+  static stbir__inline float stbir_simd_ceilf(float x)  // martins ceilf
+  {
+    __m128 value = _mm_set_ss(x);  // x in lane 0, remaining lanes zero
+    #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
+    return _mm_cvtss_f32( _mm_ceil_ss(value, value) );
+    #else
+    __m128 whole = _mm_cvtepi32_ps(_mm_cvttps_epi32(value));  // truncate toward zero via int32
+    __m128 mask = _mm_cmplt_ss(whole, value);  // set when truncation landed below x
+    __m128 step = _mm_and_ps(mask, _mm_set_ss(1.0f));  // 1.0f in that case, else 0
+    return _mm_cvtss_f32(_mm_add_ss(whole, step));
+    #endif
+  }
+
+#elif defined(STBIR_NEON)
+
+  #include <arm_neon.h>
+  // 4-wide register types: floats as float32x4_t, integers carried as uint32x4_t
+  #define stbir__simdf float32x4_t
+  #define stbir__simdi uint32x4_t
+  // bit-pattern reinterpretation between the float and integer register types
+  #define stbir_simdi_castf( reg ) vreinterpretq_u32_f32(reg)
+  #define stbir_simdf_casti( reg ) vreinterpretq_f32_u32(reg)
+  // loads; load1 replicates the value to every lane, and load2 / load2z are identical here (both zero the upper two lanes)
+  #define stbir__simdf_load( reg, ptr ) (reg) = vld1q_f32( (float const*)(ptr) )
+  #define stbir__simdi_load( reg, ptr ) (reg) = vld1q_u32( (uint32_t const*)(ptr) )
+  #define stbir__simdf_load1( out, ptr ) (out) = vld1q_dup_f32( (float const*)(ptr) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdi_load1( out, ptr ) (out) = vld1q_dup_u32( (uint32_t const*)(ptr) )
+  #define stbir__simdf_load1z( out, ptr ) (out) = vld1q_lane_f32( (float const*)(ptr), vdupq_n_f32(0), 0 ) // top values must be zero
+  #define stbir__simdf_frep4( fvar ) vdupq_n_f32( fvar )
+  #define stbir__simdf_load1frep4( out, fvar ) (out) = vdupq_n_f32( fvar )
+  #define stbir__simdf_load2( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdf_load2z( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) )  // top values must be zero
+  #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = vcombine_f32( vget_low_f32(reg), vld1_f32( (float const*)(ptr) ) )
+  // zero constants and stores: store1 writes lane 0, store2 the low two lanes, store2h the high two lanes
+  #define stbir__simdf_zeroP() vdupq_n_f32(0)
+  #define stbir__simdf_zero( reg ) (reg) = vdupq_n_f32(0)
+
+  #define stbir__simdf_store( ptr, reg )  vst1q_f32( (float*)(ptr), reg )
+  #define stbir__simdf_store1( ptr, reg ) vst1q_lane_f32( (float*)(ptr), reg, 0)
+  #define stbir__simdf_store2( ptr, reg ) vst1_f32( (float*)(ptr), vget_low_f32(reg) )
+  #define stbir__simdf_store2h( ptr, reg ) vst1_f32( (float*)(ptr), vget_high_f32(reg) )
+
+  #define stbir__simdi_store( ptr, reg )  vst1q_u32( (uint32_t*)(ptr), reg )
+  #define stbir__simdi_store1( ptr, reg ) vst1q_lane_u32( (uint32_t*)(ptr), reg, 0 )
+  #define stbir__simdi_store2( ptr, reg ) vst1_u32( (uint32_t*)(ptr), vget_low_u32(reg) )
+  // prefetch expands to nothing on this path
+  #define stbir__prefetch( ptr )
+  // zero-extend the 16 bytes of ireg to 32-bit lanes: out0..out3 receive bytes 0..3, 4..7, 8..11, 12..15
+  #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
+  { \
+    uint16x8_t l = vmovl_u8( vget_low_u8 ( vreinterpretq_u8_u32(ireg) ) ); \
+    uint16x8_t h = vmovl_u8( vget_high_u8( vreinterpretq_u8_u32(ireg) ) ); \
+    out0 = vmovl_u16( vget_low_u16 ( l ) ); \
+    out1 = vmovl_u16( vget_high_u16( l ) ); \
+    out2 = vmovl_u16( vget_low_u16 ( h ) ); \
+    out3 = vmovl_u16( vget_high_u16( h ) ); \
+  }
+  // zero-extend only the first four bytes of ireg to 32-bit lanes
+  #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
+  { \
+    uint16x8_t tmp = vmovl_u8( vget_low_u8( vreinterpretq_u8_u32(ireg) ) ); \
+    out = vmovl_u16( vget_low_u16( tmp ) ); \
+  }
+  // zero-extend the eight 16-bit values of ireg: out0 receives values 0..3, out1 values 4..7
+  #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
+  { \
+    uint16x8_t tmp = vreinterpretq_u16_u32(ireg); \
+    out0 = vmovl_u16( vget_low_u16 ( tmp ) ); \
+    out1 = vmovl_u16( vget_high_u16( tmp ) ); \
+  }
+  // float<->int conversions (uint8/short forms clamp to [0, max] first and return lane 0) and float add/multiply; the "1" _mem forms here load with vld1q_dup and operate on all four lanes
+  #define stbir__simdf_convert_float_to_i32( i, f ) (i) = vreinterpretq_u32_s32( vcvtq_s32_f32(f) )
+  #define stbir__simdf_convert_float_to_int( f ) vgetq_lane_s32(vcvtq_s32_f32(f), 0)
+  #define stbir__simdi_to_int( i ) (int)vgetq_lane_u32(i, 0)
+  #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),vdupq_n_f32(0))), 0))
+  #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),vdupq_n_f32(0))), 0))
+  #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = vcvtq_f32_s32( vreinterpretq_s32_u32(ireg) )
+  #define stbir__simdf_add( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
+  #define stbir__simdf_mult( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
+  #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
+  #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
+  #define stbir__simdf_add_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
+  #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
+  // multiply-add, out = add + mul1*mul2: fused vfmaq_f32 when STBIR_USE_FMA is defined, else multiply then add; madd1 is the same full-width operation as madd on this path
+  #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd (and also x64 no madd to arm madd)
+  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_f32( (float const*)(ptr) ) )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_dup_f32( (float const*)(ptr) ) )
+  #else
+  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_f32( (float const*)(ptr) ) ) )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_dup_f32( (float const*)(ptr) ) ) )
+  #endif
+  // the "1" add/mult/min/max variants are full 4-wide operations on this path; and/or go through a u32 reinterpretation
+  #define stbir__simdf_add1( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
+  #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
+
+  #define stbir__simdf_and( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
+  #define stbir__simdf_or( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vorrq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
+
+  #define stbir__simdf_min( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
+  #define stbir__simdf_max( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
+  #define stbir__simdf_min1( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
+  #define stbir__simdf_max1( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
+  // two-register extracts (reg0 holds lanes 0123, reg1 holds lanes ABCD): the results are {3,A,B,C} and {2,3,A,B}
+  #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 3 )
+  #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 2 )
+  // a1a1 = {alp[1],ones[0],alp[3],ones[1]} and 1a1a = {ones[0],alp[0],ones[1],alp[2]}, built by de-interleaving alp and zipping it with ones
+  #define stbir__simdf_a1a1( out, alp, ones ) (out) = vzipq_f32(vuzpq_f32(alp, alp).val[1], ones).val[0]
+  #define stbir__simdf_1a1a( out, alp, ones ) (out) = vzipq_f32(ones, vuzpq_f32(alp, alp).val[0]).val[0]
+  // 64-bit ARM: alpha mixes via lane copy, swizzles via byte table lookups (vqtbl1q / vqtbl2q)
+  #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
+
+    #define stbir__simdf_aaa1( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3, ones, 3)
+    #define stbir__simdf_1aaa( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0, ones, 0)
+    // stbir_make16 turns four 32-bit lane selectors into 16 byte indices (selector n -> bytes 4n..4n+3); arguments are not parenthesized, so pass plain constants; stbir_make16x2 bundles two registers as one 32-byte table
+    #if defined( _MSC_VER ) && !defined(__clang__)
+      #define stbir_make16(a,b,c,d) vcombine_u8( \
+        vcreate_u8( (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
+          ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56)), \
+        vcreate_u8( (4*c+0) | ((4*c+1)<<8) | ((4*c+2)<<16) | ((4*c+3)<<24) | \
+          ((stbir_uint64)(4*d+0)<<32) | ((stbir_uint64)(4*d+1)<<40) | ((stbir_uint64)(4*d+2)<<48) | ((stbir_uint64)(4*d+3)<<56) ) )
+
+      static stbir__inline uint8x16x2_t stbir_make16x2(float32x4_t rega,float32x4_t regb)
+      {
+        uint8x16x2_t r = { vreinterpretq_u8_f32(rega), vreinterpretq_u8_f32(regb) };
+        return r;
+      }
+    #else
+      #define stbir_make16(a,b,c,d) (uint8x16_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3,4*c+0,4*c+1,4*c+2,4*c+3,4*d+0,4*d+1,4*d+2,4*d+3}
+      #define stbir_make16x2(a,b) (uint8x16x2_t){{vreinterpretq_u8_f32(a),vreinterpretq_u8_f32(b)}}
+    #endif
+    // swiz permutes the lanes of one register; swiz2 selects from two registers (selectors 0..3 pick from rega, 4..7 from regb)
+    #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vqtbl1q_u8( vreinterpretq_u8_f32(reg), stbir_make16(one, two, three, four) ) )
+    #define stbir__simdf_swiz2( rega, regb, one, two, three, four ) vreinterpretq_f32_u8( vqtbl2q_u8( stbir_make16x2(rega,regb), stbir_make16(one, two, three, four) ) )
+    // multiply signed 16-bit lanes and add adjacent products into 32-bit lanes (widening multiplies followed by a pairwise add)
+    #define stbir__simdi_16madd( out, reg0, reg1 ) \
+    { \
+      int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
+      int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
+      int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \
+      int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \
+      (out) = vreinterpretq_u32_s32( vpaddq_s32(tmp0, tmp1) ); \
+    }
+
+  #else // not 64-bit ARM
+    // aaa1 / 1aaa insert a literal 1.0f here; the 'ones' argument is unused on this path
+    #define stbir__simdf_aaa1( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3)
+    #define stbir__simdf_1aaa( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0)
+    // stbir_make8x2 splits a register into two 8-byte table halves; stbir_make8 turns two lane selectors into 8 byte indices (selector n -> bytes 4n..4n+3, arguments not parenthesized)
+    #if defined( _MSC_VER ) && !defined(__clang__)
+      static stbir__inline uint8x8x2_t stbir_make8x2(float32x4_t reg)
+      {
+        uint8x8x2_t r = { { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } };
+        return r;
+      }
+      #define stbir_make8(a,b) vcreate_u8( \
+        (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
+        ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56) )
+    #else
+      #define stbir_make8x2(reg) (uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } }
+      #define stbir_make8(a,b) (uint8x8_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3}
+    #endif
+    // 4-lane permute done as two 8-byte table lookups (lanes one,two then three,four) that are recombined
+    #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vcombine_u8( \
+        vtbl2_u8( stbir_make8x2( reg ), stbir_make8( one, two ) ), \
+        vtbl2_u8( stbir_make8x2( reg ), stbir_make8( three, four ) ) ) )
+    // multiply signed 16-bit lanes and add adjacent products into 32-bit lanes, using 64-bit pairwise adds on each half
+    #define stbir__simdi_16madd( out, reg0, reg1 ) \
+    { \
+      int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
+      int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
+      int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \
+      int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \
+      int32x2_t out0 = vpadd_s32( vget_low_s32(tmp0), vget_high_s32(tmp0) ); \
+      int32x2_t out1 = vpadd_s32( vget_low_s32(tmp1), vget_high_s32(tmp1) ); \
+      (out) = vreinterpretq_u32_s32( vcombine_s32(out0, out1) ); \
+    }
+
+  #endif
+  // integer bitwise and/or
+  #define stbir__simdi_and( out, reg0, reg1 ) (out) = vandq_u32( reg0, reg1 )
+  #define stbir__simdi_or( out, reg0, reg1 ) (out) = vorrq_u32( reg0, reg1 )
+  // clamp aa/bb to [0, STBIR_max_uint8_as_float], convert to int, and narrow to eight bytes (aa's four, then bb's four) that fill both halves of out
+  #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
+  { \
+    float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
+    float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
+    int16x4_t ai = vqmovn_s32( vcvtq_s32_f32( af ) ); \
+    int16x4_t bi = vqmovn_s32( vcvtq_s32_f32( bf ) ); \
+    uint8x8_t out8 = vqmovun_s16( vcombine_s16(ai, bi) ); \
+    out = vreinterpretq_u32_u8( vcombine_u8(out8, out8) ); \
+  }
+  // clamp aa/bb to [0, STBIR_max_uint16_as_float], convert to int, and narrow to eight unsigned 16-bit lanes (aa's four, then bb's four)
+  #define stbir__simdf_pack_to_8words(out,aa,bb) \
+  { \
+    float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
+    float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
+    int32x4_t ai = vcvtq_s32_f32( af ); \
+    int32x4_t bi = vcvtq_s32_f32( bf ); \
+    out = vreinterpretq_u32_u16( vcombine_u16(vqmovun_s32(ai), vqmovun_s32(bi)) ); \
+  }
+  // narrow four int32 vectors to bytes with saturation and store 16 bytes at ptr ordered r0[0],r1[0],r2[0],r3[0],r0[1],... (ptr is passed to vst2_u8 as-is, so presumably a uint8_t pointer - confirm at call sites)
+  #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
+  { \
+    int16x4x2_t tmp0 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r0)), vqmovn_s32(vreinterpretq_s32_u32(r2)) ); \
+    int16x4x2_t tmp1 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r1)), vqmovn_s32(vreinterpretq_s32_u32(r3)) ); \
+    uint8x8x2_t out = \
+    { { \
+      vqmovun_s16( vcombine_s16(tmp0.val[0], tmp0.val[1]) ), \
+      vqmovun_s16( vcombine_s16(tmp1.val[0], tmp1.val[1]) ), \
+    } }; \
+    vst2_u8(ptr, out); \
+  }
+  // load 16 floats from ptr with a 4-way de-interleaving load, so o0..o3 receive every fourth element starting at offsets 0..3 (a 4x4 transpose)
+  #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
+  { \
+    float32x4x4_t tmp = vld4q_f32(ptr); \
+    o0 = tmp.val[0]; \
+    o1 = tmp.val[1]; \
+    o2 = tmp.val[2]; \
+    o3 = tmp.val[3]; \
+  }
+
+  #define stbir__simdi_32shr( out, reg, imm ) out = vshrq_n_u32( reg, imm ) // unsigned right shift of every 32-bit lane by imm
+  // vector-constant helpers: MSVC declares 4-element arrays and reads them back through a vector-pointer cast; other compilers declare the vector type directly and read it as-is
+  #if defined( _MSC_VER ) && !defined(__clang__)
+    #define STBIR__SIMDF_CONST(var, x) __declspec(align(8)) float var[] = { x, x, x, x }
+    #define STBIR__SIMDI_CONST(var, x) __declspec(align(8)) uint32_t var[] = { x, x, x, x }
+    #define STBIR__CONSTF(var) (*(const float32x4_t*)var)
+    #define STBIR__CONSTI(var) (*(const uint32x4_t*)var)
+  #else
+    #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
+    #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
+    #define STBIR__CONSTF(var) (var)
+    #define STBIR__CONSTI(var) (var)
+  #endif
+  // floor for the NEON path: any earlier STBIR_FLOORF definition is replaced by stbir_simd_floorf
+  #ifdef STBIR_FLOORF
+  #undef STBIR_FLOORF
+  #endif
+  #define STBIR_FLOORF stbir_simd_floorf
+  static stbir__inline float stbir_simd_floorf(float x)
+  {
+    float32x2_t value = vdup_n_f32(x);
+    #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
+    return vget_lane_f32( vrndm_f32( value ), 0);
+    #else
+    float32x2_t whole = vcvt_f32_s32(vcvt_s32_f32(value));  // truncate toward zero via int32
+    uint32x2_t above = vclt_f32(value, whole);  // all ones when truncation landed above x
+    uint32x2_t minus_one = vreinterpret_u32_f32(vdup_n_f32(-1.0f));
+    float32x2_t step = vreinterpret_f32_u32(vand_u32(above, minus_one));  // -1.0f or 0
+    return vget_lane_f32(vadd_f32(whole, step), 0);
+    #endif
+  }
+  // ceil for the NEON path: any earlier STBIR_CEILF definition is replaced by stbir_simd_ceilf
+  #ifdef STBIR_CEILF
+  #undef STBIR_CEILF
+  #endif
+  #define STBIR_CEILF stbir_simd_ceilf
+  static stbir__inline float stbir_simd_ceilf(float x)
+  {
+    float32x2_t value = vdup_n_f32(x);
+    #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
+    return vget_lane_f32( vrndp_f32( value ), 0);
+    #else
+    float32x2_t whole = vcvt_f32_s32(vcvt_s32_f32(value));  // truncate toward zero via int32
+    uint32x2_t below = vclt_f32(whole, value);  // all ones when truncation landed below x
+    uint32x2_t plus_one = vreinterpret_u32_f32(vdup_n_f32(1.0f));
+    float32x2_t step = vreinterpret_f32_u32(vand_u32(below, plus_one));  // 1.0f or 0
+    return vget_lane_f32(vadd_f32(whole, step), 0);
+    #endif
+  }
+
+  #define STBIR_SIMD
+
+#elif defined(STBIR_WASM)
+
+  #include <wasm_simd128.h>
+
+  #define stbir__simdf v128_t
+  #define stbir__simdi v128_t
+
+  #define stbir_simdi_castf( reg ) (reg)
+  #define stbir_simdf_casti( reg ) (reg)
+
+  #define stbir__simdf_load( reg, ptr )             (reg) = wasm_v128_load( (void const*)(ptr) )
+  #define stbir__simdi_load( reg, ptr )             (reg) = wasm_v128_load( (void const*)(ptr) )
+  #define stbir__simdf_load1( out, ptr )            (out) = wasm_v128_load32_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdi_load1( out, ptr )            (out) = wasm_v128_load32_splat( (void const*)(ptr) )
+  #define stbir__simdf_load1z( out, ptr )           (out) = wasm_v128_load32_zero( (void const*)(ptr) ) // top values must be zero
+  #define stbir__simdf_frep4( fvar )                wasm_f32x4_splat( fvar )
+  #define stbir__simdf_load1frep4( out, fvar )      (out) = wasm_f32x4_splat( fvar )
+  #define stbir__simdf_load2( out, ptr )            (out) = wasm_v128_load64_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdf_load2z( out, ptr )           (out) = wasm_v128_load64_zero( (void const*)(ptr) ) // top values must be zero
+  #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = wasm_v128_load64_lane( (void const*)(ptr), reg, 1 )
+
+  #define stbir__simdf_zeroP() wasm_f32x4_const_splat(0)
+  #define stbir__simdf_zero( reg ) (reg) = wasm_f32x4_const_splat(0)
+
+  #define stbir__simdf_store( ptr, reg )   wasm_v128_store( (void*)(ptr), reg )
+  #define stbir__simdf_store1( ptr, reg )  wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
+  #define stbir__simdf_store2( ptr, reg )  wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
+  #define stbir__simdf_store2h( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 1 )
+
+  #define stbir__simdi_store( ptr, reg )  wasm_v128_store( (void*)(ptr), reg )
+  #define stbir__simdi_store1( ptr, reg ) wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
+  #define stbir__simdi_store2( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
+
+  #define stbir__prefetch( ptr )
+
+  #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
+  { \
+    v128_t l = wasm_u16x8_extend_low_u8x16 ( ireg ); \
+    v128_t h = wasm_u16x8_extend_high_u8x16( ireg ); \
+    out0 = wasm_u32x4_extend_low_u16x8 ( l ); \
+    out1 = wasm_u32x4_extend_high_u16x8( l ); \
+    out2 = wasm_u32x4_extend_low_u16x8 ( h ); \
+    out3 = wasm_u32x4_extend_high_u16x8( h ); \
+  }
+
+  #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
+  { \
+    v128_t tmp = wasm_u16x8_extend_low_u8x16(ireg); \
+    out = wasm_u32x4_extend_low_u16x8(tmp); \
+  }
+
+  #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
+  { \
+    out0 = wasm_u32x4_extend_low_u16x8 ( ireg ); \
+    out1 = wasm_u32x4_extend_high_u16x8( ireg ); \
+  }
+
+  #define stbir__simdf_convert_float_to_i32( i, f )    (i) = wasm_i32x4_trunc_sat_f32x4(f)
+  #define stbir__simdf_convert_float_to_int( f )       wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(f), 0)
+  #define stbir__simdi_to_int( i )                     wasm_i32x4_extract_lane(i, 0)
+  #define stbir__simdf_convert_float_to_uint8( f )     ((unsigned char)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint8_as_float),wasm_f32x4_const_splat(0))), 0))
+  #define stbir__simdf_convert_float_to_short( f )     ((unsigned short)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint16_as_float),wasm_f32x4_const_splat(0))), 0))
+  #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = wasm_f32x4_convert_i32x4(ireg)
+  #define stbir__simdf_add( out, reg0, reg1 )          (out) = wasm_f32x4_add( reg0, reg1 )
+  #define stbir__simdf_mult( out, reg0, reg1 )         (out) = wasm_f32x4_mul( reg0, reg1 )
+  #define stbir__simdf_mult_mem( out, reg, ptr )       (out) = wasm_f32x4_mul( reg, wasm_v128_load( (void const*)(ptr) ) )
+  #define stbir__simdf_mult1_mem( out, reg, ptr )      (out) = wasm_f32x4_mul( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
+  #define stbir__simdf_add_mem( out, reg, ptr )        (out) = wasm_f32x4_add( reg, wasm_v128_load( (void const*)(ptr) ) )
+  #define stbir__simdf_add1_mem( out, reg, ptr )       (out) = wasm_f32x4_add( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
+
+  #define stbir__simdf_madd( out, add, mul1, mul2 )    (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 )   (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr )  (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load( (void const*)(ptr) ) ) )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load32_splat( (void const*)(ptr) ) ) )
+
+  #define stbir__simdf_add1( out, reg0, reg1 )  (out) = wasm_f32x4_add( reg0, reg1 )
+  #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = wasm_f32x4_mul( reg0, reg1 )
+
+  #define stbir__simdf_and( out, reg0, reg1 ) (out) = wasm_v128_and( reg0, reg1 )
+  #define stbir__simdf_or( out, reg0, reg1 )  (out) = wasm_v128_or( reg0, reg1 )
+
+  #define stbir__simdf_min( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
+  #define stbir__simdf_max( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
+  #define stbir__simdf_min1( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
+  #define stbir__simdf_max1( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
+
+  #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 3, 4, 5, -1 )
+  #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 2, 3, 4, -1 )
+
+  #define stbir__simdf_aaa1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 3, 3, 3, 4)
+  #define stbir__simdf_1aaa(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 0, 0)
+  #define stbir__simdf_a1a1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 1, 4, 3, 4)
+  #define stbir__simdf_1a1a(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 4, 2)
+
+  #define stbir__simdf_swiz( reg, one, two, three, four ) wasm_i32x4_shuffle(reg, reg, one, two, three, four)
+
+  #define stbir__simdi_and( out, reg0, reg1 )    (out) = wasm_v128_and( reg0, reg1 )
+  #define stbir__simdi_or( out, reg0, reg1 )     (out) = wasm_v128_or( reg0, reg1 )
+  #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = wasm_i32x4_dot_i16x8( reg0, reg1 )
+
+  #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
+  { \
+    v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
+    v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
+    v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
+    v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
+    v128_t out16 = wasm_i16x8_narrow_i32x4( ai, bi ); \
+    out = wasm_u8x16_narrow_i16x8( out16, out16 ); \
+  }
+
+  #define stbir__simdf_pack_to_8words(out,aa,bb) \
+  { \
+    v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
+    v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
+    v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
+    v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
+    out = wasm_u16x8_narrow_i32x4( ai, bi ); \
+  }
+
+  #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
+  { \
+    v128_t tmp0 = wasm_i16x8_narrow_i32x4(r0, r1); \
+    v128_t tmp1 = wasm_i16x8_narrow_i32x4(r2, r3); \
+    v128_t tmp = wasm_u8x16_narrow_i16x8(tmp0, tmp1); \
+    tmp = wasm_i8x16_shuffle(tmp, tmp, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); \
+    wasm_v128_store( (void*)(ptr), tmp); \
+  }
+
+  #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
+  { /* loads a 4x4 block of floats at ptr and returns its transpose in o0..o3; ptr is parenthesized so any pointer expression is safe */ \
+    v128_t t0 = wasm_v128_load( (ptr)    ); \
+    v128_t t1 = wasm_v128_load( (ptr)+4  ); \
+    v128_t t2 = wasm_v128_load( (ptr)+8  ); \
+    v128_t t3 = wasm_v128_load( (ptr)+12 ); \
+    v128_t s0 = wasm_i32x4_shuffle(t0, t1, 0, 4, 2, 6); /* even columns of rows 0 and 1, interleaved */ \
+    v128_t s1 = wasm_i32x4_shuffle(t0, t1, 1, 5, 3, 7); /* odd columns of rows 0 and 1, interleaved  */ \
+    v128_t s2 = wasm_i32x4_shuffle(t2, t3, 0, 4, 2, 6); /* even columns of rows 2 and 3, interleaved */ \
+    v128_t s3 = wasm_i32x4_shuffle(t2, t3, 1, 5, 3, 7); /* odd columns of rows 2 and 3, interleaved  */ \
+    o0 = wasm_i32x4_shuffle(s0, s2, 0, 1, 4, 5); /* column 0 of all four rows */ \
+    o1 = wasm_i32x4_shuffle(s1, s3, 0, 1, 4, 5); /* column 1 */ \
+    o2 = wasm_i32x4_shuffle(s0, s2, 2, 3, 6, 7); /* column 2 */ \
+    o3 = wasm_i32x4_shuffle(s1, s3, 2, 3, 6, 7); /* column 3 */ \
+  }
+
+  #define stbir__simdi_32shr( out, reg, imm ) out = wasm_u32x4_shr( reg, imm )
+
+  typedef float stbir__f32x4 __attribute__((__vector_size__(16), __aligned__(16)));
+  #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = (v128_t)(stbir__f32x4){ x, x, x, x }
+  #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
+  #define STBIR__CONSTF(var) (var)
+  #define STBIR__CONSTI(var) (var)
+
+  #ifdef STBIR_FLOORF
+  #undef STBIR_FLOORF
+  #endif
+  #define STBIR_FLOORF stbir_simd_floorf
+  static stbir__inline float stbir_simd_floorf(float x) // floor of x via WASM SIMD; installed as STBIR_FLOORF just above
+  {
+    return wasm_f32x4_extract_lane( wasm_f32x4_floor( wasm_f32x4_splat(x) ), 0); // splat x, floor all lanes, read lane 0 back
+  }
+
+  #ifdef STBIR_CEILF
+  #undef STBIR_CEILF
+  #endif
+  #define STBIR_CEILF stbir_simd_ceilf
+  static stbir__inline float stbir_simd_ceilf(float x) // ceiling of x via WASM SIMD; installed as STBIR_CEILF just above
+  {
+    return wasm_f32x4_extract_lane( wasm_f32x4_ceil( wasm_f32x4_splat(x) ), 0); // splat x, ceil all lanes, read lane 0 back
+  }
+
+  #define STBIR_SIMD
+
+#endif  // SSE2/NEON/WASM
+
+#endif // NO SIMD
+
+#ifdef STBIR_SIMD8
+  #define stbir__simdfX stbir__simdf8
+  #define stbir__simdiX stbir__simdi8
+  #define stbir__simdfX_load stbir__simdf8_load
+  #define stbir__simdiX_load stbir__simdi8_load
+  #define stbir__simdfX_mult stbir__simdf8_mult
+  #define stbir__simdfX_add_mem stbir__simdf8_add_mem
+  #define stbir__simdfX_madd_mem stbir__simdf8_madd_mem
+  #define stbir__simdfX_store stbir__simdf8_store
+  #define stbir__simdiX_store stbir__simdi8_store
+  #define stbir__simdf_frepX  stbir__simdf8_frep8
+  #define stbir__simdfX_madd stbir__simdf8_madd
+  #define stbir__simdfX_min stbir__simdf8_min
+  #define stbir__simdfX_max stbir__simdf8_max
+  #define stbir__simdfX_aaa1 stbir__simdf8_aaa1
+  #define stbir__simdfX_1aaa stbir__simdf8_1aaa
+  #define stbir__simdfX_a1a1 stbir__simdf8_a1a1
+  #define stbir__simdfX_1a1a stbir__simdf8_1a1a
+  #define stbir__simdfX_convert_float_to_i32 stbir__simdf8_convert_float_to_i32
+  #define stbir__simdfX_pack_to_words stbir__simdf8_pack_to_16words
+  #define stbir__simdfX_zero stbir__simdf8_zero
+  #define STBIR_onesX STBIR_ones8
+  #define STBIR_max_uint8_as_floatX STBIR_max_uint8_as_float8
+  #define STBIR_max_uint16_as_floatX STBIR_max_uint16_as_float8
+  #define STBIR_simd_point5X STBIR_simd_point58
+  #define stbir__simdfX_float_count 8
+  #define stbir__simdfX_0123to1230 stbir__simdf8_0123to12301230
+  #define stbir__simdfX_0123to2103 stbir__simdf8_0123to21032103
+  static const stbir__simdf8 STBIR_max_uint16_as_float_inverted8 = { stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted };
+  static const stbir__simdf8 STBIR_max_uint8_as_float_inverted8 = { stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted };
+  static const stbir__simdf8 STBIR_ones8 = { 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 };
+  static const stbir__simdf8 STBIR_simd_point58 = { 0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5 };
+  static const stbir__simdf8 STBIR_max_uint8_as_float8 = { stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float, stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float };
+  static const stbir__simdf8 STBIR_max_uint16_as_float8 = { stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float, stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float };
+#else
+  #define stbir__simdfX stbir__simdf
+  #define stbir__simdiX stbir__simdi
+  #define stbir__simdfX_load stbir__simdf_load
+  #define stbir__simdiX_load stbir__simdi_load
+  #define stbir__simdfX_mult stbir__simdf_mult
+  #define stbir__simdfX_add_mem stbir__simdf_add_mem
+  #define stbir__simdfX_madd_mem stbir__simdf_madd_mem
+  #define stbir__simdfX_store stbir__simdf_store
+  #define stbir__simdiX_store stbir__simdi_store
+  #define stbir__simdf_frepX  stbir__simdf_frep4
+  #define stbir__simdfX_madd stbir__simdf_madd
+  #define stbir__simdfX_min stbir__simdf_min
+  #define stbir__simdfX_max stbir__simdf_max
+  #define stbir__simdfX_aaa1 stbir__simdf_aaa1
+  #define stbir__simdfX_1aaa stbir__simdf_1aaa
+  #define stbir__simdfX_a1a1 stbir__simdf_a1a1
+  #define stbir__simdfX_1a1a stbir__simdf_1a1a
+  #define stbir__simdfX_convert_float_to_i32 stbir__simdf_convert_float_to_i32
+  #define stbir__simdfX_pack_to_words stbir__simdf_pack_to_8words
+  #define stbir__simdfX_zero stbir__simdf_zero
+  #define STBIR_onesX STBIR__CONSTF(STBIR_ones)
+  #define STBIR_simd_point5X STBIR__CONSTF(STBIR_simd_point5)
+  #define STBIR_max_uint8_as_floatX STBIR__CONSTF(STBIR_max_uint8_as_float)
+  #define STBIR_max_uint16_as_floatX STBIR__CONSTF(STBIR_max_uint16_as_float)
+  #define stbir__simdfX_float_count 4
+  #define stbir__if_simdf8_cast_to_simdf4( val ) ( val )
+  #define stbir__simdfX_0123to1230 stbir__simdf_0123to1230
+  #define stbir__simdfX_0123to2103 stbir__simdf_0123to2103
+#endif
+
+
+#if defined(STBIR_NEON) && !defined(_M_ARM) && !defined(__arm__)
+
+  #if defined( _MSC_VER ) && !defined(__clang__)
+  typedef __int16 stbir__FP16;
+  #else
+  typedef float16_t stbir__FP16;
+  #endif
+
+#else // no NEON, or 32-bit ARM for MSVC
+
+  typedef union stbir__FP16 // half-float storage used when the native float16_t / __int16 typedef above is not selected
+  {
+    unsigned short u; // raw 16-bit pattern; the scalar converters in this file mask sign/exponent/mantissa bits out of it
+  } stbir__FP16;
+
+#endif
+
+#if (!defined(STBIR_NEON) && !defined(STBIR_FP16C)) || (defined(STBIR_NEON) && defined(_M_ARM)) || (defined(STBIR_NEON) && defined(__arm__))
+
+  // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
+
+  static stbir__inline float stbir__half_to_float( stbir__FP16 h ) // scalar half -> float, working from the bit pattern in h.u
+  {
+    static const stbir__FP32 magic = { (254 - 15) << 23 };       // multiplier that re-biases the exponent
+    static const stbir__FP32 was_infnan = { (127 + 16) << 23 };  // products at or above this came from a half Inf/NaN
+    stbir__FP32 o;
+
+    o.u = (h.u & 0x7fff) << 13;     // exponent/mantissa bits
+    o.f *= magic.f;                 // exponent adjust
+    if (o.f >= was_infnan.f)        // make sure Inf/NaN survive
+      o.u |= 255 << 23;
+    o.u |= ( (unsigned int) ( h.u & 0x8000 ) ) << 16; // sign bit; widened to unsigned first so the shift into bit 31 is well defined
+    return o.f;
+  }
+
+  static stbir__inline stbir__FP16 stbir__float_to_half(float val) // scalar float -> half, returns the 16-bit pattern in .u
+  {
+    stbir__FP32 f32infty = { 255 << 23 };                               // all float exponent bits set, mantissa clear
+    stbir__FP32 f16max   = { (127 + 16) << 23 };                        // magnitudes at or above this become half Inf/NaN
+    stbir__FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };  // addend used by the subnormal path below
+    unsigned int sign_mask = 0x80000000u;
+    stbir__FP16 o = { 0 };
+    stbir__FP32 f;
+    unsigned int sign;
+
+    f.f = val;
+    sign = f.u & sign_mask; // remember the sign bit ...
+    f.u ^= sign;            // ... and continue with the magnitude only
+
+    if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set)
+      o.u = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+    else // (De)normalized number or zero
+    {
+      if (f.u < (113 << 23)) // resulting FP16 is subnormal or zero
+      {
+        // use a magic value to align our 10 mantissa bits at the bottom of
+        // the float. as long as FP addition is round-to-nearest-even this
+        // just works.
+        f.f += denorm_magic.f;
+        // and one integer subtract of the bias later, we have our final half bits!
+        o.u = (unsigned short) ( f.u - denorm_magic.u );
+      }
+      else
+      {
+        unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+        // update exponent, rounding bias part 1
+        f.u = f.u + ((15u - 127) << 23) + 0xfff;
+        // rounding bias part 2
+        f.u += mant_odd;
+        // take the bits!
+        o.u = (unsigned short) ( f.u >> 13 );
+      }
+    }
+
+    o.u |= sign >> 16; // float sign (bit 31) lands on half sign (bit 15)
+    return o;
+  }
+
+#endif
+
+
+#if defined(STBIR_FP16C)
+
+  #include <immintrin.h>
+
+  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input) // F16C: 8 halfs in, 8 floats out
+  {
+    _mm256_storeu_ps( (float*)output, _mm256_cvtph_ps( _mm_loadu_si128( (__m128i const* )input ) ) ); // unaligned 128-bit load, widen, unaligned 256-bit store
+  }
+
+  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input) // F16C: 8 floats in, 8 halfs out
+  {
+    _mm_storeu_si128( (__m128i*)output, _mm256_cvtps_ph( _mm256_loadu_ps( input ), 0 ) ); // rounding immediate 0 (round-to-nearest-even per the intrinsic's encoding)
+  }
+
+  static stbir__inline float stbir__half_to_float( stbir__FP16 h ) // F16C: single half -> float
+  {
+    return _mm_cvtss_f32( _mm_cvtph_ps( _mm_cvtsi32_si128( (int)h.u ) ) ); // bits go into lane 0, get widened, lane 0 is returned
+  }
+
+  static stbir__inline stbir__FP16 stbir__float_to_half( float f ) // F16C: single float -> half
+  {
+    stbir__FP16 h;
+    h.u = (unsigned short) _mm_cvtsi128_si32( _mm_cvtps_ph( _mm_set_ss( f ), 0 ) ); // convert lane 0 with rounding immediate 0, keep the low 16 bits
+    return h;
+  }
+
+#elif defined(STBIR_SSE2)
+
+  // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
+  stbir__inline static void stbir__half_to_float_SIMD(float * output, void const * input) // SSE2: 8 halfs at input -> 8 floats at output
+  {
+    static const STBIR__SIMDI_CONST(mask_nosign,      0x7fff);            // exponent+mantissa bits of a half
+    static const STBIR__SIMDI_CONST(smallest_normal,  0x0400);            // halfs below this are subnormal or zero
+    static const STBIR__SIMDI_CONST(infinity,         0x7c00);            // halfs at or above this are Inf/NaN
+    static const STBIR__SIMDI_CONST(expadjust_normal, (127 - 15) << 23);  // exponent re-bias, half -> float
+    static const STBIR__SIMDI_CONST(magic_denorm,     113 << 23);         // float used to renormalize subnormal halfs
+
+    __m128i i = _mm_loadu_si128 ( (__m128i const*)(input) );
+    __m128i h = _mm_unpacklo_epi16 ( i, _mm_setzero_si128() );            // low four halfs, zero-extended to 32 bits
+    __m128i mnosign     = STBIR__CONSTI(mask_nosign);
+    __m128i eadjust     = STBIR__CONSTI(expadjust_normal);
+    __m128i smallest    = STBIR__CONSTI(smallest_normal);
+    __m128i infty       = STBIR__CONSTI(infinity);
+    __m128i expmant     = _mm_and_si128(mnosign, h);
+    __m128i justsign    = _mm_xor_si128(h, expmant);
+    __m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
+    __m128i b_isdenorm  = _mm_cmpgt_epi32(smallest, expmant);
+    __m128i shifted     = _mm_slli_epi32(expmant, 13);
+    __m128i adj_infnan  = _mm_andnot_si128(b_notinfnan, eadjust);         // extra exponent bump, applied only to Inf/NaN lanes
+    __m128i adjusted    = _mm_add_epi32(eadjust, shifted);
+    __m128i den1        = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
+    __m128i adjusted2   = _mm_add_epi32(adjusted, adj_infnan);
+    __m128  den2        = _mm_sub_ps(_mm_castsi128_ps(den1), _mm_castsi128_ps(STBIR__CONSTI(magic_denorm))); // same constant, read via the macro instead of a pointer cast
+    __m128  adjusted3   = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
+    __m128  adjusted4   = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
+    __m128  adjusted5   = _mm_or_ps(adjusted3, adjusted4);                // per-lane select: subnormal path or normal/Inf/NaN path
+    __m128i sign        = _mm_slli_epi32(justsign, 16);
+    __m128  final       = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
+    stbir__simdf_store( output + 0,  final );
+
+    h = _mm_unpackhi_epi16 ( i, _mm_setzero_si128() );                    // high four halfs, same sequence again
+    expmant     = _mm_and_si128(mnosign, h);
+    justsign    = _mm_xor_si128(h, expmant);
+    b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
+    b_isdenorm  = _mm_cmpgt_epi32(smallest, expmant);
+    shifted     = _mm_slli_epi32(expmant, 13);
+    adj_infnan  = _mm_andnot_si128(b_notinfnan, eadjust);
+    adjusted    = _mm_add_epi32(eadjust, shifted);
+    den1        = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
+    adjusted2   = _mm_add_epi32(adjusted, adj_infnan);
+    den2        = _mm_sub_ps(_mm_castsi128_ps(den1), _mm_castsi128_ps(STBIR__CONSTI(magic_denorm)));
+    adjusted3   = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
+    adjusted4   = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
+    adjusted5   = _mm_or_ps(adjusted3, adjusted4);
+    sign        = _mm_slli_epi32(justsign, 16);
+    final       = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
+    stbir__simdf_store( output + 4,  final );
+
+    // ~38 SSE2 ops for 8 values
+  }
+
+  // Fabian's round-to-nearest-even float to half
+  // ~48 SSE2 ops for 8 output
+  stbir__inline static void stbir__float_to_half_SIMD(void * output, float const * input) // SSE2: 8 floats at input -> 8 halfs at output
+  {
+    static const STBIR__SIMDI_CONST(mask_sign,      0x80000000u);
+    static const STBIR__SIMDI_CONST(c_f16max,       (127 + 16) << 23); // all FP32 values >=this round to +inf
+    static const STBIR__SIMDI_CONST(c_nanbit,        0x200);
+    static const STBIR__SIMDI_CONST(c_infty_as_fp16, 0x7c00);
+    static const STBIR__SIMDI_CONST(c_min_normal,    (127 - 14) << 23); // smallest FP32 that yields a normalized FP16
+    static const STBIR__SIMDI_CONST(c_subnorm_magic, ((127 - 15) + (23 - 10) + 1) << 23);
+    static const STBIR__SIMDI_CONST(c_normal_bias,    0xfff - ((127 - 15) << 23)); // adjust exponent and add mantissa rounding
+
+    __m128  f           =  _mm_loadu_ps(input);
+    __m128  msign       = _mm_castsi128_ps(STBIR__CONSTI(mask_sign));
+    __m128  justsign    = _mm_and_ps(msign, f);
+    __m128  absf        = _mm_xor_ps(f, justsign);
+    __m128i absf_int    = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
+    __m128i f16max      = STBIR__CONSTI(c_f16max);
+    __m128  b_isnan     = _mm_cmpunord_ps(absf, absf); // is this a NaN?
+    __m128i b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
+    __m128i nanbit      = _mm_and_si128(_mm_castps_si128(b_isnan), STBIR__CONSTI(c_nanbit));
+    __m128i inf_or_nan  = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
+
+    __m128i min_normal  = STBIR__CONSTI(c_min_normal);
+    __m128i b_issub     = _mm_cmpgt_epi32(min_normal, absf_int);
+
+    // "result is subnormal" path
+    __m128  subnorm1    = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
+    __m128i subnorm2    = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
+
+    // "result is normal" path
+    __m128i mantoddbit  = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
+    __m128i mantodd     = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
+
+    __m128i round1      = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
+    __m128i round2      = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
+    __m128i normal      = _mm_srli_epi32(round2, 13); // rounded result
+
+    // combine the two non-specials
+    __m128i nonspecial  = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
+
+    // merge in specials as well
+    __m128i joined      = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
+
+    __m128i sign_shift  = _mm_srai_epi32(_mm_castps_si128(justsign), 16); // arithmetic shift: a set sign becomes 0xffff8000, so the signed pack below keeps the low 16 bits
+    __m128i final2, final= _mm_or_si128(joined, sign_shift);
+
+    f           =  _mm_loadu_ps(input+4); // second group of four floats, same sequence again
+    justsign    = _mm_and_ps(msign, f);
+    absf        = _mm_xor_ps(f, justsign);
+    absf_int    = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
+    b_isnan     = _mm_cmpunord_ps(absf, absf); // is this a NaN?
+    b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
+    nanbit      = _mm_and_si128(_mm_castps_si128(b_isnan), STBIR__CONSTI(c_nanbit)); // read through the accessor macro, as in the first half
+    inf_or_nan  = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
+
+    b_issub     = _mm_cmpgt_epi32(min_normal, absf_int);
+
+    // "result is subnormal" path
+    subnorm1    = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
+    subnorm2    = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
+
+    // "result is normal" path
+    mantoddbit  = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
+    mantodd     = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
+
+    round1      = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
+    round2      = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
+    normal      = _mm_srli_epi32(round2, 13); // rounded result
+
+    // combine the two non-specials
+    nonspecial  = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
+
+    // merge in specials as well
+    joined      = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
+
+    sign_shift  = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
+    final2      = _mm_or_si128(joined, sign_shift);
+    final       = _mm_packs_epi32(final, final2); // narrow both groups of four 32-bit lanes into eight 16-bit halfs
+    stbir__simdi_store( output,final );
+  }
+
+#elif defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // 64-bit ARM on MSVC (not clang)
+
+  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input) // MSVC ARM64: 8 halfs in, 8 floats out
+  {
+    float16x4_t in0 = vld1_f16(input + 0); // halfs 0..3
+    float16x4_t in1 = vld1_f16(input + 4); // halfs 4..7
+    vst1q_f32(output + 0, vcvt_f32_f16(in0));
+    vst1q_f32(output + 4, vcvt_f32_f16(in1));
+  }
+
+  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input) // MSVC ARM64: 8 floats in, 8 halfs out
+  {
+    float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0)); // floats 0..3 narrowed
+    float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4)); // floats 4..7 narrowed
+    vst1_f16(output+0, out0);
+    vst1_f16(output+4, out1);
+  }
+
+  static stbir__inline float stbir__half_to_float( stbir__FP16 h ) // MSVC ARM64: single half -> float
+  {
+    return vgetq_lane_f32(vcvt_f32_f16(vld1_dup_f16(&h)), 0); // load h into every lane, widen, return lane 0
+  }
+
+  static stbir__inline stbir__FP16 stbir__float_to_half( float f ) // MSVC ARM64: single float -> half
+  {
+    return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0).n16_u16[0]; // splat f, narrow, take lane 0; .n16_u16 presumably exposes MSVC's raw-bits view - confirm
+  }
+
+#elif defined(STBIR_NEON) && ( defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) ) // 64-bit ARM
+
+  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input) // ARM64 NEON: 8 halfs in, 8 floats out
+  {
+    float16x8_t in = vld1q_f16(input); // all eight halfs in one load
+    vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(in)));  // lanes 0..3 widened
+    vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(in))); // lanes 4..7 widened
+  }
+
+  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input) // ARM64 NEON: 8 floats in, 8 halfs out
+  {
+    float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0)); // floats 0..3 narrowed
+    float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4)); // floats 4..7 narrowed
+    vst1q_f16(output, vcombine_f16(out0, out1)); // joined and written with a single store
+  }
+
+  static stbir__inline float stbir__half_to_float( stbir__FP16 h ) // ARM64 NEON: single half -> float
+  {
+    return vgetq_lane_f32(vcvt_f32_f16(vdup_n_f16(h)), 0); // splat h, widen, return lane 0
+  }
+
+  static stbir__inline stbir__FP16 stbir__float_to_half( float f ) // ARM64 NEON: single float -> half
+  {
+    return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0); // splat f, narrow, return lane 0
+  }
+
+#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && (defined(_MSC_VER) || defined(_M_ARM) || defined(__arm__))) // WASM or 32-bit ARM on MSVC/clang
+
+  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
+  {
+    // fallback for this target: run the scalar converter over 8 consecutive elements
+    float * out_end = output + 8;
+    while ( output < out_end )
+      *output++ = stbir__half_to_float( *input++ );
+  }
+  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
+  {
+    // fallback for this target: run the scalar converter over 8 consecutive elements
+    float const * in_end = input + 8;
+    while ( input < in_end )
+      *output++ = stbir__float_to_half( *input++ );
+  }
+
+#endif
+
+
+#ifdef STBIR_SIMD
+
+#define stbir__simdf_0123to3333( out, reg ) (out) = stbir__simdf_swiz( reg, 3,3,3,3 )
+#define stbir__simdf_0123to2222( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,2,2 )
+#define stbir__simdf_0123to1111( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,1,1 )
+#define stbir__simdf_0123to0000( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,0 )
+#define stbir__simdf_0123to0003( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,3 )
+#define stbir__simdf_0123to0001( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,1 )
+#define stbir__simdf_0123to1122( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,2,2 )
+#define stbir__simdf_0123to2333( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,3,3 )
+#define stbir__simdf_0123to0023( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,3 )
+#define stbir__simdf_0123to1230( out, reg ) (out) = stbir__simdf_swiz( reg, 1,2,3,0 )
+#define stbir__simdf_0123to2103( out, reg ) (out) = stbir__simdf_swiz( reg, 2,1,0,3 )
+#define stbir__simdf_0123to3210( out, reg ) (out) = stbir__simdf_swiz( reg, 3,2,1,0 )
+#define stbir__simdf_0123to2301( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,0,1 )
+#define stbir__simdf_0123to3012( out, reg ) (out) = stbir__simdf_swiz( reg, 3,0,1,2 )
+#define stbir__simdf_0123to0011( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,1,1 )
+#define stbir__simdf_0123to1100( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,0,0 )
+#define stbir__simdf_0123to2233( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,3,3 )
+#define stbir__simdf_0123to1133( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,3,3 )
+#define stbir__simdf_0123to0022( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,2 )
+#define stbir__simdf_0123to1032( out, reg ) (out) = stbir__simdf_swiz( reg, 1,0,3,2 )
+
+typedef union stbir__simdi_u32 // overlays one integer SIMD register with per-lane scalar views
+{
+  stbir_uint32 m128i_u32[4]; // the four lanes as unsigned 32-bit values
+  int m128i_i32[4];          // the same four lanes as signed ints
+  stbir__simdi m128i_i128;   // the register itself
+} stbir__simdi_u32;
+
+static const int STBIR_mask[9] = { 0,0,0,-1,-1,-1,0,0,0 };
+
+static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float,           stbir__max_uint8_as_float);
+static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float,          stbir__max_uint16_as_float);
+static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float_inverted,  stbir__max_uint8_as_float_inverted);
+static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float_inverted, stbir__max_uint16_as_float_inverted);
+
+static const STBIR__SIMDF_CONST(STBIR_simd_point5,   0.5f);
+static const STBIR__SIMDF_CONST(STBIR_ones,          1.0f);
+static const STBIR__SIMDI_CONST(STBIR_almost_zero,   (127 - 13) << 23);
+static const STBIR__SIMDI_CONST(STBIR_almost_one,    0x3f7fffff);
+static const STBIR__SIMDI_CONST(STBIR_mastissa_mask, 0xff);
+static const STBIR__SIMDI_CONST(STBIR_topscale,      0x02000000);
+
+//   Basically, in simd mode, we unroll the proper amount, and we don't want
+//   the non-simd remnant loops to be unrolled because they only run a few times.
+//   Adding this switch saves about 5K on clang, which is Captain Unroll the 3rd.
+#define STBIR_SIMD_STREAMOUT_PTR( star )  STBIR_STREAMOUT_PTR( star )
+#define STBIR_SIMD_NO_UNROLL(ptr) STBIR_NO_UNROLL(ptr)
+#define STBIR_SIMD_NO_UNROLL_LOOP_START STBIR_NO_UNROLL_LOOP_START
+#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START_INF_FOR
+
+#ifdef STBIR_MEMCPY
+#undef STBIR_MEMCPY
+#endif
+#define STBIR_MEMCPY stbir_simd_memcpy
+
+// override normal use of memcpy with much simpler copy (faster and smaller with our sized copies)
+//   dest / src : non-overlapping buffers (checked with STBIR_ASSERT below)
+//   bytes      : number of bytes to copy; zero is allowed and copies nothing
+// Three size classes are handled:
+//   bytes < 16                           -> one byte at a time
+//   bytes < 16*stbir__simdfX_float_count -> 16 bytes at a time through stbir__simdf
+//   otherwise                            -> four stbir__simdfX vectors per iteration
+// In the two vector paths the final block is re-based to end exactly at d_end, so it may
+//   rewrite bytes that were already copied rather than falling back to a byte loop.
+static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
+{
+  char STBIR_SIMD_STREAMOUT_PTR (*) d = (char*) dest;
+  char STBIR_SIMD_STREAMOUT_PTR( * ) d_end = ((char*) dest) + bytes;
+  // the source is always addressed as d + ofs_to_src, so only one pointer is advanced
+  ptrdiff_t ofs_to_src = (char*)src - (char*)dest;
+
+  // check overlaps
+  STBIR_ASSERT( ( ( d >= ( (char*)src) + bytes ) ) || ( ( d + bytes ) <= (char*)src ) );
+
+  if ( bytes < (16*stbir__simdfX_float_count) )
+  {
+    if ( bytes < 16 )
+    {
+      // the do/while below always runs once, so zero-length copies must be filtered out here
+      if ( bytes )
+      {
+        STBIR_SIMD_NO_UNROLL_LOOP_START
+        do
+        {
+          STBIR_SIMD_NO_UNROLL(d);
+          d[ 0 ] = d[ ofs_to_src ];
+          ++d;
+        } while ( d < d_end );
+      }
+    }
+    else
+    {
+      stbir__simdf x;
+      // do one unaligned to get us aligned for the stream out below
+      stbir__simdf_load( x, ( d + ofs_to_src ) );
+      stbir__simdf_store( d, x );
+      // advance to the next 16-byte boundary strictly after d (moves 1..16 bytes, all already copied)
+      d = (char*)( ( ( (size_t)d ) + 16 ) & ~15 );
+
+      STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
+      for(;;)
+      {
+        STBIR_SIMD_NO_UNROLL(d);
+
+        // fewer than 16 bytes left: either we are done, or back up so the last store ends at d_end
+        if ( d > ( d_end - 16 ) )
+        {
+          if ( d == d_end )
+            return;
+          d = d_end - 16;
+        }
+
+        stbir__simdf_load( x, ( d + ofs_to_src ) );
+        stbir__simdf_store( d, x );
+        d += 16;
+      }
+    }
+  }
+  else
+  {
+    stbir__simdfX x0,x1,x2,x3;
+
+    // do one unaligned to get us aligned for the stream out below
+    //   (d is a char pointer, so the 0/4/8/12 multiples of stbir__simdfX_float_count are byte offsets)
+    stbir__simdfX_load( x0, ( d + ofs_to_src ) +  0*stbir__simdfX_float_count );
+    stbir__simdfX_load( x1, ( d + ofs_to_src ) +  4*stbir__simdfX_float_count );
+    stbir__simdfX_load( x2, ( d + ofs_to_src ) +  8*stbir__simdfX_float_count );
+    stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
+    stbir__simdfX_store( d +  0*stbir__simdfX_float_count, x0 );
+    stbir__simdfX_store( d +  4*stbir__simdfX_float_count, x1 );
+    stbir__simdfX_store( d +  8*stbir__simdfX_float_count, x2 );
+    stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
+    // advance to the next (16*stbir__simdfX_float_count)-byte boundary strictly after d
+    d = (char*)( ( ( (size_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) );
+
+    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
+    for(;;)
+    {
+      STBIR_SIMD_NO_UNROLL(d);
+
+      // less than one full block left: either we are done, or back up so the last block ends at d_end
+      if ( d > ( d_end - (16*stbir__simdfX_float_count) ) )
+      {
+        if ( d == d_end )
+          return;
+        d = d_end - (16*stbir__simdfX_float_count);
+      }
+
+      stbir__simdfX_load( x0, ( d + ofs_to_src ) +  0*stbir__simdfX_float_count );
+      stbir__simdfX_load( x1, ( d + ofs_to_src ) +  4*stbir__simdfX_float_count );
+      stbir__simdfX_load( x2, ( d + ofs_to_src ) +  8*stbir__simdfX_float_count );
+      stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
+      stbir__simdfX_store( d +  0*stbir__simdfX_float_count, x0 );
+      stbir__simdfX_store( d +  4*stbir__simdfX_float_count, x1 );
+      stbir__simdfX_store( d +  8*stbir__simdfX_float_count, x2 );
+      stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
+      d += (16*stbir__simdfX_float_count);
+    }
+  }
+}
+
+// memcpy that is specifically intentionally overlapping (src is smaller than dest, so can be
+//   a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
+//   the diff between dest and src)
+// Because dest overlaps the not-yet-read part of src, data written early is read back later,
+//   so each chunk copied must be no larger than the dest-src distance.
+// Both loops are do/while, so at least one chunk is always written; callers must honour the
+//   preconditions above (in particular, bytes must not be zero).
+static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
+{
+  char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
+  char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
+  // the destination is always addressed as sd + ofs_to_dest, so only one pointer is advanced
+  ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;
+
+  if ( ofs_to_dest >= 16 ) // is the overlap more than 16 away?
+  {
+    // end of the part that can be copied in whole 16-byte chunks
+    char STBIR_SIMD_STREAMOUT_PTR( * ) s_end16 = ((char*) src) + (bytes&~15);
+    STBIR_SIMD_NO_UNROLL_LOOP_START
+    do
+    {
+      stbir__simdf x;
+      STBIR_SIMD_NO_UNROLL(sd);
+      stbir__simdf_load( x, sd );
+      stbir__simdf_store(  ( sd + ofs_to_dest ), x );
+      sd += 16;
+    } while ( sd < s_end16 );
+
+    // bytes was a multiple of 16: nothing left for the 4-byte loop
+    if ( sd == s_end )
+      return;
+  }
+
+  // remaining bytes (or everything, when dest is closer than 16 bytes), 4 bytes at a time
+  do
+  {
+    STBIR_SIMD_NO_UNROLL(sd);
+    *(int*)( sd + ofs_to_dest ) = *(int*) sd;
+    sd += 4;
+  } while ( sd < s_end );
+}
+
+#else // no SSE2
+
+// when in scalar mode, we let unrolling happen, so the pointer macro just forwards to
+//   STBIR_STREAMOUT_PTR (the __restrict, per the original note) and the three no-unroll
+//   hooks below expand to nothing
+#define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star )
+#define STBIR_SIMD_NO_UNROLL(ptr)
+#define STBIR_SIMD_NO_UNROLL_LOOP_START
+#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
+
+#endif // SSE2
+
+
+// Optional micro profiler.  When STBIR_PROFILE is defined the macros below record cycle/tick
+//   counts per named zone; otherwise every profiling macro expands to nothing.
+#ifdef STBIR_PROFILE
+
+// A user may supply their own tick source by defining STBIR_PROFILE_FUNC before this point.
+#ifndef STBIR_PROFILE_FUNC
+
+#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ )
+
+#ifdef _MSC_VER
+
+  // MSVC: declare the __rdtsc intrinsic and use it directly
+  STBIRDEF stbir_uint64 __rdtsc();
+  #define STBIR_PROFILE_FUNC() __rdtsc()
+
+#else // non msvc
+
+  // x86 with GNU-style inline asm: rdtsc returns the counter split across eax (low) and edx (high)
+  static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
+  {
+    stbir_uint32 lo, hi;
+    asm volatile ("rdtsc" : "=a" (lo), "=d" (hi) );
+    return ( ( (stbir_uint64) hi ) << 32 ) | ( (stbir_uint64) lo );
+  }
+
+#endif  // msvc
+
+#elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__)
+
+#if defined( _MSC_VER ) && !defined(__clang__)
+
+  // MSVC on ARM64: read the CNTVCT status register through the compiler intrinsic
+  #define STBIR_PROFILE_FUNC() _ReadStatusReg(ARM64_CNTVCT)
+
+#else
+
+  // ARM with GNU-style inline asm: read the cntvct_el0 counter register
+  static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
+  {
+    stbir_uint64 tsc;
+    asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));
+    return tsc;
+  }
+
+#endif
+
+#else // x64, arm
+
+#error Unknown platform for profiling.
+
+#endif  // x64, arm
+
+#endif // STBIR_PROFILE_FUNC
+
+// Extra trailing parameter / argument text (note the leading comma) that is spliced into
+//   function signatures and calls only when profiling is enabled.
+#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO ,stbir__per_split_info * split_info
+#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO ,split_info
+
+#define STBIR_ONLY_PROFILE_BUILD_GET_INFO ,stbir__info * profile_info
+#define STBIR_ONLY_PROFILE_BUILD_SET_INFO ,profile_info
+
+// super light-weight micro profiler
+//   START_ll opens a brace scope and END_ll closes it, so the two must be paired within the same block.
+//   START_ll: timestamps the zone and redirects info->current_zone_excluded_ptr to a fresh local counter.
+//   END_ll:   adds (elapsed - time excluded by nested zones) to profile.named.<wh>, adds the full elapsed
+//             time to the parent's excluded counter, and restores the parent's pointer.
+//   FIRST_START_ll: zeroes profile.array and makes profile.named.total the root excluded counter, then starts wh.
+//   CLEAR_EXTRAS_ll: zeroes profile.array of info[1] .. info[num-1].
+#define STBIR_PROFILE_START_ll( info, wh ) { stbir_uint64 wh##thiszonetime = STBIR_PROFILE_FUNC(); stbir_uint64 * wh##save_parent_excluded_ptr = info->current_zone_excluded_ptr; stbir_uint64 wh##current_zone_excluded = 0; info->current_zone_excluded_ptr = &wh##current_zone_excluded;
+#define STBIR_PROFILE_END_ll( info, wh ) wh##thiszonetime = STBIR_PROFILE_FUNC() - wh##thiszonetime; info->profile.named.wh += wh##thiszonetime - wh##current_zone_excluded; *wh##save_parent_excluded_ptr += wh##thiszonetime; info->current_zone_excluded_ptr = wh##save_parent_excluded_ptr; }
+#define STBIR_PROFILE_FIRST_START_ll( info, wh ) { int i; info->current_zone_excluded_ptr = &info->profile.named.total; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; } STBIR_PROFILE_START_ll( info, wh );
+#define STBIR_PROFILE_CLEAR_EXTRAS_ll( info, num ) { int extra; for(extra=1;extra<(num);extra++) { int i; for(i=0;i<STBIR__ARRAY_SIZE((info)->profile.array);i++) (info)[extra].profile.array[i]=0; } }
+
+// for thread data (these expect variables named split_info and split_count to be in scope)
+#define STBIR_PROFILE_START( wh ) STBIR_PROFILE_START_ll( split_info, wh )
+#define STBIR_PROFILE_END( wh ) STBIR_PROFILE_END_ll( split_info, wh )
+#define STBIR_PROFILE_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( split_info, wh )
+#define STBIR_PROFILE_CLEAR_EXTRAS() STBIR_PROFILE_CLEAR_EXTRAS_ll( split_info, split_count )
+
+// for build data (these expect a variable named profile_info to be in scope)
+#define STBIR_PROFILE_BUILD_START( wh ) STBIR_PROFILE_START_ll( profile_info, wh )
+#define STBIR_PROFILE_BUILD_END( wh ) STBIR_PROFILE_END_ll( profile_info, wh )
+#define STBIR_PROFILE_BUILD_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( profile_info, wh )
+#define STBIR_PROFILE_BUILD_CLEAR( info ) { int i; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; }
+
+#else  // no profile
+
+// Profiling disabled: every hook expands to nothing, so call sites cost nothing.
+#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO
+#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO
+
+#define STBIR_ONLY_PROFILE_BUILD_GET_INFO
+#define STBIR_ONLY_PROFILE_BUILD_SET_INFO
+
+#define STBIR_PROFILE_START( wh )
+#define STBIR_PROFILE_END( wh )
+#define STBIR_PROFILE_FIRST_START( wh )
+#define STBIR_PROFILE_CLEAR_EXTRAS( )
+
+#define STBIR_PROFILE_BUILD_START( wh )
+#define STBIR_PROFILE_BUILD_END( wh )
+#define STBIR_PROFILE_BUILD_FIRST_START( wh )
+#define STBIR_PROFILE_BUILD_CLEAR( info )
+
+#endif  // stbir_profile
+
+// Default ceil/floor helpers; define STBIR_CEILF (and STBIR_FLOORF) beforehand to supply your own.
+// NOTE(review): an undefined _MSC_VER evaluates to 0 inside #if, so non-MSVC compilers also take
+//   the first (double-precision ceil/floor) branch, not only VC6 - confirm whether that is intended.
+#ifndef STBIR_CEILF
+#include <math.h>
+#if _MSC_VER <= 1200 // support VC6 for Sean
+#define STBIR_CEILF(x) ((float)ceil((float)(x)))
+#define STBIR_FLOORF(x) ((float)floor((float)(x)))
+#else
+#define STBIR_CEILF(x) ceilf(x)
+#define STBIR_FLOORF(x) floorf(x)
+#endif
+#endif
+
+// Fall back to the C library memcpy unless STBIR_MEMCPY has already been defined.
+#ifndef STBIR_MEMCPY
+// For memcpy
+#include <string.h>
+#define STBIR_MEMCPY( dest, src, len ) memcpy( dest, src, len )
+#endif
+
+#ifndef STBIR_SIMD
+
+// memcpy that is specifically intentionally overlapping (src is smaller than dest, so can be
+//   a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
+//   the diff between dest and src)
+// Scalar version: copies 8 bytes at a time when dest is at least 8 bytes past src, then
+//   finishes (or does everything) 4 bytes at a time.
+// Both loops are do/while, so at least one chunk is always written; callers must honour the
+//   preconditions above (in particular, bytes must not be zero).
+static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
+{
+  char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
+  char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
+  // the destination is always addressed as sd + ofs_to_dest, so only one pointer is advanced
+  ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;
+
+  if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away?
+  {
+    // end of the part that can be copied in whole 8-byte chunks
+    char STBIR_SIMD_STREAMOUT_PTR( * ) s_end8 = ((char*) src) + (bytes&~7);
+    STBIR_NO_UNROLL_LOOP_START
+    do
+    {
+      STBIR_NO_UNROLL(sd);
+      *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd;
+      sd += 8;
+    } while ( sd < s_end8 );
+
+    // bytes was a multiple of 8: nothing left for the 4-byte loop
+    if ( sd == s_end )
+      return;
+  }
+
+  // remaining bytes, 4 at a time
+  STBIR_NO_UNROLL_LOOP_START
+  do
+  {
+    STBIR_NO_UNROLL(sd);
+    *(int*)( sd + ofs_to_dest ) = *(int*) sd;
+    sd += 4;
+  } while ( sd < s_end );
+}
+
+#endif
+
+// Trapezoid kernel: weight is 1 while |x| <= 0.5 - scale/2, falls linearly to 0 at
+//   |x| >= 0.5 + scale/2.  scale must be <= 1 (asserted); user_data is ignored.
+static float stbir__filter_trapezoid(float x, float scale, void * user_data)
+{
+  float half = scale / 2;
+  float outer = 0.5f + half;  // distance at which the weight reaches zero
+  float inner;
+  float dist;
+
+  STBIR_ASSERT(scale <= 1);
+  STBIR__UNUSED(user_data);
+
+  // the kernel is symmetric, so work with the distance from the centre
+  dist = ( x < 0.0f ) ? -x : x;
+
+  if ( dist >= outer )
+    return 0.0f;
+
+  inner = 0.5f - half;       // edge of the flat top
+  if ( dist <= inner )
+    return 1.0f;
+
+  // on the ramp between the flat top and zero
+  return ( outer - dist ) / scale;
+}
+
+// Support (half-width) of the trapezoid kernel: 0.5 plus half the scale.  user_data is ignored.
+static float stbir__support_trapezoid(float scale, void * user_data)
+{
+  STBIR__UNUSED(user_data);
+
+  return scale / 2.0f + 0.5f;
+}
+
+// Triangle kernel: 1 - |x| for |x| <= 1, otherwise 0.  s and user_data are ignored.
+static float stbir__filter_triangle(float x, float s, void * user_data)
+{
+  float dist;
+
+  STBIR__UNUSED(user_data);
+  STBIR__UNUSED(s);
+
+  dist = ( x < 0.0f ) ? -x : x;
+
+  return ( dist <= 1.0f ) ? ( 1.0f - dist ) : 0.0f;
+}
+
+// Point-sample kernel: every sample gets weight 1, whatever the arguments are.
+static float stbir__filter_point(float x, float s, void * user_data)
+{
+  STBIR__UNUSED(user_data);
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(x);
+
+  return 1.0f;
+}
+
+// Piecewise cubic kernel with one polynomial for |x| < 1, another for 1 <= |x| < 2, and 0
+//   beyond that.  s and user_data are ignored.
+static float stbir__filter_cubic(float x, float s, void * user_data)
+{
+  float ax;
+
+  STBIR__UNUSED(user_data);
+  STBIR__UNUSED(s);
+
+  ax = ( x < 0.0f ) ? -x : x;
+
+  // inner lobe
+  if ( ax < 1.0f )
+    return (4.0f + ax*ax*(3.0f*ax - 6.0f))/6.0f;
+
+  // outer lobe
+  if ( ax < 2.0f )
+    return (8.0f + ax*(-12.0f + ax*(6.0f - ax)))/6.0f;
+
+  return (0.0f);
+}
+
+// Catmull-Rom kernel: one polynomial for |x| < 1, another for 1 <= |x| < 2, and 0 beyond that.
+//   s and user_data are ignored.
+static float stbir__filter_catmullrom(float x, float s, void * user_data)
+{
+  float ax;
+
+  STBIR__UNUSED(user_data);
+  STBIR__UNUSED(s);
+
+  ax = ( x < 0.0f ) ? -x : x;
+
+  // inner lobe
+  if ( ax < 1.0f )
+    return 1.0f - ax*ax*(2.5f - 1.5f*ax);
+
+  // outer lobe
+  if ( ax < 2.0f )
+    return 2.0f - ax*(4.0f + ax*(0.5f*ax - 2.5f));
+
+  return (0.0f);
+}
+
+// Mitchell kernel: one polynomial for |x| < 1, another for 1 <= |x| < 2, and 0 beyond that.
+//   s and user_data are ignored.
+static float stbir__filter_mitchell(float x, float s, void * user_data)
+{
+  float ax;
+
+  STBIR__UNUSED(user_data);
+  STBIR__UNUSED(s);
+
+  ax = ( x < 0.0f ) ? -x : x;
+
+  // inner lobe
+  if ( ax < 1.0f )
+    return (16.0f + ax*ax*(21.0f * ax - 36.0f))/18.0f;
+
+  // outer lobe
+  if ( ax < 2.0f )
+    return (32.0f + ax*(-60.0f + ax*(36.0f - 7.0f*ax)))/18.0f;
+
+  return (0.0f);
+}
+
+// Support callback that always reports a half-width of 0.5; both arguments are ignored.
+static float stbir__support_zeropoint5(float s, void * user_data)
+{
+  STBIR__UNUSED(user_data);
+  STBIR__UNUSED(s);
+
+  return 0.5f;
+}
+
+// Support callback that always reports a half-width of 1; both arguments are ignored.
+static float stbir__support_one(float s, void * user_data)
+{
+  STBIR__UNUSED(user_data);
+  STBIR__UNUSED(s);
+
+  return 1.0f;
+}
+
+// Support callback that always reports a half-width of 2; both arguments are ignored.
+static float stbir__support_two(float s, void * user_data)
+{
+  STBIR__UNUSED(user_data);
+  STBIR__UNUSED(s);
+
+  return 2.0f;
+}
+
+// Maximum number of input samples that can affect one output sample for the given
+//   filter, seen from the output pixel's side.
+//   support   : callback returning the filter half-width (must not be null)
+//   scale     : output size / input size ratio; values at or just under 1 count as upscaling
+//   user_data : passed through to the support callback
+static int stbir__get_filter_pixel_width(stbir__support_callback * support, float scale, void * user_data)
+{
+  float coverage;
+
+  STBIR_ASSERT(support != 0);
+
+  if ( scale >= ( 1.0f-stbir__small_float ) ) // upscale
+    coverage = support(1.0f/scale,user_data) * 2.0f;
+  else                                        // downscale: widen by the shrink factor
+    coverage = support(scale,user_data) * 2.0f / scale;
+
+  return (int)STBIR_CEILF(coverage);
+}
+
+// this is how many coefficients there are per run of the filter (which is different
+//   from the filter_pixel_width depending on if we are scattering or gathering)
+//   is_gather selects the formula: 0, 1 or 2.  Any other value asserts and yields 0.
+static int stbir__get_coefficient_width(stbir__sampler * samp, int is_gather, void * user_data)
+{
+  stbir__support_callback * support = samp->filter_support;
+  float scale = samp->scale_info.scale;
+
+  if ( is_gather == 0 )
+    return (int)STBIR_CEILF(support(scale, user_data) * 2.0f);
+
+  if ( is_gather == 1 )
+    return (int)STBIR_CEILF(support(1.0f / scale, user_data) * 2.0f);
+
+  if ( is_gather == 2 )
+    return (int)STBIR_CEILF(support(scale, user_data) * 2.0f / scale);
+
+  // anything else is a caller error
+  STBIR_ASSERT( (is_gather >= 0 ) && (is_gather <= 2 ) );
+  return 0;
+}
+
+// Number of contributor entries a sampler needs: one per output pixel of the sub-region when
+//   gathering, otherwise one per input pixel plus filter_pixel_margin on each side.
+static int stbir__get_contributors(stbir__sampler * samp, int is_gather)
+{
+  return is_gather ? samp->scale_info.output_sub_size
+                   : (samp->scale_info.input_full_size + samp->filter_pixel_margin * 2);
+}
+
+// Edge handler slot for zero-edge mode: ignores both arguments and returns 0.
+//   The original note marks this as not expected to be reached.
+static int stbir__edge_zero_full( int n, int max )
+{
+  STBIR__UNUSED(max);
+  STBIR__UNUSED(n);
+
+  return 0; // NOTREACHED
+}
+
+// Clamp edge mode: maps any index n onto 0 .. max-1 by pinning it to the nearest end.
+//   An index that is already in range is returned unchanged.
+static int stbir__edge_clamp_full( int n, int max )
+{
+  if ( n >= 0 )
+    return ( n < max ) ? n : ( max - 1 );
+
+  // below the start
+  return 0;
+}
+
+// Reflect edge mode: mirrors an out-of-range index n back into 0 .. max-1.
+//   Below the start:  -n while n > -max, otherwise max-1.
+//   Past the end:     2*max - n - 1 while n < 2*max, otherwise 0.
+//   An index that is already in range is returned unchanged.
+static int stbir__edge_reflect_full( int n, int max )
+{
+  if ( n < 0 )
+    return ( n > -max ) ? -n : ( max - 1 );
+
+  if ( n < max )
+    return n;
+
+  {
+    int twice = max * 2;
+    return ( n >= twice ) ? 0 : ( twice - n - 1 );
+  }
+}
+
+// Wrap edge mode: maps any index n onto 0 .. max-1 by wrapping around (a modulo that also
+//   gives a non-negative result for negative n).
+static int stbir__edge_wrap_full( int n, int max )
+{
+  int rem;
+
+  if ( n >= 0 )
+    return (n % max);
+
+  // negative: take the remainder of the magnitude, then count back from the end
+  rem = (-n) % max;
+  return ( rem != 0 ) ? ( max - rem ) : rem;
+}
+
+// Signature shared by the edge handlers: map index n into a range of size max.
+typedef int stbir__edge_wrap_func( int n, int max );
+// Handler table indexed directly by a stbir_edge value (see stbir__edge_wrap), so the
+//   entry order must stay in step with the enum constants named on each line.
+static stbir__edge_wrap_func * stbir__edge_wrap_slow[] =
+{
+  stbir__edge_clamp_full,    // STBIR_EDGE_CLAMP
+  stbir__edge_reflect_full,  // STBIR_EDGE_REFLECT
+  stbir__edge_wrap_full,     // STBIR_EDGE_WRAP
+  stbir__edge_zero_full,     // STBIR_EDGE_ZERO
+};
+
+// Maps index n into 0 .. max-1 according to the edge mode.  In-range indices are returned
+//   straight away; only out-of-range ones pay for the table dispatch.
+stbir__inline static int stbir__edge_wrap(stbir_edge edge, int n, int max)
+{
+  // avoid per-pixel switch
+  if ( ( n < 0 ) || ( n >= max ) )
+    return stbir__edge_wrap_slow[edge]( n, max );
+
+  return n;
+}
+
+// A margin run is folded into the main run when the gap between them is at most this many pixels.
+#define STBIR__MERGE_RUNS_PIXEL_THRESHOLD 16
+
+// get information on the extents of a sampler
+//   samp             : a gather sampler (asserted) whose contributors are already filled in
+//   scanline_extents : receives edge_sizes[] and spans[]; its conservative range is only
+//                      read here, to assert that the computed range lies inside it
+static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline_extents )
+{
+  int j, stop;
+  int left_margin, right_margin;
+  // all min/max trackers start at sentinels; 0x7fffffff in a min_* means "nothing seen"
+  int min_n = 0x7fffffff, max_n = -0x7fffffff;
+  int min_left = 0x7fffffff, max_left = -0x7fffffff;
+  int min_right = 0x7fffffff, max_right = -0x7fffffff;
+  stbir_edge edge = samp->edge;
+  stbir__contributors* contributors = samp->contributors;
+  int output_sub_size = samp->scale_info.output_sub_size;
+  int input_full_size = samp->scale_info.input_full_size;
+  int filter_pixel_margin = samp->filter_pixel_margin;
+
+  STBIR_ASSERT( samp->is_gather );
+
+  // find the smallest n0, scanning forward from the first output pixel
+  stop = output_sub_size;
+  for (j = 0; j < stop; j++ )
+  {
+    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
+    if ( contributors[j].n0 < min_n )
+    {
+      min_n = contributors[j].n0;
+      stop = j + filter_pixel_margin;  // if we find a new min, only scan another filter width
+      if ( stop > output_sub_size ) stop = output_sub_size;
+    }
+  }
+
+  // find the largest n1, scanning backward from the last output pixel
+  stop = 0;
+  for (j = output_sub_size - 1; j >= stop; j-- )
+  {
+    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
+    if ( contributors[j].n1 > max_n )
+    {
+      max_n = contributors[j].n1;
+      stop = j - filter_pixel_margin;  // if we find a new max, only scan another filter width
+      if (stop<0) stop = 0;
+    }
+  }
+
+  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
+  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );
+
+  // now calculate how much into the margins we really read
+  //   (after this, min_n..max_n is clipped to the real input range 0..input_full_size-1)
+  left_margin = 0;
+  if ( min_n < 0 )
+  {
+    left_margin = -min_n;
+    min_n = 0;
+  }
+
+  right_margin = 0;
+  if ( max_n >= input_full_size )
+  {
+    right_margin = max_n - input_full_size + 1;
+    max_n = input_full_size - 1;
+  }
+
+  // edge_sizes holds the margin pixel extents (how many pixels we hang over each edge)
+  scanline_extents->edge_sizes[0] = left_margin;
+  scanline_extents->edge_sizes[1] = right_margin;
+
+  // spans[0] is the run of pixels read from the input
+  scanline_extents->spans[0].n0 = min_n;
+  scanline_extents->spans[0].n1 = max_n;
+  scanline_extents->spans[0].pixel_offset_for_input = min_n;
+
+  // default to no other input range (n1 < n0 marks spans[1] as empty)
+  scanline_extents->spans[1].n0 = 0;
+  scanline_extents->spans[1].n1 = -1;
+  scanline_extents->spans[1].pixel_offset_for_input = 0;
+
+  // don't have to do edge calc for zero clamp
+  if ( edge == STBIR_EDGE_ZERO )
+    return;
+
+  // convert margin pixels to the pixels within the input (min and max)
+  for( j = -left_margin ; j < 0 ; j++ )
+  {
+      int p = stbir__edge_wrap( edge, j, input_full_size );
+      if ( p < min_left )
+        min_left = p;
+      if ( p > max_left )
+        max_left = p;
+  }
+
+  for( j = input_full_size ; j < (input_full_size + right_margin) ; j++ )
+  {
+      int p = stbir__edge_wrap( edge, j, input_full_size );
+      if ( p < min_right )
+        min_right = p;
+      if ( p > max_right )
+        max_right = p;
+  }
+
+  // merge the left margin pixel region if it connects within STBIR__MERGE_RUNS_PIXEL_THRESHOLD
+  //   pixels of the main pixel region (a merged margin is then treated as having size zero)
+  if ( min_left != 0x7fffffff )
+  {
+    if ( ( ( min_left <= min_n ) && ( ( max_left  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
+         ( ( min_n <= min_left ) && ( ( max_n  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_left ) ) )
+    {
+      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_left );
+      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_left );
+      scanline_extents->spans[0].pixel_offset_for_input = min_n;
+      left_margin = 0;
+    }
+  }
+
+  // merge the right margin pixel region if it connects within STBIR__MERGE_RUNS_PIXEL_THRESHOLD
+  //   pixels of the main pixel region
+  if ( min_right != 0x7fffffff )
+  {
+    if ( ( ( min_right <= min_n ) && ( ( max_right  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
+         ( ( min_n <= min_right ) && ( ( max_n  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_right ) ) )
+    {
+      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_right );
+      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_right );
+      scanline_extents->spans[0].pixel_offset_for_input = min_n;
+      right_margin = 0;
+    }
+  }
+
+  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
+  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );
+
+  // you get two ranges when you have the WRAP edge mode and you are doing just a piece of the resize
+  //   so you need to get a second run of pixels from the opposite side of the scanline (which you
+  //   wouldn't need except for WRAP)
+
+
+  // if we can't merge the min_left range, add it as a second range
+  //   (the two spans are kept ordered by source position: if the new run starts before
+  //   spans[0], the existing run is moved to spans[1] and the new one takes spans[0])
+  if ( ( left_margin ) && ( min_left != 0x7fffffff ) )
+  {
+    stbir__span * newspan = scanline_extents->spans + 1;
+    STBIR_ASSERT( right_margin == 0 );
+    if ( min_left < scanline_extents->spans[0].n0 )
+    {
+      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
+      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
+      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
+      --newspan;
+    }
+    newspan->pixel_offset_for_input = min_left;
+    newspan->n0 = -left_margin;
+    newspan->n1 = ( max_left - min_left ) - left_margin;
+    scanline_extents->edge_sizes[0] = 0;  // don't need to copy the left margin, since we are directly decoding into the margin
+    return;
+  }
+
+  // if we can't merge the min_right range, add it as a second range
+  // NOTE(review): n0/n1 below are based on spans[1].n1, which is the old main run only when the
+  //   swap above happened; without the swap it is still the -1 default - confirm that case cannot occur.
+  if ( ( right_margin ) && ( min_right != 0x7fffffff ) )
+  {
+    stbir__span * newspan = scanline_extents->spans + 1;
+    if ( min_right < scanline_extents->spans[0].n0 )
+    {
+      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
+      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
+      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
+      --newspan;
+    }
+    newspan->pixel_offset_for_input = min_right;
+    newspan->n0 = scanline_extents->spans[1].n1 + 1;
+    newspan->n1 = scanline_extents->spans[1].n1 + 1 + ( max_right - min_right );
+    scanline_extents->edge_sizes[1] = 0;  // don't need to copy the right margin, since we are directly decoding into the margin
+    return;
+  }
+}
+
+// Computes the inclusive range of input pixels whose centres fall inside the footprint of
+//   one output pixel.
+//   first_pixel / last_pixel : receive the range (last is never less than first)
+//   out_pixel_center, out_filter_radius : footprint centre and half-width in output space
+//   inv_scale, out_shift     : map output coordinates to input as (out + out_shift) * inv_scale
+//   input_size, edge         : in WRAP mode the range is limited to -input_size .. 2*input_size-1
+static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel, float out_pixel_center, float out_filter_radius, float inv_scale, float out_shift, int input_size, stbir_edge edge )
+{
+  int lo, hi;
+  float out_lo = out_pixel_center - out_filter_radius;
+  float out_hi = out_pixel_center + out_filter_radius;
+
+  float in_lo = (out_lo + out_shift) * inv_scale;
+  float in_hi = (out_hi + out_shift) * inv_scale;
+
+  lo = (int)(STBIR_FLOORF(in_lo + 0.5f));
+  hi = (int)(STBIR_FLOORF(in_hi - 0.5f));
+
+  // point sample mode can span a value *right* at 0.5, and cause these to cross
+  if ( hi < lo )
+    hi = lo;
+
+  if ( edge == STBIR_EDGE_WRAP )
+  {
+    int wrap_lo = -input_size;
+    int wrap_hi = (input_size*2);
+
+    if ( lo < wrap_lo )
+      lo = wrap_lo;
+    if ( hi >= wrap_hi )
+      hi = wrap_hi - 1;
+  }
+
+  *first_pixel = lo;
+  *last_pixel = hi;
+}
+
+// Fills one contributor range and one row of coefficients per output pixel by evaluating
+//   the kernel at each input pixel inside that output pixel's footprint.
+//   contributors / coefficient_group are advanced in step; each row is coefficient_width floats.
+//   When the scale is rational and its numerator is smaller than num_contributors, only the
+//   first `numerator` rows are computed here.
+static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float* coefficient_group, int coefficient_width, stbir_edge edge, void * user_data )
+{
+  int n, end;
+  float inv_scale = scale_info->inv_scale;
+  float out_shift = scale_info->pixel_shift;
+  int input_size  = scale_info->input_full_size;
+  int numerator = scale_info->scale_numerator;
+  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );
+
+  // Looping through out pixels
+  end = num_contributors; if ( polyphase ) end = numerator;
+  for (n = 0; n < end; n++)
+  {
+    int i;
+    int last_non_zero;
+    float out_pixel_center = (float)n + 0.5f;
+    float in_center_of_out = (out_pixel_center + out_shift) * inv_scale;
+
+    int in_first_pixel, in_last_pixel;
+
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, out_pixel_center, out_filter_radius, inv_scale, out_shift, input_size, edge );
+
+    // make sure we never generate a range larger than our precalculated coeff width
+    //   this only happens in point sample mode, but it's a good safe thing to do anyway
+    if ( ( in_last_pixel - in_first_pixel + 1 ) > coefficient_width )
+      in_last_pixel = in_first_pixel + coefficient_width - 1;
+
+    last_non_zero = -1;
+    for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
+    {
+      float in_pixel_center = (float)(i + in_first_pixel) + 0.5f;
+      float coeff = kernel(in_center_of_out - in_pixel_center, inv_scale, user_data);
+
+      // kill denormals
+      if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
+      {
+        if ( i == 0 )  // if we're at the front, just eat zero contributors
+        {
+          STBIR_ASSERT ( ( in_last_pixel - in_first_pixel ) != 0 ); // there should be at least one contrib
+          // drop the leading pixel: the range start moves up and slot 0 is retried
+          //   (i-- cancels the loop's i++), so slot 0 always ends up holding a non-zero weight
+          ++in_first_pixel;
+          i--;
+          continue;
+        }
+        coeff = 0;  // make sure is fully zero (should keep denormals away)
+      }
+      else
+        last_non_zero = i;
+
+      coefficient_group[i] = coeff;
+    }
+
+    in_last_pixel = last_non_zero+in_first_pixel; // kills trailing zeros
+    contributors->n0 = in_first_pixel;
+    contributors->n1 = in_last_pixel;
+
+    STBIR_ASSERT(contributors->n1 >= contributors->n0);
+
+    ++contributors;
+    coefficient_group += coefficient_width;
+  }
+}
+
+// Adds the weight new_coeff for input pixel new_pixel to one contributor run.
+//   contribs  : the run's pixel range; coeffs[i] is the weight of pixel contribs->n0 + i
+//   coeffs    : the run's weights, with room for max_width entries
+//   new_pixel : pixel to add; it may lie inside, before or after the current range
+//   max_width : capacity of coeffs; if growing the range to reach new_pixel would need more
+//               than this many entries, the weight is dropped and nothing is changed
+// A pixel inside the range has new_coeff accumulated onto its existing weight.  A pixel
+//   outside the range extends it, and any newly covered pixels in between get weight 0.
+static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff, int max_width )
+{
+  if ( new_pixel <= contribs->n1 )  // before the end
+  {
+    if ( new_pixel < contribs->n0 ) // before the front?
+    {
+      if ( ( contribs->n1 - new_pixel + 1 ) <= max_width )
+      {
+        int j, o = contribs->n0 - new_pixel;
+
+        // slide the existing weights up by o slots; walk from the back so that no
+        //   weight is overwritten before it has been moved
+        for ( j = contribs->n1 - contribs->n0 ; j >= 0 ; j-- )
+          coeffs[ j + o ] = coeffs[ j ];
+
+        // pixels strictly between new_pixel and the old front had no weight
+        for ( j = 1 ; j < o ; j++ )
+          coeffs[ j ] = 0;
+
+        coeffs[ 0 ] = new_coeff;
+        contribs->n0 = new_pixel;
+      }
+    }
+    else
+    {
+      // already covered: accumulate
+      coeffs[ new_pixel - contribs->n0 ] += new_coeff;
+    }
+  }
+  else
+  {
+    if ( ( new_pixel - contribs->n0 + 1 ) <= max_width )
+    {
+      int j, e = new_pixel - contribs->n0;
+      for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any
+        coeffs[j] = 0;
+
+      coeffs[ e ] = new_coeff;
+      contribs->n1 = new_pixel;
+    }
+  }
+}
+
+// Computes the inclusive range of output pixels whose centres fall inside the footprint of
+//   one input pixel, clipped to 0 .. out_size-1.
+//   first_pixel / last_pixel : receive the range; first may exceed last when nothing is covered
+//   in_pixel_center, in_pixels_radius : footprint centre and half-width in input space
+//   scale, out_shift : map input coordinates to output as in * scale - out_shift
+static void stbir__calculate_out_pixel_range( int * first_pixel, int * last_pixel, float in_pixel_center, float in_pixels_radius, float scale, float out_shift, int out_size )
+{
+  float in_lo = in_pixel_center - in_pixels_radius;
+  float in_hi = in_pixel_center + in_pixels_radius;
+  float out_lo = in_lo * scale - out_shift;
+  float out_hi = in_hi * scale - out_shift;
+  int lo = (int)(STBIR_FLOORF(out_lo + 0.5f));
+  int hi = (int)(STBIR_FLOORF(out_hi - 0.5f));
+
+  // clip to the valid output range
+  if ( lo < 0 )
+    lo = 0;
+  if ( hi >= out_size )
+    hi = out_size - 1;
+
+  *first_pixel = lo;
+  *last_pixel = hi;
+}
+
+// Builds gather-style contributor ranges and coefficients for a downsample by walking the
+//   input pixels start .. end-1 and appending each one's weight to every output pixel it touches.
+//   Row `out` of coefficient_group (coefficient_width floats) belongs to contributors[out].
+//   Output rows must be first touched in strictly increasing order (asserted below).
+//   num_contributors is unused.
+static void stbir__calculate_coefficients_for_gather_downsample( int start, int end, float in_pixels_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int coefficient_width, int num_contributors, stbir__contributors * contributors, float * coefficient_group, void * user_data )
+{
+  int in_pixel;
+  int i;
+  int first_out_inited = -1;  // highest output index whose row has been initialised so far
+  float scale = scale_info->scale;
+  float out_shift = scale_info->pixel_shift;
+  int out_size = scale_info->output_sub_size;
+  int numerator = scale_info->scale_numerator;
+  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < out_size ) );
+
+  STBIR__UNUSED(num_contributors);
+
+  // Loop through the input pixels
+  for (in_pixel = start; in_pixel < end; in_pixel++)
+  {
+    float in_pixel_center = (float)in_pixel + 0.5f;
+    float out_center_of_in = in_pixel_center * scale - out_shift;
+    int out_first_pixel, out_last_pixel;
+
+    stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, in_pixel_center, in_pixels_radius, scale, out_shift, out_size );
+
+    // this input pixel touches no output pixel
+    if ( out_first_pixel > out_last_pixel )
+      continue;
+
+    // clamp or exit if we are using polyphase filtering, and the limit is up
+    if ( polyphase )
+    {
+      // when polyphase, you only have to do coeffs up to the numerator count
+      if ( out_first_pixel == numerator )
+        break;
+
+      // don't do any extra work, clamp last pixel at numerator too
+      if ( out_last_pixel >= numerator )
+        out_last_pixel = numerator - 1;
+    }
+
+    for (i = 0; i <= out_last_pixel - out_first_pixel; i++)
+    {
+      float out_pixel_center = (float)(i + out_first_pixel) + 0.5f;
+      float x = out_pixel_center - out_center_of_in;
+      float coeff = kernel(x, scale, user_data) * scale;
+
+      // kill the coeff if it's too small (avoid denormals)
+      if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
+        coeff = 0.0f;
+
+      {
+        int out = i + out_first_pixel;
+        float * coeffs = coefficient_group + out * coefficient_width;
+        stbir__contributors * contribs = contributors + out;
+
+        // is this the first time this output pixel has been seen?  Init it.
+        if ( out > first_out_inited )
+        {
+          STBIR_ASSERT( out == ( first_out_inited + 1 ) ); // ensure we have only advanced one at time
+          first_out_inited = out;
+          contribs->n0 = in_pixel;
+          contribs->n1 = in_pixel;
+          coeffs[0]  = coeff;
+        }
+        else
+        {
+          // insert on end (always in order)
+          if ( coeffs[0] == 0.0f )  // if the first coefficient is zero, then zap it for this coeffs
+          {
+            // restart the run at this pixel, so the zero weight in slot 0 gets overwritten below
+            STBIR_ASSERT( ( in_pixel - contribs->n0 ) == 1 ); // ensure that when we zap, we're at the 2nd pos
+            contribs->n0 = in_pixel;
+          }
+          contribs->n1 = in_pixel;
+          STBIR_ASSERT( ( in_pixel - contribs->n0 ) < coefficient_width );
+          coeffs[in_pixel - contribs->n0]  = coeff;
+        }
+      }
+    }
+  }
+}
+
+// Accumulator type used when renormalizing filter weights: double by default, or float when
+//   STBIR_RENORMALIZE_IN_FLOAT is defined.
+#ifdef STBIR_RENORMALIZE_IN_FLOAT
+#define STBIR_RENORM_TYPE float
+#else
+#define STBIR_RENORM_TYPE double
+#endif
+
+static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter_extent_info* filter_info, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float * coefficient_group, int coefficient_width )
+{
+  int input_size = scale_info->input_full_size;
+  int input_last_n1 = input_size - 1;
+  int n, end;
+  int lowest = 0x7fffffff;
+  int highest = -0x7fffffff;
+  int widest = -1;
+  int numerator = scale_info->scale_numerator;
+  int denominator = scale_info->scale_denominator;
+  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );
+  float * coeffs;
+  stbir__contributors * contribs;
+
+  // weight all the coeffs for each sample
+  coeffs = coefficient_group;
+  contribs = contributors;
+  end = num_contributors; if ( polyphase ) end = numerator;
+  for (n = 0; n < end; n++)
+  {
+    int i;
+    STBIR_RENORM_TYPE filter_scale, total_filter = 0;
+    int e;
+
+    // add all contribs
+    e = contribs->n1 - contribs->n0;
+    for( i = 0 ; i <= e ; i++ )
+    {
+      total_filter += (STBIR_RENORM_TYPE) coeffs[i];
+      STBIR_ASSERT( ( coeffs[i] >= -2.0f ) && ( coeffs[i] <= 2.0f )  ); // check for wonky weights
+    }
+
+    // rescale
+    if ( ( total_filter < stbir__small_float ) && ( total_filter > -stbir__small_float ) )
+    {
+      // all coeffs are extremely small, just zero it
+      contribs->n1 = contribs->n0;
+      coeffs[0] = 0.0f;
+    }
+    else
+    {
+      // if the total isn't 1.0, rescale everything
+      if ( ( total_filter < (1.0f-stbir__small_float) ) || ( total_filter > (1.0f+stbir__small_float) ) )
+      {
+        filter_scale = ((STBIR_RENORM_TYPE)1.0) / total_filter;
+
+        // scale them all
+        for (i = 0; i <= e; i++)
+          coeffs[i] = (float) ( coeffs[i] * filter_scale );
+      }
+    }
+    ++contribs;
+    coeffs += coefficient_width;
+  }
+
+  // if we have a rational for the scale, we can exploit the polyphaseness to not calculate
+  //   most of the coefficients, so we copy them here
+  if ( polyphase )
+  {
+    stbir__contributors * prev_contribs = contributors;
+    stbir__contributors * cur_contribs = contributors + numerator;
+
+    for( n = numerator ; n < num_contributors ; n++ )
+    {
+      cur_contribs->n0 = prev_contribs->n0 + denominator;
+      cur_contribs->n1 = prev_contribs->n1 + denominator;
+      ++cur_contribs;
+      ++prev_contribs;
+    }
+    stbir_overlapping_memcpy( coefficient_group + numerator * coefficient_width, coefficient_group, ( num_contributors - numerator ) * coefficient_width * sizeof( coeffs[ 0 ] ) );
+  }
+
+  coeffs = coefficient_group;
+  contribs = contributors;
+
+  for (n = 0; n < num_contributors; n++)
+  {
+    int i;
+
+    // in zero edge mode, just remove out of bounds contribs completely (since their weights are accounted for now)
+    if ( edge == STBIR_EDGE_ZERO )
+    {
+      // shrink the right side if necessary
+      if ( contribs->n1 > input_last_n1 )
+        contribs->n1 = input_last_n1;
+
+      // shrink the left side
+      if ( contribs->n0 < 0 )
+      {
+        int j, left, skips = 0;
+
+        skips = -contribs->n0;
+        contribs->n0 = 0;
+
+        // now move down the weights
+        left = contribs->n1 - contribs->n0 + 1;
+        if ( left > 0 )
+        {
+          for( j = 0 ; j < left ; j++ )
+            coeffs[ j ] = coeffs[ j + skips ];
+        }
+      }
+    }
+    else if ( ( edge == STBIR_EDGE_CLAMP ) || ( edge == STBIR_EDGE_REFLECT ) )
+    {
+      // for clamp and reflect, calculate the true inbounds position (based on edge type) and just add that to the existing weight
+
+      // right hand side first
+      if ( contribs->n1 > input_last_n1 )
+      {
+        int start = contribs->n0;
+        int endi = contribs->n1;
+        contribs->n1 = input_last_n1;
+        for( i = input_size; i <= endi; i++ )
+          stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start], coefficient_width );
+      }
+
+      // now check left hand edge
+      if ( contribs->n0 < 0 )
+      {
+        int save_n0;
+        float save_n0_coeff;
+        float * c = coeffs - ( contribs->n0 + 1 );
+
+        // reinsert the coeffs with it reflected or clamped (insert accumulates, if the coeffs exist)
+        for( i = -1 ; i > contribs->n0 ; i-- )
+          stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c--, coefficient_width );
+        save_n0 = contribs->n0;
+        save_n0_coeff = c[0]; // save it, since we didn't do the final one (i==n0), because there might be too many coeffs to hold (before we resize)!
+
+        // now slide all the coeffs down (since we have accumulated them in the positive contribs) and reset the first contrib
+        contribs->n0 = 0;
+        for(i = 0 ; i <= contribs->n1 ; i++ )
+          coeffs[i] = coeffs[i-save_n0];
+
+        // now that we have shrunk down the contribs, we insert the first one safely
+        stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff, coefficient_width );
+      }
+    }
+
+    if ( contribs->n0 <= contribs->n1 )
+    {
+      int diff = contribs->n1 - contribs->n0 + 1;
+      while ( diff && ( coeffs[ diff-1 ] == 0.0f ) )
+        --diff;
+
+      contribs->n1 = contribs->n0 + diff - 1;
+
+      if ( contribs->n0 <= contribs->n1 )
+      {
+        if ( contribs->n0 < lowest )
+          lowest = contribs->n0;
+        if ( contribs->n1 > highest )
+          highest = contribs->n1;
+        if ( diff > widest )
+          widest = diff;
+      }
+
+      // re-zero out unused coefficients (if any)
+      for( i = diff ; i < coefficient_width ; i++ )
+        coeffs[i] = 0.0f;
+    }
+
+    ++contribs;
+    coeffs += coefficient_width;
+  }
+  filter_info->lowest = lowest;
+  filter_info->highest = highest;
+  filter_info->widest = widest;
+}
+// STBIR_RENORM_TYPE is the accumulator type used by the coefficient renormalization above; undefine it so it does not stay defined past this point
+#undef STBIR_RENORM_TYPE 
+/* Packs the coefficient rows tightly and protects fixed-width reads near the right edge.
+ *
+ * 1) If coefficient_width != widest, every contributor's row is moved (in place) from a
+ *    stride of coefficient_width floats to a stride of widest floats.
+ * 2) One sentinel float is written just past the packed rows.
+ * 3) Walking back from the last contributor, any contributor whose read of stop_range
+ *    samples starting at n0 would pass row_end (row1 + 1) gets its n0 moved back, with
+ *    zero weights inserted in front of its existing coefficients.
+ *
+ * coefficents must have room for widest * num_contributors + 1 floats (see the sentinel store).
+ * row0 is only used in an assert. Returns widest unchanged.
+ */
+static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 ) 
+{
+  #define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; }
+  #define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; }
+  #ifdef STBIR_SIMD
+  #define STBIR_MOVE_4( dest, src ) { stbir__simdf t; STBIR_NO_UNROLL(dest); stbir__simdf_load( t, src ); stbir__simdf_store( dest, t ); }
+  #else
+  #define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; }
+  #endif
+  // the STBIR_MOVE_n helpers above copy n floats as raw 32/64-bit words (or one simd load/store for 4 when STBIR_SIMD is defined)
+  int row_end = row1 + 1;
+  STBIR__UNUSED( row0 ); // only used in an assert
+
+  if ( coefficient_width != widest )
+  {
+    float * pc = coefficents;       // write cursor: advances by widest floats per row
+    float * coeffs = coefficents;   // read cursor: advances by coefficient_width floats per row
+    float * pc_end = coefficents + num_contributors * widest;
+    switch( widest )
+    {
+      case 1:
+        STBIR_NO_UNROLL_LOOP_START
+        do {
+          STBIR_MOVE_1( pc, coeffs );
+          ++pc;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 2:
+        STBIR_NO_UNROLL_LOOP_START
+        do {
+          STBIR_MOVE_2( pc, coeffs );
+          pc += 2;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 3:
+        STBIR_NO_UNROLL_LOOP_START
+        do {
+          STBIR_MOVE_2( pc, coeffs );
+          STBIR_MOVE_1( pc+2, coeffs+2 );
+          pc += 3;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 4:
+        STBIR_NO_UNROLL_LOOP_START
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          pc += 4;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 5:
+        STBIR_NO_UNROLL_LOOP_START
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_1( pc+4, coeffs+4 );
+          pc += 5;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 6:
+        STBIR_NO_UNROLL_LOOP_START
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_2( pc+4, coeffs+4 );
+          pc += 6;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 7:
+        STBIR_NO_UNROLL_LOOP_START
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_2( pc+4, coeffs+4 );
+          STBIR_MOVE_1( pc+6, coeffs+6 );
+          pc += 7;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 8:
+        STBIR_NO_UNROLL_LOOP_START
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          pc += 8;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 9:
+        STBIR_NO_UNROLL_LOOP_START
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          STBIR_MOVE_1( pc+8, coeffs+8 );
+          pc += 9;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 10:
+        STBIR_NO_UNROLL_LOOP_START
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          STBIR_MOVE_2( pc+8, coeffs+8 );
+          pc += 10;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 11:
+        STBIR_NO_UNROLL_LOOP_START
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          STBIR_MOVE_2( pc+8, coeffs+8 );
+          STBIR_MOVE_1( pc+10, coeffs+10 );
+          pc += 11;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 12:
+        STBIR_NO_UNROLL_LOOP_START
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          STBIR_MOVE_4( pc+8, coeffs+8 );
+          pc += 12;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      default:
+        STBIR_NO_UNROLL_LOOP_START
+        do {
+          float * copy_end = pc + widest - 4; // last write position where a whole 4-float move still fits in this row
+          float * c = coeffs;
+          do {
+            STBIR_NO_UNROLL( pc );
+            STBIR_MOVE_4( pc, c );
+            pc += 4;
+            c += 4;
+          } while ( pc <= copy_end );
+          copy_end += 4; // now the real end of this packed row
+          STBIR_NO_UNROLL_LOOP_START
+          while ( pc < copy_end )
+          {
+            STBIR_MOVE_1( pc, c );
+            ++pc; ++c;
+          }
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+    }
+  }
+
+  // some horizontal routines read one float off the end (which is then masked off), so put in a sentinel so we don't read an snan or denormal
+  coefficents[ widest * num_contributors ] = 8888.0f;
+
+  // the minimum we might read for unrolled filters widths is 12. So, we need to
+  //   make sure we never read outside the decode buffer, by possibly moving
+  //   the sample area back into the scanline, and putting zeros weights first.
+  // we start on the right edge and check until we're well past the possible
+  //   clip area (2*widest).
+  {
+    stbir__contributors * contribs = contributors + num_contributors - 1;
+    float * coeffs = coefficents + widest * ( num_contributors - 1 );
+
+    // go until no chance of clipping (this is usually less than 8 loops)
+    while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_end ) )
+    {
+      // might we clip??
+      if ( ( contribs->n0 + widest ) > row_end )
+      {
+        int stop_range = widest;
+
+        // if range is larger than 12, it will be handled by generic loops that can terminate on the exact length
+        //   of this contrib n1, instead of a fixed widest amount - so calculate this
+        if ( widest > 12 )
+        {
+          int mod;
+
+          // how far will be read in the n_coeff loop (which depends on the widest count mod4);
+          mod = widest & 3;
+          stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod;
+
+          // the n_coeff loops do a minimum amount of coeffs, so factor that in!
+          if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
+        }
+
+        // now see if we still clip with the refined range
+        if ( ( contribs->n0 + stop_range ) > row_end )
+        {
+          int new_n0 = row_end - stop_range;
+          int num = contribs->n1 - contribs->n0 + 1;
+          int backup = contribs->n0 - new_n0; // how many zero weights get inserted in front
+          float * from_co = coeffs + num - 1;
+          float * to_co = from_co + backup;
+
+          STBIR_ASSERT( ( new_n0 >= row0 ) && ( new_n0 < contribs->n0 ) );
+
+          // move the coeffs over (copied from the last one backwards, since source and destination overlap)
+          while( num )
+          {
+            *to_co-- = *from_co--;
+            --num;
+          }
+          // zero new positions
+          while ( to_co >= coeffs )
+            *to_co-- = 0;
+          // set new start point
+          contribs->n0 = new_n0;
+          if ( widest > 12 ) // NOTE(review): stop_range is recomputed below but is not read again before it goes out of scope - confirm this is intentional
+          {
+            int mod;
+
+            // how far will be read in the n_coeff loop (which depends on the widest count mod4);
+            mod = widest & 3;
+            stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod;
+
+            // the n_coeff loops do a minimum amount of coeffs, so factor that in!
+            if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
+          }
+        }
+      }
+      --contribs;
+      coeffs -= widest;
+    }
+  }
+
+  return widest;
+  #undef STBIR_MOVE_1
+  #undef STBIR_MOVE_2
+  #undef STBIR_MOVE_4
+}
+/* Builds the contributor ranges and filter coefficients for one sampler.
+ *
+ *   samp->is_gather == 1 : gather upsample - coefficients are generated and cleaned up
+ *                          directly in samp->contributors / samp->coefficients.
+ *   samp->is_gather == 2 : gather downsample - same buffers, using the downsample generator.
+ *   samp->is_gather == 0 : scatter downsample - gather-downsample coefficients are produced
+ *                          in the gather_prescatter buffers (or reused from
+ *                          other_axis_for_pivot when that pointer is non-null) and are then
+ *                          pivoted into samp->contributors / samp->coefficients.
+ *
+ *   other_axis_for_pivot : only consulted for the scatter case; may be null
+ *   user_data            : handed to the support callback and to the coefficient generators
+ */
+static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * other_axis_for_pivot, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO )
+{
+  int n;
+  float scale = samp->scale_info.scale;
+  stbir__kernel_callback * kernel = samp->filter_kernel;
+  stbir__support_callback * support = samp->filter_support;
+  float inv_scale = samp->scale_info.inv_scale;
+  int input_full_size = samp->scale_info.input_full_size;
+  int gather_num_contributors = samp->num_contributors;
+  stbir__contributors* gather_contributors = samp->contributors;
+  float * gather_coeffs = samp->coefficients;
+  int gather_coefficient_width = samp->coefficient_width;
+  // the gather_* locals default to the sampler's own buffers; the scatter case below redirects them
+  switch ( samp->is_gather )
+  {
+    case 1: // gather upsample
+    {
+      float out_pixels_radius = support(inv_scale,user_data) * scale;
+
+      stbir__calculate_coefficients_for_gather_upsample( out_pixels_radius, kernel, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width, samp->edge, user_data );
+
+      STBIR_PROFILE_BUILD_START( cleanup );
+      stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
+      STBIR_PROFILE_BUILD_END( cleanup );
+    }
+    break;
+
+    case 0: // scatter downsample (only on vertical)
+    case 2: // gather downsample
+    {
+      float in_pixels_radius = support(scale,user_data) * inv_scale;
+      int filter_pixel_margin = samp->filter_pixel_margin;
+      int input_end = input_full_size + filter_pixel_margin;
+
+      // if this is a scatter, we do a downsample gather to get the coeffs, and then pivot after
+      if ( !samp->is_gather )
+      {
+        // check if we are using the same gather downsample on the horizontal as this vertical,
+        //   if so, then we don't have to generate them, we can just pivot from the horizontal.
+        if ( other_axis_for_pivot )
+        {
+          gather_contributors = other_axis_for_pivot->contributors;
+          gather_coeffs = other_axis_for_pivot->coefficients;
+          gather_coefficient_width = other_axis_for_pivot->coefficient_width;
+          gather_num_contributors = other_axis_for_pivot->num_contributors;
+          samp->extent_info.lowest = other_axis_for_pivot->extent_info.lowest;
+          samp->extent_info.highest = other_axis_for_pivot->extent_info.highest;
+          samp->extent_info.widest = other_axis_for_pivot->extent_info.widest;
+          goto jump_right_to_pivot; // skips coefficient generation and cleanup entirely
+        }
+
+        gather_contributors = samp->gather_prescatter_contributors;
+        gather_coeffs = samp->gather_prescatter_coefficients;
+        gather_coefficient_width = samp->gather_prescatter_coefficient_width;
+        gather_num_contributors = samp->gather_prescatter_num_contributors;
+      }
+
+      stbir__calculate_coefficients_for_gather_downsample( -filter_pixel_margin, input_end, in_pixels_radius, kernel, &samp->scale_info, gather_coefficient_width, gather_num_contributors, gather_contributors, gather_coeffs, user_data );
+
+      STBIR_PROFILE_BUILD_START( cleanup );
+      stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
+      STBIR_PROFILE_BUILD_END( cleanup );
+
+      if ( !samp->is_gather )
+      {
+        // if this is a scatter (vertical only), then we need to pivot the coeffs
+        stbir__contributors * scatter_contributors;
+        int highest_set;
+
+        jump_right_to_pivot:
+
+        STBIR_PROFILE_BUILD_START( pivot );
+        // highest_set is the largest index k whose scatter contributor has been started; it begins one below the first index (-filter_pixel_margin)
+        highest_set = (-filter_pixel_margin) - 1;
+        for (n = 0; n < gather_num_contributors; n++)
+        {
+          int k;
+          int gn0 = gather_contributors->n0, gn1 = gather_contributors->n1;
+          int scatter_coefficient_width = samp->coefficient_width;
+          float * scatter_coeffs = samp->coefficients + ( gn0 + filter_pixel_margin ) * scatter_coefficient_width;
+          float * g_coeffs = gather_coeffs;
+          scatter_contributors = samp->contributors + ( gn0 + filter_pixel_margin );
+          // the weight gather row n holds for index k becomes the weight scatter row k holds for index n (scatter rows are offset by filter_pixel_margin)
+          for (k = gn0 ; k <= gn1 ; k++ )
+          {
+            float gc = *g_coeffs++;
+            
+            // skip zero and denormals - must skip zeros to avoid adding coeffs beyond scatter_coefficient_width
+            //   (which happens when pivoting from horizontal, which might have dummy zeros)
+            if ( ( ( gc >= stbir__small_float ) || ( gc <= -stbir__small_float ) ) )
+            {
+              if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) )
+              {
+                {
+                  // if we are skipping over several contributors, we need to clear the skipped ones
+                  stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
+                  while ( clear_contributors < scatter_contributors )
+                  {
+                    clear_contributors->n0 = 0;
+                    clear_contributors->n1 = -1;
+                    ++clear_contributors;
+                  }
+                }
+                scatter_contributors->n0 = n; // first weight for this scatter row: start its range at n
+                scatter_contributors->n1 = n;
+                scatter_coeffs[0]  = gc;
+                highest_set = k;
+              }
+              else
+              {
+                stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc, scatter_coefficient_width );
+              }
+              STBIR_ASSERT( ( scatter_contributors->n1 - scatter_contributors->n0 + 1 ) <= scatter_coefficient_width );
+            }
+            ++scatter_contributors;
+            scatter_coeffs += scatter_coefficient_width;
+          }
+
+          ++gather_contributors;
+          gather_coeffs += gather_coefficient_width;
+        }
+
+        // now clear any unset contribs
+        {
+          stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
+          stbir__contributors * end_contributors = samp->contributors + samp->num_contributors;
+          while ( clear_contributors < end_contributors )
+          {
+            clear_contributors->n0 = 0; // n0 > n1 marks an empty contributor
+            clear_contributors->n1 = -1;
+            ++clear_contributors;
+          }
+        }
+
+        STBIR_PROFILE_BUILD_END( pivot );
+      }
+    }
+    break;
+  }
+}
+
+// Each group below sets stbir__decode_* / stbir__encode_* parameter macros, defines STB_IMAGE_RESIZE_DO_CODERS and includes STBIR__HEADER_FILENAME. NOTE(review): the parameters are re-#defined per group without an #undef here, so presumably the included section consumes and undefines them - confirm.
+//========================================================================================================
+// scanline decoders and encoders
+// default group: no suffix and no swizzle macro
+#define stbir__coder_min_num 1
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+// BGRA group: order 2,1,0,3 for both decode and encode
+#define stbir__decode_suffix BGRA
+#define stbir__decode_swizzle
+#define stbir__decode_order0  2
+#define stbir__decode_order1  1
+#define stbir__decode_order2  0
+#define stbir__decode_order3  3
+#define stbir__encode_order0  2
+#define stbir__encode_order1  1
+#define stbir__encode_order2  0
+#define stbir__encode_order3  3
+#define stbir__coder_min_num 4
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+// ARGB group: decode order 1,2,3,0 and encode order 3,0,1,2
+#define stbir__decode_suffix ARGB
+#define stbir__decode_swizzle
+#define stbir__decode_order0  1
+#define stbir__decode_order1  2
+#define stbir__decode_order2  3
+#define stbir__decode_order3  0
+#define stbir__encode_order0  3
+#define stbir__encode_order1  0
+#define stbir__encode_order2  1
+#define stbir__encode_order3  2
+#define stbir__coder_min_num 4
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+// ABGR group: order 3,2,1,0 for both decode and encode
+#define stbir__decode_suffix ABGR
+#define stbir__decode_swizzle
+#define stbir__decode_order0  3
+#define stbir__decode_order1  2
+#define stbir__decode_order2  1
+#define stbir__decode_order3  0
+#define stbir__encode_order0  3
+#define stbir__encode_order1  2
+#define stbir__encode_order2  1
+#define stbir__encode_order3  0
+#define stbir__coder_min_num 4
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+// AR group: order 1,0,3,2 for both decode and encode, with stbir__coder_min_num 2
+#define stbir__decode_suffix AR
+#define stbir__decode_swizzle
+#define stbir__decode_order0  1
+#define stbir__decode_order1  0
+#define stbir__decode_order2  3
+#define stbir__decode_order3  2
+#define stbir__encode_order0  1
+#define stbir__encode_order1  0
+#define stbir__encode_order2  3
+#define stbir__encode_order3  2
+#define stbir__coder_min_num 2
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+
+
+/* fancy alpha means we expand to keep both premultiplied and non-premultiplied color channels
+ *
+ * Expands 4-float pixels (R G B A) in place into 7-float pixels (R G B A Rpm Gpm Bpm, where
+ * Xpm = X * A). The 4-float input is read from the last width_times_channels floats of the
+ * expanded area, while the 7-float output is written from the start of out_buffer.
+ */
+static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) out = out_buffer;
+  float const * end_decode = out_buffer + ( width_times_channels / 4 ) * 7;  // decode buffer aligned to end of out_buffer
+  float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels;
+
+  // fancy alpha is stored internally as R G B A Rpm Gpm Bpm
+
+  #ifdef STBIR_SIMD
+
+  #ifdef STBIR_SIMD8
+  decode += 16; // bias by 4 pixels so the loop test proves 16 more floats are available
+  STBIR_NO_UNROLL_LOOP_START
+  while ( decode <= end_decode )
+  {
+    stbir__simdf8 d0,d1,a0,a1,p0,p1;
+    STBIR_NO_UNROLL(decode);
+    stbir__simdf8_load( d0, decode-16 );
+    stbir__simdf8_load( d1, decode-16+8 );
+    stbir__simdf8_0123to33333333( a0, d0 );
+    stbir__simdf8_0123to33333333( a1, d1 );
+    stbir__simdf8_mult( p0, a0, d0 );
+    stbir__simdf8_mult( p1, a1, d1 );
+    stbir__simdf8_bot4s( a0, d0, p0 );
+    stbir__simdf8_bot4s( a1, d1, p1 );
+    stbir__simdf8_top4s( d0, d0, p0 );
+    stbir__simdf8_top4s( d1, d1, p1 );
+    stbir__simdf8_store ( out, a0 );
+    stbir__simdf8_store ( out+7, d0 ); // stores are 7 floats apart, so each one overlaps the previous store
+    stbir__simdf8_store ( out+14, a1 );
+    stbir__simdf8_store ( out+21, d1 );
+    decode += 16;
+    out += 28;
+  }
+  decode -= 16; // remove the bias
+  #else
+  decode += 8; // bias by 2 pixels so the loop test proves 8 more floats are available
+  STBIR_NO_UNROLL_LOOP_START
+  while ( decode <= end_decode )
+  {
+    stbir__simdf d0,a0,d1,a1,p0,p1;
+    STBIR_NO_UNROLL(decode);
+    stbir__simdf_load( d0, decode-8 );
+    stbir__simdf_load( d1, decode-8+4 );
+    stbir__simdf_0123to3333( a0, d0 );
+    stbir__simdf_0123to3333( a1, d1 );
+    stbir__simdf_mult( p0, a0, d0 );
+    stbir__simdf_mult( p1, a1, d1 );
+    stbir__simdf_store ( out, d0 );
+    stbir__simdf_store ( out+4, p0 );
+    stbir__simdf_store ( out+7, d1 ); // second pixel starts 7 floats in, overlapping the previous store
+    stbir__simdf_store ( out+7+4, p1 );
+    decode += 8;
+    out += 14;
+  }
+  decode -= 8; // remove the bias
+  #endif
+
+  // might be one last odd pixel
+  #ifdef STBIR_SIMD8
+  STBIR_NO_UNROLL_LOOP_START
+  while ( decode < end_decode )
+  #else
+  if ( decode < end_decode )
+  #endif
+  {
+    stbir__simdf d,a,p;
+    STBIR_NO_UNROLL(decode);
+    stbir__simdf_load( d, decode );
+    stbir__simdf_0123to3333( a, d );
+    stbir__simdf_mult( p, a, d );
+    stbir__simdf_store ( out, d );
+    stbir__simdf_store ( out+4, p );
+    decode += 4;
+    out += 7;
+  }
+
+  #else
+  // scalar version: one pixel per iteration
+  while( decode < end_decode )
+  {
+    float r = decode[0], g = decode[1], b = decode[2], alpha = decode[3];
+    out[0] = r;
+    out[1] = g;
+    out[2] = b;
+    out[3] = alpha;
+    out[4] = r * alpha;
+    out[5] = g * alpha;
+    out[6] = b * alpha;
+    out += 7;
+    decode += 4;
+  }
+
+  #endif
+}
+/* Expands 2-float pixels (X A) in place into 3-float pixels (X A X*A).
+ * The 2-float input is read from the last width_times_channels floats of the expanded
+ * area, while the 3-float output is written from the start of out_buffer.
+ */
+static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) out = out_buffer;
+  float const * end_decode = out_buffer + ( width_times_channels / 2 ) * 3;
+  float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels;
+
+  //  for fancy alpha, turns into: [X A Xpm][X A Xpm],etc
+
+  #ifdef STBIR_SIMD
+
+  decode += 8; // bias by 4 pixels so the tests below prove 8 more floats are available
+  if ( decode <= end_decode )
+  {
+    STBIR_NO_UNROLL_LOOP_START
+    do {
+      #ifdef STBIR_SIMD8
+      stbir__simdf8 d0,a0,p0;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdf8_load( d0, decode-8 );
+      stbir__simdf8_0123to11331133( p0, d0 );
+      stbir__simdf8_0123to00220022( a0, d0 );
+      stbir__simdf8_mult( p0, p0, a0 );
+
+      stbir__simdf_store2( out, stbir__if_simdf8_cast_to_simdf4( d0 ) );
+      stbir__simdf_store( out+2, stbir__if_simdf8_cast_to_simdf4( p0 ) );
+      stbir__simdf_store2h( out+3, stbir__if_simdf8_cast_to_simdf4( d0 ) );
+
+      stbir__simdf_store2( out+6, stbir__simdf8_gettop4( d0 ) );
+      stbir__simdf_store( out+8, stbir__simdf8_gettop4( p0 ) );
+      stbir__simdf_store2h( out+9, stbir__simdf8_gettop4( d0 ) );
+      #else
+      stbir__simdf d0,a0,d1,a1,p0,p1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdf_load( d0, decode-8 );
+      stbir__simdf_load( d1, decode-8+4 );
+      stbir__simdf_0123to1133( p0, d0 );
+      stbir__simdf_0123to1133( p1, d1 );
+      stbir__simdf_0123to0022( a0, d0 );
+      stbir__simdf_0123to0022( a1, d1 );
+      stbir__simdf_mult( p0, p0, a0 );
+      stbir__simdf_mult( p1, p1, a1 );
+      // the three stores for each pixel pair overlap, so their order matters
+      stbir__simdf_store2( out, d0 );
+      stbir__simdf_store( out+2, p0 );
+      stbir__simdf_store2h( out+3, d0 );
+
+      stbir__simdf_store2( out+6, d1 );
+      stbir__simdf_store( out+8, p1 );
+      stbir__simdf_store2h( out+9, d1 );
+      #endif
+      decode += 8;
+      out += 12;
+    } while ( decode <= end_decode );
+  }
+  decode -= 8; // remove the bias
+  #endif
+  // scalar loop: does all the work when STBIR_SIMD is not defined, otherwise the pixels left after the SIMD loop
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  while( decode < end_decode )
+  {
+    float x = decode[0], y = decode[1];
+    STBIR_SIMD_NO_UNROLL(decode);
+    out[0] = x;
+    out[1] = y;
+    out[2] = x * y;
+    out += 3;
+    decode += 2;
+  }
+}
+/* Converts fancy-alpha pixels (7 floats: R G B A Rpm Gpm Bpm) back to 4 floats per pixel,
+ * in place, writing from the start of encode_buffer.
+ * When alpha is below stbir__small_float the stored R G B are kept; otherwise the output
+ * color is the premultiplied color multiplied by 1/alpha. Alpha is passed through.
+ * width_times_channels counts output floats. The loop is a do/while, so one pixel is
+ * always processed.
+ */
+static void stbir__fancy_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
+{
+  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
+  float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
+  float const * end_output = encode_buffer + width_times_channels;
+
+  // fancy RGBA is stored internally as R G B A Rpm Gpm Bpm
+
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do {
+    float alpha = input[3];
+#ifdef STBIR_SIMD
+    stbir__simdf i,ia;
+    STBIR_SIMD_NO_UNROLL(encode);
+    if ( alpha < stbir__small_float )
+    {
+      stbir__simdf_load( i, input );
+      stbir__simdf_store( encode, i );
+    }
+    else
+    {
+      stbir__simdf_load1frep4( ia, 1.0f / alpha );
+      stbir__simdf_load( i, input+4 );
+      stbir__simdf_mult( i, i, ia );
+      stbir__simdf_store( encode, i );
+      encode[3] = alpha; // assuming a 4-float store: its last lane came from input[7], so replace it with alpha
+    }
+#else
+    if ( alpha < stbir__small_float )
+    {
+      encode[0] = input[0];
+      encode[1] = input[1];
+      encode[2] = input[2];
+    }
+    else
+    {
+      float ialpha = 1.0f / alpha;
+      encode[0] = input[4] * ialpha;
+      encode[1] = input[5] * ialpha;
+      encode[2] = input[6] * ialpha;
+    }
+    encode[3] = alpha;
+#endif
+    // input advances 7 floats per pixel and encode only 4, so encode never passes input
+    input += 7;
+    encode += 4;
+  } while ( encode < end_output );
+}
+
+// Undoes the fancy-alpha expansion for 2-channel data, in place.
+//   in : [X A Xpm] triplets    out : [X A] pairs written from the start of the same buffer
+// When A is at least stbir__small_float, X is recovered as Xpm / A; otherwise the stored X is kept.
+static void stbir__fancy_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
+{
+  float STBIR_SIMD_STREAMOUT_PTR(*) dst = encode_buffer;
+  float STBIR_SIMD_STREAMOUT_PTR(*) src = encode_buffer;
+  float const * dst_end = encode_buffer + width_times_channels;
+
+  // at least one pixel is always converted, exactly like a do/while
+  for(;;)
+  {
+    float const a = src[1];
+    float const x = ( a >= stbir__small_float ) ? ( src[2] / a ) : src[0];
+
+    // src is never behind dst, so both values were read before these stores can touch them
+    dst[0] = x;
+    dst[1] = a;
+
+    src += 3;
+    dst += 2;
+    if ( dst >= dst_end )
+      break;
+  }
+}
+/* Premultiplies 4-float pixels in place: the first three floats of every pixel are
+ * multiplied by the fourth (alpha), which is itself left unchanged (see the scalar path;
+ * the SIMD path presumably does the same through the aaa1 shuffle with ones - confirm).
+ */
+static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
+  float const * end_decode = decode_buffer + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  {
+    decode += 2 * stbir__simdfX_float_count; // bias by two vectors so the loop test proves that many floats remain
+    STBIR_NO_UNROLL_LOOP_START
+    while ( decode <= end_decode )
+    {
+      stbir__simdfX d0,a0,d1,a1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
+      stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
+      stbir__simdfX_aaa1( a0, d0, STBIR_onesX );
+      stbir__simdfX_aaa1( a1, d1, STBIR_onesX );
+      stbir__simdfX_mult( d0, d0, a0 );
+      stbir__simdfX_mult( d1, d1, a1 );
+      stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
+      stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
+      decode += 2 * stbir__simdfX_float_count;
+    }
+    decode -= 2 * stbir__simdfX_float_count; // remove the bias
+
+    // few last pixels remnants
+    #ifdef STBIR_SIMD8
+    STBIR_NO_UNROLL_LOOP_START
+    while ( decode < end_decode )
+    #else
+    if ( decode < end_decode )
+    #endif
+    {
+      stbir__simdf d,a;
+      stbir__simdf_load( d, decode );
+      stbir__simdf_aaa1( a, d, STBIR__CONSTF(STBIR_ones) );
+      stbir__simdf_mult( d, d, a );
+      stbir__simdf_store ( decode, d );
+      decode += 4;
+    }
+  }
+
+  #else
+  // scalar version: one pixel per iteration
+  while( decode < end_decode )
+  {
+    float alpha = decode[3];
+    decode[0] *= alpha;
+    decode[1] *= alpha;
+    decode[2] *= alpha;
+    decode += 4;
+  }
+
+  #endif
+}
+/* Premultiplies 2-float pixels in place: the first float of every pixel is multiplied by
+ * the second (alpha), which is itself left unchanged (see the scalar loop; the SIMD loop
+ * presumably does the same through the a1a1 shuffle with ones - confirm).
+ */
+static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
+  float const * end_decode = decode_buffer + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  decode += 2 * stbir__simdfX_float_count; // bias by two vectors so the loop test proves that many floats remain
+  STBIR_NO_UNROLL_LOOP_START
+  while ( decode <= end_decode )
+  {
+    stbir__simdfX d0,a0,d1,a1;
+    STBIR_NO_UNROLL(decode);
+    stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
+    stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
+    stbir__simdfX_a1a1( a0, d0, STBIR_onesX );
+    stbir__simdfX_a1a1( a1, d1, STBIR_onesX );
+    stbir__simdfX_mult( d0, d0, a0 );
+    stbir__simdfX_mult( d1, d1, a1 );
+    stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
+    stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
+    decode += 2 * stbir__simdfX_float_count;
+  }
+  decode -= 2 * stbir__simdfX_float_count; // remove the bias
+  #endif
+  // scalar loop: does all the work when STBIR_SIMD is not defined, otherwise the pixels left after the SIMD loop
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  while( decode < end_decode )
+  {
+    float alpha = decode[1];
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0] *= alpha;
+    decode += 2;
+  }
+}
+/* Un-premultiplies 4-float pixels in place: when alpha (the fourth float) is at least
+ * stbir__small_float, the first three floats are multiplied by 1/alpha; pixels with a
+ * smaller alpha are left untouched. The loop is a do/while, so one pixel is always processed.
+ */
+static void stbir__simple_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
+{
+  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
+  float const * end_output = encode_buffer + width_times_channels;
+
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do {
+    float alpha = encode[3];
+
+#ifdef STBIR_SIMD
+    stbir__simdf i,ia;
+    STBIR_SIMD_NO_UNROLL(encode);
+    if ( alpha >= stbir__small_float )
+    {
+      stbir__simdf_load1frep4( ia, 1.0f / alpha );
+      stbir__simdf_load( i, encode );
+      stbir__simdf_mult( i, i, ia );
+      stbir__simdf_store( encode, i );
+      encode[3] = alpha; // assuming a 4-float multiply/store: the alpha lane was scaled too, so put the original back
+    }
+#else
+    if ( alpha >= stbir__small_float )
+    {
+      float ialpha = 1.0f / alpha;
+      encode[0] *= ialpha;
+      encode[1] *= ialpha;
+      encode[2] *= ialpha;
+    }
+#endif
+    encode += 4;
+  } while ( encode < end_output );
+}
+
+// Un-premultiplies 2-channel [X A] pixels in place: X becomes X / A whenever A is at
+//   least stbir__small_float; pixels with a smaller A are left as they are.
+static void stbir__simple_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
+{
+  float STBIR_SIMD_STREAMOUT_PTR(*) px = encode_buffer;
+  float const * stop = encode_buffer + width_times_channels;
+
+  // the first pixel is handled before the end test, as with a do/while
+  for(;;)
+  {
+    float const a = px[1];
+    if ( a >= stbir__small_float )
+      px[0] = px[0] / a;
+
+    px += 2;
+    if ( px >= stop )
+      break;
+  }
+}
+
+
+/* Swaps the first and third float of every 3-float pixel in place.
+ * only used in RGB->BGR or BGR->RGB
+ *
+ * Each variant below handles several pixels per iteration (8 in the SIMD path without
+ * stbir__simdf_swiz2, 4 in the other two); the final scalar loop finishes whatever is left,
+ * one pixel at a time.
+ */
+static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
+  float const * end_decode = decode_buffer + width_times_channels;
+
+#ifdef STBIR_SIMD
+    #ifdef stbir__simdf_swiz2 // do we have two argument swizzles?
+      end_decode -= 12; // pull the end in so the loop test proves 12 floats (4 pixels) remain
+      STBIR_NO_UNROLL_LOOP_START
+      while( decode <= end_decode )
+      {
+        // on arm64 8 instructions, no overlapping stores
+        stbir__simdf a,b,c,na,nb;
+        STBIR_SIMD_NO_UNROLL(decode);
+        stbir__simdf_load( a, decode );
+        stbir__simdf_load( b, decode+4 );
+        stbir__simdf_load( c, decode+8 );
+        // b is overwritten by the second swizzle, and the last two swizzles read that new b, so the order matters
+        na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 );   
+        b  = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 );   
+        nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 );   
+        c  = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 );   
+
+        stbir__simdf_store( decode, na );
+        stbir__simdf_store( decode+4, nb ); 
+        stbir__simdf_store( decode+8, c );
+        decode += 12;
+      }
+      end_decode += 12; // restore the real end for the scalar loop
+    #else
+      end_decode -= 24; // pull the end in so the loop test proves 24 floats (8 pixels) remain
+      STBIR_NO_UNROLL_LOOP_START
+      while( decode <= end_decode )
+      {
+        // 26 instructions on x64
+        stbir__simdf a,b,c,d,e,f,g;
+        float i21, i23;
+        STBIR_SIMD_NO_UNROLL(decode);
+        stbir__simdf_load( a, decode );
+        stbir__simdf_load( b, decode+3 );
+        stbir__simdf_load( c, decode+6 );
+        stbir__simdf_load( d, decode+9 );
+        stbir__simdf_load( e, decode+12 );
+        stbir__simdf_load( f, decode+15 );
+        stbir__simdf_load( g, decode+18 );
+        // one vector per pixel for the first seven pixels; the swizzle presumably reverses lanes 0..2 and keeps lane 3 - confirm
+        a = stbir__simdf_swiz( a, 2, 1, 0, 3 );   
+        b = stbir__simdf_swiz( b, 2, 1, 0, 3 );   
+        c = stbir__simdf_swiz( c, 2, 1, 0, 3 );   
+        d = stbir__simdf_swiz( d, 2, 1, 0, 3 );   
+        e = stbir__simdf_swiz( e, 2, 1, 0, 3 );   
+        f = stbir__simdf_swiz( f, 2, 1, 0, 3 );   
+        g = stbir__simdf_swiz( g, 2, 1, 0, 3 );   
+
+        // stores overlap, need to be in order, 
+        stbir__simdf_store( decode,    a );
+        i21 = decode[21]; // the eighth pixel (floats 21..23) is swapped by hand with scalars
+        stbir__simdf_store( decode+3,  b ); 
+        i23 = decode[23];
+        stbir__simdf_store( decode+6,  c );
+        stbir__simdf_store( decode+9,  d );
+        stbir__simdf_store( decode+12, e );
+        stbir__simdf_store( decode+15, f );
+        stbir__simdf_store( decode+18, g );
+        decode[21] = i23;
+        decode[23] = i21;
+        decode += 24;
+      }
+      end_decode += 24; // restore the real end for the scalar loop
+    #endif
+#else
+  end_decode -= 12; // pull the end in so the loop test proves 12 floats (4 pixels) remain
+  STBIR_NO_UNROLL_LOOP_START
+  while( decode <= end_decode )
+  {
+    // 16 instructions
+    float t0,t1,t2,t3;
+    STBIR_NO_UNROLL(decode);
+    t0 = decode[0]; t1 = decode[3]; t2 = decode[6]; t3 = decode[9];
+    decode[0] = decode[2]; decode[3] = decode[5]; decode[6] = decode[8]; decode[9] = decode[11];
+    decode[2] = t0; decode[5] = t1; decode[8] = t2; decode[11] = t3;
+    decode += 12;
+  }
+  end_decode += 12; // restore the real end for the scalar loop
+#endif
+  // leftover pixels, one at a time
+  STBIR_NO_UNROLL_LOOP_START
+  while( decode < end_decode )
+  {
+    float t = decode[0];
+    STBIR_NO_UNROLL(decode);
+    decode[0] = decode[2];
+    decode[2] = t;
+    decode += 3;
+  }
+}
+
+// Decodes input row n (after vertical edge wrapping) into floats at output_buffer, one span at a time (up to
+// two spans), optionally applies alpha_weight, then fills horizontal STBIR_EDGE_WRAP margins with memcpy.
+static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float * output_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
+{
+  int channels = stbir_info->channels;
+  int effective_channels = stbir_info->effective_channels;
+  int input_sample_in_bytes = stbir__type_size[stbir_info->input_type] * channels; // bytes per whole input pixel
+  stbir_edge edge_horizontal = stbir_info->horizontal.edge;
+  stbir_edge edge_vertical = stbir_info->vertical.edge;
+  int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size);
+  const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes;
+  stbir__span const * spans = stbir_info->scanline_extents.spans;
+  float* full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
+  // full_decode_buffer is biased so that pixel x lives at full_decode_buffer + x*effective_channels (output_buffer is pixel conservative.n0)
+  // if we are on edge_zero, and we get in here with an out of bounds n, then the calculate filters has failed
+  STBIR_ASSERT( !(edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)) );
+
+  do
+  {
+    float * decode_buffer;
+    void const * input_data;
+    float * end_decode;
+    int width_times_channels;
+    int width;
+
+    if ( spans->n1 < spans->n0 ) // empty span: nothing (more) to decode
+      break;
+
+    width = spans->n1 + 1 - spans->n0;
+    decode_buffer = full_decode_buffer + spans->n0 * effective_channels;
+    end_decode = full_decode_buffer + ( spans->n1 + 1 ) * effective_channels;
+    width_times_channels = width * channels;
+    // note: width_times_channels counts input channels, while decode_buffer/end_decode are laid out in effective_channels
+    // read directly out of input plane by default
+    input_data = ( (char*)input_plane_data ) + spans->pixel_offset_for_input * input_sample_in_bytes;
+
+    // if we have an input callback, call it to get the input data
+    if ( stbir_info->in_pixels_cb )
+    {
+      // call the callback with a temp buffer (that they can choose to use or not).  the temp is just right aligned memory in the decode_buffer itself
+      input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data );
+    }
+
+    STBIR_PROFILE_START( decode );
+    // convert the pixels into the float decode_buffer, (we index from end_decode, so that when channels<effective_channels, we are right justified in the buffer)
+    stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data );
+    STBIR_PROFILE_END( decode );
+    // alpha_weight is handed decode_buffer (the span start), not the right-justified pointer used for decoding above
+    if (stbir_info->alpha_weight)
+    {
+      STBIR_PROFILE_START( alpha );
+      stbir_info->alpha_weight( decode_buffer, width_times_channels );
+      STBIR_PROFILE_END( alpha );
+    }
+
+    ++spans;
+  } while ( spans <= ( &stbir_info->scanline_extents.spans[1] ) ); // at most two spans: spans[0] and spans[1]
+
+  // handle the edge_wrap filter (all other types are handled back out at the calculate_filter stage)
+  // basically the idea here is that if we have the whole scanline in memory, we don't redecode the
+  //   wrapped edge pixels, and instead just memcpy them from the scanline into the edge positions
+  if ( ( edge_horizontal == STBIR_EDGE_WRAP ) && ( stbir_info->scanline_extents.edge_sizes[0] | stbir_info->scanline_extents.edge_sizes[1] ) )
+  {
+    // this code only runs if we're in edge_wrap, and we're doing the entire scanline
+    int e, start_x[2];
+    int input_full_size = stbir_info->horizontal.scale_info.input_full_size;
+
+    start_x[0] = -stbir_info->scanline_extents.edge_sizes[0];  // left edge start x
+    start_x[1] =  input_full_size;                             // right edge
+
+    for( e = 0; e < 2 ; e++ )
+    {
+      // do each margin
+      int margin = stbir_info->scanline_extents.edge_sizes[e];
+      if ( margin )
+      {
+        int x = start_x[e];
+        float * marg = full_decode_buffer + x * effective_channels; // destination: first pixel of this margin
+        float const * src = full_decode_buffer + stbir__edge_wrap(edge_horizontal, x, input_full_size) * effective_channels;
+        STBIR_MEMCPY( marg, src, margin * effective_channels * sizeof(float) );
+      }
+    }
+  }
+}
+
+
+//=================
+// Do 1 channel horizontal routines
+// NOTE(review): hc, decode, output, horizontal_coefficients, coefficient_width and horizontal_contributors are not defined here; presumably locals of the code generated by the #include at the end of this section - confirm
+#ifdef STBIR_SIMD
+// SIMD flavour: one stbir__simdf accumulator 'tot'; the 0123to* shuffles + adds in stbir__store_output appear to be the cross-lane sum
+#define stbir__1_coeff_only()          \
+    stbir__simdf tot,c;                \
+    STBIR_SIMD_NO_UNROLL(decode);      \
+    stbir__simdf_load1( c, hc );       \
+    stbir__simdf_mult1_mem( tot, c, decode );
+
+#define stbir__2_coeff_only()          \
+    stbir__simdf tot,c,d;              \
+    STBIR_SIMD_NO_UNROLL(decode);      \
+    stbir__simdf_load2z( c, hc );      \
+    stbir__simdf_load2( d, decode );   \
+    stbir__simdf_mult( tot, c, d );    \
+    stbir__simdf_0123to1230( c, tot ); \
+    stbir__simdf_add1( tot, tot, c );
+
+#define stbir__3_coeff_only()                  \
+    stbir__simdf tot,c,t;                      \
+    STBIR_SIMD_NO_UNROLL(decode);              \
+    stbir__simdf_load( c, hc );                \
+    stbir__simdf_mult_mem( tot, c, decode );   \
+    stbir__simdf_0123to1230( c, tot );         \
+    stbir__simdf_0123to2301( t, tot );         \
+    stbir__simdf_add1( tot, tot, c );          \
+    stbir__simdf_add1( tot, tot, t );
+
+#define stbir__store_output_tiny()                \
+    stbir__simdf_store1( output, tot );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 1;
+// 4+ tap path: _start does taps 0-3, _continue_from_4 adds 4 more at (ofs), _N_coeff_remnant adds the last N. _start ends in a trailing backslash, so the blank line after it must stay
+#define stbir__4_coeff_start()                 \
+    stbir__simdf tot,c;                        \
+    STBIR_SIMD_NO_UNROLL(decode);              \
+    stbir__simdf_load( c, hc );                \
+    stbir__simdf_mult_mem( tot, c, decode );   \
+
+#define stbir__4_coeff_continue_from_4( ofs )  \
+    STBIR_SIMD_NO_UNROLL(decode);              \
+    stbir__simdf_load( c, hc + (ofs) );        \
+    stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );
+
+#define stbir__1_coeff_remnant( ofs )          \
+    { stbir__simdf d;                          \
+    stbir__simdf_load1z( c, hc + (ofs) );      \
+    stbir__simdf_load1( d, decode + (ofs) );   \
+    stbir__simdf_madd( tot, tot, d, c ); }
+
+#define stbir__2_coeff_remnant( ofs )          \
+    { stbir__simdf d;                          \
+    stbir__simdf_load2z( c, hc+(ofs) );        \
+    stbir__simdf_load2( d, decode+(ofs) );     \
+    stbir__simdf_madd( tot, tot, d, c ); }
+// NOTE(review): the mask loaded from STBIR_mask + 3 is ANDed onto the coefficients in stbir__3_coeff_remnant, presumably to zero the 4th lane of a full-width load - confirm
+#define stbir__3_coeff_setup()                 \
+    stbir__simdf mask;                         \
+    stbir__simdf_load( mask, STBIR_mask + 3 );
+
+#define stbir__3_coeff_remnant( ofs )                  \
+    stbir__simdf_load( c, hc+(ofs) );                  \
+    stbir__simdf_and( c, c, mask );                    \
+    stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );
+
+#define stbir__store_output()                     \
+    stbir__simdf_0123to2301( c, tot );            \
+    stbir__simdf_add( tot, tot, c );              \
+    stbir__simdf_0123to1230( c, tot );            \
+    stbir__simdf_add1( tot, tot, c );             \
+    stbir__simdf_store1( output, tot );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 1;
+
+#else
+// scalar flavour: the 4+ tap path keeps four independent accumulators tot0..tot3, combined as (tot0+tot2)+(tot1+tot3)
+#define stbir__1_coeff_only()  \
+    float tot;                 \
+    tot = decode[0]*hc[0];
+
+#define stbir__2_coeff_only()  \
+    float tot;                 \
+    tot = decode[0] * hc[0];   \
+    tot += decode[1] * hc[1];
+
+#define stbir__3_coeff_only()  \
+    float tot;                 \
+    tot = decode[0] * hc[0];   \
+    tot += decode[1] * hc[1];  \
+    tot += decode[2] * hc[2];
+
+#define stbir__store_output_tiny()                \
+    output[0] = tot;                              \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 1;
+
+#define stbir__4_coeff_start()  \
+    float tot0,tot1,tot2,tot3;  \
+    tot0 = decode[0] * hc[0];   \
+    tot1 = decode[1] * hc[1];   \
+    tot2 = decode[2] * hc[2];   \
+    tot3 = decode[3] * hc[3];
+
+#define stbir__4_coeff_continue_from_4( ofs )  \
+    tot0 += decode[0+(ofs)] * hc[0+(ofs)];     \
+    tot1 += decode[1+(ofs)] * hc[1+(ofs)];     \
+    tot2 += decode[2+(ofs)] * hc[2+(ofs)];     \
+    tot3 += decode[3+(ofs)] * hc[3+(ofs)];
+
+#define stbir__1_coeff_remnant( ofs )        \
+    tot0 += decode[0+(ofs)] * hc[0+(ofs)];
+// note: stbir__2_coeff_remnant below ends in a trailing backslash, so the blank line after it must stay
+#define stbir__2_coeff_remnant( ofs )        \
+    tot0 += decode[0+(ofs)] * hc[0+(ofs)];   \
+    tot1 += decode[1+(ofs)] * hc[1+(ofs)];   \
+
+#define stbir__3_coeff_remnant( ofs )        \
+    tot0 += decode[0+(ofs)] * hc[0+(ofs)];   \
+    tot1 += decode[1+(ofs)] * hc[1+(ofs)];   \
+    tot2 += decode[2+(ofs)] * hc[2+(ofs)];
+
+#define stbir__store_output()                     \
+    output[0] = (tot0+tot2)+(tot1+tot3);          \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 1;
+
+#endif
+// select the 1-channel configuration and re-include the header in STB_IMAGE_RESIZE_DO_HORIZONTALS mode (presumably what expands the macros above)
+#define STBIR__horizontal_channels 1
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS
+#include STBIR__HEADER_FILENAME
+
+
+//=================
+// Do 2 channel horizontal routines
+// decode holds interleaved 2-float pixels: tap k multiplies decode[2*k] and decode[2*k+1] by the same coefficient hc[k] (see the scalar macros below)
+#ifdef STBIR_SIMD
+
+#define stbir__1_coeff_only()         \
+    stbir__simdf tot,c,d;             \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    stbir__simdf_load1z( c, hc );     \
+    stbir__simdf_0123to0011( c, c );  \
+    stbir__simdf_load2( d, decode );  \
+    stbir__simdf_mult( tot, d, c );
+
+#define stbir__2_coeff_only()         \
+    stbir__simdf tot,c;               \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    stbir__simdf_load2( c, hc );      \
+    stbir__simdf_0123to0011( c, c );  \
+    stbir__simdf_mult_mem( tot, c, decode );
+
+#define stbir__3_coeff_only()                \
+    stbir__simdf tot,c,cs,d;                 \
+    STBIR_SIMD_NO_UNROLL(decode);            \
+    stbir__simdf_load( cs, hc );             \
+    stbir__simdf_0123to0011( c, cs );        \
+    stbir__simdf_mult_mem( tot, c, decode ); \
+    stbir__simdf_0123to2222( c, cs );        \
+    stbir__simdf_load2z( d, decode+4 );      \
+    stbir__simdf_madd( tot, tot, d, c );
+
+#define stbir__store_output_tiny()                \
+    stbir__simdf_0123to2301( c, tot );            \
+    stbir__simdf_add( tot, tot, c );              \
+    stbir__simdf_store2( output, tot );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2;
+// 8-wide variant of the 4+ tap path: a single stbir__simdf8 accumulator (tot0)
+#ifdef STBIR_SIMD8
+
+#define stbir__4_coeff_start()                    \
+    stbir__simdf8 tot0,c,cs;                      \
+    STBIR_SIMD_NO_UNROLL(decode);                 \
+    stbir__simdf8_load4b( cs, hc );               \
+    stbir__simdf8_0123to00112233( c, cs );        \
+    stbir__simdf8_mult_mem( tot0, c, decode );
+
+#define stbir__4_coeff_continue_from_4( ofs )        \
+    STBIR_SIMD_NO_UNROLL(decode);                    \
+    stbir__simdf8_load4b( cs, hc + (ofs) );          \
+    stbir__simdf8_0123to00112233( c, cs );           \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
+
+#define stbir__1_coeff_remnant( ofs )                \
+    { stbir__simdf t,d;                              \
+    stbir__simdf_load1z( t, hc + (ofs) );            \
+    stbir__simdf_load2( d, decode + (ofs) * 2 );     \
+    stbir__simdf_0123to0011( t, t );                 \
+    stbir__simdf_mult( t, t, d );                    \
+    stbir__simdf8_add4( tot0, tot0, t ); }
+ 
+#define stbir__2_coeff_remnant( ofs )                \
+    { stbir__simdf t;                                \
+    stbir__simdf_load2( t, hc + (ofs) );             \
+    stbir__simdf_0123to0011( t, t );                 \
+    stbir__simdf_mult_mem( t, t, decode+(ofs)*2 );   \
+    stbir__simdf8_add4( tot0, tot0, t ); }
+
+#define stbir__3_coeff_remnant( ofs )                \
+    { stbir__simdf8 d;                               \
+    stbir__simdf8_load4b( cs, hc + (ofs) );          \
+    stbir__simdf8_0123to00112233( c, cs );           \
+    stbir__simdf8_load6z( d, decode+(ofs)*2 );       \
+    stbir__simdf8_madd( tot0, tot0, c, d ); }
+
+#define stbir__store_output()                     \
+    { stbir__simdf t,d;                           \
+    stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 );    \
+    stbir__simdf_0123to2301( d, t );              \
+    stbir__simdf_add( t, t, d );                  \
+    stbir__simdf_store2( output, t );             \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2; }
+
+#else
+// 4-wide variant of the 4+ tap path: two stbir__simdf accumulators (tot0, tot1), added together in stbir__store_output
+#define stbir__4_coeff_start()                   \
+    stbir__simdf tot0,tot1,c,cs;                 \
+    STBIR_SIMD_NO_UNROLL(decode);                \
+    stbir__simdf_load( cs, hc );                 \
+    stbir__simdf_0123to0011( c, cs );            \
+    stbir__simdf_mult_mem( tot0, c, decode );    \
+    stbir__simdf_0123to2233( c, cs );            \
+    stbir__simdf_mult_mem( tot1, c, decode+4 );
+
+#define stbir__4_coeff_continue_from_4( ofs )                \
+    STBIR_SIMD_NO_UNROLL(decode);                            \
+    stbir__simdf_load( cs, hc + (ofs) );                     \
+    stbir__simdf_0123to0011( c, cs );                        \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );  \
+    stbir__simdf_0123to2233( c, cs );                        \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*2+4 );
+
+#define stbir__1_coeff_remnant( ofs )            \
+    { stbir__simdf d;                            \
+    stbir__simdf_load1z( cs, hc + (ofs) );       \
+    stbir__simdf_0123to0011( c, cs );            \
+    stbir__simdf_load2( d, decode + (ofs) * 2 ); \
+    stbir__simdf_madd( tot0, tot0, d, c ); }
+
+#define stbir__2_coeff_remnant( ofs )                      \
+    stbir__simdf_load2( cs, hc + (ofs) );                  \
+    stbir__simdf_0123to0011( c, cs );                      \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
+
+#define stbir__3_coeff_remnant( ofs )                       \
+    { stbir__simdf d;                                       \
+    stbir__simdf_load( cs, hc + (ofs) );                    \
+    stbir__simdf_0123to0011( c, cs );                       \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \
+    stbir__simdf_0123to2222( c, cs );                       \
+    stbir__simdf_load2z( d, decode + (ofs) * 2 + 4 );       \
+    stbir__simdf_madd( tot1, tot1, d, c ); }
+
+#define stbir__store_output()                     \
+    stbir__simdf_add( tot0, tot0, tot1 );         \
+    stbir__simdf_0123to2301( c, tot0 );           \
+    stbir__simdf_add( tot0, tot0, c );            \
+    stbir__simdf_store2( output, tot0 );          \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2;
+
+#endif
+
+#else
+// scalar variant: tota* accumulate the first channel and totb* the second; the 4+ tap path keeps four accumulators per channel
+#define stbir__1_coeff_only()  \
+    float tota,totb,c;         \
+    c = hc[0];                 \
+    tota = decode[0]*c;        \
+    totb = decode[1]*c;
+
+#define stbir__2_coeff_only()  \
+    float tota,totb,c;         \
+    c = hc[0];                 \
+    tota = decode[0]*c;        \
+    totb = decode[1]*c;        \
+    c = hc[1];                 \
+    tota += decode[2]*c;       \
+    totb += decode[3]*c;
+
+// this weird order of add (tap 0, then tap 2, then tap 1) matches the simd
+#define stbir__3_coeff_only()  \
+    float tota,totb,c;         \
+    c = hc[0];                 \
+    tota = decode[0]*c;        \
+    totb = decode[1]*c;        \
+    c = hc[2];                 \
+    tota += decode[4]*c;       \
+    totb += decode[5]*c;       \
+    c = hc[1];                 \
+    tota += decode[2]*c;       \
+    totb += decode[3]*c;
+
+#define stbir__store_output_tiny()                \
+    output[0] = tota;                             \
+    output[1] = totb;                             \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2;
+
+#define stbir__4_coeff_start()      \
+    float tota0,tota1,tota2,tota3,totb0,totb1,totb2,totb3,c;  \
+    c = hc[0];                      \
+    tota0 = decode[0]*c;            \
+    totb0 = decode[1]*c;            \
+    c = hc[1];                      \
+    tota1 = decode[2]*c;            \
+    totb1 = decode[3]*c;            \
+    c = hc[2];                      \
+    tota2 = decode[4]*c;            \
+    totb2 = decode[5]*c;            \
+    c = hc[3];                      \
+    tota3 = decode[6]*c;            \
+    totb3 = decode[7]*c;
+
+#define stbir__4_coeff_continue_from_4( ofs )  \
+    c = hc[0+(ofs)];                           \
+    tota0 += decode[0+(ofs)*2]*c;              \
+    totb0 += decode[1+(ofs)*2]*c;              \
+    c = hc[1+(ofs)];                           \
+    tota1 += decode[2+(ofs)*2]*c;              \
+    totb1 += decode[3+(ofs)*2]*c;              \
+    c = hc[2+(ofs)];                           \
+    tota2 += decode[4+(ofs)*2]*c;              \
+    totb2 += decode[5+(ofs)*2]*c;              \
+    c = hc[3+(ofs)];                           \
+    tota3 += decode[6+(ofs)*2]*c;              \
+    totb3 += decode[7+(ofs)*2]*c;
+
+#define stbir__1_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*2] * c;    \
+    totb0 += decode[1+(ofs)*2] * c;
+
+#define stbir__2_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*2] * c;    \
+    totb0 += decode[1+(ofs)*2] * c;    \
+    c = hc[1+(ofs)];                   \
+    tota1 += decode[2+(ofs)*2] * c;    \
+    totb1 += decode[3+(ofs)*2] * c;
+
+#define stbir__3_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*2] * c;    \
+    totb0 += decode[1+(ofs)*2] * c;    \
+    c = hc[1+(ofs)];                   \
+    tota1 += decode[2+(ofs)*2] * c;    \
+    totb1 += decode[3+(ofs)*2] * c;    \
+    c = hc[2+(ofs)];                   \
+    tota2 += decode[4+(ofs)*2] * c;    \
+    totb2 += decode[5+(ofs)*2] * c;
+
+#define stbir__store_output()                     \
+    output[0] = (tota0+tota2)+(tota1+tota3);      \
+    output[1] = (totb0+totb2)+(totb1+totb3);      \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2;
+
+#endif
+// select the 2-channel configuration and re-include the header in STB_IMAGE_RESIZE_DO_HORIZONTALS mode (presumably what expands the macros above)
+#define STBIR__horizontal_channels 2
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS
+#include STBIR__HEADER_FILENAME
+
+
+//=================
+// Do 3 channel horizontal routines
+
+#ifdef STBIR_SIMD
+
+#define stbir__1_coeff_only()         \
+    stbir__simdf tot,c,d;             \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    stbir__simdf_load1z( c, hc );     \
+    stbir__simdf_0123to0001( c, c );  \
+    stbir__simdf_load( d, decode );   \
+    stbir__simdf_mult( tot, d, c );
+
+#define stbir__2_coeff_only()         \
+    stbir__simdf tot,c,cs,d;          \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    stbir__simdf_load2( cs, hc );     \
+    stbir__simdf_0123to0000( c, cs ); \
+    stbir__simdf_load( d, decode );   \
+    stbir__simdf_mult( tot, d, c );   \
+    stbir__simdf_0123to1111( c, cs ); \
+    stbir__simdf_load( d, decode+3 ); \
+    stbir__simdf_madd( tot, tot, d, c );
+
+#define stbir__3_coeff_only()            \
+    stbir__simdf tot,c,d,cs;             \
+    STBIR_SIMD_NO_UNROLL(decode);        \
+    stbir__simdf_load( cs, hc );         \
+    stbir__simdf_0123to0000( c, cs );    \
+    stbir__simdf_load( d, decode );      \
+    stbir__simdf_mult( tot, d, c );      \
+    stbir__simdf_0123to1111( c, cs );    \
+    stbir__simdf_load( d, decode+3 );    \
+    stbir__simdf_madd( tot, tot, d, c ); \
+    stbir__simdf_0123to2222( c, cs );    \
+    stbir__simdf_load( d, decode+6 );    \
+    stbir__simdf_madd( tot, tot, d, c );
+
+#define stbir__store_output_tiny()                \
+    stbir__simdf_store2( output, tot );           \
+    stbir__simdf_0123to2301( tot, tot );          \
+    stbir__simdf_store1( output+2, tot );         \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 3;
+
+#ifdef STBIR_SIMD8
+
+// we're loading from the XXXYYY decode by -1 to get the XXXYYY into different halves of the AVX reg fyi
+#define stbir__4_coeff_start()                     \
+    stbir__simdf8 tot0,tot1,c,cs; stbir__simdf t;  \
+    STBIR_SIMD_NO_UNROLL(decode);                  \
+    stbir__simdf8_load4b( cs, hc );                \
+    stbir__simdf8_0123to00001111( c, cs );         \
+    stbir__simdf8_mult_mem( tot0, c, decode - 1 ); \
+    stbir__simdf8_0123to22223333( c, cs );         \
+    stbir__simdf8_mult_mem( tot1, c, decode+6 - 1 );
+
+#define stbir__4_coeff_continue_from_4( ofs )      \
+    STBIR_SIMD_NO_UNROLL(decode);                  \
+    stbir__simdf8_load4b( cs, hc + (ofs) );        \
+    stbir__simdf8_0123to00001111( c, cs );         \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
+    stbir__simdf8_0123to22223333( c, cs );         \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*3 + 6 - 1 );
+
+#define stbir__1_coeff_remnant( ofs )                          \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf_load1rep4( t, hc + (ofs) );                   \
+    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*3 - 1 );
+
+#define stbir__2_coeff_remnant( ofs )                          \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf8_load4b( cs, hc + (ofs) - 2 );                \
+    stbir__simdf8_0123to22223333( c, cs );                     \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 );
+
+ #define stbir__3_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                                \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                      \
+    stbir__simdf8_0123to00001111( c, cs );                       \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
+    stbir__simdf8_0123to2222( t, cs );                           \
+    stbir__simdf8_madd_mem4( tot1, tot1, t, decode+(ofs)*3 + 6 - 1 );
+
+#define stbir__store_output()                       \
+    stbir__simdf8_add( tot0, tot0, tot1 );          \
+    stbir__simdf_0123to1230( t, stbir__if_simdf8_cast_to_simdf4( tot0 ) ); \
+    stbir__simdf8_add4halves( t, t, tot0 );         \
+    horizontal_coefficients += coefficient_width;   \
+    ++horizontal_contributors;                      \
+    output += 3;                                    \
+    if ( output < output_end )                      \
+    {                                               \
+      stbir__simdf_store( output-3, t );            \
+      continue;                                     \
+    }                                               \
+    { stbir__simdf tt; stbir__simdf_0123to2301( tt, t ); \
+    stbir__simdf_store2( output-3, t );             \
+    stbir__simdf_store1( output+2-3, tt ); }        \
+    break;
+
+
+#else
+
+#define stbir__4_coeff_start()                  \
+    stbir__simdf tot0,tot1,tot2,c,cs;           \
+    STBIR_SIMD_NO_UNROLL(decode);               \
+    stbir__simdf_load( cs, hc );                \
+    stbir__simdf_0123to0001( c, cs );           \
+    stbir__simdf_mult_mem( tot0, c, decode );   \
+    stbir__simdf_0123to1122( c, cs );           \
+    stbir__simdf_mult_mem( tot1, c, decode+4 ); \
+    stbir__simdf_0123to2333( c, cs );           \
+    stbir__simdf_mult_mem( tot2, c, decode+8 );
+
+#define stbir__4_coeff_continue_from_4( ofs )                 \
+    STBIR_SIMD_NO_UNROLL(decode);                             \
+    stbir__simdf_load( cs, hc + (ofs) );                      \
+    stbir__simdf_0123to0001( c, cs );                         \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );   \
+    stbir__simdf_0123to1122( c, cs );                         \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
+    stbir__simdf_0123to2333( c, cs );                         \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*3+8 );
+
+#define stbir__1_coeff_remnant( ofs )         \
+    STBIR_SIMD_NO_UNROLL(decode);             \
+    stbir__simdf_load1z( c, hc + (ofs) );     \
+    stbir__simdf_0123to0001( c, c );          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );
+
+#define stbir__2_coeff_remnant( ofs )                       \
+    { stbir__simdf d;                                       \
+    STBIR_SIMD_NO_UNROLL(decode);                           \
+    stbir__simdf_load2z( cs, hc + (ofs) );                  \
+    stbir__simdf_0123to0001( c, cs );                       \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
+    stbir__simdf_0123to1122( c, cs );                       \
+    stbir__simdf_load2z( d, decode+(ofs)*3+4 );             \
+    stbir__simdf_madd( tot1, tot1, c, d ); }
+
+#define stbir__3_coeff_remnant( ofs )                         \
+    { stbir__simdf d;                                         \
+    STBIR_SIMD_NO_UNROLL(decode);                             \
+    stbir__simdf_load( cs, hc + (ofs) );                      \
+    stbir__simdf_0123to0001( c, cs );                         \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );   \
+    stbir__simdf_0123to1122( c, cs );                         \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
+    stbir__simdf_0123to2222( c, cs );                         \
+    stbir__simdf_load1z( d, decode+(ofs)*3+8 );               \
+    stbir__simdf_madd( tot2, tot2, c, d );  }
+
+#define stbir__store_output()                       \
+    stbir__simdf_0123ABCDto3ABx( c, tot0, tot1 );   \
+    stbir__simdf_0123ABCDto23Ax( cs, tot1, tot2 );  \
+    stbir__simdf_0123to1230( tot2, tot2 );          \
+    stbir__simdf_add( tot0, tot0, cs );             \
+    stbir__simdf_add( c, c, tot2 );                 \
+    stbir__simdf_add( tot0, tot0, c );              \
+    horizontal_coefficients += coefficient_width;   \
+    ++horizontal_contributors;                      \
+    output += 3;                                    \
+    if ( output < output_end )                      \
+    {                                               \
+      stbir__simdf_store( output-3, tot0 );         \
+      continue;                                     \
+    }                                               \
+    stbir__simdf_0123to2301( tot1, tot0 );          \
+    stbir__simdf_store2( output-3, tot0 );          \
+    stbir__simdf_store1( output+2-3, tot1 );        \
+    break;
+
+#endif
+
+#else
+
+#define stbir__1_coeff_only()  \
+    float tot0, tot1, tot2, c; \
+    c = hc[0];                 \
+    tot0 = decode[0]*c;        \
+    tot1 = decode[1]*c;        \
+    tot2 = decode[2]*c;
+
+#define stbir__2_coeff_only()  \
+    float tot0, tot1, tot2, c; \
+    c = hc[0];                 \
+    tot0 = decode[0]*c;        \
+    tot1 = decode[1]*c;        \
+    tot2 = decode[2]*c;        \
+    c = hc[1];                 \
+    tot0 += decode[3]*c;       \
+    tot1 += decode[4]*c;       \
+    tot2 += decode[5]*c;
+
+#define stbir__3_coeff_only()  \
+    float tot0, tot1, tot2, c; \
+    c = hc[0];                 \
+    tot0 = decode[0]*c;        \
+    tot1 = decode[1]*c;        \
+    tot2 = decode[2]*c;        \
+    c = hc[1];                 \
+    tot0 += decode[3]*c;       \
+    tot1 += decode[4]*c;       \
+    tot2 += decode[5]*c;       \
+    c = hc[2];                 \
+    tot0 += decode[6]*c;       \
+    tot1 += decode[7]*c;       \
+    tot2 += decode[8]*c;
+
+#define stbir__store_output_tiny()                \
+    output[0] = tot0;                             \
+    output[1] = tot1;                             \
+    output[2] = tot2;                             \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 3;
+
+#define stbir__4_coeff_start()      \
+    float tota0,tota1,tota2,totb0,totb1,totb2,totc0,totc1,totc2,totd0,totd1,totd2,c;  \
+    c = hc[0];                      \
+    tota0 = decode[0]*c;            \
+    tota1 = decode[1]*c;            \
+    tota2 = decode[2]*c;            \
+    c = hc[1];                      \
+    totb0 = decode[3]*c;            \
+    totb1 = decode[4]*c;            \
+    totb2 = decode[5]*c;            \
+    c = hc[2];                      \
+    totc0 = decode[6]*c;            \
+    totc1 = decode[7]*c;            \
+    totc2 = decode[8]*c;            \
+    c = hc[3];                      \
+    totd0 = decode[9]*c;            \
+    totd1 = decode[10]*c;           \
+    totd2 = decode[11]*c;
+
+#define stbir__4_coeff_continue_from_4( ofs )  \
+    c = hc[0+(ofs)];                           \
+    tota0 += decode[0+(ofs)*3]*c;              \
+    tota1 += decode[1+(ofs)*3]*c;              \
+    tota2 += decode[2+(ofs)*3]*c;              \
+    c = hc[1+(ofs)];                           \
+    totb0 += decode[3+(ofs)*3]*c;              \
+    totb1 += decode[4+(ofs)*3]*c;              \
+    totb2 += decode[5+(ofs)*3]*c;              \
+    c = hc[2+(ofs)];                           \
+    totc0 += decode[6+(ofs)*3]*c;              \
+    totc1 += decode[7+(ofs)*3]*c;              \
+    totc2 += decode[8+(ofs)*3]*c;              \
+    c = hc[3+(ofs)];                           \
+    totd0 += decode[9+(ofs)*3]*c;              \
+    totd1 += decode[10+(ofs)*3]*c;             \
+    totd2 += decode[11+(ofs)*3]*c;
+
+#define stbir__1_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*3]*c;      \
+    tota1 += decode[1+(ofs)*3]*c;      \
+    tota2 += decode[2+(ofs)*3]*c;
+
+#define stbir__2_coeff_remnant( ofs )  /* fold the two leftover coefficients at ofs and ofs+1 into tota/totb */ \
+    c = hc[0+(ofs)];                   /* first leftover coefficient -> tota0..2 */ \
+    tota0 += decode[0+(ofs)*3]*c;      \
+    tota1 += decode[1+(ofs)*3]*c;      \
+    tota2 += decode[2+(ofs)*3]*c;      \
+    c = hc[1+(ofs)];                   /* second leftover coefficient -> totb0..2 */ \
+    totb0 += decode[3+(ofs)*3]*c;      \
+    totb1 += decode[4+(ofs)*3]*c;      \
+    totb2 += decode[5+(ofs)*3]*c;      /* last statement: no line continuation after it */
+
+#define stbir__3_coeff_remnant( ofs )  /* fold the three leftover coefficients at ofs..ofs+2 into tota/totb/totc */ \
+    c = hc[(ofs)+0];                   \
+    tota0 += c * decode[(ofs)*3+0];    \
+    tota1 += c * decode[(ofs)*3+1];    \
+    tota2 += c * decode[(ofs)*3+2];    \
+    c = hc[(ofs)+1];                   \
+    totb0 += c * decode[(ofs)*3+3];    \
+    totb1 += c * decode[(ofs)*3+4];    \
+    totb2 += c * decode[(ofs)*3+5];    \
+    c = hc[(ofs)+2];                   \
+    totc0 += c * decode[(ofs)*3+6];    \
+    totc1 += c * decode[(ofs)*3+7];    \
+    totc2 += c * decode[(ofs)*3+8];
+
+#define stbir__store_output()                     /* write one 3-float result, then step all three cursors */ \
+    output[0] = ( tota0 + totc0 ) + ( totb0 + totd0 );  /* a+c and b+d are added first, then combined */ \
+    output[1] = ( tota1 + totc1 ) + ( totb1 + totd1 );  \
+    output[2] = ( tota2 + totc2 ) + ( totb2 + totd2 );  \
+    horizontal_coefficients = horizontal_coefficients + coefficient_width; \
+    horizontal_contributors += 1;                 \
+    output = output + 3;
+
+#endif
+
+#define STBIR__horizontal_channels 3    // channel count that goes with the 3-floats-per-pixel macros defined above
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS // NOTE(review): presumably selects the horizontal-routine section of the re-included header - confirm
+#include STBIR__HEADER_FILENAME         // include the file named by STBIR__HEADER_FILENAME with the two macros above defined
+
+//=================
+// Do 4 channel horizontal routines
+
+#ifdef STBIR_SIMD
+
+#define stbir__1_coeff_only()             /* 4-channel SIMD: whole result from a single coefficient */ \
+    stbir__simdf tot,c;                   \
+    STBIR_SIMD_NO_UNROLL(decode);         \
+    stbir__simdf_load1( c, hc );          /* presumably loads hc[0] into c - confirm against the helper */ \
+    stbir__simdf_0123to0000( c, c );      /* presumably copies lane 0 to every lane - confirm */ \
+    stbir__simdf_mult_mem( tot, c, decode ); /* presumably tot = c times the floats at decode - confirm */
+
+#define stbir__2_coeff_only()                       /* 4-channel SIMD: whole result from two coefficients */ \
+    stbir__simdf tot,c,cs;                          \
+    STBIR_SIMD_NO_UNROLL(decode);                   \
+    stbir__simdf_load2( cs, hc );                   /* presumably loads hc[0..1] into cs - confirm */ \
+    stbir__simdf_0123to0000( c, cs );               \
+    stbir__simdf_mult_mem( tot, c, decode );        /* first term reads from decode */ \
+    stbir__simdf_0123to1111( c, cs );               \
+    stbir__simdf_madd_mem( tot, tot, c, decode+4 ); /* second term reads from decode+4 and accumulates into tot */
+
+#define stbir__3_coeff_only()                       /* 4-channel SIMD: whole result from three coefficients */ \
+    stbir__simdf tot,c,cs;                          \
+    STBIR_SIMD_NO_UNROLL(decode);                   \
+    stbir__simdf_load( cs, hc );                    /* NOTE(review): full-width load from hc although only three lanes are used below - confirm hc is readable that far */ \
+    stbir__simdf_0123to0000( c, cs );               \
+    stbir__simdf_mult_mem( tot, c, decode );        /* first term reads from decode */ \
+    stbir__simdf_0123to1111( c, cs );               \
+    stbir__simdf_madd_mem( tot, tot, c, decode+4 ); /* second term reads from decode+4 */ \
+    stbir__simdf_0123to2222( c, cs );               \
+    stbir__simdf_madd_mem( tot, tot, c, decode+8 ); /* third term reads from decode+8 */
+
+#define stbir__store_output_tiny()                /* store tot at output, then step all three cursors */ \
+    stbir__simdf_store( output, tot );            \
+    horizontal_coefficients += coefficient_width; /* advance the coefficient cursor by coefficient_width */ \
+    ++horizontal_contributors;                    \
+    output += 4;                                  /* 4 floats per output pixel */
+
+#ifdef STBIR_SIMD8
+
+#define stbir__4_coeff_start()                     /* 4-channel SIMD8: start tot0 from the first four coefficients */ \
+    stbir__simdf8 tot0,c,cs; stbir__simdf t;  /* t is not used here; the remnant and store macros of this family use it */ \
+    STBIR_SIMD_NO_UNROLL(decode);                  \
+    stbir__simdf8_load4b( cs, hc );                \
+    stbir__simdf8_0123to00001111( c, cs );         /* presumably coefficient 0 in one half, coefficient 1 in the other - confirm */ \
+    stbir__simdf8_mult_mem( tot0, c, decode );     \
+    stbir__simdf8_0123to22223333( c, cs );         /* presumably coefficients 2 and 3 likewise - confirm */ \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+8 );
+
+#define stbir__4_coeff_continue_from_4( ofs )                  /* 4-channel SIMD8: accumulate coefficients ofs..ofs+3 into tot0 */ \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                    \
+    stbir__simdf8_0123to00001111( c, cs );                     \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   /* input offset is ofs*4 floats (4 per pixel) */ \
+    stbir__simdf8_0123to22223333( c, cs );                     \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
+
+#define stbir__1_coeff_remnant( ofs )                          /* 4-channel SIMD8: accumulate the one leftover coefficient at ofs */ \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf_load1rep4( t, hc + (ofs) );                   /* presumably hc[ofs] repeated across the 4-wide t - confirm */ \
+    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4 );  /* 4-wide operand t accumulated into the 8-wide tot0 */
+
+#define stbir__2_coeff_remnant( ofs )                          /* 4-channel SIMD8: accumulate the two leftover coefficients at ofs, ofs+1 */ \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf8_load4b( cs, hc + (ofs) - 2 );                /* load starts two floats before hc+ofs; NOTE(review): presumably to keep the load inside the coefficient row - confirm */ \
+    stbir__simdf8_0123to22223333( c, cs );                     /* lanes 2 and 3 of that load are the ones at hc+ofs and hc+ofs+1 */ \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );
+
+ #define stbir__3_coeff_remnant( ofs )                         /* 4-channel SIMD8: accumulate the three leftover coefficients at ofs..ofs+2 */ \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                    \
+    stbir__simdf8_0123to00001111( c, cs );                     \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   /* first two coefficients, 8-wide */ \
+    stbir__simdf8_0123to2222( t, cs );                         /* third coefficient goes into the 4-wide t */ \
+    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4+8 );
+
+#define stbir__store_output()                      /* 4-channel SIMD8: reduce tot0 to 4 floats, store, step all three cursors */ \
+    stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 );     /* presumably t = low half + high half of tot0 - confirm */ \
+    stbir__simdf_store( output, t );               \
+    horizontal_coefficients += coefficient_width;  \
+    ++horizontal_contributors;                     \
+    output += 4;                                   /* 4 floats per output pixel */
+
+#else
+
+#define stbir__4_coeff_start()                        /* 4-channel SIMD (non-SIMD8): start two sums from the first four coefficients */ \
+    stbir__simdf tot0,tot1,c,cs;                      \
+    STBIR_SIMD_NO_UNROLL(decode);                     \
+    stbir__simdf_load( cs, hc );                      \
+    stbir__simdf_0123to0000( c, cs );                 \
+    stbir__simdf_mult_mem( tot0, c, decode );         /* tot0 takes coefficients 0 and 2 */ \
+    stbir__simdf_0123to1111( c, cs );                 \
+    stbir__simdf_mult_mem( tot1, c, decode+4 );       /* tot1 takes coefficients 1 and 3 */ \
+    stbir__simdf_0123to2222( c, cs );                 \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+8 ); \
+    stbir__simdf_0123to3333( c, cs );                 \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+12 );
+
+#define stbir__4_coeff_continue_from_4( ofs )                  /* accumulate coefficients ofs..ofs+3, alternating between tot0 and tot1 */ \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf_load( cs, hc + (ofs) );                       \
+    stbir__simdf_0123to0000( c, cs );                          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );    /* input offset is ofs*4 floats (4 per pixel) */ \
+    stbir__simdf_0123to1111( c, cs );                          \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );  \
+    stbir__simdf_0123to2222( c, cs );                          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );  \
+    stbir__simdf_0123to3333( c, cs );                          \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+12 );
+
+#define stbir__1_coeff_remnant( ofs )                       /* accumulate the one leftover coefficient at ofs into tot0 */ \
+    STBIR_SIMD_NO_UNROLL(decode);                           \
+    stbir__simdf_load1( c, hc + (ofs) );                    \
+    stbir__simdf_0123to0000( c, c );                        /* presumably copies lane 0 to every lane - confirm */ \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );
+
+#define stbir__2_coeff_remnant( ofs )                         /* accumulate the two leftover coefficients at ofs, ofs+1 */ \
+    STBIR_SIMD_NO_UNROLL(decode);                             \
+    stbir__simdf_load2( cs, hc + (ofs) );                     \
+    stbir__simdf_0123to0000( c, cs );                         \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   /* first leftover goes to tot0 */ \
+    stbir__simdf_0123to1111( c, cs );                         \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); /* second leftover goes to tot1 */
+
+#define stbir__3_coeff_remnant( ofs )                          /* accumulate the three leftover coefficients at ofs..ofs+2 */ \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf_load( cs, hc + (ofs) );                       /* NOTE(review): full-width load although only three lanes are used - confirm hc is readable that far */ \
+    stbir__simdf_0123to0000( c, cs );                          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );    \
+    stbir__simdf_0123to1111( c, cs );                          \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );  \
+    stbir__simdf_0123to2222( c, cs );                          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
+
+#define stbir__store_output()                     /* combine tot0 and tot1, store the result, step all three cursors */ \
+    stbir__simdf_add( tot0, tot0, tot1 );         \
+    stbir__simdf_store( output, tot0 );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 4;                                  /* 4 floats per output pixel */
+
+#endif
+
+#else
+
+#define stbir__1_coeff_only()         /* 4-channel scalar: whole result (p0..p3) from a single coefficient */ \
+    float c, p0, p1, p2, p3;          \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0];                        \
+    p0 = c * decode[0];               \
+    p1 = c * decode[1];               \
+    p2 = c * decode[2];               \
+    p3 = c * decode[3];
+
+#define stbir__2_coeff_only()         /* 4-channel scalar: whole result (p0..p3) from two coefficients */ \
+    float c, p0, p1, p2, p3;          \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0];                        /* first coefficient applies to decode[0..3] */ \
+    p0 = c * decode[0];               \
+    p1 = c * decode[1];               \
+    p2 = c * decode[2];               \
+    p3 = c * decode[3];               \
+    c = hc[1];                        /* second coefficient applies to decode[4..7] */ \
+    p0 += c * decode[4];              \
+    p1 += c * decode[5];              \
+    p2 += c * decode[6];              \
+    p3 += c * decode[7];
+
+#define stbir__3_coeff_only()         /* 4-channel scalar: whole result (p0..p3) from three coefficients */ \
+    float c, p0, p1, p2, p3;          \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0];                        /* first coefficient applies to decode[0..3] */ \
+    p0 = c * decode[0];               \
+    p1 = c * decode[1];               \
+    p2 = c * decode[2];               \
+    p3 = c * decode[3];               \
+    c = hc[1];                        /* second coefficient applies to decode[4..7] */ \
+    p0 += c * decode[4];              \
+    p1 += c * decode[5];              \
+    p2 += c * decode[6];              \
+    p3 += c * decode[7];              \
+    c = hc[2];                        /* third coefficient applies to decode[8..11] */ \
+    p0 += c * decode[8];              \
+    p1 += c * decode[9];              \
+    p2 += c * decode[10];             \
+    p3 += c * decode[11];
+
+#define stbir__store_output_tiny()                /* write p0..p3 as one 4-float result, then step all three cursors */ \
+    output[0] = p0;  output[1] = p1;              \
+    output[2] = p2;  output[3] = p3;              \
+    horizontal_coefficients = horizontal_coefficients + coefficient_width; \
+    horizontal_contributors += 1;                 \
+    output = output + 4;                          \
+    /* the next two lines are comment-only padding; the expansion ends with the statement above */ \
+    /* (kept so the macro occupies the same number of source lines as before) */
+
+#define stbir__4_coeff_start()        /* 4-channel scalar: open two partial sums, x (coefficients 0,2) and y (coefficients 1,3) */ \
+    float c, x0, x1, x2, x3, y0, y1, y2, y3;  \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0];                        \
+    x0 = c * decode[0];               \
+    x1 = c * decode[1];               \
+    x2 = c * decode[2];               \
+    x3 = c * decode[3];               \
+    c = hc[1];                        \
+    y0 = c * decode[4];               \
+    y1 = c * decode[5];               \
+    y2 = c * decode[6];               \
+    y3 = c * decode[7];               \
+    c = hc[2];                        \
+    x0 += c * decode[8];              \
+    x1 += c * decode[9];              \
+    x2 += c * decode[10];             \
+    x3 += c * decode[11];             \
+    c = hc[3];                        \
+    y0 += c * decode[12];             \
+    y1 += c * decode[13];             \
+    y2 += c * decode[14];             \
+    y3 += c * decode[15];
+
+#define stbir__4_coeff_continue_from_4( ofs ) /* add coefficients ofs..ofs+3 (4 floats each), alternating x and y */ \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[(ofs)+0];                  \
+    x0 += c * decode[(ofs)*4+0];      \
+    x1 += c * decode[(ofs)*4+1];      \
+    x2 += c * decode[(ofs)*4+2];      \
+    x3 += c * decode[(ofs)*4+3];      \
+    c = hc[(ofs)+1];                  \
+    y0 += c * decode[(ofs)*4+4];      \
+    y1 += c * decode[(ofs)*4+5];      \
+    y2 += c * decode[(ofs)*4+6];      \
+    y3 += c * decode[(ofs)*4+7];      \
+    c = hc[(ofs)+2];                  \
+    x0 += c * decode[(ofs)*4+8];      \
+    x1 += c * decode[(ofs)*4+9];      \
+    x2 += c * decode[(ofs)*4+10];     \
+    x3 += c * decode[(ofs)*4+11];     \
+    c = hc[(ofs)+3];                  \
+    y0 += c * decode[(ofs)*4+12];     \
+    y1 += c * decode[(ofs)*4+13];     \
+    y2 += c * decode[(ofs)*4+14];     \
+    y3 += c * decode[(ofs)*4+15];
+
+#define stbir__1_coeff_remnant( ofs ) /* fold the single leftover coefficient at index ofs into x0..x3 */ \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[(ofs)+0];                  \
+    x0 += c * decode[(ofs)*4+0];      \
+    x1 += c * decode[(ofs)*4+1];      \
+    x2 += c * decode[(ofs)*4+2];      \
+    x3 += c * decode[(ofs)*4+3];
+
+#define stbir__2_coeff_remnant( ofs ) /* fold the two leftover coefficients at ofs, ofs+1 into x and y */ \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[(ofs)+0];                  \
+    x0 += c * decode[(ofs)*4+0];      \
+    x1 += c * decode[(ofs)*4+1];      \
+    x2 += c * decode[(ofs)*4+2];      \
+    x3 += c * decode[(ofs)*4+3];      \
+    c = hc[(ofs)+1];                  \
+    y0 += c * decode[(ofs)*4+4];      \
+    y1 += c * decode[(ofs)*4+5];      \
+    y2 += c * decode[(ofs)*4+6];      \
+    y3 += c * decode[(ofs)*4+7];
+
+#define stbir__3_coeff_remnant( ofs ) /* fold the three leftover coefficients at ofs..ofs+2 into x, y, x */ \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[(ofs)+0];                  \
+    x0 += c * decode[(ofs)*4+0];      \
+    x1 += c * decode[(ofs)*4+1];      \
+    x2 += c * decode[(ofs)*4+2];      \
+    x3 += c * decode[(ofs)*4+3];      \
+    c = hc[(ofs)+1];                  \
+    y0 += c * decode[(ofs)*4+4];      \
+    y1 += c * decode[(ofs)*4+5];      \
+    y2 += c * decode[(ofs)*4+6];      \
+    y3 += c * decode[(ofs)*4+7];      \
+    c = hc[(ofs)+2];                  \
+    x0 += c * decode[(ofs)*4+8];      \
+    x1 += c * decode[(ofs)*4+9];      \
+    x2 += c * decode[(ofs)*4+10];     \
+    x3 += c * decode[(ofs)*4+11];
+
+#define stbir__store_output()                     /* write x+y as one 4-float result, then step all three cursors */ \
+    output[0] = x0 + y0;  output[1] = x1 + y1;    \
+    output[2] = x2 + y2;  output[3] = x3 + y3;    \
+    horizontal_coefficients = horizontal_coefficients + coefficient_width; \
+    horizontal_contributors += 1;                 \
+    output = output + 4;                          \
+    /* the next two lines are comment-only padding; the expansion ends with the statement above */ \
+    /* (kept so the macro occupies the same number of source lines as before) */
+
+#endif
+
+#define STBIR__horizontal_channels 4    // channel count that goes with the 4-floats-per-pixel macros defined above
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS // NOTE(review): presumably selects the horizontal-routine section of the re-included header - confirm
+#include STBIR__HEADER_FILENAME         // include the file named by STBIR__HEADER_FILENAME with the two macros above defined
+
+
+
+//=================
+// Do 7 channel horizontal routines
+
+#ifdef STBIR_SIMD
+
+#define stbir__1_coeff_only()                   /* 7-channel SIMD: whole result from a single coefficient */ \
+    stbir__simdf tot0,tot1,c;                   \
+    STBIR_SIMD_NO_UNROLL(decode);               \
+    stbir__simdf_load1( c, hc );                \
+    stbir__simdf_0123to0000( c, c );            /* presumably copies lane 0 to every lane - confirm */ \
+    stbir__simdf_mult_mem( tot0, c, decode );   /* tot0 reads from decode */ \
+    stbir__simdf_mult_mem( tot1, c, decode+3 ); /* tot1 reads from decode+3; NOTE(review): presumably two 4-wide windows sharing index 3 cover the 7 floats - confirm */
+
+#define stbir__2_coeff_only()                         /* 7-channel SIMD: whole result from two coefficients */ \
+    stbir__simdf tot0,tot1,c,cs;                      \
+    STBIR_SIMD_NO_UNROLL(decode);                     \
+    stbir__simdf_load2( cs, hc );                     \
+    stbir__simdf_0123to0000( c, cs );                 \
+    stbir__simdf_mult_mem( tot0, c, decode );         /* first pixel: offsets 0 and 3 */ \
+    stbir__simdf_mult_mem( tot1, c, decode+3 );       \
+    stbir__simdf_0123to1111( c, cs );                 \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); /* second pixel: offsets 7 and 10 */ \
+    stbir__simdf_madd_mem( tot1, tot1, c,decode+10 );
+
+#define stbir__3_coeff_only()                           /* 7-channel SIMD: whole result from three coefficients */ \
+    stbir__simdf tot0,tot1,c,cs;                        \
+    STBIR_SIMD_NO_UNROLL(decode);                       \
+    stbir__simdf_load( cs, hc );                        /* NOTE(review): full-width load although only three lanes are used - confirm hc is readable that far */ \
+    stbir__simdf_0123to0000( c, cs );                   \
+    stbir__simdf_mult_mem( tot0, c, decode );           /* first pixel: offsets 0 and 3 */ \
+    stbir__simdf_mult_mem( tot1, c, decode+3 );         \
+    stbir__simdf_0123to1111( c, cs );                   \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+7 );   /* second pixel: offsets 7 and 10 */ \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+10 );  \
+    stbir__simdf_0123to2222( c, cs );                   \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+14 );  /* third pixel: offsets 14 and 17 */ \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );
+
+#define stbir__store_output_tiny()                /* store tot1 then tot0, then step all three cursors */ \
+    stbir__simdf_store( output+3, tot1 );         /* tot1 is written first, at output+3 */ \
+    stbir__simdf_store( output, tot0 );           /* tot0 is written second, at output */ \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 7;                                  /* 7 floats per output pixel */
+
+#ifdef STBIR_SIMD8
+
+#define stbir__4_coeff_start()                     /* 7-channel SIMD8: start two sums from the first four coefficients */ \
+    stbir__simdf8 tot0,tot1,c,cs;                  \
+    STBIR_SIMD_NO_UNROLL(decode);                  \
+    stbir__simdf8_load4b( cs, hc );                \
+    stbir__simdf8_0123to00000000( c, cs );         \
+    stbir__simdf8_mult_mem( tot0, c, decode );     /* tot0 takes coefficients 0 and 2 */ \
+    stbir__simdf8_0123to11111111( c, cs );         \
+    stbir__simdf8_mult_mem( tot1, c, decode+7 );   /* tot1 takes coefficients 1 and 3; pixels are 7 floats apart */ \
+    stbir__simdf8_0123to22222222( c, cs );         \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+14 );  \
+    stbir__simdf8_0123to33333333( c, cs );         \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+21 );
+
+#define stbir__4_coeff_continue_from_4( ofs )                   /* accumulate coefficients ofs..ofs+3, alternating between tot0 and tot1 */ \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                     \
+    stbir__simdf8_0123to00000000( c, cs );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    /* input offset is ofs*7 floats (7 per pixel) */ \
+    stbir__simdf8_0123to11111111( c, cs );                      \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );  \
+    stbir__simdf8_0123to22222222( c, cs );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
+    stbir__simdf8_0123to33333333( c, cs );                      \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+21 );
+
+#define stbir__1_coeff_remnant( ofs )                           /* accumulate the one leftover coefficient at ofs into tot0 */ \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf8_load1b( c, hc + (ofs) );                      /* presumably hc[ofs] repeated across all lanes - confirm */ \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );
+
+#define stbir__2_coeff_remnant( ofs )                           /* accumulate the two leftover coefficients at ofs, ofs+1 */ \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf8_load1b( c, hc + (ofs) );                      /* each coefficient is loaded on its own with load1b */ \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    /* first leftover goes to tot0 */ \
+    stbir__simdf8_load1b( c, hc + (ofs)+1 );                    \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );  /* second leftover goes to tot1 */
+
+#define stbir__3_coeff_remnant( ofs )                           /* accumulate the three leftover coefficients at ofs..ofs+2 */ \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                     /* NOTE(review): loads via load4b although only three lanes are used - confirm hc is readable that far */ \
+    stbir__simdf8_0123to00000000( c, cs );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    \
+    stbir__simdf8_0123to11111111( c, cs );                      \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );  \
+    stbir__simdf8_0123to22222222( c, cs );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );
+
+#define stbir__store_output()                     /* 7-channel SIMD8 store; contains continue/break, so it must expand inside a loop */ \
+    stbir__simdf8_add( tot0, tot0, tot1 );        \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 7;                                  /* cursor is advanced before the store, hence output-7 below */ \
+    if ( output < output_end )                    /* not the last pixel: one 8-wide store, then next iteration */ \
+    {                                             \
+      stbir__simdf8_store( output-7, tot0 );      /* NOTE(review): presumably writes 8 floats, one past this pixel - confirm */ \
+      continue;                                   \
+    }                                             \
+    stbir__simdf_store( output-7+3, stbir__simdf_swiz(stbir__simdf8_gettop4(tot0),0,0,1,2) ); /* last pixel: two 4-wide stores at +3 and +0 instead */ \
+    stbir__simdf_store( output-7, stbir__if_simdf8_cast_to_simdf4(tot0) );           \
+    break;
+
+#else
+
+#define stbir__4_coeff_start()                    /* 7-channel SIMD (non-SIMD8): start four sums from the first four coefficients */ \
+    stbir__simdf tot0,tot1,tot2,tot3,c,cs;        \
+    STBIR_SIMD_NO_UNROLL(decode);                 \
+    stbir__simdf_load( cs, hc );                  \
+    stbir__simdf_0123to0000( c, cs );             \
+    stbir__simdf_mult_mem( tot0, c, decode );     /* tot0/tot1 take coefficients 0 and 2 (offsets +0 and +3 within a pixel) */ \
+    stbir__simdf_mult_mem( tot1, c, decode+3 );   \
+    stbir__simdf_0123to1111( c, cs );             \
+    stbir__simdf_mult_mem( tot2, c, decode+7 );   /* tot2/tot3 take coefficients 1 and 3 */ \
+    stbir__simdf_mult_mem( tot3, c, decode+10 );  \
+    stbir__simdf_0123to2222( c, cs );             \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+14 );  \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );  \
+    stbir__simdf_0123to3333( c, cs );                   \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+21 );  \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+24 );
+
+#define stbir__4_coeff_continue_from_4( ofs )                   /* accumulate coefficients ofs..ofs+3 into tot0/tot1 and tot2/tot3 alternately */ \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf_load( cs, hc + (ofs) );                        \
+    stbir__simdf_0123to0000( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     /* input offset is ofs*7 floats (7 per pixel) */ \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
+    stbir__simdf_0123to1111( c, cs );                           \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );  \
+    stbir__simdf_0123to2222( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );  \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );  \
+    stbir__simdf_0123to3333( c, cs );                           \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+21 );  \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+24 );
+
+#define stbir__1_coeff_remnant( ofs )                           /* accumulate the one leftover coefficient at ofs into tot0/tot1 */ \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf_load1( c, hc + (ofs) );                        \
+    stbir__simdf_0123to0000( c, c );                            /* presumably copies lane 0 to every lane - confirm */ \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   /* last statement: no line continuation after it */
+
+#define stbir__2_coeff_remnant( ofs )                           /* accumulate the two leftover coefficients at ofs, ofs+1 */ \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf_load2( cs, hc + (ofs) );                       \
+    stbir__simdf_0123to0000( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     /* first leftover goes to tot0/tot1 */ \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
+    stbir__simdf_0123to1111( c, cs );                           \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   /* second leftover goes to tot2/tot3 */ \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );
+
+#define stbir__3_coeff_remnant( ofs )                           /* accumulate the three leftover coefficients at ofs..ofs+2 */ \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf_load( cs, hc + (ofs) );                        /* NOTE(review): full-width load although only three lanes are used - confirm hc is readable that far */ \
+    stbir__simdf_0123to0000( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
+    stbir__simdf_0123to1111( c, cs );                           \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );  \
+    stbir__simdf_0123to2222( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );  \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );
+
+#define stbir__store_output()                     /* combine the four sums into two, store them, step all three cursors */ \
+    stbir__simdf_add( tot0, tot0, tot2 );         \
+    stbir__simdf_add( tot1, tot1, tot3 );         \
+    stbir__simdf_store( output+3, tot1 );         /* tot1 is written first, at output+3 */ \
+    stbir__simdf_store( output, tot0 );           /* tot0 is written second, at output */ \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 7;                                  /* 7 floats per output pixel */
+
+#endif
+
+#else
+
+#define stbir__1_coeff_only()        /* 7-channel scalar: whole result (tot0..tot6) from a single coefficient */ \
+    float c, tot0, tot1, tot2, tot3, tot4, tot5, tot6; \
+    c = hc[0];                       \
+    tot0 = c * decode[0];            \
+    tot1 = c * decode[1];            \
+    tot2 = c * decode[2];            \
+    tot3 = c * decode[3];            \
+    tot4 = c * decode[4];            \
+    tot5 = c * decode[5];            \
+    tot6 = c * decode[6];
+
+#define stbir__2_coeff_only()        /* 7-channel scalar: whole result (tot0..tot6) from two coefficients */ \
+    float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
+    c = hc[0];                       /* first coefficient applies to decode[0..6] */ \
+    tot0 = decode[0]*c;              \
+    tot1 = decode[1]*c;              \
+    tot2 = decode[2]*c;              \
+    tot3 = decode[3]*c;              \
+    tot4 = decode[4]*c;              \
+    tot5 = decode[5]*c;              \
+    tot6 = decode[6]*c;              \
+    c = hc[1];                       /* second coefficient applies to decode[7..13] */ \
+    tot0 += decode[7]*c;             \
+    tot1 += decode[8]*c;             \
+    tot2 += decode[9]*c;             \
+    tot3 += decode[10]*c;            \
+    tot4 += decode[11]*c;            \
+    tot5 += decode[12]*c;            \
+    tot6 += decode[13]*c;            /* last statement: no line continuation after it */
+
+#define stbir__3_coeff_only()        /* 7-channel scalar: whole result (tot0..tot6) from three coefficients */ \
+    float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
+    c = hc[0];                       /* first coefficient applies to decode[0..6] */ \
+    tot0 = decode[0]*c;              \
+    tot1 = decode[1]*c;              \
+    tot2 = decode[2]*c;              \
+    tot3 = decode[3]*c;              \
+    tot4 = decode[4]*c;              \
+    tot5 = decode[5]*c;              \
+    tot6 = decode[6]*c;              \
+    c = hc[1];                       /* second coefficient applies to decode[7..13] */ \
+    tot0 += decode[7]*c;             \
+    tot1 += decode[8]*c;             \
+    tot2 += decode[9]*c;             \
+    tot3 += decode[10]*c;            \
+    tot4 += decode[11]*c;            \
+    tot5 += decode[12]*c;            \
+    tot6 += decode[13]*c;            \
+    c = hc[2];                       /* third coefficient applies to decode[14..20] */ \
+    tot0 += decode[14]*c;            \
+    tot1 += decode[15]*c;            \
+    tot2 += decode[16]*c;            \
+    tot3 += decode[17]*c;            \
+    tot4 += decode[18]*c;            \
+    tot5 += decode[19]*c;            \
+    tot6 += decode[20]*c;            /* last statement: no line continuation after it */
+
+#define stbir__store_output_tiny()                /* write tot0..tot6 as one 7-float result, then step all three cursors */ \
+    output[0] = tot0;  output[1] = tot1;          \
+    output[2] = tot2;  output[3] = tot3;          \
+    output[4] = tot4;  output[5] = tot5;          \
+    output[6] = tot6;                             \
+    horizontal_coefficients = horizontal_coefficients + coefficient_width; \
+    horizontal_contributors += 1;                 \
+    output = output + 7;                          \
+    /* the next three lines are comment-only padding; the expansion ends with the statement above */ \
+    /* (kept so the macro occupies the same number of source lines as before) */ \
+    /* end of stbir__store_output_tiny */
+
+#define stbir__4_coeff_start()    /* 7-channel scalar: open two partial sums, x (coefficients 0,2) and y (coefficients 1,3) */ \
+    float c, x0, x1, x2, x3, x4, x5, x6, y0, y1, y2, y3, y4, y5, y6; \
+    STBIR_SIMD_NO_UNROLL(decode); \
+    c = hc[0];                    \
+    x0 = c * decode[0];           \
+    x1 = c * decode[1];           \
+    x2 = c * decode[2];           \
+    x3 = c * decode[3];           \
+    x4 = c * decode[4];           \
+    x5 = c * decode[5];           \
+    x6 = c * decode[6];           \
+    c = hc[1];                    \
+    y0 = c * decode[7];           \
+    y1 = c * decode[8];           \
+    y2 = c * decode[9];           \
+    y3 = c * decode[10];          \
+    y4 = c * decode[11];          \
+    y5 = c * decode[12];          \
+    y6 = c * decode[13];          \
+    c = hc[2];                    \
+    x0 += c * decode[14];         \
+    x1 += c * decode[15];         \
+    x2 += c * decode[16];         \
+    x3 += c * decode[17];         \
+    x4 += c * decode[18];         \
+    x5 += c * decode[19];         \
+    x6 += c * decode[20];         \
+    c = hc[3];                    \
+    y0 += c * decode[21];         \
+    y1 += c * decode[22];         \
+    y2 += c * decode[23];         \
+    y3 += c * decode[24];         \
+    y4 += c * decode[25];         \
+    y5 += c * decode[26];         \
+    y6 += c * decode[27];
+
+#define stbir__4_coeff_continue_from_4( ofs ) /* add coefficients ofs..ofs+3 (7 floats each), alternating x and y */ \
+    STBIR_SIMD_NO_UNROLL(decode);  \
+    c = hc[(ofs)+0];               \
+    x0 += c * decode[(ofs)*7+0];   \
+    x1 += c * decode[(ofs)*7+1];   \
+    x2 += c * decode[(ofs)*7+2];   \
+    x3 += c * decode[(ofs)*7+3];   \
+    x4 += c * decode[(ofs)*7+4];   \
+    x5 += c * decode[(ofs)*7+5];   \
+    x6 += c * decode[(ofs)*7+6];   \
+    c = hc[(ofs)+1];               \
+    y0 += c * decode[(ofs)*7+7];   \
+    y1 += c * decode[(ofs)*7+8];   \
+    y2 += c * decode[(ofs)*7+9];   \
+    y3 += c * decode[(ofs)*7+10];  \
+    y4 += c * decode[(ofs)*7+11];  \
+    y5 += c * decode[(ofs)*7+12];  \
+    y6 += c * decode[(ofs)*7+13];  \
+    c = hc[(ofs)+2];               \
+    x0 += c * decode[(ofs)*7+14];  \
+    x1 += c * decode[(ofs)*7+15];  \
+    x2 += c * decode[(ofs)*7+16];  \
+    x3 += c * decode[(ofs)*7+17];  \
+    x4 += c * decode[(ofs)*7+18];  \
+    x5 += c * decode[(ofs)*7+19];  \
+    x6 += c * decode[(ofs)*7+20];  \
+    c = hc[(ofs)+3];               \
+    y0 += c * decode[(ofs)*7+21];  \
+    y1 += c * decode[(ofs)*7+22];  \
+    y2 += c * decode[(ofs)*7+23];  \
+    y3 += c * decode[(ofs)*7+24];  \
+    y4 += c * decode[(ofs)*7+25];  \
+    y5 += c * decode[(ofs)*7+26];  \
+    y6 += c * decode[(ofs)*7+27];
+
+#define stbir__1_coeff_remnant( ofs ) /* 7-channel scalar: fold the single leftover coefficient at ofs into x0..x6 */ \
+    STBIR_SIMD_NO_UNROLL(decode);  \
+    c = hc[0+(ofs)];               \
+    x0 += decode[0+(ofs)*7] * c;   \
+    x1 += decode[1+(ofs)*7] * c;   \
+    x2 += decode[2+(ofs)*7] * c;   \
+    x3 += decode[3+(ofs)*7] * c;   \
+    x4 += decode[4+(ofs)*7] * c;   \
+    x5 += decode[5+(ofs)*7] * c;   \
+    x6 += decode[6+(ofs)*7] * c;   /* last statement: no line continuation after it */
+
+#define stbir__2_coeff_remnant( ofs ) /* 7-channel scalar: fold the two leftover coefficients at ofs, ofs+1 into x and y */ \
+    STBIR_SIMD_NO_UNROLL(decode);  \
+    c = hc[0+(ofs)];               /* first leftover coefficient -> x0..x6 */ \
+    x0 += decode[0+(ofs)*7] * c;   \
+    x1 += decode[1+(ofs)*7] * c;   \
+    x2 += decode[2+(ofs)*7] * c;   \
+    x3 += decode[3+(ofs)*7] * c;   \
+    x4 += decode[4+(ofs)*7] * c;   \
+    x5 += decode[5+(ofs)*7] * c;   \
+    x6 += decode[6+(ofs)*7] * c;   \
+    c = hc[1+(ofs)];               /* second leftover coefficient -> y0..y6 */ \
+    y0 += decode[7+(ofs)*7] * c;   \
+    y1 += decode[8+(ofs)*7] * c;   \
+    y2 += decode[9+(ofs)*7] * c;   \
+    y3 += decode[10+(ofs)*7] * c;  \
+    y4 += decode[11+(ofs)*7] * c;  \
+    y5 += decode[12+(ofs)*7] * c;  \
+    y6 += decode[13+(ofs)*7] * c;  /* last statement: no line continuation after it */
+
+#define stbir__3_coeff_remnant( ofs ) /* 7-channel scalar: fold the three leftover coefficients at ofs..ofs+2 into x, y, x */ \
+    STBIR_SIMD_NO_UNROLL(decode);  \
+    c = hc[0+(ofs)];               /* first leftover coefficient -> x0..x6 */ \
+    x0 += decode[0+(ofs)*7] * c;   \
+    x1 += decode[1+(ofs)*7] * c;   \
+    x2 += decode[2+(ofs)*7] * c;   \
+    x3 += decode[3+(ofs)*7] * c;   \
+    x4 += decode[4+(ofs)*7] * c;   \
+    x5 += decode[5+(ofs)*7] * c;   \
+    x6 += decode[6+(ofs)*7] * c;   \
+    c = hc[1+(ofs)];               /* second leftover coefficient -> y0..y6 */ \
+    y0 += decode[7+(ofs)*7] * c;   \
+    y1 += decode[8+(ofs)*7] * c;   \
+    y2 += decode[9+(ofs)*7] * c;   \
+    y3 += decode[10+(ofs)*7] * c;  \
+    y4 += decode[11+(ofs)*7] * c;  \
+    y5 += decode[12+(ofs)*7] * c;  \
+    y6 += decode[13+(ofs)*7] * c;  \
+    c = hc[2+(ofs)];               /* third leftover coefficient -> x0..x6 again */ \
+    x0 += decode[14+(ofs)*7] * c;  \
+    x1 += decode[15+(ofs)*7] * c;  \
+    x2 += decode[16+(ofs)*7] * c;  \
+    x3 += decode[17+(ofs)*7] * c;  \
+    x4 += decode[18+(ofs)*7] * c;  \
+    x5 += decode[19+(ofs)*7] * c;  \
+    x6 += decode[20+(ofs)*7] * c;  /* last statement: no line continuation after it */
+
+#define stbir__store_output()                     /* write x+y as one 7-float result, then step all three cursors */ \
+    output[0] = x0 + y0;  output[1] = x1 + y1;    \
+    output[2] = x2 + y2;  output[3] = x3 + y3;    \
+    output[4] = x4 + y4;  output[5] = x5 + y5;    \
+    output[6] = x6 + y6;                          \
+    horizontal_coefficients = horizontal_coefficients + coefficient_width; \
+    horizontal_contributors += 1;                 \
+    output = output + 7;                          \
+    /* the next three lines are comment-only padding; the expansion ends with the statement above */ \
+    /* (kept so the macro occupies the same number of source lines as before) */ \
+    /* end of stbir__store_output */
+
+#endif
+
+#define STBIR__horizontal_channels 7    // channel count that goes with the 7-floats-per-pixel macros defined above
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS // NOTE(review): presumably selects the horizontal-routine section of the re-included header - confirm
+#include STBIR__HEADER_FILENAME         // include the file named by STBIR__HEADER_FILENAME with the two macros above defined
+
+
+// include all of the vertical resamplers (both scatter and gather versions): channel counts 1 through 8, each included once plain and once with STB_IMAGE_RESIZE_VERTICAL_CONTINUE defined
+
+#define STBIR__vertical_channels 1
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 1
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 2
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 2
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 3
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 3
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 4
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 4
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 5
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 5
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 6
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 6
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 7
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 7
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 8
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 8
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+typedef void STBIR_VERTICAL_GATHERFUNC( float * output, float const * coeffs, float const ** inputs, float const * input0_end ); // common signature of the vertical gather kernels; callers pass inputs[0] plus the row length as input0_end
+// Dispatch table indexed by (scanline count - 1); stbir__resample_vertical_gather uses it for the first batch (k==0) of up to 8 scanlines
+static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers[ 8 ] =
+{
+  stbir__vertical_gather_with_1_coeffs,stbir__vertical_gather_with_2_coeffs,stbir__vertical_gather_with_3_coeffs,stbir__vertical_gather_with_4_coeffs,stbir__vertical_gather_with_5_coeffs,stbir__vertical_gather_with_6_coeffs,stbir__vertical_gather_with_7_coeffs,stbir__vertical_gather_with_8_coeffs
+};
+// Same indexing; used for every batch after the first of the same output row (the "_cont" variants, presumably accumulating into output - TODO confirm)
+static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers_continues[ 8 ] =
+{
+  stbir__vertical_gather_with_1_coeffs_cont,stbir__vertical_gather_with_2_coeffs_cont,stbir__vertical_gather_with_3_coeffs_cont,stbir__vertical_gather_with_4_coeffs_cont,stbir__vertical_gather_with_5_coeffs_cont,stbir__vertical_gather_with_6_coeffs_cont,stbir__vertical_gather_with_7_coeffs_cont,stbir__vertical_gather_with_8_coeffs_cont
+};
+
+typedef void STBIR_VERTICAL_SCATTERFUNC( float ** outputs, float const * coeffs, float const * input, float const * input_end ); // common signature of the vertical scatter kernels: one input row spread over several output rows
+// Dispatch table indexed by (output row count - 1); chosen when the first target ring buffer entry carries the empty marker
+static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_sets[ 8 ] =
+{
+  stbir__vertical_scatter_with_1_coeffs,stbir__vertical_scatter_with_2_coeffs,stbir__vertical_scatter_with_3_coeffs,stbir__vertical_scatter_with_4_coeffs,stbir__vertical_scatter_with_5_coeffs,stbir__vertical_scatter_with_6_coeffs,stbir__vertical_scatter_with_7_coeffs,stbir__vertical_scatter_with_8_coeffs
+};
+// Same indexing; chosen when the target ring buffer entries are not marked empty (the "_cont" variants, presumably blending into existing data - TODO confirm)
+static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_blends[ 8 ] =
+{
+  stbir__vertical_scatter_with_1_coeffs_cont,stbir__vertical_scatter_with_2_coeffs_cont,stbir__vertical_scatter_with_3_coeffs_cont,stbir__vertical_scatter_with_4_coeffs_cont,stbir__vertical_scatter_with_5_coeffs_cont,stbir__vertical_scatter_with_6_coeffs_cont,stbir__vertical_scatter_with_7_coeffs_cont,stbir__vertical_scatter_with_8_coeffs_cont
+};
+
+
+static void stbir__encode_scanline( stbir__info const * stbir_info, void *output_buffer_data, float * encode_buffer, int row  STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
+{ // Converts one row of float pixels in encode_buffer to the output format and delivers it, either straight into output_buffer_data or through the user's out_pixels_cb.
+  int num_pixels = stbir_info->horizontal.scale_info.output_sub_size;
+  int channels = stbir_info->channels;
+  int width_times_channels = num_pixels * channels; // element count handed to the alpha_unweight and encode_pixels callbacks
+  void * output_buffer;
+
+  // un-alpha weight if we need to
+  if ( stbir_info->alpha_unweight )
+  {
+    STBIR_PROFILE_START( unalpha );
+    stbir_info->alpha_unweight( encode_buffer, width_times_channels );
+    STBIR_PROFILE_END( unalpha );
+  }
+
+  // write directly into output by default
+  output_buffer = output_buffer_data;
+
+  // if we have an output callback, we first convert the encode buffer in place (and then hand that to the callback)
+  if ( stbir_info->out_pixels_cb )
+    output_buffer = encode_buffer;
+
+  STBIR_PROFILE_START( encode );
+  // convert into the output buffer
+  stbir_info->encode_pixels( output_buffer, width_times_channels, encode_buffer );
+  STBIR_PROFILE_END( encode );
+
+  // if we have an output callback, call it to send the data (output_buffer_data is not written by this function in that case)
+  if ( stbir_info->out_pixels_cb )
+    stbir_info->out_pixels_cb( output_buffer, num_pixels, row, stbir_info->user_data );
+}
+
+
+// Get the ring buffer pointer for an index (index must be below ring_buffer_num_entries); the byte offset is computed in size_t, like the other row offsets in this file, so a very large ring buffer cannot overflow int
+static float* stbir__get_ring_buffer_entry(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int index )
+{
+  STBIR_ASSERT( index < stbir_info->ring_buffer_num_entries );
+
+  #ifdef STBIR__SEPARATE_ALLOCATIONS
+    return split_info->ring_buffers[ index ];  // each entry is its own allocation in this build
+  #else
+    return (float*) ( ( (char*) split_info->ring_buffer ) + ( (size_t)index * (size_t)stbir_info->ring_buffer_length_bytes ) );  // entries are packed back to back, ring_buffer_length_bytes apart
+  #endif
+}
+
+// Map an absolute scanline number onto its ring buffer slot and return that slot's float pointer
+static float* stbir__get_ring_buffer_scanline(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int get_scanline)
+{
+  int lines_past_first = get_scanline - split_info->ring_buffer_first_scanline;  // distance from the oldest scanline currently tracked
+  return stbir__get_ring_buffer_entry( stbir_info, split_info, ( split_info->ring_buffer_begin_index + lines_past_first ) % stbir_info->ring_buffer_num_entries );
+}
+
+static void stbir__resample_horizontal_gather(stbir__info const * stbir_info, float* output_buffer, float const * input_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
+{ // Horizontal pass for one scanline: fills output_buffer from input_buffer, by plain copy for a 1:1 point sample and through the selected gather kernel otherwise.
+  float const * decode_buffer = input_buffer - ( stbir_info->scanline_extents.conservative.n0 * stbir_info->effective_channels ); // back the pointer up by conservative.n0 pixels; presumably so the kernel can index with absolute input pixel numbers - TODO confirm
+
+  STBIR_PROFILE_START( horizontal );
+  if ( ( stbir_info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( stbir_info->horizontal.scale_info.scale == 1.0f ) )
+    STBIR_MEMCPY( output_buffer, input_buffer, stbir_info->horizontal.scale_info.output_sub_size * sizeof( float ) * stbir_info->effective_channels );
+  else
+    stbir_info->horizontal_gather_channels( output_buffer, stbir_info->horizontal.scale_info.output_sub_size, decode_buffer, stbir_info->horizontal.contributors, stbir_info->horizontal.coefficients, stbir_info->horizontal.coefficient_width );
+  STBIR_PROFILE_END( horizontal );
+}
+
+static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n, int contrib_n0, int contrib_n1, float const * vertical_coefficients )
+{ // Produces output row n: blends ring buffer scanlines contrib_n0..contrib_n1 with vertical_coefficients, runs the horizontal pass afterwards when vertical_first, then encodes the row.
+  float* encode_buffer = split_info->vertical_buffer;
+  float* decode_buffer = split_info->decode_buffer;
+  int vertical_first = stbir_info->vertical_first;
+  int width = (vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size; // vertical_first rows span the conservative input extent; otherwise they are already output width
+  int width_times_channels = stbir_info->effective_channels * width;
+
+  STBIR_ASSERT( stbir_info->vertical.is_gather );
+
+  // loop over the contributing scanlines and scale into the buffer
+  STBIR_PROFILE_START( vertical );
+  {
+    int k = 0, total = contrib_n1 - contrib_n0 + 1; // k = scanlines already consumed, total = scanlines still to blend
+    STBIR_ASSERT( total > 0 );
+    do {
+      float const * inputs[8];
+      int i, cnt = total; if ( cnt > 8 ) cnt = 8; // at most 8 scanlines per kernel call
+      for( i = 0 ; i < cnt ; i++ )
+        inputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+contrib_n0 );
+
+      // call the N scanlines at a time function (up to 8 scanlines of blending at once); the first batch uses the plain table, later batches the "continues" table
+      ((k==0)?stbir__vertical_gathers:stbir__vertical_gathers_continues)[cnt-1]( (vertical_first) ? decode_buffer : encode_buffer, vertical_coefficients + k, inputs, inputs[0] + width_times_channels );
+      k += cnt;
+      total -= cnt;
+    } while ( total );
+  }
+  STBIR_PROFILE_END( vertical );
+
+  if ( vertical_first )
+  {
+    // Now resample the gathered vertical data in the horizontal axis into the encode buffer
+    stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  }
+
+  stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((size_t)n * (size_t)stbir_info->output_stride_bytes),
+                          encode_buffer, n  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+}
+
+static void stbir__decode_and_resample_for_vertical_gather_loop(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n)
+{ // Decodes input scanline n, records it as the newest scanline of the ring buffer, and stores its horizontally resampled form in the matching ring buffer entry.
+  int ring_buffer_index;
+  float* ring_buffer;
+
+  // Decode the nth scanline from the source image into the decode buffer.
+  stbir__decode_scanline( stbir_info, n, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+
+  // update new end scanline
+  split_info->ring_buffer_last_scanline = n;
+
+  // get ring buffer (slot = begin index plus the distance from the first scanline, wrapped to the entry count)
+  ring_buffer_index = (split_info->ring_buffer_begin_index + (split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
+  ring_buffer = stbir__get_ring_buffer_entry(stbir_info, split_info, ring_buffer_index);
+
+  // Now resample it into the ring buffer.
+  stbir__resample_horizontal_gather( stbir_info, ring_buffer, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+
+  // Now it's sitting in the ring buffer ready to be used as source for the vertical sampling.
+}
+
+static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
+{ // Gather-mode driver: for each output row of the given split(s), loads any input scanlines still missing from the ring buffer and then emits the row.
+  int y, start_output_y, end_output_y;
+  stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
+  float const * vertical_coefficients = stbir_info->vertical.coefficients;
+
+  STBIR_ASSERT( stbir_info->vertical.is_gather );
+
+  start_output_y = split_info->start_output_y;
+  end_output_y = split_info[split_count-1].end_output_y; // the range runs from the first split's start to the last split's end
+
+  vertical_contributors += start_output_y; // one contributor range and one coefficient row per output row
+  vertical_coefficients += start_output_y * stbir_info->vertical.coefficient_width;
+
+  // initialize the ring buffer for gathering
+  split_info->ring_buffer_begin_index = 0;
+  split_info->ring_buffer_first_scanline = vertical_contributors->n0;
+  split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty"
+
+  for (y = start_output_y; y < end_output_y; y++)
+  {
+    int in_first_scanline, in_last_scanline;
+
+    in_first_scanline = vertical_contributors->n0;
+    in_last_scanline = vertical_contributors->n1;
+
+    // make sure the indexing hasn't broken
+    STBIR_ASSERT( in_first_scanline >= split_info->ring_buffer_first_scanline );
+
+    // Load in new scanlines
+    while (in_last_scanline > split_info->ring_buffer_last_scanline)
+    {
+      STBIR_ASSERT( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) <= stbir_info->ring_buffer_num_entries );
+
+      // make sure there was room in the ring buffer when we add new scanlines
+      if ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries )
+      {
+        split_info->ring_buffer_first_scanline++; // drop the oldest scanline to make room
+        split_info->ring_buffer_begin_index++; // not wrapped here; the slot lookups take it modulo ring_buffer_num_entries
+      }
+
+      if ( stbir_info->vertical_first )
+      {
+        float * ring_buffer = stbir__get_ring_buffer_scanline( stbir_info, split_info, ++split_info->ring_buffer_last_scanline );
+        // Decode the nth scanline from the source image straight into its ring buffer entry (the horizontal pass happens after the vertical gather).
+        stbir__decode_scanline( stbir_info, split_info->ring_buffer_last_scanline, ring_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+      }
+      else
+      {
+        stbir__decode_and_resample_for_vertical_gather_loop(stbir_info, split_info, split_info->ring_buffer_last_scanline + 1);
+      }
+    }
+
+    // Now all buffers should be ready to write a row of vertical sampling, so do it.
+    stbir__resample_vertical_gather(stbir_info, split_info, y, in_first_scanline, in_last_scanline, vertical_coefficients );
+
+    ++vertical_contributors;
+    vertical_coefficients += stbir_info->vertical.coefficient_width;
+  }
+}
+
+#define STBIR__FLOAT_EMPTY_MARKER 3.0e+38F // sentinel stored in the first float of a ring buffer entry to flag it as empty (scatter path)
+#define STBIR__FLOAT_BUFFER_IS_EMPTY(ptr) ((ptr)[0]==STBIR__FLOAT_EMPTY_MARKER) // nonzero when the entry's first float equals the sentinel
+
+static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
+{ // Scatter eviction without a horizontal pass: encodes the oldest ring buffer entry to its output row, marks the entry empty and advances the ring.
+  // evict a scanline out into the output buffer
+  float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
+
+  // dump the scanline out (ring_buffer_first_scanline is the output row number of this entry)
+  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+
+  // mark it as empty
+  ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
+
+  // advance the first scanline, wrapping the begin index back to slot 0 at the end of the ring
+  split_info->ring_buffer_first_scanline++;
+  if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
+    split_info->ring_buffer_begin_index = 0;
+}
+
+static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
+{ // Scatter eviction for the vertical_first case: horizontally resamples the oldest ring buffer entry into vertical_buffer, encodes it, marks the entry empty and advances the ring.
+  // evict a scanline out into the output buffer
+
+  float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
+
+  // Now resample it into the buffer.
+  stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+
+  // dump the scanline out (ring_buffer_first_scanline is the output row number of this entry)
+  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+
+  // mark it as empty
+  ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
+
+  // advance the first scanline, wrapping the begin index back to slot 0 at the end of the ring
+  split_info->ring_buffer_first_scanline++;
+  if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
+    split_info->ring_buffer_begin_index = 0;
+}
+
+static void stbir__resample_vertical_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n0, int n1, float const * vertical_coefficients, float const * vertical_buffer, float const * vertical_buffer_end )
+{ // Distributes one input row (vertical_buffer..vertical_buffer_end) into the ring buffer entries for output rows n0..n1, in batches of up to 8 rows.
+  STBIR_ASSERT( !stbir_info->vertical.is_gather );
+
+  STBIR_PROFILE_START( vertical );
+  {
+    int k = 0, total = n1 - n0 + 1; // k = output rows already handled, total = rows still to handle
+    STBIR_ASSERT( total > 0 );
+    do {
+      float * outputs[8];
+      int i, n = total; if ( n > 8 ) n = 8;
+      for( i = 0 ; i < n ; i++ )
+      {
+        outputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+n0 );
+        if ( ( i ) && ( STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[i] ) != STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ) ) ) // make sure runs are of the same type
+        {
+          n = i; // cut the batch short; the differing entry starts the next batch
+          break;
+        }
+      }
+      // call the scatter to N scanlines at a time function (up to 8 scanlines of scattering at once); empty entries use the "sets" table, the others the "blends" table
+      ((STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ))?stbir__vertical_scatter_sets:stbir__vertical_scatter_blends)[n-1]( outputs, vertical_coefficients + k, vertical_buffer, vertical_buffer_end );
+      k += n;
+      total -= n;
+    } while ( total );
+  }
+
+  STBIR_PROFILE_END( vertical );
+}
+
+typedef void stbir__handle_scanline_for_scatter_func(stbir__info const * stbir_info, stbir__per_split_info* split_info); // signature of the two scatter eviction routines chosen between in stbir__vertical_scatter_loop
+
+static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
+{ // Scatter-mode driver: walks the input rows of the given split(s), spreads each one over the output rows it touches in the ring buffer, and evicts finished output rows.
+  int y, start_output_y, end_output_y, start_input_y, end_input_y;
+  stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
+  float const * vertical_coefficients = stbir_info->vertical.coefficients;
+  stbir__handle_scanline_for_scatter_func * handle_scanline_for_scatter;
+  void * scanline_scatter_buffer;
+  void * scanline_scatter_buffer_end;
+  int on_first_input_y, last_input_y;
+
+  STBIR_ASSERT( !stbir_info->vertical.is_gather );
+
+  start_output_y = split_info->start_output_y;
+  end_output_y = split_info[split_count-1].end_output_y;  // may do multiple split counts
+
+  start_input_y = split_info->start_input_y;
+  end_input_y = split_info[split_count-1].end_input_y;
+
+  // adjust for starting offset start_input_y (in scatter mode the contributor and coefficient rows are per input row, offset by the filter margin)
+  y = start_input_y + stbir_info->vertical.filter_pixel_margin;
+  vertical_contributors += y ;
+  vertical_coefficients += stbir_info->vertical.coefficient_width * y;
+
+  if ( stbir_info->vertical_first )
+  {
+    handle_scanline_for_scatter = stbir__horizontal_resample_and_encode_first_scanline_from_scatter;
+    scanline_scatter_buffer = split_info->decode_buffer; // scatter the decoded row directly; the horizontal pass runs at eviction time
+    scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * (stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1);
+  }
+  else
+  {
+    handle_scanline_for_scatter = stbir__encode_first_scanline_from_scatter;
+    scanline_scatter_buffer = split_info->vertical_buffer; // scatter the row after it has been resampled horizontally
+    scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * stbir_info->horizontal.scale_info.output_sub_size;
+  }
+
+  // initialize the ring buffer for scattering
+  split_info->ring_buffer_first_scanline = start_output_y;
+  split_info->ring_buffer_last_scanline = -1;
+  split_info->ring_buffer_begin_index = -1; // negative means "not chosen yet"; set on the first contributing input row below
+
+  // mark all the buffers as empty to start
+  for( y = 0 ; y < stbir_info->ring_buffer_num_entries ; y++ )
+    stbir__get_ring_buffer_entry( stbir_info, split_info, y )[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter
+
+  // do the loop in input space
+  on_first_input_y = 1; last_input_y = start_input_y;
+  for (y = start_input_y ; y < end_input_y; y++)
+  {
+    int out_first_scanline, out_last_scanline;
+
+    out_first_scanline = vertical_contributors->n0;
+    out_last_scanline = vertical_contributors->n1;
+
+    STBIR_ASSERT(out_last_scanline - out_first_scanline + 1 <= stbir_info->ring_buffer_num_entries);
+
+    if ( ( out_last_scanline >= out_first_scanline ) && ( ( ( out_first_scanline >= start_output_y ) && ( out_first_scanline < end_output_y ) ) || ( ( out_last_scanline >= start_output_y ) && ( out_last_scanline < end_output_y ) ) ) ) // non-empty range with at least one end inside this split's output rows
+    {
+      float const * vc = vertical_coefficients;
+
+      // keep track of the range actually seen for the next resize
+      last_input_y = y;
+      if ( ( on_first_input_y ) && ( y > start_input_y ) )
+        split_info->start_input_y = y;
+      on_first_input_y = 0;
+
+      // clip the region
+      if ( out_first_scanline < start_output_y )
+      {
+        vc += start_output_y - out_first_scanline; // skip the coefficients of the rows clipped off the front
+        out_first_scanline = start_output_y;
+      }
+
+      if ( out_last_scanline >= end_output_y )
+        out_last_scanline = end_output_y - 1;
+
+      // if very first scanline, init the index
+      if (split_info->ring_buffer_begin_index < 0)
+        split_info->ring_buffer_begin_index = out_first_scanline - start_output_y;
+
+      STBIR_ASSERT( split_info->ring_buffer_begin_index <= out_first_scanline );
+
+      // Decode the nth scanline from the source image into the decode buffer.
+      stbir__decode_scanline( stbir_info, y, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+
+      // When horizontal first, we resample horizontally into the vertical buffer before we scatter it out
+      if ( !stbir_info->vertical_first )
+        stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+
+      // Now it's sitting in the buffer ready to be distributed into the ring buffers.
+
+      // evict the oldest scanline from the ring buffer if it is full and this row reaches past its last scanline
+      if ( ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries ) &&
+           ( out_last_scanline > split_info->ring_buffer_last_scanline ) )
+        handle_scanline_for_scatter( stbir_info, split_info );
+
+      // Now the horizontal buffer is ready to write to all ring buffer rows, so do it.
+      stbir__resample_vertical_scatter(stbir_info, split_info, out_first_scanline, out_last_scanline, vc, (float*)scanline_scatter_buffer, (float*)scanline_scatter_buffer_end );
+
+      // update the end of the buffer
+      if ( out_last_scanline > split_info->ring_buffer_last_scanline )
+        split_info->ring_buffer_last_scanline = out_last_scanline;
+    }
+    ++vertical_contributors;
+    vertical_coefficients += stbir_info->vertical.coefficient_width;
+  }
+
+  // now evict the scanlines that are left over in the ring buffer
+  while ( split_info->ring_buffer_first_scanline < end_output_y )
+    handle_scanline_for_scatter(stbir_info, split_info);
+
+  // update the end_input_y if we do multiple resizes with the same data
+  ++last_input_y; // convert to an exclusive end
+  for( y = 0 ; y < split_count; y++ )
+    if ( split_info[y].end_input_y > last_input_y )
+      split_info[y].end_input_y = last_input_y;
+}
+
+
+static stbir__kernel_callback * stbir__builtin_kernels[] =   { 0, stbir__filter_trapezoid,  stbir__filter_triangle, stbir__filter_cubic, stbir__filter_catmullrom, stbir__filter_mitchell, stbir__filter_point }; // indexed by the stbir_filter value in stbir__set_sampler; slot 0 is null because filter 0 is replaced by a default before the lookup
+static stbir__support_callback * stbir__builtin_supports[] = { 0, stbir__support_trapezoid, stbir__support_one,     stbir__support_two,  stbir__support_two,       stbir__support_two,     stbir__support_zeropoint5 }; // support callbacks in the same order as stbir__builtin_kernels
+
+static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir__kernel_callback * kernel, stbir__support_callback * support, stbir_edge edge, stbir__scale_info * scale_info, int always_gather, void * user_data )
+{ // Fills in samp for one axis: picks the filter (or the user's kernel/support pair), the gather/scatter mode, and the derived widths, margins and buffer sizes.
+  // set filter
+  if (filter == 0)
+  {
+    filter = STBIR_DEFAULT_FILTER_DOWNSAMPLE; // default to downsample
+    if (scale_info->scale >= ( 1.0f - stbir__small_float ) )
+    {
+      if ( (scale_info->scale <= ( 1.0f + stbir__small_float ) ) && ( STBIR_CEILF(scale_info->pixel_shift) == scale_info->pixel_shift ) ) // scale of about 1 with a whole-pixel shift
+        filter = STBIR_FILTER_POINT_SAMPLE;
+      else
+        filter = STBIR_DEFAULT_FILTER_UPSAMPLE;
+    }
+  }
+  samp->filter_enum = filter;
+
+  STBIR_ASSERT(samp->filter_enum != 0);
+  STBIR_ASSERT((unsigned)samp->filter_enum < STBIR_FILTER_OTHER);
+  samp->filter_kernel = stbir__builtin_kernels[ filter ];
+  samp->filter_support = stbir__builtin_supports[ filter ];
+
+  if ( kernel && support ) // a user callback pair overrides the builtin filter; both must be supplied
+  {
+    samp->filter_kernel = kernel;
+    samp->filter_support = support;
+    samp->filter_enum = STBIR_FILTER_OTHER;
+  }
+
+  samp->edge = edge;
+  samp->filter_pixel_width  = stbir__get_filter_pixel_width (samp->filter_support, scale_info->scale, user_data );
+  // Gather is always better, but in extreme downsamples, you have to have most or all of the data in memory
+  //    For horizontal, we always have all the pixels, so we always use gather here (always_gather==1).
+  //    For vertical, we use gather if scaling up (which means we will have samp->filter_pixel_width
+  //    scanlines in memory at once).
+  samp->is_gather = 0; // 0 = scatter, 1 = gather when not downsampling, 2 = gather when downsampling
+  if ( scale_info->scale >= ( 1.0f - stbir__small_float ) )
+    samp->is_gather = 1;
+  else if ( ( always_gather ) || ( samp->filter_pixel_width <= STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT ) )
+    samp->is_gather = 2;
+
+  // pre calculate stuff based on the above
+  samp->coefficient_width = stbir__get_coefficient_width(samp, samp->is_gather, user_data);
+
+  // filter_pixel_width is the conservative size in pixels of input that affect an output pixel.
+  //   In rare cases (only with 2 pix to 1 pix with the default filters), it's possible that the
+  //   filter will extend before or after the scanline beyond just one extra entire copy of the
+  //   scanline (we would hit the edge twice). We don't let you do that, so we clamp the total
+  //   width to 3x the total of input pixel (once for the scanline, once for the left side
+  //   overhang, and once for the right side). We only do this for edge wrap mode, since the other
+  //   modes can just re-edge clamp back in again.
+  if ( edge == STBIR_EDGE_WRAP )
+    if ( samp->filter_pixel_width > ( scale_info->input_full_size * 3 ) )
+      samp->filter_pixel_width = scale_info->input_full_size * 3;
+
+  // This is how much to expand buffers to account for filters seeking outside
+  // the image boundaries.
+  samp->filter_pixel_margin = samp->filter_pixel_width / 2;
+
+  // filter_pixel_margin is the amount that this filter can overhang on just one side of either
+  //   end of the scanline (left or the right). Since we only allow you to overhang 1 scanline's
+  //   worth of pixels, we clamp this one side of overhang to the input scanline size. Again,
+  //   this clamping only happens in rare cases with the default filters (2 pix to 1 pix).
+  if ( edge == STBIR_EDGE_WRAP )
+    if ( samp->filter_pixel_margin > scale_info->input_full_size )
+      samp->filter_pixel_margin = scale_info->input_full_size;
+
+  samp->num_contributors = stbir__get_contributors(samp, samp->is_gather);
+
+  samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors);
+  samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float); // extra sizeof(float) is padding
+
+  samp->gather_prescatter_contributors = 0;
+  samp->gather_prescatter_coefficients = 0;
+  if ( samp->is_gather == 0 ) // scatter mode also sizes a gather-style (mode 2) contributor and coefficient set; the remaining prescatter fields are only written here
+  {
+    samp->gather_prescatter_coefficient_width = samp->filter_pixel_width;
+    samp->gather_prescatter_num_contributors  = stbir__get_contributors(samp, 2);
+    samp->gather_prescatter_contributors_size = samp->gather_prescatter_num_contributors * sizeof(stbir__contributors);
+    samp->gather_prescatter_coefficients_size = samp->gather_prescatter_num_contributors * samp->gather_prescatter_coefficient_width * sizeof(float);
+  }
+}
+
+static void stbir__get_conservative_extents( stbir__sampler * samp, stbir__contributors * range, void * user_data )
+{ // Writes to range->n0..n1 a conservative span of input pixels needed by a gather sampler, then merges (wrap edge mode) or clamps (other edge modes) it against the input size.
+  float scale = samp->scale_info.scale;
+  float out_shift = samp->scale_info.pixel_shift;
+  stbir__support_callback * support = samp->filter_support;
+  int input_full_size = samp->scale_info.input_full_size;
+  stbir_edge edge = samp->edge;
+  float inv_scale = samp->scale_info.inv_scale;
+
+  STBIR_ASSERT( samp->is_gather != 0 );
+
+  if ( samp->is_gather == 1 ) // first input pixel of output pixel 0 and last input pixel of the final output pixel
+  {
+    int in_first_pixel, in_last_pixel;
+    float out_filter_radius = support(inv_scale, user_data) * scale;
+
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, 0.5, out_filter_radius, inv_scale, out_shift, input_full_size, edge );
+    range->n0 = in_first_pixel;
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, ( (float)(samp->scale_info.output_sub_size-1) ) + 0.5f, out_filter_radius, inv_scale, out_shift, input_full_size, edge );
+    range->n1 = in_last_pixel;
+  }
+  else if ( samp->is_gather == 2 ) // downsample gather, refine
+  {
+    float in_pixels_radius = support(scale, user_data) * inv_scale;
+    int filter_pixel_margin = samp->filter_pixel_margin;
+    int output_sub_size = samp->scale_info.output_sub_size;
+    int input_end;
+    int n;
+    int in_first_pixel, in_last_pixel;
+
+    // get a conservative area of the input range (output positions 0 and output_sub_size, zero radius)
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, 0, 0, inv_scale, out_shift, input_full_size, edge );
+    range->n0 = in_first_pixel;
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, (float)output_sub_size, 0, inv_scale, out_shift, input_full_size, edge );
+    range->n1 = in_last_pixel;
+
+    // now go through the margin to the start of area to find bottom
+    n = range->n0 + 1;
+    input_end = -filter_pixel_margin;
+    while( n >= input_end )
+    {
+      int out_first_pixel, out_last_pixel;
+      stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, ((float)n)+0.5f, in_pixels_radius, scale, out_shift, output_sub_size );
+      if ( out_first_pixel > out_last_pixel ) // input pixel n maps to an empty output range; stop walking
+        break;
+
+      if ( ( out_first_pixel < output_sub_size ) || ( out_last_pixel >= 0 ) )
+        range->n0 = n;
+      --n;
+    }
+
+    // now go through the end of the area through the margin to find top
+    n = range->n1 - 1;
+    input_end = n + 1 + filter_pixel_margin;
+    while( n <= input_end )
+    {
+      int out_first_pixel, out_last_pixel;
+      stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, ((float)n)+0.5f, in_pixels_radius, scale, out_shift, output_sub_size );
+      if ( out_first_pixel > out_last_pixel ) // input pixel n maps to an empty output range; stop walking
+        break;
+      if ( ( out_first_pixel < output_sub_size ) || ( out_last_pixel >= 0 ) )
+        range->n1 = n;
+      ++n;
+    }
+  }
+
+  if ( samp->edge == STBIR_EDGE_WRAP )
+  {
+    // if we are wrapping, and we are very close to the image size (so the edges might merge), just use the scanline up to the edge
+    if ( ( range->n0 > 0 ) && ( range->n1 >= input_full_size ) )
+    {
+      int marg = range->n1 - input_full_size + 1; // pixels read past the right edge
+      if ( ( marg + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= range->n0 )
+        range->n0 = 0;
+    }
+    if ( ( range->n0 < 0 ) && ( range->n1 < (input_full_size-1) ) )
+    {
+      int marg = -range->n0; // pixels read before the left edge
+      if ( ( input_full_size - marg - STBIR__MERGE_RUNS_PIXEL_THRESHOLD - 1 ) <= range->n1 )
+        range->n1 = input_full_size - 1;
+    }
+  }
+  else
+  {
+    // for non-edge-wrap modes, we never read over the edge, so clamp
+    if ( range->n0 < 0 )
+      range->n0 = 0;
+    if ( range->n1 >= input_full_size )
+      range->n1 = input_full_size - 1;
+  }
+}
+
+static void stbir__get_split_info( stbir__per_split_info* split_info, int splits, int output_height, int vertical_pixel_margin, int input_full_height )
+{
+  int index;
+  int next_y = 0, rows_left = output_height;
+
+  // hand out contiguous output row ranges, dividing the rows still unassigned evenly among the splits not yet visited
+  for( index = 0 ; index < splits ; index++ )
+  {
+    stbir__per_split_info * split = split_info + index;
+    int rows = rows_left / ( splits - index );
+    split->start_output_y = next_y;
+    next_y += rows;
+    split->end_output_y = next_y;
+    rows_left -= rows;
+
+    // every split begins with the widest possible input range; the scatter loop narrows it as it runs
+    split->start_input_y = -vertical_pixel_margin;
+    split->end_input_y = input_full_height + vertical_pixel_margin;
+  }
+}
+
+static void stbir__free_internal_mem( stbir__info *info )
+{ // Releases the memory owned by info; a null info is ignored.
+  #define STBIR__FREE_AND_CLEAR( ptr ) { if ( ptr ) { void * p = (ptr); (ptr) = 0; STBIR_FREE( p, info->user_data); } } // clears the pointer before freeing it, and skips null pointers
+
+  if ( info )
+  {
+  #ifndef STBIR__SEPARATE_ALLOCATIONS
+    STBIR__FREE_AND_CLEAR( info->alloced_mem ); // the only free in this build; presumably info itself lives inside this allocation - TODO confirm
+  #else
+    int i,j;
+
+    if ( ( info->vertical.gather_prescatter_contributors ) && ( (void*)info->vertical.gather_prescatter_contributors != (void*)info->split_info[0].decode_buffer ) ) // skipped when the prescatter data aliases split 0's decode buffer, which is freed below
+    {
+      STBIR__FREE_AND_CLEAR( info->vertical.gather_prescatter_coefficients );
+      STBIR__FREE_AND_CLEAR( info->vertical.gather_prescatter_contributors );
+    }
+    for( i = 0 ; i < info->splits ; i++ )
+    {
+      for( j = 0 ; j < info->alloc_ring_buffer_num_entries ; j++ )
+      {
+        #ifdef STBIR_SIMD8
+        if ( info->effective_channels == 3 )
+          --info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer
+        #endif
+        STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers[j] );
+      }
+
+      #ifdef STBIR_SIMD8
+      if ( info->effective_channels == 3 )
+        --info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer
+      #endif
+      STBIR__FREE_AND_CLEAR( info->split_info[i].decode_buffer );
+      STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers );
+      STBIR__FREE_AND_CLEAR( info->split_info[i].vertical_buffer );
+    }
+    STBIR__FREE_AND_CLEAR( info->split_info );
+    if ( info->vertical.coefficients != info->horizontal.coefficients ) // the two axes may share one coefficient set; free it only once
+    {
+      STBIR__FREE_AND_CLEAR( info->vertical.coefficients );
+      STBIR__FREE_AND_CLEAR( info->vertical.contributors );
+    }
+    STBIR__FREE_AND_CLEAR( info->horizontal.coefficients );
+    STBIR__FREE_AND_CLEAR( info->horizontal.contributors );
+    STBIR__FREE_AND_CLEAR( info->alloced_mem );
+    STBIR_FREE( info, info->user_data ); // info is its own allocation in this build
+  #endif
+  }
+
+  #undef STBIR__FREE_AND_CLEAR
+}
+
+// Returns the largest share handed out when `height` rows are dealt to `splits` splits, each split taking rows_left / splits_left (0 when splits <= 0).
+static int stbir__get_max_split( int splits, int height )
+{
+  int splits_left;
+  int largest = 0;
+
+  for( splits_left = splits ; splits_left > 0 ; splits_left-- )
+  {
+    int share = height / splits_left;
+    largest = ( share > largest ) ? share : largest;
+    height -= share;
+  }
+  return largest;
+}
+
+static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_n_coeffs_funcs[8] =
+{ // indexed [effective channel count][widest coefficient count & 3]; channel counts with no table hold 0
+  0, stbir__horizontal_gather_1_channels_with_n_coeffs_funcs, stbir__horizontal_gather_2_channels_with_n_coeffs_funcs, stbir__horizontal_gather_3_channels_with_n_coeffs_funcs, stbir__horizontal_gather_4_channels_with_n_coeffs_funcs, 0,0, stbir__horizontal_gather_7_channels_with_n_coeffs_funcs
+};
+// indexed [effective channel count][widest coefficient count - 1]; chosen instead of the table above when that count is <= 12
+static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_channels_funcs[8] =
+{
+  0, stbir__horizontal_gather_1_channels_funcs, stbir__horizontal_gather_2_channels_funcs, stbir__horizontal_gather_3_channels_funcs, stbir__horizontal_gather_4_channels_funcs, 0,0, stbir__horizontal_gather_7_channels_funcs
+};
+
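+// there are eight resize classification slots (see stbir__should_do_vertical_first): 0 == vertical scatter at <= 1x scale, 1 == vertical gather at <= 1x scale, 2 == 1x-2x vertical scale, 3 == 2x-3x vertical scale, 4 == unused, 5 == 3x-4x vertical scale, 6 == > 4x vertical scale, or an output <= 4 pixels in either dimension whose height is less than its width, 7 == any other output <= 4 pixels in either dimension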
+#define STBIR_RESIZE_CLASSIFICATIONS 8
+// each row is { w0, w1, w2, w3 }: stbir__should_do_vertical_first uses w0,w1 in its h_cost estimate and w2,w3 in its v_cost estimate
+static float stbir__compute_weights[5][STBIR_RESIZE_CLASSIFICATIONS][4]=  // 5 = 0=1chan, 1=2chan, 2=3chan, 3=4chan, 4=7chan
+{
+  { // 1 channel
+    { 1.00000f, 1.00000f, 0.31250f, 1.00000f },
+    { 0.56250f, 0.59375f, 0.00000f, 0.96875f },
+    { 1.00000f, 0.06250f, 0.00000f, 1.00000f },
+    { 0.00000f, 0.09375f, 1.00000f, 1.00000f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f }, // row 4 is never selected by stbir__should_do_vertical_first
+    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
+    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.00000f, 0.03125f },
+  }, { // 2 channels
+    { 0.00000f, 0.84375f, 0.00000f, 0.03125f },
+    { 0.09375f, 0.93750f, 0.00000f, 0.78125f },
+    { 0.87500f, 0.21875f, 0.00000f, 0.96875f },
+    { 0.09375f, 0.09375f, 1.00000f, 1.00000f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
+    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
+    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.00000f, 0.53125f },
+  }, { // 3 channels
+    { 0.00000f, 0.53125f, 0.00000f, 0.03125f },
+    { 0.06250f, 0.96875f, 0.00000f, 0.53125f },
+    { 0.87500f, 0.18750f, 0.00000f, 0.93750f },
+    { 0.00000f, 0.09375f, 1.00000f, 1.00000f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
+    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
+    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.00000f, 0.56250f },
+  }, { // 4 channels
+    { 0.00000f, 0.50000f, 0.00000f, 0.71875f },
+    { 0.06250f, 0.84375f, 0.00000f, 0.87500f },
+    { 1.00000f, 0.50000f, 0.50000f, 0.96875f },
+    { 1.00000f, 0.09375f, 0.31250f, 0.50000f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
+    { 1.00000f, 0.03125f, 0.03125f, 0.53125f },
+    { 0.18750f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.03125f, 0.18750f },
+  }, { // 7 channels
+    { 0.00000f, 0.59375f, 0.00000f, 0.96875f },
+    { 0.06250f, 0.81250f, 0.06250f, 0.59375f },
+    { 0.75000f, 0.43750f, 0.12500f, 0.96875f },
+    { 0.87500f, 0.06250f, 0.18750f, 0.43750f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
+    { 0.15625f, 0.12500f, 1.00000f, 1.00000f },
+    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.03125f, 0.34375f },
+  }
+};
+
+// structure that allows us to query and override info for training the costs
+typedef struct STBIR__V_FIRST_INFO
+{
+  double v_cost, h_cost; // the two cost estimates computed by stbir__should_do_vertical_first
+  int control_v_first; // 0 = no control, 1 = force hori, 2 = force vert
+  int v_first; // the computed decision (1 = vertical first), stored before control_v_first is applied
+  int v_resize_classification; // which row of the weights table was used
+  int is_gather; // copy of the is_gather argument
+} STBIR__V_FIRST_INFO;
+// if STBIR__V_FIRST_INFO_BUFFER is defined (as a variable name), a static struct of that name is declared and its address is what STBIR__V_FIRST_INFO_POINTER yields; otherwise the pointer is 0
+#ifdef STBIR__V_FIRST_INFO_BUFFER
+static STBIR__V_FIRST_INFO STBIR__V_FIRST_INFO_BUFFER = {0};
+#define STBIR__V_FIRST_INFO_POINTER &STBIR__V_FIRST_INFO_BUFFER
+#else
+#define STBIR__V_FIRST_INFO_POINTER 0
+#endif
+
+// Figure out whether to scale along the horizontal or vertical first.
+//   This is only *super* important when you are scaling by a massively
+//   different amount in the vertical vs the horizontal (for example, if
+//   you are scaling by 2x in the width, and 0.5x in the height, then you
+//   want to do the vertical scale first, because it's around 3x faster
+//   in that order).
+//
+//   In more normal circumstances, this makes a 20-40% difference, so
+//     it's good to get right, but not critical. The normal way that you
+//     decide which direction goes first is just figuring out which
+//     direction does more multiplies. But with modern CPUs with their
+//     fancy caches and SIMD and high IPC abilities, there's just a lot
+//     more that goes into it.
+//
+//   My handwavy sort of solution is to have an app that does a whole
+//     bunch of timing for both vertical and horizontal first modes,
+//     and then another app that can read lots of these timing files
+//     and try to search for the best weights to use. Dotimings.c
+//     is the app that does a bunch of timings, and vf_train.c is the
+//     app that solves for the best weights (and shows how well it
+//     does currently).
+
+static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLASSIFICATIONS][4], int horizontal_filter_pixel_width, float horizontal_scale, int horizontal_output_size, int vertical_filter_pixel_width, float vertical_scale, int vertical_output_size, int is_gather, STBIR__V_FIRST_INFO * info )
+{
+  // Picks the pass order: returns 1 to resize vertically first, 0 to resize horizontally first.
+  //   weights_table holds one row of four weights per resize classification; info may be null,
+  //   otherwise it receives the intermediate values and its control_v_first can force the result.
+  int bucket, choose_vertical;
+  float * w;
+  double horiz_cost, vert_cost;
+
+  // categorize the resize into buckets (outputs of 4 pixels or fewer in either direction are bucketed separately)
+  if ( ( vertical_output_size > 4 ) && ( horizontal_output_size > 4 ) )
+  {
+    if ( vertical_scale <= 1.0f )
+      bucket = ( is_gather ) ? 1 : 0;
+    else if ( vertical_scale <= 2.0f )
+      bucket = 2;
+    else if ( vertical_scale <= 3.0f )
+      bucket = 3;
+    else
+      bucket = ( vertical_scale <= 4.0f ) ? 5 : 6;
+  }
+  else
+    bucket = ( vertical_output_size < horizontal_output_size ) ? 6 : 7;
+
+  // use the right weights
+  w = weights_table[ bucket ];
+  // this is the costs when you don't take into account modern CPUs with high ipc and simd and caches - wish we had a better estimate
+  horiz_cost = (float)horizontal_filter_pixel_width * w[0] + horizontal_scale * (float)vertical_filter_pixel_width * w[1];
+  vert_cost = (float)vertical_filter_pixel_width  * w[2] + vertical_scale * (float)horizontal_filter_pixel_width * w[3];
+
+  // use computation estimate to decide vertical first or not (a tie picks vertical first)
+  choose_vertical = ( vert_cost <= horiz_cost );
+
+  if ( info )
+  {
+    // save these, if requested
+    info->h_cost = horiz_cost;
+    info->v_cost = vert_cost;
+    info->v_resize_classification = bucket;
+    info->v_first = choose_vertical;
+    info->is_gather = is_gather;
+    // and this allows us to override everything for testing (see dotiming.c)
+    if ( info->control_v_first )
+      choose_vertical = ( info->control_v_first == 2 );
+  }
+  return choose_vertical;
+}
+
+// layout lookups - must match stbir_internal_pixel_layout
+static unsigned char stbir__pixel_channels[] = { // channel count of each layout, indexed by stbir_internal_pixel_layout
+  1,2,3,3,4,   // 1ch, 2ch, rgb, bgr, 4ch
+  4,4,4,4,2,2, // RGBA,BGRA,ARGB,ABGR,RA,AR
+  4,4,4,4,2,2, // RGBA_PM,BGRA_PM,ARGB_PM,ABGR_PM,RA_PM,AR_PM
+};
+
+// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
+//   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible
+static stbir_internal_pixel_layout stbir__pixel_layout_convert_public_to_internal[] = { // indexed by the public stbir_pixel_layout value
+  STBIRI_BGR, STBIRI_1CHANNEL, STBIRI_2CHANNEL, STBIRI_RGB, STBIRI_RGBA,
+  STBIRI_4CHANNEL, STBIRI_BGRA, STBIRI_ARGB, STBIRI_ABGR, STBIRI_RA, STBIRI_AR,
+  STBIRI_RGBA_PM, STBIRI_BGRA_PM, STBIRI_ARGB_PM, STBIRI_ABGR_PM, STBIRI_RA_PM, STBIRI_AR_PM,
+};
+
+// Sizes, allocates and initializes the stbir__info for one resize and builds the horizontal and vertical filters into it.
+//   The for(;;) body runs twice: while alloced == 0 it only adds up sizes (in the default single-allocation build), then it runs again to hand out the memory and fill everything in.
+//   Returns the new info, or 0 if the input and output channel counts differ or an allocation fails.
+static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sampler * horizontal, stbir__sampler * vertical, stbir__contributors * conservative, stbir_pixel_layout input_pixel_layout_public, stbir_pixel_layout output_pixel_layout_public, int splits, int new_x, int new_y, int fast_alpha, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO )
+{
+  static char stbir_channel_count_index[8]={ 9,0,1,2, 3,9,9,4 }; // effective channel count (1,2,3,4,7) -> first index of stbir__compute_weights; 9 fills the unused slots
+  stbir__info * info = 0;
+  void * alloced = 0;
+  size_t alloced_total = 0;
+  int vertical_first;
+  int decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size, alloc_ring_buffer_num_entries;
+
+  int alpha_weighting_type = 0; // 0=none, 1=simple weight only, 2=fancy weight+unweight, 3=simple unweight only, 4=simple weight+unweight (fast_alpha)
+  int conservative_split_output_size = stbir__get_max_split( splits, vertical->scale_info.output_sub_size );
+  stbir_internal_pixel_layout input_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ input_pixel_layout_public ];
+  stbir_internal_pixel_layout output_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ output_pixel_layout_public ];
+  int channels = stbir__pixel_channels[ input_pixel_layout ];
+  int effective_channels = channels;
+
+  // first figure out what type of alpha weighting to use (if any)
+  if ( ( horizontal->filter_enum != STBIR_FILTER_POINT_SAMPLE ) || ( vertical->filter_enum != STBIR_FILTER_POINT_SAMPLE ) ) // no alpha weighting on point sampling
+  {
+    if ( ( input_pixel_layout >= STBIRI_RGBA ) && ( input_pixel_layout <= STBIRI_AR ) && ( output_pixel_layout >= STBIRI_RGBA ) && ( output_pixel_layout <= STBIRI_AR ) )
+    {
+      if ( fast_alpha )
+      {
+        alpha_weighting_type = 4;
+      }
+      else
+      {
+        static int fancy_alpha_effective_cnts[6] = { 7, 7, 7, 7, 3, 3 };
+        alpha_weighting_type = 2;
+        effective_channels = fancy_alpha_effective_cnts[ input_pixel_layout - STBIRI_RGBA ];
+      }
+    }
+    else if ( ( input_pixel_layout >= STBIRI_RGBA_PM ) && ( input_pixel_layout <= STBIRI_AR_PM ) && ( output_pixel_layout >= STBIRI_RGBA ) && ( output_pixel_layout <= STBIRI_AR ) )
+    {
+      // input premult, output non-premult
+      alpha_weighting_type = 3;
+    }
+    else if ( ( input_pixel_layout >= STBIRI_RGBA ) && ( input_pixel_layout <= STBIRI_AR ) && ( output_pixel_layout >= STBIRI_RGBA_PM ) && ( output_pixel_layout <= STBIRI_AR_PM ) )
+    {
+      // input non-premult, output premult
+      alpha_weighting_type = 1;
+    }
+  }
+
+  // channel in and out count must match currently
+  if ( channels != stbir__pixel_channels[ output_pixel_layout ] )
+    return 0;
+
+  // get vertical first
+  vertical_first = stbir__should_do_vertical_first( stbir__compute_weights[ (int)stbir_channel_count_index[ effective_channels ] ], horizontal->filter_pixel_width, horizontal->scale_info.scale, horizontal->scale_info.output_sub_size, vertical->filter_pixel_width, vertical->scale_info.scale, vertical->scale_info.output_sub_size, vertical->is_gather, STBIR__V_FIRST_INFO_POINTER );
+
+  // sometimes read one float off in some of the unrolled loops (with a weight of zero coeff, so it doesn't have an effect)
+  decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float); // extra float for padding
+
+#if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8)
+  if ( effective_channels == 3 )
+    decode_buffer_size += sizeof(float); // avx in 3 channel mode needs one float at the start of the buffer (only with separate allocations)
+#endif
+
+  ring_buffer_length_bytes = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float); // extra float for padding
+
+  // if we do vertical first, the ring buffer holds a whole decoded line
+  if ( vertical_first )
+    ring_buffer_length_bytes = ( decode_buffer_size + 15 ) & ~15;
+
+  if ( ( ring_buffer_length_bytes & 4095 ) == 0 ) ring_buffer_length_bytes += 64*3; // avoid 4k alias
+
+  // One extra entry because floating point precision problems sometimes cause an extra to be necessary.
+  alloc_ring_buffer_num_entries = vertical->filter_pixel_width + 1;
+
+  // we never need more ring buffer entries than the scanlines we're outputting when in scatter mode
+  if ( ( !vertical->is_gather ) && ( alloc_ring_buffer_num_entries > conservative_split_output_size ) )
+    alloc_ring_buffer_num_entries = conservative_split_output_size;
+
+  ring_buffer_size = alloc_ring_buffer_num_entries * ring_buffer_length_bytes;
+
+  // The vertical buffer is used differently, depending on whether we are scattering
+  //   the vertical scanlines, or gathering them.
+  //   If scattering, it's used at the temp buffer to accumulate each output.
+  //   If gathering, it's just the output buffer.
+  vertical_buffer_size = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float);  // extra float for padding
+
+  // we make two passes through this loop, 1st to add everything up, 2nd to allocate and init
+  for(;;)
+  {
+    int i;
+    void * advance_mem = alloced;
+    int copy_horizontal = 0;
+    stbir__sampler * possibly_use_horizontal_for_pivot = 0;
+
+#ifdef STBIR__SEPARATE_ALLOCATIONS
+    #define STBIR__NEXT_PTR( ptr, size, ntype ) if ( alloced ) { void * p = STBIR_MALLOC( size, user_data); if ( p == 0 ) { stbir__free_internal_mem( info ); return 0; } (ptr) = (ntype*)p; }
+#else
+    #define STBIR__NEXT_PTR( ptr, size, ntype ) advance_mem = (void*) ( ( ((size_t)advance_mem) + 15 ) & ~15 ); if ( alloced ) ptr = (ntype*)advance_mem; advance_mem = ((char*)advance_mem) + (size);
+#endif
+
+    STBIR__NEXT_PTR( info, sizeof( stbir__info ), stbir__info );
+
+    STBIR__NEXT_PTR( info->split_info, sizeof( stbir__per_split_info ) * splits, stbir__per_split_info );
+
+    if ( info )
+    {
+      static stbir__alpha_weight_func * fancy_alpha_weights[6]  =    { stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_2ch,   stbir__fancy_alpha_weight_2ch };
+      static stbir__alpha_unweight_func * fancy_alpha_unweights[6] = { stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_2ch, stbir__fancy_alpha_unweight_2ch };
+      static stbir__alpha_weight_func * simple_alpha_weights[6] = { stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_2ch, stbir__simple_alpha_weight_2ch };
+      static stbir__alpha_unweight_func * simple_alpha_unweights[6] = { stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_2ch, stbir__simple_alpha_unweight_2ch };
+
+      // initialize info fields
+      info->alloced_mem = alloced;
+      info->alloced_total = alloced_total;
+
+      info->channels = channels;
+      info->effective_channels = effective_channels;
+
+      info->offset_x = new_x;
+      info->offset_y = new_y;
+      info->alloc_ring_buffer_num_entries = alloc_ring_buffer_num_entries;
+      info->ring_buffer_num_entries = 0;
+      info->ring_buffer_length_bytes = ring_buffer_length_bytes;
+      info->splits = splits;
+      info->vertical_first = vertical_first;
+
+      info->input_pixel_layout_internal = input_pixel_layout;
+      info->output_pixel_layout_internal = output_pixel_layout;
+
+      // setup alpha weight functions
+      info->alpha_weight = 0;
+      info->alpha_unweight = 0;
+
+      // handle alpha weighting functions and overrides
+      if ( alpha_weighting_type == 2 )
+      {
+        // high quality alpha multiplying on the way in, dividing on the way out
+        info->alpha_weight = fancy_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
+        info->alpha_unweight = fancy_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
+      }
+      else if ( alpha_weighting_type == 4 )
+      {
+        // fast alpha multiplying on the way in, dividing on the way out
+        info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
+        info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
+      }
+      else if ( alpha_weighting_type == 1 )
+      {
+        // fast alpha on the way in, leave in premultiplied form on way out
+        info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
+      }
+      else if ( alpha_weighting_type == 3 )
+      {
+        // incoming is premultiplied, fast alpha dividing on the way out - non-premultiplied output
+        info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
+      }
+
+      // handle 3-chan color flipping, using the alpha weight path
+      if ( ( ( input_pixel_layout == STBIRI_RGB ) && ( output_pixel_layout == STBIRI_BGR ) ) ||
+           ( ( input_pixel_layout == STBIRI_BGR ) && ( output_pixel_layout == STBIRI_RGB ) ) )
+      {
+        // do the flipping on the smaller of the two ends
+        if ( horizontal->scale_info.scale < 1.0f )
+          info->alpha_unweight = stbir__simple_flip_3ch;
+        else
+          info->alpha_weight = stbir__simple_flip_3ch;
+      }
+
+    }
+
+    // get all the per-split buffers
+    for( i = 0 ; i < splits ; i++ )
+    {
+      STBIR__NEXT_PTR( info->split_info[i].decode_buffer, decode_buffer_size, float );
+
+#ifdef STBIR__SEPARATE_ALLOCATIONS
+      #ifdef STBIR_SIMD8
+      if ( ( info ) && ( effective_channels == 3 ) )
+        ++info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer
+      #endif
+
+      STBIR__NEXT_PTR( info->split_info[i].ring_buffers, alloc_ring_buffer_num_entries * sizeof(float*), float* );
+      {
+        int j;
+        for( j = 0 ; j < alloc_ring_buffer_num_entries ; j++ )
+        {
+          STBIR__NEXT_PTR( info->split_info[i].ring_buffers[j], ring_buffer_length_bytes, float );
+          #ifdef STBIR_SIMD8
+          if ( ( info ) && ( effective_channels == 3 ) )
+            ++info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer
+          #endif
+        }
+      }
+#else
+      STBIR__NEXT_PTR( info->split_info[i].ring_buffer, ring_buffer_size, float );
+#endif
+      STBIR__NEXT_PTR( info->split_info[i].vertical_buffer, vertical_buffer_size, float );
+    }
+
+    // alloc memory for to-be-pivoted coeffs (if necessary)
+    if ( vertical->is_gather == 0 )
+    {
+      int both;
+      int temp_mem_amt;
+
+      // when in vertical scatter mode, we first build the coefficients in gather mode, and then pivot after,
+      //   that means we need two buffers, so we try to use the decode buffer and ring buffer for this. if that
+      //   is too small, we just allocate extra memory to use as this temp.
+
+      both = vertical->gather_prescatter_contributors_size + vertical->gather_prescatter_coefficients_size;
+
+#ifdef STBIR__SEPARATE_ALLOCATIONS
+      temp_mem_amt = decode_buffer_size;
+
+      #ifdef STBIR_SIMD8
+      if ( effective_channels == 3 )
+        temp_mem_amt -= (int)sizeof(float); // decode_buffer was advanced one float past the start of its allocation, so that many bytes are not usable as temp space
+      #endif
+#else
+      temp_mem_amt = ( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * splits;
+#endif
+      if ( temp_mem_amt >= both )
+      {
+        if ( info )
+        {
+          vertical->gather_prescatter_contributors = (stbir__contributors*)info->split_info[0].decode_buffer;
+          vertical->gather_prescatter_coefficients = (float*) ( ( (char*)info->split_info[0].decode_buffer ) + vertical->gather_prescatter_contributors_size );
+        }
+      }
+      else
+      {
+        // ring+decode memory is too small, so allocate temp memory
+        STBIR__NEXT_PTR( vertical->gather_prescatter_contributors, vertical->gather_prescatter_contributors_size, stbir__contributors );
+        STBIR__NEXT_PTR( vertical->gather_prescatter_coefficients, vertical->gather_prescatter_coefficients_size, float );
+      }
+    }
+
+    STBIR__NEXT_PTR( horizontal->contributors, horizontal->contributors_size, stbir__contributors );
+    STBIR__NEXT_PTR( horizontal->coefficients, horizontal->coefficients_size, float );
+
+    // are the two filters identical?? (happens a lot with mipmap generation)
+    if ( ( horizontal->filter_kernel == vertical->filter_kernel ) && ( horizontal->filter_support == vertical->filter_support ) && ( horizontal->edge == vertical->edge ) && ( horizontal->scale_info.output_sub_size == vertical->scale_info.output_sub_size ) )
+    {
+      float diff_scale = horizontal->scale_info.scale - vertical->scale_info.scale;
+      float diff_shift = horizontal->scale_info.pixel_shift - vertical->scale_info.pixel_shift;
+      if ( diff_scale < 0.0f ) diff_scale = -diff_scale;
+      if ( diff_shift < 0.0f ) diff_shift = -diff_shift;
+      if ( ( diff_scale <= stbir__small_float ) && ( diff_shift <= stbir__small_float ) )
+      {
+        if ( horizontal->is_gather == vertical->is_gather )
+        {
+          copy_horizontal = 1;
+          goto no_vert_alloc;
+        }
+        // everything matches, but vertical is scatter, horizontal is gather, use horizontal coeffs for vertical pivot coeffs
+        possibly_use_horizontal_for_pivot = horizontal;
+      }
+    }
+
+    STBIR__NEXT_PTR( vertical->contributors, vertical->contributors_size, stbir__contributors );
+    STBIR__NEXT_PTR( vertical->coefficients, vertical->coefficients_size, float );
+
+   no_vert_alloc:
+
+    if ( info )
+    {
+      STBIR_PROFILE_BUILD_START( horizontal );
+
+      stbir__calculate_filters( horizontal, 0, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
+
+      // setup the horizontal gather functions
+      // start with defaulting to the n_coeffs functions (specialized on channels and remnant leftover)
+      info->horizontal_gather_channels = stbir__horizontal_gather_n_coeffs_funcs[ effective_channels ][ horizontal->extent_info.widest & 3 ];
+      // but if the number of coeffs <= 12, use another set of special cases. <=12 coeffs is any enlarging resize, or shrinking resize down to about 1/3 size
+      if ( horizontal->extent_info.widest <= 12 )
+        info->horizontal_gather_channels = stbir__horizontal_gather_channels_funcs[ effective_channels ][ horizontal->extent_info.widest - 1 ];
+
+      info->scanline_extents.conservative.n0 = conservative->n0;
+      info->scanline_extents.conservative.n1 = conservative->n1;
+
+      // get exact extents
+      stbir__get_extents( horizontal, &info->scanline_extents );
+
+      // pack the horizontal coeffs
+      horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n0, info->scanline_extents.conservative.n1 );
+
+      STBIR_MEMCPY( &info->horizontal, horizontal, sizeof( stbir__sampler ) );
+
+      STBIR_PROFILE_BUILD_END( horizontal );
+
+      if ( copy_horizontal )
+      {
+        STBIR_MEMCPY( &info->vertical, horizontal, sizeof( stbir__sampler ) );
+      }
+      else
+      {
+        STBIR_PROFILE_BUILD_START( vertical );
+
+        stbir__calculate_filters( vertical, possibly_use_horizontal_for_pivot, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
+        STBIR_MEMCPY( &info->vertical, vertical, sizeof( stbir__sampler ) );
+
+        STBIR_PROFILE_BUILD_END( vertical );
+      }
+
+      // setup the vertical split ranges
+      stbir__get_split_info( info->split_info, info->splits, info->vertical.scale_info.output_sub_size, info->vertical.filter_pixel_margin, info->vertical.scale_info.input_full_size );
+
+      // now we know precisely how many entries we need
+      info->ring_buffer_num_entries = info->vertical.extent_info.widest;
+
+      // we never need more ring buffer entries than the scanlines we're outputting
+      if ( ( !info->vertical.is_gather ) && ( info->ring_buffer_num_entries > conservative_split_output_size ) )
+        info->ring_buffer_num_entries = conservative_split_output_size;
+      STBIR_ASSERT( info->ring_buffer_num_entries <= info->alloc_ring_buffer_num_entries );
+
+      // a few of the horizontal gather functions read past the end of the decode (but mask it out),
+      //   so put in normal values so no snans or denormals accidentally sneak in (also, in the ring
+      //   buffer for vertical first)
+      for( i = 0 ; i < splits ; i++ )
+      {
+        int t, ofs, start;
+
+        ofs = decode_buffer_size / 4;
+
+        #if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8)
+        if ( effective_channels == 3 )
+          --ofs; // avx in 3 channel mode needs one float at the start of the buffer, so we snap back for clearing
+        #endif
+
+        start = ofs - 4;
+        if ( start < 0 ) start = 0;
+
+        for( t = start ; t < ofs; t++ )
+          info->split_info[i].decode_buffer[ t ] = 9999.0f;
+
+        if ( vertical_first )
+        {
+          int j;
+          for( j = 0; j < info->ring_buffer_num_entries ; j++ )
+          {
+            for( t = start ; t < ofs; t++ )
+              stbir__get_ring_buffer_entry( info, info->split_info + i, j )[ t ] = 9999.0f;
+          }
+        }
+      }
+    }
+
+    #undef STBIR__NEXT_PTR
+
+    // is this the first time through loop?
+    if ( info == 0 )
+    {
+      alloced_total = ( 15 + (size_t)advance_mem ); // advance_mem started from 0 on this pass, so it is the running size total; the extra 15 leaves room to align the base up to 16
+      alloced = STBIR_MALLOC( alloced_total, user_data );
+      if ( alloced == 0 )
+        return 0;
+    }
+    else
+      return info;  // success
+  }
+}
+
+// Runs the resize over split_count splits starting at split_start, using the vertical scatter or gather loop according to info->vertical.is_gather; always returns 1.
+static int stbir__perform_resize( stbir__info const * info, int split_start, int split_count )
+{
+  // NOTE(review): the local keeps the name split_info in case the STBIR_PROFILE_* macros refer to it by name - confirm
+  stbir__per_split_info * split_info = &info->split_info[ split_start ];
+
+  STBIR_PROFILE_CLEAR_EXTRAS();
+  STBIR_PROFILE_FIRST_START( looping );
+  if ( !info->vertical.is_gather )
+    stbir__vertical_scatter_loop( info, split_info, split_count );
+  else
+    stbir__vertical_gather_loop( info, split_info, split_count );
+  STBIR_PROFILE_END( looping );
+  return 1;
+}
+
+static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * resize )
+{
+  static stbir__decode_pixels_func * decode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
+  {
+    /* 1ch-4ch */ stbir__decode_uint8_srgb, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear,
+  };
+
+  static stbir__decode_pixels_func * decode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
+  {
+    { /* RGBA */ stbir__decode_uint8_srgb4_linearalpha,      stbir__decode_uint8_srgb,      0, stbir__decode_float_linear,      stbir__decode_half_float_linear },
+    { /* BGRA */ stbir__decode_uint8_srgb4_linearalpha_BGRA, stbir__decode_uint8_srgb_BGRA, 0, stbir__decode_float_linear_BGRA, stbir__decode_half_float_linear_BGRA },
+    { /* ARGB */ stbir__decode_uint8_srgb4_linearalpha_ARGB, stbir__decode_uint8_srgb_ARGB, 0, stbir__decode_float_linear_ARGB, stbir__decode_half_float_linear_ARGB },
+    { /* ABGR */ stbir__decode_uint8_srgb4_linearalpha_ABGR, stbir__decode_uint8_srgb_ABGR, 0, stbir__decode_float_linear_ABGR, stbir__decode_half_float_linear_ABGR },
+    { /* RA   */ stbir__decode_uint8_srgb2_linearalpha,      stbir__decode_uint8_srgb,      0, stbir__decode_float_linear,      stbir__decode_half_float_linear },
+    { /* AR   */ stbir__decode_uint8_srgb2_linearalpha_AR,   stbir__decode_uint8_srgb_AR,   0, stbir__decode_float_linear_AR,   stbir__decode_half_float_linear_AR },
+  };
+
+  static stbir__decode_pixels_func * decode_simple_scaled_or_not[2][2]=
+  {
+    { stbir__decode_uint8_linear_scaled,  stbir__decode_uint8_linear }, { stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear },
+  };
+
+  static stbir__decode_pixels_func * decode_alphas_scaled_or_not[STBIRI_AR-STBIRI_RGBA+1][2][2]=
+  {
+    { /* RGBA */ { stbir__decode_uint8_linear_scaled,       stbir__decode_uint8_linear },      { stbir__decode_uint16_linear_scaled,      stbir__decode_uint16_linear } },
+    { /* BGRA */ { stbir__decode_uint8_linear_scaled_BGRA,  stbir__decode_uint8_linear_BGRA }, { stbir__decode_uint16_linear_scaled_BGRA, stbir__decode_uint16_linear_BGRA } },
+    { /* ARGB */ { stbir__decode_uint8_linear_scaled_ARGB,  stbir__decode_uint8_linear_ARGB }, { stbir__decode_uint16_linear_scaled_ARGB, stbir__decode_uint16_linear_ARGB } },
+    { /* ABGR */ { stbir__decode_uint8_linear_scaled_ABGR,  stbir__decode_uint8_linear_ABGR }, { stbir__decode_uint16_linear_scaled_ABGR, stbir__decode_uint16_linear_ABGR } },
+    { /* RA   */ { stbir__decode_uint8_linear_scaled,       stbir__decode_uint8_linear },      { stbir__decode_uint16_linear_scaled,      stbir__decode_uint16_linear } },
+    { /* AR   */ { stbir__decode_uint8_linear_scaled_AR,    stbir__decode_uint8_linear_AR },   { stbir__decode_uint16_linear_scaled_AR,   stbir__decode_uint16_linear_AR } }
+  };
+
+  static stbir__encode_pixels_func * encode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
+  {
+    /* 1ch-4ch */ stbir__encode_uint8_srgb, stbir__encode_uint8_srgb, 0, stbir__encode_float_linear, stbir__encode_half_float_linear,
+  };
+
+  static stbir__encode_pixels_func * encode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
+  {
+    { /* RGBA */ stbir__encode_uint8_srgb4_linearalpha,      stbir__encode_uint8_srgb,      0, stbir__encode_float_linear,      stbir__encode_half_float_linear },
+    { /* BGRA */ stbir__encode_uint8_srgb4_linearalpha_BGRA, stbir__encode_uint8_srgb_BGRA, 0, stbir__encode_float_linear_BGRA, stbir__encode_half_float_linear_BGRA },
+    { /* ARGB */ stbir__encode_uint8_srgb4_linearalpha_ARGB, stbir__encode_uint8_srgb_ARGB, 0, stbir__encode_float_linear_ARGB, stbir__encode_half_float_linear_ARGB },
+    { /* ABGR */ stbir__encode_uint8_srgb4_linearalpha_ABGR, stbir__encode_uint8_srgb_ABGR, 0, stbir__encode_float_linear_ABGR, stbir__encode_half_float_linear_ABGR },
+    { /* RA   */ stbir__encode_uint8_srgb2_linearalpha,      stbir__encode_uint8_srgb,      0, stbir__encode_float_linear,      stbir__encode_half_float_linear },
+    { /* AR   */ stbir__encode_uint8_srgb2_linearalpha_AR,   stbir__encode_uint8_srgb_AR,   0, stbir__encode_float_linear_AR,   stbir__encode_half_float_linear_AR }
+  };
+
+  static stbir__encode_pixels_func * encode_simple_scaled_or_not[2][2]=
+  {
+    { stbir__encode_uint8_linear_scaled,  stbir__encode_uint8_linear }, { stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear },
+  };
+
+  static stbir__encode_pixels_func * encode_alphas_scaled_or_not[STBIRI_AR-STBIRI_RGBA+1][2][2]=
+  {
+    { /* RGBA */ { stbir__encode_uint8_linear_scaled,       stbir__encode_uint8_linear },       { stbir__encode_uint16_linear_scaled,      stbir__encode_uint16_linear } },
+    { /* BGRA */ { stbir__encode_uint8_linear_scaled_BGRA,  stbir__encode_uint8_linear_BGRA },  { stbir__encode_uint16_linear_scaled_BGRA, stbir__encode_uint16_linear_BGRA } },
+    { /* ARGB */ { stbir__encode_uint8_linear_scaled_ARGB,  stbir__encode_uint8_linear_ARGB },  { stbir__encode_uint16_linear_scaled_ARGB, stbir__encode_uint16_linear_ARGB } },
+    { /* ABGR */ { stbir__encode_uint8_linear_scaled_ABGR,  stbir__encode_uint8_linear_ABGR },  { stbir__encode_uint16_linear_scaled_ABGR, stbir__encode_uint16_linear_ABGR } },
+    { /* RA   */ { stbir__encode_uint8_linear_scaled,       stbir__encode_uint8_linear },       { stbir__encode_uint16_linear_scaled,      stbir__encode_uint16_linear } },
+    { /* AR   */ { stbir__encode_uint8_linear_scaled_AR,    stbir__encode_uint8_linear_AR },    { stbir__encode_uint16_linear_scaled_AR,   stbir__encode_uint16_linear_AR } }
+  };
+
+  stbir__decode_pixels_func * decode_pixels = 0;
+  stbir__encode_pixels_func * encode_pixels = 0;
+  stbir_datatype input_type, output_type;
+
+  input_type = resize->input_data_type;
+  output_type = resize->output_data_type;
+  info->input_data = resize->input_pixels;
+  info->input_stride_bytes = resize->input_stride_in_bytes;
+  info->output_stride_bytes = resize->output_stride_in_bytes;
+
+  // if we're completely point sampling, then we can turn off SRGB
+  if ( ( info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( info->vertical.filter_enum == STBIR_FILTER_POINT_SAMPLE ) )
+  {
+    if ( ( ( input_type  == STBIR_TYPE_UINT8_SRGB ) || ( input_type  == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) &&
+         ( ( output_type == STBIR_TYPE_UINT8_SRGB ) || ( output_type == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) )
+    {
+      input_type = STBIR_TYPE_UINT8;
+      output_type = STBIR_TYPE_UINT8;
+    }
+  }
+
+  // recalc the output and input strides
+  if ( info->input_stride_bytes == 0 )
+    info->input_stride_bytes = info->channels * info->horizontal.scale_info.input_full_size * stbir__type_size[input_type];
+
+  if ( info->output_stride_bytes == 0 )
+    info->output_stride_bytes = info->channels * info->horizontal.scale_info.output_sub_size * stbir__type_size[output_type];
+
+  // calc offset
+  info->output_data = ( (char*) resize->output_pixels ) + ( (size_t) info->offset_y * (size_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] );
+
+  info->in_pixels_cb = resize->input_cb;
+  info->user_data = resize->user_data;
+  info->out_pixels_cb = resize->output_cb;
+
+  // setup the input format converters
+  if ( ( input_type == STBIR_TYPE_UINT8 ) || ( input_type == STBIR_TYPE_UINT16 ) )
+  {
+    int non_scaled = 0;
+
+    // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16)
+    if ( ( !info->alpha_weight ) && ( !info->alpha_unweight )  ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual)
+      if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) )
+        non_scaled = 1;
+
+    if ( info->input_pixel_layout_internal <= STBIRI_4CHANNEL )
+      decode_pixels = decode_simple_scaled_or_not[ input_type == STBIR_TYPE_UINT16 ][ non_scaled ];
+    else
+      decode_pixels = decode_alphas_scaled_or_not[ ( info->input_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ input_type == STBIR_TYPE_UINT16 ][ non_scaled ];
+  }
+  else
+  {
+    if ( info->input_pixel_layout_internal <= STBIRI_4CHANNEL )
+      decode_pixels = decode_simple[ input_type - STBIR_TYPE_UINT8_SRGB ];
+    else
+      decode_pixels = decode_alphas[ ( info->input_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ input_type - STBIR_TYPE_UINT8_SRGB ];
+  }
+
+  // setup the output format converters
+  if ( ( output_type == STBIR_TYPE_UINT8 ) || ( output_type == STBIR_TYPE_UINT16 ) )
+  {
+    int non_scaled = 0;
+
+    // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16)
+    if ( ( !info->alpha_weight ) && ( !info->alpha_unweight ) ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual)
+      if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) )
+        non_scaled = 1;
+
+    if ( info->output_pixel_layout_internal <= STBIRI_4CHANNEL )
+      encode_pixels = encode_simple_scaled_or_not[ output_type == STBIR_TYPE_UINT16 ][ non_scaled ];
+    else
+      encode_pixels = encode_alphas_scaled_or_not[ ( info->output_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ output_type == STBIR_TYPE_UINT16 ][ non_scaled ];
+  }
+  else
+  {
+    if ( info->output_pixel_layout_internal <= STBIRI_4CHANNEL )
+      encode_pixels = encode_simple[ output_type - STBIR_TYPE_UINT8_SRGB ];
+    else
+      encode_pixels = encode_alphas[ ( info->output_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ output_type - STBIR_TYPE_UINT8_SRGB ];
+  }
+
+  info->input_type = input_type;
+  info->output_type = output_type;
+  info->decode_pixels = decode_pixels;
+  info->encode_pixels = encode_pixels;
+}
+
+static void stbir__clip( int * outx, int * outsubw, int outw, double * u0, double * u1 )
+{ // clips an output span (start *outx, width *outsubw) against [0,outw) and trims the input range [*u0,*u1] proportionally
+  double per, adj;
+  int over;
+
+  // do left/top edge
+  if ( *outx < 0 )
+  {
+    per = ( (double)*outx ) / ( (double)*outsubw ); // is negative
+    adj = per * ( *u1 - *u0 );
+    *u0 -= adj; // increases u0
+    *outx = 0;  // NOTE(review): *outsubw is not reduced here, only the right-edge test below can shrink it - confirm intended
+  }
+
+  // do right/bot edge
+  over = outw - ( *outx + *outsubw ); // negative when the span runs past the end of the output
+  if ( over < 0 )
+  {
+    per = ( (double)over ) / ( (double)*outsubw ); // is negative
+    adj = per * ( *u1 - *u0 );
+    *u1 += adj; // decrease u1
+    *outsubw = outw - *outx; // width is whatever remains from the (already clamped) start to the edge
+  }
+}
+
+// converts a double to a rational that has less than one float bit of error (returns 0 if unable to do so)
+static int stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32 *numer, stbir_uint32 *denom, int limit_denom ) // limit_denom (1) or limit numer (0)
+{
+  double err;
+  stbir_uint64 top, bot;
+  stbir_uint64 numer_last = 0;
+  stbir_uint64 denom_last = 1;
+  stbir_uint64 numer_estimate = 1;
+  stbir_uint64 denom_estimate = 0;
+  // *numer / *denom are always written, even when 0 is returned (then they hold the fallback estimate)
+  // scale to past float error range
+  top = (stbir_uint64)( f * (double)(1 << 25) );
+  bot = 1 << 25;
+
+  // keep refining, but usually stops in a few loops - usually 5 for bad cases
+  for(;;)
+  {
+    stbir_uint64 est, temp;
+
+    // hit limit, break out and do best full range estimate
+    if ( ( ( limit_denom ) ? denom_estimate : numer_estimate ) >= limit )
+      break;
+
+    // is the current error less than 1 bit of a float? if so, we're done
+    if ( denom_estimate )
+    {
+      err = ( (double)numer_estimate / (double)denom_estimate ) - f;
+      if ( err < 0.0 ) err = -err;
+      if ( err < ( 1.0 / (double)(1<<24) ) )
+      {
+        // yup, found it
+        *numer = (stbir_uint32) numer_estimate;
+        *denom = (stbir_uint32) denom_estimate;
+        return 1;
+      }
+    }
+
+    // no more refinement bits left? break out and do full range estimate
+    if ( bot == 0 )
+      break;
+
+    // gcd the estimate bits
+    est = top / bot;
+    temp = top % bot;
+    top = bot;
+    bot = temp;
+
+    // move remainders (new denominator = est * current + previous)
+    temp = est * denom_estimate + denom_last;
+    denom_last = denom_estimate;
+    denom_estimate = temp;
+
+    // move remainders (same recurrence for the numerator)
+    temp = est * numer_estimate + numer_last;
+    numer_last = numer_estimate;
+    numer_estimate = temp;
+  }
+
+  // we didn't find anything good enough for float, use a full range estimate
+  if ( limit_denom )
+  {
+    numer_estimate= (stbir_uint64)( f * (double)limit + 0.5 ); // denominator pinned to limit, numerator rounded to nearest
+    denom_estimate = limit;
+  }
+  else
+  {
+    numer_estimate = limit;
+    denom_estimate = (stbir_uint64)( ( (double)limit / f ) + 0.5 ); // numerator pinned to limit, denominator rounded to nearest
+  }
+
+  *numer = (stbir_uint32) numer_estimate;
+  *denom = (stbir_uint32) denom_estimate;
+  // re-check the error using the values as truncated to 32 bits; a zero denominator counts as a failure
+  err = ( denom_estimate ) ? ( ( (double)(stbir_uint32)numer_estimate / (double)(stbir_uint32)denom_estimate ) - f ) : 1.0;
+  if ( err < 0.0 ) err = -err;
+  return ( err < ( 1.0 / (double)(1<<24) ) ) ? 1 : 0;
+}
+
+static int stbir__calculate_region_transform( stbir__scale_info * scale_info, int output_full_range, int * output_offset, int output_sub_range, int input_full_range, double input_s0, double input_s1 )
+{ // fills *scale_info for one axis; returns 0 when there is nothing to resample (null or out-of-bounds region), 1 otherwise
+  double output_range, input_range, output_s, input_s, ratio, scale;
+
+  input_s = input_s1 - input_s0;
+
+  // null area
+  if ( ( output_full_range == 0 ) || ( input_full_range == 0 ) ||
+       ( output_sub_range == 0 ) || ( input_s <= stbir__small_float ) )
+    return 0;
+
+  // are either of the ranges completely out of bounds?
+  if ( ( *output_offset >= output_full_range ) || ( ( *output_offset + output_sub_range ) <= 0 ) || ( input_s0 >= (1.0f-stbir__small_float) ) || ( input_s1 <= stbir__small_float ) )
+    return 0;
+
+  output_range = (double)output_full_range;
+  input_range = (double)input_full_range;
+  // fraction of the full output covered by the requested sub range
+  output_s = ( (double)output_sub_range) / output_range;
+
+  // figure out the scaling to use
+  ratio = output_s / input_s;
+
+  // save scale before clipping
+  scale = ( output_range / input_range ) * ratio;
+  scale_info->scale = (float)scale;
+  scale_info->inv_scale = (float)( 1.0 / scale );
+
+  // clip output area to left/right output edges (and adjust input area)
+  stbir__clip( output_offset, &output_sub_range, output_full_range, &input_s0, &input_s1 ); // may rewrite *output_offset for the caller
+
+  // recalc input area
+  input_s = input_s1 - input_s0;
+
+  // after clipping do we have zero input area?
+  if ( input_s <= stbir__small_float )
+    return 0;
+
+  // calculate and store the starting source offsets in output pixel space
+  scale_info->pixel_shift = (float) ( input_s0 * ratio * output_range );
+  // rational form of the scale: limit is the output size when scale <= 1 else the input size; the denominator is the limited term when scale >= 1
+  scale_info->scale_is_rational = stbir__double_to_rational( scale, ( scale <= 1.0 ) ? output_full_range : input_full_range, &scale_info->scale_numerator, &scale_info->scale_denominator, ( scale >= 1.0 ) );
+
+  scale_info->input_full_size = input_full_range;
+  scale_info->output_sub_size = output_sub_range; // the clipped width, not the width originally requested
+
+  return 1;
+}
+
+
+static void stbir__init_and_set_layout( STBIR_RESIZE * resize, stbir_pixel_layout pixel_layout, stbir_datatype data_type )
+{ // resets the optional settings of *resize to their defaults and applies one layout/datatype to both input and output
+  resize->input_cb = 0;
+  resize->output_cb = 0;
+  resize->user_data = resize;  // default user_data is the STBIR_RESIZE itself
+  resize->samplers = 0;        // nothing built yet
+  resize->called_alloc = 0;
+  resize->horizontal_filter = STBIR_FILTER_DEFAULT;
+  resize->horizontal_filter_kernel = 0; resize->horizontal_filter_support = 0;
+  resize->vertical_filter = STBIR_FILTER_DEFAULT;
+  resize->vertical_filter_kernel = 0; resize->vertical_filter_support = 0;
+  resize->horizontal_edge = STBIR_EDGE_CLAMP;
+  resize->vertical_edge = STBIR_EDGE_CLAMP;
+  resize->input_s0 = 0; resize->input_t0 = 0; resize->input_s1 = 1; resize->input_t1 = 1;  // full input region
+  resize->output_subx = 0; resize->output_suby = 0; resize->output_subw = resize->output_w; resize->output_subh = resize->output_h;  // full output; reads output_w/output_h, so they must already be set
+  resize->input_data_type = data_type;
+  resize->output_data_type = data_type;
+  resize->input_pixel_layout_public = pixel_layout;
+  resize->output_pixel_layout_public = pixel_layout;
+  resize->needs_rebuild = 1;   // force a sampler build on first use
+}
+
+STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize,
+                                 const void *input_pixels,  int input_w,  int input_h, int input_stride_in_bytes, // stride can be zero
+                                       void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
+                                 stbir_pixel_layout pixel_layout, stbir_datatype data_type )
+{ // stores the buffer description in *resize, then resets every other setting to its default
+  resize->input_pixels = input_pixels;
+  resize->input_w = input_w;
+  resize->input_h = input_h;
+  resize->input_stride_in_bytes = input_stride_in_bytes;
+  resize->output_pixels = output_pixels;
+  resize->output_w = output_w;
+  resize->output_h = output_h;
+  resize->output_stride_in_bytes = output_stride_in_bytes;
+  resize->fast_alpha = 0;
+  // must come after output_w/output_h are stored: the default output sub-rect is derived from them
+  stbir__init_and_set_layout( resize, pixel_layout, data_type );
+}
+
+// You can update parameters any time after resize_init
+STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type )  // by default, datatype from resize_init
+{
+  resize->input_data_type = input_type;
+  resize->output_data_type = output_type;
+  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )  // samplers already built and still valid: refresh them in place (no rebuild is requested)
+    stbir__update_info_from_resize( resize->samplers, resize );
+}
+
+STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb )   // no callbacks by default
+{
+  resize->input_cb = input_cb;
+  resize->output_cb = output_cb;
+  // if samplers are already built and still valid, patch the callbacks into them directly (no rebuild is requested)
+  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )
+  {
+    resize->samplers->in_pixels_cb = input_cb;
+    resize->samplers->out_pixels_cb = output_cb;
+  }
+}
+
+STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data )                                     // pass back STBIR_RESIZE* by default
+{
+  resize->user_data = user_data;
+  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )  // samplers already built and still valid: update their copy too
+    resize->samplers->user_data = user_data;
+}
+
+STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes )
+{ // swaps the input/output buffers and strides without requesting a sampler rebuild
+  resize->input_pixels = input_pixels;
+  resize->input_stride_in_bytes = input_stride_in_bytes;
+  resize->output_pixels = output_pixels;
+  resize->output_stride_in_bytes = output_stride_in_bytes;
+  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )  // samplers already built and still valid: refresh their pointers/strides in place
+    stbir__update_info_from_resize( resize->samplers, resize );
+}
+
+
+STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge )       // CLAMP by default
+{ // stores the per-axis edge modes; the values are not validated here
+  resize->horizontal_edge = horizontal_edge;
+  resize->vertical_edge = vertical_edge;
+  resize->needs_rebuild = 1;  // samplers must be rebuilt before the next resize
+  return 1;                   // always reports success
+}
+
+STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ) // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
+{ // stores the per-axis filter enums; the values are not validated here
+  resize->horizontal_filter = horizontal_filter;
+  resize->vertical_filter = vertical_filter;
+  resize->needs_rebuild = 1;  // samplers must be rebuilt before the next resize
+  return 1;                   // always reports success
+}
+
+STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support )
+{ // stores user kernel/support callbacks for each axis (stbir__init_and_set_layout leaves all four at 0)
+  resize->horizontal_filter_kernel = horizontal_filter; resize->horizontal_filter_support = horizontal_support;
+  resize->vertical_filter_kernel = vertical_filter; resize->vertical_filter_support = vertical_support;
+  resize->needs_rebuild = 1;  // samplers must be rebuilt before the next resize
+  return 1;                   // always reports success
+}
+
+STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout )   // sets new pixel layouts
+{ // input and output layouts may differ; the values are not validated here
+  resize->input_pixel_layout_public = input_pixel_layout;
+  resize->output_pixel_layout_public = output_pixel_layout;
+  resize->needs_rebuild = 1;  // samplers must be rebuilt before the next resize
+  return 1;                   // always reports success
+}
+
+
+STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality )   // sets alpha speed
+{
+  resize->fast_alpha = non_pma_alpha_speed_over_quality;  // stored as given (stbir_resize_init sets it to 0)
+  resize->needs_rebuild = 1;  // samplers must be rebuilt before the next resize
+  return 1;                   // always reports success
+}
+
+STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 )                 // sets input region (full region by default)
+{
+  resize->input_s0 = s0;
+  resize->input_t0 = t0;
+  resize->input_s1 = s1;
+  resize->input_t1 = t1;
+  resize->needs_rebuild = 1;
+  // note: the values above are stored even when the check below fails and 0 is returned
+  // are we inbounds?
+  if ( ( s1 < stbir__small_float ) || ( (s1-s0) < stbir__small_float ) ||
+       ( t1 < stbir__small_float ) || ( (t1-t0) < stbir__small_float ) ||
+       ( s0 > (1.0f-stbir__small_float) ) ||
+       ( t0 > (1.0f-stbir__small_float) ) )
+    return 0;
+
+  return 1;
+}
+
+STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh )          // sets output region in pixels (full region by default)
+{
+  resize->output_subx = subx;
+  resize->output_suby = suby;
+  resize->output_subw = subw;
+  resize->output_subh = subh;
+  resize->needs_rebuild = 1;
+  // note: the values above are stored even when the check below fails and 0 is returned
+  // are we inbounds? (rejects only rects that are empty or lie entirely outside the output; partial overlap is accepted)
+  if ( ( subx >= resize->output_w ) || ( ( subx + subw ) <= 0 ) || ( suby >= resize->output_h ) || ( ( suby + subh ) <= 0 ) || ( subw == 0 ) || ( subh == 0 ) )
+    return 0;
+
+  return 1;
+}
+
+STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh )                 // sets both regions (full regions by default)
+{
+  double s0, t0, s1, t1;
+  // the input region is the output pixel rect expressed as fractions of the OUTPUT size (output_w / output_h)
+  s0 = ( (double)subx ) / ( (double)resize->output_w );
+  t0 = ( (double)suby ) / ( (double)resize->output_h );
+  s1 = ( (double)(subx+subw) ) / ( (double)resize->output_w );
+  t1 = ( (double)(suby+subh) ) / ( (double)resize->output_h );
+
+  resize->input_s0 = s0;
+  resize->input_t0 = t0;
+  resize->input_s1 = s1;
+  resize->input_t1 = t1;
+  resize->output_subx = subx;
+  resize->output_suby = suby;
+  resize->output_subw = subw;
+  resize->output_subh = subh;
+  resize->needs_rebuild = 1;
+  // note: the values above are stored even when the check below fails and 0 is returned
+  // are we inbounds? (rejects only rects that are empty or lie entirely outside the output)
+  if ( ( subx >= resize->output_w ) || ( ( subx + subw ) <= 0 ) || ( suby >= resize->output_h ) || ( ( suby + subh ) <= 0 ) || ( subw == 0 ) || ( subh == 0 ) )
+    return 0;
+
+  return 1;
+}
+
+static int stbir__perform_build( STBIR_RESIZE * resize, int splits )
+{ // builds the samplers for *resize; returns the split count actually used (> 0), or 0 on failure / nothing to do
+  stbir__contributors conservative = { 0, 0 };
+  stbir__sampler horizontal, vertical;
+  int new_output_subx, new_output_suby;
+  stbir__info * out_info;
+  #ifdef STBIR_PROFILE
+  stbir__info profile_infod;  // used to contain building profile info before everything is allocated
+  stbir__info * profile_info = &profile_infod;
+  #endif
+
+  // have we already built the samplers?
+  if ( resize->samplers )
+    return 0;
+  // reject filter enums at or beyond STBIR_FILTER_OTHER: assert when asserts are enabled, and return 0 either way
+  #define STBIR_RETURN_ERROR_AND_ASSERT( exp )  STBIR_ASSERT( !(exp) ); if (exp) return 0;
+  STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->horizontal_filter >= STBIR_FILTER_OTHER)
+  STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->vertical_filter >= STBIR_FILTER_OTHER)
+  #undef STBIR_RETURN_ERROR_AND_ASSERT
+
+  if ( splits <= 0 )
+    return 0;
+
+  STBIR_PROFILE_BUILD_FIRST_START( build );
+  // work on copies of the output offsets: clipping may move them, and resize itself is left untouched
+  new_output_subx = resize->output_subx;
+  new_output_suby = resize->output_suby;
+
+  // do horizontal clip and scale calcs
+  if ( !stbir__calculate_region_transform( &horizontal.scale_info, resize->output_w, &new_output_subx, resize->output_subw, resize->input_w, resize->input_s0, resize->input_s1 ) )
+    return 0;
+
+  // do vertical clip and scale calcs
+  if ( !stbir__calculate_region_transform( &vertical.scale_info, resize->output_h, &new_output_suby, resize->output_subh, resize->input_h, resize->input_t0, resize->input_t1 ) )
+    return 0;
+
+  // if nothing to do, just return
+  if ( ( horizontal.scale_info.output_sub_size == 0 ) || ( vertical.scale_info.output_sub_size == 0 ) )
+    return 0;
+  // each axis gets its own filter enum, kernel callback, support callback and edge mode
+  stbir__set_sampler(&horizontal, resize->horizontal_filter, resize->horizontal_filter_kernel, resize->horizontal_filter_support, resize->horizontal_edge, &horizontal.scale_info, 1, resize->user_data );
+  stbir__get_conservative_extents( &horizontal, &conservative, resize->user_data );
+  stbir__set_sampler(&vertical, resize->vertical_filter, resize->vertical_filter_kernel, resize->vertical_filter_support, resize->vertical_edge, &vertical.scale_info, 0, resize->user_data ); // fixed: the horizontal kernel callback used to be passed here
+
+  if ( ( vertical.scale_info.output_sub_size / splits ) < STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS ) // each split should be a minimum of 4 scanlines (handwavey choice)
+  {
+    splits = vertical.scale_info.output_sub_size / STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS;
+    if ( splits == 0 ) splits = 1;
+  }
+
+  STBIR_PROFILE_BUILD_START( alloc );
+  out_info = stbir__alloc_internal_mem_and_build_samplers( &horizontal, &vertical, &conservative, resize->input_pixel_layout_public, resize->output_pixel_layout_public, splits, new_output_subx, new_output_suby, resize->fast_alpha, resize->user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
+  STBIR_PROFILE_BUILD_END( alloc );
+  STBIR_PROFILE_BUILD_END( build );
+
+  if ( out_info )
+  {
+    resize->splits = splits;
+    resize->samplers = out_info;
+    resize->needs_rebuild = 0;
+    #ifdef STBIR_PROFILE
+      STBIR_MEMCPY( &out_info->profile, &profile_infod.profile, sizeof( out_info->profile ) );
+    #endif
+
+    // update anything that can be changed without recalcing samplers
+    stbir__update_info_from_resize( out_info, resize );
+
+    return splits;
+  }
+  // allocation/build failed: resize->samplers is left at 0
+  return 0;
+}
+
+void stbir_free_samplers( STBIR_RESIZE * resize )
+{ // releases the built samplers, if any; does nothing when none are attached
+  if ( resize->samplers )
+  {
+    stbir__free_internal_mem( resize->samplers );
+    resize->samplers = 0;
+    resize->called_alloc = 0;  // back to the state stbir__init_and_set_layout establishes
+  }
+}
+
+STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int splits )
+{ // (re)builds the samplers when missing or stale; returns 0 on failure, non-zero on success
+  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
+  {
+    if ( resize->samplers )
+      stbir_free_samplers( resize );
+    // mark the samplers as explicitly built so stbir_resize_extended does not free them after a resize
+    resize->called_alloc = 1;
+    return stbir__perform_build( resize, splits );  // split count actually used, or 0 on failure
+  }
+
+  STBIR_PROFILE_BUILD_CLEAR( resize->samplers );
+  // samplers were already valid: nothing rebuilt
+  return 1;
+}
+
+STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize )
+{ // convenience wrapper: build the samplers with a single split
+  return stbir_build_samplers_with_splits( resize, 1 );
+}
+
+STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize )
+{ // runs the whole resize, building samplers first if they are missing or stale; returns 0 on failure
+  int result;
+
+  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
+  {
+    int alloc_state = resize->called_alloc;  // remember allocated state
+
+    if ( resize->samplers )
+    {
+      stbir__free_internal_mem( resize->samplers );
+      resize->samplers = 0;
+    }
+    // stbir_build_samplers forces called_alloc to 1, so restore the caller's value on failure as well as success
+    result = stbir_build_samplers( resize );
+    resize->called_alloc = alloc_state;
+    if ( !result )
+      return 0;
+
+    // if build_samplers succeeded (above), but there are no samplers set, then
+    //   the area to stretch into was zero pixels, so don't do anything and return
+    //   success
+    if ( resize->samplers == 0 )
+      return 1;
+  }
+  else
+  {
+    // didn't build anything - clear it
+    STBIR_PROFILE_BUILD_CLEAR( resize->samplers );
+  }
+
+  // do resize
+  result = stbir__perform_resize( resize->samplers, 0, resize->splits );
+
+  // if we alloced, then free
+  if ( !resize->called_alloc )
+  {
+    stbir_free_samplers( resize );
+    resize->samplers = 0;
+  }
+
+  return result;
+}
+
+STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count )
+{ // resizes only splits [split_start, split_start + split_count); returns 0 on invalid arguments or missing samplers
+  STBIR_ASSERT( resize->samplers );
+
+  // if we're just doing the whole thing, call full
+  if ( ( split_start == -1 ) || ( ( split_start == 0 ) && ( split_count == resize->splits ) ) )
+    return stbir_resize_extended( resize );
+
+  // you **must** build samplers first when using split resize
+  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
+    return 0;
+  // the requested range must be non-empty and lie inside [0, resize->splits)
+  if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) )
+    return 0;
+
+  // do resize
+  return stbir__perform_resize( resize->samplers, split_start, split_count );
+}
+
+static int stbir__check_output_stuff( void ** ret_ptr, int * ret_pitch, void * output_pixels, int type_size, int output_w, int output_h, int output_stride_in_bytes, stbir_internal_pixel_layout pixel_layout )
+{ // validates the output description and allocates a buffer when output_pixels is 0; returns 1 on success, 0 on failure
+  size_t size;
+  int pitch;
+  void * ptr;
+  // tightly packed bytes per output row
+  pitch = output_w * type_size * stbir__pixel_channels[ pixel_layout ];
+  if ( pitch == 0 )
+    return 0;
+  // a stride of zero means "tightly packed"
+  if ( output_stride_in_bytes == 0 )
+    output_stride_in_bytes = pitch;
+  // a stride shorter than one row of pixels is rejected
+  if ( output_stride_in_bytes < pitch )
+    return 0;
+
+  size = (size_t)output_stride_in_bytes * (size_t)output_h;
+  if ( size == 0 )
+    return 0;
+  // outputs are written only from here on; on the early returns above *ret_ptr and *ret_pitch are left untouched
+  *ret_ptr = 0;
+  *ret_pitch = output_stride_in_bytes;
+
+  if ( output_pixels == 0 )
+  {
+    ptr = STBIR_MALLOC( size, 0 );
+    if ( ptr == 0 )
+      return 0;
+    // *ret_ptr is non-zero only when this function allocated; the reported pitch is then the tight pitch
+    *ret_ptr = ptr;
+    *ret_pitch = pitch;
+  }
+
+  return 1;
+}
+
+
+STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                          unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                          stbir_pixel_layout pixel_layout )
+{ // one-call resize using STBIR_TYPE_UINT8; returns the output buffer (allocated here when output_pixels is 0) or 0 on failure
+  STBIR_RESIZE resize;
+  unsigned char * optr;
+  int opitch;
+  // optr ends up non-zero only when the helper allocated the output buffer
+  if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( unsigned char ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
+    return 0;
+
+  stbir_resize_init( &resize,
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                     (optr) ? optr : output_pixels, output_w, output_h, opitch,
+                     pixel_layout, STBIR_TYPE_UINT8 );
+
+  if ( !stbir_resize_extended( &resize ) )
+  {
+    if ( optr )
+      STBIR_FREE( optr, 0 );  // only free what was allocated here, never the caller's buffer
+    return 0;
+  }
+
+  return (optr) ? optr : output_pixels;
+}
+
+STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                        unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                        stbir_pixel_layout pixel_layout )
+{ // one-call resize using STBIR_TYPE_UINT8_SRGB; returns the output buffer (allocated here when output_pixels is 0) or 0 on failure
+  STBIR_RESIZE resize;
+  unsigned char * optr;
+  int opitch;
+  // optr ends up non-zero only when the helper allocated the output buffer
+  if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( unsigned char ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
+    return 0;
+
+  stbir_resize_init( &resize,
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                     (optr) ? optr : output_pixels, output_w, output_h, opitch,
+                     pixel_layout, STBIR_TYPE_UINT8_SRGB );
+
+  if ( !stbir_resize_extended( &resize ) )
+  {
+    if ( optr )
+      STBIR_FREE( optr, 0 );  // only free what was allocated here, never the caller's buffer
+    return 0;
+  }
+
+  return (optr) ? optr : output_pixels;
+}
+
+
+STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                  float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                  stbir_pixel_layout pixel_layout )
+{ // one-call resize using STBIR_TYPE_FLOAT; returns the output buffer (allocated here when output_pixels is 0) or 0 on failure
+  STBIR_RESIZE resize;
+  float * optr;
+  int opitch;
+  // optr ends up non-zero only when the helper allocated the output buffer
+  if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( float ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
+    return 0;
+
+  stbir_resize_init( &resize,
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                     (optr) ? optr : output_pixels, output_w, output_h, opitch,
+                     pixel_layout, STBIR_TYPE_FLOAT );
+
+  if ( !stbir_resize_extended( &resize ) )
+  {
+    if ( optr )
+      STBIR_FREE( optr, 0 );  // only free what was allocated here, never the caller's buffer
+    return 0;
+  }
+
+  return (optr) ? optr : output_pixels;
+}
+
+
+STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                    void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                              stbir_pixel_layout pixel_layout, stbir_datatype data_type,
+                              stbir_edge edge, stbir_filter filter )
+{ // one-call resize with a caller-chosen datatype, edge mode and filter (the same edge/filter is used on both axes)
+  STBIR_RESIZE resize;
+  float * optr;
+  int opitch;
+  // optr ends up non-zero only when the helper allocated the output buffer
+  if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, stbir__type_size[data_type], output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
+    return 0;
+  // NOTE(review): unlike the typed wrappers, this passes output_stride_in_bytes rather than opitch (opitch is otherwise unused) - confirm intended
+  stbir_resize_init( &resize,
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                     (optr) ? optr : output_pixels, output_w, output_h, output_stride_in_bytes,
+                     pixel_layout, data_type );
+  // override the defaults that stbir_resize_init installed
+  resize.horizontal_edge = edge;
+  resize.vertical_edge = edge;
+  resize.horizontal_filter = filter;
+  resize.vertical_filter = filter;
+
+  if ( !stbir_resize_extended( &resize ) )
+  {
+    if ( optr )
+      STBIR_FREE( optr, 0 );  // only free what was allocated here, never the caller's buffer
+    return 0;
+  }
+
+  return (optr) ? optr : output_pixels;
+}
+
+#ifdef STBIR_PROFILE
+
+STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize )
+{ // copies the sampler-build profile counters into *info; resize->samplers is dereferenced without a null check
+  static char const * bdescriptions[6] = { "Building", "Allocating", "Horizontal sampler", "Vertical sampler", "Coefficient cleanup", "Coefficient piovot" } ; // NOTE(review): "piovot" looks like a typo for "pivot"
+  stbir__info* samp = resize->samplers;
+  int i;
+  // compile-time checks: each typedef gets a negative array size (a compile error) if its condition is false
+  typedef int testa[ (STBIR__ARRAY_SIZE( bdescriptions ) == (STBIR__ARRAY_SIZE( samp->profile.array )-1) )?1:-1];
+  typedef int testb[ (sizeof( samp->profile.array ) == (sizeof(samp->profile.named)) )?1:-1];
+  typedef int testc[ (sizeof( info->clocks ) >= (sizeof(samp->profile.named)) )?1:-1];
+  // array[0] is skipped here (testa requires the array to have one more entry than there are descriptions)
+  for( i = 0 ; i < STBIR__ARRAY_SIZE( bdescriptions ) ; i++)
+    info->clocks[i] = samp->profile.array[i+1];
+
+  info->total_clocks = samp->profile.named.total;
+  info->descriptions = bdescriptions;
+  info->count = STBIR__ARRAY_SIZE( bdescriptions );
+}
+
+STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize, int split_start, int split_count )
+{ // sums the per-split resize profile counters over a range of splits into *info; split_start == -1 selects every split
+  static char const * descriptions[7] = { "Looping", "Vertical sampling", "Horizontal sampling", "Scanline input", "Scanline output", "Alpha weighting", "Alpha unweighting" };
+  stbir__per_split_info * split_info;
+  int s, i;
+  // compile-time checks: each typedef gets a negative array size (a compile error) if its condition is false
+  typedef int testa[ (STBIR__ARRAY_SIZE( descriptions ) == (STBIR__ARRAY_SIZE( split_info->profile.array )-1) )?1:-1];
+  typedef int testb[ (sizeof( split_info->profile.array ) == (sizeof(split_info->profile.named)) )?1:-1];
+  typedef int testc[ (sizeof( info->clocks ) >= (sizeof(split_info->profile.named)) )?1:-1];
+
+  if ( split_start == -1 )
+  {
+    split_start = 0;
+    split_count = resize->samplers->splits;  // the count comes from the samplers, while the range check below uses resize->splits
+  }
+  // an invalid range yields an empty report (info->clocks is left untouched)
+  if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) )
+  {
+    info->total_clocks = 0;
+    info->descriptions = 0;
+    info->count = 0;
+    return;
+  }
+
+  split_info = resize->samplers->split_info + split_start;
+
+  // sum up the profile from all the splits
+  for( i = 0 ; i < STBIR__ARRAY_SIZE( descriptions ) ; i++ )
+  {
+    stbir_uint64 sum = 0;
+    for( s = 0 ; s < split_count ; s++ )
+      sum += split_info[s].profile.array[i+1];
+    info->clocks[i] = sum;
+  }
+  // NOTE(review): the total is read from the first split of the range only, not summed like the clocks above - confirm intended
+  info->total_clocks = split_info->profile.named.total;
+  info->descriptions = descriptions;
+  info->count = STBIR__ARRAY_SIZE( descriptions );
+}
+
+STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize )
+{ // profile report covering every split of the resize
+  stbir_resize_split_profile_info( info, resize, -1, 0 );  // -1 selects all splits; the count argument is then ignored
+}
+
+#endif // STBIR_PROFILE
+
+#undef STBIR_BGR  // remove the layout-name macros so they do not leak past the implementation section (presumably defined earlier in this file - confirm)
+#undef STBIR_1CHANNEL
+#undef STBIR_2CHANNEL
+#undef STBIR_RGB
+#undef STBIR_RGBA
+#undef STBIR_4CHANNEL
+#undef STBIR_BGRA
+#undef STBIR_ARGB
+#undef STBIR_ABGR
+#undef STBIR_RA
+#undef STBIR_AR
+#undef STBIR_RGBA_PM
+#undef STBIR_BGRA_PM
+#undef STBIR_ARGB_PM
+#undef STBIR_ABGR_PM
+#undef STBIR_RA_PM
+#undef STBIR_AR_PM
+
+#endif // STB_IMAGE_RESIZE_IMPLEMENTATION
+
+#else  // STB_IMAGE_RESIZE_HORIZONTALS&STB_IMAGE_RESIZE_DO_VERTICALS
+
+// we reinclude the header file to define all the horizontal functions
+//   specializing each function for the number of coeffs is 20-40% faster *OVERALL*
+
+// by including the header file again this way, we can still debug the functions
+
+#define STBIR_strs_join2( start, mid, end ) start##mid##end  // pastes three tokens together exactly as written
+#define STBIR_strs_join1( start, mid, end ) STBIR_strs_join2( start, mid, end )  // extra level so macro arguments are expanded before pasting
+// four-token variants of the two macros above
+#define STBIR_strs_join24( start, mid1, mid2, end ) start##mid1##mid2##end
+#define STBIR_strs_join14( start, mid1, mid2, end ) STBIR_strs_join24( start, mid1, mid2, end )
+
+#ifdef STB_IMAGE_RESIZE_DO_CODERS
+
+// Every coder function below is named through STBIR__CODER_NAME: when the includer
+//   defines stbir__decode_suffix, the function name becomes name_<suffix>, otherwise
+//   the plain name is used.
+#ifdef stbir__decode_suffix
+#define STBIR__CODER_NAME( name ) STBIR_strs_join1( name, _, stbir__decode_suffix )
+#else
+#define STBIR__CODER_NAME( name ) name
+#endif
+
+// With stbir__decode_swizzle defined, the flip/unflip macros build the name of a
+//   shuffle helper by pasting the four order tokens onto stbir__simdf_0123to (the
+//   8-wide form pastes them twice onto stbir__simdf8_0123to) and call it as (reg, reg).
+//   NOTE(review): in that case stbir__decode_order0..3 / stbir__encode_order0..3 are
+//   not defined here - presumably the includer supplies them; confirm.
+#ifdef stbir__decode_swizzle
+#define stbir__decode_simdf8_flip(reg) STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( stbir__simdf8_0123to,stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3),stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3)(reg, reg)
+#define stbir__decode_simdf4_flip(reg) STBIR_strs_join1( STBIR_strs_join1( stbir__simdf_0123to,stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3)(reg, reg)
+#define stbir__encode_simdf8_unflip(reg) STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( stbir__simdf8_0123to,stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3),stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3)(reg, reg)
+#define stbir__encode_simdf4_unflip(reg) STBIR_strs_join1( STBIR_strs_join1( stbir__simdf_0123to,stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3)(reg, reg)
+#else
+// no swizzle: identity channel order, and the flip/unflip macros expand to nothing
+#define stbir__decode_order0 0
+#define stbir__decode_order1 1
+#define stbir__decode_order2 2
+#define stbir__decode_order3 3
+#define stbir__encode_order0 0
+#define stbir__encode_order1 1
+#define stbir__encode_order2 2
+#define stbir__encode_order3 3
+#define stbir__decode_simdf8_flip(reg)
+#define stbir__decode_simdf4_flip(reg)
+#define stbir__encode_simdf8_unflip(reg)
+#define stbir__encode_simdf4_unflip(reg)
+#endif
+
+// width-agnostic alias for the encode unflip, picked by whether STBIR_SIMD8 is defined
+#ifdef STBIR_SIMD8
+#define stbir__encode_simdfX_unflip  stbir__encode_simdf8_unflip
+#else
+#define stbir__encode_simdfX_unflip  stbir__encode_simdf4_unflip
+#endif
+
+// Decodes `width_times_channels` 8-bit samples from `inputp` into floats at `decodep`,
+//   multiplying each by stbir__max_uint8_as_float_inverted and reading channels in
+//   stbir__decode_order0..3 order.
+//   With STBIR_SIMD and at least 16 samples, the whole row is done 16 samples at a
+//   time; a final partial block is handled by stepping back so that the last 16
+//   samples are (re)decoded as one full block.
+static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float * decode_end = (float*) decode + width_times_channels;
+  unsigned char const * input = (unsigned char const*)inputp;
+
+  #ifdef STBIR_SIMD
+  if ( width_times_channels >= 16 )
+  {
+    // computed only after the size check: for rows shorter than 16 samples this
+    //   pointer would lie before the start of the input buffer, which is not a
+    //   valid pointer to form (the encoders compute theirs inside the check too)
+    unsigned char const * end_input_m16 = input + width_times_channels - 16;
+    decode_end -= 16;
+    STBIR_NO_UNROLL_LOOP_START_INF_FOR
+    for(;;)
+    {
+      #ifdef STBIR_SIMD8
+      stbir__simdi i; stbir__simdi8 o0,o1;
+      stbir__simdf8 of0, of1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi8_expand_u8_to_u32( o0, o1, i );
+      stbir__simdi8_convert_i32_to_float( of0, o0 );
+      stbir__simdi8_convert_i32_to_float( of1, o1 );
+      stbir__simdf8_mult( of0, of0, STBIR_max_uint8_as_float_inverted8);
+      stbir__simdf8_mult( of1, of1, STBIR_max_uint8_as_float_inverted8);
+      stbir__decode_simdf8_flip( of0 );
+      stbir__decode_simdf8_flip( of1 );
+      stbir__simdf8_store( decode + 0, of0 );
+      stbir__simdf8_store( decode + 8, of1 );
+      #else
+      stbir__simdi i, o0, o1, o2, o3;
+      stbir__simdf of0, of1, of2, of3;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi_expand_u8_to_u32( o0,o1,o2,o3,i);
+      stbir__simdi_convert_i32_to_float( of0, o0 );
+      stbir__simdi_convert_i32_to_float( of1, o1 );
+      stbir__simdi_convert_i32_to_float( of2, o2 );
+      stbir__simdi_convert_i32_to_float( of3, o3 );
+      stbir__simdf_mult( of0, of0, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
+      stbir__simdf_mult( of1, of1, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
+      stbir__simdf_mult( of2, of2, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
+      stbir__simdf_mult( of3, of3, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
+      stbir__decode_simdf4_flip( of0 );
+      stbir__decode_simdf4_flip( of1 );
+      stbir__decode_simdf4_flip( of2 );
+      stbir__decode_simdf4_flip( of3 );
+      stbir__simdf_store( decode + 0,  of0 );
+      stbir__simdf_store( decode + 4,  of1 );
+      stbir__simdf_store( decode + 8,  of2 );
+      stbir__simdf_store( decode + 12, of3 );
+      #endif
+      decode += 16;
+      input += 16;
+      // decode_end was moved back by 16 above, so "<=" means a full block still fits
+      if ( decode <= decode_end )
+        continue;
+      // landed exactly on the real end: done
+      if ( decode == ( decode_end + 16 ) )
+        break;
+      decode = decode_end; // backup and do last couple
+      input = end_input_m16;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = ((float)(input[stbir__decode_order0])) * stbir__max_uint8_as_float_inverted;
+    decode[1-4] = ((float)(input[stbir__decode_order1])) * stbir__max_uint8_as_float_inverted;
+    decode[2-4] = ((float)(input[stbir__decode_order2])) * stbir__max_uint8_as_float_inverted;
+    decode[3-4] = ((float)(input[stbir__decode_order3])) * stbir__max_uint8_as_float_inverted;
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = ((float)(input[stbir__decode_order0])) * stbir__max_uint8_as_float_inverted;
+    #if stbir__coder_min_num >= 2
+    decode[1] = ((float)(input[stbir__decode_order1])) * stbir__max_uint8_as_float_inverted;
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = ((float)(input[stbir__decode_order2])) * stbir__max_uint8_as_float_inverted;
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+// Encodes `width_times_channels` floats from `encode` into 8-bit samples at `outputp`.
+//   The scalar path computes f * stbir__max_uint8_as_float + 0.5f, applies
+//   STBIR_CLAMP(f, 0, 255) and casts to unsigned char; channels are taken in
+//   stbir__encode_order0..3 order.
+//   NOTE(review): the SIMD paths presumably compute the same value through the
+//   madd/pack helpers - those helpers are defined outside this view; confirm.
+static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char *) outputp;
+  unsigned char * end_output = ( (unsigned char *) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  // wide path: two SIMD registers worth of floats per iteration
+  if ( width_times_channels >= stbir__simdfX_float_count*2 )
+  {
+    float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
+    end_output -= stbir__simdfX_float_count*2;
+    STBIR_NO_UNROLL_LOOP_START_INF_FOR
+    for(;;)
+    {
+      stbir__simdfX e0, e1;
+      stbir__simdi i;
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdfX_madd_mem( e0, STBIR_simd_point5X, STBIR_max_uint8_as_floatX, encode );
+      stbir__simdfX_madd_mem( e1, STBIR_simd_point5X, STBIR_max_uint8_as_floatX, encode+stbir__simdfX_float_count );
+      stbir__encode_simdfX_unflip( e0 );
+      stbir__encode_simdfX_unflip( e1 );
+      #ifdef STBIR_SIMD8
+      stbir__simdf8_pack_to_16bytes( i, e0, e1 );
+      stbir__simdi_store( output, i );
+      #else
+      stbir__simdf_pack_to_8bytes( i, e0, e1 );
+      stbir__simdi_store2( output, i );
+      #endif
+      encode += stbir__simdfX_float_count*2;
+      output += stbir__simdfX_float_count*2;
+      // end_output was moved back by one block above, so "<=" means a full block still fits
+      if ( output <= end_output )
+        continue;
+      // landed exactly on the real end: done
+      if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m8;
+    }
+    return;
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  STBIR_NO_UNROLL_LOOP_START
+  while( output <= end_output )
+  {
+    stbir__simdf e0;
+    stbir__simdi i0;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e0, encode );
+    stbir__simdf_madd( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), e0 );
+    stbir__encode_simdf4_unflip( e0 );
+    stbir__simdf_pack_to_8bytes( i0, e0, e0 );  // only use first 4
+    // the four packed bytes are written with a single int-sized store
+    *(int*)(output-4) = stbir__simdi_to_int( i0 );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( output < end_output )
+  {
+    stbir__simdf e0;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_uint8( e0 );
+    #if stbir__coder_min_num >= 2
+    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order1 ); output[1] = stbir__simdf_convert_float_to_uint8( e0 );
+    #endif
+    #if stbir__coder_min_num >= 3
+    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order2 ); output[2] = stbir__simdf_convert_float_to_uint8( e0 );
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+
+  #else
+
+  // scalar build: same two stages (blocks of four, then remnants) without SIMD helpers
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    float f;
+    f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[0-4] = (unsigned char)f;
+    f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[1-4] = (unsigned char)f;
+    f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[2-4] = (unsigned char)f;
+    f = encode[stbir__encode_order3] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[3-4] = (unsigned char)f;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( output < end_output )
+  {
+    float f;
+    STBIR_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[0] = (unsigned char)f;
+    #if stbir__coder_min_num >= 2
+    f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[1] = (unsigned char)f;
+    #endif
+    #if stbir__coder_min_num >= 3
+    f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[2] = (unsigned char)f;
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+  #endif
+}
+
+// Decodes `width_times_channels` 8-bit samples from `inputp` into floats at `decodep`
+//   without scaling (each float is simply the byte value), reading channels in
+//   stbir__decode_order0..3 order.
+//   With STBIR_SIMD and at least 16 samples, the whole row is done 16 samples at a
+//   time; a final partial block is handled by stepping back so that the last 16
+//   samples are (re)decoded as one full block.
+static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float * decode_end = (float*) decode + width_times_channels;
+  unsigned char const * input = (unsigned char const*)inputp;
+
+  #ifdef STBIR_SIMD
+  if ( width_times_channels >= 16 )
+  {
+    // computed only after the size check: for rows shorter than 16 samples this
+    //   pointer would lie before the start of the input buffer, which is not a
+    //   valid pointer to form (the encoders compute theirs inside the check too)
+    unsigned char const * end_input_m16 = input + width_times_channels - 16;
+    decode_end -= 16;
+    STBIR_NO_UNROLL_LOOP_START_INF_FOR
+    for(;;)
+    {
+      #ifdef STBIR_SIMD8
+      stbir__simdi i; stbir__simdi8 o0,o1;
+      stbir__simdf8 of0, of1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi8_expand_u8_to_u32( o0, o1, i );
+      stbir__simdi8_convert_i32_to_float( of0, o0 );
+      stbir__simdi8_convert_i32_to_float( of1, o1 );
+      stbir__decode_simdf8_flip( of0 );
+      stbir__decode_simdf8_flip( of1 );
+      stbir__simdf8_store( decode + 0, of0 );
+      stbir__simdf8_store( decode + 8, of1 );
+      #else
+      stbir__simdi i, o0, o1, o2, o3;
+      stbir__simdf of0, of1, of2, of3;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi_expand_u8_to_u32( o0,o1,o2,o3,i);
+      stbir__simdi_convert_i32_to_float( of0, o0 );
+      stbir__simdi_convert_i32_to_float( of1, o1 );
+      stbir__simdi_convert_i32_to_float( of2, o2 );
+      stbir__simdi_convert_i32_to_float( of3, o3 );
+      stbir__decode_simdf4_flip( of0 );
+      stbir__decode_simdf4_flip( of1 );
+      stbir__decode_simdf4_flip( of2 );
+      stbir__decode_simdf4_flip( of3 );
+      stbir__simdf_store( decode + 0,  of0 );
+      stbir__simdf_store( decode + 4,  of1 );
+      stbir__simdf_store( decode + 8,  of2 );
+      stbir__simdf_store( decode + 12, of3 );
+      #endif
+      decode += 16;
+      input += 16;
+      // decode_end was moved back by 16 above, so "<=" means a full block still fits
+      if ( decode <= decode_end )
+        continue;
+      // landed exactly on the real end: done
+      if ( decode == ( decode_end + 16 ) )
+        break;
+      decode = decode_end; // backup and do last couple
+      input = end_input_m16;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = ((float)(input[stbir__decode_order0]));
+    decode[1-4] = ((float)(input[stbir__decode_order1]));
+    decode[2-4] = ((float)(input[stbir__decode_order2]));
+    decode[3-4] = ((float)(input[stbir__decode_order3]));
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = ((float)(input[stbir__decode_order0]));
+    #if stbir__coder_min_num >= 2
+    decode[1] = ((float)(input[stbir__decode_order1]));
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = ((float)(input[stbir__decode_order2]));
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+// Encodes `width_times_channels` floats from `encode` into 8-bit samples at `outputp`
+//   without scaling: the scalar path computes f + 0.5f, applies STBIR_CLAMP(f, 0, 255)
+//   and casts to unsigned char; channels are taken in stbir__encode_order0..3 order.
+//   Unlike the scaled encoder, the final remnant loop is shared by the SIMD and
+//   scalar builds (it sits after the STBIR_SIMD conditional).
+static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char *) outputp;
+  unsigned char * end_output = ( (unsigned char *) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  // wide path: two SIMD registers worth of floats per iteration
+  if ( width_times_channels >= stbir__simdfX_float_count*2 )
+  {
+    float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
+    end_output -= stbir__simdfX_float_count*2;
+    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
+    for(;;)
+    {
+      stbir__simdfX e0, e1;
+      stbir__simdi i;
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdfX_add_mem( e0, STBIR_simd_point5X, encode );
+      stbir__simdfX_add_mem( e1, STBIR_simd_point5X, encode+stbir__simdfX_float_count );
+      stbir__encode_simdfX_unflip( e0 );
+      stbir__encode_simdfX_unflip( e1 );
+      #ifdef STBIR_SIMD8
+      stbir__simdf8_pack_to_16bytes( i, e0, e1 );
+      stbir__simdi_store( output, i );
+      #else
+      stbir__simdf_pack_to_8bytes( i, e0, e1 );
+      stbir__simdi_store2( output, i );
+      #endif
+      encode += stbir__simdfX_float_count*2;
+      output += stbir__simdfX_float_count*2;
+      // end_output was moved back by one block above, so "<=" means a full block still fits
+      if ( output <= end_output )
+        continue;
+      // landed exactly on the real end: done
+      if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m8;
+    }
+    return;
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  STBIR_NO_UNROLL_LOOP_START
+  while( output <= end_output )
+  {
+    stbir__simdf e0;
+    stbir__simdi i0;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e0, encode );
+    stbir__simdf_add( e0, STBIR__CONSTF(STBIR_simd_point5), e0 );
+    stbir__encode_simdf4_unflip( e0 );
+    stbir__simdf_pack_to_8bytes( i0, e0, e0 );  // only use first 4
+    // the four packed bytes are written with a single int-sized store
+    *(int*)(output-4) = stbir__simdi_to_int( i0 );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  #else
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    float f;
+    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 255); output[0-4] = (unsigned char)f;
+    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 255); output[1-4] = (unsigned char)f;
+    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 255); output[2-4] = (unsigned char)f;
+    f = encode[stbir__encode_order3] + 0.5f; STBIR_CLAMP(f, 0, 255); output[3-4] = (unsigned char)f;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( output < end_output )
+  {
+    float f;
+    STBIR_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 255); output[0] = (unsigned char)f;
+    #if stbir__coder_min_num >= 2
+    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 255); output[1] = (unsigned char)f;
+    #endif
+    #if stbir__coder_min_num >= 3
+    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 255); output[2] = (unsigned char)f;
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+}
+
+// Decodes `width_times_channels` 8-bit samples from `inputp` into floats at `decodep`
+//   by looking every byte up in stbir__srgb_uchar_to_linear_float; channels are read
+//   in stbir__decode_order0..3 order.
+static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) dst = decodep;
+  float const * dst_end = (float*) dst + width_times_channels;
+  unsigned char const * src = (unsigned char const *)inputp;
+
+  // whole groups of four first (not for 3-channel coders, which don't divide cleanly by four)
+  #if stbir__coder_min_num != 3
+  while( ( dst_end - dst ) >= 4 )
+  {
+    dst[0] = stbir__srgb_uchar_to_linear_float[ src[ stbir__decode_order0 ] ];
+    dst[1] = stbir__srgb_uchar_to_linear_float[ src[ stbir__decode_order1 ] ];
+    dst[2] = stbir__srgb_uchar_to_linear_float[ src[ stbir__decode_order2 ] ];
+    dst[3] = stbir__srgb_uchar_to_linear_float[ src[ stbir__decode_order3 ] ];
+    src += 4;
+    dst += 4;
+  }
+  #endif
+
+  // then the leftovers, one stbir__coder_min_num sized group per iteration
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( dst < dst_end )
+  {
+    STBIR_NO_UNROLL(dst);
+    dst[0] = stbir__srgb_uchar_to_linear_float[ src[ stbir__decode_order0 ] ];
+    #if stbir__coder_min_num >= 2
+    dst[1] = stbir__srgb_uchar_to_linear_float[ src[ stbir__decode_order1 ] ];
+    #endif
+    #if stbir__coder_min_num >= 3
+    dst[2] = stbir__srgb_uchar_to_linear_float[ src[ stbir__decode_order2 ] ];
+    #endif
+    src += stbir__coder_min_num;
+    dst += stbir__coder_min_num;
+  }
+  #endif
+}
+
+// Helper macros for the SIMD sRGB encoders below.
+//   NOTE(review): the first two expand to several statements with no enclosing
+//   braces, so they must not be used as the body of an unbraced if/else/loop.
+
+// clamps f between the STBIR_almost_zero and STBIR_almost_one constants, then puts
+//   the raw bits of f shifted right by 20 into i
+#define stbir__min_max_shift20( i, f ) \
+    stbir__simdf_max( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_zero )) ); \
+    stbir__simdf_min( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_one  )) ); \
+    stbir__simdi_32shr( i, stbir_simdi_castf( f ), 20 );
+
+// linear (non-sRGB) channel: multiply-add with STBIR_max_uint8_as_float and
+//   STBIR_simd_point5, clamp between zero and STBIR_max_uint8_as_float, convert to int32 in i
+//   NOTE(review): presumably f = f * max_uint8 + 0.5 - the operand order of
+//   stbir__simdf_madd is defined outside this view; confirm.
+#define stbir__scale_and_convert( i, f ) \
+    stbir__simdf_madd( f, STBIR__CONSTF( STBIR_simd_point5 ), STBIR__CONSTF( STBIR_max_uint8_as_float ), f ); \
+    stbir__simdf_max( f, f, stbir__simdf_zeroP() ); \
+    stbir__simdf_min( f, f, STBIR__CONSTF( STBIR_max_uint8_as_float ) ); \
+    stbir__simdf_convert_float_to_i32( i, f );
+
+// combines the table entry already in i with bits taken from f (shifted right by 12,
+//   masked with STBIR_mastissa_mask, or'd with STBIR_topscale) via a 16-bit
+//   multiply-add, then shifts the result right by 16
+//   NOTE(review): presumably this is the interpolation step of the table-based
+//   float to sRGB8 conversion; confirm against the table definition.
+#define stbir__linear_to_srgb_finish( i, f ) \
+{ \
+    stbir__simdi temp;  \
+    stbir__simdi_32shr( temp, stbir_simdi_castf( f ), 12 ) ; \
+    stbir__simdi_and( temp, temp, STBIR__CONSTI(STBIR_mastissa_mask) ); \
+    stbir__simdi_or( temp, temp, STBIR__CONSTI(STBIR_topscale) ); \
+    stbir__simdi_16madd( i, i, temp ); \
+    stbir__simdi_32shr( i, i, 16 ); \
+}
+
+// Table lookups on SIMD integer registers, done one lane at a time: each register is
+//   copied into a stbir__simdi_u32 temporary, each of its four 32-bit lanes is
+//   replaced by table[lane], and the result is copied back into the register.
+//   The 2/3/4 variants only differ in how many registers they process.
+//   `table` is pasted textually, so callers may pass a parenthesised pointer expression.
+#define stbir__simdi_table_lookup2( v0,v1, table ) \
+{ \
+  stbir__simdi_u32 temp0,temp1; \
+  temp0.m128i_i128 = v0; \
+  temp1.m128i_i128 = v1; \
+  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
+  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
+  v0 = temp0.m128i_i128; \
+  v1 = temp1.m128i_i128; \
+}
+
+// three-register version of the lookup above
+#define stbir__simdi_table_lookup3( v0,v1,v2, table ) \
+{ \
+  stbir__simdi_u32 temp0,temp1,temp2; \
+  temp0.m128i_i128 = v0; \
+  temp1.m128i_i128 = v1; \
+  temp2.m128i_i128 = v2; \
+  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
+  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
+  temp2.m128i_u32[0] = table[temp2.m128i_i32[0]]; temp2.m128i_u32[1] = table[temp2.m128i_i32[1]]; temp2.m128i_u32[2] = table[temp2.m128i_i32[2]]; temp2.m128i_u32[3] = table[temp2.m128i_i32[3]]; \
+  v0 = temp0.m128i_i128; \
+  v1 = temp1.m128i_i128; \
+  v2 = temp2.m128i_i128; \
+}
+
+// four-register version of the lookup above
+#define stbir__simdi_table_lookup4( v0,v1,v2,v3, table ) \
+{ \
+  stbir__simdi_u32 temp0,temp1,temp2,temp3; \
+  temp0.m128i_i128 = v0; \
+  temp1.m128i_i128 = v1; \
+  temp2.m128i_i128 = v2; \
+  temp3.m128i_i128 = v3; \
+  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
+  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
+  temp2.m128i_u32[0] = table[temp2.m128i_i32[0]]; temp2.m128i_u32[1] = table[temp2.m128i_i32[1]]; temp2.m128i_u32[2] = table[temp2.m128i_i32[2]]; temp2.m128i_u32[3] = table[temp2.m128i_i32[3]]; \
+  temp3.m128i_u32[0] = table[temp3.m128i_i32[0]]; temp3.m128i_u32[1] = table[temp3.m128i_i32[1]]; temp3.m128i_u32[2] = table[temp3.m128i_i32[2]]; temp3.m128i_u32[3] = table[temp3.m128i_i32[3]]; \
+  v0 = temp0.m128i_i128; \
+  v1 = temp1.m128i_i128; \
+  v2 = temp2.m128i_i128; \
+  v3 = temp3.m128i_i128; \
+}
+
+// Encodes `width_times_channels` linear floats from `encode` into 8-bit sRGB samples
+//   at `outputp`. The scalar paths call stbir__linear_to_srgb_uchar per sample, taking
+//   channels in stbir__encode_order0..3 order; the SIMD path (16 or more samples)
+//   converts 16 samples per iteration through the fp32_to_srgb8_tab4 table.
+static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
+  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+
+  if ( width_times_channels >= 16 )
+  {
+    float const * end_encode_m16 = encode + width_times_channels - 16;
+    end_output -= 16;
+    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
+    for(;;)
+    {
+      stbir__simdf f0, f1, f2, f3;
+      stbir__simdi i0, i1, i2, i3;
+      STBIR_SIMD_NO_UNROLL(encode);
+
+      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
+
+      stbir__min_max_shift20( i0, f0 );
+      stbir__min_max_shift20( i1, f1 );
+      stbir__min_max_shift20( i2, f2 );
+      stbir__min_max_shift20( i3, f3 );
+
+      // NOTE(review): the table pointer is offset by (127-13)*8 entries, presumably so the
+      //   shifted float bits index it directly - confirm against the table definition
+      stbir__simdi_table_lookup4( i0, i1, i2, i3, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
+
+      stbir__linear_to_srgb_finish( i0, f0 );
+      stbir__linear_to_srgb_finish( i1, f1 );
+      stbir__linear_to_srgb_finish( i2, f2 );
+      stbir__linear_to_srgb_finish( i3, f3 );
+
+      // the joins paste "i" with each encode order token, so the channel reorder is
+      //   done by choosing which of i0..i3 is passed in each position
+      stbir__interleave_pack_and_store_16_u8( output,  STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
+
+      encode += 16;
+      output += 16;
+      // end_output was moved back by 16 above, so "<=" means a full block still fits
+      if ( output <= end_output )
+        continue;
+      // landed exactly on the real end: done
+      if ( output == ( end_output + 16 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m16;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  while ( output <= end_output )
+  {
+    STBIR_SIMD_NO_UNROLL(encode);
+
+    output[0-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] );
+    output[1-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order1] );
+    output[2-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order2] );
+    output[3-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order3] );
+
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( output < end_output )
+  {
+    STBIR_NO_UNROLL(encode);
+    output[0] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] );
+    #if stbir__coder_min_num >= 2
+    output[1] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order1] );
+    #endif
+    #if stbir__coder_min_num >= 3
+    output[2] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order2] );
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+}
+
+#if ( stbir__coder_min_num == 4 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )
+
+// Decodes four 8-bit samples per pixel from `inputp` into floats at `decodep`: the
+//   bytes selected by stbir__decode_order0..2 go through the
+//   stbir__srgb_uchar_to_linear_float table, the byte selected by stbir__decode_order3
+//   is scaled by stbir__max_uint8_as_float_inverted instead.
+//   The body is a do/while, so one pixel is always processed, whatever the count.
+static void STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) dst = decodep;
+  float const * dst_end = (float*) dst + width_times_channels;
+  unsigned char const * src = (unsigned char const *)inputp;
+
+  do
+  {
+    // three table-converted channels
+    dst[0] = stbir__srgb_uchar_to_linear_float[ src[stbir__decode_order0] ];
+    dst[1] = stbir__srgb_uchar_to_linear_float[ src[stbir__decode_order1] ];
+    dst[2] = stbir__srgb_uchar_to_linear_float[ src[stbir__decode_order2] ];
+
+    // one linearly scaled channel
+    dst[3] = ( (float) src[stbir__decode_order3] ) * stbir__max_uint8_as_float_inverted;
+
+    dst += 4;
+    src += 4;
+  } while( dst < dst_end );
+}
+
+
+// Encodes four floats per pixel from `encode` into 8-bit samples at `outputp`: three
+//   channels are converted with the linear-to-sRGB conversion, the fourth is scaled by
+//   stbir__max_uint8_as_float, rounded with +0.5 and clamped with STBIR_CLAMP(f, 0, 255).
+//   The scalar loop is a do/while, so one pixel is always written.
+static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
+  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+
+  if ( width_times_channels >= 16 )
+  {
+    float const * end_encode_m16 = encode + width_times_channels - 16;
+    end_output -= 16;
+    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
+    for(;;)
+    {
+      stbir__simdf f0, f1, f2, f3;
+      stbir__simdi i0, i1, i2, i3;
+
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
+
+      // f0..f2 take the sRGB table route, f3 takes the linear scale-and-convert route
+      stbir__min_max_shift20( i0, f0 );
+      stbir__min_max_shift20( i1, f1 );
+      stbir__min_max_shift20( i2, f2 );
+      stbir__scale_and_convert( i3, f3 );
+
+      stbir__simdi_table_lookup3( i0, i1, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
+
+      stbir__linear_to_srgb_finish( i0, f0 );
+      stbir__linear_to_srgb_finish( i1, f1 );
+      stbir__linear_to_srgb_finish( i2, f2 );
+
+      // the joins paste "i" with each encode order token, so the channel reorder is
+      //   done by choosing which of i0..i3 is passed in each position
+      stbir__interleave_pack_and_store_16_u8( output,  STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
+
+      output += 16;
+      encode += 16;
+
+      // end_output was moved back by 16 above, so "<=" means a full block still fits
+      if ( output <= end_output )
+        continue;
+      // landed exactly on the real end: done
+      if ( output == ( end_output + 16 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m16;
+    }
+    return;
+  }
+  #endif
+
+  // NOTE(review): this loop scatters with the *decode* order on the output index
+  //   (output[decode_order_n] = encode[n]) where the SIMD path above uses the encode
+  //   order; the two agree only if the decode and encode orders are inverse
+  //   permutations of each other - confirm for every swizzle that includes this file.
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do {
+    float f;
+    STBIR_SIMD_NO_UNROLL(encode);
+
+    output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] );
+    output[stbir__decode_order1] = stbir__linear_to_srgb_uchar( encode[1] );
+    output[stbir__decode_order2] = stbir__linear_to_srgb_uchar( encode[2] );
+
+    f = encode[3] * stbir__max_uint8_as_float + 0.5f;
+    STBIR_CLAMP(f, 0, 255);
+    output[stbir__decode_order3] = (unsigned char) f;
+
+    output += 4;
+    encode += 4;
+  } while( output < end_output );
+}
+
+#endif
+
+#if ( stbir__coder_min_num == 2 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )
+
+// Decodes two 8-bit samples per pixel from `inputp` into floats at `decodep`: the byte
+//   selected by stbir__decode_order0 goes through the stbir__srgb_uchar_to_linear_float
+//   table, the byte selected by stbir__decode_order1 is scaled by
+//   stbir__max_uint8_as_float_inverted.
+//   Pixels are handled two at a time (four samples); an odd trailing pixel is handled
+//   separately at the end.
+static void STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float const * decode_end = (float*) decode + width_times_channels;
+  unsigned char const * input = (unsigned char const *)inputp;
+
+  // decode runs four ahead so the loop test is "do four more samples fit?"
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    decode[0-4] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
+    decode[1-4] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
+    decode[2-4] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0+2] ];
+    decode[3-4] = ( (float) input[stbir__decode_order1+2] ) * stbir__max_uint8_as_float_inverted;
+    input += 4;
+    decode += 4;
+  }
+  decode -= 4;
+
+  // odd pixel count: one two-sample pixel is left over
+  if( decode < decode_end )
+  {
+    // the table must be indexed by the input byte, exactly as in the loop above
+    //   (this previously used the order constant itself as the table index)
+    decode[0] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
+    decode[1] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
+  }
+}
+
+// Encodes two floats per pixel from `encode` into 8-bit samples at `outputp`: one
+//   channel is converted with the linear-to-sRGB conversion, the other is scaled by
+//   stbir__max_uint8_as_float, rounded with +0.5 and clamped with STBIR_CLAMP(f, 0, 255).
+//   The scalar loop is a do/while, so one pixel is always written.
+static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
+  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+
+  if ( width_times_channels >= 16 )
+  {
+    float const * end_encode_m16 = encode + width_times_channels - 16;
+    end_output -= 16;
+    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
+    for(;;)
+    {
+      stbir__simdf f0, f1, f2, f3;
+      stbir__simdi i0, i1, i2, i3;
+
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
+
+      // f0 and f2 take the sRGB table route, f1 and f3 the linear scale-and-convert route
+      stbir__min_max_shift20( i0, f0 );
+      stbir__scale_and_convert( i1, f1 );
+      stbir__min_max_shift20( i2, f2 );
+      stbir__scale_and_convert( i3, f3 );
+
+      stbir__simdi_table_lookup2( i0, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
+
+      stbir__linear_to_srgb_finish( i0, f0 );
+      stbir__linear_to_srgb_finish( i2, f2 );
+
+      // the joins paste "i" with each encode order token, so the channel reorder is
+      //   done by choosing which of i0..i3 is passed in each position
+      stbir__interleave_pack_and_store_16_u8( output,  STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
+
+      output += 16;
+      encode += 16;
+      // end_output was moved back by 16 above, so "<=" means a full block still fits
+      if ( output <= end_output )
+        continue;
+      // landed exactly on the real end: done
+      if ( output == ( end_output + 16 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m16;
+    }
+    return;
+  }
+  #endif
+
+  // NOTE(review): as in the four-channel version, the output index uses the *decode*
+  //   order; this matches the SIMD path only if the decode and encode orders are
+  //   inverse permutations of each other - confirm.
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do {
+    float f;
+    STBIR_SIMD_NO_UNROLL(encode);
+
+    output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] );
+
+    f = encode[1] * stbir__max_uint8_as_float + 0.5f;
+    STBIR_CLAMP(f, 0, 255);
+    output[stbir__decode_order1] = (unsigned char) f;
+
+    output += 2;
+    encode += 2;
+  } while( output < end_output );
+}
+
+#endif
+
+static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp )
+{ // Converts width_times_channels unsigned 16-bit samples at inputp into floats at decodep, multiplying each by the inverted max-uint16 constant.
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;                // write cursor into the float output
+  float * decode_end = (float*) decode + width_times_channels;    // one past the last float to write
+  unsigned short const * input = (unsigned short const *)inputp;  // read cursor over the 16-bit source
+
+  #ifdef STBIR_SIMD // wide path: 8 samples per iteration, only when at least 8 samples exist
+  if ( width_times_channels >= 8 )
+  {
+    unsigned short const * end_input_m8 = input + width_times_channels - 8; // start of the final 8 inputs; formed inside the guard so it can never point before inputp
+    decode_end -= 8; // now the last position at which a full 8-float store still fits
+    STBIR_NO_UNROLL_LOOP_START_INF_FOR
+    for(;;)
+    {
+      #ifdef STBIR_SIMD8
+      stbir__simdi i; stbir__simdi8 o;
+      stbir__simdf8 of;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi8_expand_u16_to_u32( o, i );
+      stbir__simdi8_convert_i32_to_float( of, o );
+      stbir__simdf8_mult( of, of, STBIR_max_uint16_as_float_inverted8);
+      stbir__decode_simdf8_flip( of ); // per-coder macro; presumably applies this coder's channel reordering
+      stbir__simdf8_store( decode + 0, of );
+      #else
+      stbir__simdi i, o0, o1;
+      stbir__simdf of0, of1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi_expand_u16_to_u32( o0,o1,i );
+      stbir__simdi_convert_i32_to_float( of0, o0 );
+      stbir__simdi_convert_i32_to_float( of1, o1 );
+      stbir__simdf_mult( of0, of0, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted) );
+      stbir__simdf_mult( of1, of1, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted));
+      stbir__decode_simdf4_flip( of0 ); // per-coder macro; presumably applies this coder's channel reordering
+      stbir__decode_simdf4_flip( of1 );
+      stbir__simdf_store( decode + 0,  of0 );
+      stbir__simdf_store( decode + 4,  of1 );
+      #endif
+      decode += 8;
+      input += 8;
+      if ( decode <= decode_end ) // another full group of 8 still fits
+        continue;
+      if ( decode == ( decode_end + 8 ) ) // landed exactly on the original end: finished
+        break;
+      decode = decode_end; // backup and do last couple (the final store overlaps values already written and ends exactly at the end)
+      input = end_input_m8;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4; // bias the cursor by 4 so "decode <= decode_end" means four more floats fit; stores index back with [k-4]
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = ((float)(input[stbir__decode_order0])) * stbir__max_uint16_as_float_inverted;
+    decode[1-4] = ((float)(input[stbir__decode_order1])) * stbir__max_uint16_as_float_inverted;
+    decode[2-4] = ((float)(input[stbir__decode_order2])) * stbir__max_uint16_as_float_inverted;
+    decode[3-4] = ((float)(input[stbir__decode_order3])) * stbir__max_uint16_as_float_inverted;
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4; // remove the bias again
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = ((float)(input[stbir__decode_order0])) * stbir__max_uint16_as_float_inverted;
+    #if stbir__coder_min_num >= 2
+    decode[1] = ((float)(input[stbir__decode_order1])) * stbir__max_uint16_as_float_inverted;
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = ((float)(input[stbir__decode_order2])) * stbir__max_uint16_as_float_inverted;
+    #endif
+    decode += stbir__coder_min_num; // advance by one group of stbir__coder_min_num values
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+
+static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * outputp, int width_times_channels, float const * encode )
+{ // Converts width_times_channels floats at encode into unsigned 16-bit values at outputp: multiply by the max-uint16 constant, add 0.5, then narrow.
+  unsigned short STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned short*) outputp;     // write cursor
+  unsigned short * end_output = ( (unsigned short*) output ) + width_times_channels;  // one past the last value to write
+
+  #ifdef STBIR_SIMD
+  {
+    if ( width_times_channels >= stbir__simdfX_float_count*2 ) // wide path consumes two float vectors per iteration
+    {
+      float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2; // source position matching the final full-width store
+      end_output -= stbir__simdfX_float_count*2; // last position at which a full-width store still fits
+      STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
+      for(;;)
+      {
+        stbir__simdfX e0, e1;
+        stbir__simdiX i;
+        STBIR_SIMD_NO_UNROLL(encode);
+        stbir__simdfX_madd_mem( e0, STBIR_simd_point5X, STBIR_max_uint16_as_floatX, encode );
+        stbir__simdfX_madd_mem( e1, STBIR_simd_point5X, STBIR_max_uint16_as_floatX, encode+stbir__simdfX_float_count );
+        stbir__encode_simdfX_unflip( e0 ); // per-coder macro; presumably restores the output channel order
+        stbir__encode_simdfX_unflip( e1 );
+        stbir__simdfX_pack_to_words( i, e0, e1 );
+        stbir__simdiX_store( output, i );
+        encode += stbir__simdfX_float_count*2;
+        output += stbir__simdfX_float_count*2;
+        if ( output <= end_output ) // another full-width group still fits
+          continue;
+        if ( output == ( end_output + stbir__simdfX_float_count*2 ) ) // landed exactly on the original end: finished
+          break;
+        output = end_output;     // backup and do last couple
+        encode = end_encode_m8;
+      }
+      return;
+    }
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4; // bias the cursor by 4 so "output <= end_output" means four more values fit; the store targets output-4
+  STBIR_NO_UNROLL_LOOP_START
+  while( output <= end_output )
+  {
+    stbir__simdf e;
+    stbir__simdi i;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e, encode );
+    stbir__simdf_madd( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), e );
+    stbir__encode_simdf4_unflip( e );
+    stbir__simdf_pack_to_8words( i, e, e );  // only use first 4
+    stbir__simdi_store2( output-4, i );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4; // remove the bias again
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( output < end_output )
+  {
+    stbir__simdf e;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_short( e );
+    #if stbir__coder_min_num >= 2
+    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order1 ); output[1] = stbir__simdf_convert_float_to_short( e );
+    #endif
+    #if stbir__coder_min_num >= 3
+    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order2 ); output[2] = stbir__simdf_convert_float_to_short( e );
+    #endif
+    output += stbir__coder_min_num; // advance by one group of stbir__coder_min_num values
+    encode += stbir__coder_min_num;
+  }
+  #endif
+
+  #else // no STBIR_SIMD: same two stages in plain C, clamping to 0..65535 before the cast
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4; // bias the cursor by 4 so "output <= end_output" means four more values fit; stores index back with [k-4]
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  while( output <= end_output )
+  {
+    float f;
+    STBIR_SIMD_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0-4] = (unsigned short)f;
+    f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1-4] = (unsigned short)f;
+    f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2-4] = (unsigned short)f;
+    f = encode[stbir__encode_order3] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[3-4] = (unsigned short)f;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4; // remove the bias again
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( output < end_output )
+  {
+    float f;
+    STBIR_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0] = (unsigned short)f;
+    #if stbir__coder_min_num >= 2
+    f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1] = (unsigned short)f;
+    #endif
+    #if stbir__coder_min_num >= 3
+    f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2] = (unsigned short)f;
+    #endif
+    output += stbir__coder_min_num; // advance by one group of stbir__coder_min_num values
+    encode += stbir__coder_min_num;
+  }
+  #endif
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp )
+{ // Converts width_times_channels unsigned 16-bit samples at inputp into floats at decodep with no scaling (the value is only widened to float).
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;                // write cursor into the float output
+  float * decode_end = (float*) decode + width_times_channels;    // one past the last float to write
+  unsigned short const * input = (unsigned short const *)inputp;  // read cursor over the 16-bit source
+
+  #ifdef STBIR_SIMD // wide path: 8 samples per iteration, only when at least 8 samples exist
+  if ( width_times_channels >= 8 )
+  {
+    unsigned short const * end_input_m8 = input + width_times_channels - 8; // start of the final 8 inputs; formed inside the guard so it can never point before inputp
+    decode_end -= 8; // now the last position at which a full 8-float store still fits
+    STBIR_NO_UNROLL_LOOP_START_INF_FOR
+    for(;;)
+    {
+      #ifdef STBIR_SIMD8
+      stbir__simdi i; stbir__simdi8 o;
+      stbir__simdf8 of;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi8_expand_u16_to_u32( o, i );
+      stbir__simdi8_convert_i32_to_float( of, o );
+      stbir__decode_simdf8_flip( of ); // per-coder macro; presumably applies this coder's channel reordering
+      stbir__simdf8_store( decode + 0, of );
+      #else
+      stbir__simdi i, o0, o1;
+      stbir__simdf of0, of1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi_expand_u16_to_u32( o0, o1, i );
+      stbir__simdi_convert_i32_to_float( of0, o0 );
+      stbir__simdi_convert_i32_to_float( of1, o1 );
+      stbir__decode_simdf4_flip( of0 ); // per-coder macro; presumably applies this coder's channel reordering
+      stbir__decode_simdf4_flip( of1 );
+      stbir__simdf_store( decode + 0,  of0 );
+      stbir__simdf_store( decode + 4,  of1 );
+      #endif
+      decode += 8;
+      input += 8;
+      if ( decode <= decode_end ) // another full group of 8 still fits
+        continue;
+      if ( decode == ( decode_end + 8 ) ) // landed exactly on the original end: finished
+        break;
+      decode = decode_end; // backup and do last couple (the final store overlaps values already written and ends exactly at the end)
+      input = end_input_m8;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4; // bias the cursor by 4 so "decode <= decode_end" means four more floats fit; stores index back with [k-4]
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = ((float)(input[stbir__decode_order0]));
+    decode[1-4] = ((float)(input[stbir__decode_order1]));
+    decode[2-4] = ((float)(input[stbir__decode_order2]));
+    decode[3-4] = ((float)(input[stbir__decode_order3]));
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4; // remove the bias again
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = ((float)(input[stbir__decode_order0]));
+    #if stbir__coder_min_num >= 2
+    decode[1] = ((float)(input[stbir__decode_order1]));
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = ((float)(input[stbir__decode_order2]));
+    #endif
+    decode += stbir__coder_min_num; // advance by one group of stbir__coder_min_num values
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int width_times_channels, float const * encode )
+{ // Converts width_times_channels floats at encode into unsigned 16-bit values at outputp: add 0.5 and narrow, with no scaling multiply.
+  unsigned short STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned short*) outputp;     // write cursor
+  unsigned short * end_output = ( (unsigned short*) output ) + width_times_channels;  // one past the last value to write
+
+  #ifdef STBIR_SIMD
+  {
+    if ( width_times_channels >= stbir__simdfX_float_count*2 ) // wide path consumes two float vectors per iteration
+    {
+      float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2; // source position matching the final full-width store
+      end_output -= stbir__simdfX_float_count*2; // last position at which a full-width store still fits
+      STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
+      for(;;)
+      {
+        stbir__simdfX e0, e1;
+        stbir__simdiX i;
+        STBIR_SIMD_NO_UNROLL(encode);
+        stbir__simdfX_add_mem( e0, STBIR_simd_point5X, encode );
+        stbir__simdfX_add_mem( e1, STBIR_simd_point5X, encode+stbir__simdfX_float_count );
+        stbir__encode_simdfX_unflip( e0 ); // per-coder macro; presumably restores the output channel order
+        stbir__encode_simdfX_unflip( e1 );
+        stbir__simdfX_pack_to_words( i, e0, e1 );
+        stbir__simdiX_store( output, i );
+        encode += stbir__simdfX_float_count*2;
+        output += stbir__simdfX_float_count*2;
+        if ( output <= end_output ) // another full-width group still fits
+          continue;
+        if ( output == ( end_output + stbir__simdfX_float_count*2 ) ) // landed exactly on the original end: finished
+          break;
+        output = end_output; // backup and do last couple
+        encode = end_encode_m8;
+      }
+      return;
+    }
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4; // bias the cursor by 4 so "output <= end_output" means four more values fit; the store targets output-4
+  STBIR_NO_UNROLL_LOOP_START
+  while( output <= end_output )
+  {
+    stbir__simdf e;
+    stbir__simdi i;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e, encode );
+    stbir__simdf_add( e, STBIR__CONSTF(STBIR_simd_point5), e );
+    stbir__encode_simdf4_unflip( e );
+    stbir__simdf_pack_to_8words( i, e, e );  // only use first 4
+    stbir__simdi_store2( output-4, i );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4; // remove the bias again
+  #endif
+
+  #else // no STBIR_SIMD: plain C blocks of 4, clamping to 0..65535 before the cast
+
+  // try to do blocks of 4 when you can
+  #if  stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4; // bias the cursor by 4 so "output <= end_output" means four more values fit; stores index back with [k-4]
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  while( output <= end_output )
+  {
+    float f;
+    STBIR_SIMD_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0-4] = (unsigned short)f;
+    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1-4] = (unsigned short)f;
+    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2-4] = (unsigned short)f;
+    f = encode[stbir__encode_order3] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[3-4] = (unsigned short)f;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4; // remove the bias again
+  #endif
+
+  #endif
+
+  // do the remnants (this scalar loop sits after the #endif, so it is shared by the SIMD and non-SIMD builds)
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( output < end_output )
+  {
+    float f;
+    STBIR_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0] = (unsigned short)f;
+    #if stbir__coder_min_num >= 2
+    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1] = (unsigned short)f;
+    #endif
+    #if stbir__coder_min_num >= 3
+    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2] = (unsigned short)f;
+    #endif
+    output += stbir__coder_min_num; // advance by one group of stbir__coder_min_num values
+    encode += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp )
+{ // Converts width_times_channels half-float (stbir__FP16) samples at inputp into floats at decodep.
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;               // write cursor into the float output
+  float * decode_end = (float*) decode + width_times_channels;   // one past the last float to write
+  stbir__FP16 const * input = (stbir__FP16 const *)inputp;       // read cursor over the half-float source
+
+  #ifdef STBIR_SIMD // wide path: the cursors advance 8 samples per iteration, only when at least 8 samples exist
+  if ( width_times_channels >= 8 )
+  {
+    stbir__FP16 const * end_input_m8 = input + width_times_channels - 8; // start of the final 8 inputs
+    decode_end -= 8; // now the last position at which a full 8-float store still fits
+    STBIR_NO_UNROLL_LOOP_START_INF_FOR
+    for(;;)
+    {
+      STBIR_NO_UNROLL(decode);
+
+      stbir__half_to_float_SIMD( decode, input );
+      #ifdef stbir__decode_swizzle // swizzling coders reload the just-written floats, flip them, and store them back
+      #ifdef STBIR_SIMD8
+      {
+        stbir__simdf8 of;
+        stbir__simdf8_load( of, decode );
+        stbir__decode_simdf8_flip( of );
+        stbir__simdf8_store( decode, of );
+      }
+      #else
+      {
+        stbir__simdf of0,of1;
+        stbir__simdf_load( of0, decode );
+        stbir__simdf_load( of1, decode+4 );
+        stbir__decode_simdf4_flip( of0 );
+        stbir__decode_simdf4_flip( of1 );
+        stbir__simdf_store( decode, of0 );
+        stbir__simdf_store( decode+4, of1 );
+      }
+      #endif
+      #endif
+      decode += 8;
+      input += 8;
+      if ( decode <= decode_end ) // another full group of 8 still fits
+        continue;
+      if ( decode == ( decode_end + 8 ) ) // landed exactly on the original end: finished
+        break;
+      decode = decode_end; // backup and do last couple (the final group overlaps values already written and ends exactly at the end)
+      input = end_input_m8;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4; // bias the cursor by 4 so "decode <= decode_end" means four more floats fit; stores index back with [k-4]
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = stbir__half_to_float(input[stbir__decode_order0]);
+    decode[1-4] = stbir__half_to_float(input[stbir__decode_order1]);
+    decode[2-4] = stbir__half_to_float(input[stbir__decode_order2]);
+    decode[3-4] = stbir__half_to_float(input[stbir__decode_order3]);
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4; // remove the bias again
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = stbir__half_to_float(input[stbir__decode_order0]);
+    #if stbir__coder_min_num >= 2
+    decode[1] = stbir__half_to_float(input[stbir__decode_order1]);
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = stbir__half_to_float(input[stbir__decode_order2]);
+    #endif
+    decode += stbir__coder_min_num; // advance by one group of stbir__coder_min_num values
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp, int width_times_channels, float const * encode )
+{ // Converts width_times_channels floats at encode into half-float (stbir__FP16) values at outputp.
+  stbir__FP16 STBIR_SIMD_STREAMOUT_PTR( * ) output = (stbir__FP16*) outputp;      // write cursor
+  stbir__FP16 * end_output = ( (stbir__FP16*) output ) + width_times_channels;   // one past the last value to write
+
+  #ifdef STBIR_SIMD // wide path: the cursors advance 8 samples per iteration, only when at least 8 samples exist
+  if ( width_times_channels >= 8 )
+  {
+    float const * end_encode_m8 = encode + width_times_channels - 8; // start of the final 8 source floats
+    end_output -= 8; // now the last position at which a full 8-value store still fits
+    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
+    for(;;)
+    {
+      STBIR_SIMD_NO_UNROLL(encode);
+      #ifdef stbir__decode_swizzle // swizzling coders unflip into a local vector first and convert from that copy
+      #ifdef STBIR_SIMD8
+      {
+        stbir__simdf8 of;
+        stbir__simdf8_load( of, encode );
+        stbir__encode_simdf8_unflip( of );
+        stbir__float_to_half_SIMD( output, (float*)&of );
+      }
+      #else
+      {
+        stbir__simdf of[2];
+        stbir__simdf_load( of[0], encode );
+        stbir__simdf_load( of[1], encode+4 );
+        stbir__encode_simdf4_unflip( of[0] );
+        stbir__encode_simdf4_unflip( of[1] );
+        stbir__float_to_half_SIMD( output, (float*)of );
+      }
+      #endif
+      #else // no swizzle: convert straight from the source floats
+      stbir__float_to_half_SIMD( output, encode );
+      #endif
+      encode += 8;
+      output += 8;
+      if ( output <= end_output ) // another full group of 8 still fits
+        continue;
+      if ( output == ( end_output + 8 ) ) // landed exactly on the original end: finished
+        break;
+      output = end_output; // backup and do last couple (the final group overlaps values already written and ends exactly at the end)
+      encode = end_encode_m8;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4; // bias the cursor by 4 so "output <= end_output" means four more values fit; stores index back with [k-4]
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  while( output <= end_output )
+  {
+    STBIR_SIMD_NO_UNROLL(output);
+    output[0-4] = stbir__float_to_half(encode[stbir__encode_order0]);
+    output[1-4] = stbir__float_to_half(encode[stbir__encode_order1]);
+    output[2-4] = stbir__float_to_half(encode[stbir__encode_order2]);
+    output[3-4] = stbir__float_to_half(encode[stbir__encode_order3]);
+    output += 4;
+    encode += 4;
+  }
+  output -= 4; // remove the bias again
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( output < end_output )
+  {
+    STBIR_NO_UNROLL(output);
+    output[0] = stbir__float_to_half(encode[stbir__encode_order0]);
+    #if stbir__coder_min_num >= 2
+    output[1] = stbir__float_to_half(encode[stbir__encode_order1]);
+    #endif
+    #if stbir__coder_min_num >= 3
+    output[2] = stbir__float_to_half(encode[stbir__encode_order2]);
+    #endif
+    output += stbir__coder_min_num; // advance by one group of stbir__coder_min_num values
+    encode += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp )
+{ // Copies width_times_channels floats from inputp to decodep: reordered per channel when stbir__decode_swizzle is defined, otherwise a plain memcpy.
+  #ifdef stbir__decode_swizzle
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;              // write cursor into the float output
+  float * decode_end = (float*) decode + width_times_channels;  // one past the last float to write
+  float const * input = (float const *)inputp;                  // read cursor over the float source
+
+  #ifdef STBIR_SIMD // wide path: 16 floats per iteration, only when at least 16 floats exist
+  if ( width_times_channels >= 16 )
+  {
+    float const * end_input_m16 = input + width_times_channels - 16; // start of the final 16 inputs
+    decode_end -= 16; // now the last position at which a full 16-float store still fits
+    STBIR_NO_UNROLL_LOOP_START_INF_FOR
+    for(;;)
+    {
+      STBIR_NO_UNROLL(decode);
+      #ifdef stbir__decode_swizzle // redundant: this whole branch is already inside the outer #ifdef stbir__decode_swizzle
+      #ifdef STBIR_SIMD8
+      {
+        stbir__simdf8 of0,of1;
+        stbir__simdf8_load( of0, input );
+        stbir__simdf8_load( of1, input+8 );
+        stbir__decode_simdf8_flip( of0 );
+        stbir__decode_simdf8_flip( of1 );
+        stbir__simdf8_store( decode, of0 );
+        stbir__simdf8_store( decode+8, of1 );
+      }
+      #else
+      {
+        stbir__simdf of0,of1,of2,of3;
+        stbir__simdf_load( of0, input );
+        stbir__simdf_load( of1, input+4 );
+        stbir__simdf_load( of2, input+8 );
+        stbir__simdf_load( of3, input+12 );
+        stbir__decode_simdf4_flip( of0 );
+        stbir__decode_simdf4_flip( of1 );
+        stbir__decode_simdf4_flip( of2 );
+        stbir__decode_simdf4_flip( of3 );
+        stbir__simdf_store( decode, of0 );
+        stbir__simdf_store( decode+4, of1 );
+        stbir__simdf_store( decode+8, of2 );
+        stbir__simdf_store( decode+12, of3 );
+      }
+      #endif
+      #endif
+      decode += 16;
+      input += 16;
+      if ( decode <= decode_end ) // another full group of 16 still fits
+        continue;
+      if ( decode == ( decode_end + 16 ) ) // landed exactly on the original end: finished
+        break;
+      decode = decode_end; // backup and do last couple (the final group overlaps values already written and ends exactly at the end)
+      input = end_input_m16;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4; // bias the cursor by 4 so "decode <= decode_end" means four more floats fit; stores index back with [k-4]
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = input[stbir__decode_order0];
+    decode[1-4] = input[stbir__decode_order1];
+    decode[2-4] = input[stbir__decode_order2];
+    decode[3-4] = input[stbir__decode_order3];
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4; // remove the bias again
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = input[stbir__decode_order0];
+    #if stbir__coder_min_num >= 2
+    decode[1] = input[stbir__decode_order1];
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = input[stbir__decode_order2];
+    #endif
+    decode += stbir__coder_min_num; // advance by one group of stbir__coder_min_num values
+    input += stbir__coder_min_num;
+  }
+  #endif
+
+  #else // no swizzle: the data is already in the right order, so copy it unless source and destination are the same buffer
+
+  if ( (void*)decodep != inputp )
+    STBIR_MEMCPY( decodep, inputp, width_times_channels * sizeof( float ) );
+
+  #endif
+}
+
+static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int width_times_channels, float const * encode )
+{ // Writes width_times_channels floats from encode to outputp, applying the optional STBIR_FLOAT_HIGH_CLAMP / STBIR_FLOAT_LOW_CLAMP limits and the coder's channel order.
+  #if !defined( STBIR_FLOAT_HIGH_CLAMP ) && !defined(STBIR_FLOAT_LOW_CLAMP) && !defined(stbir__decode_swizzle)
+  // Fast path: no clamp and no swizzle. The guard must test the same STBIR_FLOAT_LOW_CLAMP name the clamping code below uses, or a low-only clamp is skipped.
+  if ( (void*)outputp != (void*) encode ) // in-place encode needs no copy at all
+    STBIR_MEMCPY( outputp, encode, width_times_channels * sizeof( float ) );
+
+  #else
+
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = (float*) outputp;      // write cursor
+  float * end_output = ( (float*) output ) + width_times_channels;   // one past the last float to write
+
+  #ifdef STBIR_FLOAT_HIGH_CLAMP
+  #define stbir_scalar_hi_clamp( v ) if ( v > STBIR_FLOAT_HIGH_CLAMP ) v = STBIR_FLOAT_HIGH_CLAMP;
+  #else
+  #define stbir_scalar_hi_clamp( v )
+  #endif
+  #ifdef STBIR_FLOAT_LOW_CLAMP
+  #define stbir_scalar_lo_clamp( v ) if ( v < STBIR_FLOAT_LOW_CLAMP ) v = STBIR_FLOAT_LOW_CLAMP;
+  #else
+  #define stbir_scalar_lo_clamp( v )
+  #endif
+
+  #ifdef STBIR_SIMD
+
+  #ifdef STBIR_FLOAT_HIGH_CLAMP
+  const stbir__simdfX high_clamp = stbir__simdf_frepX(STBIR_FLOAT_HIGH_CLAMP);
+  #endif
+  #ifdef STBIR_FLOAT_LOW_CLAMP
+  const stbir__simdfX low_clamp = stbir__simdf_frepX(STBIR_FLOAT_LOW_CLAMP);
+  #endif
+
+  if ( width_times_channels >= ( stbir__simdfX_float_count * 2 ) ) // wide path consumes two float vectors per iteration
+  {
+    float const * end_encode_m8 = encode + width_times_channels - ( stbir__simdfX_float_count * 2 ); // source position matching the final full-width store
+    end_output -= ( stbir__simdfX_float_count * 2 ); // last position at which a full-width store still fits
+    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
+    for(;;)
+    {
+      stbir__simdfX e0, e1;
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdfX_load( e0, encode );
+      stbir__simdfX_load( e1, encode+stbir__simdfX_float_count );
+#ifdef STBIR_FLOAT_HIGH_CLAMP
+      stbir__simdfX_min( e0, e0, high_clamp );
+      stbir__simdfX_min( e1, e1, high_clamp );
+#endif
+#ifdef STBIR_FLOAT_LOW_CLAMP
+      stbir__simdfX_max( e0, e0, low_clamp );
+      stbir__simdfX_max( e1, e1, low_clamp );
+#endif
+      stbir__encode_simdfX_unflip( e0 ); // per-coder macro; presumably restores the output channel order
+      stbir__encode_simdfX_unflip( e1 );
+      stbir__simdfX_store( output, e0 );
+      stbir__simdfX_store( output+stbir__simdfX_float_count, e1 );
+      encode += stbir__simdfX_float_count * 2;
+      output += stbir__simdfX_float_count * 2;
+      if ( output <= end_output ) // "<=" to match the other coders; at equality the cursors already sit on the backup position, so results are unchanged
+        continue;
+      if ( output == ( end_output + ( stbir__simdfX_float_count * 2 ) ) ) // landed exactly on the original end: finished
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m8;
+    }
+    return;
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4; // bias the cursor by 4 so "output <= end_output" means four more floats fit; the store targets output-4
+  STBIR_NO_UNROLL_LOOP_START
+  while( output <= end_output )
+  {
+    stbir__simdf e0;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e0, encode );
+#ifdef STBIR_FLOAT_HIGH_CLAMP // NOTE(review): high_clamp/low_clamp are declared stbir__simdfX but used with the 4-wide min/max here - confirm the types agree when stbir__simdfX is wider
+    stbir__simdf_min( e0, e0, high_clamp );
+#endif
+#ifdef STBIR_FLOAT_LOW_CLAMP
+    stbir__simdf_max( e0, e0, low_clamp );
+#endif
+    stbir__encode_simdf4_unflip( e0 );
+    stbir__simdf_store( output-4, e0 );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4; // remove the bias again
+  #endif
+
+  #else
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4; // bias the cursor by 4 so "output <= end_output" means four more floats fit; stores index back with [k-4]
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  while( output <= end_output )
+  {
+    float e;
+    STBIR_SIMD_NO_UNROLL(encode);
+    e = encode[ stbir__encode_order0 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[0-4] = e;
+    e = encode[ stbir__encode_order1 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[1-4] = e;
+    e = encode[ stbir__encode_order2 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[2-4] = e;
+    e = encode[ stbir__encode_order3 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[3-4] = e;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4; // remove the bias again
+
+  #endif
+
+  #endif
+
+  // do the remnants (this scalar loop sits after the SIMD/non-SIMD split, so both builds share it)
+  #if stbir__coder_min_num < 4
+  STBIR_NO_UNROLL_LOOP_START
+  while( output < end_output )
+  {
+    float e;
+    STBIR_NO_UNROLL(encode);
+    e = encode[ stbir__encode_order0 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[0] = e;
+    #if stbir__coder_min_num >= 2
+    e = encode[ stbir__encode_order1 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[1] = e;
+    #endif
+    #if stbir__coder_min_num >= 3
+    e = encode[ stbir__encode_order2 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[2] = e;
+    #endif
+    output += stbir__coder_min_num; // advance by one group of stbir__coder_min_num values
+    encode += stbir__coder_min_num;
+  }
+  #endif
+
+  #endif
+}
+
+#undef stbir__decode_suffix // end of the coder section: drop every per-coder configuration macro (presumably so the section can be expanded again with other settings)
+#undef stbir__decode_simdf8_flip
+#undef stbir__decode_simdf4_flip
+#undef stbir__decode_order0 // channel index macros read by the scalar decode loops
+#undef stbir__decode_order1
+#undef stbir__decode_order2
+#undef stbir__decode_order3
+#undef stbir__encode_order0 // channel index macros read by the scalar encode loops
+#undef stbir__encode_order1
+#undef stbir__encode_order2
+#undef stbir__encode_order3
+#undef stbir__encode_simdf8_unflip
+#undef stbir__encode_simdf4_unflip
+#undef stbir__encode_simdfX_unflip
+#undef STBIR__CODER_NAME
+#undef stbir__coder_min_num
+#undef stbir__decode_swizzle
+#undef stbir_scalar_hi_clamp // these two are defined inside stbir__encode_float_linear's clamp/swizzle branch
+#undef stbir_scalar_lo_clamp
+#undef STB_IMAGE_RESIZE_DO_CODERS // the selector for this section is cleared last
+
+#elif defined( STB_IMAGE_RESIZE_DO_VERTICALS)
+// STBIR_chans(start,end) pastes STBIR__vertical_channels between the two name pieces; the continue variant also appends _cont.
+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#define STBIR_chans( start, end ) STBIR_strs_join14(start,STBIR__vertical_channels,end,_cont)
+#else
+#define STBIR_chans( start, end ) STBIR_strs_join1(start,STBIR__vertical_channels,end)
+#endif
+// stbIFn( code ) expands to code when STBIR__vertical_channels is at least n+1 and to nothing otherwise, so one body serves 1 to 8 rows.
+#if STBIR__vertical_channels >= 1
+#define stbIF0( code ) code
+#else
+#define stbIF0( code )
+#endif
+#if STBIR__vertical_channels >= 2
+#define stbIF1( code ) code
+#else
+#define stbIF1( code )
+#endif
+#if STBIR__vertical_channels >= 3
+#define stbIF2( code ) code
+#else
+#define stbIF2( code )
+#endif
+#if STBIR__vertical_channels >= 4
+#define stbIF3( code ) code
+#else
+#define stbIF3( code )
+#endif
+#if STBIR__vertical_channels >= 5
+#define stbIF4( code ) code
+#else
+#define stbIF4( code )
+#endif
+#if STBIR__vertical_channels >= 6
+#define stbIF5( code ) code
+#else
+#define stbIF5( code )
+#endif
+#if STBIR__vertical_channels >= 7
+#define stbIF6( code ) code
+#else
+#define stbIF6( code )
+#endif
+#if STBIR__vertical_channels >= 8
+#define stbIF7( code ) code
+#else
+#define stbIF7( code )
+#endif
+
+static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** outputs, float const * vertical_coefficients, float const * input, float const * input_end )
+{
+  stbIF0( float STBIR_SIMD_STREAMOUT_PTR( * ) output0 = outputs[0]; float c0s = vertical_coefficients[0]; )
+  stbIF1( float STBIR_SIMD_STREAMOUT_PTR( * ) output1 = outputs[1]; float c1s = vertical_coefficients[1]; )
+  stbIF2( float STBIR_SIMD_STREAMOUT_PTR( * ) output2 = outputs[2]; float c2s = vertical_coefficients[2]; )
+  stbIF3( float STBIR_SIMD_STREAMOUT_PTR( * ) output3 = outputs[3]; float c3s = vertical_coefficients[3]; )
+  stbIF4( float STBIR_SIMD_STREAMOUT_PTR( * ) output4 = outputs[4]; float c4s = vertical_coefficients[4]; )
+  stbIF5( float STBIR_SIMD_STREAMOUT_PTR( * ) output5 = outputs[5]; float c5s = vertical_coefficients[5]; )
+  stbIF6( float STBIR_SIMD_STREAMOUT_PTR( * ) output6 = outputs[6]; float c6s = vertical_coefficients[6]; )
+  stbIF7( float STBIR_SIMD_STREAMOUT_PTR( * ) output7 = outputs[7]; float c7s = vertical_coefficients[7]; )
+
+  #ifdef STBIR_SIMD
+  {
+    stbIF0(stbir__simdfX c0 = stbir__simdf_frepX( c0s ); )
+    stbIF1(stbir__simdfX c1 = stbir__simdf_frepX( c1s ); )
+    stbIF2(stbir__simdfX c2 = stbir__simdf_frepX( c2s ); )
+    stbIF3(stbir__simdfX c3 = stbir__simdf_frepX( c3s ); )
+    stbIF4(stbir__simdfX c4 = stbir__simdf_frepX( c4s ); )
+    stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
+    stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
+    stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
+    STBIR_SIMD_NO_UNROLL_LOOP_START
+    while ( ( (char*)input_end - (char*) input ) >= (16*stbir__simdfX_float_count) )
+    {
+      stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
+      STBIR_SIMD_NO_UNROLL(output0);
+
+      stbir__simdfX_load( r0, input );               stbir__simdfX_load( r1, input+stbir__simdfX_float_count );     stbir__simdfX_load( r2, input+(2*stbir__simdfX_float_count) );      stbir__simdfX_load( r3, input+(3*stbir__simdfX_float_count) );
+
+      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+      stbIF0( stbir__simdfX_load( o0, output0 );     stbir__simdfX_load( o1, output0+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output0+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output0+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c0 );  stbir__simdfX_madd( o1, o1, r1, c0 );  stbir__simdfX_madd( o2, o2, r2, c0 );   stbir__simdfX_madd( o3, o3, r3, c0 );
+              stbir__simdfX_store( output0, o0 );    stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF1( stbir__simdfX_load( o0, output1 );     stbir__simdfX_load( o1, output1+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output1+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output1+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c1 );  stbir__simdfX_madd( o1, o1, r1, c1 );  stbir__simdfX_madd( o2, o2, r2, c1 );   stbir__simdfX_madd( o3, o3, r3, c1 );
+              stbir__simdfX_store( output1, o0 );    stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF2( stbir__simdfX_load( o0, output2 );     stbir__simdfX_load( o1, output2+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output2+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output2+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c2 );  stbir__simdfX_madd( o1, o1, r1, c2 );  stbir__simdfX_madd( o2, o2, r2, c2 );   stbir__simdfX_madd( o3, o3, r3, c2 );
+              stbir__simdfX_store( output2, o0 );    stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF3( stbir__simdfX_load( o0, output3 );     stbir__simdfX_load( o1, output3+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output3+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output3+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c3 );  stbir__simdfX_madd( o1, o1, r1, c3 );  stbir__simdfX_madd( o2, o2, r2, c3 );   stbir__simdfX_madd( o3, o3, r3, c3 );
+              stbir__simdfX_store( output3, o0 );    stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF4( stbir__simdfX_load( o0, output4 );     stbir__simdfX_load( o1, output4+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output4+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output4+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c4 );  stbir__simdfX_madd( o1, o1, r1, c4 );  stbir__simdfX_madd( o2, o2, r2, c4 );   stbir__simdfX_madd( o3, o3, r3, c4 );
+              stbir__simdfX_store( output4, o0 );    stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF5( stbir__simdfX_load( o0, output5 );     stbir__simdfX_load( o1, output5+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output5+(2*stbir__simdfX_float_count));    stbir__simdfX_load( o3, output5+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c5 );  stbir__simdfX_madd( o1, o1, r1, c5 );  stbir__simdfX_madd( o2, o2, r2, c5 );   stbir__simdfX_madd( o3, o3, r3, c5 );
+              stbir__simdfX_store( output5, o0 );    stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF6( stbir__simdfX_load( o0, output6 );     stbir__simdfX_load( o1, output6+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output6+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output6+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c6 );  stbir__simdfX_madd( o1, o1, r1, c6 );  stbir__simdfX_madd( o2, o2, r2, c6 );   stbir__simdfX_madd( o3, o3, r3, c6 );
+              stbir__simdfX_store( output6, o0 );    stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF7( stbir__simdfX_load( o0, output7 );     stbir__simdfX_load( o1, output7+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output7+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output7+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c7 );  stbir__simdfX_madd( o1, o1, r1, c7 );  stbir__simdfX_madd( o2, o2, r2, c7 );   stbir__simdfX_madd( o3, o3, r3, c7 );
+              stbir__simdfX_store( output7, o0 );    stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); )
+      #else
+      stbIF0( stbir__simdfX_mult( o0, r0, c0 );      stbir__simdfX_mult( o1, r1, c0 );      stbir__simdfX_mult( o2, r2, c0 );       stbir__simdfX_mult( o3, r3, c0 );
+              stbir__simdfX_store( output0, o0 );    stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF1( stbir__simdfX_mult( o0, r0, c1 );      stbir__simdfX_mult( o1, r1, c1 );      stbir__simdfX_mult( o2, r2, c1 );       stbir__simdfX_mult( o3, r3, c1 );
+              stbir__simdfX_store( output1, o0 );    stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF2( stbir__simdfX_mult( o0, r0, c2 );      stbir__simdfX_mult( o1, r1, c2 );      stbir__simdfX_mult( o2, r2, c2 );       stbir__simdfX_mult( o3, r3, c2 );
+              stbir__simdfX_store( output2, o0 );    stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF3( stbir__simdfX_mult( o0, r0, c3 );      stbir__simdfX_mult( o1, r1, c3 );      stbir__simdfX_mult( o2, r2, c3 );       stbir__simdfX_mult( o3, r3, c3 );
+              stbir__simdfX_store( output3, o0 );    stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF4( stbir__simdfX_mult( o0, r0, c4 );      stbir__simdfX_mult( o1, r1, c4 );      stbir__simdfX_mult( o2, r2, c4 );       stbir__simdfX_mult( o3, r3, c4 );
+              stbir__simdfX_store( output4, o0 );    stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF5( stbir__simdfX_mult( o0, r0, c5 );      stbir__simdfX_mult( o1, r1, c5 );      stbir__simdfX_mult( o2, r2, c5 );       stbir__simdfX_mult( o3, r3, c5 );
+              stbir__simdfX_store( output5, o0 );    stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF6( stbir__simdfX_mult( o0, r0, c6 );      stbir__simdfX_mult( o1, r1, c6 );      stbir__simdfX_mult( o2, r2, c6 );       stbir__simdfX_mult( o3, r3, c6 );
+              stbir__simdfX_store( output6, o0 );    stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF7( stbir__simdfX_mult( o0, r0, c7 );      stbir__simdfX_mult( o1, r1, c7 );      stbir__simdfX_mult( o2, r2, c7 );       stbir__simdfX_mult( o3, r3, c7 );
+              stbir__simdfX_store( output7, o0 );    stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); )
+      #endif
+
+      input += (4*stbir__simdfX_float_count);
+      stbIF0( output0 += (4*stbir__simdfX_float_count); ) stbIF1( output1 += (4*stbir__simdfX_float_count); ) stbIF2( output2 += (4*stbir__simdfX_float_count); ) stbIF3( output3 += (4*stbir__simdfX_float_count); ) stbIF4( output4 += (4*stbir__simdfX_float_count); ) stbIF5( output5 += (4*stbir__simdfX_float_count); ) stbIF6( output6 += (4*stbir__simdfX_float_count); ) stbIF7( output7 += (4*stbir__simdfX_float_count); )
+    }
+    STBIR_SIMD_NO_UNROLL_LOOP_START
+    while ( ( (char*)input_end - (char*) input ) >= 16 )
+    {
+      stbir__simdf o0, r0;
+      STBIR_SIMD_NO_UNROLL(output0);
+
+      stbir__simdf_load( r0, input );
+
+      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+      stbIF0( stbir__simdf_load( o0, output0 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) );  stbir__simdf_store( output0, o0 ); )
+      stbIF1( stbir__simdf_load( o0, output1 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) );  stbir__simdf_store( output1, o0 ); )
+      stbIF2( stbir__simdf_load( o0, output2 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) );  stbir__simdf_store( output2, o0 ); )
+      stbIF3( stbir__simdf_load( o0, output3 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) );  stbir__simdf_store( output3, o0 ); )
+      stbIF4( stbir__simdf_load( o0, output4 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) );  stbir__simdf_store( output4, o0 ); )
+      stbIF5( stbir__simdf_load( o0, output5 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) );  stbir__simdf_store( output5, o0 ); )
+      stbIF6( stbir__simdf_load( o0, output6 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) );  stbir__simdf_store( output6, o0 ); )
+      stbIF7( stbir__simdf_load( o0, output7 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) );  stbir__simdf_store( output7, o0 ); )
+      #else
+      stbIF0( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) );   stbir__simdf_store( output0, o0 ); )
+      stbIF1( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) );   stbir__simdf_store( output1, o0 ); )
+      stbIF2( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) );   stbir__simdf_store( output2, o0 ); )
+      stbIF3( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) );   stbir__simdf_store( output3, o0 ); )
+      stbIF4( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) );   stbir__simdf_store( output4, o0 ); )
+      stbIF5( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) );   stbir__simdf_store( output5, o0 ); )
+      stbIF6( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) );   stbir__simdf_store( output6, o0 ); )
+      stbIF7( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) );   stbir__simdf_store( output7, o0 ); )
+      #endif
+
+      input += 4;
+      stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
+    }
+  }
+  #else
+  STBIR_NO_UNROLL_LOOP_START
+  while ( ( (char*)input_end - (char*) input ) >= 16 )
+  {
+    float r0, r1, r2, r3;
+    STBIR_NO_UNROLL(input);
+
+    r0 = input[0], r1 = input[1], r2 = input[2], r3 = input[3];
+
+    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+    stbIF0( output0[0] += ( r0 * c0s ); output0[1] += ( r1 * c0s ); output0[2] += ( r2 * c0s ); output0[3] += ( r3 * c0s ); )
+    stbIF1( output1[0] += ( r0 * c1s ); output1[1] += ( r1 * c1s ); output1[2] += ( r2 * c1s ); output1[3] += ( r3 * c1s ); )
+    stbIF2( output2[0] += ( r0 * c2s ); output2[1] += ( r1 * c2s ); output2[2] += ( r2 * c2s ); output2[3] += ( r3 * c2s ); )
+    stbIF3( output3[0] += ( r0 * c3s ); output3[1] += ( r1 * c3s ); output3[2] += ( r2 * c3s ); output3[3] += ( r3 * c3s ); )
+    stbIF4( output4[0] += ( r0 * c4s ); output4[1] += ( r1 * c4s ); output4[2] += ( r2 * c4s ); output4[3] += ( r3 * c4s ); )
+    stbIF5( output5[0] += ( r0 * c5s ); output5[1] += ( r1 * c5s ); output5[2] += ( r2 * c5s ); output5[3] += ( r3 * c5s ); )
+    stbIF6( output6[0] += ( r0 * c6s ); output6[1] += ( r1 * c6s ); output6[2] += ( r2 * c6s ); output6[3] += ( r3 * c6s ); )
+    stbIF7( output7[0] += ( r0 * c7s ); output7[1] += ( r1 * c7s ); output7[2] += ( r2 * c7s ); output7[3] += ( r3 * c7s ); )
+    #else
+    stbIF0( output0[0]  = ( r0 * c0s ); output0[1]  = ( r1 * c0s ); output0[2]  = ( r2 * c0s ); output0[3]  = ( r3 * c0s ); )
+    stbIF1( output1[0]  = ( r0 * c1s ); output1[1]  = ( r1 * c1s ); output1[2]  = ( r2 * c1s ); output1[3]  = ( r3 * c1s ); )
+    stbIF2( output2[0]  = ( r0 * c2s ); output2[1]  = ( r1 * c2s ); output2[2]  = ( r2 * c2s ); output2[3]  = ( r3 * c2s ); )
+    stbIF3( output3[0]  = ( r0 * c3s ); output3[1]  = ( r1 * c3s ); output3[2]  = ( r2 * c3s ); output3[3]  = ( r3 * c3s ); )
+    stbIF4( output4[0]  = ( r0 * c4s ); output4[1]  = ( r1 * c4s ); output4[2]  = ( r2 * c4s ); output4[3]  = ( r3 * c4s ); )
+    stbIF5( output5[0]  = ( r0 * c5s ); output5[1]  = ( r1 * c5s ); output5[2]  = ( r2 * c5s ); output5[3]  = ( r3 * c5s ); )
+    stbIF6( output6[0]  = ( r0 * c6s ); output6[1]  = ( r1 * c6s ); output6[2]  = ( r2 * c6s ); output6[3]  = ( r3 * c6s ); )
+    stbIF7( output7[0]  = ( r0 * c7s ); output7[1]  = ( r1 * c7s ); output7[2]  = ( r2 * c7s ); output7[3]  = ( r3 * c7s ); )
+    #endif
+
+    input += 4;
+    stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
+  }
+  #endif
+  STBIR_NO_UNROLL_LOOP_START
+  while ( input < input_end )
+  {
+    float r = input[0];
+    STBIR_NO_UNROLL(output0);
+
+    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+    stbIF0( output0[0] += ( r * c0s ); )
+    stbIF1( output1[0] += ( r * c1s ); )
+    stbIF2( output2[0] += ( r * c2s ); )
+    stbIF3( output3[0] += ( r * c3s ); )
+    stbIF4( output4[0] += ( r * c4s ); )
+    stbIF5( output5[0] += ( r * c5s ); )
+    stbIF6( output6[0] += ( r * c6s ); )
+    stbIF7( output7[0] += ( r * c7s ); )
+    #else
+    stbIF0( output0[0]  = ( r * c0s ); )
+    stbIF1( output1[0]  = ( r * c1s ); )
+    stbIF2( output2[0]  = ( r * c2s ); )
+    stbIF3( output3[0]  = ( r * c3s ); )
+    stbIF4( output4[0]  = ( r * c4s ); )
+    stbIF5( output5[0]  = ( r * c5s ); )
+    stbIF6( output6[0]  = ( r * c6s ); )
+    stbIF7( output7[0]  = ( r * c7s ); )
+    #endif
+
+    ++input;
+    stbIF0( ++output0; ) stbIF1( ++output1; ) stbIF2( ++output2; ) stbIF3( ++output3; ) stbIF4( ++output4; ) stbIF5( ++output5; ) stbIF6( ++output6; ) stbIF7( ++output7; )
+  }
+}
+// Vertical gather: for every float in [input0, input0_end) writes output[i] = inputs[0][i]*vertical_coefficients[0] + inputs[1][i]*vertical_coefficients[1] + ... over the rows enabled by stbIF0..stbIF7.
+static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, float const * vertical_coefficients, float const ** inputs, float const * input0_end )
+{ // when STB_IMAGE_RESIZE_VERTICAL_CONTINUE is defined, the weighted sum is added onto the existing contents of output instead of replacing them
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = outputp;
+  // per-row input pointer and scalar weight; stbIFn is defined outside this section (presumably it keeps its argument only when row n is active -- TODO confirm)
+  stbIF0( float const * input0 = inputs[0]; float c0s = vertical_coefficients[0]; )
+  stbIF1( float const * input1 = inputs[1]; float c1s = vertical_coefficients[1]; )
+  stbIF2( float const * input2 = inputs[2]; float c2s = vertical_coefficients[2]; )
+  stbIF3( float const * input3 = inputs[3]; float c3s = vertical_coefficients[3]; )
+  stbIF4( float const * input4 = inputs[4]; float c4s = vertical_coefficients[4]; )
+  stbIF5( float const * input5 = inputs[5]; float c5s = vertical_coefficients[5]; )
+  stbIF6( float const * input6 = inputs[6]; float c6s = vertical_coefficients[6]; )
+  stbIF7( float const * input7 = inputs[7]; float c7s = vertical_coefficients[7]; )
+
+#if ( STBIR__vertical_channels == 1 ) && !defined(STB_IMAGE_RESIZE_VERTICAL_CONTINUE)
+  // one-coefficient build (STBIR__vertical_channels == 1) whose weight is within 1e-6 of 1.0: the result equals the input, so copy it and skip the arithmetic
+  if ( ( c0s >= (1.0f-0.000001f) ) && ( c0s <= (1.0f+0.000001f) ) )
+  {
+    STBIR_MEMCPY( output, input0, (char*)input0_end - (char*)input0 );
+    return;
+  }
+#endif
+
+  #ifdef STBIR_SIMD
+  {
+    stbIF0(stbir__simdfX c0 = stbir__simdf_frepX( c0s ); )
+    stbIF1(stbir__simdfX c1 = stbir__simdf_frepX( c1s ); )
+    stbIF2(stbir__simdfX c2 = stbir__simdf_frepX( c2s ); )
+    stbIF3(stbir__simdfX c3 = stbir__simdf_frepX( c3s ); )
+    stbIF4(stbir__simdfX c4 = stbir__simdf_frepX( c4s ); )
+    stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
+    stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
+    stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
+    // main SIMD loop: four vectors (4*stbir__simdfX_float_count floats) per pass; the byte test of 16*count matches that only for 4-byte floats
+    STBIR_SIMD_NO_UNROLL_LOOP_START
+    while ( ( (char*)input0_end - (char*) input0 ) >= (16*stbir__simdfX_float_count) )
+    {
+      stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
+      STBIR_SIMD_NO_UNROLL(output);
+
+      // prefetch four loop iterations ahead (doesn't affect much for small resizes, but helps with big ones)
+      stbIF0( stbir__prefetch( input0 + (16*stbir__simdfX_float_count) ); )
+      stbIF1( stbir__prefetch( input1 + (16*stbir__simdfX_float_count) ); )
+      stbIF2( stbir__prefetch( input2 + (16*stbir__simdfX_float_count) ); )
+      stbIF3( stbir__prefetch( input3 + (16*stbir__simdfX_float_count) ); )
+      stbIF4( stbir__prefetch( input4 + (16*stbir__simdfX_float_count) ); )
+      stbIF5( stbir__prefetch( input5 + (16*stbir__simdfX_float_count) ); )
+      stbIF6( stbir__prefetch( input6 + (16*stbir__simdfX_float_count) ); )
+      stbIF7( stbir__prefetch( input7 + (16*stbir__simdfX_float_count) ); )
+      // row 0 seeds the accumulators o0..o3 (starting from output when continuing, otherwise a plain multiply); rows 1..7 then multiply-add into them, mirroring the scalar path further down
+      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+      stbIF0( stbir__simdfX_load( o0, output );      stbir__simdfX_load( o1, output+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( o3, output+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_load( r0, input0 );      stbir__simdfX_load( r1, input0+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input0+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input0+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c0 );  stbir__simdfX_madd( o1, o1, r1, c0 );                         stbir__simdfX_madd( o2, o2, r2, c0 );                             stbir__simdfX_madd( o3, o3, r3, c0 ); )
+      #else
+      stbIF0( stbir__simdfX_load( r0, input0 );      stbir__simdfX_load( r1, input0+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input0+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input0+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_mult( o0, r0, c0 );      stbir__simdfX_mult( o1, r1, c0 );                             stbir__simdfX_mult( o2, r2, c0 );                                 stbir__simdfX_mult( o3, r3, c0 );  )
+      #endif
+
+      stbIF1( stbir__simdfX_load( r0, input1 );      stbir__simdfX_load( r1, input1+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input1+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input1+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c1 );  stbir__simdfX_madd( o1, o1, r1, c1 );                         stbir__simdfX_madd( o2, o2, r2, c1 );                             stbir__simdfX_madd( o3, o3, r3, c1 ); )
+      stbIF2( stbir__simdfX_load( r0, input2 );      stbir__simdfX_load( r1, input2+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input2+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input2+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c2 );  stbir__simdfX_madd( o1, o1, r1, c2 );                         stbir__simdfX_madd( o2, o2, r2, c2 );                             stbir__simdfX_madd( o3, o3, r3, c2 ); )
+      stbIF3( stbir__simdfX_load( r0, input3 );      stbir__simdfX_load( r1, input3+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input3+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input3+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c3 );  stbir__simdfX_madd( o1, o1, r1, c3 );                         stbir__simdfX_madd( o2, o2, r2, c3 );                             stbir__simdfX_madd( o3, o3, r3, c3 ); )
+      stbIF4( stbir__simdfX_load( r0, input4 );      stbir__simdfX_load( r1, input4+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input4+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input4+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c4 );  stbir__simdfX_madd( o1, o1, r1, c4 );                         stbir__simdfX_madd( o2, o2, r2, c4 );                             stbir__simdfX_madd( o3, o3, r3, c4 ); )
+      stbIF5( stbir__simdfX_load( r0, input5 );      stbir__simdfX_load( r1, input5+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input5+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input5+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c5 );  stbir__simdfX_madd( o1, o1, r1, c5 );                         stbir__simdfX_madd( o2, o2, r2, c5 );                             stbir__simdfX_madd( o3, o3, r3, c5 ); )
+      stbIF6( stbir__simdfX_load( r0, input6 );      stbir__simdfX_load( r1, input6+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input6+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input6+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c6 );  stbir__simdfX_madd( o1, o1, r1, c6 );                         stbir__simdfX_madd( o2, o2, r2, c6 );                             stbir__simdfX_madd( o3, o3, r3, c6 ); )
+      stbIF7( stbir__simdfX_load( r0, input7 );      stbir__simdfX_load( r1, input7+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input7+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input7+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c7 );  stbir__simdfX_madd( o1, o1, r1, c7 );                         stbir__simdfX_madd( o2, o2, r2, c7 );                             stbir__simdfX_madd( o3, o3, r3, c7 ); )
+
+      stbir__simdfX_store( output, o0 );             stbir__simdfX_store( output+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output+(2*stbir__simdfX_float_count), o2 );  stbir__simdfX_store( output+(3*stbir__simdfX_float_count), o3 );
+      output += (4*stbir__simdfX_float_count);
+      stbIF0( input0 += (4*stbir__simdfX_float_count); ) stbIF1( input1 += (4*stbir__simdfX_float_count); ) stbIF2( input2 += (4*stbir__simdfX_float_count); ) stbIF3( input3 += (4*stbir__simdfX_float_count); ) stbIF4( input4 += (4*stbir__simdfX_float_count); ) stbIF5( input5 += (4*stbir__simdfX_float_count); ) stbIF6( input6 += (4*stbir__simdfX_float_count); ) stbIF7( input7 += (4*stbir__simdfX_float_count); )
+    }
+    // remainder in 4-float (16-byte) steps; stbir__if_simdf8_cast_to_simdf4 is defined elsewhere (presumably narrows the broadcast weight to a 4-wide vector -- TODO confirm)
+    STBIR_SIMD_NO_UNROLL_LOOP_START
+    while ( ( (char*)input0_end - (char*) input0 ) >= 16 )
+    {
+      stbir__simdf o0, r0;
+      STBIR_SIMD_NO_UNROLL(output);
+
+      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+      stbIF0( stbir__simdf_load( o0, output );   stbir__simdf_load( r0, input0 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); )
+      #else
+      stbIF0( stbir__simdf_load( r0, input0 );  stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); )
+      #endif
+      stbIF1( stbir__simdf_load( r0, input1 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) ); )
+      stbIF2( stbir__simdf_load( r0, input2 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) ); )
+      stbIF3( stbir__simdf_load( r0, input3 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) ); )
+      stbIF4( stbir__simdf_load( r0, input4 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) ); )
+      stbIF5( stbir__simdf_load( r0, input5 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) ); )
+      stbIF6( stbir__simdf_load( r0, input6 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) ); )
+      stbIF7( stbir__simdf_load( r0, input7 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) ); )
+
+      stbir__simdf_store( output, o0 );
+      output += 4;
+      stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
+    }
+  }
+  #else // no STBIR_SIMD: scalar path, four floats per pass
+  STBIR_NO_UNROLL_LOOP_START
+  while ( ( (char*)input0_end - (char*) input0 ) >= 16 )
+  {
+    float o0, o1, o2, o3;
+    STBIR_NO_UNROLL(output);
+    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+    stbIF0( o0 = output[0] + input0[0] * c0s; o1 = output[1] + input0[1] * c0s; o2 = output[2] + input0[2] * c0s; o3 = output[3] + input0[3] * c0s; )
+    #else
+    stbIF0( o0  = input0[0] * c0s; o1  = input0[1] * c0s; o2  = input0[2] * c0s; o3  = input0[3] * c0s; )
+    #endif
+    stbIF1( o0 += input1[0] * c1s; o1 += input1[1] * c1s; o2 += input1[2] * c1s; o3 += input1[3] * c1s; )
+    stbIF2( o0 += input2[0] * c2s; o1 += input2[1] * c2s; o2 += input2[2] * c2s; o3 += input2[3] * c2s; )
+    stbIF3( o0 += input3[0] * c3s; o1 += input3[1] * c3s; o2 += input3[2] * c3s; o3 += input3[3] * c3s; )
+    stbIF4( o0 += input4[0] * c4s; o1 += input4[1] * c4s; o2 += input4[2] * c4s; o3 += input4[3] * c4s; )
+    stbIF5( o0 += input5[0] * c5s; o1 += input5[1] * c5s; o2 += input5[2] * c5s; o3 += input5[3] * c5s; )
+    stbIF6( o0 += input6[0] * c6s; o1 += input6[1] * c6s; o2 += input6[2] * c6s; o3 += input6[3] * c6s; )
+    stbIF7( o0 += input7[0] * c7s; o1 += input7[1] * c7s; o2 += input7[2] * c7s; o3 += input7[3] * c7s; )
+    output[0] = o0; output[1] = o1; output[2] = o2; output[3] = o3;
+    output += 4;
+    stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
+  }
+  #endif
+  STBIR_NO_UNROLL_LOOP_START
+  while ( input0 < input0_end ) // scalar tail shared by both paths: one float per pass for whatever the 4-float loops left over
+  {
+    float o0;
+    STBIR_NO_UNROLL(output);
+    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+    stbIF0( o0 = output[0] + input0[0] * c0s; )
+    #else
+    stbIF0( o0  = input0[0] * c0s; )
+    #endif
+    stbIF1( o0 += input1[0] * c1s; )
+    stbIF2( o0 += input2[0] * c2s; )
+    stbIF3( o0 += input3[0] * c3s; )
+    stbIF4( o0 += input4[0] * c4s; )
+    stbIF5( o0 += input5[0] * c5s; )
+    stbIF6( o0 += input6[0] * c6s; )
+    stbIF7( o0 += input7[0] * c7s; )
+    output[0] = o0;
+    ++output;
+    stbIF0( ++input0; ) stbIF1( ++input1; ) stbIF2( ++input2; ) stbIF3( ++input3; ) stbIF4( ++input4; ) stbIF5( ++input5; ) stbIF6( ++input6; ) stbIF7( ++input7; )
+  }
+}
+// End of the vertical section: drop its helper and configuration macros so they do not leak into later code (NOTE(review): presumably so the section can be set up again with other values -- confirm).
+#undef stbIF0
+#undef stbIF1
+#undef stbIF2
+#undef stbIF3
+#undef stbIF4
+#undef stbIF5
+#undef stbIF6
+#undef stbIF7
+#undef STB_IMAGE_RESIZE_DO_VERTICALS
+#undef STBIR__vertical_channels
+#undef STB_IMAGE_RESIZE_DO_HORIZONTALS
+#undef STBIR_strs_join24
+#undef STBIR_strs_join14
+#undef STBIR_chans
+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#undef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#endif
+// Horizontal section: this branch is the one taken when STB_IMAGE_RESIZE_DO_VERTICALS is not defined.
+#else // !STB_IMAGE_RESIZE_DO_VERTICALS
+// STBIR_chans builds a function name from start, STBIR__horizontal_channels and end; STBIR_strs_join1 is defined outside this section.
+#define STBIR_chans( start, end ) STBIR_strs_join1(start,STBIR__horizontal_channels,end)
+// Fallback coefficient macros: each one is defined only if nothing defined it earlier, and is composed from the smaller stbir__1/2_coeff_* pieces.
+#ifndef stbir__2_coeff_only
+#define stbir__2_coeff_only()             \
+    stbir__1_coeff_only();                \
+    stbir__1_coeff_remnant(1);
+#endif
+
+#ifndef stbir__2_coeff_remnant
+#define stbir__2_coeff_remnant( ofs )     \
+    stbir__1_coeff_remnant(ofs);          \
+    stbir__1_coeff_remnant((ofs)+1);
+#endif
+
+#ifndef stbir__3_coeff_only
+#define stbir__3_coeff_only()             \
+    stbir__2_coeff_only();                \
+    stbir__1_coeff_remnant(2);
+#endif
+
+#ifndef stbir__3_coeff_remnant
+#define stbir__3_coeff_remnant( ofs )     \
+    stbir__2_coeff_remnant(ofs);          \
+    stbir__1_coeff_remnant((ofs)+2);
+#endif
+// Setup hook that expands to nothing by default; the 7-coefficient gather below invokes it once before its loop.
+#ifndef stbir__3_coeff_setup
+#define stbir__3_coeff_setup()
+#endif
+
+#ifndef stbir__4_coeff_start
+#define stbir__4_coeff_start()            \
+    stbir__2_coeff_only();                \
+    stbir__2_coeff_remnant(2);
+#endif
+
+#ifndef stbir__4_coeff_continue_from_4
+#define stbir__4_coeff_continue_from_4( ofs )     \
+    stbir__2_coeff_remnant(ofs);                  \
+    stbir__2_coeff_remnant((ofs)+2);
+#endif
+// The 1-, 2- and 3-coefficient gathers below store through stbir__store_output_tiny; unless defined earlier it is simply stbir__store_output.
+#ifndef stbir__store_output_tiny
+#define stbir__store_output_tiny stbir__store_output
+#endif
+// Horizontal gather for the 1-coefficient case; the per-pixel arithmetic and the store are done by stbir__* macros defined outside this section.
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_1_coeff)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // one past the last output float
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do { // do/while: the body runs once before the first bounds test
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // n0 groups of STBIR__horizontal_channels floats into decode_buffer
+    float const * hc = horizontal_coefficients;
+    stbir__1_coeff_only();
+    stbir__store_output_tiny(); // NOTE(review): presumably also advances output, horizontal_contributors and horizontal_coefficients (nothing visible here does) -- confirm
+  } while ( output < output_end );
+}
+// Horizontal gather for the 2-coefficient case; the per-pixel arithmetic and the store are done by stbir__* macros defined outside this function.
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_2_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // one past the last output float
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do { // do/while: the body runs once before the first bounds test
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // n0 groups of STBIR__horizontal_channels floats into decode_buffer
+    float const * hc = horizontal_coefficients;
+    stbir__2_coeff_only();
+    stbir__store_output_tiny(); // NOTE(review): presumably also advances output, horizontal_contributors and horizontal_coefficients (nothing visible here does) -- confirm
+  } while ( output < output_end );
+}
+// Horizontal gather for the 3-coefficient case; the per-pixel arithmetic and the store are done by stbir__* macros defined outside this function.
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_3_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // one past the last output float
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do { // do/while: the body runs once before the first bounds test
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // n0 groups of STBIR__horizontal_channels floats into decode_buffer
+    float const * hc = horizontal_coefficients;
+    stbir__3_coeff_only();
+    stbir__store_output_tiny(); // NOTE(review): presumably also advances output, horizontal_contributors and horizontal_coefficients (nothing visible here does) -- confirm
+  } while ( output < output_end );
+}
+// Horizontal gather for the 4-coefficient case; unlike the 1..3 variants it stores through stbir__store_output rather than stbir__store_output_tiny.
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_4_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // one past the last output float
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do { // do/while: the body runs once before the first bounds test
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // n0 groups of STBIR__horizontal_channels floats into decode_buffer
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__store_output(); // NOTE(review): presumably also advances output, horizontal_contributors and horizontal_coefficients (nothing visible here does) -- confirm
+  } while ( output < output_end );
+}
+// Horizontal gather for the 5-coefficient case: a 4-coefficient start followed by a 1-coefficient remnant at offset 4 (macros defined outside this function).
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_5_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // one past the last output float
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do { // do/while: the body runs once before the first bounds test
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // n0 groups of STBIR__horizontal_channels floats into decode_buffer
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__1_coeff_remnant(4);
+    stbir__store_output(); // NOTE(review): presumably also advances output, horizontal_contributors and horizontal_coefficients (nothing visible here does) -- confirm
+  } while ( output < output_end );
+}
+// Horizontal gather for the 6-coefficient case: a 4-coefficient start followed by a 2-coefficient remnant at offset 4 (macros defined outside this function).
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_6_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // one past the last output float
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do { // do/while: the body runs once before the first bounds test
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // n0 groups of STBIR__horizontal_channels floats into decode_buffer
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__2_coeff_remnant(4);
+    stbir__store_output(); // NOTE(review): presumably also advances output, horizontal_contributors and horizontal_coefficients (nothing visible here does) -- confirm
+  } while ( output < output_end );
+}
+// Horizontal gather for the 7-coefficient case: a 4-coefficient start followed by a 3-coefficient remnant at offset 4 (macros defined outside this function).
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_7_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // one past the last output float
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  stbir__3_coeff_setup(); // invoked once, outside the loop; the setup macro may be defined outside this function -- only this variant calls it
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do { // do/while: the body runs once before the first bounds test
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // n0 groups of STBIR__horizontal_channels floats into decode_buffer
+    float const * hc = horizontal_coefficients;
+
+    stbir__4_coeff_start();
+    stbir__3_coeff_remnant(4);
+    stbir__store_output(); // NOTE(review): presumably also advances output, horizontal_contributors and horizontal_coefficients (nothing visible here does) -- confirm
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_8_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{ // Fixed-width horizontal gather, 8-coefficient variant: per output pixel, two 4-coefficient steps and no remnant.
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // loop bound: output_sub_size pixels times the channel count
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // input starts at contributor index n0, scaled by the channel count
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();             // macro defined outside this chunk; presumably handles coefficients 0..3 using decode/hc
+    stbir__4_coeff_continue_from_4(4);  // macro; presumably handles coefficients 4..7 (argument looks like an offset) - TODO confirm
+    stbir__store_output();              // macro; nothing visible here advances `output`, so it presumably does - TODO confirm
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_9_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{ // Fixed-width horizontal gather, 9-coefficient variant: per output pixel, two 4-coefficient steps followed by a 1-coefficient remnant.
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // loop bound: output_sub_size pixels times the channel count
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // input starts at contributor index n0, scaled by the channel count
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();             // macro defined outside this chunk; presumably handles coefficients 0..3 using decode/hc
+    stbir__4_coeff_continue_from_4(4);  // macro; presumably handles coefficients 4..7
+    stbir__1_coeff_remnant(8);          // macro; presumably handles the one remaining coefficient at offset 8
+    stbir__store_output();              // macro; nothing visible here advances `output`, so it presumably does - TODO confirm
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_10_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{ // Fixed-width horizontal gather, 10-coefficient variant: per output pixel, two 4-coefficient steps followed by a 2-coefficient remnant.
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // loop bound: output_sub_size pixels times the channel count
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // input starts at contributor index n0, scaled by the channel count
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();             // macro defined outside this chunk; presumably handles coefficients 0..3 using decode/hc
+    stbir__4_coeff_continue_from_4(4);  // macro; presumably handles coefficients 4..7
+    stbir__2_coeff_remnant(8);          // macro; presumably handles the two remaining coefficients starting at offset 8
+    stbir__store_output();              // macro; nothing visible here advances `output`, so it presumably does - TODO confirm
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_11_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{ // Fixed-width horizontal gather, 11-coefficient variant: per output pixel, two 4-coefficient steps followed by a 3-coefficient remnant.
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // loop bound: output_sub_size pixels times the channel count
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  stbir__3_coeff_setup(); // macro invoked once before the loop; presumably prepares state used by stbir__3_coeff_remnant - TODO confirm
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // input starts at contributor index n0, scaled by the channel count
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();             // macro defined outside this chunk; presumably handles coefficients 0..3 using decode/hc
+    stbir__4_coeff_continue_from_4(4);  // macro; presumably handles coefficients 4..7
+    stbir__3_coeff_remnant(8);          // macro; presumably handles the three remaining coefficients starting at offset 8
+    stbir__store_output();              // macro; nothing visible here advances `output`, so it presumably does - TODO confirm
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_12_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{ // Fixed-width horizontal gather, 12-coefficient variant: per output pixel, three 4-coefficient steps and no remnant.
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // loop bound: output_sub_size pixels times the channel count
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // input starts at contributor index n0, scaled by the channel count
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();             // macro defined outside this chunk; presumably handles coefficients 0..3 using decode/hc
+    stbir__4_coeff_continue_from_4(4);  // macro; presumably handles coefficients 4..7
+    stbir__4_coeff_continue_from_4(8);  // macro; presumably handles coefficients 8..11
+    stbir__store_output();              // macro; nothing visible here advances `output`, so it presumably does - TODO confirm
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod0 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{ // Variable-width horizontal gather, "mod0" variant: a 4-coefficient start, then a run of further 4-coefficient groups, with no remnant step.
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // loop bound: output_sub_size pixels times the channel count
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // input starts at contributor index n0, scaled by the channel count
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 4 + 3 ) >> 2; // iteration count for the inner loop: (count - 4), rounded up to whole groups of 4, where count = n1 - n0 + 1
+    float const * hc = horizontal_coefficients;
+
+    stbir__4_coeff_start(); // macro defined outside this chunk; presumably handles the first four coefficients
+    STBIR_SIMD_NO_UNROLL_LOOP_START
+    do { // do/while: the body runs at least once even when n <= 0, so this variant presumably is only selected for 8 or more coefficients - TODO confirm against the dispatcher
+      hc += 4;                                 // step to the next group of four coefficients...
+      decode += STBIR__horizontal_channels * 4; // ...and the matching four input pixels
+      stbir__4_coeff_continue_from_4( 0 );     // macro; called with 0 because hc/decode were advanced in place
+      --n;
+    } while ( n > 0 );
+    stbir__store_output(); // macro; nothing visible here advances `output`, so it presumably does - TODO confirm
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod1 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{ // Variable-width horizontal gather, "mod1" variant: a 4-coefficient start, a run of further 4-coefficient groups, then a 1-coefficient remnant.
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // loop bound: output_sub_size pixels times the channel count
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // input starts at contributor index n0, scaled by the channel count
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 5 + 3 ) >> 2; // iteration count for the inner loop: (count - 5), rounded up to whole groups of 4, where count = n1 - n0 + 1
+    float const * hc = horizontal_coefficients;
+
+    stbir__4_coeff_start(); // macro defined outside this chunk; presumably handles the first four coefficients
+    STBIR_SIMD_NO_UNROLL_LOOP_START
+    do { // do/while: the body runs at least once even when n <= 0 - presumably callers only pick this variant for large enough counts, TODO confirm
+      hc += 4;                                 // step to the next group of four coefficients...
+      decode += STBIR__horizontal_channels * 4; // ...and the matching four input pixels
+      stbir__4_coeff_continue_from_4( 0 );     // macro; called with 0 because hc/decode were advanced in place
+      --n;
+    } while ( n > 0 );
+    stbir__1_coeff_remnant( 4 ); // macro; hc still points at the last full group, so 4 presumably addresses the coefficient right after it
+    stbir__store_output();       // macro; nothing visible here advances `output`, so it presumably does - TODO confirm
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod2 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{ // Variable-width horizontal gather, "mod2" variant: a 4-coefficient start, a run of further 4-coefficient groups, then a 2-coefficient remnant.
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // loop bound: output_sub_size pixels times the channel count
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // input starts at contributor index n0, scaled by the channel count
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 6 + 3 ) >> 2; // iteration count for the inner loop: (count - 6), rounded up to whole groups of 4, where count = n1 - n0 + 1
+    float const * hc = horizontal_coefficients;
+
+    stbir__4_coeff_start(); // macro defined outside this chunk; presumably handles the first four coefficients
+    STBIR_SIMD_NO_UNROLL_LOOP_START
+    do { // do/while: the body runs at least once even when n <= 0 - presumably callers only pick this variant for large enough counts, TODO confirm
+      hc += 4;                                 // step to the next group of four coefficients...
+      decode += STBIR__horizontal_channels * 4; // ...and the matching four input pixels
+      stbir__4_coeff_continue_from_4( 0 );     // macro; called with 0 because hc/decode were advanced in place
+      --n;
+    } while ( n > 0 );
+    stbir__2_coeff_remnant( 4 ); // macro; hc still points at the last full group, so 4 presumably addresses the two coefficients right after it
+
+    stbir__store_output(); // macro; nothing visible here advances `output`, so it presumably does - TODO confirm
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{ // Variable-width horizontal gather, "mod3" variant: a 4-coefficient start, a run of further 4-coefficient groups, then a 3-coefficient remnant.
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; // loop bound: output_sub_size pixels times the channel count
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  stbir__3_coeff_setup(); // macro invoked once before the loop; presumably prepares state used by stbir__3_coeff_remnant - TODO confirm
+  STBIR_SIMD_NO_UNROLL_LOOP_START
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; // input starts at contributor index n0, scaled by the channel count
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 7 + 3 ) >> 2; // iteration count for the inner loop: (count - 7), rounded up to whole groups of 4, where count = n1 - n0 + 1
+    float const * hc = horizontal_coefficients;
+
+    stbir__4_coeff_start(); // macro defined outside this chunk; presumably handles the first four coefficients
+    STBIR_SIMD_NO_UNROLL_LOOP_START
+    do { // do/while: the body runs at least once even when n <= 0 - presumably callers only pick this variant for large enough counts, TODO confirm
+      hc += 4;                                 // step to the next group of four coefficients...
+      decode += STBIR__horizontal_channels * 4; // ...and the matching four input pixels
+      stbir__4_coeff_continue_from_4( 0 );     // macro; called with 0 because hc/decode were advanced in place
+      --n;
+    } while ( n > 0 );
+    stbir__3_coeff_remnant( 4 ); // macro; hc still points at the last full group, so 4 presumably addresses the three coefficients right after it
+
+    stbir__store_output(); // macro; nothing visible here advances `output`, so it presumably does - TODO confirm
+  } while ( output < output_end );
+}
+
+static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_funcs)[4]=
+{ // variable-width gather variants in mod0..mod3 order; presumably indexed by (coefficient count & 3) - TODO confirm against the code that reads this table
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod0), // no remnant step
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod1), // 1-coefficient remnant
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod2), // 2-coefficient remnant
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod3), // 3-coefficient remnant
+};
+
+static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_funcs)[12]=
+{ // fixed-width gather variants: entry i is the function named for i+1 coefficients (1..12)
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_1_coeff),   // the 1..4 coefficient functions are defined before this chunk
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_2_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_3_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_4_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_5_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_6_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_7_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_8_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_9_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_10_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_11_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_12_coeffs),
+};
+
+#undef STBIR__horizontal_channels
+#undef STB_IMAGE_RESIZE_DO_HORIZONTALS
+#undef stbir__1_coeff_only
+#undef stbir__1_coeff_remnant
+#undef stbir__2_coeff_only
+#undef stbir__2_coeff_remnant
+#undef stbir__3_coeff_only
+#undef stbir__3_coeff_remnant
+#undef stbir__3_coeff_setup
+#undef stbir__4_coeff_start
+#undef stbir__4_coeff_continue_from_4
+#undef stbir__store_output
+#undef stbir__store_output_tiny
+#undef STBIR_chans
+
+#endif  // HORIZONTALS
+
+#undef STBIR_strs_join2
+#undef STBIR_strs_join1
+
+#endif // STB_IMAGE_RESIZE_DO_HORIZONTALS/VERTICALS/CODERS
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_image_write.h b/lib/stb/stb_image_write.h
new file mode 100644
index 0000000..e4b32ed
--- /dev/null
+++ b/lib/stb/stb_image_write.h
@@ -0,0 +1,1724 @@
+/* stb_image_write - v1.16 - public domain - http://nothings.org/stb
+   writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
+                                     no warranty implied; use at your own risk
+
+   Before #including,
+
+       #define STB_IMAGE_WRITE_IMPLEMENTATION
+
+   in the file that you want to have the implementation.
+
+   Will probably not work correctly with strict-aliasing optimizations.
+
+ABOUT:
+
+   This header file is a library for writing images to C stdio or a callback.
+
+   The PNG output is not optimal; it is 20-50% larger than the file
+   written by a decent optimizing implementation; though providing a custom
+   zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
+   This library is designed for source code compactness and simplicity,
+   not optimal image file size or run-time performance.
+
+BUILDING:
+
+   You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
+   You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
+   malloc,realloc,free.
+   You can #define STBIW_MEMMOVE() to replace memmove()
+   You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
+   for PNG compression (instead of the builtin one), it must have the following signature:
+   unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
+   The returned data will be freed with STBIW_FREE() (free() by default),
+   so it must be heap allocated with STBIW_MALLOC() (malloc() by default).
+
+UNICODE:
+
+   If compiling for Windows and you wish to use Unicode filenames, compile
+   with
+       #define STBIW_WINDOWS_UTF8
+   and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert
+   Windows wchar_t filenames to utf8.
+
+USAGE:
+
+   There are five functions, one for each image file format:
+
+     int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
+     int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
+     int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
+     int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
+     int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+
+     void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
+
+   There are also five equivalent functions that use an arbitrary write function. You are
+   expected to open/close your file-equivalent before and after calling these:
+
+     int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
+     int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+     int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+     int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+     int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
+
+   where the callback is:
+      void stbi_write_func(void *context, void *data, int size);
+
+   You can configure it with these global variables:
+      int stbi_write_tga_with_rle;             // defaults to true; set to 0 to disable RLE
+      int stbi_write_png_compression_level;    // defaults to 8; set to higher for more compression
+      int stbi_write_force_png_filter;         // defaults to -1; set to 0..5 to force a filter mode
+
+
+   You can define STBI_WRITE_NO_STDIO to disable the file variant of these
+   functions, so the library will not use stdio.h at all. However, this will
+   also disable HDR writing, because it requires stdio for formatted output.
+
+   Each function returns 0 on failure and non-0 on success.
+
+   The functions create an image file defined by the parameters. The image
+   is a rectangle of pixels stored from left-to-right, top-to-bottom.
+   Each pixel contains 'comp' channels of data stored interleaved with 8-bits
+   per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is
+   monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall.
+   The *data pointer points to the first byte of the top-left-most pixel.
+   For PNG, "stride_in_bytes" is the distance in bytes from the first byte of
+   a row of pixels to the first byte of the next row of pixels.
+
+   PNG creates output files with the same number of components as the input.
+   The BMP format expands Y to RGB in the file format and does not
+   output alpha.
+
+   PNG supports writing rectangles of data even when the bytes storing rows of
+   data are not consecutive in memory (e.g. sub-rectangles of a larger image),
+   by supplying the stride between the beginning of adjacent rows. The other
+   formats do not. (Thus you cannot write a native-format BMP through the BMP
+   writer, both because it is in BGR order and because it may have padding
+   at the end of the line.)
+
+   PNG allows you to set the deflate compression level by setting the global
+   variable 'stbi_write_png_compression_level' (it defaults to 8).
+
+   HDR expects linear float data. Since the format is always 32-bit rgb(e)
+   data, alpha (if provided) is discarded, and for monochrome data it is
+   replicated across all three channels.
+
+   TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
+   data, set the global variable 'stbi_write_tga_with_rle' to 0.
+
+   JPEG ignores alpha channels in input data; quality is between 1 and 100.
+   Higher quality looks better but results in a bigger image.
+   JPEG baseline (no JPEG progressive).
+
+CREDITS:
+
+
+   Sean Barrett           -    PNG/BMP/TGA
+   Baldur Karlsson        -    HDR
+   Jean-Sebastien Guay    -    TGA monochrome
+   Tim Kelsey             -    misc enhancements
+   Alan Hickman           -    TGA RLE
+   Emmanuel Julien        -    initial file IO callback implementation
+   Jon Olick              -    original jo_jpeg.cpp code
+   Daniel Gibson          -    integrate JPEG, allow external zlib
+   Aarni Koskela          -    allow choosing PNG filter
+
+   bugfixes:
+      github:Chribba
+      Guillaume Chereau
+      github:jry2
+      github:romigrou
+      Sergio Gonzalez
+      Jonas Karlsson
+      Filip Wasil
+      Thatcher Ulrich
+      github:poppolopoppo
+      Patrick Boettcher
+      github:xeekworx
+      Cap Petschulat
+      Simon Rodriguez
+      Ivan Tikhonov
+      github:ignotion
+      Adam Schackart
+      Andrew Kensler
+
+LICENSE
+
+  See end of file for license information.
+
+*/
+
+#ifndef INCLUDE_STB_IMAGE_WRITE_H
+#define INCLUDE_STB_IMAGE_WRITE_H
+
+#include <stdlib.h>
+
+// STBIWDEF is the linkage prefix put on every public declaration below; a user definition takes precedence.
+// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
+#ifndef STBIWDEF
+#ifdef STB_IMAGE_WRITE_STATIC
+#define STBIWDEF  static
+#else
+#ifdef __cplusplus
+#define STBIWDEF  extern "C"
+#else
+#define STBIWDEF  extern
+#endif
+#endif
+#endif
+
+#ifndef STB_IMAGE_WRITE_STATIC  // C++ forbids static forward declarations
+STBIWDEF int stbi_write_tga_with_rle;            // TGA run-length encoding on/off (see header notes above)
+STBIWDEF int stbi_write_png_compression_level;   // PNG deflate level (see header notes above)
+STBIWDEF int stbi_write_force_png_filter;        // PNG filter override (see header notes above)
+#endif
+
+#ifndef STBI_WRITE_NO_STDIO // filename-based writers exist only when stdio is allowed
+STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);
+
+#ifdef STBIW_WINDOWS_UTF8
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+#endif
+#endif
+
+typedef void stbi_write_func(void *context, void *data, int size); // output callback: receives `size` bytes at `data` plus the caller's context pointer
+
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void  *data, int quality);
+
+STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean); // non-zero reverses the row order used by the writers
+
+#endif//INCLUDE_STB_IMAGE_WRITE_H
+
+#ifdef STB_IMAGE_WRITE_IMPLEMENTATION
+
+#ifdef _WIN32
+   #ifndef _CRT_SECURE_NO_WARNINGS
+   #define _CRT_SECURE_NO_WARNINGS
+   #endif
+   #ifndef _CRT_NONSTDC_NO_DEPRECATE
+   #define _CRT_NONSTDC_NO_DEPRECATE
+   #endif
+#endif
+
+#ifndef STBI_WRITE_NO_STDIO
+#include <stdio.h>
+#endif // STBI_WRITE_NO_STDIO
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
+// ok: the user supplied a complete allocator set
+#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
+// ok: nothing supplied, the libc defaults below are used
+#else
+#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
+#endif
+
+#ifndef STBIW_MALLOC
+#define STBIW_MALLOC(sz)        malloc(sz)
+#define STBIW_REALLOC(p,newsz)  realloc(p,newsz)
+#define STBIW_FREE(p)           free(p)
+#endif
+
+#ifndef STBIW_REALLOC_SIZED
+#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz) // default drops oldsz and forwards to STBIW_REALLOC
+#endif
+
+
+#ifndef STBIW_MEMMOVE
+#define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
+#endif
+
+
+#ifndef STBIW_ASSERT
+#include <assert.h>
+#define STBIW_ASSERT(x) assert(x)
+#endif
+
+#define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff) // keep only the low 8 bits of x
+
+#ifdef STB_IMAGE_WRITE_STATIC // definitions of the three tuning globals; static build keeps them private to this translation unit
+static int stbi_write_png_compression_level = 8;
+static int stbi_write_tga_with_rle = 1;
+static int stbi_write_force_png_filter = -1;
+#else
+int stbi_write_png_compression_level = 8;
+int stbi_write_tga_with_rle = 1;
+int stbi_write_force_png_filter = -1;
+#endif
+
+static int stbi__flip_vertically_on_write = 0; // 0 by default; when non-zero the row writers reverse their direction
+
+STBIWDEF void stbi_flip_vertically_on_write(int flag)
+{  // stores flag as-is in the file-level setting; affects every subsequent write
+   stbi__flip_vertically_on_write = flag;
+}
+
+typedef struct
+{
+   stbi_write_func *func;      // output callback that receives every chunk of bytes
+   void *context;              // opaque pointer handed back to func (a FILE* on the stdio path)
+   unsigned char buffer[64];   // staging area filled by stbiw__write1/stbiw__write3
+   int buf_used;               // number of bytes currently staged in buffer
+} stbi__write_context;
+
+// bind an output callback and its user pointer to a context (buffer/buf_used are left untouched)
+static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
+{
+   s->context = context;
+   s->func    = c;
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+
+static void stbi__stdio_write(void *context, void *data, int size)
+{  // stdio adapter for stbi_write_func: context carries the FILE*; the fwrite result is not checked
+   FILE *out = (FILE*) context; fwrite(data, 1, size, out);
+}
+
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8) // Windows builds that opted in to UTF-8 filenames
+#ifdef __cplusplus
+#define STBIW_EXTERN extern "C"
+#else
+#define STBIW_EXTERN extern
+#endif
+// The two Win32 conversion routines are declared by hand here instead of through a Windows header.
+STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
+STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
+
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
+{  // converts input to UTF-8 (code page 65001) into buffer; passes -1 as the input length and forwards WideCharToMultiByte's return value
+   return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
+}
+#endif
+
+static FILE *stbiw__fopen(char const *filename, char const *mode)
+{  // portable fopen wrapper: returns the opened FILE*, or 0 on any failure
+   FILE *f;
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
+   wchar_t wMode[64];          // wide-character copy of mode
+   wchar_t wFilename[1024];    // wide-character copy of filename (fixed capacity)
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
+      return 0; // conversion reported 0: give up without opening anything
+
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
+      return 0;
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400 // this compiler range gets the _s variant
+   if (0 != _wfopen_s(&f, wFilename, wMode))
+      f = 0; // normalise failure to a null FILE*
+#else
+   f = _wfopen(wFilename, wMode);
+#endif
+
+#elif defined(_MSC_VER) && _MSC_VER >= 1400 // narrow-string path, same compiler range
+   if (0 != fopen_s(&f, filename, mode))
+      f=0; // normalise failure to a null FILE*
+#else
+   f = fopen(filename, mode); // everything else: plain C fopen
+#endif
+   return f;
+}
+
+static int stbi__start_write_file(stbi__write_context *s, const char *filename)
+{  // open filename for binary writing and route the context through stdio; returns 1 on success, 0 if the open failed
+   FILE *out = stbiw__fopen(filename, "wb");
+   stbi__start_write_callbacks(s, stbi__stdio_write, (void *) out); // installed even when out is NULL
+   return out ? 1 : 0;
+}
+
+static void stbi__end_write_file(stbi__write_context *s)
+{  // close the FILE* stored in s->context; the fclose result is discarded
+   FILE *out = (FILE *)s->context; fclose(out);
+}
+
+#endif // !STBI_WRITE_NO_STDIO
+
+typedef unsigned int stbiw_uint32; // the library's 32-bit unsigned type
+typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1]; // compile-time check: array size becomes -1 (an error) unless stbiw_uint32 is 4 bytes
+
+static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v)
+{  // for each format char write one little-endian field taken from v: '1' = 1 byte, '2' = 2 bytes, '4' = 4 bytes; ' ' is skipped
+   while (*fmt) {
+      switch (*fmt++) {
+         case ' ': break;
+         case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int));
+                     s->func(s->context,&x,1);
+                     break; }
+         case '2': { int x = va_arg(v,int);
+                     unsigned char b[2];
+                     b[0] = STBIW_UCHAR(x);
+                     b[1] = STBIW_UCHAR(x>>8);
+                     s->func(s->context,b,2);
+                     break; }
+         case '4': { stbiw_uint32 x = va_arg(v,stbiw_uint32); // fetched as unsigned: the BMP writer passes 0xff000000u, which does not fit in int; non-negative int arguments still read correctly
+                     unsigned char b[4];
+                     b[0]=STBIW_UCHAR(x);
+                     b[1]=STBIW_UCHAR(x>>8);
+                     b[2]=STBIW_UCHAR(x>>16);
+                     b[3]=STBIW_UCHAR(x>>24);
+                     s->func(s->context,b,4);
+                     break; }
+         default: // unknown format character: assert in debug builds, stop writing otherwise
+            STBIW_ASSERT(0);
+            return;
+      }
+   }
+}
+
+static void stbiw__writef(stbi__write_context *s, const char *fmt, ...)
+{  // variadic front end: packs the trailing arguments into a va_list for stbiw__writefv
+   va_list fields;
+   va_start(fields, fmt);
+   stbiw__writefv(s, fmt, fields);
+   va_end(fields);
+}
+
+static void stbiw__write_flush(stbi__write_context *s)
+{  // hand any staged bytes to the callback and mark the staging buffer empty
+   if (s->buf_used == 0)
+      return; // nothing staged: the callback is not invoked
+   s->func(s->context, &s->buffer, s->buf_used);
+   s->buf_used = 0;
+}
+
+static void stbiw__putc(stbi__write_context *s, unsigned char c)
+{  // unbuffered single-byte write: goes straight to the callback without touching s->buffer
+   s->func(s->context, &c, 1);
+}
+
+static void stbiw__write1(stbi__write_context *s, unsigned char a)
+{  // stage one byte, flushing first when the staging buffer has no room left
+   if (sizeof(s->buffer) < (size_t)s->buf_used + 1) stbiw__write_flush(s);
+   s->buffer[s->buf_used] = a;
+   s->buf_used += 1;
+}
+
+static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
+{  // stage three bytes in the order a, b, c, flushing first if they would not all fit
+   unsigned char *dst;
+   if (sizeof(s->buffer) < (size_t)s->buf_used + 3)
+      stbiw__write_flush(s);
+   dst = s->buffer + s->buf_used; // computed after the possible flush, which resets buf_used
+   dst[0] = a;
+   dst[1] = b;
+   dst[2] = c;
+   s->buf_used += 3;
+}
+
+static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d)
+{  // stage one pixel read from d; write_alpha < 0 emits the last channel first, > 0 emits it last, 0 omits it
+   unsigned char pink[3] = { 255, 0, 255}, blended[3];
+   int ch;
+
+   if (write_alpha < 0)
+      stbiw__write1(s, d[comp - 1]);
+
+   switch (comp) {
+      case 1:
+      case 2: // 2 channels = mono + alpha; alpha is written separately, so same as the 1-channel case
+         if (!expand_mono)
+            stbiw__write1(s, d[0]);  // monochrome TGA
+         else
+            stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
+         break;
+      case 3:
+         stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); // rgb_dir selects which end of the triple goes out first
+         break;
+      case 4:
+         if (write_alpha) {
+            stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); // colour only; alpha is emitted before/after the switch
+            break;
+         }
+         for (ch = 0; ch < 3; ++ch) // alpha is dropped from the output: composite against pink background
+            blended[ch] = pink[ch] + ((d[ch] - pink[ch]) * d[3]) / 255;
+         stbiw__write3(s, blended[1 - rgb_dir], blended[1], blended[1 + rgb_dir]);
+         break;
+   }
+   if (write_alpha > 0)
+      stbiw__write1(s, d[comp - 1]);
+}
+
+static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
+{  // write y rows of x tightly packed pixels (row stride x*comp bytes); each row is followed by scanline_pad zero bytes
+   stbiw_uint32 zero = 0; // source of the padding bytes, so scanline_pad must not exceed 4
+   int i,j, j_end;
+
+   if (y <= 0)
+      return;
+
+   if (stbi__flip_vertically_on_write)
+      vdir *= -1; // the global flip flag reverses whichever direction the format asked for
+
+   if (vdir < 0) {
+      j_end = -1; j = y-1; // last row in memory goes out first
+   } else {
+      j_end =  y; j = 0;   // first row in memory goes out first
+   }
+
+   for (; j != j_end; j += vdir) {
+      for (i=0; i < x; ++i) {
+         unsigned char *d = (unsigned char *) data + ((size_t) j*x+i)*comp; // offset in size_t: as plain int, (j*x+i)*comp overflows once the image exceeds INT_MAX bytes
+         stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
+      }
+      stbiw__write_flush(s); // drain staged pixels before the padding goes straight to the callback
+      s->func(s->context, &zero, scanline_pad);
+   }
+}
+
+static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
+{  // write the header described by fmt and the varargs, then the pixel rows; returns 0 with no output if x or y is negative, else 1
+   va_list header_fields;
+
+   if (x < 0 || y < 0)
+      return 0;
+
+   va_start(header_fields, fmt);
+   stbiw__writefv(s, fmt, header_fields);
+   va_end(header_fields);
+   stbiw__write_pixels(s, rgb_dir, vdir, x, y, comp, data, alpha, pad, expand_mono);
+   return 1;
+}
+
+static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
+{
+   if (comp == 4) {
+      // RGBA: 32bpp output with a 14-byte file header and a 108-byte V4 info header;
+      // compression field 3 (BI_BITFIELDS) with explicit R/G/B/A masks, rows need no padding
+      // (straight BI_RGB with alpha mask doesn't work in most readers)
+      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0,
+         "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444",
+         'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header
+         108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header
+   } else {
+      // every other comp: 24bpp RGB output, each row padded up to a multiple of 4 bytes
+      int row_pad = (-x*3) & 3;
+      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,row_pad,
+              "11 4 22 4" "4 44 22 444444",
+              'B', 'M', 14+40+(x*3+row_pad)*y, 0,0, 14+40,  // file header
+               40, x,y, 1,24, 0,0,0,0,0,0);                 // bitmap header
+   }
+}
+
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
+{  // BMP to a user callback; the context is zero-initialised so buf_used starts at 0
+   stbi__write_context ctx = { 0 };
+   stbi__start_write_callbacks(&ctx, func, context);
+   return stbi_write_bmp_core(&ctx, x, y, comp, data);
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
+{  // BMP to a named file; returns 0 if the file cannot be opened, otherwise the core writer's result
+   stbi__write_context ctx = { 0 };
+   int ok;
+   if (!stbi__start_write_file(&ctx, filename))
+      return 0;
+   ok = stbi_write_bmp_core(&ctx, x, y, comp, data);
+   stbi__end_write_file(&ctx);
+   return ok;
+}
+#endif //!STBI_WRITE_NO_STDIO
+
+static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data)
+{  // Writes a TGA image, raw or run-length encoded depending on stbi_write_tga_with_rle; returns 0 for negative dimensions.
+   int has_alpha = (comp == 2 || comp == 4);
+   int colorbytes = has_alpha ? comp-1 : comp;   // component count excluding alpha
+   int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
+
+   if (y < 0 || x < 0)
+      return 0;
+
+   if (!stbi_write_tga_with_rle) {
+      return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
+         "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
+   } else {
+      int i,j,k;
+      int jend, jdir;
+
+      stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8);   // same header fields as the raw path, except the image type is format+8
+
+      if (stbi__flip_vertically_on_write) {   // pick the row iteration order: first-to-last when flipping, last-to-first otherwise
+         j = 0;
+         jend = y;
+         jdir = 1;
+      } else {
+         j = y-1;
+         jend = -1;
+         jdir = -1;
+      }
+      for (; j != jend; j += jdir) {
+         unsigned char *row = (unsigned char *) data + j * x * comp;   // rows are tightly packed: x*comp bytes each
+         int len;
+
+         for (i = 0; i < x; i += len) {   // each iteration emits one packet covering `len` pixels
+            unsigned char *begin = row + i * comp;
+            int diff = 1;   // nonzero -> raw packet, zero -> run packet
+            len = 1;
+
+            if (i < x - 1) {
+               ++len;
+               diff = memcmp(begin, row + (i + 1) * comp, comp);   // do the first two pixels differ?
+               if (diff) {
+                  const unsigned char *prev = begin;   // trails k by two pixels
+                  for (k = i + 2; k < x && len < 128; ++k) {
+                     if (memcmp(prev, row + k * comp, comp)) {
+                        prev += comp;
+                        ++len;
+                     } else {
+                        --len;   // pixel k equals the pixel two back: shorten the raw packet by one and stop
+                        break;
+                     }
+                  }
+               } else {
+                  for (k = i + 2; k < x && len < 128; ++k) {   // extend the run while pixels keep matching the first one
+                     if (!memcmp(begin, row + k * comp, comp)) {
+                        ++len;
+                     } else {
+                        break;
+                     }
+                  }
+               }
+            }
+
+            if (diff) {
+               unsigned char header = STBIW_UCHAR(len - 1);   // raw packet: count byte is len-1, followed by len pixels
+               stbiw__write1(s, header);
+               for (k = 0; k < len; ++k) {
+                  stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
+               }
+            } else {
+               unsigned char header = STBIW_UCHAR(len - 129);   // run packet: negative value narrowed to a byte (presumably (len-1)|0x80 -- depends on STBIW_UCHAR), then one pixel
+               stbiw__write1(s, header);
+               stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
+            }
+         }
+      }
+      stbiw__write_flush(s);
+   }
+   return 1;
+}
+
+// Write a TGA image through a user-supplied callback; returns stbi_write_tga_core()'s result.
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) {
+   stbi__write_context wctx = { 0 };
+   stbi__start_write_callbacks(&wctx, func, context);
+   return stbi_write_tga_core(&wctx, x, y, comp, (void *) data);
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+// Write a TGA file; returns 0 when stbi__start_write_file() fails, else stbi_write_tga_core()'s result.
+STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
+{
+   stbi__write_context wctx = { 0 };
+   int result;
+   if (!stbi__start_write_file(&wctx,filename)) return 0;
+   result = stbi_write_tga_core(&wctx, x, y, comp, (void *) data);
+   stbi__end_write_file(&wctx);
+   return result;
+}
+#endif
+
+// *************************************************************************************************
+// Radiance RGBE HDR writer
+// by Baldur Karlsson
+
+#define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))   // larger of a and b; the chosen argument is evaluated twice, so avoid side effects
+
+#ifndef STBI_WRITE_NO_STDIO
+
+// Pack a linear RGB float triple into 4 bytes: three mantissa bytes scaled by the
+// largest channel, followed by that channel's binary exponent plus 128.
+static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
+{
+   int exponent, c;
+   float normalize;
+   float maxcomp = linear[1] > linear[2] ? linear[1] : linear[2];
+   if (linear[0] > maxcomp) maxcomp = linear[0];
+   if (maxcomp < 1e-32f) {   // largest channel is below the threshold: emit an all-zero pixel
+      rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
+      return;
+   }
+   normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp;
+   for (c = 0; c < 3; ++c) rgbe[c] = (unsigned char)(linear[c] * normalize);
+   rgbe[3] = (unsigned char)(exponent + 128);
+}
+
+// Emit one run packet: a count byte of length+128 followed by the single repeated value.
+static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte) {
+   unsigned char packet_len = STBIW_UCHAR(length+128);
+   STBIW_ASSERT(length+128 <= 255);
+   s->func(s->context, &packet_len, 1);
+   s->func(s->context, &databyte, 1);
+}
+
+// Emit one literal packet: a count byte `length` followed by `length` bytes taken from `data`.
+static void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data) {
+   unsigned char packet_len = STBIW_UCHAR(length);
+   STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
+   s->func(s->context, &packet_len, 1);
+   s->func(s->context, data, length);
+}
+
+static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
+{  // Encodes one row of `width` float pixels as RGBE and writes it: flat 4-byte pixels for very narrow/wide rows, per-component RLE otherwise.
+   unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
+   unsigned char rgbe[4];
+   float linear[3];
+   int x;
+
+   scanlineheader[2] = (width&0xff00)>>8;   // width, high byte
+   scanlineheader[3] = (width&0x00ff);      // width, low byte
+
+   /* skip RLE for images too small or large */
+   if (width < 8 || width >= 32768) {
+      for (x=0; x < width; x++) {
+         switch (ncomp) {
+            case 4: /* fallthrough */
+            case 3: linear[2] = scanline[x*ncomp + 2];
+                    linear[1] = scanline[x*ncomp + 1];
+                    linear[0] = scanline[x*ncomp + 0];
+                    break;
+            default:   // any other component count: replicate the first component into R, G and B
+                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
+                    break;
+         }
+         stbiw__linear_to_rgbe(rgbe, linear);
+         s->func(s->context, rgbe, 4);   // no scanline header and no scratch use on this path
+      }
+   } else {
+      int c,r;
+      /* encode into scratch buffer */
+      for (x=0; x < width; x++) {
+         switch(ncomp) {
+            case 4: /* fallthrough */
+            case 3: linear[2] = scanline[x*ncomp + 2];
+                    linear[1] = scanline[x*ncomp + 1];
+                    linear[0] = scanline[x*ncomp + 0];
+                    break;
+            default:
+                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
+                    break;
+         }
+         stbiw__linear_to_rgbe(rgbe, linear);
+         scratch[x + width*0] = rgbe[0];   // scratch holds four planes of `width` bytes: R, G, B, E
+         scratch[x + width*1] = rgbe[1];
+         scratch[x + width*2] = rgbe[2];
+         scratch[x + width*3] = rgbe[3];
+      }
+
+      s->func(s->context, scanlineheader, 4);
+
+      /* RLE each component separately */
+      for (c=0; c < 4; c++) {
+         unsigned char *comp = &scratch[width*c];
+
+         x = 0;
+         while (x < width) {
+            // find first run
+            r = x;
+            while (r+2 < width) {
+               if (comp[r] == comp[r+1] && comp[r] == comp[r+2])   // a run needs at least three equal bytes
+                  break;
+               ++r;
+            }
+            if (r+2 >= width)   // no run found: treat the rest of the plane as literal data
+               r = width;
+            // dump up to first run
+            while (x < r) {
+               int len = r-x;
+               if (len > 128) len = 128;   // literal packets carry at most 128 bytes
+               stbiw__write_dump_data(s, len, &comp[x]);
+               x += len;
+            }
+            // if there's a run, output it
+            if (r+2 < width) { // same test as what we break out of in search loop, so only true if we break'd
+               // find next byte after run
+               while (r < width && comp[r] == comp[x])
+                  ++r;
+               // output run up to r
+               while (x < r) {
+                  int len = r-x;
+                  if (len > 127) len = 127;   // run packets carry at most 127 repeats
+                  stbiw__write_run_data(s, len, comp[x]);
+                  x += len;
+               }
+            }
+         }
+      }
+   }
+}
+
+// Write a Radiance RGBE (.hdr) image: text header, resolution line, then one encoded
+// scanline per row. Returns 0 on non-positive dimensions, NULL data, or allocation failure.
+static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data)
+{
+   unsigned char *scratch;
+   int i, len;
+   char buffer[128];
+   char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
+   if (y <= 0 || x <= 0 || data == NULL)
+      return 0;
+   // Each component is stored separately. Allocate scratch space for full output scanline.
+   scratch = (unsigned char *) STBIW_MALLOC(x*4);
+   if (scratch == NULL) return 0;   // fail before emitting anything instead of writing through NULL in the RLE path
+   s->func(s->context, header, sizeof(header)-1);
+#ifdef __STDC_LIB_EXT1__
+   len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+#else
+   len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+#endif
+   s->func(s->context, buffer, len);
+   for(i=0; i < y; i++)   // rows are taken last-to-first when stbi__flip_vertically_on_write is set
+      stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? y-1-i : i));
+   STBIW_FREE(scratch);
+   return 1;
+}
+
+// Write an HDR image through a user-supplied callback; returns stbi_write_hdr_core()'s result.
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data) {
+   stbi__write_context wctx = { 0 };
+   stbi__start_write_callbacks(&wctx, func, context);
+   return stbi_write_hdr_core(&wctx, x, y, comp, (float *) data);
+}
+
+// Write an HDR file; returns 0 when stbi__start_write_file() fails, else stbi_write_hdr_core()'s result.
+STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
+{
+   stbi__write_context wctx = { 0 };
+   int result;
+   if (!stbi__start_write_file(&wctx,filename)) return 0;
+   result = stbi_write_hdr_core(&wctx, x, y, comp, (float *) data);
+   stbi__end_write_file(&wctx);
+   return result;
+}
+#endif // STBI_WRITE_NO_STDIO
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// PNG writer
+//
+
+#ifndef STBIW_ZLIB_COMPRESS
+// stretchy buffer: a growable array whose two-int header [capacity, count] sits just before the element pointer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
+#define stbiw__sbraw(a) ((int *) (void *) (a) - 2)   // start of the allocation (the two-int header)
+#define stbiw__sbm(a)   stbiw__sbraw(a)[0]           // capacity, in elements
+#define stbiw__sbn(a)   stbiw__sbraw(a)[1]           // number of elements in use
+
+#define stbiw__sbneedgrow(a,n)  ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a))   // true for a null buffer or when n more items would reach capacity
+#define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0)
+#define stbiw__sbgrow(a,n)  stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a)))   // `a` must be an lvalue: it is updated in place
+
+#define stbiw__sbpush(a, v)      (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v))   // append v, growing first if needed
+#define stbiw__sbcount(a)        ((a) ? stbiw__sbn(a) : 0)   // element count; 0 for a null buffer
+#define stbiw__sbfree(a)         ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0)   // release the allocation; no-op for a null buffer
+
+// Grow (or first allocate) the stretchy buffer *arr to make room for `increment` more items of `itemsize` bytes.
+// On success *arr is updated and its capacity recorded; on allocation failure *arr is left as it was. Returns *arr.
+static void *stbiw__sbgrowf(void **arr, int increment, int itemsize) {
+   int new_cap = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1;
+   void *block = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * new_cap + sizeof(int)*2);
+   STBIW_ASSERT(block);
+   if (!block) return *arr;
+   if (!*arr) ((int *) block)[1] = 0;   // brand-new buffer: element count starts at zero
+   *arr = (void *) ((int *) block + 2);   // elements begin after the two-int header
+   stbiw__sbm(*arr) = new_cap;
+   return *arr;
+}
+
+// Move every complete byte out of the bit accumulator into the stretchy output buffer,
+// lowest byte first, and return the buffer pointer (stbiw__sbpush may have changed it).
+static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount) {
+   for (; *bitcount >= 8; *bitcount -= 8) {
+      stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
+      *bitbuffer >>= 8;
+   }
+   return data;
+}
+
+// Return the low `codebits` bits of `code` with their order reversed
+// (bit 0 becomes bit codebits-1 and so on); higher bits of `code` are ignored.
+static int stbiw__zlib_bitrev(int code, int codebits)
+{
+   int reversed = 0;
+   for (; codebits; --codebits, code >>= 1)
+      reversed = (reversed << 1) | (code & 1);
+   return reversed;
+}
+
+// Count how many leading bytes of `a` and `b` agree, stopping at `limit` or 258, whichever is smaller.
+static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit)
+{
+   int matched = 0;
+   while (matched < limit && matched < 258 && a[matched] == b[matched]) ++matched;
+   return matched;
+}
+
+// Hash the three bytes starting at `data` into a 32-bit value: the bytes are packed
+// low byte first and then scrambled by alternating shift-xor and shift-add rounds.
+// Reads data[0..2], so at least three bytes must be readable.
+static unsigned int stbiw__zhash(unsigned char *data)
+{
+   stbiw_uint32 h = data[0] + (data[1] << 8) + (data[2] << 16);
+   h ^= h << 3;   h += h >> 5;
+   h ^= h << 4;   h += h >> 17;
+   h ^= h << 25;  h += h >> 6;
+   return h;
+}
+
+#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))   // uses the caller's locals out, bitbuf and bitcount
+#define stbiw__zlib_add(code,codebits) \
+      (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())   // append `codebits` bits of `code` above the bits already pending, then flush whole bytes
+#define stbiw__zlib_huffa(b,c)  stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c)   // append the c-bit code b in reversed bit order
+// default huffman tables: symbol ranges and the code each range starts from
+#define stbiw__zlib_huff1(n)  stbiw__zlib_huffa(0x30 + (n), 8)          // symbols 0..143: 8-bit codes starting at 0x30
+#define stbiw__zlib_huff2(n)  stbiw__zlib_huffa(0x190 + (n)-144, 9)     // symbols 144..255: 9-bit codes starting at 0x190
+#define stbiw__zlib_huff3(n)  stbiw__zlib_huffa(0 + (n)-256,7)          // symbols 256..279: 7-bit codes starting at 0
+#define stbiw__zlib_huff4(n)  stbiw__zlib_huffa(0xc0 + (n)-280,8)       // symbols 280 and up: 8-bit codes starting at 0xc0
+#define stbiw__zlib_huff(n)  ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
+#define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))   // byte-valued symbols only (n <= 255)
+
+#define stbiw__ZHASH   16384   // number of hash buckets; used as a mask (stbiw__ZHASH-1), so it must stay a power of two
+
+#endif // STBIW_ZLIB_COMPRESS
+
+STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
+{  // Compresses data[0..data_len) into a newly allocated zlib stream; stores its size in *out_len and returns it (NULL if the hash table cannot be allocated).
+#ifdef STBIW_ZLIB_COMPRESS
+   // user provided a zlib compress implementation, use that
+   return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
+#else // use builtin
+   static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };   // smallest match length for each length code
+   static unsigned char  lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 };   // extra-bit count for each length code
+   static unsigned short distc[]   = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };   // smallest distance for each distance code
+   static unsigned char  disteb[]  = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };   // extra-bit count for each distance code
+   unsigned int bitbuf=0;
+   int i,j, bitcount=0;
+   unsigned char *out = NULL;   // stretchy buffer holding the output stream
+   unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char**));   // one stretchy list of input positions per bucket
+   if (hash_table == NULL)
+      return NULL;
+   if (quality < 5) quality = 5;   // quality bounds the per-bucket list length (2*quality); never go below 5
+
+   stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
+   stbiw__sbpush(out, 0x5e);   // FLEVEL = 1
+   stbiw__zlib_add(1,1);  // BFINAL = 1
+   stbiw__zlib_add(1,2);  // BTYPE = 1 -- fixed huffman
+
+   for (i=0; i < stbiw__ZHASH; ++i)
+      hash_table[i] = NULL;
+
+   i=0;
+   while (i < data_len-3) {
+      // hash next 3 bytes of data to be compressed
+      int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3;
+      unsigned char *bestloc = 0;
+      unsigned char **hlist = hash_table[h];
+      int n = stbiw__sbcount(hlist);
+      for (j=0; j < n; ++j) {
+         if (hlist[j]-data > i-32768) { // if entry lies within window
+            int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i);
+            if (d >= best) { best=d; bestloc=hlist[j]; }   // >= prefers the later entry among equal-length matches
+         }
+      }
+      // when hash table entry is too long, delete half the entries
+      if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2*quality) {
+         STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality);   // keep the newer half
+         stbiw__sbn(hash_table[h]) = quality;
+      }
+      stbiw__sbpush(hash_table[h],data+i);
+
+      if (bestloc) {
+         // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
+         h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1);
+         hlist = hash_table[h];
+         n = stbiw__sbcount(hlist);
+         for (j=0; j < n; ++j) {
+            if (hlist[j]-data > i-32767) {
+               int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1);
+               if (e > best) { // if next match is better, bail on current match
+                  bestloc = NULL;
+                  break;
+               }
+            }
+         }
+      }
+
+      if (bestloc) {
+         int d = (int) (data+i - bestloc); // distance back
+         STBIW_ASSERT(d <= 32767 && best <= 258);
+         for (j=0; best > lengthc[j+1]-1; ++j);   // find the length code whose range contains `best`
+         stbiw__zlib_huff(j+257);
+         if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
+         for (j=0; d > distc[j+1]-1; ++j);   // find the distance code whose range contains d
+         stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5);
+         if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]);
+         i += best;
+      } else {
+         stbiw__zlib_huffb(data[i]);   // no usable match: emit the byte as a literal
+         ++i;
+      }
+   }
+   // write out final bytes
+   for (;i < data_len; ++i)
+      stbiw__zlib_huffb(data[i]);
+   stbiw__zlib_huff(256); // end of block
+   // pad with 0 bits to byte boundary
+   while (bitcount)
+      stbiw__zlib_add(0,1);
+
+   for (i=0; i < stbiw__ZHASH; ++i)
+      (void) stbiw__sbfree(hash_table[i]);
+   STBIW_FREE(hash_table);
+
+   // store uncompressed instead if compression was worse
+   if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) {   // 2 header bytes plus 5 bytes of overhead per stored block
+      stbiw__sbn(out) = 2;  // truncate to DEFLATE 32K window and FLEVEL = 1
+      for (j = 0; j < data_len;) {
+         int blocklen = data_len - j;
+         if (blocklen > 32767) blocklen = 32767;
+         stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression
+         stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN
+         stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8));
+         stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN
+         stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8));
+         memcpy(out+stbiw__sbn(out), data+j, blocklen);   // copied without a grow call; NOTE(review): presumably relies on the capacity reached by the larger compressed stream -- confirm
+         stbiw__sbn(out) += blocklen;
+         j += blocklen;
+      }
+   }
+
+   {
+      // compute adler32 on input
+      unsigned int s1=1, s2=0;
+      int blocklen = (int) (data_len % 5552);   // first chunk is the remainder so every later chunk is exactly 5552 bytes
+      j=0;
+      while (j < data_len) {
+         for (i=0; i < blocklen; ++i) { s1 += data[j+i]; s2 += s1; }
+         s1 %= 65521; s2 %= 65521;
+         j += blocklen;
+         blocklen = 5552;
+      }
+      stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));   // checksum is appended as s2 then s1, high byte first
+      stbiw__sbpush(out, STBIW_UCHAR(s2));
+      stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
+      stbiw__sbpush(out, STBIW_UCHAR(s1));
+   }
+   *out_len = stbiw__sbn(out);
+   // make returned pointer freeable
+   STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);   // slide the bytes down over the two-int header so the allocation base is returned
+   return (unsigned char *) stbiw__sbraw(out);
+#endif // STBIW_ZLIB_COMPRESS
+}
+
+static unsigned int stbiw__crc32(unsigned char *buffer, int len)
+{  // Returns the CRC-32 of buffer[0..len): via STBIW_CRC32 when that macro is defined, otherwise via the lookup table below.
+#ifdef STBIW_CRC32
+    return STBIW_CRC32(buffer, len);
+#else
+   static unsigned int crc_table[256] =
+   {
+      0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
+      0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
+      0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+      0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
+      0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
+      0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+      0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
+      0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
+      0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+      0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
+      0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
+      0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+      0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
+      0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
+      0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+      0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
+      0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
+      0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+      0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
+      0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
+      0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+      0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
+      0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
+      0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+      0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
+      0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
+      0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+      0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
+      0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
+      0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+      0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
+      0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+   };
+
+   unsigned int crc = ~0u;   // start from all ones
+   int i;
+   for (i=0; i < len; ++i)   // one table lookup per input byte
+      crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
+   return ~crc;   // final bit inversion
+#endif
+}
+
+#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)   // store four bytes at o and advance o by 4
+#define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v));   // store v most-significant byte first; note the macro already ends in ';'
+#define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3])   // store the first four characters of s
+
+// Append the CRC32 of the `len` payload bytes plus the 4 tag bytes that end at *data, then advance *data past it.
+static void stbiw__wpcrc(unsigned char **data, int len) {
+   unsigned char *chunk_start = *data - len - 4;
+   unsigned int crc = stbiw__crc32(chunk_start, len+4);
+   stbiw__wp32(*data, crc);
+}
+
+// Paeth predictor: return whichever of a, b, c lies nearest to a+b-c.
+// Ties are resolved in favour of a, then b.
+static unsigned char stbiw__paeth(int a, int b, int c) {
+   int estimate = a + b - c;
+   int da = abs(estimate-a), db = abs(estimate-b), dc = abs(estimate-c);
+   return STBIW_UCHAR((da <= db && da <= dc) ? a : (db <= dc) ? b : c);
+}
+
+// @OPTIMIZE: provide an option that always forces left-predict or paeth predict
+static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer)
+{  // Applies PNG filter `filter_type` to output row y (n bytes per pixel) and stores the width*n filtered bytes in line_buffer.
+   static int mapping[] = { 0,1,2,3,4 };
+   static int firstmap[] = { 0,1,0,5,6 };   // row 0 has no previous row: type 2 becomes 0, types 3 and 4 become the left-only variants 5 and 6
+   int *mymap = (y != 0) ? mapping : firstmap;
+   int i;
+   int type = mymap[filter_type];
+   unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y);   // source row for output row y
+   int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;   // z[i-signed_stride] is the same byte in the previously output row
+
+   if (type==0) {   // no filtering: copy the row unchanged
+      memcpy(line_buffer, z, width*n);
+      return;
+   }
+
+   // first loop isn't optimized since it's just one pixel
+   for (i = 0; i < n; ++i) {   // the first pixel has no left neighbour, so only the row above can contribute
+      switch (type) {
+         case 1: line_buffer[i] = z[i]; break;
+         case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
+         case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break;
+         case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break;
+         case 5: line_buffer[i] = z[i]; break;
+         case 6: line_buffer[i] = z[i]; break;
+      }
+   }
+   switch (type) {   // remaining bytes: z[i-n] is the same component of the pixel to the left
+      case 1: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-n]; break;
+      case 2: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-signed_stride]; break;
+      case 3: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break;
+      case 4: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break;
+      case 5: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - (z[i-n]>>1); break;
+      case 6: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
+   }
+}
+
+// Encode an 8-bit image with n components per pixel as a complete PNG file in a newly allocated buffer.
+// Stores the size in *out_len and returns the buffer (release with STBIW_FREE), or 0 if any allocation fails.
+STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
+{
+   int force_filter = stbi_write_force_png_filter;
+   int ctype[5] = { -1, 0, 4, 2, 6 };   // IHDR colour-type byte, indexed by n (n is assumed to be 1..4)
+   unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
+   unsigned char *out,*o, *filt, *zlib;
+   signed char *line_buffer;
+   int j,zlen;
+
+   if (stride_bytes == 0)
+      stride_bytes = x * n;
+
+   if (force_filter >= 5) force_filter = -1;   // out-of-range request: fall back to per-row filter selection
+
+   filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
+   line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
+   for (j=0; j < y; ++j) {
+      int filter_type;
+      if (force_filter > -1) {
+         filter_type = force_filter;
+         stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer);
+      } else { // Estimate the best filter by running through all of them:
+         int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
+         for (filter_type = 0; filter_type < 5; filter_type++) {
+            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer);
+
+            // Estimate the entropy of the line using this filter; the less, the better.
+            est = 0;
+            for (i = 0; i < x*n; ++i) {
+               est += abs((signed char) line_buffer[i]);
+            }
+            if (est < best_filter_val) {
+               best_filter_val = est;
+               best_filter = filter_type;
+            }
+         }
+         if (filter_type != best_filter) {  // If the last iteration already got us the best filter, don't redo it
+            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer);
+            filter_type = best_filter;
+         }
+      }
+      // when we get here, filter_type contains the filter type, and line_buffer contains the data
+      filt[j*(x*n+1)] = (unsigned char) filter_type;
+      STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
+   }
+   STBIW_FREE(line_buffer);
+   zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level);
+   STBIW_FREE(filt);
+   if (!zlib) return 0;
+
+   // each tag requires 12 bytes of overhead
+   out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen + 12);
+   if (!out) { STBIW_FREE(zlib); return 0; }   // release the compressed stream too, otherwise it leaks on this path
+   *out_len = 8 + 12+13 + 12+zlen + 12;
+
+   o=out;
+   STBIW_MEMMOVE(o,sig,8); o+= 8;
+   stbiw__wp32(o, 13); // header length
+   stbiw__wptag(o, "IHDR");
+   stbiw__wp32(o, x);
+   stbiw__wp32(o, y);
+   *o++ = 8;
+   *o++ = STBIW_UCHAR(ctype[n]);
+   *o++ = 0;
+   *o++ = 0;
+   *o++ = 0;
+   stbiw__wpcrc(&o,13);
+
+   stbiw__wp32(o, zlen);
+   stbiw__wptag(o, "IDAT");
+   STBIW_MEMMOVE(o, zlib, zlen);
+   o += zlen;
+   STBIW_FREE(zlib);
+   stbiw__wpcrc(&o, zlen);
+
+   stbiw__wp32(o,0);
+   stbiw__wptag(o, "IEND");
+   stbiw__wpcrc(&o,0);
+
+   STBIW_ASSERT(o == out + *out_len);
+
+   return out;
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+// Encode as PNG and write to `filename`; returns 1 on success, 0 on encode, open, short-write or close failure.
+STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
+{
+   FILE *f;
+   int len, ok;
+   unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len);
+   if (png == NULL) return 0;
+   f = stbiw__fopen(filename, "wb");
+   if (!f) { STBIW_FREE(png); return 0; }
+   ok = fwrite(png, 1, len, f) == (size_t) len;   // a short write (e.g. disk full) must not be reported as success
+   if (fclose(f) != 0) ok = 0;                    // buffered data can still fail to reach the file at close time
+   STBIW_FREE(png);
+   return ok;
+}
+#endif
+
+// Encode as PNG in memory, pass the whole buffer to `func` in one call, free it; returns 0 (func not called) if encoding fails, else 1.
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes) {
+   int png_len;
+   unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &png_len);
+   if (!png) return 0;
+   func(context, png, png_len);
+   STBIW_FREE(png);
+   return 1;
+}
+
+
+/* ***************************************************************************
+ *
+ * JPEG writer
+ *
+ * This is based on Jon Olick's jo_jpeg.cpp:
+ * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
+ */
+
+static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
+      24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };   // entry j is the output slot for the coefficient at row-major index j of the 8x8 block
+
+// Append the bs[1]-bit code bs[0] to the 24-bit accumulator (*bitBufP, *bitCntP) and emit
+// every completed byte, most significant first; a 0 byte is written after each 255 byte.
+// The leftover (fewer than 8) bits are stored back through the two pointers.
+static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs)
+{
+   int pending = *bitCntP + bs[1];
+   int accum = *bitBufP | (bs[0] << (24 - pending));
+   for (; pending >= 8; pending -= 8, accum <<= 8) {
+      unsigned char out = (accum >> 16) & 255;
+      stbiw__putc(s, out);
+      if (out == 255)
+         stbiw__putc(s, 0);   // byte stuffing
+   }
+   *bitBufP = accum;
+   *bitCntP = pending;
+}
+
+static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) {   // in-place 8-point DCT pass over the eight values addressed by d0p..d7p
+   float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;   // load everything first: outputs overwrite the inputs
+   float z1, z2, z3, z4, z5, z11, z13;
+
+   float tmp0 = d0 + d7;   // phase 1: sums and differences of mirrored input pairs
+   float tmp7 = d0 - d7;
+   float tmp1 = d1 + d6;
+   float tmp6 = d1 - d6;
+   float tmp2 = d2 + d5;
+   float tmp5 = d2 - d5;
+   float tmp3 = d3 + d4;
+   float tmp4 = d3 - d4;
+
+   // Even part
+   float tmp10 = tmp0 + tmp3;   // phase 2
+   float tmp13 = tmp0 - tmp3;
+   float tmp11 = tmp1 + tmp2;
+   float tmp12 = tmp1 - tmp2;
+
+   d0 = tmp10 + tmp11;       // phase 3
+   d4 = tmp10 - tmp11;
+
+   z1 = (tmp12 + tmp13) * 0.707106781f; // c4
+   d2 = tmp13 + z1;       // phase 5
+   d6 = tmp13 - z1;
+
+   // Odd part
+   tmp10 = tmp4 + tmp5;       // phase 2
+   tmp11 = tmp5 + tmp6;
+   tmp12 = tmp6 + tmp7;
+
+   // The rotator is modified from fig 4-8 to avoid extra negations.
+   z5 = (tmp10 - tmp12) * 0.382683433f; // c6
+   z2 = tmp10 * 0.541196100f + z5; // c2-c6
+   z4 = tmp12 * 1.306562965f + z5; // c2+c6
+   z3 = tmp11 * 0.707106781f; // c4
+
+   z11 = tmp7 + z3;      // phase 5
+   z13 = tmp7 - z3;
+
+   *d5p = z13 + z2;         // phase 6
+   *d3p = z13 - z2;
+   *d1p = z11 + z4;
+   *d7p = z11 - z4;
+
+   *d0p = d0;  *d2p = d2;  *d4p = d4;  *d6p = d6;   // store the even outputs computed above
+}
+
+// Store in bits[1] the bit length of |val| (at least 1) and in bits[0] the low bits[1] bits of val, or of val-1 when val is negative.
+static void stbiw__jpg_calcBits(int val, unsigned short bits[2])
+{
+   int magnitude = val < 0 ? -val : val;
+   unsigned short nbits = 1;
+   for (magnitude >>= 1; magnitude; magnitude >>= 1) ++nbits;
+   bits[1] = nbits;
+   bits[0] = (val < 0 ? val-1 : val) & ((1<<nbits)-1);
+}
+
+static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, int du_stride, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {   // transforms, quantizes and entropy-codes one 8x8 block held in CDU; returns the block's quantized DC value
+   const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] };   // AC table entry 0x00, written when the rest of the block is zero
+   const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] };   // AC table entry 0xF0, written once per 16 consecutive zeros
+   int dataOff, i, j, n, diff, end0pos, x, y;
+   int DU[64];   // quantized coefficients in zigzag order
+
+   // DCT rows
+   for(dataOff=0, n=du_stride*8; dataOff<n; dataOff+=du_stride) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
+   }
+   // DCT columns
+   for(dataOff=0; dataOff<8; ++dataOff) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+du_stride], &CDU[dataOff+du_stride*2], &CDU[dataOff+du_stride*3], &CDU[dataOff+du_stride*4],
+                     &CDU[dataOff+du_stride*5], &CDU[dataOff+du_stride*6], &CDU[dataOff+du_stride*7]);
+   }
+   // Quantize/descale/zigzag the coefficients
+   for(y = 0, j=0; y < 8; ++y) {
+      for(x = 0; x < 8; ++x,++j) {
+         float v;
+         i = y*du_stride+x;   // position inside CDU, whose rows are du_stride floats apart
+         v = CDU[i]*fdtbl[j];   // fdtbl is indexed in row-major 8x8 order
+         // DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
+         // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
+         DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);   // round half away from zero via truncation
+      }
+   }
+
+   // Encode DC
+   diff = DU[0] - DC;   // DC is coded as the difference from the DC value passed in
+   if (diff == 0) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
+   } else {
+      unsigned short bits[2];
+      stbiw__jpg_calcBits(diff, bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);   // code for the bit length first, then the value bits
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   // Encode ACs
+   end0pos = 63;
+   for(; (end0pos>0)&&(DU[end0pos]==0); --end0pos) {
+   }
+   // end0pos = first element in reverse order !=0
+   if(end0pos == 0) {   // every AC coefficient is zero
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+      return DU[0];
+   }
+   for(i = 1; i <= end0pos; ++i) {
+      int startpos = i;
+      int nrzeroes;
+      unsigned short bits[2];
+      for (; DU[i]==0 && i<=end0pos; ++i) {   // skip a run of zeros; DU[end0pos] is nonzero, so the scan stops by end0pos
+      }
+      nrzeroes = i-startpos;
+      if ( nrzeroes >= 16 ) {
+         int lng = nrzeroes>>4;   // number of complete 16-zero groups
+         int nrmarker;
+         for (nrmarker=1; nrmarker <= lng; ++nrmarker)
+            stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
+         nrzeroes &= 15;
+      }
+      stbiw__jpg_calcBits(DU[i], bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]);   // table index = (zero run << 4) + bit length
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   if(end0pos != 63) {   // trailing zeros remain: terminate the block explicitly
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+   }
+   return DU[0];
+}
+
+static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
+   // Encodes 'data' (width x height pixels, comp bytes per pixel) as a JPEG stream sent through s; returns 0 when the arguments are rejected, otherwise 1. The constants below are function-local so they don't pollute the global namespace.
+   static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
+   static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
+   static const unsigned char std_ac_luminance_values[] = {
+      0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
+      0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
+      0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
+      0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
+      0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
+      0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
+      0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0};
+   static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77};
+   static const unsigned char std_ac_chrominance_values[] = {
+      0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,
+      0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,
+      0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,
+      0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
+      0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,
+      0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,
+      0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   // Huffman tables
+   static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}};
+   static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}};
+   static const unsigned short YAC_HT[256][2] = {
+      {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
+   };
+   static const unsigned short UVAC_HT[256][2] = {
+      {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
+   };
+   static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22,
+                             37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99};
+   static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99,
+                              99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99};
+   static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
+                                 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
+
+   int row, col, i, k, subsample;
+   float fdtbl_Y[64], fdtbl_UV[64];
+   unsigned char YTable[64], UVTable[64];
+   // Reject a null pixel buffer, a zero width or height, and component counts outside 1..4.
+   if(!data || !width || !height || comp > 4 || comp < 1) {
+      return 0;
+   }
+   // quality 0 selects the default of 90; the subsample decision is taken before quality is clamped to 1..100.
+   quality = quality ? quality : 90;
+   subsample = quality <= 90 ? 1 : 0;
+   quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
+   quality = quality < 50 ? 5000 / quality : 200 - quality * 2; // from here on 'quality' is a percentage applied to YQT/UVQT below
+   // Scale the YQT/UVQT tables by that percentage (rounded), clamp each entry to 1..255 and store it at its stbiw__jpg_ZigZag position.
+   for(i = 0; i < 64; ++i) {
+      int uvti, yti = (YQT[i]*quality+50)/100;
+      YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti);
+      uvti = (UVQT[i]*quality+50)/100;
+      UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
+   }
+   // Multipliers handed to stbiw__jpg_processDU: reciprocal of (table entry * aasf[row] * aasf[col]) for each of the 64 positions.
+   for(row = 0, k = 0; row < 8; ++row) {
+      for(col = 0; col < 8; ++col, ++k) {
+         fdtbl_Y[k]  = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+         fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+      }
+   }
+
+   // Write Headers
+   {
+      static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
+      static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
+      const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width),
+                                      3,1,(unsigned char)(subsample?0x22:0x11),0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 }; // carries height and width as two bytes each, plus 0x22 or 0x11 depending on subsample
+      s->func(s->context, (void*)head0, sizeof(head0));
+      s->func(s->context, (void*)YTable, sizeof(YTable));
+      stbiw__putc(s, 1);
+      s->func(s->context, UVTable, sizeof(UVTable));
+      s->func(s->context, (void*)head1, sizeof(head1));
+      s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1); // element 0 of each nrcodes array is skipped
+      s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
+      stbiw__putc(s, 0x10); // HTYACinfo
+      s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1);
+      s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
+      stbiw__putc(s, 1); // HTUDCinfo
+      s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1);
+      s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
+      stbiw__putc(s, 0x11); // HTUACinfo
+      s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1);
+      s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
+      s->func(s->context, (void*)head2, sizeof(head2));
+   }
+
+   // Encode 8x8 macroblocks
+   {
+      static const unsigned short fillBits[] = {0x7F, 7};
+      int DCY=0, DCU=0, DCV=0; // DC value of the previous block of each channel, fed back into stbiw__jpg_processDU
+      int bitBuf=0, bitCnt=0;
+      // comp == 2 is grey+alpha (alpha is ignored)
+      int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0; // with comp <= 2 all three channel pointers read the same byte
+      const unsigned char *dataR = (const unsigned char *)data;
+      const unsigned char *dataG = dataR + ofsG;
+      const unsigned char *dataB = dataR + ofsB;
+      int x, y, pos;
+      if(subsample) { // 16x16 pixel tiles: four Y blocks plus one averaged U block and one averaged V block per tile
+         for(y = 0; y < height; y += 16) {
+            for(x = 0; x < width; x += 16) {
+               float Y[256], U[256], V[256];
+               for(row = y, pos = 0; row < y+16; ++row) {
+                  // row >= height => use last input row
+                  int clamped_row = (row < height) ? row : height - 1;
+                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp; // offset of the source row; rows are taken bottom-up when the flip flag is set
+                  for(col = x; col < x+16; ++col, ++pos) {
+                     // if col >= width => use pixel from last input column
+                     int p = base_p + ((col < width) ? col : (width-1))*comp;
+                     float r = dataR[p], g = dataG[p], b = dataB[p];
+                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
+                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
+                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
+                  }
+               }
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+0,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+8,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+
+               // subsample U,V
+               {
+                  float subU[64], subV[64];
+                  int yy, xx;
+                  for(yy = 0, pos = 0; yy < 8; ++yy) {
+                     for(xx = 0; xx < 8; ++xx, ++pos) {
+                        int j = yy*32+xx*2; // top-left sample of the 2x2 group inside the 16-wide tile
+                        subU[pos] = (U[j+0] + U[j+1] + U[j+16] + U[j+17]) * 0.25f;
+                        subV[pos] = (V[j+0] + V[j+1] + V[j+16] + V[j+17]) * 0.25f;
+                     }
+                  }
+                  DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
+                  DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
+               }
+            }
+         }
+      } else { // 8x8 pixel tiles: one Y, one U and one V block per tile
+         for(y = 0; y < height; y += 8) {
+            for(x = 0; x < width; x += 8) {
+               float Y[64], U[64], V[64];
+               for(row = y, pos = 0; row < y+8; ++row) {
+                  // row >= height => use last input row
+                  int clamped_row = (row < height) ? row : height - 1;
+                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
+                  for(col = x; col < x+8; ++col, ++pos) {
+                     // if col >= width => use pixel from last input column
+                     int p = base_p + ((col < width) ? col : (width-1))*comp;
+                     float r = dataR[p], g = dataG[p], b = dataB[p];
+                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
+                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
+                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
+                  }
+               }
+
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y,  DCY, YDC_HT, YAC_HT);
+               DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
+               DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
+            }
+         }
+      }
+
+      // Do the bit alignment of the EOI marker
+      stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
+   }
+
+   // EOI
+   stbiw__putc(s, 0xFF);
+   stbiw__putc(s, 0xD9);
+
+   return 1;
+}
+
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
+{  // Encode a JPEG and hand the output to the caller's callback 'func'; returns whatever stbi_write_jpg_core returns.
+   stbi__write_context ctx = { 0 };
+   stbi__start_write_callbacks(&ctx, func, context);
+   return stbi_write_jpg_core(&ctx, x, y, comp, (void *) data, quality);
+}
+
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
+{  // Encode a JPEG into 'filename'; returns 0 when stbi__start_write_file reports failure, otherwise the encoder's result.
+   int result;
+   stbi__write_context ctx = { 0 };
+   if (!stbi__start_write_file(&ctx, filename))
+      return 0;   // the start call failed, so there is nothing to end
+   result = stbi_write_jpg_core(&ctx, x, y, comp, data, quality);
+   stbi__end_write_file(&ctx);   // always paired with a successful start
+   return result;
+}
+#endif
+
+#endif // STB_IMAGE_WRITE_IMPLEMENTATION
+
+/* Revision history
+      1.16  (2021-07-11)
+             make Deflate code emit uncompressed blocks when it would otherwise expand
+             support writing BMPs with alpha channel
+      1.15  (2020-07-13) unknown
+      1.14  (2020-02-02) updated JPEG writer to downsample chroma channels
+      1.13
+      1.12
+      1.11  (2019-08-11)
+
+      1.10  (2019-02-07)
+             support utf8 filenames in Windows; fix warnings and platform ifdefs
+      1.09  (2018-02-11)
+             fix typo in zlib quality API, improve STB_I_W_STATIC in C++
+      1.08  (2018-01-29)
+             add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
+      1.07  (2017-07-24)
+             doc fix
+      1.06 (2017-07-23)
+             writing JPEG (using Jon Olick's code)
+      1.05   ???
+      1.04 (2017-03-03)
+             monochrome BMP expansion
+      1.03   ???
+      1.02 (2016-04-02)
+             avoid allocating large structures on the stack
+      1.01 (2016-01-16)
+             STBIW_REALLOC_SIZED: support allocators with no realloc support
+             avoid race-condition in crc initialization
+             minor compile issues
+      1.00 (2015-09-14)
+             installable file IO function
+      0.99 (2015-09-13)
+             warning fixes; TGA rle support
+      0.98 (2015-04-08)
+             added STBIW_MALLOC, STBIW_ASSERT etc
+      0.97 (2015-01-18)
+             fixed HDR asserts, rewrote HDR rle logic
+      0.96 (2015-01-17)
+             add HDR output
+             fix monochrome BMP
+      0.95 (2014-08-17)
+             add monochrome TGA output
+      0.94 (2014-05-31)
+             rename private functions to avoid conflicts with stb_image.h
+      0.93 (2014-05-27)
+             warning fixes
+      0.92 (2010-08-01)
+             casts to unsigned char to fix warnings
+      0.91 (2010-07-17)
+             first public release
+      0.90   first internal release
+*/
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_include.h b/lib/stb/stb_include.h
new file mode 100644
index 0000000..c5db201
--- /dev/null
+++ b/lib/stb/stb_include.h
@@ -0,0 +1,295 @@
+// stb_include.h - v0.02 - parse and process #include directives - public domain
+//
+// To build this, in one source file that includes this file do
+//      #define STB_INCLUDE_IMPLEMENTATION
+//
+// This program parses a string and replaces lines of the form
+//         #include "foo"
+// with the contents of a file named "foo". It also embeds the
+// appropriate #line directives. Note that all include files must
+// reside in the location specified in the path passed to the API;
+// it does not check multiple directories.
+//
+// If the string contains a line of the form
+//         #inject
+// then it will be replaced with the contents of the string 'inject' passed to the API.
+//
+// Options:
+//
+//      Define STB_INCLUDE_LINE_GLSL to get GLSL-style #line directives
+//      which use numbers instead of filenames.
+//
+//      Define STB_INCLUDE_LINE_NONE to disable output of #line directives.
+//
+// Standard libraries:
+//
+//      stdio.h     FILE, fopen, fclose, fseek, ftell, fread
+//      stdlib.h    malloc, realloc, free
+//      string.h    strcpy, strcat, strlen, strncmp, memcpy
+//
+// Credits:
+//
+// Written by Sean Barrett.
+//
+// Fixes:
+//  Michal Klos
+
+#ifndef STB_INCLUDE_STB_INCLUDE_H
+#define STB_INCLUDE_STB_INCLUDE_H
+
+// Do include-processing on the string 'str'. To free the return value, pass it to free()
+char *stb_include_string(char *str, char *inject, char *path_to_includes, char *filename_for_line_directive, char error[256]);
+
+// Concatenate the strings 'strs' and do include-processing on the result. To free the return value, pass it to free()
+char *stb_include_strings(char **strs, int count, char *inject, char *path_to_includes, char *filename_for_line_directive, char error[256]);
+
+// Load the file 'filename' and do include-processing on the string therein. note that
+// 'filename' is opened directly; 'path_to_includes' is not used. To free the return value, pass it to free()
+char *stb_include_file(char *filename, char *inject, char *path_to_includes, char error[256]);
+
+#endif
+
+
+#ifdef STB_INCLUDE_IMPLEMENTATION
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static char *stb_include_load_file(char *filename, size_t *plen)
+{  // Read all of 'filename' into a malloc'd, NUL-terminated buffer; returns 0 on any failure. *plen (if non-NULL) receives the byte count on success.
+   char *text;
+   long size;
+   size_t len;
+   FILE *f = fopen(filename, "rb");
+   if (f == 0) return 0;
+   size = (fseek(f, 0, SEEK_END) == 0) ? ftell(f) : -1L;
+   if (size < 0 || fseek(f, 0, SEEK_SET) != 0) { fclose(f); return 0; } // can't size or rewind the stream: fail rather than size the buffer from -1
+   text = (char *) malloc((size_t) size + 1);
+   if (text == 0) { fclose(f); return 0; } // don't leak the open stream when allocation fails
+   len = fread(text, 1, (size_t) size, f); // a short read shrinks len, so the terminator lands right after the bytes actually read
+   fclose(f);
+   if (plen) *plen = len;
+   text[len] = 0;
+   return text;
+}
+
+typedef struct
+{  // One #include or #inject directive located by stb_include_find_includes.
+   int offset;            // byte offset in the source text where the directive's line starts
+   int end;               // byte offset where the directive's line stops (its line terminator, or the end of the text)
+   char *filename;        // malloc'd copy of the name between the quotes of #include; NULL for an #inject entry
+   int next_line_after;   // 1-based line number of the line that follows the directive
+} include_info;
+
+static include_info *stb_include_append_include(include_info *array, int len, int offset, int end, char *filename, int next_line)
+{  // Grow 'array' (currently 'len' entries) by one record and return the possibly-moved array; the realloc result is not checked.
+   include_info *grown = (include_info *) realloc(array, (len+1) * sizeof(include_info));
+   include_info *slot = &grown[len];
+   slot->next_line_after = next_line;
+   slot->filename = filename;   // pointer stored as-is, not copied
+   slot->offset = offset; slot->end = end;
+   return grown;
+}
+
+static void stb_include_free_includes(include_info *array, int len)
+{  // Free the filename string of each of the 'len' entries, then the array itself.
+   include_info *entry = array;
+   while (len-- > 0)
+      free((entry++)->filename);
+   free(array);
+}
+
+static int stb_include_isspace(int ch)
+{  // 1 for space, tab, CR or LF; 0 for every other value
+   return !(ch != '\n' && ch != '\r' && ch != '\t' && ch != ' ');
+}
+
+// Scan 'text' line by line and record every #include "file" and #inject directive. Stores the malloc'd entry array in *plist (NULL when nothing was found) and returns the number of entries.
+static int stb_include_find_includes(char *text, include_info **plist)
+{
+   int line_count = 1; // 1-based number of the line currently being scanned
+   int inc_count = 0;
+   char *s = text, *start;
+   include_info *list = NULL;
+   while (*s) {
+      // parse is always at start of line when we reach here
+      start = s;
+      while (*s == ' ' || *s == '\t') // leading blanks before '#' are allowed
+         ++s;
+      if (*s == '#') {
+         ++s;
+         while (*s == ' ' || *s == '\t') // so are blanks between '#' and the keyword
+            ++s;
+         if (0==strncmp(s, "include", 7) && stb_include_isspace(s[7])) {
+            s += 7;
+            while (*s == ' ' || *s == '\t')
+               ++s;
+            if (*s == '"') { // only the quoted form is recorded; any other #include line is left in the text untouched
+               char *t = ++s;
+               while (*t != '"' && *t != '\n' && *t != '\r' && *t != 0)
+                  ++t;
+               if (*t == '"') { // closing quote found on the same line
+                  char *filename = (char *) malloc(t-s+1); // copy of the text between the quotes; the list entry keeps this pointer
+                  memcpy(filename, s, t-s);
+                  filename[t-s] = 0;
+                  s=t;
+                  while (*s != '\r' && *s != '\n' && *s != 0)
+                     ++s;
+                  // s points to the newline, so s-start is everything except the newline
+                  list = stb_include_append_include(list, inc_count++, start-text, s-text, filename, line_count+1);
+               }
+            }
+         } else if (0==strncmp(s, "inject", 6) && (stb_include_isspace(s[6]) || s[6]==0)) {
+            while (*s != '\r' && *s != '\n' && *s != 0)
+               ++s;
+            list = stb_include_append_include(list, inc_count++, start-text, s-text, NULL, line_count+1); // a NULL filename marks an #inject entry
+         }
+      }
+      while (*s != '\r' && *s != '\n' && *s != 0) // skip whatever is left of the current line
+         ++s;
+      if (*s == '\r' || *s == '\n') {
+         s = s + (s[0] + s[1] == '\r' + '\n' ? 2 : 1); // a CR LF (or LF CR) pair is consumed as a single line break
+      }
+      ++line_count;
+   }
+   *plist = list;
+   return inc_count;
+}
+
+// Hand-rolled int-to-text conversion, so that sprintf() isn't needed.
+static void stb_include_itoa(char str[9], int n)
+{  // Fills str with 8 blanks plus a terminator, then writes n's decimal digits right-aligned so that the last digit lands in str[6].
+   int pos;
+   str[8] = 0;
+   for (pos = 7; pos >= 0; --pos)
+      str[pos] = ' ';   // blank-fill all 8 character cells
+
+   // at most 7 digits fit (str[6] down to str[0]); higher-order digits are dropped
+   pos = 6;
+   do {
+      str[pos--] = (char) ('0' + (n % 10));
+      n /= 10;
+   } while (n != 0 && pos >= 0);
+}
+
+static char *stb_include_append(char *str, size_t *curlen, char *addstr, size_t addlen)
+{  // Append addlen bytes of addstr to the heap buffer str (which holds *curlen bytes); returns the possibly-moved buffer and adds addlen to *curlen.
+   char *grown = (char *) realloc(str, *curlen + addlen);
+   memcpy(&grown[*curlen], addstr, addlen);
+   *curlen = *curlen + addlen;
+   return grown;
+}
+
+char *stb_include_string(char *str, char *inject, char *path_to_includes, char *filename, char error[256])
+{  // Returns a malloc'd copy of 'str' in which each #include "file" line is replaced by that file's processed contents and each #inject line by 'inject'; returns NULL if an included file fails to load.
+   char temp[4096];   // scratch for #line directives and include paths; NOTE(review): filled with unbounded strcpy/strcat, so very long paths can overrun it
+   include_info *inc_list;
+   int i, num = stb_include_find_includes(str, &inc_list);
+   size_t source_len = strlen(str), textlen = 0, last = 0;   // 'last' = offset in str of the first byte not yet copied to the output
+   char *text = 0;   // output buffer, grown by stb_include_append
+   for (i=0; i < num; ++i) {
+      text = stb_include_append(text, &textlen, str+last, inc_list[i].offset - last);   // copy the text that precedes this directive
+      // write out line directive for the include
+      #ifndef STB_INCLUDE_LINE_NONE
+      #ifdef STB_INCLUDE_LINE_GLSL
+      if (textlen != 0)  // GLSL #version must appear first, so don't put a #line at the top
+      #endif
+      {
+         strcpy(temp, "#line ");
+         stb_include_itoa(temp+6, 1);
+         strcat(temp, " ");
+         #ifdef STB_INCLUDE_LINE_GLSL
+         stb_include_itoa(temp+15, i+1);
+         #else
+         strcat(temp, "\"");
+         if (inc_list[i].filename == 0)
+            strcat(temp, "INJECT");   // a NULL filename marks an #inject entry; label it "INJECT" in the directive
+         else
+            strcat(temp, inc_list[i].filename);
+         strcat(temp, "\"");
+         #endif
+         strcat(temp, "\n");
+         text = stb_include_append(text, &textlen, temp, strlen(temp));
+      }
+      #endif
+      if (inc_list[i].filename == 0) {
+         if (inject != 0)
+            text = stb_include_append(text, &textlen, inject, strlen(inject));
+      } else {
+         char *inc;
+         strcpy(temp, path_to_includes);
+         strcat(temp, "/");
+         strcat(temp, inc_list[i].filename);
+         inc = stb_include_file(temp, inject, path_to_includes, error);   // recursive: directives inside the included file are expanded too
+         if (inc == NULL) {
+            free(text);   // release the partial output as well as the include list
+            stb_include_free_includes(inc_list, num);
+            return NULL;
+         }
+         text = stb_include_append(text, &textlen, inc, strlen(inc));
+         free(inc);
+      }
+      // write out line directive
+      #ifndef STB_INCLUDE_LINE_NONE
+      strcpy(temp, "\n#line ");
+      stb_include_itoa(temp+6, inc_list[i].next_line_after);
+      strcat(temp, " ");
+      #ifdef STB_INCLUDE_LINE_GLSL
+      stb_include_itoa(temp+15, 0);
+      #else
+      strcat(temp, filename != 0 ? filename : "source-file");
+      #endif
+      text = stb_include_append(text, &textlen, temp, strlen(temp));
+      // no newlines, because we kept the #include newlines, which will get appended next
+      #endif
+      last = inc_list[i].end;
+   }
+   text = stb_include_append(text, &textlen, str+last, source_len - last + 1); // append '\0'
+   stb_include_free_includes(inc_list, num);
+   return text;
+}
+
+char *stb_include_strings(char **strs, int count, char *inject, char *path_to_includes, char *filename, char error[256])
+{  // Concatenate strs[0..count-1], run stb_include_string on the joined text and return its malloc'd result (0 on failure, with a message in 'error' when memory runs out here).
+   char *text, *result;
+   int i;
+   size_t length=0;
+   for (i=0; i < count; ++i)
+      length += strlen(strs[i]);
+   text = (char *) malloc(length+1);
+   if (text == 0) { strcpy(error, "Error: out of memory"); return 0; }   // fail cleanly rather than strcpy() through a null pointer
+   text[0] = 0;   // keeps the buffer a valid empty string when count <= 0
+   for (i=0, length=0; i < count; ++i) {
+      strcpy(text + length, strs[i]);
+      length += strlen(strs[i]);
+   }
+   result = stb_include_string(text, inject, path_to_includes, filename, error);
+   free(text);
+   return result;
+}
+
+char *stb_include_file(char *filename, char *inject, char *path_to_includes, char error[256])
+{  // Load 'filename' and run include-processing on its contents. Returns a malloc'd string, or 0 with a message in 'error' when the file can't be loaded.
+   size_t len;
+   char *result;
+   char *text = stb_include_load_file(filename, &len);
+   if (text == NULL) {
+      strcpy(error, "Error: couldn't load '");
+      strncat(error, filename, 256 - strlen(error) - 2);   // long paths are truncated so the closing quote and the NUL still fit in error[256]
+      strcat(error, "'");
+      return 0;
+   }
+   result = stb_include_string(text, inject, path_to_includes, filename, error);
+   free(text);
+   return result;
+}
+
+#if 0 // @TODO, GL_ARB_shader_language_include-style system that doesn't touch filesystem
+char *stb_include_preloaded(char *str, char *inject, char *includes[][2], char error[256])
+{
+   // not implemented; the surrounding #if 0 keeps this placeholder out of the build
+}
+#endif
+
+#endif // STB_INCLUDE_IMPLEMENTATION
diff --git a/lib/stb/stb_leakcheck.h b/lib/stb/stb_leakcheck.h
new file mode 100644
index 0000000..19ee6e7
--- /dev/null
+++ b/lib/stb/stb_leakcheck.h
@@ -0,0 +1,194 @@
+// stb_leakcheck.h - v0.6 - quick & dirty malloc leak-checking - public domain
+// LICENSE
+//
+//   See end of file.
+
+#ifdef STB_LEAKCHECK_IMPLEMENTATION
+#undef STB_LEAKCHECK_IMPLEMENTATION // don't implement more than once
+
+// if we've already included leakcheck before, undefine the macros
+#ifdef malloc
+#undef malloc
+#undef free
+#undef realloc
+#endif
+
+#ifndef STB_LEAKCHECK_OUTPUT_PIPE
+#define STB_LEAKCHECK_OUTPUT_PIPE stdout
+#endif
+
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stddef.h>
+typedef struct malloc_info stb_leakcheck_malloc_info;
+
+struct malloc_info // tracking header placed directly in front of every block returned by stb_leakcheck_malloc
+{
+   const char *file; // 'file' argument of the allocating call
+   int line; // 'line' argument of the allocating call
+   size_t size; // size recorded by stb_leakcheck_malloc; stb_leakcheck_free bitwise-complements it to mark the block freed
+   stb_leakcheck_malloc_info *next,*prev; // links in the doubly linked list of tracked blocks
+};
+
+static stb_leakcheck_malloc_info *mi_head; // head of the tracking list (new blocks are pushed here); NULL while the list is empty
+
+void *stb_leakcheck_malloc(size_t sz, const char *file, int line)
+{  // Allocates sz bytes behind a tracking header, pushes the header on the list, returns the user pointer (NULL on failure).
+   stb_leakcheck_malloc_info *mi;
+   if (sz > (size_t) -1 - sizeof(*mi)) return NULL; // header + payload would wrap around size_t
+   mi = (stb_leakcheck_malloc_info *) malloc(sz + sizeof(*mi));
+   if (mi == NULL) return mi;
+   mi->file = file; mi->line = line; // call site, reported later by stb_leakcheck_dumpmem
+   mi->size = sz; // store the full size_t: an (int) cast corrupts sizes above INT_MAX
+   mi->prev = NULL;
+   mi->next = mi_head;
+   if (mi_head) mi_head->prev = mi;
+   mi_head = mi;
+   return mi+1; // user memory starts right after the header
+}
+
+void stb_leakcheck_free(void *ptr)
+{  // Marks a tracked block as freed; unless STB_LEAKCHECK_SHOWALL is defined it is also unlinked and released.
+   stb_leakcheck_malloc_info *hdr;
+   if (ptr == NULL)
+      return; // freeing NULL is a no-op
+   // the tracking header sits immediately before the user pointer
+   hdr = (stb_leakcheck_malloc_info *) ptr - 1;
+   hdr->size = ~hdr->size; // a complemented size marks the block as freed
+   #ifndef STB_LEAKCHECK_SHOWALL
+   if (hdr->next != NULL)
+      hdr->next->prev = hdr->prev;
+   if (hdr->prev != NULL)
+      hdr->prev->next = hdr->next;
+   else { assert(mi_head == hdr); mi_head = hdr->next; }
+   free(hdr);
+   #endif
+}
+
+void *stb_leakcheck_realloc(void *ptr, size_t sz, const char *file, int line)
+{
+   // Tracked realloc: grows by allocating a new tracked block, copying the old contents and freeing the old block.
+   stb_leakcheck_malloc_info *hdr;
+   void *fresh;
+   if (ptr == NULL) // nothing to resize: plain allocation
+      return stb_leakcheck_malloc(sz, file, line);
+   if (sz == 0) { // resizing to nothing: plain free
+      stb_leakcheck_free(ptr);
+      return NULL;
+   }
+   hdr = (stb_leakcheck_malloc_info *) ptr - 1;
+   if (sz <= hdr->size)
+      return ptr; // the existing block is already large enough
+   #ifdef STB_LEAKCHECK_REALLOC_PRESERVE_MALLOC_FILELINE
+   fresh = stb_leakcheck_malloc(sz, hdr->file, hdr->line);
+   #else
+   fresh = stb_leakcheck_malloc(sz, file, line);
+   #endif
+   if (fresh == NULL)
+      return NULL; // the old block is left untouched on failure
+   memcpy(fresh, ptr, hdr->size);
+   stb_leakcheck_free(ptr);
+   return fresh;
+}
+
+static void stblkck_internal_print(const char *reason, stb_leakcheck_malloc_info *mi)
+{ // writes one report line for 'mi' to STB_LEAKCHECK_OUTPUT_PIPE: reason, file, line, recorded size, user pointer (mi+1)
+#if defined(_MSC_VER) && _MSC_VER < 1900 // 1900=VS 2015
+   // Compilers that use the old MS C runtime library don't have %zd
+   // and the older ones don't even have %lld either... however, the old compilers
+   // without "long long" don't support 64-bit targets either, so here's the
+   // compromise:
+   #if _MSC_VER < 1400 // before VS 2005
+      fprintf(STB_LEAKCHECK_OUTPUT_PIPE, "%s: %s (%4d): %8d bytes at %p\n", reason, mi->file, mi->line, (int)mi->size, (void*)(mi+1));
+   #else
+      fprintf(STB_LEAKCHECK_OUTPUT_PIPE, "%s: %s (%4d): %16lld bytes at %p\n", reason, mi->file, mi->line, (long long)mi->size, (void*)(mi+1));
+   #endif
+#else
+   // Assume we have %zd on other targets.
+   #ifdef __MINGW32__
+      __mingw_fprintf(STB_LEAKCHECK_OUTPUT_PIPE, "%s: %s (%4d): %zd bytes at %p\n", reason, mi->file, mi->line, mi->size, (void*)(mi+1));
+   #else
+      fprintf(STB_LEAKCHECK_OUTPUT_PIPE, "%s: %s (%4d): %zd bytes at %p\n", reason, mi->file, mi->line, mi->size, (void*)(mi+1)); // NOTE(review): mi->size is a size_t passed to a signed %zd conversion
+   #endif
+#endif
+}
+
+void stb_leakcheck_dumpmem(void)
+{
+   stb_leakcheck_malloc_info *cur;
+   // first pass: report blocks whose size, viewed as signed, is non-negative (never passed to free)
+   for (cur = mi_head; cur != NULL; cur = cur->next) {
+      if ((ptrdiff_t) cur->size < 0)
+         continue;
+      stblkck_internal_print("LEAKED", cur);
+   }
+   #ifdef STB_LEAKCHECK_SHOWALL
+   // second pass: freed blocks stay on the list in this mode, recognisable by their complemented size
+   for (cur = mi_head; cur != NULL; cur = cur->next) {
+      if ((ptrdiff_t) cur->size < 0)
+         stblkck_internal_print("FREED ", cur);
+   }
+   #endif
+}
+#endif // STB_LEAKCHECK_IMPLEMENTATION
+
+#if !defined(INCLUDE_STB_LEAKCHECK_H) || !defined(malloc)
+#define INCLUDE_STB_LEAKCHECK_H
+
+#include <stdlib.h> // we want to define the macros *after* stdlib to avoid a slew of errors
+
+#define malloc(sz)    stb_leakcheck_malloc(sz, __FILE__, __LINE__) // each wrapper passes the call site's file and line
+#define free(p)       stb_leakcheck_free(p)
+#define realloc(p,sz) stb_leakcheck_realloc(p,sz, __FILE__, __LINE__)
+
+extern void * stb_leakcheck_malloc(size_t sz, const char *file, int line);
+extern void * stb_leakcheck_realloc(void *ptr, size_t sz, const char *file, int line);
+extern void   stb_leakcheck_free(void *ptr);
+extern void   stb_leakcheck_dumpmem(void); // prints a "LEAKED" line for every tracked block that was never freed
+
+#endif // INCLUDE_STB_LEAKCHECK_H
+
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_perlin.h b/lib/stb/stb_perlin.h
new file mode 100644
index 0000000..47cb9a4
--- /dev/null
+++ b/lib/stb/stb_perlin.h
@@ -0,0 +1,428 @@
+// stb_perlin.h - v0.5 - perlin noise
+// public domain single-file C implementation by Sean Barrett
+//
+// LICENSE
+//
+//   See end of file.
+//
+//
+// to create the implementation,
+//     #define STB_PERLIN_IMPLEMENTATION
+// in *one* C/CPP file that includes this file.
+//
+//
+// Documentation:
+//
+// float  stb_perlin_noise3( float x,
+//                           float y,
+//                           float z,
+//                           int   x_wrap=0,
+//                           int   y_wrap=0,
+//                           int   z_wrap=0)
+//
+// This function computes a random value at the coordinate (x,y,z).
+// Adjacent random values are continuous but the noise fluctuates
+// its randomness with period 1, i.e. takes on wholly unrelated values
+// at integer points. Specifically, this implements Ken Perlin's
+// revised noise function from 2002.
+//
+// The "wrap" parameters can be used to create wraparound noise that
+// wraps at powers of two. The numbers MUST be powers of two. Specify
+// 0 to mean "don't care". (The noise always wraps every 256 due to
+// details of the implementation, even if you ask for larger or no
+// wrapping.)
+//
+// float  stb_perlin_noise3_seed( float x,
+//                                float y,
+//                                float z,
+//                                int   x_wrap=0,
+//                                int   y_wrap=0,
+//                                int   z_wrap=0,
+//                                int   seed)
+//
+// As above, but 'seed' selects from multiple different variations of the
+// noise function. The current implementation only uses the bottom 8 bits
+// of 'seed', but possibly in the future more bits will be used.
+//
+//
+// Fractal Noise:
+//
+// Three common fractal noise functions are included, which produce
+// a wide variety of nice effects depending on the parameters
+// provided. Note that each function will call stb_perlin_noise3
+// 'octaves' times, so this parameter will affect runtime.
+//
+// float stb_perlin_ridge_noise3(float x, float y, float z,
+//                               float lacunarity, float gain, float offset, int octaves)
+//
+// float stb_perlin_fbm_noise3(float x, float y, float z,
+//                             float lacunarity, float gain, int octaves)
+//
+// float stb_perlin_turbulence_noise3(float x, float y, float z,
+//                                    float lacunarity, float gain, int octaves)
+//
+// Typical values to start playing with:
+//     octaves    =   6     -- number of "octaves" of noise3() to sum
+//     lacunarity = ~ 2.0   -- spacing between successive octaves (use exactly 2.0 for wrapping output)
+//     gain       =   0.5   -- relative weighting applied to each successive octave
+//     offset     =   1.0?  -- used to invert the ridges, may need to be larger, not sure
+//
+//
+// Contributors:
+//    Jack Mott - additional noise functions
+//    Jordan Peck - seeded noise
+//
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern float stb_perlin_noise3(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap);
+extern float stb_perlin_noise3_seed(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, int seed);
+extern float stb_perlin_ridge_noise3(float x, float y, float z, float lacunarity, float gain, float offset, int octaves);
+extern float stb_perlin_fbm_noise3(float x, float y, float z, float lacunarity, float gain, int octaves);
+extern float stb_perlin_turbulence_noise3(float x, float y, float z, float lacunarity, float gain, int octaves);
+extern float stb_perlin_noise3_wrap_nonpow2(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, unsigned char seed); // wraps with '%' instead of a bit mask, so the wrap sizes need not be powers of two
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef STB_PERLIN_IMPLEMENTATION
+
+#include <math.h> // fabs()
+
+// not same permutation table as Perlin's reference to avoid copyright issues;
+// Perlin's table can be found at http://mrl.nyu.edu/~perlin/noise/
+static unsigned char stb__perlin_randtab[512] = // hash table: 256 byte values followed by an identical second copy
+{
+   23, 125, 161, 52, 103, 117, 70, 37, 247, 101, 203, 169, 124, 126, 44, 123,
+   152, 238, 145, 45, 171, 114, 253, 10, 192, 136, 4, 157, 249, 30, 35, 72,
+   175, 63, 77, 90, 181, 16, 96, 111, 133, 104, 75, 162, 93, 56, 66, 240,
+   8, 50, 84, 229, 49, 210, 173, 239, 141, 1, 87, 18, 2, 198, 143, 57,
+   225, 160, 58, 217, 168, 206, 245, 204, 199, 6, 73, 60, 20, 230, 211, 233,
+   94, 200, 88, 9, 74, 155, 33, 15, 219, 130, 226, 202, 83, 236, 42, 172,
+   165, 218, 55, 222, 46, 107, 98, 154, 109, 67, 196, 178, 127, 158, 13, 243,
+   65, 79, 166, 248, 25, 224, 115, 80, 68, 51, 184, 128, 232, 208, 151, 122,
+   26, 212, 105, 43, 179, 213, 235, 148, 146, 89, 14, 195, 28, 78, 112, 76,
+   250, 47, 24, 251, 140, 108, 186, 190, 228, 170, 183, 139, 39, 188, 244, 246,
+   132, 48, 119, 144, 180, 138, 134, 193, 82, 182, 120, 121, 86, 220, 209, 3,
+   91, 241, 149, 85, 205, 150, 113, 216, 31, 100, 41, 164, 177, 214, 153, 231,
+   38, 71, 185, 174, 97, 201, 29, 95, 7, 92, 54, 254, 191, 118, 34, 221,
+   131, 11, 163, 99, 234, 81, 227, 147, 156, 176, 17, 142, 69, 12, 110, 62,
+   27, 255, 0, 194, 59, 116, 242, 252, 19, 21, 187, 53, 207, 129, 64, 135,
+   61, 40, 167, 237, 102, 223, 106, 159, 197, 189, 215, 137, 36, 32, 22, 5,
+
+   // and a second copy so we don't need an extra mask or static initializer
+   23, 125, 161, 52, 103, 117, 70, 37, 247, 101, 203, 169, 124, 126, 44, 123,
+   152, 238, 145, 45, 171, 114, 253, 10, 192, 136, 4, 157, 249, 30, 35, 72,
+   175, 63, 77, 90, 181, 16, 96, 111, 133, 104, 75, 162, 93, 56, 66, 240,
+   8, 50, 84, 229, 49, 210, 173, 239, 141, 1, 87, 18, 2, 198, 143, 57,
+   225, 160, 58, 217, 168, 206, 245, 204, 199, 6, 73, 60, 20, 230, 211, 233,
+   94, 200, 88, 9, 74, 155, 33, 15, 219, 130, 226, 202, 83, 236, 42, 172,
+   165, 218, 55, 222, 46, 107, 98, 154, 109, 67, 196, 178, 127, 158, 13, 243,
+   65, 79, 166, 248, 25, 224, 115, 80, 68, 51, 184, 128, 232, 208, 151, 122,
+   26, 212, 105, 43, 179, 213, 235, 148, 146, 89, 14, 195, 28, 78, 112, 76,
+   250, 47, 24, 251, 140, 108, 186, 190, 228, 170, 183, 139, 39, 188, 244, 246,
+   132, 48, 119, 144, 180, 138, 134, 193, 82, 182, 120, 121, 86, 220, 209, 3,
+   91, 241, 149, 85, 205, 150, 113, 216, 31, 100, 41, 164, 177, 214, 153, 231,
+   38, 71, 185, 174, 97, 201, 29, 95, 7, 92, 54, 254, 191, 118, 34, 221,
+   131, 11, 163, 99, 234, 81, 227, 147, 156, 176, 17, 142, 69, 12, 110, 62,
+   27, 255, 0, 194, 59, 116, 242, 252, 19, 21, 187, 53, 207, 129, 64, 135,
+   61, 40, 167, 237, 102, 223, 106, 159, 197, 189, 215, 137, 36, 32, 22, 5,
+};
+
+
+// perlin's gradient has 12 cases so some get used 1/16th of the time
+// and some 2/16ths. We reduce bias by changing those fractions
+// to 5/64ths and 6/64ths
+
+// this array is designed to match the previous implementation
+// of gradient hash: indices[stb__perlin_randtab[i]&63]
+static unsigned char stb__perlin_randtab_grad_idx[512] = // gradient index (0..11) per hash value; 256 entries stored twice
+{
+    7, 9, 5, 0, 11, 1, 6, 9, 3, 9, 11, 1, 8, 10, 4, 7,
+    8, 6, 1, 5, 3, 10, 9, 10, 0, 8, 4, 1, 5, 2, 7, 8,
+    7, 11, 9, 10, 1, 0, 4, 7, 5, 0, 11, 6, 1, 4, 2, 8,
+    8, 10, 4, 9, 9, 2, 5, 7, 9, 1, 7, 2, 2, 6, 11, 5,
+    5, 4, 6, 9, 0, 1, 1, 0, 7, 6, 9, 8, 4, 10, 3, 1,
+    2, 8, 8, 9, 10, 11, 5, 11, 11, 2, 6, 10, 3, 4, 2, 4,
+    9, 10, 3, 2, 6, 3, 6, 10, 5, 3, 4, 10, 11, 2, 9, 11,
+    1, 11, 10, 4, 9, 4, 11, 0, 4, 11, 4, 0, 0, 0, 7, 6,
+    10, 4, 1, 3, 11, 5, 3, 4, 2, 9, 1, 3, 0, 1, 8, 0,
+    6, 7, 8, 7, 0, 4, 6, 10, 8, 2, 3, 11, 11, 8, 0, 2,
+    4, 8, 3, 0, 0, 10, 6, 1, 2, 2, 4, 5, 6, 0, 1, 3,
+    11, 9, 5, 5, 9, 6, 9, 8, 3, 8, 1, 8, 9, 6, 9, 11,
+    10, 7, 5, 6, 5, 9, 1, 3, 7, 0, 2, 10, 11, 2, 6, 1,
+    3, 11, 7, 7, 2, 1, 7, 3, 0, 8, 1, 1, 5, 0, 6, 10,
+    11, 11, 0, 2, 7, 0, 10, 8, 3, 5, 7, 1, 11, 1, 0, 7,
+    9, 0, 11, 5, 10, 3, 2, 3, 5, 9, 7, 9, 8, 4, 6, 5,
+
+    // and a second copy so we don't need an extra mask or static initializer
+    7, 9, 5, 0, 11, 1, 6, 9, 3, 9, 11, 1, 8, 10, 4, 7,
+    8, 6, 1, 5, 3, 10, 9, 10, 0, 8, 4, 1, 5, 2, 7, 8,
+    7, 11, 9, 10, 1, 0, 4, 7, 5, 0, 11, 6, 1, 4, 2, 8,
+    8, 10, 4, 9, 9, 2, 5, 7, 9, 1, 7, 2, 2, 6, 11, 5,
+    5, 4, 6, 9, 0, 1, 1, 0, 7, 6, 9, 8, 4, 10, 3, 1,
+    2, 8, 8, 9, 10, 11, 5, 11, 11, 2, 6, 10, 3, 4, 2, 4,
+    9, 10, 3, 2, 6, 3, 6, 10, 5, 3, 4, 10, 11, 2, 9, 11,
+    1, 11, 10, 4, 9, 4, 11, 0, 4, 11, 4, 0, 0, 0, 7, 6,
+    10, 4, 1, 3, 11, 5, 3, 4, 2, 9, 1, 3, 0, 1, 8, 0,
+    6, 7, 8, 7, 0, 4, 6, 10, 8, 2, 3, 11, 11, 8, 0, 2,
+    4, 8, 3, 0, 0, 10, 6, 1, 2, 2, 4, 5, 6, 0, 1, 3,
+    11, 9, 5, 5, 9, 6, 9, 8, 3, 8, 1, 8, 9, 6, 9, 11,
+    10, 7, 5, 6, 5, 9, 1, 3, 7, 0, 2, 10, 11, 2, 6, 1,
+    3, 11, 7, 7, 2, 1, 7, 3, 0, 8, 1, 1, 5, 0, 6, 10,
+    11, 11, 0, 2, 7, 0, 10, 8, 3, 5, 7, 1, 11, 1, 0, 7,
+    9, 0, 11, 5, 10, 3, 2, 3, 5, 9, 7, 9, 8, 4, 6, 5,
+};
+
+static float stb__perlin_lerp(float a, float b, float t)
+{  // linear interpolation: starts at a for t == 0 and moves toward b as t approaches 1
+   return (b-a) * t + a;
+}
+
+static int stb__perlin_fastfloor(float a)
+{
+    int whole = (int) a; // the cast truncates toward zero
+    return whole - (a < whole); // step down by one when truncation rounded up (negative non-integers)
+}
+
+// different grad function from Perlin's, but easy to modify to match reference
+static float stb__perlin_grad(int grad_idx, float x, float y, float z)
+{
+   // dot product of (x,y,z) with one of 12 gradient directions; the 4th slot of each row is unused and zero-filled
+   static float basis[12][4] =
+   {
+      {  1, 1, 0 },
+      { -1, 1, 0 },
+      {  1,-1, 0 },
+      { -1,-1, 0 },
+      {  1, 0, 1 },
+      { -1, 0, 1 },
+      {  1, 0,-1 },
+      { -1, 0,-1 },
+      {  0, 1, 1 },
+      {  0,-1, 1 },
+      {  0, 1,-1 },
+      {  0,-1,-1 },
+   };
+   const float *dir = basis[grad_idx];
+   return dir[0]*x + dir[1]*y + dir[2]*z;
+}
+
+float stb_perlin_noise3_internal(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, unsigned char seed)
+{ // gradient noise at (x,y,z): hashes the 8 corners of the enclosing lattice cell and blends their gradient contributions
+   float u,v,w;
+   float n000,n001,n010,n011,n100,n101,n110,n111;
+   float n00,n01,n10,n11;
+   float n0,n1;
+
+   unsigned int x_mask = (x_wrap-1) & 255; // wrap-1 reduced to 8 bits; a wrap of 0 gives 255, i.e. a period of 256
+   unsigned int y_mask = (y_wrap-1) & 255;
+   unsigned int z_mask = (z_wrap-1) & 255;
+   int px = stb__perlin_fastfloor(x); // integer coordinates of the lattice cell containing the point
+   int py = stb__perlin_fastfloor(y);
+   int pz = stb__perlin_fastfloor(z);
+   int x0 = px & x_mask, x1 = (px+1) & x_mask; // the cell's two lattice coordinates per axis, after wrapping
+   int y0 = py & y_mask, y1 = (py+1) & y_mask;
+   int z0 = pz & z_mask, z1 = (pz+1) & z_mask;
+   int r0,r1, r00,r01,r10,r11;
+
+   #define stb__perlin_ease(a)   (((a*6-15)*a + 10) * a * a * a) // 6a^5 - 15a^4 + 10a^3
+
+   x -= px; u = stb__perlin_ease(x); // x,y,z become the position inside the cell; u,v,w are the eased blend weights
+   y -= py; v = stb__perlin_ease(y);
+   z -= pz; w = stb__perlin_ease(z);
+
+   r0 = stb__perlin_randtab[x0+seed]; // hash the x coordinate, offset by the seed
+   r1 = stb__perlin_randtab[x1+seed];
+
+   r00 = stb__perlin_randtab[r0+y0]; // fold the y coordinate into each x hash
+   r01 = stb__perlin_randtab[r0+y1];
+   r10 = stb__perlin_randtab[r1+y0];
+   r11 = stb__perlin_randtab[r1+y1];
+
+   n000 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r00+z0], x  , y  , z   ); // one gradient contribution per cell corner
+   n001 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r00+z1], x  , y  , z-1 );
+   n010 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r01+z0], x  , y-1, z   );
+   n011 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r01+z1], x  , y-1, z-1 );
+   n100 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r10+z0], x-1, y  , z   );
+   n101 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r10+z1], x-1, y  , z-1 );
+   n110 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r11+z0], x-1, y-1, z   );
+   n111 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r11+z1], x-1, y-1, z-1 );
+
+   n00 = stb__perlin_lerp(n000,n001,w); // blend along z first ...
+   n01 = stb__perlin_lerp(n010,n011,w);
+   n10 = stb__perlin_lerp(n100,n101,w);
+   n11 = stb__perlin_lerp(n110,n111,w);
+
+   n0 = stb__perlin_lerp(n00,n01,v); // ... then along y ...
+   n1 = stb__perlin_lerp(n10,n11,v);
+
+   return stb__perlin_lerp(n0,n1,u); // ... and finally along x
+}
+
+float stb_perlin_noise3(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap)
+{ // unseeded entry point: forwards to stb_perlin_noise3_internal with seed 0
+    return stb_perlin_noise3_internal(x,y,z,x_wrap,y_wrap,z_wrap,0);
+}
+
+float stb_perlin_noise3_seed(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, int seed)
+{ // seeded entry point: the cast to unsigned char keeps only the low 8 bits of 'seed'
+    return stb_perlin_noise3_internal(x,y,z,x_wrap,y_wrap,z_wrap, (unsigned char) seed);
+}
+
+float stb_perlin_ridge_noise3(float x, float y, float z, float lacunarity, float gain, float offset, int octaves)
+{
+   // ridged noise: every octave is squared (offset - |noise|), weighted by its amplitude and by the previous octave's value
+   float freq = 1.0f, weight = 0.5f;
+   float last = 1.0f;
+   float total = 0.0f;
+   int oct;
+
+   for (oct = 0; oct < octaves; oct++) {
+      float ridge = stb_perlin_noise3_internal(x*freq,y*freq,z*freq,0,0,0,(unsigned char)oct);
+      ridge = offset - (float) fabs(ridge);
+      ridge = ridge*ridge;
+      total += ridge*weight*last;
+      last = ridge;
+      freq *= lacunarity;
+      weight *= gain;
+   }
+   return total;
+}
+
+float stb_perlin_fbm_noise3(float x, float y, float z, float lacunarity, float gain, int octaves)
+{
+   // sums 'octaves' noise layers; each layer multiplies the frequency by lacunarity and the amplitude by gain
+   float freq = 1.0f, amp = 1.0f, total = 0.0f;
+   int oct;
+
+   for (oct = 0; oct < octaves; oct++) {
+      float layer = stb_perlin_noise3_internal(x*freq,y*freq,z*freq,0,0,0,(unsigned char)oct);
+      total += layer*amp;
+      freq *= lacunarity;
+      amp *= gain;
+   }
+   return total;
+}
+
+float stb_perlin_turbulence_noise3(float x, float y, float z, float lacunarity, float gain, int octaves)
+{
+   // same layering as the fbm variant, but accumulates the absolute value of every amplitude-scaled layer
+   float freq = 1.0f, amp = 1.0f;
+   float total = 0.0f;
+   int oct;
+
+   for (oct = 0; oct < octaves; oct++) {
+      float layer = stb_perlin_noise3_internal(x*freq,y*freq,z*freq,0,0,0,(unsigned char)oct)*amp;
+      total += (float) fabs(layer);
+      freq *= lacunarity;
+      amp *= gain;
+   }
+   return total;
+}
+
+float stb_perlin_noise3_wrap_nonpow2(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, unsigned char seed)
+{ // gradient noise whose period per axis is an arbitrary value in 1..256; any other wrap value (including 0) means 256
+   float u,v,w;
+   float n000,n001,n010,n011,n100,n101,n110,n111;
+   float n00,n01,n10,n11;
+   float n0,n1;
+
+   int px = stb__perlin_fastfloor(x);
+   int py = stb__perlin_fastfloor(y);
+   int pz = stb__perlin_fastfloor(z);
+   int x_wrap2 = (x_wrap > 0 && x_wrap <= 256) ? x_wrap : 256; // the 512-entry tables are indexed with hash(<=255) + coord,
+   int y_wrap2 = (y_wrap > 0 && y_wrap <= 256) ? y_wrap : 256; // so coords must stay in 0..255: a negative or oversized
+   int z_wrap2 = (z_wrap > 0 && z_wrap <= 256) ? z_wrap : 256; // period would read outside them, hence the fallback to 256
+   int x0 = px % x_wrap2, x1;
+   int y0 = py % y_wrap2, y1;
+   int z0 = pz % z_wrap2, z1;
+   int r0,r1, r00,r01,r10,r11;
+
+   if (x0 < 0) x0 += x_wrap2; // '%' keeps the sign of a negative dividend; shift back into 0..wrap-1
+   if (y0 < 0) y0 += y_wrap2;
+   if (z0 < 0) z0 += z_wrap2;
+   x1 = (x0+1) % x_wrap2;
+   y1 = (y0+1) % y_wrap2;
+   z1 = (z0+1) % z_wrap2;
+
+   #define stb__perlin_ease(a)   (((a*6-15)*a + 10) * a * a * a)
+
+   x -= px; u = stb__perlin_ease(x); // x,y,z become the position inside the cell; u,v,w are the eased blend weights
+   y -= py; v = stb__perlin_ease(y);
+   z -= pz; w = stb__perlin_ease(z);
+
+   r0 = stb__perlin_randtab[x0];
+   r0 = stb__perlin_randtab[r0+seed]; // the seed is folded in after hashing the x coordinate
+   r1 = stb__perlin_randtab[x1];
+   r1 = stb__perlin_randtab[r1+seed];
+
+   r00 = stb__perlin_randtab[r0+y0];
+   r01 = stb__perlin_randtab[r0+y1];
+   r10 = stb__perlin_randtab[r1+y0];
+   r11 = stb__perlin_randtab[r1+y1];
+
+   n000 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r00+z0], x  , y  , z   );
+   n001 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r00+z1], x  , y  , z-1 );
+   n010 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r01+z0], x  , y-1, z   );
+   n011 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r01+z1], x  , y-1, z-1 );
+   n100 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r10+z0], x-1, y  , z   );
+   n101 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r10+z1], x-1, y  , z-1 );
+   n110 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r11+z0], x-1, y-1, z   );
+   n111 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r11+z1], x-1, y-1, z-1 );
+
+   n00 = stb__perlin_lerp(n000,n001,w);
+   n01 = stb__perlin_lerp(n010,n011,w);
+   n10 = stb__perlin_lerp(n100,n101,w);
+   n11 = stb__perlin_lerp(n110,n111,w);
+
+   n0 = stb__perlin_lerp(n00,n01,v);
+   n1 = stb__perlin_lerp(n10,n11,v);
+
+   return stb__perlin_lerp(n0,n1,u);
+}
+#endif  // STB_PERLIN_IMPLEMENTATION
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_rect_pack.h b/lib/stb/stb_rect_pack.h
new file mode 100644
index 0000000..6a633ce
--- /dev/null
+++ b/lib/stb/stb_rect_pack.h
@@ -0,0 +1,623 @@
+// stb_rect_pack.h - v1.01 - public domain - rectangle packing
+// Sean Barrett 2014
+//
+// Useful for e.g. packing rectangular textures into an atlas.
+// Does not do rotation.
+//
+// Before #including,
+//
+//    #define STB_RECT_PACK_IMPLEMENTATION
+//
+// in the file that you want to have the implementation.
+//
+// Not necessarily the awesomest packing method, but better than
+// the totally naive one in stb_truetype (which is primarily what
+// this is meant to replace).
+//
+// Has only had a few tests run, may have issues.
+//
+// More docs to come.
+//
+// No memory allocations; uses qsort() and assert() from stdlib.
+// Can override those by defining STBRP_SORT and STBRP_ASSERT.
+//
+// This library currently uses the Skyline Bottom-Left algorithm.
+//
+// Please note: better rectangle packers are welcome! Please
+// implement them to the same API, but with a different init
+// function.
+//
+// Credits
+//
+//  Library
+//    Sean Barrett
+//  Minor features
+//    Martins Mozeiko
+//    github:IntellectualKitty
+//
+//  Bugfixes / warning fixes
+//    Jeremy Jaussaud
+//    Fabian Giesen
+//
+// Version history:
+//
+//     1.01  (2021-07-11)  always use large rect mode, expose STBRP__MAXVAL in public section
+//     1.00  (2019-02-25)  avoid small space waste; gracefully fail too-wide rectangles
+//     0.99  (2019-02-07)  warning fixes
+//     0.11  (2017-03-03)  return packing success/fail result
+//     0.10  (2016-10-25)  remove cast-away-const to avoid warnings
+//     0.09  (2016-08-27)  fix compiler warnings
+//     0.08  (2015-09-13)  really fix bug with empty rects (w=0 or h=0)
+//     0.07  (2015-09-13)  fix bug with empty rects (w=0 or h=0)
+//     0.06  (2015-04-15)  added STBRP_SORT to allow replacing qsort
+//     0.05:  added STBRP_ASSERT to allow replacing assert
+//     0.04:  fixed minor bug in STBRP_LARGE_RECTS support
+//     0.01:  initial release
+//
+// LICENSE
+//
+//   See end of file for license information.
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//       INCLUDE SECTION
+//
+
+#ifndef STB_INCLUDE_STB_RECT_PACK_H
+#define STB_INCLUDE_STB_RECT_PACK_H
+
+#define STB_RECT_PACK_VERSION  1
+
+#ifdef STBRP_STATIC
+#define STBRP_DEF static
+#else
+#define STBRP_DEF extern
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct stbrp_context stbrp_context;
+typedef struct stbrp_node    stbrp_node;
+typedef struct stbrp_rect    stbrp_rect;
+
+typedef int            stbrp_coord;
+
+#define STBRP__MAXVAL  0x7fffffff
+// Mostly for internal use, but this is the maximum supported coordinate value.
+
+STBRP_DEF int stbrp_pack_rects (stbrp_context *context, stbrp_rect *rects, int num_rects);
+// Assign packed locations to rectangles. The rectangles are of type
+// 'stbrp_rect' defined below, stored in the array 'rects', and there
+// are 'num_rects' many of them.
+//
+// Rectangles which are successfully packed have the 'was_packed' flag
+// set to a non-zero value and 'x' and 'y' store the minimum location
+// on each axis (i.e. bottom-left in cartesian coordinates, top-left
+// if you imagine y increasing downwards). Rectangles which do not fit
+// have the 'was_packed' flag set to 0.
+//
+// You should not try to access the 'rects' array from another thread
+// while this function is running, as the function temporarily reorders
+// the array while it executes.
+//
+// To pack into another rectangle, you need to call stbrp_init_target
+// again. To continue packing into the same rectangle, you can call
+// this function again. Calling this multiple times with multiple rect
+// arrays will probably produce worse packing results than calling it
+// a single time with the full rectangle array, but the option is
+// available.
+//
+// The function returns 1 if all of the rectangles were successfully
+// packed and 0 otherwise.
+
+struct stbrp_rect
+{
+   // reserved for your use:
+   int            id;
+
+   // input:
+   stbrp_coord    w, h;
+
+   // output:
+   stbrp_coord    x, y;
+   int            was_packed;  // non-zero if valid packing
+
+}; // 24 bytes, nominally (six int-sized fields; stbrp_coord is typedef'd to int)
+
+
+STBRP_DEF void stbrp_init_target (stbrp_context *context, int width, int height, stbrp_node *nodes, int num_nodes);
+// Initialize a rectangle packer to:
+//    pack a rectangle that is 'width' by 'height' in dimensions
+//    using temporary storage provided by the array 'nodes', which is 'num_nodes' long
+//
+// You must call this function every time you start packing into a new target.
+//
+// There is no "shutdown" function. The 'nodes' memory must stay valid for
+// the following stbrp_pack_rects() call (or calls), but can be freed after
+// the call (or calls) finish.
+//
+// Note: to guarantee best results, either:
+//       1. make sure 'num_nodes' >= 'width'
+//   or  2. call stbrp_setup_allow_out_of_mem() defined below with 'allow_out_of_mem = 1'
+//
+// If you don't do either of the above things, widths will be quantized to multiples
+// of small integers to guarantee the algorithm doesn't run out of temporary storage.
+//
+// If you do #2, then the non-quantized algorithm will be used, but the algorithm
+// may run out of temporary storage and be unable to pack some rectangles.
+
+STBRP_DEF void stbrp_setup_allow_out_of_mem (stbrp_context *context, int allow_out_of_mem);
+// Optionally call this function after init but before doing any packing to
+// change the handling of the out-of-temp-memory scenario, described above.
+// If you call init again, this will be reset to the default (false).
+
+
+STBRP_DEF void stbrp_setup_heuristic (stbrp_context *context, int heuristic);
+// Optionally select which packing heuristic the library should use. Different
+// heuristics will produce better/worse results for different data sets.
+// If you call init again, this will be reset to the default.
+
+enum
+{
+   STBRP_HEURISTIC_Skyline_default=0,
+   STBRP_HEURISTIC_Skyline_BL_sortHeight = STBRP_HEURISTIC_Skyline_default, // alias of the default (value 0)
+   STBRP_HEURISTIC_Skyline_BF_sortHeight // value 1; presumably a "best fit" variant -- TODO confirm against the packer code
+};
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// the details of the following structures don't matter to you, but they must
+// be visible so you can handle the memory allocations for them
+
+struct stbrp_node
+{
+   stbrp_coord  x,y;   // where this skyline segment starts (x) and its level (y); it extends to next->x
+   stbrp_node  *next;  // following node in the active or free list; NULL terminates the list
+};
+
+struct stbrp_context
+{
+   int width;                // target width, as passed to stbrp_init_target
+   int height;               // target height, as passed to stbrp_init_target
+   int align;                // rectangle widths are rounded up to a multiple of this before placement
+   int init_mode;            // set to STBRP__INIT_skyline by stbrp_init_target
+   int heuristic;            // one of the STBRP_HEURISTIC_* values
+   int num_nodes;            // number of caller-supplied nodes passed to stbrp_init_target
+   stbrp_node *active_head;  // first node of the current skyline list
+   stbrp_node *free_head;    // first unused node, or NULL when none are left
+   stbrp_node extra[2]; // we allocate two extra nodes so optimal user-node-count is 'width' not 'width+2'
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//     IMPLEMENTATION SECTION
+//
+
+#ifdef STB_RECT_PACK_IMPLEMENTATION
+#ifndef STBRP_SORT
+#include <stdlib.h>
+#define STBRP_SORT qsort
+#endif
+
+#ifndef STBRP_ASSERT
+#include <assert.h>
+#define STBRP_ASSERT assert
+#endif
+
+#ifdef _MSC_VER
+#define STBRP__NOTUSED(v)  (void)(v)
+#define STBRP__CDECL       __cdecl
+#else
+#define STBRP__NOTUSED(v)  (void)sizeof(v)
+#define STBRP__CDECL
+#endif
+
+enum
+{
+   STBRP__INIT_skyline = 1
+};
+
+STBRP_DEF void stbrp_setup_heuristic(stbrp_context *context, int heuristic)
+{  // Records 'heuristic' in the context after asserting it is one of the two skyline heuristics.
+   switch (context->init_mode) {
+      case STBRP__INIT_skyline:
+         STBRP_ASSERT(heuristic == STBRP_HEURISTIC_Skyline_BL_sortHeight || heuristic == STBRP_HEURISTIC_Skyline_BF_sortHeight);
+         context->heuristic = heuristic;
+         break;
+      default:
+         STBRP_ASSERT(0); // init_mode is not a mode this function knows about
+   }
+}
+
+STBRP_DEF void stbrp_setup_allow_out_of_mem(stbrp_context *context, int allow_out_of_mem)
+{  // Chooses context->align: 1 when running out of nodes is acceptable, otherwise a width quantum.
+   if (allow_out_of_mem)
+      // if it's ok to run out of memory, then don't bother aligning them;
+      // this gives better packing, but may fail due to OOM (even though
+      // the rectangles easily fit). @TODO a smarter approach would be to only
+      // quantize once we've hit OOM, then we could get rid of this parameter.
+      context->align = 1;
+   else {
+      // if it's not ok to run out of memory, then quantize the widths
+      // so that num_nodes is always enough nodes.
+      //
+      // I.e. num_nodes * align >= width
+      //                  align >= width / num_nodes
+      //                  align = ceil(width/num_nodes)
+
+      context->align = (context->width + context->num_nodes-1) / context->num_nodes; // NOTE: divides by num_nodes, which must be non-zero
+   }
+}
+
+STBRP_DEF void stbrp_init_target(stbrp_context *context, int width, int height, stbrp_node *nodes, int num_nodes)
+{  // Resets the context for a new width x height target using the caller-supplied node array.
+   int i;
+
+   for (i=0; i < num_nodes-1; ++i) // thread the caller's nodes into a singly linked free list
+      nodes[i].next = &nodes[i+1];
+   nodes[i].next = NULL; // i is the last index here (0 when num_nodes <= 1), so nodes[0] is written even if num_nodes is 0
+   context->init_mode = STBRP__INIT_skyline;
+   context->heuristic = STBRP_HEURISTIC_Skyline_default;
+   context->free_head = &nodes[0];
+   context->active_head = &context->extra[0];
+   context->width = width;
+   context->height = height;
+   context->num_nodes = num_nodes;
+   stbrp_setup_allow_out_of_mem(context, 0); // default mode: computes context->align from width and num_nodes
+
+   // node 0 is the full width, node 1 is the sentinel (lets us not store width explicitly)
+   context->extra[0].x = 0;
+   context->extra[0].y = 0;
+   context->extra[0].next = &context->extra[1];
+   context->extra[1].x = (stbrp_coord) width;
+   context->extra[1].y = (1<<30);
+   context->extra[1].next = NULL;
+}
+
+// find minimum y position if it starts at x0; the accumulated waste area is stored in *pwaste
+static int stbrp__skyline_find_min_y(stbrp_context *c, stbrp_node *first, int x0, int width, int *pwaste)
+{
+   stbrp_node *node = first;
+   int x1 = x0 + width; // exclusive right edge of the span being tested
+   int min_y, visited_width, waste_area;
+
+   STBRP__NOTUSED(c);
+
+   STBRP_ASSERT(first->x <= x0);
+
+   #if 0
+   // skip in case we're past the node
+   while (node->next->x <= x0)
+      ++node;
+   #else
+   STBRP_ASSERT(node->next->x > x0); // we ended up handling this in the caller for efficiency
+   #endif
+
+   STBRP_ASSERT(node->x <= x0);
+
+   min_y = 0;
+   waste_area = 0;
+   visited_width = 0;
+   while (node->x < x1) { // visit every node that starts before the right edge
+      if (node->y > min_y) {
+         // raise min_y higher.
+         // we've accounted for all waste up to min_y,
+         // but we'll now add more waste for everything we've visited
+         waste_area += visited_width * (node->y - min_y);
+         min_y = node->y;
+         // the first time through, visited_width might be reduced
+         if (node->x < x0)
+            visited_width += node->next->x - x0;
+         else
+            visited_width += node->next->x - node->x;
+      } else {
+         // add waste area
+         int under_width = node->next->x - node->x;
+         if (under_width + visited_width > width) // clip the last segment to the span's width
+            under_width = width - visited_width;
+         waste_area += under_width * (min_y - node->y);
+         visited_width += under_width;
+      }
+      node = node->next;
+   }
+
+   *pwaste = waste_area;
+   return min_y; // highest node->y seen across the span
+}
+
+typedef struct
+{
+   int x,y;                 // chosen position for the rectangle
+   stbrp_node **prev_link;  // link pointing at the node where the placement begins; NULL means no position was found
+} stbrp__findresult;
+
+static stbrp__findresult stbrp__skyline_find_best_pos(stbrp_context *c, int width, int height)
+{  // Scans the active skyline for the best position for a width x height rect under c->heuristic.
+   int best_waste = (1<<30), best_x, best_y = (1 << 30);
+   stbrp__findresult fr;
+   stbrp_node **prev, *node, *tail, **best = NULL;
+
+   // align to multiple of c->align
+   width = (width + c->align - 1);
+   width -= width % c->align;
+   STBRP_ASSERT(width % c->align == 0);
+
+   // if it can't possibly fit, bail immediately
+   if (width > c->width || height > c->height) {
+      fr.prev_link = NULL;
+      fr.x = fr.y = 0;
+      return fr;
+   }
+
+   node = c->active_head;
+   prev = &c->active_head;
+   while (node->x + width <= c->width) { // try left-aligning the rect at each node that leaves room for it
+      int y,waste;
+      y = stbrp__skyline_find_min_y(c, node, node->x, width, &waste);
+      if (c->heuristic == STBRP_HEURISTIC_Skyline_BL_sortHeight) { // actually just want to test BL
+         // bottom left
+         if (y < best_y) { // no vertical-fit check here; stbrp__skyline_pack_rectangle rejects results that do not fit
+            best_y = y;
+            best = prev;
+         }
+      } else {
+         // best-fit
+         if (y + height <= c->height) {
+            // can only use it if it fits vertically
+            if (y < best_y || (y == best_y && waste < best_waste)) {
+               best_y = y;
+               best_waste = waste;
+               best = prev;
+            }
+         }
+      }
+      prev = &node->next;
+      node = node->next;
+   }
+
+   best_x = (best == NULL) ? 0 : (*best)->x; // x of the best left-aligned candidate, 0 if there is none
+
+   // if doing best-fit (BF), we also have to try aligning right edge to each node position
+   //
+   // e.g, if fitting
+   //
+   //     ____________________
+   //    |____________________|
+   //
+   //            into
+   //
+   //   |                         |
+   //   |             ____________|
+   //   |____________|
+   //
+   // then right-aligned reduces waste, but bottom-left BL always chooses left-aligned
+   //
+   // This makes BF take about 2x the time
+
+   if (c->heuristic == STBRP_HEURISTIC_Skyline_BF_sortHeight) {
+      tail = c->active_head;
+      node = c->active_head;
+      prev = &c->active_head;
+      // find first node that's admissible
+      while (tail->x < width)
+         tail = tail->next;
+      while (tail) {
+         int xpos = tail->x - width; // left edge when the rect's right edge sits at tail->x
+         int y,waste;
+         STBRP_ASSERT(xpos >= 0);
+         // find the left position that matches this
+         while (node->next->x <= xpos) {
+            prev = &node->next;
+            node = node->next;
+         }
+         STBRP_ASSERT(node->next->x > xpos && node->x <= xpos);
+         y = stbrp__skyline_find_min_y(c, node, xpos, width, &waste);
+         if (y + height <= c->height) {
+            if (y <= best_y) {
+               if (y < best_y || waste < best_waste || (waste==best_waste && xpos < best_x)) {
+                  best_x = xpos;
+                  STBRP_ASSERT(y <= best_y);
+                  best_y = y;
+                  best_waste = waste;
+                  best = prev;
+               }
+            }
+         }
+         tail = tail->next;
+      }
+   }
+
+   fr.prev_link = best; // still NULL when no candidate was accepted
+   fr.x = best_x;
+   fr.y = best_y;
+   return fr;
+}
+
+static stbrp__findresult stbrp__skyline_pack_rectangle(stbrp_context *context, int width, int height)
+{  // Finds a position for the rect and, on success, splices a new node into the skyline list.
+   // find best position according to heuristic
+   stbrp__findresult res = stbrp__skyline_find_best_pos(context, width, height);
+   stbrp_node *node, *cur;
+
+   // bail if:
+   //    1. it failed
+   //    2. the best node doesn't fit (we don't always check this)
+   //    3. we're out of memory
+   if (res.prev_link == NULL || res.y + height > context->height || context->free_head == NULL) {
+      res.prev_link = NULL; // callers test prev_link to detect failure
+      return res;
+   }
+
+   // on success, create new node
+   node = context->free_head;
+   node->x = (stbrp_coord) res.x;
+   node->y = (stbrp_coord) (res.y + height); // the skyline level over this span becomes the rect's top edge
+
+   context->free_head = node->next;
+
+   // insert the new node into the right starting point, and
+   // let 'cur' point to the remaining nodes needing to be
+   // stitched back in
+
+   cur = *res.prev_link;
+   if (cur->x < res.x) {
+      // preserve the existing one, so start testing with the next one
+      stbrp_node *next = cur->next;
+      cur->next = node;
+      cur = next;
+   } else {
+      *res.prev_link = node;
+   }
+
+   // from here, traverse cur and free the nodes, until we get to one
+   // that shouldn't be freed
+   while (cur->next && cur->next->x <= res.x + width) {
+      stbrp_node *next = cur->next;
+      // move the current node to the free list
+      cur->next = context->free_head;
+      context->free_head = cur;
+      cur = next;
+   }
+
+   // stitch the list back in
+   node->next = cur;
+
+   if (cur->x < res.x + width) // the surviving node now starts at the new rect's right edge
+      cur->x = (stbrp_coord) (res.x + width);
+
+#ifdef _DEBUG
+   cur = context->active_head;
+   while (cur->x < context->width) {
+      STBRP_ASSERT(cur->x < cur->next->x);
+      cur = cur->next;
+   }
+   STBRP_ASSERT(cur->next == NULL);
+
+   {
+      int count=0;
+      cur = context->active_head;
+      while (cur) {
+         cur = cur->next;
+         ++count;
+      }
+      cur = context->free_head;
+      while (cur) {
+         cur = cur->next;
+         ++count;
+      }
+      STBRP_ASSERT(count == context->num_nodes+2); // active + free lists must account for every node plus extra[2]
+   }
+#endif
+
+   return res;
+}
+
+static int STBRP__CDECL rect_height_compare(const void *a, const void *b)
+{  // Sort comparator over stbrp_rect: larger h orders first; equal h falls back to larger w first.
+   const stbrp_rect *p = (const stbrp_rect *) a;
+   const stbrp_rect *q = (const stbrp_rect *) b;
+   if (p->h > q->h)
+      return -1;
+   if (p->h < q->h)
+      return  1;
+   return (p->w > q->w) ? -1 : (p->w < q->w); // -1, 1, or 0 when both h and w are equal
+}
+
+static int STBRP__CDECL rect_original_order(const void *a, const void *b)
+{  // Sort comparator over stbrp_rect: ascending was_packed, which stbrp_pack_rects sets to the original index.
+   const stbrp_rect *p = (const stbrp_rect *) a;
+   const stbrp_rect *q = (const stbrp_rect *) b;
+   return (p->was_packed < q->was_packed) ? -1 : (p->was_packed > q->was_packed);
+}
+
+STBRP_DEF int stbrp_pack_rects(stbrp_context *context, stbrp_rect *rects, int num_rects)
+{  // Assigns x/y and was_packed for each rect; returns 1 if every rect was packed, 0 otherwise.
+   int i, all_rects_packed = 1;
+
+   // we use the 'was_packed' field internally to allow sorting/unsorting
+   for (i=0; i < num_rects; ++i) {
+      rects[i].was_packed = i;
+   }
+
+   // sort according to heuristic
+   STBRP_SORT(rects, num_rects, sizeof(rects[0]), rect_height_compare);
+
+   for (i=0; i < num_rects; ++i) {
+      if (rects[i].w == 0 || rects[i].h == 0) {
+         rects[i].x = rects[i].y = 0;  // empty rect needs no space
+      } else {
+         stbrp__findresult fr = stbrp__skyline_pack_rectangle(context, rects[i].w, rects[i].h);
+         if (fr.prev_link) {
+            rects[i].x = (stbrp_coord) fr.x;
+            rects[i].y = (stbrp_coord) fr.y;
+         } else {
+            rects[i].x = rects[i].y = STBRP__MAXVAL; // marks the rect as not packed; decoded in the final loop below
+         }
+      }
+   }
+
+   // unsort
+   STBRP_SORT(rects, num_rects, sizeof(rects[0]), rect_original_order);
+
+   // set was_packed flags and all_rects_packed status
+   for (i=0; i < num_rects; ++i) {
+      rects[i].was_packed = !(rects[i].x == STBRP__MAXVAL && rects[i].y == STBRP__MAXVAL);
+      if (!rects[i].was_packed)
+         all_rects_packed = 0;
+   }
+
+   // return the all_rects_packed status
+   return all_rects_packed;
+}
+#endif
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_sprintf.h b/lib/stb/stb_sprintf.h
new file mode 100644
index 0000000..ca432a6
--- /dev/null
+++ b/lib/stb/stb_sprintf.h
@@ -0,0 +1,1906 @@
+// stb_sprintf - v1.10 - public domain snprintf() implementation
+// originally by Jeff Roberts / RAD Game Tools, 2015/10/20
+// http://github.com/nothings/stb
+//
+// allowed types:  sc uidBboXx p AaGgEef n
+// lengths      :  hh h ll j z t I64 I32 I
+//
+// Contributors:
+//    Fabian "ryg" Giesen (reformatting)
+//    github:aganm (attribute format)
+//
+// Contributors (bugfixes):
+//    github:d26435
+//    github:trex78
+//    github:account-login
+//    Jari Komppa (SI suffixes)
+//    Rohit Nirmal
+//    Marcin Wojdyr
+//    Leonard Ritter
+//    Stefano Zanotti
+//    Adam Allison
+//    Arvid Gerstmann
+//    Markus Kolb
+//
+// LICENSE:
+//
+//   See end of file for license information.
+
+#ifndef STB_SPRINTF_H_INCLUDE
+#define STB_SPRINTF_H_INCLUDE
+
+/*
+Single file sprintf replacement.
+
+Originally written by Jeff Roberts at RAD Game Tools - 2015/10/20.
+Hereby placed in public domain.
+
+This is a full sprintf replacement that supports everything that
+the C runtime sprintfs support, including float/double, 64-bit integers,
+hex floats, field parameters (%*.*d stuff), length read backs, etc.
+
+Why would you need this if sprintf already exists?  Well, first off,
+it's *much* faster (see below). It's also much smaller than the CRT
+versions code-space-wise. We've also added some simple improvements
+that are super handy (commas in thousands, callbacks at buffer full,
+for example). Finally, the format strings for MSVC and GCC differ
+for 64-bit integers (among other small things), so this lets you use
+the same format strings in cross platform code.
+
+It uses the standard single file trick of being both the header file
+and the source itself. If you just include it normally, you just get
+the header file function definitions. To get the code, you include
+it from a C or C++ file and define STB_SPRINTF_IMPLEMENTATION first.
+
+It only uses va_args macros from the C runtime to do its work. It
+does cast doubles to S64s and shifts and divides U64s, which does
+drag in CRT code on most platforms.
+
+It compiles to roughly 8K with float support, and 4K without.
+As a comparison, when using MSVC static libs, calling sprintf drags
+in 16K.
+
+API:
+====
+int stbsp_sprintf( char * buf, char const * fmt, ... )
+int stbsp_snprintf( char * buf, int count, char const * fmt, ... )
+  Convert an arg list into a buffer.  stbsp_snprintf always returns
+  a zero-terminated string (unlike regular snprintf).
+
+int stbsp_vsprintf( char * buf, char const * fmt, va_list va )
+int stbsp_vsnprintf( char * buf, int count, char const * fmt, va_list va )
+  Convert a va_list arg list into a buffer.  stbsp_vsnprintf always returns
+  a zero-terminated string (unlike regular snprintf).
+
+int stbsp_vsprintfcb( STBSP_SPRINTFCB * callback, void * user, char * buf, char const * fmt, va_list va )
+    typedef char * STBSP_SPRINTFCB( char const * buf, void * user, int len );
+  Convert into a buffer, calling back every STB_SPRINTF_MIN chars.
+  Your callback can then copy the chars out, print them or whatever.
+  This function is actually the workhorse for everything else.
+  The buffer you pass in must hold at least STB_SPRINTF_MIN characters.
+    // you return the next buffer to use or 0 to stop converting
+
+void stbsp_set_separators( char comma, char period )
+  Set the comma and period characters to use.
+
+FLOATS/DOUBLES:
+===============
+This code uses an internal float->ascii conversion method that uses
+doubles with error correction (double-doubles, for ~105 bits of
+precision).  This conversion is round-trip perfect - that is, an atof
+of the values output here will give you the bit-exact double back.
+
+One difference is that our insignificant digits will be different than
+with MSVC or GCC (but they don't match each other either).  We also
+don't attempt to find the minimum length matching float (pre-MSVC15
+doesn't either).
+
+If you don't need float or doubles at all, define STB_SPRINTF_NOFLOAT
+and you'll save 4K of code space.
+
+64-BIT INTS:
+============
+This library also supports 64-bit integers and you can use MSVC style or
+GCC style indicators (%I64d or %lld).  It supports the C99 specifiers
+for intmax_t, size_t and ptrdiff_t (%jd %zd %td) as well.
+
+EXTRAS:
+=======
+Like some GCCs, for integers and floats, you can use a ' (single quote)
+specifier and commas will be inserted on the thousands: "%'d" on 12345
+would print 12,345.
+
+For integers and floats, you can use a "$" specifier and the number
+will be converted to float and then divided to get kilo, mega, giga or
+tera and then printed, so "%$d" 1000 is "1.0 k", "%$.2d" 2536000 is
+"2.53 M", etc. For byte values, use two $:s, like "%$$d" to turn
+2536000 to "2.42 Mi". If you prefer JEDEC suffixes to SI ones, use three
+$:s: "%$$$d" -> "2.42 M". To remove the space between the number and the
+suffix, add "_" specifier: "%_$d" -> "2.53M".
+
+In addition to octal and hexadecimal conversions, you can print
+integers in binary: "%b" for 4 would print 100.
+
+PERFORMANCE vs MSVC 2008 32-/64-bit (GCC is even slower than MSVC):
+===================================================================
+"%d" across all 32-bit ints (4.8x/4.0x faster than 32-/64-bit MSVC)
+"%24d" across all 32-bit ints (4.5x/4.2x faster)
+"%x" across all 32-bit ints (4.5x/3.8x faster)
+"%08x" across all 32-bit ints (4.3x/3.8x faster)
+"%f" across e-10 to e+10 floats (7.3x/6.0x faster)
+"%e" across e-10 to e+10 floats (8.1x/6.0x faster)
+"%g" across e-10 to e+10 floats (10.0x/7.1x faster)
+"%f" for values near e-300 (7.9x/6.5x faster)
+"%f" for values near e+300 (10.0x/9.1x faster)
+"%e" for values near e-300 (10.1x/7.0x faster)
+"%e" for values near e+300 (9.2x/6.0x faster)
+"%.320f" for values near e-300 (12.6x/11.2x faster)
+"%a" for random values (8.6x/4.3x faster)
+"%I64d" for 64-bits with 32-bit values (4.8x/3.4x faster)
+"%I64d" for 64-bits > 32-bit values (4.9x/5.5x faster)
+"%s%s%s" for 64 char strings (7.1x/7.3x faster)
+"...512 char string..." ( 35.0x/32.5x faster!)
+*/
+
+#if defined(__clang__)
+ #if defined(__has_feature) && defined(__has_attribute)
+  #if __has_feature(address_sanitizer)
+   #if __has_attribute(__no_sanitize__)
+    #define STBSP__ASAN __attribute__((__no_sanitize__("address")))
+   #elif __has_attribute(__no_sanitize_address__)
+    #define STBSP__ASAN __attribute__((__no_sanitize_address__))
+   #elif __has_attribute(__no_address_safety_analysis__)
+    #define STBSP__ASAN __attribute__((__no_address_safety_analysis__))
+   #endif
+  #endif
+ #endif
+#elif defined(__GNUC__) && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+ #if defined(__SANITIZE_ADDRESS__) && __SANITIZE_ADDRESS__
+  #define STBSP__ASAN __attribute__((__no_sanitize_address__))
+ #endif
+#endif
+
+#ifndef STBSP__ASAN
+#define STBSP__ASAN
+#endif
+
+#ifdef STB_SPRINTF_STATIC
+#define STBSP__PUBLICDEC static
+#define STBSP__PUBLICDEF static STBSP__ASAN
+#else
+#ifdef __cplusplus
+#define STBSP__PUBLICDEC extern "C"
+#define STBSP__PUBLICDEF extern "C" STBSP__ASAN
+#else
+#define STBSP__PUBLICDEC extern
+#define STBSP__PUBLICDEF STBSP__ASAN
+#endif
+#endif
+
+#if defined(__has_attribute)
+ #if __has_attribute(format)
+   #define STBSP__ATTRIBUTE_FORMAT(fmt,va) __attribute__((format(printf,fmt,va)))
+ #endif
+#endif
+
+#ifndef STBSP__ATTRIBUTE_FORMAT
+#define STBSP__ATTRIBUTE_FORMAT(fmt,va)
+#endif
+
+#ifdef _MSC_VER
+#define STBSP__NOTUSED(v)  (void)(v)
+#else
+#define STBSP__NOTUSED(v)  (void)sizeof(v)
+#endif
+
+#include <stdarg.h> // for va_arg(), va_list()
+#include <stddef.h> // size_t, ptrdiff_t
+
+#ifndef STB_SPRINTF_MIN
+#define STB_SPRINTF_MIN 512 // how many characters per callback
+#endif
+typedef char *STBSP_SPRINTFCB(const char *buf, void *user, int len);
+
+#ifndef STB_SPRINTF_DECORATE
+#define STB_SPRINTF_DECORATE(name) stbsp_##name // define this before including if you want to change the names
+#endif
+
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(vsprintf)(char *buf, char const *fmt, va_list va);
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(vsnprintf)(char *buf, int count, char const *fmt, va_list va);
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(sprintf)(char *buf, char const *fmt, ...) STBSP__ATTRIBUTE_FORMAT(2,3);
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(snprintf)(char *buf, int count, char const *fmt, ...) STBSP__ATTRIBUTE_FORMAT(3,4);
+
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback, void *user, char *buf, char const *fmt, va_list va);
+STBSP__PUBLICDEC void STB_SPRINTF_DECORATE(set_separators)(char comma, char period);
+
+#endif // STB_SPRINTF_H_INCLUDE
+
+#ifdef STB_SPRINTF_IMPLEMENTATION
+
+#define stbsp__uint32 unsigned int
+#define stbsp__int32 signed int
+
+#ifdef _MSC_VER
+#define stbsp__uint64 unsigned __int64
+#define stbsp__int64 signed __int64
+#else
+#define stbsp__uint64 unsigned long long
+#define stbsp__int64 signed long long
+#endif
+#define stbsp__uint16 unsigned short
+
+#ifndef stbsp__uintptr
+#if defined(__ppc64__) || defined(__powerpc64__) || defined(__aarch64__) || defined(_M_X64) || defined(__x86_64__) || defined(__x86_64) || defined(__s390x__)
+#define stbsp__uintptr stbsp__uint64
+#else
+#define stbsp__uintptr stbsp__uint32
+#endif
+#endif
+
+#ifndef STB_SPRINTF_MSVC_MODE // used for MSVC2013 and earlier (MSVC2015 matches GCC)
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+#define STB_SPRINTF_MSVC_MODE
+#endif
+#endif
+
+#ifdef STB_SPRINTF_NOUNALIGNED // define this before inclusion to force stbsp_sprintf to always use aligned accesses
+#define STBSP__UNALIGNED(code)
+#else
+#define STBSP__UNALIGNED(code) code
+#endif
+
+#ifndef STB_SPRINTF_NOFLOAT
+// internal float utility functions
+static stbsp__int32 stbsp__real_to_str(char const **start, stbsp__uint32 *len, char *out, stbsp__int32 *decimal_pos, double value, stbsp__uint32 frac_digits);
+static stbsp__int32 stbsp__real_to_parts(stbsp__int64 *bits, stbsp__int32 *expo, double value);
+#define STBSP__SPECIAL 0x7000
+#endif
+
+static char stbsp__period = '.';
+static char stbsp__comma = ',';
+static struct
+{
+   short temp; // force next field to be 2-byte aligned
+   char pair[201]; // "00".."99" as consecutive two-character pairs (200 chars) plus the terminating NUL
+} stbsp__digitpair =
+{
+  0,
+   "00010203040506070809101112131415161718192021222324"
+   "25262728293031323334353637383940414243444546474849"
+   "50515253545556575859606162636465666768697071727374"
+   "75767778798081828384858687888990919293949596979899"
+};
+
+STBSP__PUBLICDEF void STB_SPRINTF_DECORATE(set_separators)(char pcomma, char pperiod)
+{  // Overwrites the file-scope separator characters; note the argument order is (comma, period).
+   stbsp__period = pperiod;
+   stbsp__comma = pcomma;
+}
+
+#define STBSP__LEFTJUST 1
+#define STBSP__LEADINGPLUS 2
+#define STBSP__LEADINGSPACE 4
+#define STBSP__LEADING_0X 8
+#define STBSP__LEADINGZERO 16
+#define STBSP__INTMAX 32
+#define STBSP__TRIPLET_COMMA 64
+#define STBSP__NEGATIVE 128
+#define STBSP__METRIC_SUFFIX 256
+#define STBSP__HALFWIDTH 512
+#define STBSP__METRIC_NOSPACE 1024
+#define STBSP__METRIC_1024 2048
+#define STBSP__METRIC_JEDEC 4096
+
+static void stbsp__lead_sign(stbsp__uint32 fl, char *sign)
+{  // Writes the sign prefix: sign[0] is its length (0 or 1) and sign[1] the character, so sign needs 2 chars.
+   sign[0] = 0;
+   if (fl & STBSP__NEGATIVE) {
+      sign[0] = 1;
+      sign[1] = '-';
+   } else if (fl & STBSP__LEADINGPLUS) { // tested before the space flag: in C printf, ' ' is ignored when '+' is also given
+      sign[0] = 1;
+      sign[1] = '+';
+   } else if (fl & STBSP__LEADINGSPACE) {
+      sign[0] = 1;
+      sign[1] = ' ';
+   }
+}
+
+static STBSP__ASAN stbsp__uint32 stbsp__strlen_limited(char const *s, stbsp__uint32 limit)
+{  // Returns the number of chars before the first 0 byte in s, counting at most 'limit' chars.
+   char const * sn = s;
+
+   // get up to 4-byte alignment
+   for (;;) {
+      if (((stbsp__uintptr)sn & 3) == 0)
+         break;
+
+      if (!limit || *sn == 0)
+         return (stbsp__uint32)(sn - s);
+
+      ++sn;
+      --limit;
+   }
+
+   // scan over 4 bytes at a time to find terminating 0
+   // this will intentionally scan up to 3 bytes past the end of buffers,
+   // but because it works 4B aligned, it will never cross page boundaries
+   // (hence the STBSP__ASAN markup; the over-read here is intentional
+   // and harmless)
+   while (limit >= 4) {
+      stbsp__uint32 v = *(stbsp__uint32 *)sn;
+      // bit hack to find if there's a 0 byte in there
+      if ((v - 0x01010101) & (~v) & 0x80808080UL)
+         break; // leave the exact position to the byte-wise loop below
+
+      sn += 4;
+      limit -= 4;
+   }
+
+   // handle the last few characters to find actual size
+   while (limit && *sn) {
+      ++sn;
+      --limit;
+   }
+
+   return (stbsp__uint32)(sn - s);
+}
+
+STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback, void *user, char *buf, char const *fmt, va_list va)
+{
+   static char hex[] = "0123456789abcdefxp";
+   static char hexu[] = "0123456789ABCDEFXP";
+   char *bf;
+   char const *f;
+   int tlen = 0;
+
+   bf = buf;
+   f = fmt;
+   for (;;) {
+      stbsp__int32 fw, pr, tz;
+      stbsp__uint32 fl;
+
+      // macros for the callback buffer stuff
+      #define stbsp__chk_cb_bufL(bytes)                        \
+         {                                                     \
+            int len = (int)(bf - buf);                         \
+            if ((len + (bytes)) >= STB_SPRINTF_MIN) {          \
+               tlen += len;                                    \
+               if (0 == (bf = buf = callback(buf, user, len))) \
+                  goto done;                                   \
+            }                                                  \
+         }
+      #define stbsp__chk_cb_buf(bytes)    \
+         {                                \
+            if (callback) {               \
+               stbsp__chk_cb_bufL(bytes); \
+            }                             \
+         }
+      #define stbsp__flush_cb()                      \
+         {                                           \
+            stbsp__chk_cb_bufL(STB_SPRINTF_MIN - 1); \
+         } // flush if there is even one byte in the buffer
+      #define stbsp__cb_buf_clamp(cl, v)                \
+         cl = v;                                        \
+         if (callback) {                                \
+            int lg = STB_SPRINTF_MIN - (int)(bf - buf); \
+            if (cl > lg)                                \
+               cl = lg;                                 \
+         }
+
+      // fast copy everything up to the next % (or end of string)
+      for (;;) {
+         while (((stbsp__uintptr)f) & 3) {
+         schk1:
+            if (f[0] == '%')
+               goto scandd;
+         schk2:
+            if (f[0] == 0)
+               goto endfmt;
+            stbsp__chk_cb_buf(1);
+            *bf++ = f[0];
+            ++f;
+         }
+         for (;;) {
+            // Check if the next 4 bytes contain %(0x25) or end of string.
+            // Using the 'hasless' trick:
+            // https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
+            stbsp__uint32 v, c;
+            v = *(stbsp__uint32 *)f;
+            c = (~v) & 0x80808080;
+            if (((v ^ 0x25252525) - 0x01010101) & c)
+               goto schk1;
+            if ((v - 0x01010101) & c)
+               goto schk2;
+            if (callback)
+               if ((STB_SPRINTF_MIN - (int)(bf - buf)) < 4)
+                  goto schk1;
+            #ifdef STB_SPRINTF_NOUNALIGNED
+                if(((stbsp__uintptr)bf) & 3) {
+                    bf[0] = f[0];
+                    bf[1] = f[1];
+                    bf[2] = f[2];
+                    bf[3] = f[3];
+                } else
+            #endif
+            {
+                *(stbsp__uint32 *)bf = v;
+            }
+            bf += 4;
+            f += 4;
+         }
+      }
+   scandd:
+
+      ++f;
+
+      // ok, we have a percent, read the modifiers first
+      fw = 0;
+      pr = -1;
+      fl = 0;
+      tz = 0;
+
+      // flags
+      for (;;) {
+         switch (f[0]) {
+         // if we have left justify
+         case '-':
+            fl |= STBSP__LEFTJUST;
+            ++f;
+            continue;
+         // if we have leading plus
+         case '+':
+            fl |= STBSP__LEADINGPLUS;
+            ++f;
+            continue;
+         // if we have leading space
+         case ' ':
+            fl |= STBSP__LEADINGSPACE;
+            ++f;
+            continue;
+         // if we have leading 0x
+         case '#':
+            fl |= STBSP__LEADING_0X;
+            ++f;
+            continue;
+         // if we have thousand commas
+         case '\'':
+            fl |= STBSP__TRIPLET_COMMA;
+            ++f;
+            continue;
+         // if we have kilo marker (none->kilo->kibi->jedec)
+         case '$':
+            if (fl & STBSP__METRIC_SUFFIX) {
+               if (fl & STBSP__METRIC_1024) {
+                  fl |= STBSP__METRIC_JEDEC;
+               } else {
+                  fl |= STBSP__METRIC_1024;
+               }
+            } else {
+               fl |= STBSP__METRIC_SUFFIX;
+            }
+            ++f;
+            continue;
+         // if we don't want space between metric suffix and number
+         case '_':
+            fl |= STBSP__METRIC_NOSPACE;
+            ++f;
+            continue;
+         // if we have leading zero
+         case '0':
+            fl |= STBSP__LEADINGZERO;
+            ++f;
+            goto flags_done;
+         default: goto flags_done;
+         }
+      }
+   flags_done:
+
+      // get the field width
+      if (f[0] == '*') {
+         fw = va_arg(va, stbsp__uint32);
+         ++f;
+      } else {
+         while ((f[0] >= '0') && (f[0] <= '9')) {
+            fw = fw * 10 + f[0] - '0';
+            f++;
+         }
+      }
+      // get the precision
+      if (f[0] == '.') {
+         ++f;
+         if (f[0] == '*') {
+            pr = va_arg(va, stbsp__uint32);
+            ++f;
+         } else {
+            pr = 0;
+            while ((f[0] >= '0') && (f[0] <= '9')) {
+               pr = pr * 10 + f[0] - '0';
+               f++;
+            }
+         }
+      }
+
+      // handle integer size overrides
+      switch (f[0]) {
+      // are we halfwidth?
+      case 'h':
+         fl |= STBSP__HALFWIDTH;
+         ++f;
+         if (f[0] == 'h')
+            ++f;  // QUARTERWIDTH
+         break;
+      // are we 64-bit (unix style)
+      case 'l':
+         fl |= ((sizeof(long) == 8) ? STBSP__INTMAX : 0);
+         ++f;
+         if (f[0] == 'l') {
+            fl |= STBSP__INTMAX;
+            ++f;
+         }
+         break;
+      // are we 64-bit on intmax? (c99)
+      case 'j':
+         fl |= (sizeof(size_t) == 8) ? STBSP__INTMAX : 0;
+         ++f;
+         break;
+      // are we 64-bit on size_t or ptrdiff_t? (c99)
+      case 'z':
+         fl |= (sizeof(ptrdiff_t) == 8) ? STBSP__INTMAX : 0;
+         ++f;
+         break;
+      case 't':
+         fl |= (sizeof(ptrdiff_t) == 8) ? STBSP__INTMAX : 0;
+         ++f;
+         break;
+      // are we 64-bit (msft style)
+      case 'I':
+         if ((f[1] == '6') && (f[2] == '4')) {
+            fl |= STBSP__INTMAX;
+            f += 3;
+         } else if ((f[1] == '3') && (f[2] == '2')) {
+            f += 3;
+         } else {
+            fl |= ((sizeof(void *) == 8) ? STBSP__INTMAX : 0);
+            ++f;
+         }
+         break;
+      default: break;
+      }
+
+      // handle each replacement
+      switch (f[0]) {
+         #define STBSP__NUMSZ 512 // big enough for e308 (with commas) or e-307
+         char num[STBSP__NUMSZ];
+         char lead[8];
+         char tail[8];
+         char *s;
+         char const *h;
+         stbsp__uint32 l, n, cs;
+         stbsp__uint64 n64;
+#ifndef STB_SPRINTF_NOFLOAT
+         double fv;
+#endif
+         stbsp__int32 dp;
+         char const *sn;
+
+      case 's':
+         // get the string
+         s = va_arg(va, char *);
+         if (s == 0)
+            s = (char *)"null";
+         // get the length, limited to desired precision
+         // always limit to ~0u chars since our counts are 32b
+         l = stbsp__strlen_limited(s, (pr >= 0) ? pr : ~0u);
+         lead[0] = 0;
+         tail[0] = 0;
+         pr = 0;
+         dp = 0;
+         cs = 0;
+         // copy the string in
+         goto scopy;
+
+      case 'c': // char
+         // get the character
+         s = num + STBSP__NUMSZ - 1;
+         *s = (char)va_arg(va, int);
+         l = 1;
+         lead[0] = 0;
+         tail[0] = 0;
+         pr = 0;
+         dp = 0;
+         cs = 0;
+         goto scopy;
+
+      case 'n': // weird write-bytes specifier
+      {
+         int *d = va_arg(va, int *);
+         *d = tlen + (int)(bf - buf);
+      } break;
+
+#ifdef STB_SPRINTF_NOFLOAT
+      case 'A':              // float
+      case 'a':              // hex float
+      case 'G':              // float
+      case 'g':              // float
+      case 'E':              // float
+      case 'e':              // float
+      case 'f':              // float
+         va_arg(va, double); // eat it
+         s = (char *)"No float";
+         l = 8;
+         lead[0] = 0;
+         tail[0] = 0;
+         pr = 0;
+         cs = 0;
+         STBSP__NOTUSED(dp);
+         goto scopy;
+#else
+      case 'A': // hex float
+      case 'a': // hex float
+         h = (f[0] == 'A') ? hexu : hex;
+         fv = va_arg(va, double);
+         if (pr == -1)
+            pr = 6; // default is 6
+         // read the double into a string
+         if (stbsp__real_to_parts((stbsp__int64 *)&n64, &dp, fv))
+            fl |= STBSP__NEGATIVE;
+
+         s = num + 64;
+
+         stbsp__lead_sign(fl, lead);
+
+         if (dp == -1023)
+            dp = (n64) ? -1022 : 0;
+         else
+            n64 |= (((stbsp__uint64)1) << 52);
+         n64 <<= (64 - 56);
+         if (pr < 15)
+            n64 += ((((stbsp__uint64)8) << 56) >> (pr * 4));
+// add leading chars
+
+#ifdef STB_SPRINTF_MSVC_MODE
+         *s++ = '0';
+         *s++ = 'x';
+#else
+         lead[1 + lead[0]] = '0';
+         lead[2 + lead[0]] = 'x';
+         lead[0] += 2;
+#endif
+         *s++ = h[(n64 >> 60) & 15];
+         n64 <<= 4;
+         if (pr)
+            *s++ = stbsp__period;
+         sn = s;
+
+         // print the bits
+         n = pr;
+         if (n > 13)
+            n = 13;
+         if (pr > (stbsp__int32)n)
+            tz = pr - n;
+         pr = 0;
+         while (n--) {
+            *s++ = h[(n64 >> 60) & 15];
+            n64 <<= 4;
+         }
+
+         // print the expo
+         tail[1] = h[17];
+         if (dp < 0) {
+            tail[2] = '-';
+            dp = -dp;
+         } else
+            tail[2] = '+';
+         n = (dp >= 1000) ? 6 : ((dp >= 100) ? 5 : ((dp >= 10) ? 4 : 3));
+         tail[0] = (char)n;
+         for (;;) {
+            tail[n] = '0' + dp % 10;
+            if (n <= 3)
+               break;
+            --n;
+            dp /= 10;
+         }
+
+         dp = (int)(s - sn);
+         l = (int)(s - (num + 64));
+         s = num + 64;
+         cs = 1 + (3 << 24);
+         goto scopy;
+
+      case 'G': // float
+      case 'g': // float
+         h = (f[0] == 'G') ? hexu : hex;
+         fv = va_arg(va, double);
+         if (pr == -1)
+            pr = 6;
+         else if (pr == 0)
+            pr = 1; // default is 6
+         // read the double into a string
+         if (stbsp__real_to_str(&sn, &l, num, &dp, fv, (pr - 1) | 0x80000000))
+            fl |= STBSP__NEGATIVE;
+
+         // clamp the precision and delete extra zeros after clamp
+         n = pr;
+         if (l > (stbsp__uint32)pr)
+            l = pr;
+         while ((l > 1) && (pr) && (sn[l - 1] == '0')) {
+            --pr;
+            --l;
+         }
+
+         // should we use %e
+         if ((dp <= -4) || (dp > (stbsp__int32)n)) {
+            if (pr > (stbsp__int32)l)
+               pr = l - 1;
+            else if (pr)
+               --pr; // when using %e, there is one digit before the decimal
+            goto doexpfromg;
+         }
+         // this is the insane action to get the pr to match %g semantics for %f
+         if (dp > 0) {
+            pr = (dp < (stbsp__int32)l) ? l - dp : 0;
+         } else {
+            pr = -dp + ((pr > (stbsp__int32)l) ? (stbsp__int32) l : pr);
+         }
+         goto dofloatfromg;
+
+      case 'E': // float
+      case 'e': // float
+         h = (f[0] == 'E') ? hexu : hex;
+         fv = va_arg(va, double);
+         if (pr == -1)
+            pr = 6; // default is 6
+         // read the double into a string
+         if (stbsp__real_to_str(&sn, &l, num, &dp, fv, pr | 0x80000000))
+            fl |= STBSP__NEGATIVE;
+      doexpfromg:
+         tail[0] = 0;
+         stbsp__lead_sign(fl, lead);
+         if (dp == STBSP__SPECIAL) {
+            s = (char *)sn;
+            cs = 0;
+            pr = 0;
+            goto scopy;
+         }
+         s = num + 64;
+         // handle leading chars
+         *s++ = sn[0];
+
+         if (pr)
+            *s++ = stbsp__period;
+
+         // handle after decimal
+         if ((l - 1) > (stbsp__uint32)pr)
+            l = pr + 1;
+         for (n = 1; n < l; n++)
+            *s++ = sn[n];
+         // trailing zeros
+         tz = pr - (l - 1);
+         pr = 0;
+         // dump expo
+         tail[1] = h[0xe];
+         dp -= 1;
+         if (dp < 0) {
+            tail[2] = '-';
+            dp = -dp;
+         } else
+            tail[2] = '+';
+#ifdef STB_SPRINTF_MSVC_MODE
+         n = 5;
+#else
+         n = (dp >= 100) ? 5 : 4;
+#endif
+         tail[0] = (char)n;
+         for (;;) {
+            tail[n] = '0' + dp % 10;
+            if (n <= 3)
+               break;
+            --n;
+            dp /= 10;
+         }
+         cs = 1 + (3 << 24); // how many tens
+         goto flt_lead;
+
+      case 'f': // float
+         fv = va_arg(va, double);
+      doafloat:
+         // do kilos
+         if (fl & STBSP__METRIC_SUFFIX) {
+            double divisor;
+            divisor = 1000.0f;
+            if (fl & STBSP__METRIC_1024)
+               divisor = 1024.0;
+            while (fl < 0x4000000) {
+               if ((fv < divisor) && (fv > -divisor))
+                  break;
+               fv /= divisor;
+               fl += 0x1000000;
+            }
+         }
+         if (pr == -1)
+            pr = 6; // default is 6
+         // read the double into a string
+         if (stbsp__real_to_str(&sn, &l, num, &dp, fv, pr))
+            fl |= STBSP__NEGATIVE;
+      dofloatfromg:
+         tail[0] = 0;
+         stbsp__lead_sign(fl, lead);
+         if (dp == STBSP__SPECIAL) {
+            s = (char *)sn;
+            cs = 0;
+            pr = 0;
+            goto scopy;
+         }
+         s = num + 64;
+
+         // handle the three decimal varieties
+         if (dp <= 0) {
+            stbsp__int32 i;
+            // handle 0.000*000xxxx
+            *s++ = '0';
+            if (pr)
+               *s++ = stbsp__period;
+            n = -dp;
+            if ((stbsp__int32)n > pr)
+               n = pr;
+            i = n;
+            while (i) {
+               if ((((stbsp__uintptr)s) & 3) == 0)
+                  break;
+               *s++ = '0';
+               --i;
+            }
+            while (i >= 4) {
+               *(stbsp__uint32 *)s = 0x30303030;
+               s += 4;
+               i -= 4;
+            }
+            while (i) {
+               *s++ = '0';
+               --i;
+            }
+            if ((stbsp__int32)(l + n) > pr)
+               l = pr - n;
+            i = l;
+            while (i) {
+               *s++ = *sn++;
+               --i;
+            }
+            tz = pr - (n + l);
+            cs = 1 + (3 << 24); // how many tens did we write (for commas below)
+         } else {
+            cs = (fl & STBSP__TRIPLET_COMMA) ? ((600 - (stbsp__uint32)dp) % 3) : 0;
+            if ((stbsp__uint32)dp >= l) {
+               // handle xxxx000*000.0
+               n = 0;
+               for (;;) {
+                  if ((fl & STBSP__TRIPLET_COMMA) && (++cs == 4)) {
+                     cs = 0;
+                     *s++ = stbsp__comma;
+                  } else {
+                     *s++ = sn[n];
+                     ++n;
+                     if (n >= l)
+                        break;
+                  }
+               }
+               if (n < (stbsp__uint32)dp) {
+                  n = dp - n;
+                  if ((fl & STBSP__TRIPLET_COMMA) == 0) {
+                     while (n) {
+                        if ((((stbsp__uintptr)s) & 3) == 0)
+                           break;
+                        *s++ = '0';
+                        --n;
+                     }
+                     while (n >= 4) {
+                        *(stbsp__uint32 *)s = 0x30303030;
+                        s += 4;
+                        n -= 4;
+                     }
+                  }
+                  while (n) {
+                     if ((fl & STBSP__TRIPLET_COMMA) && (++cs == 4)) {
+                        cs = 0;
+                        *s++ = stbsp__comma;
+                     } else {
+                        *s++ = '0';
+                        --n;
+                     }
+                  }
+               }
+               cs = (int)(s - (num + 64)) + (3 << 24); // cs is how many tens
+               if (pr) {
+                  *s++ = stbsp__period;
+                  tz = pr;
+               }
+            } else {
+               // handle xxxxx.xxxx000*000
+               n = 0;
+               for (;;) {
+                  if ((fl & STBSP__TRIPLET_COMMA) && (++cs == 4)) {
+                     cs = 0;
+                     *s++ = stbsp__comma;
+                  } else {
+                     *s++ = sn[n];
+                     ++n;
+                     if (n >= (stbsp__uint32)dp)
+                        break;
+                  }
+               }
+               cs = (int)(s - (num + 64)) + (3 << 24); // cs is how many tens
+               if (pr)
+                  *s++ = stbsp__period;
+               if ((l - dp) > (stbsp__uint32)pr)
+                  l = pr + dp;
+               while (n < l) {
+                  *s++ = sn[n];
+                  ++n;
+               }
+               tz = pr - (l - dp);
+            }
+         }
+         pr = 0;
+
+         // handle k,m,g,t
+         if (fl & STBSP__METRIC_SUFFIX) {
+            char idx;
+            idx = 1;
+            if (fl & STBSP__METRIC_NOSPACE)
+               idx = 0;
+            tail[0] = idx;
+            tail[1] = ' ';
+            {
+               if (fl >> 24) { // SI kilo is 'k', JEDEC and SI kibits are 'K'.
+                  if (fl & STBSP__METRIC_1024)
+                     tail[idx + 1] = "_KMGT"[fl >> 24];
+                  else
+                     tail[idx + 1] = "_kMGT"[fl >> 24];
+                  idx++;
+                  // If printing kibits and not in jedec, add the 'i'.
+                  if (fl & STBSP__METRIC_1024 && !(fl & STBSP__METRIC_JEDEC)) {
+                     tail[idx + 1] = 'i';
+                     idx++;
+                  }
+                  tail[0] = idx;
+               }
+            }
+         };
+
+      flt_lead:
+         // get the length that we copied
+         l = (stbsp__uint32)(s - (num + 64));
+         s = num + 64;
+         goto scopy;
+#endif
+
+      case 'B': // upper binary
+      case 'b': // lower binary
+         h = (f[0] == 'B') ? hexu : hex;
+         lead[0] = 0;
+         if (fl & STBSP__LEADING_0X) {
+            lead[0] = 2;
+            lead[1] = '0';
+            lead[2] = h[0xb];
+         }
+         l = (8 << 4) | (1 << 8);
+         goto radixnum;
+
+      case 'o': // octal
+         h = hexu;
+         lead[0] = 0;
+         if (fl & STBSP__LEADING_0X) {
+            lead[0] = 1;
+            lead[1] = '0';
+         }
+         l = (3 << 4) | (3 << 8);
+         goto radixnum;
+
+      case 'p': // pointer
+         fl |= (sizeof(void *) == 8) ? STBSP__INTMAX : 0;
+         pr = sizeof(void *) * 2;
+         fl &= ~STBSP__LEADINGZERO; // 'p' only prints the pointer with zeros
+                                    // fall through - to X
+
+      case 'X': // upper hex
+      case 'x': // lower hex
+         h = (f[0] == 'X') ? hexu : hex;
+         l = (4 << 4) | (4 << 8);
+         lead[0] = 0;
+         if (fl & STBSP__LEADING_0X) {
+            lead[0] = 2;
+            lead[1] = '0';
+            lead[2] = h[16];
+         }
+      radixnum:
+         // get the number
+         if (fl & STBSP__INTMAX)
+            n64 = va_arg(va, stbsp__uint64);
+         else
+            n64 = va_arg(va, stbsp__uint32);
+
+         s = num + STBSP__NUMSZ;
+         dp = 0;
+         // clear tail, and clear leading if value is zero
+         tail[0] = 0;
+         if (n64 == 0) {
+            lead[0] = 0;
+            if (pr == 0) {
+               l = 0;
+               cs = 0;
+               goto scopy;
+            }
+         }
+         // convert to string
+         for (;;) {
+            *--s = h[n64 & ((1 << (l >> 8)) - 1)];
+            n64 >>= (l >> 8);
+            if (!((n64) || ((stbsp__int32)((num + STBSP__NUMSZ) - s) < pr)))
+               break;
+            if (fl & STBSP__TRIPLET_COMMA) {
+               ++l;
+               if ((l & 15) == ((l >> 4) & 15)) {
+                  l &= ~15;
+                  *--s = stbsp__comma;
+               }
+            }
+         };
+         // get the tens and the comma pos
+         cs = (stbsp__uint32)((num + STBSP__NUMSZ) - s) + ((((l >> 4) & 15)) << 24);
+         // get the length that we copied
+         l = (stbsp__uint32)((num + STBSP__NUMSZ) - s);
+         // copy it
+         goto scopy;
+
+      case 'u': // unsigned
+      case 'i':
+      case 'd': // integer
+         // get the integer and abs it
+         if (fl & STBSP__INTMAX) {
+            stbsp__int64 i64 = va_arg(va, stbsp__int64);
+            n64 = (stbsp__uint64)i64;
+            if ((f[0] != 'u') && (i64 < 0)) {
+               n64 = (stbsp__uint64)-i64;
+               fl |= STBSP__NEGATIVE;
+            }
+         } else {
+            stbsp__int32 i = va_arg(va, stbsp__int32);
+            n64 = (stbsp__uint32)i;
+            if ((f[0] != 'u') && (i < 0)) {
+               n64 = (stbsp__uint32)-i;
+               fl |= STBSP__NEGATIVE;
+            }
+         }
+
+#ifndef STB_SPRINTF_NOFLOAT
+         if (fl & STBSP__METRIC_SUFFIX) {
+            if (n64 < 1024)
+               pr = 0;
+            else if (pr == -1)
+               pr = 1;
+            fv = (double)(stbsp__int64)n64;
+            goto doafloat;
+         }
+#endif
+
+         // convert to string
+         s = num + STBSP__NUMSZ;
+         l = 0;
+
+         for (;;) {
+            // do in 32-bit chunks (avoid lots of 64-bit divides even with constant denominators)
+            char *o = s - 8;
+            if (n64 >= 100000000) {
+               n = (stbsp__uint32)(n64 % 100000000);
+               n64 /= 100000000;
+            } else {
+               n = (stbsp__uint32)n64;
+               n64 = 0;
+            }
+            if ((fl & STBSP__TRIPLET_COMMA) == 0) {
+               do {
+                  s -= 2;
+                  *(stbsp__uint16 *)s = *(stbsp__uint16 *)&stbsp__digitpair.pair[(n % 100) * 2];
+                  n /= 100;
+               } while (n);
+            }
+            while (n) {
+               if ((fl & STBSP__TRIPLET_COMMA) && (l++ == 3)) {
+                  l = 0;
+                  *--s = stbsp__comma;
+                  --o;
+               } else {
+                  *--s = (char)(n % 10) + '0';
+                  n /= 10;
+               }
+            }
+            if (n64 == 0) {
+               if ((s[0] == '0') && (s != (num + STBSP__NUMSZ)))
+                  ++s;
+               break;
+            }
+            while (s != o)
+               if ((fl & STBSP__TRIPLET_COMMA) && (l++ == 3)) {
+                  l = 0;
+                  *--s = stbsp__comma;
+                  --o;
+               } else {
+                  *--s = '0';
+               }
+         }
+
+         tail[0] = 0;
+         stbsp__lead_sign(fl, lead);
+
+         // get the length that we copied
+         l = (stbsp__uint32)((num + STBSP__NUMSZ) - s);
+         if (l == 0) {
+            *--s = '0';
+            l = 1;
+         }
+         cs = l + (3 << 24);
+         if (pr < 0)
+            pr = 0;
+
+      scopy:
+         // get fw=leading/trailing space, pr=leading zeros
+         if (pr < (stbsp__int32)l)
+            pr = l;
+         n = pr + lead[0] + tail[0] + tz;
+         if (fw < (stbsp__int32)n)
+            fw = n;
+         fw -= n;
+         pr -= l;
+
+         // handle right justify and leading zeros
+         if ((fl & STBSP__LEFTJUST) == 0) {
+            if (fl & STBSP__LEADINGZERO) // if leading zeros, everything is in pr
+            {
+               pr = (fw > pr) ? fw : pr;
+               fw = 0;
+            } else {
+               fl &= ~STBSP__TRIPLET_COMMA; // if no leading zeros, then no commas
+            }
+         }
+
+         // copy the spaces and/or zeros
+         if (fw + pr) {
+            stbsp__int32 i;
+            stbsp__uint32 c;
+
+            // copy leading spaces (or when doing %8.4d stuff)
+            if ((fl & STBSP__LEFTJUST) == 0)
+               while (fw > 0) {
+                  stbsp__cb_buf_clamp(i, fw);
+                  fw -= i;
+                  while (i) {
+                     if ((((stbsp__uintptr)bf) & 3) == 0)
+                        break;
+                     *bf++ = ' ';
+                     --i;
+                  }
+                  while (i >= 4) {
+                     *(stbsp__uint32 *)bf = 0x20202020;
+                     bf += 4;
+                     i -= 4;
+                  }
+                  while (i) {
+                     *bf++ = ' ';
+                     --i;
+                  }
+                  stbsp__chk_cb_buf(1);
+               }
+
+            // copy leader
+            sn = lead + 1;
+            while (lead[0]) {
+               stbsp__cb_buf_clamp(i, lead[0]);
+               lead[0] -= (char)i;
+               while (i) {
+                  *bf++ = *sn++;
+                  --i;
+               }
+               stbsp__chk_cb_buf(1);
+            }
+
+            // copy leading zeros
+            c = cs >> 24;
+            cs &= 0xffffff;
+            cs = (fl & STBSP__TRIPLET_COMMA) ? ((stbsp__uint32)(c - ((pr + cs) % (c + 1)))) : 0;
+            while (pr > 0) {
+               stbsp__cb_buf_clamp(i, pr);
+               pr -= i;
+               if ((fl & STBSP__TRIPLET_COMMA) == 0) {
+                  while (i) {
+                     if ((((stbsp__uintptr)bf) & 3) == 0)
+                        break;
+                     *bf++ = '0';
+                     --i;
+                  }
+                  while (i >= 4) {
+                     *(stbsp__uint32 *)bf = 0x30303030;
+                     bf += 4;
+                     i -= 4;
+                  }
+               }
+               while (i) {
+                  if ((fl & STBSP__TRIPLET_COMMA) && (cs++ == c)) {
+                     cs = 0;
+                     *bf++ = stbsp__comma;
+                  } else
+                     *bf++ = '0';
+                  --i;
+               }
+               stbsp__chk_cb_buf(1);
+            }
+         }
+
+         // copy leader if there is still one
+         sn = lead + 1;
+         while (lead[0]) {
+            stbsp__int32 i;
+            stbsp__cb_buf_clamp(i, lead[0]);
+            lead[0] -= (char)i;
+            while (i) {
+               *bf++ = *sn++;
+               --i;
+            }
+            stbsp__chk_cb_buf(1);
+         }
+
+         // copy the string
+         n = l;
+         while (n) {
+            stbsp__int32 i;
+            stbsp__cb_buf_clamp(i, n);
+            n -= i;
+            STBSP__UNALIGNED(while (i >= 4) {
+               *(stbsp__uint32 volatile *)bf = *(stbsp__uint32 volatile *)s;
+               bf += 4;
+               s += 4;
+               i -= 4;
+            })
+            while (i) {
+               *bf++ = *s++;
+               --i;
+            }
+            stbsp__chk_cb_buf(1);
+         }
+
+         // copy trailing zeros
+         while (tz) {
+            stbsp__int32 i;
+            stbsp__cb_buf_clamp(i, tz);
+            tz -= i;
+            while (i) {
+               if ((((stbsp__uintptr)bf) & 3) == 0)
+                  break;
+               *bf++ = '0';
+               --i;
+            }
+            while (i >= 4) {
+               *(stbsp__uint32 *)bf = 0x30303030;
+               bf += 4;
+               i -= 4;
+            }
+            while (i) {
+               *bf++ = '0';
+               --i;
+            }
+            stbsp__chk_cb_buf(1);
+         }
+
+         // copy tail if there is one
+         sn = tail + 1;
+         while (tail[0]) {
+            stbsp__int32 i;
+            stbsp__cb_buf_clamp(i, tail[0]);
+            tail[0] -= (char)i;
+            while (i) {
+               *bf++ = *sn++;
+               --i;
+            }
+            stbsp__chk_cb_buf(1);
+         }
+
+         // handle the left justify
+         if (fl & STBSP__LEFTJUST)
+            if (fw > 0) {
+               while (fw) {
+                  stbsp__int32 i;
+                  stbsp__cb_buf_clamp(i, fw);
+                  fw -= i;
+                  while (i) {
+                     if ((((stbsp__uintptr)bf) & 3) == 0)
+                        break;
+                     *bf++ = ' ';
+                     --i;
+                  }
+                  while (i >= 4) {
+                     *(stbsp__uint32 *)bf = 0x20202020;
+                     bf += 4;
+                     i -= 4;
+                  }
+                  while (i--)
+                     *bf++ = ' ';
+                  stbsp__chk_cb_buf(1);
+               }
+            }
+         break;
+
+      default: // unknown, just copy code
+         s = num + STBSP__NUMSZ - 1;
+         *s = f[0];
+         l = 1;
+         fw = fl = 0;
+         lead[0] = 0;
+         tail[0] = 0;
+         pr = 0;
+         dp = 0;
+         cs = 0;
+         goto scopy;
+      }
+      ++f;
+   }
+endfmt:
+
+   if (!callback)
+      *bf = 0;
+   else
+      stbsp__flush_cb();
+
+done:
+   return tlen + (int)(bf - buf);
+}
+
+// cleanup: undefine the flag bits, STBSP__NUMSZ and the callback-buffer helper macros used by the formatter above
+#undef STBSP__LEFTJUST
+#undef STBSP__LEADINGPLUS
+#undef STBSP__LEADINGSPACE
+#undef STBSP__LEADING_0X
+#undef STBSP__LEADINGZERO
+#undef STBSP__INTMAX
+#undef STBSP__TRIPLET_COMMA
+#undef STBSP__NEGATIVE
+#undef STBSP__METRIC_SUFFIX
+#undef STBSP__NUMSZ
+#undef stbsp__chk_cb_bufL
+#undef stbsp__chk_cb_buf
+#undef stbsp__flush_cb
+#undef stbsp__cb_buf_clamp
+
+// ============================================================================
+//   wrapper functions
+
+STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(sprintf)(char *buf, char const *fmt, ...)
+{
+   va_list args; // the variadic arguments that follow fmt
+   int written;  // value reported by vsprintfcb
+   va_start(args, fmt);
+   written = STB_SPRINTF_DECORATE(vsprintfcb)(0, 0, buf, fmt, args); // no callback, no user data; buf is the output buffer
+   va_end(args);
+   return written;
+}
+
+typedef struct stbsp__context { // state shared between vsnprintf and its callbacks
+   char *buf; // next write position inside the caller's destination
+   int count; // bytes of destination space still available
+   int length; // running sum of every chunk length reported to the callbacks
+   char tmp[STB_SPRINTF_MIN]; // scratch area handed to the formatter when the destination cannot be used directly
+} stbsp__context;
+
+// vsprintfcb callback behind vsnprintf: copies one chunk into the destination, clamped to the room left.
+static char *stbsp__clamp_callback(const char *buf, void *user, int len)
+{
+   stbsp__context *ctx = (stbsp__context *)user;
+   int take = (len > ctx->count) ? ctx->count : len; // bytes that still fit
+   ctx->length += len; // tally the full, unclamped chunk length
+
+   if (take != 0) {
+      // when buf already is the destination cursor the bytes are in place
+      if (buf != ctx->buf) {
+         const char *src = buf;
+         const char *const stop = buf + take;
+         char *dst = ctx->buf;
+         do {
+            *dst++ = *src++;
+         } while (src < stop);
+      }
+      ctx->buf += take;
+      ctx->count -= take;
+   }
+
+   // next write area: the destination itself while at least STB_SPRINTF_MIN
+   // bytes remain (this also rules out count <= 0), otherwise the scratch buffer
+   if (ctx->count >= STB_SPRINTF_MIN)
+      return ctx->buf;
+   return ctx->tmp;
+}
+
+// Measuring-only vsprintfcb callback: adds len to the running total and keeps none of the bytes.
+static char * stbsp__count_clamp_callback( const char * buf, void * user, int len )
+{
+   stbsp__context * ctx = (stbsp__context*)user;
+   (void) sizeof(buf); // buf is deliberately unused
+   ctx->length += len;
+   return ctx->tmp; // the scratch buffer is handed back every time
+}
+
+// Bounded formatter: stores at most count-1 characters plus a terminating NUL in buf and
+// returns c.length, the sum of all chunk lengths the callbacks were given (not clamped to count).
+STBSP__PUBLICDEF int STB_SPRINTF_DECORATE( vsnprintf )( char * buf, int count, char const * fmt, va_list va )
+{
+   stbsp__context c;
+   // No usable destination (NULL buf or count <= 0): measure only and never touch buf.
+   // With count == 0 and a non-NULL buf the terminator below would otherwise land at buf[-1].
+   if ( (count <= 0) || !buf )
+   {
+      c.length = 0;
+      STB_SPRINTF_DECORATE( vsprintfcb )( stbsp__count_clamp_callback, &c, c.tmp, fmt, va );
+   }
+   else
+   {
+      int l;
+      c.buf = buf;
+      c.count = count;
+      c.length = 0;
+      // the priming callback call picks the first write area (buf itself or c.tmp)
+      STB_SPRINTF_DECORATE( vsprintfcb )( stbsp__clamp_callback, &c, stbsp__clamp_callback(0,&c,0), fmt, va );
+
+      // zero-terminate
+      l = (int)( c.buf - buf );
+      if ( l >= count ) // should never be greater, only equal (or less) than count
+         l = count - 1;
+      buf[l] = 0;
+   }
+   return c.length;
+}
+
+// Variadic front end: packs the arguments after fmt into a va_list and forwards everything to vsnprintf.
+STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(snprintf)(char *buf, int count, char const *fmt, ...)
+{
+   va_list args;
+   int written;
+
+   va_start(args, fmt);
+   written = STB_SPRINTF_DECORATE(vsnprintf)(buf, count, fmt, args);
+   va_end(args);
+   return written; // whatever vsnprintf reported
+}
+
+STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintf)(char *buf, char const *fmt, va_list va)
+{
+   return STB_SPRINTF_DECORATE(vsprintfcb)(0, 0, buf, fmt, va); // no callback, no user data; buf is the output buffer
+}
+
+// =======================================================================
+//   low level float utility functions
+
+#ifndef STB_SPRINTF_NOFLOAT
+
+// copies 8 bytes from src to dest through char pointers, sidestepping strict-aliasing issues (this compiles to nothing on /Ox)
+#define STBSP__COPYFP(dest, src)                   \
+   {                                               \
+      int cn;                                      \
+      for (cn = 0; cn < 8; cn++)                   \
+         ((char *)&dest)[cn] = ((char *)&src)[cn]; \
+   }
+
+// Splits value into the fields of an IEEE-754 binary64 (the layout this code assumes): *bits gets the
+// 52 fraction bits, *expo the exponent field minus its 1023 bias; returns the sign bit (0 or 1).
+static stbsp__int32 stbsp__real_to_parts(stbsp__int64 *bits, stbsp__int32 *expo, double value)
+{
+   stbsp__int64 raw = 0;
+   stbsp__uint64 const frac_mask = (((stbsp__uint64)1) << 52) - 1;
+
+   // fetch the bit pattern byte by byte instead of through a pointer cast
+   STBSP__COPYFP(raw, value);
+
+   *bits = raw & frac_mask;
+   // 2047 masks the 11-bit exponent field that sits directly above the fraction
+   *expo = (stbsp__int32)(((raw >> 52) & 2047) - 1023);
+
+   return (stbsp__int32)((stbsp__uint64) raw >> 63);
+}
+
+static double const stbsp__bot[23] = { // 10^0 through 10^22
+   1e+000, 1e+001, 1e+002, 1e+003, 1e+004, 1e+005, 1e+006, 1e+007, 1e+008, 1e+009, 1e+010, 1e+011,
+   1e+012, 1e+013, 1e+014, 1e+015, 1e+016, 1e+017, 1e+018, 1e+019, 1e+020, 1e+021, 1e+022
+};
+static double const stbsp__negbot[22] = { // 10^-1 through 10^-22
+   1e-001, 1e-002, 1e-003, 1e-004, 1e-005, 1e-006, 1e-007, 1e-008, 1e-009, 1e-010, 1e-011,
+   1e-012, 1e-013, 1e-014, 1e-015, 1e-016, 1e-017, 1e-018, 1e-019, 1e-020, 1e-021, 1e-022
+};
+static double const stbsp__negboterr[22] = { // one entry per stbsp__negbot slot; presumably that entry's representation error - TODO confirm
+   -5.551115123125783e-018,  -2.0816681711721684e-019, -2.0816681711721686e-020, -4.7921736023859299e-021, -8.1803053914031305e-022, 4.5251888174113741e-023,
+   4.5251888174113739e-024,  -2.0922560830128471e-025, -6.2281591457779853e-026, -3.6432197315497743e-027, 6.0503030718060191e-028,  2.0113352370744385e-029,
+   -3.0373745563400371e-030, 1.1806906454401013e-032,  -7.7705399876661076e-032, 2.0902213275965398e-033,  -7.1542424054621921e-034, -7.1542424054621926e-035,
+   2.4754073164739869e-036,  5.4846728545790429e-037,  9.2462547772103625e-038,  -4.8596774326570872e-039
+};
+static double const stbsp__top[13] = { // 10^23, 10^46, ..., 10^299 (exponent grows by 23 per entry)
+   1e+023, 1e+046, 1e+069, 1e+092, 1e+115, 1e+138, 1e+161, 1e+184, 1e+207, 1e+230, 1e+253, 1e+276, 1e+299
+};
+static double const stbsp__negtop[13] = { // 10^-23, 10^-46, ..., 10^-299 (exponent shrinks by 23 per entry)
+   1e-023, 1e-046, 1e-069, 1e-092, 1e-115, 1e-138, 1e-161, 1e-184, 1e-207, 1e-230, 1e-253, 1e-276, 1e-299
+};
+static double const stbsp__toperr[13] = { // one entry per stbsp__top slot; presumably that entry's representation error - TODO confirm
+   8388608,
+   6.8601809640529717e+028,
+   -7.253143638152921e+052,
+   -4.3377296974619174e+075,
+   -1.5559416129466825e+098,
+   -3.2841562489204913e+121,
+   -3.7745893248228135e+144,
+   -1.7356668416969134e+167,
+   -3.8893577551088374e+190,
+   -9.9566444326005119e+213,
+   6.3641293062232429e+236,
+   -5.2069140800249813e+259,
+   -5.2504760255204387e+282
+};
+static double const stbsp__negtoperr[13] = { // one entry per stbsp__negtop slot; presumably that entry's representation error - TODO confirm
+   3.9565301985100693e-040,  -2.299904345391321e-063,  3.6506201437945798e-086,  1.1875228833981544e-109,
+   -5.0644902316928607e-132, -6.7156837247865426e-155, -2.812077463003139e-178,  -5.7778912386589953e-201,
+   7.4997100559334532e-224,  -4.6439668915134491e-247, -6.3691100762962136e-270, -9.436808465446358e-293,
+   8.0970921678014997e-317
+};
+
+#if defined(_MSC_VER) && (_MSC_VER <= 1200) // old MSVC: same table, written without ULL suffixes (presumably unsupported there - TODO confirm)
+static stbsp__uint64 const stbsp__powten[20] = { // 10^0 through 10^19
+   1,
+   10,
+   100,
+   1000,
+   10000,
+   100000,
+   1000000,
+   10000000,
+   100000000,
+   1000000000,
+   10000000000,
+   100000000000,
+   1000000000000,
+   10000000000000,
+   100000000000000,
+   1000000000000000,
+   10000000000000000,
+   100000000000000000,
+   1000000000000000000,
+   10000000000000000000U
+};
+#define stbsp__tento19th ((stbsp__uint64)1000000000000000000) // same digits as the stbsp__powten[18] entry
+#else
+static stbsp__uint64 const stbsp__powten[20] = { // 10^0 through 10^19
+   1,
+   10,
+   100,
+   1000,
+   10000,
+   100000,
+   1000000,
+   10000000,
+   100000000,
+   1000000000,
+   10000000000ULL,
+   100000000000ULL,
+   1000000000000ULL,
+   10000000000000ULL,
+   100000000000000ULL,
+   1000000000000000ULL,
+   10000000000000000ULL,
+   100000000000000000ULL,
+   1000000000000000000ULL,
+   10000000000000000000ULL
+};
+#define stbsp__tento19th (1000000000000000000ULL) // same digits as the stbsp__powten[18] entry
+#endif
+// stbsp__ddmulthi: oh = xh * yh; ol = (sum of the partial products of xh and yh, each split by clearing the low 27 bits of its bit pattern) - oh. Declares ahi/alo/bhi/blo/bt in its own scope, so do not pass arguments with those names.
+#define stbsp__ddmulthi(oh, ol, xh, yh)                            \
+   {                                                               \
+      double ahi = 0, alo, bhi = 0, blo;                           \
+      stbsp__int64 bt;                                             \
+      oh = xh * yh;                                                \
+      STBSP__COPYFP(bt, xh);                                       \
+      bt &= ((~(stbsp__uint64)0) << 27);                           \
+      STBSP__COPYFP(ahi, bt);                                      \
+      alo = xh - ahi;                                              \
+      STBSP__COPYFP(bt, yh);                                       \
+      bt &= ((~(stbsp__uint64)0) << 27);                           \
+      STBSP__COPYFP(bhi, bt);                                      \
+      blo = yh - bhi;                                              \
+      ol = ((ahi * bhi - oh) + ahi * blo + alo * bhi) + alo * blo; \
+   }
+// stbsp__ddtoS64: converts the pair (xh, xl) to a 64-bit integer in ob: truncates xh first, then adds the truncated sum of the leftover of xh and xl. Declares ahi/alo/vh/t in its own scope.
+#define stbsp__ddtoS64(ob, xh, xl)          \
+   {                                        \
+      double ahi = 0, alo, vh, t;           \
+      ob = (stbsp__int64)xh;                \
+      vh = (double)ob;                      \
+      ahi = (xh - vh);                      \
+      t = (ahi - xh);                       \
+      alo = (xh - (ahi - t)) - (vh + t);    \
+      ob += (stbsp__int64)(ahi + alo + xl); \
+   }
+// stbsp__ddrenorm: renormalizes the pair in place: oh becomes oh + ol, and ol becomes ol minus the part of it that the sum absorbed. Declares s in its own scope.
+#define stbsp__ddrenorm(oh, ol) \
+   {                            \
+      double s;                 \
+      s = oh + ol;              \
+      ol = ol - (s - oh);       \
+      oh = s;                   \
+   }
+// stbsp__ddmultlo: adds the cross terms xh * yl + xl * yh into ol; oh is accepted but unused. The expansion already ends in ';'.
+#define stbsp__ddmultlo(oh, ol, xh, xl, yh, yl) ol = ol + (xh * yl + xl * yh);
+// stbsp__ddmultlos: single cross term, ol += xh * yl; oh is accepted but unused. The expansion already ends in ';'.
+#define stbsp__ddmultlos(oh, ol, xh, yl) ol = ol + (xh * yl);
+// Multiplies d by a power of ten selected by 'power', keeping the result as a high/low pair of doubles in *ohi / *olo. NOTE(review): stbsp__bot, stbsp__negbot and stbsp__negboterr are defined outside this excerpt; presumably the small-power tables -- confirm.
+static void stbsp__raise_to_power10(double *ohi, double *olo, double d, stbsp__int32 power) // power can be -323 to +350
+{
+   double ph, pl;
+   if ((power >= 0) && (power <= 22)) { // small non-negative power: one multiply by stbsp__bot[power]
+      stbsp__ddmulthi(ph, pl, d, stbsp__bot[power]);
+   } else {
+      stbsp__int32 e, et, eb;
+      double p2h, p2l;
+
+      e = power; // e = |power|
+      if (power < 0)
+         e = -e;
+      et = (e * 0x2c9) >> 14; /* multiply-shift form of e / 23 (0x2c9 / 2^14 ~= 1/23) */
+      if (et > 13) // the top tables only have 13 entries
+         et = 13;
+      eb = e - (et * 23); // what is left after removing et steps of 23
+
+      ph = d;
+      pl = 0.0;
+      if (power < 0) {
+         if (eb) {
+            --eb; // the negative tables are indexed from 0 for a remainder of 1
+            stbsp__ddmulthi(ph, pl, d, stbsp__negbot[eb]);
+            stbsp__ddmultlos(ph, pl, d, stbsp__negboterr[eb]);
+         }
+         if (et) {
+            stbsp__ddrenorm(ph, pl);
+            --et; // stbsp__negtop[0] is 1e-23
+            stbsp__ddmulthi(p2h, p2l, ph, stbsp__negtop[et]);
+            stbsp__ddmultlo(p2h, p2l, ph, pl, stbsp__negtop[et], stbsp__negtoperr[et]);
+            ph = p2h;
+            pl = p2l;
+         }
+      } else {
+         if (eb) {
+            e = eb; // e is reused: after the cap below it holds the part of the remainder above 22
+            if (eb > 22)
+               eb = 22;
+            e -= eb;
+            stbsp__ddmulthi(ph, pl, d, stbsp__bot[eb]);
+            if (e) { // second multiply for the excess over 22
+               stbsp__ddrenorm(ph, pl);
+               stbsp__ddmulthi(p2h, p2l, ph, stbsp__bot[e]);
+               stbsp__ddmultlos(p2h, p2l, stbsp__bot[e], pl);
+               ph = p2h;
+               pl = p2l;
+            }
+         }
+         if (et) {
+            stbsp__ddrenorm(ph, pl);
+            --et; // stbsp__top[0] is 1e23
+            stbsp__ddmulthi(p2h, p2l, ph, stbsp__top[et]);
+            stbsp__ddmultlo(p2h, p2l, ph, pl, stbsp__top[et], stbsp__toperr[et]);
+            ph = p2h;
+            pl = p2l;
+         }
+      }
+   }
+   stbsp__ddrenorm(ph, pl); // renormalize once more before storing the pair
+   *ohi = ph;
+   *olo = pl;
+}
+
+// Given a double, stores its significant decimal digits as a string in *start / *len, stores the
+//   position of the decimal point in *decimal_pos, and returns 1 if the sign bit is set (0 otherwise).
+//   +/-Inf and NaN are reported by storing STBSP__SPECIAL in *decimal_pos. Digits are written backwards from out+64.
+// frac_digits is normally an absolute count of digits after the decimal point; with 0x80000000 or'd in (for %g and %e), the low 31 bits plus one give the number of digits to keep from the first significant digit
+static stbsp__int32 stbsp__real_to_str(char const **start, stbsp__uint32 *len, char *out, stbsp__int32 *decimal_pos, double value, stbsp__uint32 frac_digits)
+{
+   double d;
+   stbsp__int64 bits = 0;
+   stbsp__int32 expo, e, ng, tens;
+
+   d = value;
+   STBSP__COPYFP(bits, d);
+   expo = (stbsp__int32)((bits >> 52) & 2047); // 11-bit exponent field
+   ng = (stbsp__int32)((stbsp__uint64) bits >> 63); // sign bit
+   if (ng)
+      d = -d;
+
+   if (expo == 2047) // is nan or inf?
+   {
+      *start = (bits & ((((stbsp__uint64)1) << 52) - 1)) ? "NaN" : "Inf";
+      *decimal_pos = STBSP__SPECIAL;
+      *len = 3;
+      return ng;
+   }
+
+   if (expo == 0) // is zero or denormal
+   {
+      if (((stbsp__uint64) bits << 1) == 0) // do zero
+      {
+         *decimal_pos = 1;
+         *start = out;
+         out[0] = '0';
+         *len = 1;
+         return ng;
+      }
+      // find the right expo for denormals
+      {
+         stbsp__int64 v = ((stbsp__uint64)1) << 51;
+         while ((bits & v) == 0) {
+            --expo;
+            v >>= 1;
+         }
+      }
+   }
+
+   // find the decimal exponent as well as the decimal bits of the value
+   {
+      double ph, pl;
+
+      // log10 estimate - very specifically tweaked to hit or undershoot by no more than 1 of log10 of all expos 1..2046
+      tens = expo - 1023;
+      tens = (tens < 0) ? ((tens * 617) / 2048) : (((tens * 1233) / 4096) + 1);
+
+      // move the significant bits into position and stick them into an int
+      stbsp__raise_to_power10(&ph, &pl, d, 18 - tens);
+
+      // get as much precision from the double-double as possible
+      stbsp__ddtoS64(bits, ph, pl);
+
+      // check if we undershot
+      if (((stbsp__uint64)bits) >= stbsp__tento19th)
+         ++tens;
+   }
+
+   // now do the rounding in integer land; the full low 31 bits are kept so that a very large precision is not truncated
+   frac_digits = (frac_digits & 0x80000000) ? ((frac_digits & 0x7fffffff) + 1) : (tens + frac_digits);
+   if ((frac_digits < 24)) {
+      stbsp__uint32 dg = 1; // dg ends up as the number of decimal digits in bits
+      if ((stbsp__uint64)bits >= stbsp__powten[9])
+         dg = 10;
+      while ((stbsp__uint64)bits >= stbsp__powten[dg]) {
+         ++dg;
+         if (dg == 20)
+            goto noround;
+      }
+      if (frac_digits < dg) {
+         stbsp__uint64 r;
+         // add 0.5 at the right position and round
+         e = dg - frac_digits;
+         if ((stbsp__uint32)e >= 24)
+            goto noround;
+         r = stbsp__powten[e];
+         bits = bits + (r / 2);
+         if ((stbsp__uint64)bits >= stbsp__powten[dg]) // rounding carried into a new leading digit
+            ++tens;
+         bits /= r;
+      }
+   noround:;
+   }
+
+   // kill long trailing runs of zeros
+   if (bits) {
+      stbsp__uint32 n;
+      for (;;) {
+         if (bits <= 0xffffffff)
+            break;
+         if (bits % 1000)
+            goto donez;
+         bits /= 1000;
+      }
+      n = (stbsp__uint32)bits;
+      while ((n % 1000) == 0)
+         n /= 1000;
+      bits = n;
+   donez:;
+   }
+
+   // convert to string
+   out += 64; // digits are written backwards from here, so 'out' must provide at least 64 bytes
+   e = 0; // e now counts the characters produced
+   for (;;) {
+      stbsp__uint32 n;
+      char *o = out - 8;
+      // do the conversion in chunks of U32s (avoid most 64-bit divides, worth it, constant denominators be damned)
+      if (bits >= 100000000) {
+         n = (stbsp__uint32)(bits % 100000000);
+         bits /= 100000000;
+      } else {
+         n = (stbsp__uint32)bits;
+         bits = 0;
+      }
+      while (n) {
+         out -= 2;
+         *(stbsp__uint16 *)out = *(stbsp__uint16 *)&stbsp__digitpair.pair[(n % 100) * 2];
+         n /= 100;
+         e += 2;
+      }
+      if (bits == 0) {
+         if ((e) && (out[0] == '0')) { // digits are emitted in pairs, so drop a leading '0'
+            ++out;
+            --e;
+         }
+         break;
+      }
+      while (out != o) { // zero-fill the rest of this 8-digit chunk
+         *--out = '0';
+         ++e;
+      }
+   }
+
+   *decimal_pos = tens;
+   *start = out;
+   *len = e;
+   return ng;
+}
+
+#undef stbsp__ddmulthi
+#undef stbsp__ddtoS64 // helper macro used only by stbsp__real_to_str; undefine it with the others so it does not leak to the includer
+#undef stbsp__ddrenorm
+#undef stbsp__ddmultlo
+#undef stbsp__ddmultlos
+#undef STBSP__SPECIAL
+#undef STBSP__COPYFP
+#endif // STB_SPRINTF_NOFLOAT
+
+// clean up
+#undef stbsp__uint16
+#undef stbsp__uint32
+#undef stbsp__int32
+#undef stbsp__uint64
+#undef stbsp__int64
+#undef STBSP__UNALIGNED
+
+#endif // STB_SPRINTF_IMPLEMENTATION
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_textedit.h b/lib/stb/stb_textedit.h
new file mode 100644
index 0000000..1442493
--- /dev/null
+++ b/lib/stb/stb_textedit.h
@@ -0,0 +1,1429 @@
+// stb_textedit.h - v1.14  - public domain - Sean Barrett
+// Development of this library was sponsored by RAD Game Tools
+//
+// This C header file implements the guts of a multi-line text-editing
+// widget; you implement display, word-wrapping, and low-level string
+// insertion/deletion, and stb_textedit will map user inputs into
+// insertions & deletions, plus updates to the cursor position,
+// selection state, and undo state.
+//
+// It is intended for use in games and other systems that need to build
+// their own custom widgets and which do not have heavy text-editing
+// requirements (this library is not recommended for use for editing large
+// texts, as its performance does not scale and it has limited undo).
+//
+// Non-trivial behaviors are modelled after Windows text controls.
+//
+//
+// LICENSE
+//
+// See end of file for license information.
+//
+//
+// DEPENDENCIES
+//
+// Uses the C runtime function 'memmove', which you can override
+// by defining STB_TEXTEDIT_memmove before the implementation.
+// Uses no other functions. Performs no runtime allocations.
+//
+//
+// VERSION HISTORY
+//
+//   1.14 (2021-07-11) page up/down, various fixes
+//   1.13 (2019-02-07) fix bug in undo size management
+//   1.12 (2018-01-29) user can change STB_TEXTEDIT_KEYTYPE, fix redo to avoid crash
+//   1.11 (2017-03-03) fix HOME on last line, dragging off single-line textfield
+//   1.10 (2016-10-25) suppress warnings about casting away const with -Wcast-qual
+//   1.9  (2016-08-27) customizable move-by-word
+//   1.8  (2016-04-02) better keyboard handling when mouse button is down
+//   1.7  (2015-09-13) change y range handling in case baseline is non-0
+//   1.6  (2015-04-15) allow STB_TEXTEDIT_memmove
+//   1.5  (2014-09-10) add support for secondary keys for OS X
+//   1.4  (2014-08-17) fix signed/unsigned warnings
+//   1.3  (2014-06-19) fix mouse clicking to round to nearest char boundary
+//   1.2  (2014-05-27) fix some RAD types that had crept into the new code
+//   1.1  (2013-12-15) move-by-word (requires STB_TEXTEDIT_IS_SPACE )
+//   1.0  (2012-07-26) improve documentation, initial public release
+//   0.3  (2012-02-24) bugfixes, single-line mode; insert mode
+//   0.2  (2011-11-28) fixes to undo/redo
+//   0.1  (2010-07-08) initial version
+//
+// ADDITIONAL CONTRIBUTORS
+//
+//   Ulf Winklemann: move-by-word in 1.1
+//   Fabian Giesen: secondary key inputs in 1.5
+//   Martins Mozeiko: STB_TEXTEDIT_memmove in 1.6
+//   Louis Schnellbach: page up/down in 1.14
+//
+//   Bugfixes:
+//      Scott Graham
+//      Daniel Keller
+//      Omar Cornut
+//      Dan Thompson
+//
+// USAGE
+//
+// This file behaves differently depending on what symbols you define
+// before including it.
+//
+//
+// Header-file mode:
+//
+//   If you do not define STB_TEXTEDIT_IMPLEMENTATION before including this,
+//   it will operate in "header file" mode. In this mode, it declares a
+//   single public symbol, STB_TexteditState, which encapsulates the current
+//   state of a text widget (except for the string, which you will store
+//   separately).
+//
+//   In this mode you should define STB_TEXTEDIT_CHARTYPE to a primitive type
+//   that holds a single character (e.g. char, wchar_t, etc); it defaults to int.
+//
+//   To save space or increase undo-ability, you can optionally define the
+//   following things that are used by the undo system:
+//
+//      STB_TEXTEDIT_POSITIONTYPE         small int type encoding a valid cursor position
+//      STB_TEXTEDIT_UNDOSTATECOUNT       the number of undo states to allow
+//      STB_TEXTEDIT_UNDOCHARCOUNT        the number of characters to store in the undo buffer
+//
+//   If you don't define these, they are set to permissive types and
+//   moderate sizes. The undo system does no memory allocations, so
+//   it grows STB_TexteditState by the worst-case storage which is (in bytes):
+//
+//        [4 + 3 * sizeof(STB_TEXTEDIT_POSITIONTYPE)] * STB_TEXTEDIT_UNDOSTATECOUNT
+//      +          sizeof(STB_TEXTEDIT_CHARTYPE)      * STB_TEXTEDIT_UNDOCHARCOUNT
+//
+//
+// Implementation mode:
+//
+//   If you define STB_TEXTEDIT_IMPLEMENTATION before including this, it
+//   will compile the implementation of the text edit widget, depending
+//   on a large number of symbols which must be defined before the include.
+//
+//   The implementation is defined only as static functions. You will then
+//   need to provide your own APIs in the same file which will access the
+//   static functions.
+//
+//   The basic concept is that you provide a "string" object which
+//   behaves like an array of characters. stb_textedit uses indices to
+//   refer to positions in the string, implicitly representing positions
+//   in the displayed textedit. This is true for both plain text and
+//   rich text; even with rich text stb_textedit interacts with your
+//   code as if there was an array of all the displayed characters.
+//
+// Symbols that must be the same in header-file and implementation mode:
+//
+//     STB_TEXTEDIT_CHARTYPE             the character type
+//     STB_TEXTEDIT_POSITIONTYPE         small type that is a valid cursor position
+//     STB_TEXTEDIT_UNDOSTATECOUNT       the number of undo states to allow
+//     STB_TEXTEDIT_UNDOCHARCOUNT        the number of characters to store in the undo buffer
+//
+// Symbols you must define for implementation mode:
+//
+//    STB_TEXTEDIT_STRING               the type of object representing a string being edited,
+//                                      typically this is a wrapper object with other data you need
+//
+//    STB_TEXTEDIT_STRINGLEN(obj)       the length of the string (ideally O(1))
+//    STB_TEXTEDIT_LAYOUTROW(&r,obj,n)  returns the results of laying out a line of characters
+//                                        starting from character #n (see discussion below)
+//    STB_TEXTEDIT_GETWIDTH(obj,n,i)    returns the pixel delta from the xpos of the i'th character
+//                                        to the xpos of the i+1'th char for a line of characters
+//                                        starting at character #n (i.e. accounts for kerning
+//                                        with previous char)
+//    STB_TEXTEDIT_KEYTOTEXT(k)         maps a keyboard input to an insertable character
+//                                        (return type is int, -1 means not valid to insert)
+//    STB_TEXTEDIT_GETCHAR(obj,i)       returns the i'th character of obj, 0-based
+//    STB_TEXTEDIT_NEWLINE              the character returned by _GETCHAR() we recognize
+//                                        as manually wordwrapping for end-of-line positioning
+//
+//    STB_TEXTEDIT_DELETECHARS(obj,i,n)      delete n characters starting at i
+//    STB_TEXTEDIT_INSERTCHARS(obj,i,c*,n)   insert n characters at i (pointed to by STB_TEXTEDIT_CHARTYPE*)
+//
+//    STB_TEXTEDIT_K_SHIFT       a power of two that is or'd in to a keyboard input to represent the shift key
+//
+//    STB_TEXTEDIT_K_LEFT        keyboard input to move cursor left
+//    STB_TEXTEDIT_K_RIGHT       keyboard input to move cursor right
+//    STB_TEXTEDIT_K_UP          keyboard input to move cursor up
+//    STB_TEXTEDIT_K_DOWN        keyboard input to move cursor down
+//    STB_TEXTEDIT_K_PGUP        keyboard input to move cursor up a page
+//    STB_TEXTEDIT_K_PGDOWN      keyboard input to move cursor down a page
+//    STB_TEXTEDIT_K_LINESTART   keyboard input to move cursor to start of line  // e.g. HOME
+//    STB_TEXTEDIT_K_LINEEND     keyboard input to move cursor to end of line    // e.g. END
+//    STB_TEXTEDIT_K_TEXTSTART   keyboard input to move cursor to start of text  // e.g. ctrl-HOME
+//    STB_TEXTEDIT_K_TEXTEND     keyboard input to move cursor to end of text    // e.g. ctrl-END
+//    STB_TEXTEDIT_K_DELETE      keyboard input to delete selection or character under cursor
+//    STB_TEXTEDIT_K_BACKSPACE   keyboard input to delete selection or character left of cursor
+//    STB_TEXTEDIT_K_UNDO        keyboard input to perform undo
+//    STB_TEXTEDIT_K_REDO        keyboard input to perform redo
+//
+// Optional:
+//    STB_TEXTEDIT_K_INSERT              keyboard input to toggle insert mode
+//    STB_TEXTEDIT_IS_SPACE(ch)          true if character is whitespace (e.g. 'isspace'),
+//                                          required for default WORDLEFT/WORDRIGHT handlers
+//    STB_TEXTEDIT_MOVEWORDLEFT(obj,i)   custom handler for WORDLEFT, returns index to move cursor to
+//    STB_TEXTEDIT_MOVEWORDRIGHT(obj,i)  custom handler for WORDRIGHT, returns index to move cursor to
+//    STB_TEXTEDIT_K_WORDLEFT            keyboard input to move cursor left one word // e.g. ctrl-LEFT
+//    STB_TEXTEDIT_K_WORDRIGHT           keyboard input to move cursor right one word // e.g. ctrl-RIGHT
+//    STB_TEXTEDIT_K_LINESTART2          secondary keyboard input to move cursor to start of line
+//    STB_TEXTEDIT_K_LINEEND2            secondary keyboard input to move cursor to end of line
+//    STB_TEXTEDIT_K_TEXTSTART2          secondary keyboard input to move cursor to start of text
+//    STB_TEXTEDIT_K_TEXTEND2            secondary keyboard input to move cursor to end of text
+//
+// Keyboard input must be encoded as a single integer value; e.g. a character code
+// and some bitflags that represent shift states. to simplify the interface, SHIFT must
+// be a bitflag, so we can test the shifted state of cursor movements to allow selection,
+// i.e. (STB_TEXTEDIT_K_RIGHT|STB_TEXTEDIT_K_SHIFT) should be shifted right-arrow.
+//
+// You can encode other things, such as CONTROL or ALT, in additional bits, and
+// then test for their presence in e.g. STB_TEXTEDIT_K_WORDLEFT. For example,
+// my Windows implementations add an additional CONTROL bit, and an additional KEYDOWN
+// bit. Then all of the STB_TEXTEDIT_K_ values bitwise-or in the KEYDOWN bit,
+// and I pass both WM_KEYDOWN and WM_CHAR events to the "key" function in the
+// API below. The control keys will only match WM_KEYDOWN events because of the
+// keydown bit I add, and STB_TEXTEDIT_KEYTOTEXT only tests for the KEYDOWN
+// bit so it only decodes WM_CHAR events.
+//
+// STB_TEXTEDIT_LAYOUTROW returns information about the shape of one displayed
+// row of characters assuming they start on the i'th character--the width and
+// the height and the number of characters consumed. This allows this library
+// to traverse the entire layout incrementally. You need to compute word-wrapping
+// here.
+//
+// Each textfield keeps its own insert mode state, which is not how normal
+// applications work. To keep an app-wide insert mode, update/copy the
+// "insert_mode" field of STB_TexteditState before/after calling API functions.
+//
+// API
+//
+//    void stb_textedit_initialize_state(STB_TexteditState *state, int is_single_line)
+//
+//    void stb_textedit_click(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, float x, float y)
+//    void stb_textedit_drag(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, float x, float y)
+//    int  stb_textedit_cut(STB_TEXTEDIT_STRING *str, STB_TexteditState *state)
+//    int  stb_textedit_paste(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, STB_TEXTEDIT_CHARTYPE *text, int len)
+//    void stb_textedit_key(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, STB_TEXTEDIT_KEYTYPE key)
+//
+//    Each of these functions potentially updates the string and updates the
+//    state.
+//
+//      initialize_state:
+//          set the textedit state to a known good default state when initially
+//          constructing the textedit.
+//
+//      click:
+//          call this with the mouse x,y on a mouse down; it will update the cursor
+//          and reset the selection start/end to the cursor point. the x,y must
+//          be relative to the text widget, with (0,0) being the top left.
+//
+//      drag:
+//          call this with the mouse x,y on a mouse drag/up; it will update the
+//          cursor and the selection end point
+//
+//      cut:
+//          call this to delete the current selection; returns true if there was
+//          one. you should FIRST copy the current selection to the system paste buffer.
+//          (To copy, just copy the current selection out of the string yourself.)
+//
+//      paste:
+//          call this to paste text at the current cursor point or over the current
+//          selection if there is one.
+//
+//      key:
+//          call this for keyboard inputs sent to the textfield. you can use it
+//          for "key down" events or for "translated" key events. if you need to
+//          do both (as in Win32), or distinguish Unicode characters from control
+//          inputs, set a high bit to distinguish the two; then you can make the
+//          various definitions like STB_TEXTEDIT_K_LEFT have the is-key-event bit
+//          set, and make STB_TEXTEDIT_KEYTOTEXT check that the is-key-event bit is
+//          clear. STB_TEXTEDIT_KEYTYPE defaults to int, but you can #define it to
+//          any other type you want before including.
+//
+//
+//   When rendering, you can read the cursor position and selection state from
+//   the STB_TexteditState.
+//
+//
+// Notes:
+//
+// This is designed to be usable in IMGUI, so it allows for the possibility of
+// running in an IMGUI that has NOT cached the multi-line layout. For this
+// reason, it provides an interface that is compatible with computing the
+// layout incrementally--we try to make sure we make as few passes through
+// as possible. (For example, to locate the mouse pointer in the text, we
+// could define functions that return the X and Y positions of characters
+// and binary search Y and then X, but if we're doing dynamic layout this
+// will run the layout algorithm many times, so instead we manually search
+// forward in one pass. Similar logic applies to e.g. up-arrow and
+// down-arrow movement.)
+//
+// If it's run in a widget that *has* cached the layout, then this is less
+// efficient, but it's not horrible on modern computers. But you wouldn't
+// want to edit million-line files with it.
+
+
+////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////
+////
+////   Header-file mode
+////
+////
+
+#ifndef INCLUDE_STB_TEXTEDIT_H
+#define INCLUDE_STB_TEXTEDIT_H
+
+////////////////////////////////////////////////////////////////////////
+//
+//     STB_TexteditState
+//
+// Definition of STB_TexteditState which you should store
+// per-textfield; it includes cursor position, selection state,
+// and undo state.
+//
+
+#ifndef STB_TEXTEDIT_UNDOSTATECOUNT
+#define STB_TEXTEDIT_UNDOSTATECOUNT   99
+#endif
+#ifndef STB_TEXTEDIT_UNDOCHARCOUNT
+#define STB_TEXTEDIT_UNDOCHARCOUNT   999
+#endif
+#ifndef STB_TEXTEDIT_CHARTYPE
+#define STB_TEXTEDIT_CHARTYPE        int
+#endif
+#ifndef STB_TEXTEDIT_POSITIONTYPE
+#define STB_TEXTEDIT_POSITIONTYPE    int
+#endif
+
+typedef struct // one entry of StbUndoState.undo_rec
+{
+   // private data (the code that fills these fields is outside this excerpt; meanings below are inferred from the names -- TODO confirm)
+   STB_TEXTEDIT_POSITIONTYPE  where;         // presumably the string index the edit applies to
+   STB_TEXTEDIT_POSITIONTYPE  insert_length; // presumably the number of characters inserted
+   STB_TEXTEDIT_POSITIONTYPE  delete_length; // presumably the number of characters deleted
+   int                        char_storage;  // presumably an index into StbUndoState.undo_char
+} StbUndoRecord;
+
+typedef struct // fixed-size undo storage embedded in STB_TexteditState (no runtime allocation)
+{
+   // private data
+   StbUndoRecord          undo_rec [STB_TEXTEDIT_UNDOSTATECOUNT]; // record slots; count defaults to 99
+   STB_TEXTEDIT_CHARTYPE  undo_char[STB_TEXTEDIT_UNDOCHARCOUNT];  // character buffer; count defaults to 999
+   short undo_point, redo_point;         // presumably positions within undo_rec -- TODO confirm against the undo code
+   int undo_char_point, redo_char_point; // presumably positions within undo_char -- TODO confirm against the undo code
+} StbUndoState;
+
+typedef struct // per-textfield editing state: cursor, selection and undo data
+{
+   /////////////////////
+   //
+   // public data
+   //
+
+   int cursor;
+   // position of the text cursor within the string
+
+   int select_start;          // selection start point
+   int select_end;            // selection end point
+   // selection start and end point in characters; if equal, no selection.
+   // note that start may be less than or greater than end (e.g. when
+   // dragging the mouse, start is where the initial click was, and you
+   // can drag in either direction)
+
+   unsigned char insert_mode;
+   // each textfield keeps its own insert mode state. to keep an app-wide
+   // insert mode, copy this value in/out of the app state
+
+   int row_count_per_page;
+   // page size, in rows.
+   // this value MUST be set to >0 for pageup or pagedown in multi-line documents.
+
+   /////////////////////
+   //
+   // private data
+   //
+   unsigned char cursor_at_end_of_line; // not implemented yet
+   unsigned char initialized;
+   unsigned char has_preferred_x; // cleared by stb_textedit_click; presumably nonzero while preferred_x is valid -- TODO confirm
+   unsigned char single_line; // nonzero for a single-line field; click/drag then ignore the mouse y
+   unsigned char padding1, padding2, padding3;
+   float preferred_x; // this determines where the cursor up/down tries to seek to along x
+   StbUndoState undostate;
+} STB_TexteditState;
+
+
+////////////////////////////////////////////////////////////////////////
+//
+//     StbTexteditRow
+//
+// Result of layout query, used by stb_textedit to determine where
+// the text in each row is.
+
+// result of layout query
+typedef struct // filled in by STB_TEXTEDIT_LAYOUTROW for the row starting at a given character
+{
+   float x0,x1;             // starting x location, end x location (allows for align=right, etc)
+   float baseline_y_delta;  // position of baseline relative to previous row's baseline
+   float ymin,ymax;         // height of row above and below baseline
+   int num_chars;           // number of characters consumed by this row; stb_text_locate_coord treats <= 0 as the end of the layout
+} StbTexteditRow;
+#endif //INCLUDE_STB_TEXTEDIT_H
+
+
+////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////
+////
+////   Implementation mode
+////
+////
+
+
+// implementation isn't include-guarded, since it might have indirectly
+// included just the "header" portion
+#ifdef STB_TEXTEDIT_IMPLEMENTATION
+
+#ifndef STB_TEXTEDIT_memmove
+#include <string.h>
+#define STB_TEXTEDIT_memmove memmove
+#endif
+
+
+/////////////////////////////////////////////////////////////////////////////
+//
+//      Mouse input handling
+//
+
+// walk the layout row by row and return the index of the character nearest to display position (x,y)
+static int stb_text_locate_coord(STB_TEXTEDIT_STRING *str, float x, float y)
+{
+   StbTexteditRow row;
+   int total = STB_TEXTEDIT_STRINGLEN(str);
+   int first = 0;
+   int col;
+   float row_y = 0;
+   float left;
+
+   // start from an empty row so the fields are defined even if the loop body never runs
+   row.x0 = row.x1 = 0;
+   row.ymin = row.ymax = 0;
+   row.num_chars = 0;
+
+   // advance through the rows until one straddles 'y'
+   for (; first < total; first += row.num_chars, row_y += row.baseline_y_delta) {
+      STB_TEXTEDIT_LAYOUTROW(&row, str, first);
+      if (row.num_chars <= 0)
+         return total;
+
+      // above the very first row: snap to the start of the text
+      if (first == 0 && y < row_y + row.ymin)
+         return 0;
+
+      if (y < row_y + row.ymax)
+         break;
+   }
+
+   // ran off the bottom of the text: position 'after' the last character
+   if (first >= total)
+      return total;
+
+   // left of the start of the row
+   if (x < row.x0)
+      return first;
+
+   // inside the row's horizontal extent: find the character that straddles 'x'
+   if (x < row.x1) {
+      left = row.x0;
+      for (col = 0; col < row.num_chars; ++col) {
+         float width = STB_TEXTEDIT_GETWIDTH(str, first, col);
+         if (x < left + width) {
+            // pick whichever character boundary is nearer
+            return (x < left + width/2) ? (col + first) : (col + first + 1);
+         }
+         left += width;
+      }
+      // not expected to get here; if we do, treat it like the end-of-line case below
+   }
+
+   // end of line: stop on a trailing newline, otherwise go 'after' the last character
+   {
+      int last = first + row.num_chars - 1;
+      if (STB_TEXTEDIT_GETCHAR(str, last) == STB_TEXTEDIT_NEWLINE)
+         return last;
+      return last + 1;
+   }
+}
+
+// API click: mouse-down handler; puts the cursor at the clicked location and collapses the selection onto it
+static void stb_textedit_click(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, float x, float y)
+{
+   StbTexteditRow first_row;
+   int hit;
+
+   // A single-line field ignores the mouse y: lay out the row at index 0 and use its ymin
+   // instead, so this keeps working if the mouse goes off the top or bottom of the text.
+   if (state->single_line) {
+      STB_TEXTEDIT_LAYOUTROW(&first_row, str, 0);
+      y = first_row.ymin;
+   }
+
+   hit = stb_text_locate_coord(str, x, y);
+   state->cursor = state->select_start = state->select_end = hit;
+   state->has_preferred_x = 0;
+}
+
+// API drag: mouse drag/up handler; moves the cursor and the selection end point to the pointer location
+static void stb_textedit_drag(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, float x, float y)
+{
+   StbTexteditRow first_row;
+   int hit;
+
+   // A single-line field ignores the mouse y: lay out the row at index 0 and use its ymin
+   // instead, so the drag keeps working if the mouse goes off the top or bottom of the text.
+   if (state->single_line) {
+      STB_TEXTEDIT_LAYOUTROW(&first_row, str, 0);
+      y = first_row.ymin;
+   }
+
+   // no selection yet: anchor its start at the current cursor before moving
+   if (state->select_end == state->select_start)
+      state->select_start = state->cursor;
+
+   hit = stb_text_locate_coord(str, x, y);
+   state->select_end = state->cursor = hit;
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+//      Keyboard input handling
+//
+
+// forward declarations: undo/redo entry points and undo-record builders, defined in the "Undo processing" section further down
+static void stb_text_undo(STB_TEXTEDIT_STRING *str, STB_TexteditState *state);
+static void stb_text_redo(STB_TEXTEDIT_STRING *str, STB_TexteditState *state);
+static void stb_text_makeundo_delete(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, int where, int length);
+static void stb_text_makeundo_insert(STB_TexteditState *state, int where, int length);
+static void stb_text_makeundo_replace(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, int where, int old_length, int new_length);
+
+typedef struct // filled in by stb_textedit_find_charpos: where character n sits in the laid-out text
+{
+   float x,y;    // position of n'th character (x: row x0 plus the widths before it; y: summed baseline_y_delta of earlier rows)
+   float height; // height of line (ymax - ymin of the row)
+   int first_char, length; // first char of the row holding n, and that row's length in characters
+   int prev_first;  // first char of previous row (the key handler treats prev_first == first_char as "no previous row")
+} StbFindState;
+
+// Locate character n: fill 'find' with its x/y position, the row that holds it (first_char,
+// length, height) and the first char of the previous row, which the up / page-up keys need.
+// 'single_line' selects the one-row shortcut taken when n is at the very end of the text.
+static void stb_textedit_find_charpos(StbFindState *find, STB_TEXTEDIT_STRING *str, int n, int single_line)
+{
+   StbTexteditRow r;
+   int z = STB_TEXTEDIT_STRINGLEN(str);
+   int i=0, first, prev_start = 0;
+
+   if (n == z) {
+      // if it's at the end, then find the last line -- simpler than trying to
+      // explicitly handle this case in the regular code
+      if (single_line) {
+         STB_TEXTEDIT_LAYOUTROW(&r, str, 0);
+         find->y = 0;
+         find->first_char = 0;
+         find->prev_first = 0; // no previous row; this used to be left unset and page-up then read garbage
+         find->length = z;
+         find->height = r.ymax - r.ymin;
+         find->x = r.x1;
+      } else {
+         find->y = 0;
+         find->x = 0;
+         find->height = 1;
+         while (i < z) { // walk every row so prev_start ends up at the start of the last one
+            STB_TEXTEDIT_LAYOUTROW(&r, str, i);
+            prev_start = i;
+            i += r.num_chars;
+         }
+         find->first_char = i; // reported as a zero-length row starting where the row walk stopped
+         find->length = 0;
+         find->prev_first = prev_start;
+      }
+      return;
+   }
+
+   // search rows to find the one that straddles character n
+   find->y = 0;
+   for(;;) {
+      STB_TEXTEDIT_LAYOUTROW(&r, str, i);
+      if (n < i + r.num_chars)
+         break;
+      prev_start = i;
+      i += r.num_chars;
+      find->y += r.baseline_y_delta;
+   }
+
+   find->first_char = first = i;
+   find->length = r.num_chars;
+   find->height = r.ymax - r.ymin;
+   find->prev_first = prev_start; // stays 0 (== first_char) when n is on the first row
+
+   // now scan to find xpos
+   find->x = r.x0;
+   for (i=0; first+i < n; ++i)
+      find->x += STB_TEXTEDIT_GETWIDTH(str, first, i);
+}
+
+#define STB_TEXT_HAS_SELECTION(s)   ((s)->select_start != (s)->select_end) // nonzero when the two selection endpoints differ, i.e. the selection is not empty
+
+// Pull select_start, select_end and cursor back to at most the string length, so the state
+// is valid again after the client altered the string. Only the upper bound is enforced.
+static void stb_textedit_clamp(STB_TEXTEDIT_STRING *str, STB_TexteditState *state)
+{
+   const int len = STB_TEXTEDIT_STRINGLEN(str);
+   if (state->select_start != state->select_end) {
+      state->select_start = (state->select_start > len) ? len : state->select_start;
+      state->select_end = (state->select_end > len) ? len : state->select_end;
+      // clamping collapsed the selection: the cursor follows the collapsed point
+      if (state->select_start == state->select_end) state->cursor = state->select_start;
+   }
+   if (state->cursor > len) state->cursor = len;
+}
+
+// delete 'len' characters starting at index 'where', recording an undo entry for them first
+static void stb_textedit_delete(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, int where, int len)
+{
+   stb_text_makeundo_delete(str, state, where, len); // must precede the delete: it copies the doomed characters out of str
+   STB_TEXTEDIT_DELETECHARS(str, where, len);
+   state->has_preferred_x = 0; // forget the column remembered for up/down movement
+}
+
+// delete the selected range, if any, then collapse both selection endpoints and the
+// cursor onto the lower end of the removed range
+static void stb_textedit_delete_selection(STB_TEXTEDIT_STRING *str, STB_TexteditState *state)
+{
+   int lo, hi;
+   stb_textedit_clamp(str, state);
+   if (!STB_TEXT_HAS_SELECTION(state))
+      return;
+   // the selection may run in either direction, so order its endpoints first
+   lo = (state->select_start < state->select_end) ? state->select_start : state->select_end;
+   hi = (state->select_start < state->select_end) ? state->select_end : state->select_start;
+   stb_textedit_delete(str, state, lo, hi - lo);
+   state->select_start = state->select_end = state->cursor = lo;
+   state->has_preferred_x = 0;
+}
+
+// canonicalize the selection so that select_start <= select_end
+static void stb_textedit_sortselection(STB_TexteditState *state)
+{
+   const int start = state->select_start, end = state->select_end;
+   if (end < start) { // endpoints are reversed: swap them
+      state->select_start = end;
+      state->select_end = start;
+   }
+}
+
+// collapse a non-empty selection onto its first character and put the cursor there
+static void stb_textedit_move_to_first(STB_TexteditState *state)
+{
+   if (!STB_TEXT_HAS_SELECTION(state))
+      return; // nothing selected: leave the state untouched
+   stb_textedit_sortselection(state);
+   state->select_end = state->select_start;
+   state->cursor = state->select_start;
+   state->has_preferred_x = 0;
+}
+
+// collapse a non-empty selection onto its (clamped) end and put the cursor there
+static void stb_textedit_move_to_last(STB_TEXTEDIT_STRING *str, STB_TexteditState *state)
+{
+   if (!STB_TEXT_HAS_SELECTION(state))
+      return; // nothing selected: leave the state untouched
+   stb_textedit_sortselection(state);
+   stb_textedit_clamp(str, state); // keep the end within the current string length
+   state->select_start = state->select_end;
+   state->cursor = state->select_end;
+   state->has_preferred_x = 0;
+}
+
+#ifdef STB_TEXTEDIT_IS_SPACE
+static int is_word_boundary( STB_TEXTEDIT_STRING *str, int idx ) // 1 if a word starts at idx: idx <= 0, or a non-space preceded by a space
+{
+   return idx <= 0 || (STB_TEXTEDIT_IS_SPACE( STB_TEXTEDIT_GETCHAR(str,idx-1) ) && !STB_TEXTEDIT_IS_SPACE( STB_TEXTEDIT_GETCHAR(str, idx) ));
+}
+
+#ifndef STB_TEXTEDIT_MOVEWORDLEFT
+// default STB_TEXTEDIT_MOVEWORDLEFT: index of the nearest word start before c, never below 0
+static int stb_textedit_move_to_word_previous( STB_TEXTEDIT_STRING *str, int c )
+{
+   int pos;
+
+   // begin one character back (always move at least one), then walk left to a word start
+   for (pos = c - 1; pos >= 0; --pos)
+      if (is_word_boundary(str, pos))
+         break;
+   return pos < 0 ? 0 : pos;
+}
+#define STB_TEXTEDIT_MOVEWORDLEFT stb_textedit_move_to_word_previous
+#endif
+
+#ifndef STB_TEXTEDIT_MOVEWORDRIGHT
+// default STB_TEXTEDIT_MOVEWORDRIGHT: index of the next word start after c, capped at the string length
+static int stb_textedit_move_to_word_next( STB_TEXTEDIT_STRING *str, int c )
+{
+   const int len = STB_TEXTEDIT_STRINGLEN(str);
+   int pos;
+
+   // begin one character ahead (always move at least one), then walk right until a word starts or the text ends
+   for (pos = c + 1; pos < len; ++pos)
+      if (is_word_boundary(str, pos))
+         break;
+   return pos > len ? len : pos;
+}
+#define STB_TEXTEDIT_MOVEWORDRIGHT stb_textedit_move_to_word_next
+#endif
+
+#endif
+
+// make cursor and selection agree: with a selection, the cursor moves to select_end;
+// without one, an empty selection is started at the cursor
+static void stb_textedit_prep_selection_at_cursor(STB_TexteditState *state)
+{
+   if (STB_TEXT_HAS_SELECTION(state)) { state->cursor = state->select_end; return; }
+   state->select_start = state->cursor;
+   state->select_end = state->cursor;
+}
+
+// API cut: delete the current selection; returns 1 if there was one to delete, else 0
+// (this function only deletes -- it does not copy the text anywhere)
+static int stb_textedit_cut(STB_TEXTEDIT_STRING *str, STB_TexteditState *state)
+{
+   if (!STB_TEXT_HAS_SELECTION(state))
+      return 0;
+   stb_textedit_delete_selection(str,state); // implicitly clamps
+   state->has_preferred_x = 0;
+   return 1;
+}
+
+// API paste: replace existing selection with 'len' characters from 'text', inserted at the
+// cursor. Returns 1 on success, 0 when STB_TEXTEDIT_INSERTCHARS rejects the insertion.
+static int stb_textedit_paste_internal(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, STB_TEXTEDIT_CHARTYPE *text, int len)
+{
+   // a paste overwrites whatever is currently selected
+   stb_textedit_clamp(str, state);
+   stb_textedit_delete_selection(str,state);
+   if (!STB_TEXTEDIT_INSERTCHARS(str, state->cursor, text, len)) {
+      // note: paste failure will leave deleted selection, may be restored with an undo (see https://github.com/nothings/stb/issues/734 for details)
+      return 0;
+   }
+   stb_text_makeundo_insert(state, state->cursor, len);
+   state->cursor += len; // cursor ends up just after the pasted text
+   state->has_preferred_x = 0;
+   return 1;
+}
+
+#ifndef STB_TEXTEDIT_KEYTYPE
+#define STB_TEXTEDIT_KEYTYPE int // default type of the 'key' argument of stb_textedit_key; define it before this point to override
+#endif
+
+// API key: apply one key event to 'str' and 'state' -- cursor movement, selection, deletion, undo/redo, or (default case) typing a character
+static void stb_textedit_key(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, STB_TEXTEDIT_KEYTYPE key)
+{
+retry: // re-entered by the goto statements below once single-line up/down has been remapped to left/right
+   switch (key) {
+      default: {
+         int c = STB_TEXTEDIT_KEYTOTEXT(key); // keys with no text mapping (c <= 0) do nothing
+         if (c > 0) {
+            STB_TEXTEDIT_CHARTYPE ch = (STB_TEXTEDIT_CHARTYPE) c;
+
+            // can't add newline in single-line mode
+            if (c == '\n' && state->single_line)
+               break;
+
+            if (state->insert_mode && !STB_TEXT_HAS_SELECTION(state) && state->cursor < STB_TEXTEDIT_STRINGLEN(str)) {
+               stb_text_makeundo_replace(str, state, state->cursor, 1, 1); // insert_mode: the typed character replaces the one at the cursor
+               STB_TEXTEDIT_DELETECHARS(str, state->cursor, 1);
+               if (STB_TEXTEDIT_INSERTCHARS(str, state->cursor, &ch, 1)) {
+                  ++state->cursor;
+                  state->has_preferred_x = 0;
+               }
+            } else {
+               stb_textedit_delete_selection(str,state); // implicitly clamps
+               if (STB_TEXTEDIT_INSERTCHARS(str, state->cursor, &ch, 1)) {
+                  stb_text_makeundo_insert(state, state->cursor, 1);
+                  ++state->cursor;
+                  state->has_preferred_x = 0;
+               }
+            }
+         }
+         break;
+      }
+
+#ifdef STB_TEXTEDIT_K_INSERT
+      case STB_TEXTEDIT_K_INSERT:
+         state->insert_mode = !state->insert_mode;
+         break;
+#endif
+
+      case STB_TEXTEDIT_K_UNDO:
+         stb_text_undo(str, state);
+         state->has_preferred_x = 0;
+         break;
+
+      case STB_TEXTEDIT_K_REDO:
+         stb_text_redo(str, state);
+         state->has_preferred_x = 0;
+         break;
+
+      case STB_TEXTEDIT_K_LEFT:
+         // if currently there's a selection, move cursor to start of selection
+         if (STB_TEXT_HAS_SELECTION(state))
+            stb_textedit_move_to_first(state);
+         else
+            if (state->cursor > 0)
+               --state->cursor;
+         state->has_preferred_x = 0;
+         break;
+
+      case STB_TEXTEDIT_K_RIGHT:
+         // if currently there's a selection, move cursor to end of selection
+         if (STB_TEXT_HAS_SELECTION(state))
+            stb_textedit_move_to_last(str, state);
+         else
+            ++state->cursor;
+         stb_textedit_clamp(str, state);
+         state->has_preferred_x = 0;
+         break;
+
+      case STB_TEXTEDIT_K_LEFT | STB_TEXTEDIT_K_SHIFT:
+         stb_textedit_clamp(str, state);
+         stb_textedit_prep_selection_at_cursor(state);
+         // move selection left
+         if (state->select_end > 0)
+            --state->select_end;
+         state->cursor = state->select_end;
+         state->has_preferred_x = 0;
+         break;
+
+#ifdef STB_TEXTEDIT_MOVEWORDLEFT
+      case STB_TEXTEDIT_K_WORDLEFT:
+         if (STB_TEXT_HAS_SELECTION(state))
+            stb_textedit_move_to_first(state);
+         else {
+            state->cursor = STB_TEXTEDIT_MOVEWORDLEFT(str, state->cursor);
+            stb_textedit_clamp( str, state );
+         }
+         break;
+
+      case STB_TEXTEDIT_K_WORDLEFT | STB_TEXTEDIT_K_SHIFT:
+         if( !STB_TEXT_HAS_SELECTION( state ) )
+            stb_textedit_prep_selection_at_cursor(state);
+
+         state->cursor = STB_TEXTEDIT_MOVEWORDLEFT(str, state->cursor);
+         state->select_end = state->cursor;
+
+         stb_textedit_clamp( str, state );
+         break;
+#endif
+
+#ifdef STB_TEXTEDIT_MOVEWORDRIGHT
+      case STB_TEXTEDIT_K_WORDRIGHT:
+         if (STB_TEXT_HAS_SELECTION(state))
+            stb_textedit_move_to_last(str, state);
+         else {
+            state->cursor = STB_TEXTEDIT_MOVEWORDRIGHT(str, state->cursor);
+            stb_textedit_clamp( str, state );
+         }
+         break;
+
+      case STB_TEXTEDIT_K_WORDRIGHT | STB_TEXTEDIT_K_SHIFT:
+         if( !STB_TEXT_HAS_SELECTION( state ) )
+            stb_textedit_prep_selection_at_cursor(state);
+
+         state->cursor = STB_TEXTEDIT_MOVEWORDRIGHT(str, state->cursor);
+         state->select_end = state->cursor;
+
+         stb_textedit_clamp( str, state );
+         break;
+#endif
+
+      case STB_TEXTEDIT_K_RIGHT | STB_TEXTEDIT_K_SHIFT:
+         stb_textedit_prep_selection_at_cursor(state);
+         // move selection right
+         ++state->select_end;
+         stb_textedit_clamp(str, state);
+         state->cursor = state->select_end;
+         state->has_preferred_x = 0;
+         break;
+
+      case STB_TEXTEDIT_K_DOWN:
+      case STB_TEXTEDIT_K_DOWN | STB_TEXTEDIT_K_SHIFT:
+      case STB_TEXTEDIT_K_PGDOWN:
+      case STB_TEXTEDIT_K_PGDOWN | STB_TEXTEDIT_K_SHIFT: {
+         StbFindState find;
+         StbTexteditRow row;
+         int i, j, sel = (key & STB_TEXTEDIT_K_SHIFT) != 0;
+         int is_page = (key & ~STB_TEXTEDIT_K_SHIFT) == STB_TEXTEDIT_K_PGDOWN;
+         int row_count = is_page ? state->row_count_per_page : 1; // rows to move: row_count_per_page for page down, otherwise one
+
+         if (!is_page && state->single_line) {
+            // on windows, up&down in single-line behave like left&right
+            key = STB_TEXTEDIT_K_RIGHT | (key & STB_TEXTEDIT_K_SHIFT);
+            goto retry;
+         }
+
+         if (sel)
+            stb_textedit_prep_selection_at_cursor(state);
+         else if (STB_TEXT_HAS_SELECTION(state))
+            stb_textedit_move_to_last(str, state);
+
+         // compute current position of cursor point
+         stb_textedit_clamp(str, state);
+         stb_textedit_find_charpos(&find, str, state->cursor, state->single_line);
+
+         for (j = 0; j < row_count; ++j) {
+            float x, goal_x = state->has_preferred_x ? state->preferred_x : find.x;
+            int start = find.first_char + find.length;
+
+            if (find.length == 0) // stop as soon as the current row is zero-length
+               break;
+
+            // now find character position down a row
+            state->cursor = start;
+            STB_TEXTEDIT_LAYOUTROW(&row, str, state->cursor);
+            x = row.x0;
+            for (i=0; i < row.num_chars; ++i) {
+               float dx = STB_TEXTEDIT_GETWIDTH(str, start, i);
+               #ifdef STB_TEXTEDIT_GETWIDTH_NEWLINE
+               if (dx == STB_TEXTEDIT_GETWIDTH_NEWLINE)
+                  break;
+               #endif
+               x += dx;
+               if (x > goal_x)
+                  break;
+               ++state->cursor;
+            }
+            stb_textedit_clamp(str, state);
+
+            state->has_preferred_x = 1;
+            state->preferred_x = goal_x;
+
+            if (sel)
+               state->select_end = state->cursor;
+
+            // go to next line
+            find.first_char = find.first_char + find.length;
+            find.length = row.num_chars;
+         }
+         break;
+      }
+
+      case STB_TEXTEDIT_K_UP:
+      case STB_TEXTEDIT_K_UP | STB_TEXTEDIT_K_SHIFT:
+      case STB_TEXTEDIT_K_PGUP:
+      case STB_TEXTEDIT_K_PGUP | STB_TEXTEDIT_K_SHIFT: {
+         StbFindState find;
+         StbTexteditRow row;
+         int i, j, prev_scan, sel = (key & STB_TEXTEDIT_K_SHIFT) != 0;
+         int is_page = (key & ~STB_TEXTEDIT_K_SHIFT) == STB_TEXTEDIT_K_PGUP;
+         int row_count = is_page ? state->row_count_per_page : 1; // rows to move: row_count_per_page for page up, otherwise one
+
+         if (!is_page && state->single_line) {
+            // on windows, up&down become left&right
+            key = STB_TEXTEDIT_K_LEFT | (key & STB_TEXTEDIT_K_SHIFT);
+            goto retry;
+         }
+
+         if (sel)
+            stb_textedit_prep_selection_at_cursor(state);
+         else if (STB_TEXT_HAS_SELECTION(state))
+            stb_textedit_move_to_first(state);
+
+         // compute current position of cursor point
+         stb_textedit_clamp(str, state);
+         stb_textedit_find_charpos(&find, str, state->cursor, state->single_line);
+
+         for (j = 0; j < row_count; ++j) {
+            float  x, goal_x = state->has_preferred_x ? state->preferred_x : find.x;
+
+            // can only go up if there's a previous row
+            if (find.prev_first == find.first_char)
+               break;
+
+            // now find character position up a row
+            state->cursor = find.prev_first;
+            STB_TEXTEDIT_LAYOUTROW(&row, str, state->cursor);
+            x = row.x0;
+            for (i=0; i < row.num_chars; ++i) {
+               float dx = STB_TEXTEDIT_GETWIDTH(str, find.prev_first, i);
+               #ifdef STB_TEXTEDIT_GETWIDTH_NEWLINE
+               if (dx == STB_TEXTEDIT_GETWIDTH_NEWLINE)
+                  break;
+               #endif
+               x += dx;
+               if (x > goal_x)
+                  break;
+               ++state->cursor;
+            }
+            stb_textedit_clamp(str, state);
+
+            state->has_preferred_x = 1;
+            state->preferred_x = goal_x;
+
+            if (sel)
+               state->select_end = state->cursor;
+
+            // go to previous line
+            // (we need to scan previous line the hard way. maybe we could expose this as a new API function?)
+            prev_scan = find.prev_first > 0 ? find.prev_first - 1 : 0;
+            while (prev_scan > 0 && STB_TEXTEDIT_GETCHAR(str, prev_scan - 1) != STB_TEXTEDIT_NEWLINE)
+               --prev_scan;
+            find.first_char = find.prev_first;
+            find.prev_first = prev_scan;
+         }
+         break;
+      }
+
+      case STB_TEXTEDIT_K_DELETE:
+      case STB_TEXTEDIT_K_DELETE | STB_TEXTEDIT_K_SHIFT:
+         if (STB_TEXT_HAS_SELECTION(state))
+            stb_textedit_delete_selection(str, state);
+         else {
+            int n = STB_TEXTEDIT_STRINGLEN(str);
+            if (state->cursor < n)
+               stb_textedit_delete(str, state, state->cursor, 1);
+         }
+         state->has_preferred_x = 0;
+         break;
+
+      case STB_TEXTEDIT_K_BACKSPACE:
+      case STB_TEXTEDIT_K_BACKSPACE | STB_TEXTEDIT_K_SHIFT:
+         if (STB_TEXT_HAS_SELECTION(state))
+            stb_textedit_delete_selection(str, state);
+         else {
+            stb_textedit_clamp(str, state);
+            if (state->cursor > 0) {
+               stb_textedit_delete(str, state, state->cursor-1, 1);
+               --state->cursor;
+            }
+         }
+         state->has_preferred_x = 0;
+         break;
+
+#ifdef STB_TEXTEDIT_K_TEXTSTART2
+      case STB_TEXTEDIT_K_TEXTSTART2:
+#endif
+      case STB_TEXTEDIT_K_TEXTSTART:
+         state->cursor = state->select_start = state->select_end = 0;
+         state->has_preferred_x = 0;
+         break;
+
+#ifdef STB_TEXTEDIT_K_TEXTEND2
+      case STB_TEXTEDIT_K_TEXTEND2:
+#endif
+      case STB_TEXTEDIT_K_TEXTEND:
+         state->cursor = STB_TEXTEDIT_STRINGLEN(str);
+         state->select_start = state->select_end = 0; // equal endpoints, i.e. no selection
+         state->has_preferred_x = 0;
+         break;
+
+#ifdef STB_TEXTEDIT_K_TEXTSTART2
+      case STB_TEXTEDIT_K_TEXTSTART2 | STB_TEXTEDIT_K_SHIFT:
+#endif
+      case STB_TEXTEDIT_K_TEXTSTART | STB_TEXTEDIT_K_SHIFT:
+         stb_textedit_prep_selection_at_cursor(state);
+         state->cursor = state->select_end = 0;
+         state->has_preferred_x = 0;
+         break;
+
+#ifdef STB_TEXTEDIT_K_TEXTEND2
+      case STB_TEXTEDIT_K_TEXTEND2 | STB_TEXTEDIT_K_SHIFT:
+#endif
+      case STB_TEXTEDIT_K_TEXTEND | STB_TEXTEDIT_K_SHIFT:
+         stb_textedit_prep_selection_at_cursor(state);
+         state->cursor = state->select_end = STB_TEXTEDIT_STRINGLEN(str);
+         state->has_preferred_x = 0;
+         break;
+
+
+#ifdef STB_TEXTEDIT_K_LINESTART2
+      case STB_TEXTEDIT_K_LINESTART2:
+#endif
+      case STB_TEXTEDIT_K_LINESTART:
+         stb_textedit_clamp(str, state);
+         stb_textedit_move_to_first(state);
+         if (state->single_line)
+            state->cursor = 0;
+         else while (state->cursor > 0 && STB_TEXTEDIT_GETCHAR(str, state->cursor-1) != STB_TEXTEDIT_NEWLINE)
+            --state->cursor;
+         state->has_preferred_x = 0;
+         break;
+
+#ifdef STB_TEXTEDIT_K_LINEEND2
+      case STB_TEXTEDIT_K_LINEEND2:
+#endif
+      case STB_TEXTEDIT_K_LINEEND: {
+         int n = STB_TEXTEDIT_STRINGLEN(str);
+         stb_textedit_clamp(str, state);
+         stb_textedit_move_to_first(state);
+         if (state->single_line)
+             state->cursor = n;
+         else while (state->cursor < n && STB_TEXTEDIT_GETCHAR(str, state->cursor) != STB_TEXTEDIT_NEWLINE)
+             ++state->cursor;
+         state->has_preferred_x = 0;
+         break;
+      }
+
+#ifdef STB_TEXTEDIT_K_LINESTART2
+      case STB_TEXTEDIT_K_LINESTART2 | STB_TEXTEDIT_K_SHIFT:
+#endif
+      case STB_TEXTEDIT_K_LINESTART | STB_TEXTEDIT_K_SHIFT:
+         stb_textedit_clamp(str, state);
+         stb_textedit_prep_selection_at_cursor(state);
+         if (state->single_line)
+            state->cursor = 0;
+         else while (state->cursor > 0 && STB_TEXTEDIT_GETCHAR(str, state->cursor-1) != STB_TEXTEDIT_NEWLINE)
+            --state->cursor;
+         state->select_end = state->cursor;
+         state->has_preferred_x = 0;
+         break;
+
+#ifdef STB_TEXTEDIT_K_LINEEND2
+      case STB_TEXTEDIT_K_LINEEND2 | STB_TEXTEDIT_K_SHIFT:
+#endif
+      case STB_TEXTEDIT_K_LINEEND | STB_TEXTEDIT_K_SHIFT: {
+         int n = STB_TEXTEDIT_STRINGLEN(str);
+         stb_textedit_clamp(str, state);
+         stb_textedit_prep_selection_at_cursor(state);
+         if (state->single_line)
+             state->cursor = n;
+         else while (state->cursor < n && STB_TEXTEDIT_GETCHAR(str, state->cursor) != STB_TEXTEDIT_NEWLINE)
+            ++state->cursor;
+         state->select_end = state->cursor;
+         state->has_preferred_x = 0;
+         break;
+      }
+   }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+//      Undo processing
+//
+// @OPTIMIZE: the undo/redo buffer should be circular
+
+static void stb_textedit_flush_redo(StbUndoState *state) // forget every redo record
+{
+   state->redo_char_point = STB_TEXTEDIT_UNDOCHARCOUNT; // redo characters: marker back at the end of the character buffer
+   state->redo_point = STB_TEXTEDIT_UNDOSTATECOUNT;     // redo records: marker back at the end of the record array (== no redo available)
+}
+
+// discard the oldest entry in the undo list (undo_rec[0]) together with any characters it stored
+static void stb_textedit_discard_undo(StbUndoState *state)
+{
+   if (state->undo_point <= 0)
+      return; // the undo list is empty
+   if (state->undo_rec[0].char_storage >= 0) {
+      // drop the first 'removed' characters of undo_char, then shift the storage offset of every record that has one
+      int removed = state->undo_rec[0].insert_length, idx;
+      state->undo_char_point -= removed;
+      STB_TEXTEDIT_memmove(state->undo_char, state->undo_char + removed, (size_t) (state->undo_char_point*sizeof(STB_TEXTEDIT_CHARTYPE)));
+      for (idx=0; idx < state->undo_point; ++idx)
+         if (state->undo_rec[idx].char_storage >= 0)
+            state->undo_rec[idx].char_storage -= removed; // @OPTIMIZE: get rid of char_storage and infer it
+   }
+   // slide the remaining records down one slot, overwriting record 0
+   --state->undo_point;
+   STB_TEXTEDIT_memmove(state->undo_rec, state->undo_rec+1, (size_t) (state->undo_point*sizeof(state->undo_rec[0])));
+}
+
+// discard the oldest entry in the redo list (the record at the very end of undo_rec) and
+// any characters it stored. it's bad if this ever happens, but because undo & redo have to
+// store the actual characters in different cases, the redo character buffer can fill up
+// even though the undo buffer didn't
+static void stb_textedit_discard_redo(StbUndoState *state)
+{
+   int k = STB_TEXTEDIT_UNDOSTATECOUNT-1; // redo records occupy undo_rec[redo_point..k]; the oldest one sits at k
+
+   if (state->redo_point <= k) {
+      // if the k'th undo state has characters, clean those up
+      if (state->undo_rec[k].char_storage >= 0) {
+         int n = state->undo_rec[k].insert_length, i;
+         // move the remaining redo character data to the end of the buffer
+         state->redo_char_point += n;
+         STB_TEXTEDIT_memmove(state->undo_char + state->redo_char_point, state->undo_char + state->redo_char_point-n, (size_t) ((STB_TEXTEDIT_UNDOCHARCOUNT - state->redo_char_point)*sizeof(STB_TEXTEDIT_CHARTYPE)));
+         // adjust the position of all the other records to account for above memmove
+         for (i=state->redo_point; i < k; ++i)
+            if (state->undo_rec[i].char_storage >= 0)
+               state->undo_rec[i].char_storage += n;
+      }
+      // now move the surviving redo records [redo_point, k) one slot towards the end of the buffer; only k - redo_point of them exist, and copying one more would write past undo_rec[k]
+      STB_TEXTEDIT_memmove(state->undo_rec + state->redo_point+1, state->undo_rec + state->redo_point, (size_t) ((STB_TEXTEDIT_UNDOSTATECOUNT - state->redo_point - 1)*sizeof(state->undo_rec[0])));
+      // now move redo_point to point to the new one
+      ++state->redo_point;
+   }
+}
+
+// Reserve the next undo record with room for 'numchars' stored characters; returns NULL
+// (after wiping the undo history) when numchars exceeds the whole character buffer.
+static StbUndoRecord *stb_text_create_undo_record(StbUndoState *state, int numchars)
+{
+   // any new undo record invalidates everything that could have been redone
+   stb_textedit_flush_redo(state);
+   // record array full: drop the oldest record to free a slot
+   if (state->undo_point == STB_TEXTEDIT_UNDOSTATECOUNT)
+      stb_textedit_discard_undo(state);
+
+   if (numchars > STB_TEXTEDIT_UNDOCHARCOUNT) {
+      // the characters can never fit, so this edit cannot be undone: reset the undo history
+      state->undo_char_point = 0;
+      state->undo_point = 0;
+      return NULL;
+   }
+
+   // drop the oldest records until the new characters fit in the character buffer
+   while (STB_TEXTEDIT_UNDOCHARCOUNT < state->undo_char_point + numchars)
+      stb_textedit_discard_undo(state);
+   state->undo_point++;
+   return &state->undo_rec[state->undo_point - 1];
+}
+
+// Create an undo record at 'pos' whose undo re-inserts 'insert_len' characters and deletes 'delete_len'.
+// Returns the buffer in which to save the insert_len characters, or NULL if there are none or no record was made.
+static STB_TEXTEDIT_CHARTYPE *stb_text_createundo(StbUndoState *state, int pos, int insert_len, int delete_len)
+{
+   StbUndoRecord *rec = stb_text_create_undo_record(state, insert_len);
+   if (rec == NULL)
+      return NULL;
+   rec->where = pos;
+   rec->delete_length = (STB_TEXTEDIT_POSITIONTYPE) delete_len;
+   rec->insert_length = (STB_TEXTEDIT_POSITIONTYPE) insert_len;
+   if (insert_len != 0) {
+      // claim insert_len slots at the current top of the undo character buffer
+      rec->char_storage = state->undo_char_point;
+      state->undo_char_point += insert_len;
+      return &state->undo_char[rec->char_storage];
+   }
+   rec->char_storage = -1; // no characters stored for this record
+   return NULL;
+}
+
+static void stb_text_undo(STB_TEXTEDIT_STRING *str, STB_TexteditState *state) // apply the newest undo record to str and convert it into a redo record
+{
+   StbUndoState *s = &state->undostate;
+   StbUndoRecord u, *r;
+   if (s->undo_point == 0)
+      return; // nothing to undo
+
+   // we need to do two things: apply the undo record, and create a redo record
+   u = s->undo_rec[s->undo_point-1]; // copied by value, so it stays intact while the redo record is written
+   r = &s->undo_rec[s->redo_point-1]; // the slot just below the current redo records
+   r->char_storage = -1;
+
+   r->insert_length = u.delete_length;
+   r->delete_length = u.insert_length;
+   r->where = u.where;
+
+   if (u.delete_length) {
+      // if the undo record says to delete characters, then the redo record will
+      // need to re-insert the characters that get deleted, so we need to store
+      // them.
+
+      // there are three cases:
+      //    there's enough room to store the characters
+      //    characters stored for *redoing* don't leave room for redo
+      //    characters stored for *undoing* don't leave room for redo
+      // if the last is true, we have to bail
+
+      if (s->undo_char_point + u.delete_length >= STB_TEXTEDIT_UNDOCHARCOUNT) {
+         // the undo records take up too much character space; there's no space to store the redo characters
+         r->insert_length = 0;
+      } else {
+         int i;
+
+         // there's definitely room to store the characters eventually
+         while (s->undo_char_point + u.delete_length > s->redo_char_point) {
+            // should never happen:
+            if (s->redo_point == STB_TEXTEDIT_UNDOSTATECOUNT)
+               return;
+            // there's currently not enough room, so discard a redo record
+            stb_textedit_discard_redo(s);
+         }
+         r = &s->undo_rec[s->redo_point-1]; // re-fetch: discarding redo records moves redo_point
+
+         r->char_storage = s->redo_char_point - u.delete_length; // redo characters are stored just below redo_char_point
+         s->redo_char_point = s->redo_char_point - u.delete_length;
+
+         // now save the characters
+         for (i=0; i < u.delete_length; ++i)
+            s->undo_char[r->char_storage + i] = STB_TEXTEDIT_GETCHAR(str, u.where + i);
+      }
+
+      // now we can carry out the deletion
+      STB_TEXTEDIT_DELETECHARS(str, u.where, u.delete_length);
+   }
+
+   // check type of recorded action:
+   if (u.insert_length) {
+      // easy case: was a deletion, so we need to insert n characters
+      STB_TEXTEDIT_INSERTCHARS(str, u.where, &s->undo_char[u.char_storage], u.insert_length);
+      s->undo_char_point -= u.insert_length; // give back the characters this undo record had stored
+   }
+
+   state->cursor = u.where + u.insert_length; // cursor goes just after any re-inserted text
+
+   s->undo_point--;
+   s->redo_point--;
+}
+
+static void stb_text_redo(STB_TEXTEDIT_STRING *str, STB_TexteditState *state) // re-apply the most recently undone edit and turn its redo record back into an undo record
+{
+   StbUndoState *s = &state->undostate;
+   StbUndoRecord *u, r;
+   if (s->redo_point == STB_TEXTEDIT_UNDOSTATECOUNT)
+      return; // nothing to redo
+
+   // we need to do two things: apply the redo record, and create an undo record
+   u = &s->undo_rec[s->undo_point]; // next free undo slot
+   r = s->undo_rec[s->redo_point]; // copied by value: the newest redo record
+
+   // we KNOW there must be room for the undo record, because the redo record
+   // was derived from an undo record
+
+   u->delete_length = r.insert_length;
+   u->insert_length = r.delete_length;
+   u->where = r.where;
+   u->char_storage = -1;
+
+   if (r.delete_length) {
+      // the redo record requires us to delete characters, so the undo record
+      // needs to store the characters
+
+      if (s->undo_char_point + u->insert_length > s->redo_char_point) { // no free space between the undo and redo characters
+         u->insert_length = 0;
+         u->delete_length = 0;
+      } else {
+         int i;
+         u->char_storage = s->undo_char_point;
+         s->undo_char_point = s->undo_char_point + u->insert_length;
+
+         // now save the characters
+         for (i=0; i < u->insert_length; ++i)
+            s->undo_char[u->char_storage + i] = STB_TEXTEDIT_GETCHAR(str, u->where + i);
+      }
+
+      STB_TEXTEDIT_DELETECHARS(str, r.where, r.delete_length);
+   }
+
+   if (r.insert_length) {
+      // easy case: need to insert n characters
+      STB_TEXTEDIT_INSERTCHARS(str, r.where, &s->undo_char[r.char_storage], r.insert_length);
+      s->redo_char_point += r.insert_length; // give back the characters this redo record had stored
+   }
+
+   state->cursor = r.where + r.insert_length; // cursor goes just after any re-inserted text
+
+   s->undo_point++;
+   s->redo_point++;
+}
+
+static void stb_text_makeundo_insert(STB_TexteditState *state, int where, int length) // record an insertion of 'length' characters at 'where'
+{
+   stb_text_createundo(&state->undostate, where, 0, length); // undoing it only deletes 'length' characters, so no characters are stored
+}
+
+static void stb_text_makeundo_delete(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, int where, int length) // record a deletion, saving the 'length' characters at 'where' so undo can re-insert them
+{
+   STB_TEXTEDIT_CHARTYPE *saved = stb_text_createundo(&state->undostate, where, length, 0);
+   int k;
+   if (!saved)
+      return; // no storage was handed out, so there is nothing to copy
+   for (k=0; k < length; ++k)
+      saved[k] = STB_TEXTEDIT_GETCHAR(str, where+k);
+}
+
+static void stb_text_makeundo_replace(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, int where, int old_length, int new_length) // record a replacement at 'where': save the old_length characters so undo can swap them back for the new_length ones
+{
+   STB_TEXTEDIT_CHARTYPE *saved = stb_text_createundo(&state->undostate, where, old_length, new_length);
+   int k;
+   if (!saved)
+      return; // no storage was handed out, so there is nothing to copy
+   for (k=0; k < old_length; ++k)
+      saved[k] = STB_TEXTEDIT_GETCHAR(str, where+k);
+}
+
+// reset the state to default: no undo/redo history, cursor and selection at 0, flags cleared;
+// single_line is set from is_single_line and the state is marked initialized
+static void stb_textedit_clear_state(STB_TexteditState *state, int is_single_line)
+{
+   // undo bookkeeping starts at the front of its buffers, redo at the back -- both empty
+   state->undostate.undo_point = 0;
+   state->undostate.undo_char_point = 0;
+   state->undostate.redo_point = STB_TEXTEDIT_UNDOSTATECOUNT;
+   state->undostate.redo_char_point = STB_TEXTEDIT_UNDOCHARCOUNT;
+   // cursor at the start of the text with an empty selection
+   state->cursor = state->select_start = state->select_end = 0;
+   state->preferred_x = 0;
+   state->has_preferred_x = state->cursor_at_end_of_line = state->insert_mode = 0;
+   state->row_count_per_page = 0;
+   state->single_line = (unsigned char) is_single_line;
+   state->initialized = 1;
+}
+
+// API initialize: put 'state' into its default condition for a single-line (is_single_line != 0) or multi-line field; forwards to stb_textedit_clear_state
+static void stb_textedit_initialize_state(STB_TexteditState *state, int is_single_line)
+{
+   stb_textedit_clear_state(state, is_single_line);
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+// API paste: takes const text, casts the const away (hence the -Wcast-qual suppression around it) and forwards to stb_textedit_paste_internal
+static int stb_textedit_paste(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, STB_TEXTEDIT_CHARTYPE const *ctext, int len)
+{
+   return stb_textedit_paste_internal(str, state, (STB_TEXTEDIT_CHARTYPE *) ctext, len);
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+
+#endif//STB_TEXTEDIT_IMPLEMENTATION
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_tilemap_editor.h b/lib/stb/stb_tilemap_editor.h
new file mode 100644
index 0000000..fbd3388
--- /dev/null
+++ b/lib/stb/stb_tilemap_editor.h
@@ -0,0 +1,4187 @@
+// stb_tilemap_editor.h - v0.42 - Sean Barrett - http://nothings.org/stb
+// placed in the public domain - not copyrighted - first released 2014-09
+//
+// Embeddable tilemap editor for C/C++
+//
+//
+// TABLE OF CONTENTS
+//    FAQ
+//    How to compile/use the library
+//    Additional configuration macros
+//    API documentation
+//    Info on editing multiple levels
+//    Revision history
+//    Todo
+//    Credits
+//    License
+//
+//
+// FAQ
+//
+//   Q: What counts as a tilemap for this library?
+//
+//   A: An array of rectangles, where each rectangle contains a small
+//      stack of images.
+//
+//   Q: What are the limitations?
+//
+//   A: Maps are limited to 4096x4096 in dimension.
+//      Each map square can only contain a stack of at most 32 images.
+//      A map can only use up to 32768 distinct image tiles.
+//
+//   Q: How do I compile this?
+//
+//   A: You need to #define several symbols before #including it, but only
+//      in one file. This will cause all the function definitions to be
+//      generated in that file. See the "HOW TO COMPILE" section.
+//
+//   Q: What advantages does this have over a standalone editor?
+//
+//   A: For one, you can integrate the editor into your game so you can
+//      flip between editing and testing without even switching windows.
+//      For another, you don't need an XML parser to get at the map data.
+//
+//   Q: Can I live-edit my game maps?
+//
+//   A: Not really, the editor keeps its own map representation.
+//
+//   Q: How do I save and load maps?
+//
+//   A: You have to do this yourself. The editor provides serialization
+//      functions (get & set) for reading and writing the map it holds.
+//      You can choose whatever format you want to store the map to on
+//      disk; you just need to provide functions to convert. (For example,
+//      I actually store the editor's map representation to disk basically
+//      as-is; then I have a single function that converts from the editor
+//      map representation to the game representation, which is used both
+//      to go from editor-to-game and from loaded-map-to-game.)
+//
+//   Q: I want to have tiles change appearance based on what's
+//      adjacent, or other tile-display/substitution trickiness.
+//
+//   A: You can do this when you convert from the editor's map
+//      representation to the game representation, but there's
+//      no way to show this live in the editor.
+//
+//   Q: The editor appears to put map location (0,0) at the top left?
+//      I want to use a different coordinate system in my game (e.g. y
+//      increasing upwards, or origin at the center).
+//
+//   A: You can do this when you convert from the editor's map
+//      representation to the game representation. (Don't forget to
+//      translate link coordinates as well!)
+//
+//   Q: The editor appears to put pixel (0,0) at the top left? I want
+//      to use a different coordinate system in my game.
+//
+//   A: The editor defines an "editor pixel coordinate system" with
+//      (0,0) at the top left and requires you to display things in
+//      that coordinate system. You can freely remap those coordinates
+//      to anything you want on screen.
+//
+//   Q: How do I scale the user interface?
+//
+//   A: Since you do all the rendering, you can scale up all the rendering
+//      calls that the library makes to you. If you do, (a) you need
+//      to also scale up the mouse coordinates, and (b) you may want
+//      to scale the map display back down so that you're only scaling
+//      the UI and not everything. See the next question.
+//
+//   Q: How do I scale the map display?
+//
+//   A: Use stbte_set_spacing() to change the size that the map is displayed
+//      at. Note that the "callbacks" to draw tiles are used for both drawing
+//      the map and drawing the tile palette, so that callback may need to
+//      draw at two different scales. You should choose the scales to match
+//      the spacings passed to stbte_set_spacing(). You can tell them apart
+//      because the tile palette gets NULL for the property pointer.
+//
+//   Q: How does object editing work?
+//
+//   A: One way to think of this is that in the editor, you're placing
+//      spawners, not objects. Each spawner must be tile-aligned, because
+//      it's only a tile editor. Each tile (stack of layers) gets
+//      an associated set of properties, and it's up to you to
+//      determine what properties should appear for a given tile,
+//      based on e.g. the spawners that are in it.
+//
+//   Q: How are properties themselves handled?
+//
+//   A: All properties, regardless of UI behavior, are internally floats.
+//      Each tile has an array of floats associated with it, which is
+//      passed back to you when drawing the tiles so you can draw
+//      objects appropriately modified by the properties.
+//
+//   Q: What if I want to have two different objects/spawners in
+//      one tile, both of which have their own properties?
+//
+//   A: Make sure STBTE_MAX_PROPERTIES is large enough for the sum of
+//      properties in both objects, and then you have to explicitly
+//      map the property slot #s to the appropriate objects. They'll
+//      still all appear in a single property panel; there's no way
+//      to get multiple panels.
+//
+//   Q: Can I do one-to-many linking?
+//
+//   A: The library only supports one link per tile. However, you
+//      can have multiple tiles all link to a single tile. So, you
+//      can fake one-to-many linking by linking in the reverse
+//      direction.
+//
+//   Q: What if I have two objects in the same tile, and they each
+//      need an independent link? Or I have two kinds of link associated
+//      with a single object?
+//
+//   A: There is no way to do this. (Unless you can reverse one link.)
+//
+//   Q: How does cut & paste interact with object properties & links?
+//
+//   A: Currently the library has no idea which properties or links
+//      are associated with which layers of a tile. So currently, the
+//      library will only copy properties & links if the layer panel
+//      is set to allow all layers to be copied, OR if you set the
+//      "props" in the layer panel to "always". Similarly, you can
+//      set "props" to "none" so it will never copy.
+//
+//   Q: What happens if the library gets a memory allocation failure
+//      while I'm editing? Will I lose my work?
+//
+//   A: The library allocates all editor memory when you create
+//      the tilemap. It allocates a maximally-sized map and a
+//      fixed-size undo buffer (and the fixed-size copy buffer
+//      is static), and never allocates memory while it's running.
+//      So it can't fail due to running out of memory.
+//
+//   Q: What happens if the library crashes while I'm editing? Will
+//      I lose my work?
+//
+//   A: Yes. Save often.
+//
+//
+// HOW TO COMPILE
+//
+//   This header file contains both the header file and the
+//   implementation file in one. To create the implementation,
+//   in one source file define a few symbols first and then
+//   include this header:
+//
+//      #define STB_TILEMAP_EDITOR_IMPLEMENTATION
+//      // this triggers the implementation
+//
+//      void STBTE_DRAW_RECT(int x0, int y0, int x1, int y1, unsigned int color);
+//      // this must draw a filled rectangle (exclusive on right/bottom)
+//      // color = (r<<16)|(g<<8)|(b)
+//
+//      void STBTE_DRAW_TILE(int x0, int y0,
+//                    unsigned short id, int highlight, float *data);
+//      // this draws the tile image identified by 'id' in one of several
+//      // highlight modes (see STBTE_drawmode_* in the header section);
+//      // if 'data' is NULL, it's drawing the tile in the palette; if 'data'
+//      // is not NULL, it's drawing a tile on the map, and that is the data
+//      // associated with that map tile
+//
+//      #include "stb_tilemap_editor.h"
+//
+//   Optionally you can define the following functions before the include;
+//   note these must be macros (but they can just call a function) so
+//   this library can #ifdef to detect if you've defined them:
+//
+//      #define STBTE_PROP_TYPE(int n, short *tiledata, float *params) ...
+//      // Returns the type of the n'th property of a given tile, which
+//      // controls how it is edited. Legal types are:
+//      //     0                    /* no editable property in this slot */
+//      //     STBTE_PROP_int       /* uses a slider to adjust value     */
+//      //     STBTE_PROP_float     /* uses a weird multi-axis control   */
+//      //     STBTE_PROP_bool      /* uses a checkbox to change value   */
+//      // And you can bitwise-OR in the following flags:
+//      //     STBTE_PROP_disabled
+//      // Note that all of these are stored as floats in the param array.
+//      // The integer slider is limited in precision based on the space
+//      // available on screen, so for wide-ranged integers you may want
+//      // to use floats instead.
+//      //
+//      // Since the tiledata is passed to you, you can choose which property
+//      // is bound to that slot based on that data.
+//      //
+//      // Changing the type of a parameter does not cause the underlying
+//      // value to be clamped to the type min/max except when the tile is
+//      // explicitly selected.
+//
+//      #define STBTE_PROP_NAME(int n, short *tiledata, float *params) ...
+//      // these return a string with the name for slot #n in the float
+//      // property list for the tile.
+//
+//      #define STBTE_PROP_MIN(int n, short *tiledata, float *params) ...
+//      #define STBTE_PROP_MAX(int n, short *tiledata, float *params) ...
+//      // These return the allowable range for the property values for
+//      // the specified slot. They are never called for boolean types.
+//
+//      #define STBTE_PROP_FLOAT_SCALE(int n, short *tiledata, float *params)
+//      // This rescales the float control for a given property; by default
+//      // left mouse drags add integers, right mouse drags add fractions,
+//      // but you can rescale this per-property.
+//
+//      #define STBTE_FLOAT_CONTROL_GRANULARITY       ... value ...
+//      // This returns the number of pixels of mouse motion necessary
+//      // to advance the object float control. Default is 4
+//
+//      #define STBTE_ALLOW_LINK(short *src, float *src_data,  \
+//                               short *dest, float *dest_data) ...your code...
+//      // this returns true or false depending on whether you allow a link
+//      // to be drawn from a tile 'src' to a tile 'dest'. if you don't
+//      // define this, linking will not be supported
+//
+//      #define STBTE_LINK_COLOR(short *src, float *src_data,  \
+//                               short *dest, float *dest_data) ...your code...
+//      // return a color encoded as a 24-bit unsigned integer in the
+//      // form 0xRRGGBB. If you don't define this, default colors will
+//      // be used.
+//
+//
+//      [[ support for those below is not implemented yet ]]
+//
+//      #define STBTE_HITTEST_TILE(x0,y0,id,mx,my)   ...your code here...
+//      // this returns true or false depending on whether the mouse
+//      // pointer at mx,my is over (touching) a tile of type 'id'
+//      // displayed at x0,y0. Normally stb_tilemap_editor just does
+//      // this hittest based on the tile geometry, but if you have
+//      // tiles whose images extend out of the tile, you'll need this.
+//
+// ADDITIONAL CONFIGURATION
+//
+//   The following symbols set static limits which determine how much
+//   memory will be allocated for the editor. You can override them
+//   by making similar definitions, but memory usage will increase.
+//
+//      #define STBTE_MAX_TILEMAP_X      200   // max 4096
+//      #define STBTE_MAX_TILEMAP_Y      200   // max 4096
+//      #define STBTE_MAX_LAYERS         8     // max 32
+//      #define STBTE_MAX_CATEGORIES     100
+//      #define STBTE_UNDO_BUFFER_BYTES  (1 << 24) // 16 MB
+//      #define STBTE_MAX_COPY           65536  // e.g. 256x256
+//      #define STBTE_MAX_PROPERTIES     10     // max properties per tile
+//
+// API
+//
+//   Further documentation appears in the header-file section below.
+//
+// EDITING MULTIPLE LEVELS
+//
+//   You can only have one active editor instance. To switch between multiple
+//   levels, you can either store the levels in your own format and copy them
+//   in and out of the editor format, or you can create multiple stbte_tilemap
+//   objects and switch between them. The latter has the advantage that each
+//   stbte_tilemap keeps its own undo state. (The clipboard is global, so
+//   either approach allows cut&pasting between levels.)
+//
+// REVISION HISTORY
+//   0.42  fix compilation errors
+//   0.41  fix warnings
+//   0.40  fix warning
+//   0.39  fix warning
+//   0.38  fix warning
+//   0.37  fix warning
+//   0.36  minor compiler support
+//   0.35  layername button changes
+//          - layername buttons grow with the layer panel
+//          - fix stbte_create_map being declared as stbte_create
+//          - fix declaration of stbte_create_map
+//   0.30  properties release
+//          - properties panel for editing user-defined "object" properties
+//          - can link each tile to one other tile
+//          - keyboard interface
+//          - fix eraser tool bug (worked in complex cases, failed in simple)
+//          - undo/redo tools have visible disabled state
+//          - tiles on higher layers draw on top of adjacent lower-layer tiles
+//   0.20  erasable release
+//          - eraser tool
+//          - fix bug when pasting into protected layer
+//          - better color scheme
+//          - internal-use color picker
+//   0.10  initial release
+//
+// TODO
+//
+//   Separate scroll state for each category
+//   Implement paint bucket
+//   Support STBTE_HITTEST_TILE above
+//  ?Cancel drags by clicking other button? - may be fixed
+//   Finish support for toolbar at side
+//
+// CREDITS
+//
+//
+//   Main editor & features
+//      Sean Barrett
+//   Additional features:
+//      Josh Huelsman
+//   Bugfixes:
+//      Ryan Whitworth
+//      Eugene Opalev
+//      Rob Loach
+//      github:wernsey
+//
+// LICENSE
+//
+//   See end of file for license information.
+
+
+
+///////////////////////////////////////////////////////////////////////
+//
+//   HEADER SECTION
+
+#ifndef STB_TILEMAP_INCLUDE_STB_TILEMAP_EDITOR_H
+#define STB_TILEMAP_INCLUDE_STB_TILEMAP_EDITOR_H
+
+#ifdef _WIN32   // Windows builds only
+  #ifndef _CRT_SECURE_NO_WARNINGS   // keep any definition the includer already made
+  #define _CRT_SECURE_NO_WARNINGS
+  #endif
+  #include <stdlib.h>
+  #include <stdio.h>
+#endif // _WIN32
+
+typedef struct stbte_tilemap stbte_tilemap;   // forward declaration: the API below only passes pointers to it
+
+// drawmodes passed as the 'highlight' argument of STBTE_DRAW_TILE
+enum
+{
+   STBTE_drawmode_deemphasize = -1,
+   STBTE_drawmode_normal      =  0,
+   STBTE_drawmode_emphasize   =  1,
+};
+
+// property types returned by STBTE_PROP_TYPE; STBTE_PROP_disabled is a flag that may be bitwise-ORed onto a type
+#define STBTE_PROP_none     0
+#define STBTE_PROP_int      1
+#define STBTE_PROP_float    2
+#define STBTE_PROP_bool     3
+#define STBTE_PROP_disabled 4
+
+////////
+//
+// creation
+//
+
+extern stbte_tilemap *stbte_create_map(int map_x, int map_y, int map_layers, int spacing_x, int spacing_y, int max_tiles);
+// create an editable tilemap
+//   map_x      : dimensions of map horizontally (user can change this in editor), <= STBTE_MAX_TILEMAP_X
+//   map_y      : dimensions of map vertically (user can change this in editor),   <= STBTE_MAX_TILEMAP_Y
+//   map_layers : number of layers to use (fixed), <= STBTE_MAX_LAYERS
+//   spacing_x  : initial horizontal distance between left edges of map tiles in stb_tilemap_editor pixels
+//   spacing_y  : initial vertical distance between top edges of map tiles in stb_tilemap_editor pixels
+//   max_tiles  : maximum number of tiles that can be defined
+//
+// If insufficient memory, returns NULL
+
+extern void stbte_define_tile(stbte_tilemap *tm, unsigned short id, unsigned int layermask, const char * category);
+// call this once per tile to install the tile definitions into the editable tilemap
+//   tm        : tilemap created by stbte_create_map
+//   id        : unique identifier for each tile, 0 <= id < 32768
+//   layermask : bitmask of which layers tile is allowed on: 1 = layer 0, 255 = layers 0..7
+//               (note that onscreen, the editor numbers the layers from 1 not 0)
+//               layer 0 is the furthest back, layer 1 is just in front of layer 0, etc
+//   category  : string naming the category this tile is grouped in
+
+extern void stbte_set_display(int x0, int y0, int x1, int y1);
+// call this once to set the size (x0,y0,x1,y1 are presumably the corners of the display area - confirm); if you resize, call it again
+
+
+/////////
+//
+// every frame
+//
+
+extern void stbte_draw(stbte_tilemap *tm);
+// stbte_tick's second argument is the time elapsed since the previous frame, in seconds
+extern void stbte_tick(stbte_tilemap *tm, float time_in_seconds_since_last_frame);
+
+////////////
+//
+//  user input
+//
+
+// if you're using SDL, pass every SDL_MOUSEMOTION, SDL_MOUSEBUTTONDOWN, SDL_MOUSEBUTTONUP and SDL_MOUSEWHEEL event to the next function;
+// the scale and offset arguments let you transform SDL mouse coords into stb_tilemap_editor coords
+extern void stbte_mouse_sdl(stbte_tilemap *tm, const void *sdl_event, float xscale, float yscale, int xoffset, int yoffset);
+
+// otherwise, hook these up explicitly:
+extern void stbte_mouse_move(stbte_tilemap *tm, int x, int y, int shifted, int scrollkey);
+extern void stbte_mouse_button(stbte_tilemap *tm, int x, int y, int right, int down, int shifted, int scrollkey);
+extern void stbte_mouse_wheel(stbte_tilemap *tm, int x, int y, int vscroll);
+
+// note: at the moment, mouse wheel events (SDL_MOUSEWHEEL) are ignored.
+
+// keyboard support is optional (every feature is reachable with the mouse):
+// map your own keys onto the actions below and forward them with stbte_action()
+enum stbte_action
+{
+   STBTE_tool_select      =  0,
+   STBTE_tool_brush       =  1,
+   STBTE_tool_erase       =  2,
+   STBTE_tool_rectangle   =  3,
+   STBTE_tool_eyedropper  =  4,
+   STBTE_tool_link        =  5,
+   STBTE_act_toggle_grid  =  6,
+   STBTE_act_toggle_links =  7,
+   STBTE_act_undo         =  8,
+   STBTE_act_redo         =  9,
+   STBTE_act_cut          = 10,
+   STBTE_act_copy         = 11,
+   STBTE_act_paste        = 12,
+   STBTE_scroll_left      = 13,
+   STBTE_scroll_right     = 14,
+   STBTE_scroll_up        = 15,
+   STBTE_scroll_down      = 16,
+};
+extern void stbte_action(stbte_tilemap *tm, enum stbte_action act);   // apply one enum stbte_action value to tilemap tm
+
+////////////////
+//
+//  save/load
+//
+//  There is no editor file format. You have to save and load the data yourself
+//  through the following functions. You can also use these functions to get the
+//  data to generate game-formatted levels directly. (But make sure you save
+//  first! You may also want to autosave to a temp file periodically, etc etc.)
+
+#define STBTE_EMPTY    -1   // value reported for a layer slot that holds no tile
+
+extern void stbte_get_dimensions(stbte_tilemap *tm, int *max_x, int *max_y);
+// get the current dimensions of the level into *max_x,*max_y, since the user can change them
+
+extern short* stbte_get_tile(stbte_tilemap *tm, int x, int y);
+// returns an array of shorts that is 'map_layers' in length. each short is
+// either one of the tile id values passed to stbte_define_tile, or STBTE_EMPTY.
+
+extern float *stbte_get_properties(stbte_tilemap *tm, int x, int y);
+// get the property array associated with the tile at x,y. this is an
+// array of floats that is STBTE_MAX_PROPERTIES in length; you have to
+// interpret the slots according to the semantics you've chosen
+
+extern void stbte_get_link(stbte_tilemap *tm, int x, int y, int *destx, int *desty);
+// gets the link associated with the tile at x,y into *destx,*desty.
+
+extern void stbte_set_dimensions(stbte_tilemap *tm, int max_x, int max_y);
+// set the dimensions of the level; this overrides the values given to
+// stbte_create_map() and anything the user has changed
+
+extern void stbte_clear_map(stbte_tilemap *tm);
+// clears the map, including the region outside the defined region, so if the
+// user expands the map, they won't see garbage there
+
+extern void stbte_set_tile(stbte_tilemap *tm, int x, int y, int layer, signed short tile);
+// tile is a tile id you passed to stbte_define_tile, or STBTE_EMPTY
+
+extern void stbte_set_property(stbte_tilemap *tm, int x, int y, int n, float val);
+// set the value of the n'th property slot of the tile at x,y
+
+extern void stbte_set_link(stbte_tilemap *tm, int x, int y, int destx, int desty);
+// set a link going from x,y to destx,desty. to force no link,
+// use destx=desty=-1
+
+////////
+//
+// optional
+//
+
+extern void stbte_set_background_tile(stbte_tilemap *tm, short id);
+// selects the tile used to fill the bottom layer and to clear bottom-layer tiles to;
+// NOTE(review): the original note stopped mid-sentence ("should be same ID as") - presumably an id given to stbte_define_tile; confirm
+
+extern void stbte_set_sidewidths(int left, int right);
+// call this once to set the left & right side widths. don't call
+// it again since the user can change it
+
+extern void stbte_set_spacing(stbte_tilemap *tm, int spacing_x, int spacing_y, int palette_spacing_x, int palette_spacing_y);
+// call this to set the spacing of map tiles and the spacing of palette tiles.
+// if you rescale your display, call it again (e.g. you can implement map zooming yourself)
+
+extern void stbte_set_layername(stbte_tilemap *tm, int layer, const char *layername);
+// sets a string name for your layer that shows in the layer selector. note that this
+// makes the layer selector wider. 'layer' is from 0..(map_layers-1)
+
+#endif
+
+#ifdef STB_TILEMAP_EDITOR_IMPLEMENTATION
+
+#ifndef STBTE_ASSERT   // define STBTE_ASSERT before including this file to supply your own assert
+#define STBTE_ASSERT assert
+#include <assert.h>
+#endif
+
+#ifdef _MSC_VER   // STBTE__NOTUSED(v) mentions v in an expression whose result is discarded
+#define STBTE__NOTUSED(v)  (void)(v)
+#else             // non-MSVC variant wraps v in sizeof
+#define STBTE__NOTUSED(v)  (void)sizeof(v)
+#endif
+
+#ifndef STBTE_MAX_TILEMAP_X   // each limit in this group is a default, used only when the includer has not defined it
+#define STBTE_MAX_TILEMAP_X      200
+#endif
+
+#ifndef STBTE_MAX_TILEMAP_Y   // X and Y are both checked against 4096 by an #error test further down
+#define STBTE_MAX_TILEMAP_Y      200
+#endif
+
+#ifndef STBTE_MAX_LAYERS      // checked against 32 by an #error test further down
+#define STBTE_MAX_LAYERS         8
+#endif
+
+#ifndef STBTE_MAX_CATEGORIES
+#define STBTE_MAX_CATEGORIES     100
+#endif
+
+#ifndef STBTE_MAX_COPY        // presumably the copy-buffer capacity in tiles - confirm
+#define STBTE_MAX_COPY           65536
+#endif
+
+#ifndef STBTE_UNDO_BUFFER_BYTES
+#define STBTE_UNDO_BUFFER_BYTES  (1 << 24) // 16 MB
+#endif
+
+#ifndef STBTE_PROP_TYPE   // no user hook: define STBTE__NO_PROPS and report 0 (no editable property) for every slot
+#define STBTE__NO_PROPS
+#define STBTE_PROP_TYPE(n,td,tp)   0
+#endif
+
+#ifndef STBTE_PROP_NAME   // default property name is the empty string
+#define STBTE_PROP_NAME(n,td,tp)  ""
+#endif
+
+#ifndef STBTE_MAX_PROPERTIES
+#define STBTE_MAX_PROPERTIES           10
+#endif
+
+#ifndef STBTE_PROP_MIN   // note: the MIN/MAX defaults take three arguments (n, tiledata, params)
+#define STBTE_PROP_MIN(n,td,tp)  0
+#endif
+
+#ifndef STBTE_PROP_MAX
+#define STBTE_PROP_MAX(n,td,tp)  100.0
+#endif
+
+#ifndef STBTE_PROP_FLOAT_SCALE
+#define STBTE_PROP_FLOAT_SCALE(n,td,tp)  1   // default scale size
+#endif
+
+#ifndef STBTE_FLOAT_CONTROL_GRANULARITY   // pixels of mouse motion needed to advance the float control
+#define STBTE_FLOAT_CONTROL_GRANULARITY 4
+#endif
+
+
+#define STBTE__UNDO_BUFFER_COUNT  (STBTE_UNDO_BUFFER_BYTES>>1)   // half the configured byte count
+// compile-time validation of the configurable limits
+#if STBTE_MAX_TILEMAP_X > 4096 || STBTE_MAX_TILEMAP_Y > 4096
+#error "Maximum editable map size is 4096 x 4096"
+#endif
+#if STBTE_MAX_LAYERS > 32
+#error "Maximum layers allowed is 32"
+#endif
+#if STBTE__UNDO_BUFFER_COUNT & (STBTE__UNDO_BUFFER_COUNT-1)   // x & (x-1) is nonzero unless x is 0 or a power of 2
+#error "Undo buffer size must be a power of 2"
+#endif
+
+#if STBTE_MAX_PROPERTIES == 0   // zero properties requested: handled the same as having no property hook
+#define STBTE__NO_PROPS
+#endif
+
+#ifdef STBTE__NO_PROPS   // properties unused: replace the configured count with 1
+#undef STBTE_MAX_PROPERTIES
+#define STBTE_MAX_PROPERTIES 1  // so we can declare arrays
+#endif
+
+typedef struct   // a pair of shorts; presumably the tile coordinates a link points to (cf. stbte_get_link) - confirm
+{
+   short x,y;
+} stbte__link;
+
+enum   // color aspects: middle index of stbte__color_table
+{
+   STBTE__base              = 0,
+   STBTE__outline           = 1,
+   STBTE__text              = 2,
+   // number of color aspects listed above
+   STBTE__num_color_aspects = 3,
+};
+
+enum   // color states: last index of stbte__color_table
+{
+   STBTE__idle             = 0,
+   STBTE__over             = 1,
+   STBTE__down             = 2,
+   STBTE__over_down        = 3,
+   STBTE__selected         = 4,
+   STBTE__selected_over    = 5,
+   STBTE__disabled         = 6,
+   STBTE__num_color_states = 7,
+};
+
+enum   // color modes: first index of stbte__color_table
+{
+   STBTE__cexpander        =  0,
+   STBTE__ctoolbar         =  1,
+   STBTE__ctoolbar_button  =  2,
+   STBTE__cpanel           =  3,
+   STBTE__cpanel_sider     =  4,
+   STBTE__cpanel_sizer     =  5,
+   STBTE__cscrollbar       =  6,
+   STBTE__cmapsize         =  7,
+   STBTE__clayer_button    =  8,
+   STBTE__clayer_hide      =  9,
+   STBTE__clayer_lock      = 10,
+   STBTE__clayer_solo      = 11,
+   STBTE__ccategory_button = 12,
+   // number of color modes listed above
+   STBTE__num_color_modes  = 13,
+};
+
+#ifdef STBTE__COLORPICKER
+static char *stbte__color_names[] =   // one display label per STBTE__c* color mode (13 entries, same order as the enum)
+{
+   "expander", "toolbar", "tool button", "panel",
+   "panel c1", "panel c2", "scrollbar", "map button",
+   "layer", "hide", "lock", "solo",
+   "category",
+};
+#endif // STBTE__COLORPICKER
+
+// columns (state): idle, over, down, over&down, selected, sel&over, disabled; rows in each group (aspect): base, outline, text
+static int stbte__color_table[STBTE__num_color_modes][STBTE__num_color_aspects][STBTE__num_color_states] =
+{
+   { // STBTE__cexpander
+      { 0x000000, 0x84987c, 0xdcdca8, 0xdcdca8, 0x40c040, 0x60d060, 0x505050, },
+      { 0xa4b090, 0xe0ec80, 0xffffc0, 0xffffc0, 0x80ff80, 0x80ff80, 0x606060, },
+      { 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0x909090, },
+   }, { // STBTE__ctoolbar
+      { 0x808890, 0x606060, 0x606060, 0x606060, 0x606060, 0x606060, 0x606060, },
+      { 0x605860, 0x606060, 0x606060, 0x606060, 0x606060, 0x606060, 0x606060, },
+      { 0x000000, 0x000000, 0x000000, 0x000000, 0x000000, 0x000000, 0x000000, },
+   }, { // STBTE__ctoolbar_button
+      { 0x3c5068, 0x7088a8, 0x647488, 0x94b4dc, 0x8890c4, 0x9caccc, 0x404040, },
+      { 0x889cb8, 0x889cb8, 0x889cb8, 0x889cb8, 0x84c4e8, 0xacc8ff, 0x0c0c08, },
+      { 0xbcc4cc, 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0x707074, },
+   }, { // STBTE__cpanel
+      { 0x403848, 0x403010, 0x403010, 0x403010, 0x403010, 0x403010, 0x303024, },
+      { 0x68546c, 0xc08040, 0xc08040, 0xc08040, 0xc08040, 0xc08040, 0x605030, },
+      { 0xf4e4ff, 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0x909090, },
+   }, { // STBTE__cpanel_sider
+      { 0xb4b04c, 0xacac60, 0xc0ffc0, 0xc0ffc0, 0x40c040, 0x60d060, 0x505050, },
+      { 0xa0a04c, 0xd0d04c, 0xffff80, 0xffff80, 0x80ff80, 0x80ff80, 0x606060, },
+      { 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0x909090, },
+   }, { // STBTE__cpanel_sizer
+      { 0x40c440, 0x60d060, 0xc0ffc0, 0xc0ffc0, 0x40c040, 0x60d060, 0x505050, },
+      { 0x40c040, 0x80ff80, 0x80ff80, 0x80ff80, 0x80ff80, 0x80ff80, 0x606060, },
+      { 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0x909090, },
+   }, { // STBTE__cscrollbar
+      { 0x9090ac, 0xa0a0b8, 0xbcb8cc, 0xbcb8cc, 0x909040, 0x909040, 0x909040, },
+      { 0xa0a0b8, 0xb0b4d0, 0xa0a0b8, 0xa0a0b8, 0xa0a050, 0xa0a050, 0xa0a050, },
+      { 0x808088, 0x808030, 0x808030, 0x808030, 0x808030, 0x808030, 0x808030, },
+   }, { // STBTE__cmapsize
+      { 0x704c70, 0x885c8c, 0x9c68a4, 0xb870bc, 0xb490bc, 0xb490bc, 0x302828, },
+      { 0x646064, 0xcca8d4, 0xc060c0, 0xa07898, 0xe0b8e0, 0xe0b8e0, 0x403838, },
+      { 0xdccce4, 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0x909090, },
+   }, { // STBTE__clayer_button
+      { 0x704c70, 0x885c8c, 0x9c68a4, 0xb870bc, 0xb490bc, 0xb490bc, 0x302828, },
+      { 0xb09cb4, 0xcca8d4, 0xc060c0, 0xa07898, 0xe0b8e0, 0xe0b8e0, 0x403838, },
+      { 0xdccce4, 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0xffffff, 0x909090, },
+   }, { // STBTE__clayer_hide
+      { 0x646494, 0x888cb8, 0xb0b0b0, 0xb0b0cc, 0x9c9cf4, 0x8888b0, 0x50506c, },
+      { 0x9090a4, 0xb0b4d4, 0xb0b0dc, 0xb0b0cc, 0xd0d0fc, 0xd0d4f0, 0x606060, },
+      { 0xb4b4d4, 0xe4e4ff, 0xffffff, 0xffffff, 0xe0e4ff, 0xececff, 0x909090, },
+   }, { // STBTE__clayer_lock
+      { 0x646444, 0x888c64, 0xb0b0b0, 0xb0b088, 0xaca858, 0x88886c, 0x505050, },
+      { 0x88886c, 0xb0b490, 0xb0b0b0, 0xb0b088, 0xd8d898, 0xd0d4b0, 0x606060, },
+      { 0xb4b49c, 0xffffd8, 0xffffff, 0xffffd4, 0xffffdc, 0xffffcc, 0x909090, },
+   }, { // STBTE__clayer_solo
+      { 0x906464, 0xb48c8c, 0xd4b0b0, 0xdcb0b0, 0xff9c9c, 0xc88888, 0x505050, },
+      { 0xb47c80, 0xd4b4b8, 0xc4a8a8, 0xdcb0b0, 0xffc0c0, 0xfce8ec, 0x606060, },
+      { 0xe0b4b4, 0xffdcd8, 0xffd8d4, 0xffe0e4, 0xffece8, 0xffffff, 0x909090, },
+   }, { // STBTE__ccategory_button
+      { 0x403848, 0x403848, 0x403848, 0x886894, 0x7c80c8, 0x7c80c8, 0x302828, },
+      { 0x403848, 0x403848, 0x403848, 0x403848, 0x7c80c8, 0x7c80c8, 0x403838, },
+      { 0xc8c4c8, 0xffffff, 0xffffff, 0xffffff, 0xe8e8ec, 0xffffff, 0x909090, },
+   },
+};
+
+#define STBTE_COLOR_TILEMAP_BACKGROUND      0x000000   // the eight STBTE_COLOR_* values are plain defines (no #ifndef guard)
+#define STBTE_COLOR_TILEMAP_BORDER          0x203060
+#define STBTE_COLOR_TILEMAP_HIGHLIGHT       0xffffff
+#define STBTE_COLOR_GRID                    0x404040
+#define STBTE_COLOR_SELECTION_OUTLINE1      0xdfdfdf
+#define STBTE_COLOR_SELECTION_OUTLINE2      0x303030
+#define STBTE_COLOR_TILEPALETTE_OUTLINE     0xffffff
+#define STBTE_COLOR_TILEPALETTE_BACKGROUND  0x000000
+// link colors: defaults that apply only when the includer has not defined the macro
+#ifndef STBTE_LINK_COLOR
+#define STBTE_LINK_COLOR(src,sp,dest,dp)    0x5030ff
+#endif
+
+#ifndef STBTE_LINK_COLOR_DRAWING
+#define STBTE_LINK_COLOR_DRAWING            0xff40ff
+#endif
+
+#ifndef STBTE_LINK_COLOR_DISALLOWED
+#define STBTE_LINK_COLOR_DISALLOWED         0x602060
+#endif
+
+
+// indexed [disabled][selected][down][over]; each entry is one of the STBTE__* color states
+static unsigned char stbte__state_to_index[2][2][2][2] =
+{
+   {   // disabled == 0
+      { { STBTE__idle    , STBTE__over          }, { STBTE__down    , STBTE__over_down }, },
+      { { STBTE__selected, STBTE__selected_over }, { STBTE__down    , STBTE__over_down }, },
+   },{ // disabled == 1: selected with down == 0 still maps to the selected states
+      { { STBTE__disabled, STBTE__disabled      }, { STBTE__disabled, STBTE__disabled  }, },
+      { { STBTE__selected, STBTE__selected_over }, { STBTE__disabled, STBTE__disabled  }, },
+   }
+};
+#define STBTE__INDEX_FOR_STATE(disable,select,down,over) stbte__state_to_index[disable][select][down][over]
+#define STBTE__INDEX_FOR_ID(id,disable,select) STBTE__INDEX_FOR_STATE(disable,select,STBTE__IS_ACTIVE(id),STBTE__IS_HOT(id))
+
+#define STBTE__FONT_HEIGHT    9   // presumably the glyph height in editor pixels - confirm
+static short stbte__font_offset[95+16];   // 111 entries; presumably 95 printable ASCII glyphs plus 16 icon glyphs - confirm
+static short stbte__fontdata[769] =   // NOTE(review): the first 95+16 values look like per-glyph widths - confirm against the code that fills stbte__font_offset
+{
+   4,9,6,9,9,9,9,8,9,8,4,9,7,7,7,7,4,2,6,8,6,6,7,3,4,4,8,6,3,6,2,6,6,6,6,6,6,
+   6,6,6,6,6,2,3,5,4,5,6,6,6,6,6,6,6,6,6,6,6,6,7,6,7,7,7,6,7,6,6,6,6,7,7,6,6,
+   6,4,6,4,7,7,3,6,6,5,6,6,5,6,6,4,5,6,4,7,6,6,6,6,6,6,6,6,6,7,6,6,6,5,2,5,8,
+   0,0,0,0,2,253,130,456,156,8,72,184,64,2,125,66,64,160,64,146,511,146,146,
+   511,146,146,511,146,511,257,341,297,341,297,341,257,511,16,56,124,16,16,16,
+   124,56,16,96,144,270,261,262,136,80,48,224,192,160,80,40,22,14,15,3,448,496,
+   496,240,232,20,10,5,2,112,232,452,450,225,113,58,28,63,30,60,200,455,257,
+   257,0,0,0,257,257,455,120,204,132,132,159,14,4,4,14,159,132,132,204,120,8,
+   24,56,120,56,24,8,32,48,56,60,56,48,32,0,0,0,0,111,111,7,7,0,0,7,7,34,127,
+   127,34,34,127,127,34,36,46,107,107,58,18,99,51,24,12,102,99,48,122,79,93,
+   55,114,80,4,7,3,62,127,99,65,65,99,127,62,8,42,62,28,28,62,42,8,8,8,62,62,
+   8,8,128,224,96,8,8,8,8,8,8,96,96,96,48,24,12,6,3,62,127,89,77,127,62,64,66,
+   127,127,64,64,98,115,89,77,71,66,33,97,73,93,119,35,24,28,22,127,127,16,39,
+   103,69,69,125,57,62,127,73,73,121,48,1,1,113,121,15,7,54,127,73,73,127,54,
+   6,79,73,105,63,30,54,54,128,246,118,8,28,54,99,65,20,20,20,20,65,99,54,28,
+   8,2,3,105,109,7,2,30,63,33,45,47,46,124,126,19,19,126,124,127,127,73,73,127,
+   54,62,127,65,65,99,34,127,127,65,99,62,28,127,127,73,73,73,65,127,127,9,9,
+   9,1,62,127,65,73,121,121,127,127,8,8,127,127,65,65,127,127,65,65,32,96,64,
+   64,127,63,127,127,8,28,54,99,65,127,127,64,64,64,64,127,127,6,12,6,127,127,
+   127,127,6,12,24,127,127,62,127,65,65,65,127,62,127,127,9,9,15,6,62,127,65,
+   81,49,127,94,127,127,9,25,127,102,70,79,73,73,121,49,1,1,127,127,1,1,63,127,
+   64,64,127,63,15,31,48,96,48,31,15,127,127,48,24,48,127,127,99,119,28,28,119,
+   99,7,15,120,120,15,7,97,113,89,77,71,67,127,127,65,65,3,6,12,24,48,96,65,
+   65,127,127,8,12,6,3,6,12,8,64,64,64,64,64,64,64,3,7,4,32,116,84,84,124,120,
+   127,127,68,68,124,56,56,124,68,68,68,56,124,68,68,127,127,56,124,84,84,92,
+   24,8,124,126,10,10,56,380,324,324,508,252,127,127,4,4,124,120,72,122,122,
+   64,256,256,256,506,250,126,126,16,56,104,64,66,126,126,64,124,124,24,56,28,
+   124,120,124,124,4,4,124,120,56,124,68,68,124,56,508,508,68,68,124,56,56,124,
+   68,68,508,508,124,124,4,4,12,8,72,92,84,84,116,36,4,4,62,126,68,68,60,124,
+   64,64,124,124,28,60,96,96,60,28,28,124,112,56,112,124,28,68,108,56,56,108,
+   68,284,316,352,320,508,252,68,100,116,92,76,68,8,62,119,65,65,127,127,65,
+   65,119,62,8,16,24,12,12,24,24,12,4,
+};
+
+typedef struct   // per-tile record; id, category and layermask mirror the stbte_define_tile() arguments (presumably filled from them - confirm)
+{
+   short id;
+   unsigned short category_id;   // presumably a numeric handle for 'category' - confirm
+   char *category;
+   unsigned int layermask;
+} stbte__tileinfo;
+
+#define MAX_LAYERMASK    (1 << (8*sizeof(unsigned int)))   // NOTE(review): shifts an int by the full bit width of unsigned int, which C leaves undefined; no use is visible here - confirm before relying on it
+
+typedef short stbte__tiledata;   // same element type that stbte_get_tile() returns a pointer to
+
+#define STBTE__NO_TILE   -1   // same value as the public STBTE_EMPTY
+
+enum   // panel ids
+{
+   STBTE__panel_toolbar    = 0,
+   STBTE__panel_colorpick  = 1,
+   STBTE__panel_info       = 2,
+   STBTE__panel_layers     = 3,
+   STBTE__panel_props      = 4,
+   STBTE__panel_categories = 5,
+   STBTE__panel_tiles      = 6,
+   // number of panel ids listed above
+   STBTE__num_panel        = 7,
+};
+
+enum  // placement of a panel; stored in stbte__panel.side
+{
+   STBTE__side_left,    // default side assigned by stbte__init_gui
+   STBTE__side_right,
+   STBTE__side_top,
+   STBTE__side_bottom,
+};
+
+enum  // tool ids
+{
+   STBTE__tool_select,
+   STBTE__tool_brush,   // initial value of stbte__ui.tool
+   STBTE__tool_erase,
+   STBTE__tool_rect,
+   STBTE__tool_eyedrop,
+   STBTE__tool_fill,
+   STBTE__tool_link,
+
+   STBTE__tool_showgrid,
+   STBTE__tool_showlinks,
+
+   STBTE__tool_undo,
+   STBTE__tool_redo,
+   // copy/cut/paste aren't included here because they're displayed differently
+
+   STBTE__num_tool,     // number of tools listed above
+};
+
+// icons are stored in the 0-31 range of ASCII in the font
+static int toolchar[] = { 26,24,25,20,23,22,18, 19,17, 29,28, };  // presumably one glyph code per STBTE__tool_* entry, in enum order - confirm
+
+enum  // values for stbte_tilemap.propmode
+{
+   STBTE__propmode_default,  // value 0, which is what stbte_create_map stores
+   STBTE__propmode_always,
+   STBTE__propmode_never,
+};
+
+enum  // values for stbte__ui.event
+{
+   STBTE__paint,        // widgets draw themselves only for this event
+
+   // from here down does hittesting
+   STBTE__tick,         // stbte__hittest updates next_hot_id for events >= this one
+   STBTE__mousemove,
+   STBTE__mousewheel,
+   STBTE__leftdown,
+   STBTE__leftup,
+   STBTE__rightdown,
+   STBTE__rightup,
+};
+
+typedef struct  // layout state of one UI panel
+{
+   int expanded, mode;   // expanded is set to 1 for every panel by stbte__init_gui
+   int delta_height;     // number of rows they've requested for this
+   int side;             // one of STBTE__side_*
+   int width,height;
+   int x0,y0;
+} stbte__panel;
+
+typedef struct  // a coloured rectangle; queued by stbte__draw_frame_delayed
+{
+   int x0,y0,x1,y1,color;
+} stbte__colorrect;
+
+#define STBTE__MAX_DELAYRECT 256
+
+typedef struct  // transient editor UI state; one global instance (stbte__ui) is defined below
+{
+   int tool, active_event;              // active_event: event that was current when stbte__activate ran
+   int active_id, hot_id, next_hot_id;  // active_id 0 means no widget is active; next_hot_id is set by stbte__hittest
+   int event;                           // event being processed (STBTE__paint, STBTE__tick, ...)
+   int mx,my, dx,dy;                    // mx,my: mouse position compared by stbte__hittest
+   int ms_time;
+   int shift, scrollkey;
+   int initted;                         // set to 1 by stbte__init_gui
+   int side_extended[2];
+   stbte__colorrect delayrect[STBTE__MAX_DELAYRECT];  // frames queued by stbte__draw_frame_delayed
+   int delaycount;                      // number of queued entries in delayrect
+   int show_grid, show_links;           // show_links is set to 1 by stbte__init_gui
+   int brush_state; // used to decide which kind of erasing
+   int eyedrop_x, eyedrop_y, eyedrop_last_layer;
+   int pasting, paste_x, paste_y;
+   int scrolling, start_x, start_y;
+   int last_mouse_x, last_mouse_y;
+   int accum_x, accum_y;                // reset to 0 by stbte__activate
+   int linking;
+   int dragging;
+   int drag_x, drag_y, drag_w, drag_h;
+   int drag_offx, drag_offy, drag_dest_x, drag_dest_y;
+   int undoing;                         // set by stbte__begin_undo, cleared by stbte__end_undo
+   int has_selection, select_x0, select_y0, select_x1, select_y1;
+   int sx,sy;                           // sx: drag anchor written by stbte__microbutton_dragger
+   int x0,y0,x1,y1, left_width, right_width; // configurable widths
+   float alert_timer;
+   const char *alert_msg;               // cleared by stbte__begin_undo
+   float dt;
+   stbte__panel panel[STBTE__num_panel];  // indexed by STBTE__panel_*
+   short copybuffer[STBTE_MAX_COPY][STBTE_MAX_LAYERS];
+   float copyprops[STBTE_MAX_COPY][STBTE_MAX_PROPERTIES];
+#ifdef STBTE_ALLOW_LINK
+   stbte__link copylinks[STBTE_MAX_COPY];
+#endif
+   int copy_src_x, copy_src_y;
+   stbte_tilemap *copy_src;
+   int copy_width,copy_height,has_copy,copy_has_props;
+} stbte__ui_t;
+
+// there's only one UI system at a time, so we can globalize this
+static stbte__ui_t stbte__ui = { STBTE__tool_brush, 0 };
+
+#define STBTE__INACTIVE()     (stbte__ui.active_id == 0)
+#define STBTE__IS_ACTIVE(id)  (stbte__ui.active_id == (id))
+#define STBTE__IS_HOT(id)     (stbte__ui.hot_id    == (id))
+
+#define STBTE__BUTTON_HEIGHT            (STBTE__FONT_HEIGHT + 2 * STBTE__BUTTON_INTERNAL_SPACING)
+#define STBTE__BUTTON_INTERNAL_SPACING  (2 + (STBTE__FONT_HEIGHT>>4))
+
+typedef struct  // per-layer display state; stored in stbte_tilemap.layerinfo[]
+{
+   const char *name;  // pointer supplied to stbte_set_layername (not copied); 0 until then
+   int locked;        // STBTE__unlocked after stbte_create_map
+   int hidden;        // 0 after stbte_create_map
+} stbte__layer;
+
+enum  // values for stbte__layer.locked
+{
+   STBTE__unlocked,   // initial state of every layer
+   STBTE__protected,
+   STBTE__locked,
+};
+
+struct stbte_tilemap  // a map plus its editor state; allocated by stbte_create_map
+{
+    stbte__tiledata data[STBTE_MAX_TILEMAP_Y][STBTE_MAX_TILEMAP_X][STBTE_MAX_LAYERS];  // tile id per [y][x][layer]
+    float props[STBTE_MAX_TILEMAP_Y][STBTE_MAX_TILEMAP_X][STBTE_MAX_PROPERTIES];       // property values per [y][x][n]
+    #ifdef STBTE_ALLOW_LINK
+    stbte__link link[STBTE_MAX_TILEMAP_Y][STBTE_MAX_TILEMAP_X];     // link target of each cell; x < 0 means no link
+    int linkcount[STBTE_MAX_TILEMAP_Y][STBTE_MAX_TILEMAP_X];        // number of links pointing at each cell
+    #endif
+    int max_x, max_y, num_layers;        // active map size and layer count
+    int spacing_x, spacing_y;
+    int palette_spacing_x, palette_spacing_y;
+    int scroll_x,scroll_y;
+    int cur_category, cur_tile, cur_layer;   // cur_category -1 selects every category
+    char *categories[STBTE_MAX_CATEGORIES];  // distinct category names; rebuilt by stbte__compute_tileinfo
+    int num_categories, category_scroll;
+    stbte__tileinfo *tiles;              // points just past this struct, inside the same allocation
+    int num_tiles, max_tiles, digits;
+    unsigned char undo_available_valid;  // cleared whenever the undo buffer is written
+    unsigned char undo_available;
+    unsigned char redo_available;
+    unsigned char padding;
+    int cur_palette_count;               // tiles matching cur_category; set by stbte__choose_category
+    int palette_scroll;
+    int tileinfo_dirty;                  // nonzero when stbte__compute_tileinfo must run again
+    stbte__layer layerinfo[STBTE_MAX_LAYERS];
+    int has_layer_names;
+    int layername_width;                 // widest layer name seen by stbte_set_layername
+    int layer_scroll;
+    int propmode;
+    int solo_layer;
+    int undo_pos, undo_len, redo_len;    // cursor into undo_buffer and the extents of undo/redo data around it
+    short background_tile;
+    unsigned char id_in_use[32768>>3];   // bitset: one bit per tile id already defined
+    short *undo_buffer;                  // follows the tiles array inside the same allocation
+};
+
+static char *default_category = (char*) "[unassigned]";
+
+static void stbte__init_gui(void)
+{
+   int glyph, offset;
+   stbte__panel *p;
+   // mark the UI as initialised and turn link display on
+   stbte__ui.initted    = 1;
+   stbte__ui.show_links = 1;
+   // every panel starts expanded (visible if not autohidden) on the left side
+   for (p = stbte__ui.panel; p < stbte__ui.panel + STBTE__num_panel; ++p) {
+      p->expanded     = 1;
+      p->delta_height = 0;
+      p->side         = STBTE__side_left;
+   }
+   // the toolbar and the colour picker are then moved to other sides
+   stbte__ui.panel[STBTE__panel_toolbar  ].side = STBTE__side_top;
+   stbte__ui.panel[STBTE__panel_colorpick].side = STBTE__side_right;
+   // side widths still at zero get the default of 80
+   if (!stbte__ui.left_width)  stbte__ui.left_width  = 80;
+   if (!stbte__ui.right_width) stbte__ui.right_width = 80;
+   // init font: glyph data starts after the 95+16 width entries of stbte__fontdata
+   offset = 95+16;
+   for (glyph = 0; glyph < 95+16; ++glyph) {
+      stbte__font_offset[glyph] = offset;
+      offset += stbte__fontdata[glyph];
+   }
+}
+
+stbte_tilemap *stbte_create_map(int map_x, int map_y, int map_layers, int spacing_x, int spacing_y, int max_tiles)
+{
+   // Allocates and initialises a tilemap; returns NULL on an out-of-range size argument or allocation failure.
+   int i;
+   stbte_tilemap *tm;
+   STBTE_ASSERT(map_layers >= 0 && map_layers <= STBTE_MAX_LAYERS);
+   STBTE_ASSERT(map_x >= 0 && map_x <= STBTE_MAX_TILEMAP_X);
+   STBTE_ASSERT(map_y >= 0 && map_y <= STBTE_MAX_TILEMAP_Y);
+   STBTE_ASSERT(max_tiles >= 0);
+   if (map_x < 0 || map_y < 0 || map_layers < 0 || max_tiles < 0 || // a negative max_tiles would wrap the malloc size below
+       map_x > STBTE_MAX_TILEMAP_X || map_y > STBTE_MAX_TILEMAP_Y || map_layers > STBTE_MAX_LAYERS)
+      return NULL;
+   if (!stbte__ui.initted)
+      stbte__init_gui();
+   // one allocation holds the map, then max_tiles tile records, then the undo buffer
+   tm = (stbte_tilemap *) malloc(sizeof(*tm) + sizeof(*tm->tiles) * max_tiles + STBTE_UNDO_BUFFER_BYTES);
+   if (tm == NULL)
+      return NULL;
+   tm->tiles = (stbte__tileinfo *) (tm+1);
+   tm->undo_buffer = (short *) (tm->tiles + max_tiles);
+   tm->num_layers = map_layers;
+   tm->max_x = map_x;
+   tm->max_y = map_y;
+   tm->spacing_x = spacing_x;
+   tm->spacing_y = spacing_y;
+   tm->scroll_x = 0;
+   tm->scroll_y = 0;
+   tm->palette_scroll = 0;
+   tm->palette_spacing_x = spacing_x+1;
+   tm->palette_spacing_y = spacing_y+1;
+   tm->cur_category = -1;
+   tm->cur_tile = 0;
+   tm->solo_layer = -1;
+   tm->undo_len = 0;
+   tm->redo_len = 0;
+   tm->undo_pos = 0;
+   tm->category_scroll = 0;
+   tm->layer_scroll = 0;
+   tm->propmode = 0;
+   tm->has_layer_names = 0;
+   tm->layername_width = 0;
+   tm->undo_available_valid = 0;
+
+   for (i=0; i < tm->num_layers; ++i) {
+      tm->layerinfo[i].hidden = 0;
+      tm->layerinfo[i].locked = STBTE__unlocked;
+      tm->layerinfo[i].name   = 0;
+   }
+   // stbte_clear_map copies background_tile into layer 0, so it has to be set first
+   tm->background_tile = STBTE__NO_TILE;
+   stbte_clear_map(tm);
+
+   tm->max_tiles = max_tiles;
+   tm->num_tiles = 0;
+   for (i=0; i < 32768/8; ++i)
+      tm->id_in_use[i] = 0;
+   tm->tileinfo_dirty = 1;
+   return tm;
+}
+
+void stbte_set_background_tile(stbte_tilemap *tm, short id)
+{
+   int cell;
+   const int num_cells = STBTE_MAX_TILEMAP_X * STBTE_MAX_TILEMAP_Y;
+   STBTE_ASSERT(id >= -1);
+   if (id >= -1) {
+      // the grid is walked as one flat run of cells; layer-0 slots holding -1 receive the new id
+      for (cell = 0; cell < num_cells; ++cell)
+         if (tm->data[0][cell][0] == -1) tm->data[0][cell][0] = id;
+      tm->background_tile = id;
+   }
+}
+
+void stbte_set_spacing(stbte_tilemap *tm, int spacing_x, int spacing_y, int palette_spacing_x, int palette_spacing_y)
+{  // replaces the map and palette spacing values that stbte_create_map derived
+   tm->palette_spacing_x = palette_spacing_x;
+   tm->palette_spacing_y = palette_spacing_y;
+   tm->spacing_x         = spacing_x;
+   tm->spacing_y         = spacing_y;
+}
+
+void stbte_set_sidewidths(int left, int right)
+{  // stores the side widths in the global UI state; stbte__init_gui replaces a zero with 80
+   stbte__ui.right_width = right;
+   stbte__ui.left_width  = left;
+}
+
+void stbte_set_display(int x0, int y0, int x1, int y1)
+{  // records the display rectangle (x0,y0)-(x1,y1) in the global UI state
+   stbte__ui.x1 = x1;
+   stbte__ui.y1 = y1;
+   stbte__ui.x0 = x0;
+   stbte__ui.y0 = y0;
+}
+
+void stbte_define_tile(stbte_tilemap *tm, unsigned short id, unsigned int layermask, const char * category_c)
+{
+   // Registers tile `id` with `layermask` under `category_c` (NULL selects default_category).
+   // The call is ignored when id is >= 32768 or already defined, or when the tile table is full.
+   stbte__tileinfo *slot;
+   const int byte = id >> 3, bit = 1 << (id & 7); // position of id inside the id_in_use bitset
+   STBTE_ASSERT(id < 32768);
+   STBTE_ASSERT(tm->num_tiles < tm->max_tiles);
+   STBTE_ASSERT((tm->id_in_use[byte] & bit) == 0);
+   if (id >= 32768 || tm->num_tiles >= tm->max_tiles || (tm->id_in_use[byte] & bit) != 0)
+      return;
+   tm->id_in_use[byte] |= bit;
+   slot = &tm->tiles[tm->num_tiles++];
+   slot->category  = category_c ? (char *) category_c : (char *) default_category;
+   slot->id        = id;
+   slot->layermask = layermask;
+   tm->tileinfo_dirty = 1; // category ids are recomputed by stbte__compute_tileinfo
+}
+
+static int stbte__text_width(const char *str);
+
+void stbte_set_layername(stbte_tilemap *tm, int layer, const char *layername)
+{  // names `layer` (the pointer is stored, not copied) and widens layername_width to fit; bad layers are ignored
+   STBTE_ASSERT(layer >= 0 && layer < tm->num_layers);
+   if (layer >= 0 && layer < tm->num_layers) {
+      int width;
+      tm->layerinfo[layer].name = layername;
+      tm->has_layer_names = 1;
+      width = layername ? stbte__text_width(layername) : 0; // a NULL name has nothing to measure (previously dereferenced)
+      tm->layername_width = (width > tm->layername_width ? width : tm->layername_width);
+   }
+}
+
+void stbte_get_dimensions(stbte_tilemap *tm, int *max_x, int *max_y)
+{  // copies the active map size into *max_x and *max_y
+   *max_y = tm->max_y;
+   *max_x = tm->max_x;
+}
+
+short* stbte_get_tile(stbte_tilemap *tm, int x, int y)
+{  // returns the per-layer tile array of cell (x,y), or NULL outside the allocated grid
+   STBTE_ASSERT(x >= 0 && x < tm->max_x && y >= 0 && y < tm->max_y);
+   if (x >= 0 && x < STBTE_MAX_TILEMAP_X && y >= 0 && y < STBTE_MAX_TILEMAP_Y)
+      return tm->data[y][x];
+   return NULL;
+}
+
+float *stbte_get_properties(stbte_tilemap *tm, int x, int y)
+{  // returns the property array of cell (x,y), or NULL outside the allocated grid
+   STBTE_ASSERT(x >= 0 && x < tm->max_x && y >= 0 && y < tm->max_y);
+   if (x >= 0 && x < STBTE_MAX_TILEMAP_X && y >= 0 && y < STBTE_MAX_TILEMAP_Y)
+      return tm->props[y][x];
+   return NULL;
+}
+
+void stbte_get_link(stbte_tilemap *tm, int x, int y, int *destx, int *desty)
+{  // writes the link target of cell (x,y) to *destx/*desty; (-1,-1) is written when no link is reported
+   int gx=-1,gy=-1;
+   STBTE_ASSERT(x >= 0 && x < tm->max_x && y >= 0 && y < tm->max_y);
+#ifdef STBTE_ALLOW_LINK
+   if (x >= 0 && x < STBTE_MAX_TILEMAP_X && y >= 0 && y < STBTE_MAX_TILEMAP_Y) {
+      gx = tm->link[y][x].x;
+      gy = tm->link[y][x].y;
+      if (gx >= 0)  // a stored link is only reported while STBTE_ALLOW_LINK accepts the two cells
+         if (!STBTE_ALLOW_LINK(tm->data[y][x], tm->props[y][x], tm->data[gy][gx], tm->props[gy][gx]))
+            gx = gy = -1;
+   }
+#endif
+   *destx = gx;
+   *desty = gy;
+}
+
+void stbte_set_property(stbte_tilemap *tm, int x, int y, int n, float val)
+{  // stores property n of cell (x,y); out-of-range indices are ignored instead of writing outside props[]
+   if (x >= 0 && x < STBTE_MAX_TILEMAP_X && y >= 0 && y < STBTE_MAX_TILEMAP_Y && n >= 0 && n < STBTE_MAX_PROPERTIES)
+      tm->props[y][x][n] = val;
+}
+#ifdef STBTE_ALLOW_LINK
+static void stbte__set_link(stbte_tilemap *tm, int src_x, int src_y, int dest_x, int dest_y, int undo_mode);
+#endif
+
+enum  // undo_mode argument of stbte__set_link
+{
+   STBTE__undo_none,    // change is not recorded
+   STBTE__undo_record,  // change is recorded into the undo record that is already open
+   STBTE__undo_block,   // change is wrapped in its own stbte__begin_undo/stbte__end_undo pair
+};
+
+void stbte_set_link(stbte_tilemap *tm, int x, int y, int destx, int desty)
+{  // links cell (x,y) to (destx,desty) without recording undo; asserts when links are compiled out
+#ifdef STBTE_ALLOW_LINK
+   stbte__set_link(tm, x, y, destx, desty, STBTE__undo_none);
+#else
+   STBTE_ASSERT(0);
+#endif
+}
+
+
+// NOTE: this description belongs to stbte_get_tile, which returns an array of map_layers
+// shorts. each short is either one of the tile_id values from define_tile, or STBTE_EMPTY
+
+void stbte_set_dimensions(stbte_tilemap *tm, int map_x, int map_y)
+{  // changes the active map size; a size outside the allocated grid leaves the map untouched
+   STBTE_ASSERT(map_x >= 0 && map_x <= STBTE_MAX_TILEMAP_X);
+   STBTE_ASSERT(map_y >= 0 && map_y <= STBTE_MAX_TILEMAP_Y);
+   if (map_x >= 0 && map_x <= STBTE_MAX_TILEMAP_X && map_y >= 0 && map_y <= STBTE_MAX_TILEMAP_Y) {
+      tm->max_y = map_y;
+      tm->max_x = map_x;
+   }
+}
+
+void stbte_clear_map(stbte_tilemap *tm)
+{  // resets every cell: layer 0 to background_tile, other layers to STBTE__NO_TILE, properties to 0, links removed
+   int i,j;
+   for (i=0; i < STBTE_MAX_TILEMAP_X * STBTE_MAX_TILEMAP_Y; ++i) {  // i runs over the whole grid through row 0 as one flat index
+      tm->data[0][i][0] = tm->background_tile;
+      for (j=1; j < tm->num_layers; ++j)  // only the layers in use are cleared
+         tm->data[0][i][j] = STBTE__NO_TILE;
+      for (j=0; j < STBTE_MAX_PROPERTIES; ++j)
+         tm->props[0][i][j] = 0;
+      #ifdef STBTE_ALLOW_LINK
+      tm->link[0][i].x = -1;
+      tm->link[0][i].y = -1;
+      tm->linkcount[0][i] = 0;
+      #endif
+   }
+}
+
+void stbte_set_tile(stbte_tilemap *tm, int x, int y, int layer, signed short tile)
+{
+   // Writes `tile` into `layer` of cell (x,y). Arguments are checked with
+   // STBTE_ASSERT; a call with out-of-range arguments otherwise does nothing.
+   int cell_ok, layer_ok;
+   STBTE_ASSERT(x >= 0 && x < tm->max_x && y >= 0 && y < tm->max_y);
+   STBTE_ASSERT(layer >= 0 && layer < tm->num_layers);
+   STBTE_ASSERT(tile >= -1);
+   cell_ok  = (x >= 0 && x < STBTE_MAX_TILEMAP_X && y >= 0 && y < STBTE_MAX_TILEMAP_Y);
+   layer_ok = (layer >= 0 && layer < tm->num_layers);
+   if (cell_ok && layer_ok && tile >= -1) tm->data[y][x][layer] = tile;
+}
+
+static void stbte__choose_category(stbte_tilemap *tm, int category)
+{  // selects `category` (-1 matches every tile), recounts the matching tiles and resets palette scrolling
+   int count = 0;
+   const stbte__tileinfo *t, *end = tm->tiles + tm->num_tiles;
+   for (t = tm->tiles; t < end; ++t)
+      count += (category == -1 || t->category_id == category);
+   tm->cur_category      = category;
+   tm->cur_palette_count = count;
+   tm->palette_scroll    = 0;
+}
+
+static int stbte__strequal(char *p, char *q)
+{  // returns 1 when the two NUL-terminated strings have identical contents, else 0
+   for (; *p != '\0'; ++p, ++q)
+      if (*p != *q) return 0;
+   return *q == '\0';
+}
+
+static void stbte__compute_tileinfo(stbte_tilemap *tm)
+{
+   // Rebuilds the category table from the tile definitions, gives every tile
+   // its category_id, re-applies the current category and clears tileinfo_dirty.
+   int i,j;
+   tm->num_categories=0;
+   for (i=0; i < tm->num_tiles; ++i) {
+      stbte__tileinfo *t = &tm->tiles[i];
+      // find category
+      for (j=0; j < tm->num_categories; ++j)
+         if (stbte__strequal(t->category, tm->categories[j]))
+            goto found;
+      STBTE_ASSERT(j < STBTE_MAX_CATEGORIES);
+      if (j >= STBTE_MAX_CATEGORIES) { j = STBTE_MAX_CATEGORIES-1; goto found; } // table full: share the last slot, never write past categories[]
+      tm->categories[j] = t->category;
+      ++tm->num_categories;
+     found:
+      t->category_id = (unsigned short) j;
+   }
+   // currently number of categories can never decrease because you
+   // can't remove tile definitions, but let's get it right anyway:
+   // valid ids are 0..num_categories-1, so num_categories itself is out of range too
+   if (tm->cur_category >= tm->num_categories)
+      tm->cur_category = -1;
+
+   stbte__choose_category(tm, tm->cur_category);
+   tm->tileinfo_dirty = 0;
+}
+
+static void stbte__prepare_tileinfo(stbte_tilemap *tm)
+{  // recomputes the tile/category info only when it has been marked dirty
+   if (tm->tileinfo_dirty != 0)
+      stbte__compute_tileinfo(tm);
+}
+
+
+/////////////////////// undo system ////////////////////////
+
+// the undo system works by storing "commands" into a buffer, and
+// then playing back those commands. undo and redo have to store
+// the commands in different order.
+//
+// the commands are:
+//
+// 1)  end_of_undo_record
+//       -2:short (STBTE__undo_record)
+//
+// 2)  end_of_redo_record
+//       -3:short (STBTE__redo_record)
+//
+// 3)  tile update
+//       tile_id:short (-1..32767)
+//       x_coord:short
+//       y_coord:short
+//       layer:short (0..31)
+//
+// 4)  property update (also used for links)
+//       value_hi:short
+//       value_lo:short
+//       x_coord:short
+//       y_coord:short
+//       property:short (256+prop#, or 255 for a link)
+//
+// Since we use a circular buffer, we might overwrite the undo storage.
+// To detect this, before playing back commands we scan back and see
+// if we see an end_of_undo_record before hitting the relevant boundary,
+// it's wholly contained.
+//
+// When we read back through, we see them in reverse order, so
+// we'll see the layer number or property number first
+//
+// To be clearer about the circular buffer, there are two cases:
+//     1. a single record is larger than the whole buffer.
+//        this is caught because the end_of_undo_record will
+//        get overwritten.
+//     2. multiple records written are larger than the whole
+//        buffer, so some of them have been overwritten by
+//        the later ones. this is handled by explicitly tracking
+//        the undo length; we never try to parse the data that
+//        got overwritten
+
+// wrap a position into the circular undo buffer (the mask requires STBTE__UNDO_BUFFER_COUNT to be a power of two)
+#define stbte__wrap(pos)            ((pos) & (STBTE__UNDO_BUFFER_COUNT-1))
+
+#define STBTE__undo_record  -2
+#define STBTE__redo_record  -3
+#define STBTE__undo_junk    -4  // this is written underneath the undo pointer, never used
+
+static void stbte__write_undo(stbte_tilemap *tm, short value)
+{
+   // stores `value` at the cursor and moves the cursor one slot forward
+   tm->undo_buffer[tm->undo_pos] = value;
+   tm->undo_pos = stbte__wrap(tm->undo_pos + 1);
+   if (tm->undo_len < STBTE__UNDO_BUFFER_COUNT-2) ++tm->undo_len; // undo length saturates below the buffer size
+   if (tm->redo_len > 0)                          --tm->redo_len; // redo shrinks by one slot, never below zero
+   tm->undo_available_valid = 0;
+}
+
+static void stbte__write_redo(stbte_tilemap *tm, short value)
+{
+   // stores `value` at the cursor and moves the cursor one slot backward
+   tm->undo_buffer[tm->undo_pos] = value;
+   tm->undo_pos = stbte__wrap(tm->undo_pos - 1);
+   if (tm->redo_len < STBTE__UNDO_BUFFER_COUNT-2) ++tm->redo_len; // redo length saturates below the buffer size
+   if (tm->undo_len > 0)                          --tm->undo_len; // undo shrinks by one slot, never below zero
+   tm->undo_available_valid = 0;
+}
+
+static void stbte__begin_undo(stbte_tilemap *tm)
+{  // opens a new undo record: discards pending redo data and writes the record marker
+   tm->redo_len = 0;
+   stbte__write_undo(tm, STBTE__undo_record);
+   stbte__ui.undoing = 1;  // stbte__undo_record / stbte__undo_record_prop only write while this is set
+   stbte__ui.alert_msg = 0; // clear alert if they start doing something
+}
+
+static void stbte__end_undo(stbte_tilemap *tm)
+{  // closes the record opened by stbte__begin_undo; does nothing when no record is open
+   if (stbte__ui.undoing) {
+      // check if anything got written
+      int pos = stbte__wrap(tm->undo_pos-1);
+      if (tm->undo_buffer[pos] == STBTE__undo_record) {
+         // empty undo record, move back
+         tm->undo_pos = pos;
+         STBTE_ASSERT(tm->undo_len > 0);
+         tm->undo_len -= 1;
+      }
+      tm->undo_buffer[tm->undo_pos] = STBTE__undo_junk;  // the slot under the cursor always holds the junk marker
+      // otherwise do nothing
+
+      stbte__ui.undoing = 0;
+   }
+}
+
+static void stbte__undo_record(stbte_tilemap *tm, int x, int y, int i, int v)
+{  // appends a 4-short tile entry (value v, x, y, layer i) to the open undo record
+   STBTE_ASSERT(stbte__ui.undoing);
+   if (stbte__ui.undoing) {  // guarded: nothing is written outside begin_undo/end_undo
+      stbte__write_undo(tm, v);
+      stbte__write_undo(tm, x);
+      stbte__write_undo(tm, y);
+      stbte__write_undo(tm, i);  // written last, so a backwards scan sees the layer number first
+   }
+}
+
+static void stbte__redo_record(stbte_tilemap *tm, int x, int y, int i, int v)
+{  // writes a 4-short tile entry (value v, x, y, layer i) into the redo side of the buffer
+   stbte__write_redo(tm, v);
+   stbte__write_redo(tm, x);
+   stbte__write_redo(tm, y);
+   stbte__write_redo(tm, i);
+}
+
+static float stbte__extract_float(short s0, short s1)
+{  // reassembles a float from the two shorts that overlay its storage (inverse of stbte__extract_short)
+   union { float value; short half[2]; } bits;
+   bits.half[1] = s1;
+   bits.half[0] = s0;
+   return bits.value;
+}
+
+static short stbte__extract_short(float f, int slot)
+{  // returns short number `slot` (0 or 1) of the storage occupied by f
+   union { float value; short half[2]; } bits;
+   bits.value = f;
+   return bits.half[slot];
+}
+
+static void stbte__undo_record_prop(stbte_tilemap *tm, int x, int y, int i, short s0, short s1)
+{  // appends a 5-short property entry to the open undo record; stbte__set_link passes i == -1 for links
+   STBTE_ASSERT(stbte__ui.undoing);
+   if (stbte__ui.undoing) {  // guarded: nothing is written outside begin_undo/end_undo
+      stbte__write_undo(tm, s1);
+      stbte__write_undo(tm, s0);
+      stbte__write_undo(tm, x);
+      stbte__write_undo(tm, y);
+      stbte__write_undo(tm, 256+i);  // tag >= 255 marks a 5-short entry; 255 itself (i == -1) is a link
+   }
+}
+
+static void stbte__undo_record_prop_float(stbte_tilemap *tm, int x, int y, int i, float f)
+{  // records float property value f by splitting it into its two shorts
+   stbte__undo_record_prop(tm, x,y,i, stbte__extract_short(f,0), stbte__extract_short(f,1));
+}
+
+static void stbte__redo_record_prop(stbte_tilemap *tm, int x, int y, int i, short s0, short s1)
+{  // writes a 5-short property entry into the redo side of the buffer (not guarded by stbte__ui.undoing)
+   stbte__write_redo(tm, s1);
+   stbte__write_redo(tm, s0);
+   stbte__write_redo(tm, x);
+   stbte__write_redo(tm, y);
+   stbte__write_redo(tm, 256+i);  // same tag encoding as stbte__undo_record_prop
+}
+
+
+static int stbte__undo_find_end(stbte_tilemap *tm)
+{  // returns the buffer position of the nearest STBTE__undo_record marker behind the cursor, or -1
+   // first scan through for the end record
+   int i, pos = stbte__wrap(tm->undo_pos-1);
+   for (i=0; i < tm->undo_len;) {  // i counts the shorts skipped so far
+      STBTE_ASSERT(tm->undo_buffer[pos] != STBTE__undo_junk);
+      if (tm->undo_buffer[pos] == STBTE__undo_record)
+         break;
+      if (tm->undo_buffer[pos] >= 255)  // property/link entries are 5 shorts, tile entries 4
+         pos = stbte__wrap(pos-5), i += 5;
+      else
+         pos = stbte__wrap(pos-4), i += 4;
+   }
+   if (i >= tm->undo_len)  // no marker within the valid undo data
+      return -1;
+   return pos;
+}
+
+static void stbte__undo(stbte_tilemap *tm)
+{  // plays back the most recent complete undo record and writes the matching redo record
+   int i, pos, endpos;
+   endpos = stbte__undo_find_end(tm);
+   if (endpos < 0)  // no complete record: nothing to undo
+      return;
+
+   // we found a complete undo record
+   pos = stbte__wrap(tm->undo_pos-1);
+
+   // start a redo record
+   stbte__write_redo(tm, STBTE__redo_record);
+
+   // so now go back through undo and apply in reverse
+   // order, and copy it to redo
+   for (i=0; endpos != pos; i += 4) {  // the loop is controlled by pos; i is not read
+      int x,y,n,v;
+      // get the undo entry
+      n = tm->undo_buffer[pos];
+      y = tm->undo_buffer[stbte__wrap(pos-1)];
+      x = tm->undo_buffer[stbte__wrap(pos-2)];
+      v = tm->undo_buffer[stbte__wrap(pos-3)];
+      if (n >= 255) {  // 5-short entry: property (n > 255) or link (n == 255)
+         short s0=0,s1=0;
+         int v2 = tm->undo_buffer[stbte__wrap(pos-4)];
+         pos = stbte__wrap(pos-5);
+         if (n > 255) {
+            float vf = stbte__extract_float(v, v2);
+            s0 = stbte__extract_short(tm->props[y][x][n-256], 0);  // current value is saved for redo
+            s1 = stbte__extract_short(tm->props[y][x][n-256], 1);
+            tm->props[y][x][n-256] = vf;
+         } else {
+#ifdef STBTE_ALLOW_LINK
+            s0 = tm->link[y][x].x;
+            s1 = tm->link[y][x].y;
+            stbte__set_link(tm, x,y, v, v2, STBTE__undo_none);
+#endif
+         }
+         // write the redo entry
+         stbte__redo_record_prop(tm, x, y, n-256, s0,s1);
+         // apply the undo entry
+      } else {
+         pos = stbte__wrap(pos-4);
+         // write the redo entry
+         stbte__redo_record(tm, x, y, n, tm->data[y][x][n]);
+         // apply the undo entry
+         tm->data[y][x][n] = (short) v;
+      }
+   }
+   // overwrite undo record with junk
+   tm->undo_buffer[tm->undo_pos] = STBTE__undo_junk;
+}
+
+static int stbte__redo_find_end(stbte_tilemap *tm)
+{  // returns the buffer position of the nearest STBTE__redo_record marker ahead of the cursor, or -1
+   // first scan through for the end record
+   int i, pos = stbte__wrap(tm->undo_pos+1);
+   for (i=0; i < tm->redo_len;) {  // i counts the shorts skipped so far
+      STBTE_ASSERT(tm->undo_buffer[pos] != STBTE__undo_junk);
+      if (tm->undo_buffer[pos] == STBTE__redo_record)
+         break;
+      if (tm->undo_buffer[pos] >= 255)  // property/link entries are 5 shorts, tile entries 4
+         pos = stbte__wrap(pos+5), i += 5;
+      else
+         pos = stbte__wrap(pos+4), i += 4;
+   }
+   if (i >= tm->redo_len)
+      return -1; // this should only ever happen if redo buffer is empty
+   return pos;
+}
+
+static void stbte__redo(stbte_tilemap *tm)
+{  // plays back the nearest complete redo record and writes the matching undo record
+   // first scan through for the end record
+   int i, pos, endpos;
+   endpos = stbte__redo_find_end(tm);
+   if (endpos < 0)  // no complete record: nothing to redo
+      return;
+
+   // we found a complete redo record
+   pos = stbte__wrap(tm->undo_pos+1);
+
+   // start an undo record
+   stbte__write_undo(tm, STBTE__undo_record);
+
+   for (i=0; pos != endpos; i += 4) {  // the loop is controlled by pos; i is not read
+      int x,y,n,v;
+      n = tm->undo_buffer[pos];
+      y = tm->undo_buffer[stbte__wrap(pos+1)];
+      x = tm->undo_buffer[stbte__wrap(pos+2)];
+      v = tm->undo_buffer[stbte__wrap(pos+3)];
+      if (n >= 255) {  // 5-short entry: property (n > 255) or link (n == 255)
+         int v2 = tm->undo_buffer[stbte__wrap(pos+4)];
+         short s0=0,s1=0;
+         pos = stbte__wrap(pos+5);
+         if (n > 255) {
+            float vf = stbte__extract_float(v, v2);
+            s0 = stbte__extract_short(tm->props[y][x][n-256],0);  // current value is saved for undo
+            s1 = stbte__extract_short(tm->props[y][x][n-256],1);
+            tm->props[y][x][n-256] = vf;
+         } else {
+#ifdef STBTE_ALLOW_LINK
+            s0 = tm->link[y][x].x;
+            s1 = tm->link[y][x].y;
+            stbte__set_link(tm, x,y,v,v2, STBTE__undo_none);
+#endif
+         }
+         // don't use stbte__undo_record_prop because it's guarded
+         stbte__write_undo(tm, s1);
+         stbte__write_undo(tm, s0);
+         stbte__write_undo(tm, x);
+         stbte__write_undo(tm, y);
+         stbte__write_undo(tm, n);
+      } else {
+         pos = stbte__wrap(pos+4);
+         // don't use stbte__undo_record because it's guarded
+         stbte__write_undo(tm, tm->data[y][x][n]);
+         stbte__write_undo(tm, x);
+         stbte__write_undo(tm, y);
+         stbte__write_undo(tm, n);
+         tm->data[y][x][n] = (short) v;
+      }
+   }
+   tm->undo_buffer[tm->undo_pos] = STBTE__undo_junk;  // keep the junk marker under the cursor
+}
+
+// rescans the buffer for both flags; NOTE(review): undo_available_valid is not set here, so every query rescans - confirm intended
+static void stbte__recompute_undo_available(stbte_tilemap *tm)
+{
+   tm->undo_available = (stbte__undo_find_end(tm) >= 0);
+   tm->redo_available = (stbte__redo_find_end(tm) >= 0);
+}
+
+static int stbte__undo_available(stbte_tilemap *tm)
+{
+   // nonzero when a complete undo record can be played back
+   if (tm->undo_available_valid == 0) stbte__recompute_undo_available(tm);
+   return tm->undo_available;
+}
+
+static int stbte__redo_available(stbte_tilemap *tm)
+{
+   // nonzero when a complete redo record can be played back
+   if (tm->undo_available_valid == 0) stbte__recompute_undo_available(tm);
+   return tm->redo_available;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#ifdef STBTE_ALLOW_LINK
+static void stbte__set_link(stbte_tilemap *tm, int src_x, int src_y, int dest_x, int dest_y, int undo_mode)
+{  // points the link of (src_x,src_y) at (dest_x,dest_y), keeping linkcount[] in step; dest_x < 0 removes the link
+   stbte__link *a;
+   STBTE_ASSERT(src_x >= 0 && src_x < STBTE_MAX_TILEMAP_X && src_y >= 0 && src_y < STBTE_MAX_TILEMAP_Y);
+   a = &tm->link[src_y][src_x];
+   // check if it's a do nothing
+   if (a->x == dest_x && a->y == dest_y)
+      return;
+   if (undo_mode != STBTE__undo_none ) {
+      if (undo_mode == STBTE__undo_block) stbte__begin_undo(tm);
+      stbte__undo_record_prop(tm, src_x, src_y, -1, a->x, a->y);  // i == -1 yields tag 255, the link entry
+      if (undo_mode == STBTE__undo_block) stbte__end_undo(tm);
+   }
+   // check if there's an existing link
+   if (a->x >= 0) {
+      // decrement existing link refcount
+      STBTE_ASSERT(tm->linkcount[a->y][a->x] > 0);
+      --tm->linkcount[a->y][a->x];
+   }
+   // increment new dest
+   if (dest_x >= 0) {  // NOTE(review): dest_x/dest_y are not range-checked before indexing linkcount - verify callers
+      ++tm->linkcount[dest_y][dest_x];
+   }
+   a->x = dest_x;
+   a->y = dest_y;
+}
+#endif
+
+
+static void stbte__draw_rect(int x0, int y0, int x1, int y1, unsigned int color)
+{  // forwards to STBTE_DRAW_RECT; every drawing helper in this file goes through here
+   STBTE_DRAW_RECT(x0,y0,x1,y1, color);
+}
+
+#ifdef STBTE_ALLOW_LINK
+static void stbte__draw_line(int x0, int y0, int x1, int y1, unsigned int color)
+{
+   // orders the two end points, then draws the rect reaching one pixel past the far corner
+   int lo_x = (x1 < x0 ? x1 : x0), hi_x = (x1 < x0 ? x0 : x1);
+   int lo_y = (y1 < y0 ? y1 : y0), hi_y = (y1 < y0 ? y0 : y1);
+   stbte__draw_rect(lo_x, lo_y, hi_x+1, hi_y+1, color);
+}
+
+static void stbte__draw_link(int x0, int y0, int x1, int y1, unsigned int color)
+{  // draws an L-shaped connector from (x0,y0) to (x1,y1)
+   stbte__draw_line(x0,y0,x0,y1, color);  // vertical leg at x0
+   stbte__draw_line(x0,y1,x1,y1, color);  // horizontal leg at y1
+}
+#endif
+
+static void stbte__draw_frame(int x0, int y0, int x1, int y1, unsigned int color)
+{  // draws a one-pixel outline just inside (x0,y0)-(x1,y1) as four rects
+   stbte__draw_rect(x0,y0,x1-1,y0+1,color);    // top edge
+   stbte__draw_rect(x1-1,y0,x1,y1-1,color);    // right edge
+   stbte__draw_rect(x0+1,y1-1,x1,y1,color);    // bottom edge
+   stbte__draw_rect(x0,y0+1,x0+1,y1,color);    // left edge
+}
+
+static int stbte__get_char_width(int ch)
+{  // width in columns of glyph ch; the font covers codes 16..126, any other code is measured as '?'
+   return stbte__fontdata[(ch >= 16 && ch < 16+95+16 ? ch : '?') - 16]; // avoids indexing outside the 95+16 width entries
+}
+
+static short *stbte__get_char_bitmap(int ch)
+{  // first column bitmask of glyph ch; the font covers codes 16..126, any other code yields the '?' glyph
+   return stbte__fontdata + stbte__font_offset[(ch >= 16 && ch < 16+95+16 ? ch : '?') - 16]; // avoids indexing outside the offset table
+}
+
+static void stbte__draw_bitmask_as_columns(int x, int y, short bitmask, int color)
+{  // draws one pixel column at x: every run of consecutive set bits becomes one rect, bit i at row y+i
+   int start_i = -1, i=0;  // start_i: first bit of the run being collected, or -1 when outside a run
+   while (bitmask) {
+      if (bitmask & (1<<i)) {
+         if (start_i < 0)
+            start_i = i;
+      } else if (start_i >= 0) {  // a clear bit ends the current run
+         stbte__draw_rect(x, y+start_i, x+1, y+i, color);
+         start_i = -1;
+         bitmask &= ~((1<<i)-1); // clear all the old bits; we don't clear them as we go to save code
+      }
+      ++i;
+   }
+}
+
+static void stbte__draw_bitmap(int x, int y, int w, short *bitmap, int color)
+{
+   // draws w column bitmasks side by side, starting at column x
+   int col;
+   for (col = 0; col < w; ++col) stbte__draw_bitmask_as_columns(x+col, y, bitmap[col], color);
+}
+
+static void stbte__draw_text_core(int x, int y, const char *str, int w, int color, int digitspace)
+{  // draws str at (x,y), stopping before the first glyph that would cross x+w
+   int x_end = x+w;
+   while (*str) {
+      int c = *str++;
+      int cw = stbte__get_char_width(c);
+      if (x + cw > x_end)
+         break;
+      stbte__draw_bitmap(x, y, cw, stbte__get_char_bitmap(c), color);
+      if (digitspace && c == ' ')  // with digitspace set, a space advances by the width of '0'
+         cw = stbte__get_char_width('0');
+      x += cw+1;  // one column of spacing after each glyph
+   }
+}
+
+static void stbte__draw_text(int x, int y, const char *str, int w, int color)
+{  // stbte__draw_text_core with digitspace switched off
+   stbte__draw_text_core(x,y,str,w,color,0);
+}
+
+static int stbte__text_width(const char *str)
+{
+   // total advance of str: every glyph's width plus one column of spacing,
+   // the same step stbte__draw_text_core takes per character
+   int total = 0;
+   const char *s;
+   for (s = str; *s != 0; ++s)
+      total += stbte__get_char_width(*s) + 1;
+   return total;
+}
+
+static void stbte__draw_frame_delayed(int x0, int y0, int x1, int y1, int color)
+{  // queues a frame to be drawn by stbte__flush_delay; silently dropped when the queue is full
+   if (stbte__ui.delaycount < STBTE__MAX_DELAYRECT) {
+      stbte__colorrect r = { x0,y0,x1,y1,color };
+      stbte__ui.delayrect[stbte__ui.delaycount++] = r;
+   }
+}
+
+static void stbte__flush_delay(void)
+{  // draws every queued frame in the order it was queued, then empties the queue
+   int k;
+   for (k = 0; k < stbte__ui.delaycount; ++k) {
+      const stbte__colorrect *q = &stbte__ui.delayrect[k];
+      stbte__draw_frame(q->x0, q->y0, q->x1, q->y1, q->color);
+   }
+   stbte__ui.delaycount = 0;
+}
+
+static void stbte__activate(int id)
+{  // makes `id` the active widget (0 means none), remembers the current event and resets the accumulators
+   stbte__ui.accum_y      = 0;
+   stbte__ui.accum_x      = 0;
+   stbte__ui.active_event = stbte__ui.event;
+   stbte__ui.active_id    = id;
+}
+
+static int stbte__hittest(int x0, int y0, int x1, int y1, int id)
+{
+   // reports whether the mouse lies inside [x0,x1) x [y0,y1); for events from
+   // STBTE__tick onward a hit also nominates `id` as the next hot widget
+   int inside_x = (x0 <= stbte__ui.mx && stbte__ui.mx < x1);
+   int inside_y = (y0 <= stbte__ui.my && stbte__ui.my < y1);
+   if (!(inside_x && inside_y)) return 0;
+   if (stbte__ui.event >= STBTE__tick) stbte__ui.next_hot_id = id;
+   return 1;
+}
+
+static int stbte__button_core(int id)
+{
+   // Shared press/release handling for all buttons.
+   //
+   // A left or right down event activates `id` when it is the hot widget
+   // and nothing else is active.
+   //
+   // A left or right up event over the active, still-hot widget deactivates
+   // it and reports the click: 1 for left, -1 for right.
+   //
+   // Every other situation returns 0.
+   int ev = stbte__ui.event;
+
+   if (ev == STBTE__leftdown || ev == STBTE__rightdown) {
+      if (STBTE__IS_HOT(id) && STBTE__INACTIVE())
+         stbte__activate(id);
+      return 0;
+   }
+
+   if (ev == STBTE__leftup || ev == STBTE__rightup) {
+      if (!(STBTE__IS_ACTIVE(id) && STBTE__IS_HOT(id))) return 0;
+      stbte__activate(0);
+      return ev == STBTE__leftup ? 1 : -1;
+   }
+   return 0;
+}
+
+static void stbte__draw_box(int x0, int y0, int x1, int y1, int colormode, int colorindex)
+{  // filled rect plus outline, both coloured from stbte__color_table[colormode][...][colorindex]
+   stbte__draw_rect (x0,y0,x1,y1, stbte__color_table[colormode][STBTE__base   ][colorindex]);
+   stbte__draw_frame(x0,y0,x1,y1, stbte__color_table[colormode][STBTE__outline][colorindex]);
+}
+
+static void stbte__draw_textbox(int x0, int y0, int x1, int y1, char *text, int xoff, int yoff, int colormode, int colorindex)
+{  // box with `text` drawn at offset (xoff,yoff) from its top-left corner
+   stbte__draw_box(x0,y0,x1,y1,colormode,colorindex);
+   stbte__draw_text(x0+xoff,y0+yoff, text, x1-x0-xoff-1, stbte__color_table[colormode][STBTE__text][colorindex]);  // width limit keeps text inside the box
+}
+
+static int stbte__button(int colormode, const char *label, int x, int y, int textoff, int width, int id, int toggled, int disabled)
+{  // text button; returns 1 on a completed left click, otherwise 0 (always 0 while disabled)
+   int x0=x,y0=y, x1=x+width,y1=y+STBTE__BUTTON_HEIGHT;
+   int s = STBTE__BUTTON_INTERNAL_SPACING;
+
+   if(!disabled) stbte__hittest(x0,y0,x1,y1,id);  // a disabled button is never hit-tested, so it cannot become hot
+
+   if (stbte__ui.event == STBTE__paint)
+      stbte__draw_textbox(x0,y0,x1,y1, (char*) label,s+textoff,s, colormode, STBTE__INDEX_FOR_ID(id,disabled,toggled));
+   if (disabled)
+      return 0;
+   return (stbte__button_core(id) == 1);  // right clicks (-1) are reported as 0
+}
+
+static int stbte__button_icon(int colormode, char ch, int x, int y, int width, int id, int toggled, int disabled)
+{
+   // Icon button showing glyph `ch`; returns 1 on a completed left click, otherwise 0 (always 0 while disabled).
+   int x0=x,y0=y, x1=x+width,y1=y+STBTE__BUTTON_HEIGHT;
+   int s = STBTE__BUTTON_INTERNAL_SPACING;
+   // a disabled button is not hit-tested, matching stbte__button and stbte__layerbutton
+   if(!disabled) stbte__hittest(x0,y0,x1,y1,id);
+   if (stbte__ui.event == STBTE__paint) {
+      char label[2] = { ch, 0 };
+      int pad = (9 - stbte__get_char_width(ch))/2;  // horizontal padding based on a 9-column slot
+      stbte__draw_textbox(x0,y0,x1,y1, label,s+pad,s, colormode, STBTE__INDEX_FOR_ID(id,disabled,toggled));
+   }
+   if (disabled)
+      return 0;
+   return (stbte__button_core(id) == 1);
+}
+
+static int stbte__minibutton(int colormode, int x, int y, int ch, int id)
+{  // 8x7 button showing glyph ch; returns the stbte__button_core result (1 left click, -1 right click, 0 none)
+   int x0 = x, y0 = y, x1 = x+8, y1 = y+7;
+   stbte__hittest(x0,y0,x1,y1,id);
+   if (stbte__ui.event == STBTE__paint) {
+      char str[2] = { (char)ch, 0 };
+      stbte__draw_textbox(x0,y0,x1,y1, str,1,0,colormode, STBTE__INDEX_FOR_ID(id,0,0));
+   }
+   return stbte__button_core(id);
+}
+
+static int stbte__layerbutton(int x, int y, int ch, int id, int toggled, int disabled, int colormode)
+{  // 10x11 button showing glyph ch; returns the stbte__button_core result, or 0 while disabled
+   int x0 = x, y0 = y, x1 = x+10, y1 = y+11;
+   if(!disabled) stbte__hittest(x0,y0,x1,y1,id);  // a disabled button is never hit-tested
+   if (stbte__ui.event == STBTE__paint) {
+      char str[2] = { (char)ch, 0 };
+      int off = (9-stbte__get_char_width(ch))/2;  // horizontal padding based on a 9-column slot
+      stbte__draw_textbox(x0,y0,x1,y1, str, off+1,2, colormode, STBTE__INDEX_FOR_ID(id,disabled,toggled));
+   }
+   if (disabled)
+      return 0;
+   return stbte__button_core(id);
+}
+
+static int stbte__microbutton(int x, int y, int size, int id, int colormode)
+{  // square, unlabelled button of side `size`; returns the stbte__button_core result
+   int x0 = x, y0 = y, x1 = x+size, y1 = y+size;
+   stbte__hittest(x0,y0,x1,y1,id);
+   if (stbte__ui.event == STBTE__paint) {
+      stbte__draw_box(x0,y0,x1,y1, colormode, STBTE__INDEX_FOR_ID(id,0,0));
+   }
+   return stbte__button_core(id);
+}
+
+// Square drag handle. While it is held with the left button, *pos follows the
+// mouse x position, keeping the offset captured at press time (kept in sx).
+// Returns stbte__button_core(id) for events not handled here, otherwise 0.
+static int stbte__microbutton_dragger(int x, int y, int size, int id, int *pos)
+{
+   const int right = x + size, bottom = y + size;
+   int ev;
+   stbte__hittest(x, y, right, bottom, id);
+   ev = stbte__ui.event;
+
+   if (ev == STBTE__paint) {
+      stbte__draw_box(x, y, right, bottom, STBTE__cexpander, STBTE__INDEX_FOR_ID(id,0,0));
+   } else if (ev == STBTE__leftdown) {
+      if (STBTE__IS_HOT(id) && STBTE__INACTIVE()) {
+         stbte__activate(id);
+         stbte__ui.sx = stbte__ui.mx - *pos;   // remember the grab offset
+      }
+   } else if (ev == STBTE__mousemove) {
+      if (STBTE__IS_ACTIVE(id) && stbte__ui.active_event == STBTE__leftdown)
+         *pos = stbte__ui.mx - stbte__ui.sx;
+   } else if (ev == STBTE__leftup) {
+      if (STBTE__IS_ACTIVE(id))
+         stbte__activate(0);
+   } else {
+      return stbte__button_core(id);
+   }
+   return 0;
+}
+
+// Text button painted with the STBTE__ccategory_button colours.
+// Returns 1 iff stbte__button_core(id) == 1.
+static int stbte__category_button(const char *label, int x, int y, int width, int id, int toggled)
+{
+   const int pad = STBTE__BUTTON_INTERNAL_SPACING;
+   const int right = x + width;
+   const int bottom = y + STBTE__BUTTON_HEIGHT;
+   stbte__hittest(x, y, right, bottom, id);
+   if (stbte__ui.event == STBTE__paint)
+      stbte__draw_textbox(x, y, right, bottom, (char*) label, pad, pad, STBTE__ccategory_button, STBTE__INDEX_FOR_ID(id,0,toggled));
+   return stbte__button_core(id) == 1;
+}
+
+enum             // result codes returned by stbte__slider and stbte__float_control
+{
+   STBTE__none,   // the event did not affect the control
+   STBTE__begin,  // a button-down event (start of an interaction)
+   STBTE__end,    // the active control was released
+   STBTE__change, // the controlled value was updated during a drag
+};
+
+// returns STBTE__begin when a drag starts, STBTE__change while it continues, STBTE__end on release, else STBTE__none
+static int stbte__slider(int x0, int w, int y, int range, int *value, int id)
+{
+   int x1 = x0+w;
+   int pos = *value * w / (range+1);       // knob offset from x0, in pixels
+   int event_mouse_move = STBTE__change;   // result of an accepted drag step; declared before any statement (C89)
+   stbte__hittest(x0,y-2,x1,y+3,id);
+   switch (stbte__ui.event) {
+      case STBTE__paint:
+         stbte__draw_rect(x0,y,x1,y+1, 0x808080);                 // track
+         stbte__draw_rect(x0+pos-1,y-1,x0+pos+2,y+2, 0xffffff);   // knob
+         break;
+      case STBTE__leftdown:
+         if (STBTE__IS_HOT(id) && STBTE__INACTIVE()) {
+            stbte__activate(id);
+            event_mouse_move = STBTE__begin;   // this press starts the drag
+         }
+         // fall through
+      case STBTE__mousemove:
+         if (STBTE__IS_ACTIVE(id)) {
+            int v = (stbte__ui.mx-x0)*(range+1)/w;   // map mouse x back onto 0..range
+            if (v < 0) v = 0; else if (v > range) v = range;
+            *value = v;
+            return event_mouse_move;
+         }
+         break;
+      case STBTE__leftup:
+         if (STBTE__IS_ACTIVE(id)) {
+            stbte__activate(0);
+            return STBTE__end;
+         }
+         break;
+   }
+   return STBTE__none;
+}
+
+#if defined(_WIN32) && defined(__STDC_WANT_SECURE_LIB__)   // usage: stbte__sprintf(buf stbte__sizeof(buf), fmt, ...)
+   #define stbte__sprintf      sprintf_s
+   #define stbte__sizeof(s)    , sizeof(s)
+#else
+   #define stbte__sprintf      snprintf      // bounded (C99); plain sprintf could overrun the destination buffer
+   #define stbte__sizeof(s)    , sizeof(s)   // snprintf takes (buf, size, fmt, ...) just like sprintf_s
+#endif
+
+static int stbte__float_control(int x0, int y0, int w, float minv, float maxv, float scale, const char *fmt, float *value, int colormode, int id)   // draggable numeric field editing *value; returns STBTE__none/begin/end/change
+{
+   int x1 = x0+w;
+   int y1 = y0+11;
+   stbte__hittest(x0,y0,x1,y1,id);
+   switch (stbte__ui.event) {
+      case STBTE__paint: {
+         char text[32];   // NOTE(review): a very large value or a long caller-supplied fmt may need more than 32 bytes -- confirm callers
+         stbte__sprintf(text stbte__sizeof(text), fmt ? fmt : "%6.2f", *value);   // fmt == NULL selects "%6.2f"
+         stbte__draw_textbox(x0,y0,x1,y1, text, 1,2, colormode, STBTE__INDEX_FOR_ID(id,0,0));
+         break;
+      }
+      case STBTE__leftdown:
+      case STBTE__rightdown:
+         if (STBTE__IS_HOT(id) && STBTE__INACTIVE())
+            stbte__activate(id);
+         return STBTE__begin;   // NOTE(review): returned for every left/right press, whether or not this control was hit -- confirm intended
+         break;                 // unreachable: follows the return above
+      case STBTE__leftup:
+      case STBTE__rightup:
+         if (STBTE__IS_ACTIVE(id)) {
+            stbte__activate(0);
+            return STBTE__end;
+         }
+         break;
+      case STBTE__mousemove:
+         if (STBTE__IS_ACTIVE(id)) {
+            float v = *value, delta;
+            int ax = stbte__ui.accum_x/STBTE_FLOAT_CONTROL_GRANULARITY;   // whole steps accumulated horizontally
+            int ay = stbte__ui.accum_y/STBTE_FLOAT_CONTROL_GRANULARITY;   // whole steps accumulated vertically
+            stbte__ui.accum_x -= ax*STBTE_FLOAT_CONTROL_GRANULARITY;      // keep only the remainder for later events
+            stbte__ui.accum_y -= ay*STBTE_FLOAT_CONTROL_GRANULARITY;
+            if (stbte__ui.shift) {
+               if (stbte__ui.active_event == STBTE__leftdown)   // control was activated by a left press
+                  delta = ax * 16.0f + ay;
+               else                                             // activated by another event (right press)
+                  delta = ax / 16.0f + ay / 256.0f;
+            } else {
+               if (stbte__ui.active_event == STBTE__leftdown)
+                  delta = ax*10.0f + ay;
+               else
+                  delta = ax * 0.1f + ay * 0.01f;
+            }
+            v += delta * scale;
+            if (v < minv) v = minv;   // clamp to [minv,maxv]
+            if (v > maxv) v = maxv;
+            *value = v;
+            return STBTE__change;
+         }
+         break;
+   }
+   return STBTE__none;
+}
+
+// Vertical scrollbar controlling *val within [v0, v1-num_vis]. Does nothing
+// when all v1-v0 items already fit in num_vis. A press on the bar, or a drag
+// while the mouse stays within 15 pixels horizontally, maps mouse y to *val.
+static void stbte__scrollbar(int x, int y0, int y1, int *val, int v0, int v1, int num_vis, int id)
+{
+   int track, travel, thumb;
+   if (v1 - v0 <= num_vis)
+      return;
+   track  = y1 - y0;               // pixel length of the bar
+   travel = v1 - v0 - num_vis;     // number of scroll steps
+
+   // thumb position along the track, kept within [y0,y1]
+   thumb = y0+2 + (track-4) * *val / travel;
+   if (thumb < y0) thumb = y0;
+   if (thumb >= y1) thumb = y1;
+   stbte__hittest(x-1, y0, x+2, y1, id);
+
+   if (stbte__ui.event == STBTE__paint) {
+      stbte__draw_rect(x, y0, x+1, y1, stbte__color_table[STBTE__cscrollbar][STBTE__text][STBTE__idle]);
+      stbte__draw_box(x-1, thumb-3, x+2, thumb+4, STBTE__cscrollbar, STBTE__INDEX_FOR_ID(id,0,0));
+   } else if (stbte__ui.event == STBTE__leftdown) {
+      if (STBTE__IS_HOT(id) && STBTE__INACTIVE()) {
+         stbte__activate(id);
+         *val = ((stbte__ui.my-y0) * travel + track/2) / track;
+      }
+   } else if (stbte__ui.event == STBTE__mousemove) {
+      if (STBTE__IS_ACTIVE(id) && stbte__ui.mx >= x-15 && stbte__ui.mx <= x+15)
+         *val = ((stbte__ui.my-y0) * travel + track/2) / track;
+   } else if (stbte__ui.event == STBTE__leftup) {
+      if (STBTE__IS_ACTIVE(id))
+         stbte__activate(0);
+   }
+
+   // clamp to the valid scroll range (the lower bound is applied last)
+   if (*val >= v1-num_vis)
+      *val = v1-num_vis;
+   if (*val <= v0)
+      *val = v0;
+}
+
+
+// Sets tm->digits to 2, 3 or 4 depending on whether the larger of
+// tm->max_x / tm->max_y is below 100, below 1000, or neither.
+static void stbte__compute_digits(stbte_tilemap *tm)
+{
+   int largest = (tm->max_x > tm->max_y) ? tm->max_x : tm->max_y;
+   if      (largest >= 1000) tm->digits = 4;
+   else if (largest >=  100) tm->digits = 3;
+   else                      tm->digits = 2;
+}
+
+// Returns 1 iff a selection exists and it covers exactly one tile, else 0.
+static int stbte__is_single_selection(void)
+{
+   if (!stbte__ui.has_selection || stbte__ui.select_x0 != stbte__ui.select_x1) return 0;
+   return stbte__ui.select_y0 == stbte__ui.select_y1;
+}
+
+typedef struct        // geometry and state of one screen-side docking region
+{
+   int width, height;
+   int x,y;
+   int active;        // set to 1 when at least one active panel is docked on this side
+   float retracted;   // scales the slide-off offset of a side region: offset = width * retracted
+} stbte__region_t;
+// one region per side; indexed by a panel's `side` value
+static stbte__region_t stbte__region[4];
+
+#define STBTE__TOOLBAR_ICON_SIZE   (9+2*2)    // toolbar icon cell size in pixels (presumably a 9px glyph plus 2px padding per side -- confirm)
+#define STBTE__TOOLBAR_PASTE_SIZE  (34+2*2)   // width in pixels reserved so the toolbar can hold the 'Paste' button
+
+// This routine computes where every panel goes onscreen: computes
+// a minimum width for each side based on which panels are on that
+// side, and accounts for width-dependent layout of certain panels.
+static void stbte__compute_panel_locations(stbte_tilemap *tm)
+{
+   int i, limit, w, k;
+   int window_width  = stbte__ui.x1 - stbte__ui.x0;
+   int window_height = stbte__ui.y1 - stbte__ui.y0;
+   int min_width[STBTE__num_panel]={0,0,0,0,0,0,0};
+   int height[STBTE__num_panel]={0,0,0,0,0,0,0};
+   int panel_active[STBTE__num_panel]={1,0,1,1,1,1,1};
+   int vpos[4] = { 0,0,0,0 };   // running vertical offset per side while stacking panels
+   stbte__panel *p = stbte__ui.panel;
+   stbte__panel *pt = &p[STBTE__panel_toolbar];
+#ifdef STBTE__NO_PROPS
+   int props = 0;
+#else
+   int props = 1;
+#endif
+
+   for (i=0; i < 4; ++i) {
+      stbte__region[i].active = 0;
+      stbte__region[i].width = 0;
+      stbte__region[i].height = 0;
+   }
+
+   // compute number of digits needs for info panel
+   stbte__compute_digits(tm);
+
+   // determine which panels are active
+   panel_active[STBTE__panel_categories] = tm->num_categories != 0;
+   panel_active[STBTE__panel_layers    ] = tm->num_layers     >  1;
+#ifdef STBTE__COLORPICKER
+   panel_active[STBTE__panel_colorpick ] = 1;
+#endif
+
+   panel_active[STBTE__panel_props     ] = props && stbte__is_single_selection();
+
+   // compute minimum widths for each panel (assuming they're on sides not top)
+   min_width[STBTE__panel_info      ] = 8 + 11 + 7*tm->digits+17+7;               // estimate min width of "w:0000"
+   min_width[STBTE__panel_colorpick ] = 120;
+   min_width[STBTE__panel_tiles     ] = 4 + tm->palette_spacing_x + 5;            // 5 for scrollbar
+   min_width[STBTE__panel_categories] = 4 + 42 + 5;                               // 42 is enough to show ~7 chars; 5 for scrollbar
+   min_width[STBTE__panel_layers    ] = 4 + 54 + 30*tm->has_layer_names;          // 2 digits plus 3 buttons plus scrollbar
+   min_width[STBTE__panel_toolbar   ] = 4 + STBTE__TOOLBAR_PASTE_SIZE;            // wide enough for 'Paste' button
+   min_width[STBTE__panel_props     ] = 80;                    // narrowest info panel
+
+   // compute minimum widths for left & right panels based on the above
+   stbte__region[0].width = stbte__ui.left_width;
+   stbte__region[1].width = stbte__ui.right_width;
+
+   for (i=0; i < STBTE__num_panel; ++i) {
+      if (panel_active[i]) {
+         int side = stbte__ui.panel[i].side;
+         if (min_width[i] > stbte__region[side].width)
+            stbte__region[side].width = min_width[i];
+         stbte__region[side].active = 1;
+      }
+   }
+
+   // now compute the heights of each panel
+
+   // if toolbar at top, compute its size & push the left and right start points down
+   if (stbte__region[STBTE__side_top].active) {
+      int height = STBTE__TOOLBAR_ICON_SIZE+2;   // shadows the height[] array inside this branch only
+      pt->x0     = stbte__ui.x0;
+      pt->y0     = stbte__ui.y0;
+      pt->width  = window_width;
+      pt->height = height;
+      vpos[STBTE__side_left] = vpos[STBTE__side_right] = height;
+   } else {
+      int num_rows = STBTE__num_tool * ((stbte__region[pt->side].width-4)/STBTE__TOOLBAR_ICON_SIZE);   // NOTE(review): multiplies tool count by icons-per-row; a row count would normally divide -- confirm
+      height[STBTE__panel_toolbar] = num_rows*13 + 3*15 + 4; // 3*15 for cut/copy/paste, which are stacked vertically
+   }
+
+   for (i=0; i < 4; ++i)
+      stbte__region[i].y = stbte__ui.y0 + vpos[i];
+
+   for (i=0; i < 2; ++i) {
+      int anim = (int) (stbte__region[i].width * stbte__region[i].retracted);   // slide-off offset in pixels
+      stbte__region[i].x = (i == STBTE__side_left) ? stbte__ui.x0 - anim : stbte__ui.x1 - stbte__region[i].width + anim;
+   }
+
+   // color picker
+   height[STBTE__panel_colorpick] = 300;
+
+   // info panel
+   w = stbte__region[p[STBTE__panel_info].side].width;
+   p[STBTE__panel_info].mode = (w >= 8 + (11+7*tm->digits+17)*2 + 4);   // mode 1 when the region is wide enough for the two-row layout
+   if (p[STBTE__panel_info].mode)
+      height[STBTE__panel_info] = 5 + 11*2 + 2 + tm->palette_spacing_y;
+   else
+      height[STBTE__panel_info] = 5 + 11*4 + 2 + tm->palette_spacing_y;
+
+   // layers
+   limit = 6 + stbte__ui.panel[STBTE__panel_layers].delta_height;
+   height[STBTE__panel_layers] = (tm->num_layers > limit ? limit : tm->num_layers)*15 + 7 + (tm->has_layer_names ? 0 : 11) + props*13;
+
+   // categories
+   limit = 6 + stbte__ui.panel[STBTE__panel_categories].delta_height;
+   height[STBTE__panel_categories] = (tm->num_categories+1 > limit ? limit : tm->num_categories+1)*11 + 14;
+   if (stbte__ui.panel[STBTE__panel_categories].side == stbte__ui.panel[STBTE__panel_tiles].side)   // was compared with itself (always true); presumably meant "shares a side with the tile palette"
+      height[STBTE__panel_categories] -= 4;
+
+   // palette
+   k =  (stbte__region[p[STBTE__panel_tiles].side].width - 8) / tm->palette_spacing_x;   // tiles per palette row
+   if (k == 0) k = 1;
+   height[STBTE__panel_tiles] = ((tm->num_tiles+k-1)/k) * tm->palette_spacing_y + 8;
+
+   // properties panel
+   height[STBTE__panel_props] = 9 + STBTE_MAX_PROPERTIES*14;
+
+   // now compute the locations of all the panels
+   for (i=0; i < STBTE__num_panel; ++i) {
+      if (panel_active[i]) {
+         int side = p[i].side;
+         if (side == STBTE__side_left || side == STBTE__side_right) {
+            p[i].width  = stbte__region[side].width;
+            p[i].x0     = stbte__region[side].x;
+            p[i].y0     = stbte__ui.y0 + vpos[side];
+            p[i].height = height[i];
+            vpos[side] += height[i];
+            if (vpos[side] > window_height) {
+               // panel would run past the bottom of the window: truncate it there
+               vpos[side] = window_height;
+               p[i].height = stbte__ui.y1 - p[i].y0;
+            }
+         } else {
+            ; // it's at top, it's already been explicitly set up earlier
+         }
+      } else {
+         // inactive panel
+         p[i].height = 0;
+         p[i].width  = 0;
+         p[i].x0     = stbte__ui.x1;
+         p[i].y0     = stbte__ui.y1;
+      }
+   }
+}
+
+// unique identifiers for imgui; each is the 7-bit identifier part of a control id (see STBTE__ID)
+enum
+{
+   STBTE__map=1,
+   STBTE__region,
+   STBTE__panel,                          // panel background to hide map, and misc controls
+   STBTE__info,                           // info data
+   STBTE__toolbarA, STBTE__toolbarB,      // toolbar buttons: param is tool number
+   STBTE__palette,                        // palette selectors: param is tile index
+   STBTE__categories,                     // category selectors: param is category index
+   STBTE__layer,                          // NOTE(review): undocumented; presumably layer selectors with param = layer index -- confirm
+   STBTE__solo, STBTE__hide, STBTE__lock, // layer controls: param is layer
+   STBTE__scrollbar,                      // param is panel ID
+   STBTE__panel_mover,                    // p1 is panel ID, p2 is destination side
+   STBTE__panel_sizer,                    // param panel ID
+   STBTE__scrollbar_id,
+   STBTE__colorpick_id,
+   STBTE__prop_flag,
+   STBTE__prop_float,
+   STBTE__prop_int,
+};
+
+// id is:      [      24-bit data     : 7-bit identifier ]
+// map id is:  [  12-bit y : 12 bit x : 7-bit identifier ]
+// NOTE(review): STBTE__ID2 shifts its first data argument left by 12, so STBTE__IDMAP(x,y) stores x above y -- confirm against the layout above
+#define STBTE__ID(n,p)     ((n) + ((p)<<7))              // n: identifier from the enum, p: data
+#define STBTE__ID2(n,p,q)  STBTE__ID(n, ((p)<<12)+(q) )  // packs two data fields, p above q
+#define STBTE__IDMAP(x,y)  STBTE__ID2(STBTE__map, x,y)   // id of map cell (x,y)
+
+// Makes map cell (x,y) the active widget, remembering the current event and storing the cell in sx/sy.
+static void stbte__activate_map(int x, int y)
+{
+   stbte__ui.sx = x;  stbte__ui.sy = y;
+   stbte__ui.active_event = stbte__ui.event;
+   stbte__ui.active_id = STBTE__IDMAP(x,y);
+}
+
+// Sets `msg` as the current alert (the pointer is stored, not copied) and sets alert_timer to 3.
+static void stbte__alert(const char *msg)
+{
+   stbte__ui.alert_timer = 3;  stbte__ui.alert_msg = msg;
+}
+
+#define STBTE__BG(tm,layer) ((layer) == 0 ? (tm)->background_tile : STBTE__NO_TILE)   // "empty" value of a layer: the background tile on layer 0, STBTE__NO_TILE elsewhere
+
+
+
+// Preview counterpart of stbte__brush: instead of editing the map it writes
+// the current tile's id into the lowest entry of `result` (one entry per
+// layer) that passes the layer filters below. Does nothing if no tile is
+// selected or no layer qualifies; only that one entry of `result` is
+// ever modified.
+static void stbte__brush_predict(stbte_tilemap *tm, short result[])
+{
+   stbte__tileinfo *brush;
+   int layer;
+
+   if (tm->cur_tile < 0)
+      return;
+   brush = &tm->tiles[tm->cur_tile];
+
+   // find lowest legit layer to paint it on, and put it there
+   for (layer=0; layer < tm->num_layers; ++layer) {
+      // the tile's layermask must permit this layer
+      int usable = (brush->layermask & (1 << layer)) != 0;
+
+      // the solo layer bypasses the remaining filters
+      if (usable && layer != tm->solo_layer) {
+         if (tm->cur_layer >= 0 && layer != tm->cur_layer)
+            usable = 0;   // a selected layer is the only one paintable
+         else if (tm->layerinfo[layer].hidden)
+            usable = 0;   // can't see it
+         else if (tm->layerinfo[layer].locked == STBTE__locked)
+            usable = 0;   // can't write to it
+         else if (tm->layerinfo[layer].locked == STBTE__protected && result[layer] != STBTE__BG(tm,layer))
+            usable = 0;   // protected: only empty cells may be written
+      }
+
+      if (usable) {
+         result[layer] = brush->id;
+         return;
+      }
+   }
+}
+
+// Paints the current tile at map cell (x,y). The tile goes onto the lowest
+// layer that its layermask allows and that is writable: the solo layer
+// always is; any other layer must be the selected one (if a layer is
+// selected), visible, not locked and, if protected, still empty. The value
+// being replaced is recorded for undo. Does nothing if no layer qualifies.
+static void stbte__brush(stbte_tilemap *tm, int x, int y)
+{
+   stbte__tileinfo *brush;
+   int layer;
+
+   if (tm->cur_tile < 0)
+      return;
+   brush = &tm->tiles[tm->cur_tile];
+
+   // scan layers from the bottom up
+   for (layer=0; layer < tm->num_layers; ++layer) {
+      // tile not allowed on this layer
+      if ((brush->layermask & (1 << layer)) == 0)
+         continue;
+
+      if (layer != tm->solo_layer) {
+         int lock = tm->layerinfo[layer].locked;
+         // a selected layer is the only paintable one
+         if (tm->cur_layer >= 0 && tm->cur_layer != layer)
+            continue;
+         // hidden or fully locked layers are off limits
+         if (tm->layerinfo[layer].hidden || lock == STBTE__locked)
+            continue;
+         // protected layers accept writes to empty cells only
+         if (lock == STBTE__protected && tm->data[y][x][layer] != STBTE__BG(tm,layer))
+            continue;
+      }
+
+      // first qualifying layer wins
+      stbte__undo_record(tm, x, y, layer, tm->data[y][x][layer]);
+      tm->data[y][x][layer] = brush->id;
+      return;
+   }
+
+   //stbte__alert("Selected tile not valid on active layer(s)");
+}
+
+enum                             // `allow_any` modes and return codes of stbte__erase / stbte__erase_predict
+{
+   STBTE__erase_none = -1,       // erase nothing / nothing was erased
+   STBTE__erase_brushonly = 0,   // only a tile matching the current brush tile is erased
+   STBTE__erase_any = 1,         // a tile is erased whether or not it matches the brush
+   STBTE__erase_all = 2,         // every visible, unlocked layer holding a tile is cleared
+};
+
+static int stbte__erase_predict(stbte_tilemap *tm, short result[], int allow_any)   // preview of stbte__erase: edits the per-layer stack `result` instead of the map; returns an STBTE__erase_* code
+{
+   stbte__tileinfo *ti = tm->cur_tile >= 0 ? &tm->tiles[tm->cur_tile] : NULL;   // current brush tile, or NULL if none
+   int i;
+   // allow_any is one of the STBTE__erase_* modes
+   if (allow_any == STBTE__erase_none)
+      return allow_any;
+
+   // first check if only one layer is legit
+   i = tm->cur_layer;
+   if (tm->solo_layer >= 0)
+      i = tm->solo_layer;   // solo takes precedence over the selected layer
+
+   // if only one layer is legit, directly process that one for clarity
+   if (i >= 0) {
+      short bg = (i == 0 ? tm->background_tile : -1);   // "empty" value for this layer
+      if (tm->solo_layer < 0) {
+         // check that we're allowed to write to it
+         if (tm->layerinfo[i].hidden) return STBTE__erase_none;
+         if (tm->layerinfo[i].locked) return STBTE__erase_none;
+      }
+      if (result[i] == bg)
+         return STBTE__erase_none; // didn't erase anything
+      if (ti && result[i] == ti->id && (i != 0 || ti->id != tm->background_tile)) {   // entry holds the brush tile
+         result[i] = bg;
+         return STBTE__erase_brushonly;
+      }
+      if (allow_any == STBTE__erase_any) {
+         result[i] = bg;
+         return STBTE__erase_any;
+      }
+      return STBTE__erase_none;
+   }
+
+   // if multiple layers are legit, first scan all for brush data
+   // (topmost layer first; skipped entirely in STBTE__erase_all mode)
+   if (ti && allow_any != STBTE__erase_all) {
+      for (i=tm->num_layers-1; i >= 0; --i) {
+         if (result[i] != ti->id)
+            continue;
+         if (tm->layerinfo[i].locked || tm->layerinfo[i].hidden)
+            continue;
+         if (i == 0 && result[i] == tm->background_tile)
+            return STBTE__erase_none;   // layer 0 already shows the background
+         result[i] = STBTE__BG(tm,i);
+         return STBTE__erase_brushonly;
+      }
+   }
+
+   if (allow_any != STBTE__erase_any && allow_any != STBTE__erase_all)
+      return STBTE__erase_none;
+
+   // apply layer filters, erase from top
+   for (i=tm->num_layers-1; i >= 0; --i) {
+      if (result[i] < 0)
+         continue;   // nothing on this layer
+      if (tm->layerinfo[i].locked || tm->layerinfo[i].hidden)
+         continue;
+      if (i == 0 && result[i] == tm->background_tile)
+         return STBTE__erase_none;
+      result[i] = STBTE__BG(tm,i);
+      if (allow_any != STBTE__erase_all)
+         return STBTE__erase_any;   // erase_any stops after the first layer it clears
+   }
+
+   if (allow_any == STBTE__erase_all)
+      return allow_any;
+   return STBTE__erase_none;
+}
+
+// Erases from map cell (x,y) according to `allow_any` (one of the STBTE__erase_* modes), recording
+// undo data for every tile it replaces. Returns one of the STBTE__erase_* codes.
+static int stbte__erase(stbte_tilemap *tm, int x, int y, int allow_any)
+{
+   stbte__tileinfo *ti = tm->cur_tile >= 0 ? &tm->tiles[tm->cur_tile] : NULL;   // current brush tile, or NULL if none
+   int i;
+   if (allow_any == STBTE__erase_none)
+      return allow_any;
+
+   // first check if only one layer is legit
+   i = tm->cur_layer;
+   if (tm->solo_layer >= 0)
+      i = tm->solo_layer;   // solo takes precedence over the selected layer
+
+   // if only one layer is legit, directly process that one for clarity
+   if (i >= 0) {
+      short bg = (i == 0 ? tm->background_tile : -1);   // "empty" value for this layer
+      if (tm->solo_layer < 0) {
+         // check that we're allowed to write to it
+         if (tm->layerinfo[i].hidden) return STBTE__erase_none;
+         if (tm->layerinfo[i].locked) return STBTE__erase_none;
+      }
+      if (tm->data[y][x][i] == bg)
+         return STBTE__erase_none; // didn't erase anything (same value as the former literal -1)
+      if (ti && tm->data[y][x][i] == ti->id && (i != 0 || ti->id != tm->background_tile)) {
+         stbte__undo_record(tm,x,y,i,tm->data[y][x][i]);
+         tm->data[y][x][i] = bg;
+         return STBTE__erase_brushonly;
+      }
+      if (allow_any == STBTE__erase_any) {
+         stbte__undo_record(tm,x,y,i,tm->data[y][x][i]);
+         tm->data[y][x][i] = bg;
+         return STBTE__erase_any;
+      }
+      return STBTE__erase_none;
+   }
+
+   // if multiple layers are legit, first scan all for brush data (topmost first)
+   if (ti && allow_any != STBTE__erase_all) {
+      for (i=tm->num_layers-1; i >= 0; --i) {
+         if (tm->data[y][x][i] != ti->id)
+            continue;
+         if (tm->layerinfo[i].locked || tm->layerinfo[i].hidden)
+            continue;
+         if (i == 0 && tm->data[y][x][i] == tm->background_tile)
+            return STBTE__erase_none;   // layer 0 already shows the background
+         stbte__undo_record(tm,x,y,i,tm->data[y][x][i]);
+         tm->data[y][x][i] = STBTE__BG(tm,i);
+         return STBTE__erase_brushonly;
+      }
+   }
+
+   if (allow_any != STBTE__erase_any && allow_any != STBTE__erase_all)
+      return STBTE__erase_none;
+
+   // apply layer filters, erase from top
+   for (i=tm->num_layers-1; i >= 0; --i) {
+      if (tm->data[y][x][i] < 0)
+         continue;   // nothing on this layer
+      if (tm->layerinfo[i].locked || tm->layerinfo[i].hidden)
+         continue;
+      if (i == 0 && tm->data[y][x][i] == tm->background_tile)
+         return STBTE__erase_none;
+      stbte__undo_record(tm,x,y,i,tm->data[y][x][i]);
+      tm->data[y][x][i] = STBTE__BG(tm,i);
+      if (allow_any != STBTE__erase_all)
+         return STBTE__erase_any;   // erase_any stops after the first layer it clears
+   }
+   if (allow_any == STBTE__erase_all)
+      return allow_any;
+   return STBTE__erase_none;
+}
+
+// Returns the index in tm->tiles of the first tile whose id equals tile_id; if none matches, raises an alert and returns -1.
+static int stbte__find_tile(stbte_tilemap *tm, int tile_id)
+{
+   int idx = 0;
+   while (idx < tm->num_tiles && tm->tiles[idx].id != tile_id) ++idx;
+   if (idx < tm->num_tiles) return idx;
+   stbte__alert("Eyedropped tile that isn't in tileset");
+   return -1;
+}
+
+// Eyedropper: makes the tile found at map cell (x,y) the current tile. With a
+// solo or selected layer only that layer is sampled; otherwise repeated calls
+// on the same cell step through the visible non-empty layers, starting at
+// the top and moving toward layer 0 before wrapping around.
+static void stbte__eyedrop(stbte_tilemap *tm, int x, int y)
+{
+   int layer, tries;
+
+   // a different cell restarts the cycle from the top layer
+   if (!(stbte__ui.eyedrop_x == x && stbte__ui.eyedrop_y == y)) {
+      stbte__ui.eyedrop_x = x;
+      stbte__ui.eyedrop_y = y;
+      stbte__ui.eyedrop_last_layer = tm->num_layers;
+   }
+
+   // single active layer: solo takes precedence over the selected layer
+   layer = (tm->solo_layer >= 0) ? tm->solo_layer : tm->cur_layer;
+   if (layer >= 0) {
+      if (tm->data[y][x][layer] != STBTE__NO_TILE)
+         tm->cur_tile = stbte__find_tile(tm, tm->data[y][x][layer]);
+      return;
+   }
+
+   // all layers: resume just below the layer sampled last time
+   layer = stbte__ui.eyedrop_last_layer;
+   for (tries = tm->num_layers; tries > 0; --tries) {
+      layer = (layer-1 < 0) ? tm->num_layers-1 : layer-1;
+      if (tm->layerinfo[layer].hidden)
+         continue;
+      if (tm->data[y][x][layer] == STBTE__NO_TILE)
+         continue;
+      stbte__ui.eyedrop_last_layer = layer;
+      tm->cur_tile = stbte__find_tile(tm, tm->data[y][x][layer]);
+      return;
+   }
+}
+
+// Returns 1 if tile properties should be carried along, else 0. propmode
+// "always"/"never" decides outright; otherwise the answer is 1 only when no
+// layer is soloed or selected and no layer is hidden or locked.
+static int stbte__should_copy_properties(stbte_tilemap *tm)
+{
+   int layer;
+   if (tm->propmode == STBTE__propmode_always) return 1;
+   if (tm->propmode == STBTE__propmode_never)  return 0;
+   if (tm->cur_layer >= 0 || tm->solo_layer >= 0) return 0;
+   for (layer = tm->num_layers-1; layer >= 0; --layer)
+      if (tm->layerinfo[layer].locked || tm->layerinfo[layer].hidden)
+         return 0;
+   return 1;
+}
+
+// Computes, without touching the map, the tile stack that results from
+// pasting `src` over `dest`, and stores it in `result` (which may be the same
+// array as `dest`). `dragging` is nonzero for a move, which also has to erase
+// the source and is therefore refused on protected layers.
+static void stbte__paste_stack(stbte_tilemap *tm, short result[], short dest[], short src[], int dragging)
+{
+   int layer = (tm->solo_layer >= 0) ? tm->solo_layer : tm->cur_layer;
+   // single active layer (solo wins over selected)
+   if (layer >= 0) {
+      if (tm->solo_layer < 0) {
+         // a merely selected layer must be visible and writable
+         int lock = tm->layerinfo[layer].locked;
+         if (tm->layerinfo[layer].hidden || lock == STBTE__locked)
+            return;
+         if (lock == STBTE__protected && (dragging || dest[layer] != STBTE__BG(tm,layer)))
+            return;
+      }
+      result[layer] = dest[layer];
+      if (src[layer] != STBTE__BG(tm,layer))
+         result[layer] = src[layer];
+      return;
+   }
+
+   // no single layer: merge layer by layer
+   for (layer=0; layer < tm->num_layers; ++layer) {
+      result[layer] = dest[layer];
+      if (src[layer] == STBTE__NO_TILE)
+         continue;   // nothing to paste on this layer
+      if (tm->layerinfo[layer].hidden || tm->layerinfo[layer].locked == STBTE__locked)
+         continue;
+      if (tm->layerinfo[layer].locked == STBTE__unlocked || (!dragging && dest[layer] == STBTE__BG(tm,layer)))
+         result[layer] = src[layer];
+   }
+}
+
+// Computes what a tile stack looks like after its contents are dragged away:
+// the single active layer (solo, else selected) is reset to its empty value,
+// or, with no single layer active, every visible unlocked layer is.
+static void stbte__clear_stack(stbte_tilemap *tm, short result[])
+{
+   int layer = (tm->solo_layer >= 0) ? tm->solo_layer : tm->cur_layer;
+   if (layer >= 0) {
+      result[layer] = STBTE__BG(tm,layer);
+      return;
+   }
+   for (layer=0; layer < tm->num_layers; ++layer) {
+      if (tm->layerinfo[layer].hidden || tm->layerinfo[layer].locked != STBTE__unlocked) continue;
+      result[layer] = STBTE__BG(tm,layer);
+   }
+}
+
+// check if some map square is active / hot: the low 7 bits of an id hold its identifier, and STBTE__map marks map cells
+#define STBTE__IS_MAP_ACTIVE()  ((stbte__ui.active_id & 127) == STBTE__map)
+#define STBTE__IS_MAP_HOT()     ((stbte__ui.hot_id & 127) == STBTE__map)
+
+// Brushes (fill != 0) or erases (fill == 0) every cell of the inclusive
+// rectangle, given in either corner order, inside one begin/end undo pair.
+static void stbte__fillrect(stbte_tilemap *tm, int x0, int y0, int x1, int y1, int fill)
+{
+   int col, row;
+   int left = x0 > x1 ? x1 : x0, right  = x0 > x1 ? x0 : x1;
+   int top  = y0 > y1 ? y1 : y0, bottom = y0 > y1 ? y0 : y1;
+   stbte__begin_undo(tm);
+   for (row=top; row <= bottom; ++row) {
+      for (col=left; col <= right; ++col) {
+         if (fill) stbte__brush(tm, col, row);
+         else      stbte__erase(tm, col, row, STBTE__erase_any);
+      }
+   }
+   stbte__end_undo(tm);
+   stbte__ui.alert_msg = 0;   // suppress warning from brush
+}
+
+// Records the inclusive selection rectangle, normalising the corner order; `tm` is not used.
+static void stbte__select_rect(stbte_tilemap *tm, int x0, int y0, int x1, int y1)
+{
+   int flip_x = !(x0 < x1), flip_y = !(y0 < y1);
+   stbte__ui.select_x0 = flip_x ? x1 : x0;  stbte__ui.select_x1 = flip_x ? x0 : x1;
+   stbte__ui.select_y0 = flip_y ? y1 : y0;  stbte__ui.select_y1 = flip_y ? y0 : y1;
+   stbte__ui.has_selection = 1;
+}
+
+// Copies STBTE_MAX_PROPERTIES floats from src to dest, front to back.
+static void stbte__copy_properties(float *dest, float *src)
+{
+   float *end = src + STBTE_MAX_PROPERTIES;
+   while (src != end) *dest++ = *src++;
+}
+
+static void stbte__copy_cut(stbte_tilemap *tm, int cut)   // copies the selection into the copy buffer; if `cut` is nonzero the copied tiles are also cleared from the map (with undo)
+{
+   int i,j,n,w,h,p=0;   // p: index of the current cell within the copy buffer (row-major)
+   int copy_props = stbte__should_copy_properties(tm);
+   if (!stbte__ui.has_selection)
+      return;
+   w = stbte__ui.select_x1 - stbte__ui.select_x0 + 1;
+   h = stbte__ui.select_y1 - stbte__ui.select_y0 + 1;
+   if (STBTE_MAX_COPY / w < h) {   // w*h would exceed STBTE_MAX_COPY
+      stbte__alert("Selection too large for copy buffer, increase STBTE_MAX_COPY");
+      return;
+   }
+
+   for (i=0; i < w*h; ++i)   // start from an all-empty buffer so filtered-out layers stay STBTE__NO_TILE
+      for (n=0; n < tm->num_layers; ++n)
+         stbte__ui.copybuffer[i][n] = STBTE__NO_TILE;
+
+   if (cut)
+      stbte__begin_undo(tm);
+   for (j=stbte__ui.select_y0; j <= stbte__ui.select_y1; ++j) {
+      for (i=stbte__ui.select_x0; i <= stbte__ui.select_x1; ++i) {
+         for (n=0; n < tm->num_layers; ++n) {
+            if (tm->solo_layer >= 0) {   // solo: only that layer is copied, no other filter applies
+               if (tm->solo_layer != n)
+                  continue;
+            } else {
+               if (tm->cur_layer >= 0)   // a selected layer restricts the copy to itself
+                  if (tm->cur_layer != n)
+                     continue;
+               if (tm->layerinfo[n].hidden)
+                  continue;
+               if (cut && tm->layerinfo[n].locked)   // locked layers may be copied but not cut
+                  continue;
+            }
+            stbte__ui.copybuffer[p][n] = tm->data[j][i][n];
+            if (cut) {
+               stbte__undo_record(tm,i,j,n, tm->data[j][i][n]);
+               tm->data[j][i][n] = (n==0 ? tm->background_tile : -1);   // reset to the layer's empty value
+            }
+         }
+         if (copy_props) {
+            stbte__copy_properties(stbte__ui.copyprops[p], tm->props[j][i]);
+#ifdef STBTE_ALLOW_LINK
+            stbte__ui.copylinks[p] = tm->link[j][i];
+            if (cut)
+               stbte__set_link(tm, i,j,-1,-1, STBTE__undo_record);   // clear the cut cell's link
+#endif
+         }
+         ++p;
+      }
+   }
+   if (cut)
+      stbte__end_undo(tm);
+   stbte__ui.copy_width = w;
+   stbte__ui.copy_height = h;
+   stbte__ui.has_copy = 1;
+   //stbte__ui.has_selection = 0;
+   stbte__ui.copy_has_props = copy_props;
+   stbte__ui.copy_src = tm; // used to give better semantics when copying links
+   stbte__ui.copy_src_x = stbte__ui.select_x0;   // top-left corner of the source rectangle
+   stbte__ui.copy_src_y = stbte__ui.select_y0;
+}
+
+static int stbte__in_rect(int x, int y, int x0, int y0, int w, int h)   // 1 iff (x,y) lies in [x0,x0+w) x [y0,y0+h), else 0
+{
+   return !(x < x0 || x >= x0+w || y < y0 || y >= y0+h);
+}
+
+#ifdef STBTE_ALLOW_LINK
+static int stbte__in_src_rect(int x, int y)   // nonzero iff (x,y) lies inside the rectangle at (copy_src_x,copy_src_y) with the copy buffer's width and height
+{
+   return stbte__in_rect(x,y, stbte__ui.copy_src_x, stbte__ui.copy_src_y, stbte__ui.copy_width, stbte__ui.copy_height);
+}
+
+static int stbte__in_dest_rect(int x, int y, int destx, int desty)   // nonzero iff (x,y) lies inside a copy-buffer-sized rectangle whose top-left corner is (destx,desty)
+{
+   return stbte__in_rect(x,y, destx, desty, stbte__ui.copy_width, stbte__ui.copy_height);
+}
+#endif
+
+// Pastes the copy buffer centred on map cell (mapx,mapy) inside one begin/end
+// undo pair. Buffer cells that fall outside the map are skipped entirely:
+// neither their tiles nor their properties/links are written.
+static void stbte__paste(stbte_tilemap *tm, int mapx, int mapy)
+{
+   int w = stbte__ui.copy_width, h = stbte__ui.copy_height;
+   int i,j,k,p=0;                              // p: index of the current cell within the copy buffer
+   int x = mapx - (w>>1), y = mapy - (h>>1);   // top-left map cell of the pasted block
+   int copy_props = stbte__should_copy_properties(tm) && stbte__ui.copy_has_props;
+   if (stbte__ui.has_copy == 0)
+      return;
+   stbte__begin_undo(tm);
+   for (j=0; j < h; ++j) {
+      for (i=0; i < w; ++i) {
+         if (stbte__in_rect(x+i, y+j, 0, 0, tm->max_x, tm->max_y)) {
+            // compute the new stack
+            short tilestack[STBTE_MAX_LAYERS];
+            for (k=0; k < tm->num_layers; ++k)
+               tilestack[k] = tm->data[y+j][x+i][k];
+            stbte__paste_stack(tm, tilestack, tilestack, stbte__ui.copybuffer[p], 0);
+            // update anything that changed
+            for (k=0; k < tm->num_layers; ++k) {
+               if (tilestack[k] != tm->data[y+j][x+i][k]) {
+                  stbte__undo_record(tm, x+i,y+j,k, tm->data[y+j][x+i][k]);
+                  tm->data[y+j][x+i][k] = tilestack[k];
+               }
+            }
+         }
+         if (copy_props && stbte__in_rect(x+i, y+j, 0, 0, tm->max_x, tm->max_y)) {   // bounds guard: never index props/links with off-map coordinates
+#ifdef STBTE_ALLOW_LINK
+            // need to decide how to paste a link, so there's a few cases
+            int destx = -1, desty = -1;
+            stbte__link *link = &stbte__ui.copylinks[p];
+
+            // check if link is within-rect
+            if (stbte__in_src_rect(link->x, link->y)) {
+               // new link should point to copy (but only if copy is within map)
+               destx = x + (link->x - stbte__ui.copy_src_x);
+               desty = y + (link->y - stbte__ui.copy_src_y);
+            } else if (tm == stbte__ui.copy_src) {
+               // if same map, then preserve link unless target is overwritten
+               if (!stbte__in_dest_rect(link->x,link->y,x,y)) {
+                  destx = link->x;
+                  desty = link->y;
+               }
+            }
+            // this is necessary for offset-copy, but also in case max_x/max_y has changed
+            if (destx < 0 || destx >= tm->max_x || desty < 0 || desty >= tm->max_y)
+               destx = -1, desty = -1;
+            stbte__set_link(tm, x+i, y+j, destx, desty, STBTE__undo_record);
+#endif
+            for (k=0; k < STBTE_MAX_PROPERTIES; ++k) {
+               if (tm->props[y+j][x+i][k] != stbte__ui.copyprops[p][k])
+                  stbte__undo_record_prop_float(tm, x+i, y+j, k, tm->props[y+j][x+i][k]);
+            }
+            stbte__copy_properties(tm->props[y+j][x+i], stbte__ui.copyprops[p]);
+         }
+         ++p;   // advance through the buffer even for skipped cells
+      }
+   }
+   stbte__end_undo(tm);
+}
+
+// Applies the current selection drag to ONE map square (mapx,mapy).
+//
+// The drag source rectangle is (drag_x,drag_y) sized drag_w x drag_h and the
+// destination rectangle has the same size at (drag_dest_x,drag_dest_y), all
+// read from stbte__ui. When shift is not held the drag is a move (source
+// squares are cleared); with shift held the source is left in place.
+//
+//   tm          tilemap being edited
+//   mapx,mapy   map square to update
+//   copy_props  nonzero to also transfer per-tile properties (and, when
+//               STBTE_ALLOW_LINK is defined, fix up links)
+//
+// Changes are recorded through the undo-record helpers; this function does
+// not open or close an undo block itself.
+static void stbte__drag_update(stbte_tilemap *tm, int mapx, int mapy, int copy_props)
+{
+   int w = stbte__ui.drag_w, h = stbte__ui.drag_h;
+   int ox,oy,i,deleted=0,written=0;
+   short temp[STBTE_MAX_LAYERS];
+   short *data = NULL; // stays NULL unless this square's tile stack changes
+
+   // deleted/written are only read inside the STBTE_ALLOW_LINK section below
+   STBTE__NOTUSED(deleted);
+   STBTE__NOTUSED(written);
+
+   // move (not shift-copy): a square inside the source rect gets cleared
+   if (!stbte__ui.shift) {
+      ox = mapx - stbte__ui.drag_x;
+      oy = mapy - stbte__ui.drag_y;
+      if (ox >= 0 && ox < w && oy >= 0 && oy < h) {
+         deleted=1;
+         for (i=0; i < tm->num_layers; ++i)
+            temp[i] = tm->data[mapy][mapx][i];
+         data = temp;
+         stbte__clear_stack(tm, data);
+      }
+   }
+   ox = mapx - stbte__ui.drag_dest_x;
+   oy = mapy - stbte__ui.drag_dest_y;
+   // if this map square is in the target drag region
+   if (ox >= 0 && ox < w && oy >= 0 && oy < h) {
+      // and the src map square is on the map
+      if (stbte__in_rect(stbte__ui.drag_x+ox, stbte__ui.drag_y+oy, 0, 0, tm->max_x, tm->max_y)) {
+         written = 1;
+         if (data == NULL) {
+            for (i=0; i < tm->num_layers; ++i)
+               temp[i] = tm->data[mapy][mapx][i];
+            data = temp;
+         }
+         // merge the corresponding source square's stack into the working copy
+         stbte__paste_stack(tm, data, data, tm->data[stbte__ui.drag_y+oy][stbte__ui.drag_x+ox], !stbte__ui.shift);
+         if (copy_props) {
+            // copy each differing property, recording the old value for undo first
+            for (i=0; i < STBTE_MAX_PROPERTIES; ++i) {
+               if (tm->props[mapy][mapx][i] != tm->props[stbte__ui.drag_y+oy][stbte__ui.drag_x+ox][i]) {
+                  stbte__undo_record_prop_float(tm, mapx, mapy, i, tm->props[mapy][mapx][i]);
+                  tm->props[mapy][mapx][i] = tm->props[stbte__ui.drag_y+oy][stbte__ui.drag_x+ox][i];
+               }
+            }
+         }
+      }
+   }
+   // commit the working copy to the map, recording only layers that changed
+   if (data) {
+      for (i=0; i < tm->num_layers; ++i) {
+         if (tm->data[mapy][mapx][i] != data[i]) {
+            stbte__undo_record(tm, mapx, mapy, i, tm->data[mapy][mapx][i]);
+            tm->data[mapy][mapx][i] = data[i];
+         }
+      }
+   }
+   #ifdef STBTE_ALLOW_LINK
+   if (copy_props) {
+      int overwritten=0, moved=0, copied=0;
+      // since this function is called on EVERY tile, we can fix up even tiles not
+      // involved in the move
+
+      stbte__link *k;
+      // first, determine what src link ends up here
+      k = &tm->link[mapy][mapx]; // by default, it's the one currently here
+      if (deleted)               // if dragged away, it's erased
+         k = NULL;
+      if (written)               // if dragged into, it gets that link
+         k = &tm->link[stbte__ui.drag_y+oy][stbte__ui.drag_x+ox];
+
+      // now check whether the *target* gets moved or overwritten
+      // (a link with x < 0 is treated as "no link")
+      if (k && k->x >= 0) {
+         overwritten = stbte__in_rect(k->x, k->y, stbte__ui.drag_dest_x, stbte__ui.drag_dest_y, w, h);
+         if (!stbte__ui.shift)
+            moved    = stbte__in_rect(k->x, k->y, stbte__ui.drag_x     , stbte__ui.drag_y     , w, h);
+         else
+            copied   = stbte__in_rect(k->x, k->y, stbte__ui.drag_x     , stbte__ui.drag_y     , w, h);
+      }
+
+      if (deleted || written || overwritten || moved || copied) {
+         // choose the final link value based on the above
+         if (k == NULL || k->x < 0)
+            stbte__set_link(tm, mapx, mapy, -1, -1, STBTE__undo_record);
+         else if (moved || (copied && written)) {
+            // if we move the target, we update to point to the new target;
+            // or, if we copy the target and the source is part of the copy, then update to new target
+            int x = k->x + (stbte__ui.drag_dest_x - stbte__ui.drag_x);
+            int y = k->y + (stbte__ui.drag_dest_y - stbte__ui.drag_y);
+            // a translated target that falls off the map becomes "no link"
+            if (!(x >= 0 && y >= 0 && x < tm->max_x && y < tm->max_y))
+               x = -1, y = -1;
+            stbte__set_link(tm, mapx, mapy, x, y, STBTE__undo_record);
+         } else if (overwritten) {
+            stbte__set_link(tm, mapx, mapy, -1, -1, STBTE__undo_record);
+         } else
+            stbte__set_link(tm, mapx, mapy, k->x, k->y, STBTE__undo_record);
+      }
+   }
+   #endif
+}
+
+// Commits the current selection drag: visits every map square once via
+// stbte__drag_update inside a single undo block, then re-selects the
+// destination rectangle. Does nothing when the drag offset is zero.
+// (mapx,mapy are accepted but not read here.)
+static void stbte__drag_place(stbte_tilemap *tm, int mapx, int mapy)
+{
+   int col, row;
+   int with_props = stbte__should_copy_properties(tm);
+   int dx = stbte__ui.drag_dest_x - stbte__ui.drag_x;
+   int dy = stbte__ui.drag_dest_y - stbte__ui.drag_y;
+   int backward;
+
+   if (!(dx || dy))
+      return;
+
+   // like memmove, the traversal direction must follow the direction of the
+   // move so that no source square is overwritten before it has been read
+   backward = (dy > 0) || (dy == 0 && dx > 0);
+
+   stbte__begin_undo(tm);
+   if (backward) {
+      // bottom-right to top-left
+      row = tm->max_y;
+      while (row-- > 0) {
+         col = tm->max_x;
+         while (col-- > 0)
+            stbte__drag_update(tm, col, row, with_props);
+      }
+   } else {
+      // top-left to bottom-right
+      for (row = 0; row < tm->max_y; ++row) {
+         for (col = 0; col < tm->max_x; ++col)
+            stbte__drag_update(tm, col, row, with_props);
+      }
+   }
+   stbte__end_undo(tm);
+
+   // the selection now covers the rectangle the tiles were dropped onto
+   stbte__ui.select_x0 = stbte__ui.drag_dest_x;
+   stbte__ui.select_y0 = stbte__ui.drag_dest_y;
+   stbte__ui.select_x1 = stbte__ui.select_x0 + stbte__ui.drag_w - 1;
+   stbte__ui.select_y1 = stbte__ui.select_y0 + stbte__ui.drag_h - 1;
+   stbte__ui.has_selection = 1;
+}
+
+// Draws one layer of one map square.
+//
+//   tm         tilemap being drawn
+//   sx,sy      screen position of the square's top-left corner
+//   mapx,mapy  map coordinates of the square
+//   layer      index of the layer to draw
+//
+// Normally the tile comes straight from tm->data, but while pasting, dragging,
+// using the rect tool, or hovering with the brush, a preview stack is built in
+// a local buffer and drawn instead; the map itself is not modified here.
+static void stbte__tile_paint(stbte_tilemap *tm, int sx, int sy, int mapx, int mapy, int layer)
+{
+   int i;
+   int id = STBTE__IDMAP(mapx,mapy);
+   int x0=sx, y0=sy;
+   int x1=sx+tm->spacing_x, y1=sy+tm->spacing_y;
+   short *data = tm->data[mapy][mapx];
+   short temp[STBTE_MAX_LAYERS];
+
+   // the hit test is issued only after every declaration so the function
+   // stays valid C89 (no declaration after a statement)
+   stbte__hittest(x0,y0,x1,y1, id);
+
+   if (STBTE__IS_MAP_HOT()) {
+      if (stbte__ui.pasting) {
+         // preview the copy buffer positioned at the paste location
+         int ox = mapx - stbte__ui.paste_x;
+         int oy = mapy - stbte__ui.paste_y;
+         if (ox >= 0 && ox < stbte__ui.copy_width && oy >= 0 && oy < stbte__ui.copy_height) {
+            stbte__paste_stack(tm, temp, tm->data[mapy][mapx], stbte__ui.copybuffer[oy*stbte__ui.copy_width+ox], 0);
+            data = temp;
+         }
+      } else if (stbte__ui.dragging) {
+         int ox,oy;
+         for (i=0; i < tm->num_layers; ++i)
+            temp[i] = tm->data[mapy][mapx][i];
+         data = temp;
+
+         // if it's in the source area, remove things unless shift-dragging
+         ox = mapx - stbte__ui.drag_x;
+         oy = mapy - stbte__ui.drag_y;
+         if (!stbte__ui.shift && ox >= 0 && ox < stbte__ui.drag_w && oy >= 0 && oy < stbte__ui.drag_h) {
+            stbte__clear_stack(tm, temp);
+         }
+
+         // if it's in the destination area, show what would be dropped here
+         ox = mapx - stbte__ui.drag_dest_x;
+         oy = mapy - stbte__ui.drag_dest_y;
+         if (ox >= 0 && ox < stbte__ui.drag_w && oy >= 0 && oy < stbte__ui.drag_h) {
+            stbte__paste_stack(tm, temp, temp, tm->data[stbte__ui.drag_y+oy][stbte__ui.drag_x+ox], !stbte__ui.shift);
+         }
+      } else if (STBTE__IS_MAP_ACTIVE()) {
+         if (stbte__ui.tool == STBTE__tool_rect) {
+            // blink the preview: shown for part of every 512ms period
+            if ((stbte__ui.ms_time & 511) < 380) {
+               // the hot map id carries the hovered square's x in bits 19.. and y in bits 7..
+               int ex = ((stbte__ui.hot_id >> 19) & 4095);
+               int ey = ((stbte__ui.hot_id >>  7) & 4095);
+               // anchor corner of the rectangle (named so as not to shadow sx/sy)
+               int ax = stbte__ui.sx;
+               int ay = stbte__ui.sy;
+
+               // inside the rectangle spanned by anchor and hovered square, in either order
+               if (   ((mapx >= ax && mapx < ex+1) || (mapx >= ex && mapx < ax+1))
+                   && ((mapy >= ay && mapy < ey+1) || (mapy >= ey && mapy < ay+1))) {
+                  for (i=0; i < tm->num_layers; ++i)
+                     temp[i] = tm->data[mapy][mapx][i];
+                  data = temp;
+                  if (stbte__ui.active_event == STBTE__leftdown)
+                     stbte__brush_predict(tm, temp);
+                  else
+                     stbte__erase_predict(tm, temp, STBTE__erase_any);
+               }
+            }
+         }
+      }
+   }
+
+   // hovering with the brush tool while idle: blink a preview of the brush result
+   if (STBTE__IS_HOT(id) && STBTE__INACTIVE() && !stbte__ui.pasting) {
+      if (stbte__ui.tool == STBTE__tool_brush) {
+         if ((stbte__ui.ms_time & 511) < 300) {
+            data = temp;
+            for (i=0; i < tm->num_layers; ++i)
+               temp[i] = tm->data[mapy][mapx][i];
+            stbte__brush_predict(tm, temp);
+         }
+      }
+   }
+
+   {
+      // draw if this is the solo layer, or it is not hidden and no layer is soloed
+      i = layer;
+      if (i == tm->solo_layer || (!tm->layerinfo[i].hidden && tm->solo_layer < 0))
+         if (data[i] >= 0)
+            STBTE_DRAW_TILE(sx,sy, (unsigned short) data[i], 0, tm->props[mapy][mapx]);
+   }
+}
+
+// Per-square UI handler: processes the current stbte__ui.event for the map
+// square (mapx,mapy) whose screen rectangle starts at (sx,sy).
+//
+// On the paint event it draws only overlays (highlight frames and, when
+// STBTE_ALLOW_LINK is defined, link lines). For input events it handles, in
+// order: an in-progress paste, an in-progress scroll-drag, starting a
+// scroll-drag, and finally the behaviour of the currently selected tool.
+static void stbte__tile(stbte_tilemap *tm, int sx, int sy, int mapx, int mapy)
+{
+   int tool = stbte__ui.tool;
+   int x0=sx, y0=sy;
+   int x1=sx+tm->spacing_x, y1=sy+tm->spacing_y;
+   int id = STBTE__IDMAP(mapx,mapy);
+   int over = stbte__hittest(x0,y0,x1,y1, id);
+   switch (stbte__ui.event) {
+      case STBTE__paint: {
+         // no overlays while pasting, dragging or scrolling
+         if (stbte__ui.pasting || stbte__ui.dragging || stbte__ui.scrolling)
+            break;
+         if (stbte__ui.scrollkey && !STBTE__IS_MAP_ACTIVE())
+            break;
+         // rect/select in progress: frame the rectangle from the anchor square
+         // (stbte__ui.sx,sy) to this hovered square
+         if (STBTE__IS_HOT(id) && STBTE__IS_MAP_ACTIVE() && (tool == STBTE__tool_rect || tool == STBTE__tool_select)) {
+            int rx0,ry0,rx1,ry1,t;
+            // compute the center of each rect
+            rx0 = x0 + tm->spacing_x/2;
+            ry0 = y0 + tm->spacing_y/2;
+            rx1 = rx0 + (stbte__ui.sx - mapx) * tm->spacing_x;
+            ry1 = ry0 + (stbte__ui.sy - mapy) * tm->spacing_y;
+            // order the corners, then grow from centers out to the tile edges
+            if (rx0 > rx1) t=rx0,rx0=rx1,rx1=t;
+            if (ry0 > ry1) t=ry0,ry0=ry1,ry1=t;
+            rx0 -= tm->spacing_x/2;
+            ry0 -= tm->spacing_y/2;
+            rx1 += tm->spacing_x/2;
+            ry1 += tm->spacing_y/2;
+            stbte__draw_frame(rx0-1,ry0-1,rx1+1,ry1+1, STBTE_COLOR_TILEMAP_HIGHLIGHT);
+            break;
+         }
+         // idle hover: frame just this square
+         if (STBTE__IS_HOT(id) && STBTE__INACTIVE()) {
+            stbte__draw_frame(x0-1,y0-1,x1+1,y1+1, STBTE_COLOR_TILEMAP_HIGHLIGHT);
+         }
+#ifdef STBTE_ALLOW_LINK
+         // draw this square's outgoing link if links are shown and the
+         // user-supplied STBTE_ALLOW_LINK predicate accepts the pair
+         if (stbte__ui.show_links && tm->link[mapy][mapx].x >= 0) {
+            int tx = tm->link[mapy][mapx].x;
+            int ty = tm->link[mapy][mapx].y;
+            int lx0,ly0,lx1,ly1;
+            if (STBTE_ALLOW_LINK(tm->data[mapy][mapx], tm->props[mapy][mapx],
+                                 tm->data[ty  ][tx  ], tm->props[ty  ][tx  ]))
+            {
+               lx0 =  x0 + (tm->spacing_x >> 1) - 1;
+               ly0 =  y0 + (tm->spacing_y >> 1) - 1;
+               lx1 = lx0 + (tx - mapx) * tm->spacing_x + 2;
+               ly1 = ly0 + (ty - mapy) * tm->spacing_y + 2;
+               stbte__draw_link(lx0,ly0,lx1,ly1,
+                   STBTE_LINK_COLOR(tm->data[mapy][mapx], tm->props[mapy][mapx],
+                                    tm->data[ty  ][tx  ], tm->props[ty  ][tx]));
+            }
+         }
+#endif
+         break;
+      }
+   }
+
+   // paste mode takes over all input: left click places, right click cancels
+   if (stbte__ui.pasting) {
+      switch (stbte__ui.event) {
+         case STBTE__leftdown:
+            if (STBTE__IS_HOT(id)) {
+               stbte__ui.pasting = 0;
+               stbte__paste(tm, mapx, mapy);
+               stbte__activate(0);
+            }
+            break;
+         case STBTE__leftup:
+            // just clear it no matter what, since they might click away to clear it
+            stbte__activate(0);
+            break;
+         case STBTE__rightdown:
+            if (STBTE__IS_HOT(id)) {
+               stbte__activate(0);
+               stbte__ui.pasting = 0;
+            }
+            break;
+      }
+      return;
+   }
+
+   // scroll-drag in progress: move the view by the mouse delta until release
+   if (stbte__ui.scrolling) {
+      if (stbte__ui.event == STBTE__leftup) {
+         stbte__activate(0);
+         stbte__ui.scrolling = 0;
+      }
+      if (stbte__ui.event == STBTE__mousemove) {
+         tm->scroll_x += (stbte__ui.start_x - stbte__ui.mx);
+         tm->scroll_y += (stbte__ui.start_y - stbte__ui.my);
+         stbte__ui.start_x = stbte__ui.mx;
+         stbte__ui.start_y = stbte__ui.my;
+      }
+      return;
+   }
+
+   // regardless of tool, leftdown is a scrolldrag
+   if (STBTE__IS_HOT(id) && stbte__ui.scrollkey && stbte__ui.event == STBTE__leftdown) {
+      stbte__ui.scrolling = 1;
+      stbte__ui.start_x = stbte__ui.mx;
+      stbte__ui.start_y = stbte__ui.my;
+      return;
+   }
+
+   switch (tool) {
+      // brush: left button paints, right button erases; one undo block per stroke
+      case STBTE__tool_brush:
+         switch (stbte__ui.event) {
+            case STBTE__mousemove:
+               if (STBTE__IS_MAP_ACTIVE() && over) {
+                  // don't brush/erase same tile multiple times unless they move away and back @TODO should just be only once, but that needs another data structure
+                  if (!STBTE__IS_ACTIVE(id)) {
+                     if (stbte__ui.active_event == STBTE__leftdown)
+                        stbte__brush(tm, mapx, mapy);
+                     else
+                        stbte__erase(tm, mapx, mapy, stbte__ui.brush_state);
+                     stbte__ui.active_id = id; // switch to this map square so we don't rebrush IT multiple times
+                  }
+               }
+               break;
+            case STBTE__leftdown:
+               if (STBTE__IS_HOT(id) && STBTE__INACTIVE()) {
+                  stbte__activate(id);
+                  stbte__begin_undo(tm);
+                  stbte__brush(tm, mapx, mapy);
+               }
+               break;
+            case STBTE__rightdown:
+               if (STBTE__IS_HOT(id) && STBTE__INACTIVE()) {
+                  stbte__activate(id);
+                  stbte__begin_undo(tm);
+                  // the result of the first erase is remembered in brush_state
+                  // and reused for the rest of this stroke (see mousemove above)
+                  if (stbte__erase(tm, mapx, mapy, STBTE__erase_any) == STBTE__erase_brushonly)
+                     stbte__ui.brush_state = STBTE__erase_brushonly;
+                  else
+                     stbte__ui.brush_state = STBTE__erase_any;
+               }
+               break;
+            case STBTE__leftup:
+            case STBTE__rightup:
+               if (STBTE__IS_MAP_ACTIVE()) {
+                  stbte__end_undo(tm);
+                  stbte__activate(0);
+               }
+               break;
+         }
+         break;
+
+#ifdef STBTE_ALLOW_LINK
+      // link: press on the source square, release on the target square
+      case STBTE__tool_link:
+         switch (stbte__ui.event) {
+            case STBTE__leftdown:
+               if (STBTE__IS_HOT(id) && STBTE__INACTIVE()) {
+                  stbte__activate(id);
+                  stbte__ui.linking = 1;
+                  stbte__ui.sx = mapx;
+                  stbte__ui.sy = mapy;
+                  // @TODO: undo
+               }
+               break;
+            case STBTE__leftup:
+               if (STBTE__IS_HOT(id) && STBTE__IS_MAP_ACTIVE()) {
+                  // releasing on the source itself, or on a disallowed target, clears the link
+                  if ((mapx != stbte__ui.sx || mapy != stbte__ui.sy) &&
+                         STBTE_ALLOW_LINK(tm->data[stbte__ui.sy][stbte__ui.sx], tm->props[stbte__ui.sy][stbte__ui.sx],
+                                          tm->data[mapy][mapx], tm->props[mapy][mapx]))
+                     stbte__set_link(tm, stbte__ui.sx, stbte__ui.sy, mapx, mapy, STBTE__undo_block);
+                  else
+                     stbte__set_link(tm, stbte__ui.sx, stbte__ui.sy, -1,-1, STBTE__undo_block);
+                  stbte__ui.linking = 0;
+                  stbte__activate(0);
+               }
+               break;
+
+            case STBTE__rightdown:
+               // right click on the active square cancels linking
+               if (STBTE__IS_ACTIVE(id)) {
+                  stbte__activate(0);
+                  stbte__ui.linking = 0;
+               }
+               break;
+         }
+         break;
+#endif
+
+      // erase: left button erases with STBTE__erase_all; one undo block per stroke
+      case STBTE__tool_erase:
+         switch (stbte__ui.event) {
+            case STBTE__mousemove:
+               if (STBTE__IS_MAP_ACTIVE() && over)
+                  stbte__erase(tm, mapx, mapy, STBTE__erase_all);
+               break;
+            case STBTE__leftdown:
+               if (STBTE__IS_HOT(id) && STBTE__INACTIVE()) {
+                  stbte__activate(id);
+                  stbte__begin_undo(tm);
+                  stbte__erase(tm, mapx, mapy, STBTE__erase_all);
+               }
+               break;
+            case STBTE__leftup:
+               if (STBTE__IS_MAP_ACTIVE()) {
+                  stbte__end_undo(tm);
+                  stbte__activate(0);
+               }
+               break;
+         }
+         break;
+
+      // select: drag out a new selection, or drag an existing selection to move/copy it
+      case STBTE__tool_select:
+         if (STBTE__IS_HOT(id)) {
+            switch (stbte__ui.event) {
+               case STBTE__leftdown:
+                  if (STBTE__INACTIVE()) {
+                     // if we're clicking in an existing selection...
+                     if (stbte__ui.has_selection) {
+                        if (  mapx >= stbte__ui.select_x0 && mapx <= stbte__ui.select_x1
+                           && mapy >= stbte__ui.select_y0 && mapy <= stbte__ui.select_y1)
+                        {
+                           // ...start dragging it; remember its rect and the grab offset
+                           stbte__ui.dragging = 1;
+                           stbte__ui.drag_x = stbte__ui.select_x0;
+                           stbte__ui.drag_y = stbte__ui.select_y0;
+                           stbte__ui.drag_w = stbte__ui.select_x1 - stbte__ui.select_x0 + 1;
+                           stbte__ui.drag_h = stbte__ui.select_y1 - stbte__ui.select_y0 + 1;
+                           stbte__ui.drag_offx = mapx - stbte__ui.select_x0;
+                           stbte__ui.drag_offy = mapy - stbte__ui.select_y0;
+                        }
+                     }
+                     stbte__ui.has_selection = 0; // no selection until it completes
+                     stbte__activate_map(mapx,mapy);
+                  }
+                  break;
+               case STBTE__leftup:
+                  if (STBTE__IS_MAP_ACTIVE()) {
+                     if (stbte__ui.dragging) {
+                        stbte__drag_place(tm, mapx,mapy);
+                        stbte__ui.dragging = 0;
+                        stbte__activate(0);
+                     } else {
+                        stbte__select_rect(tm, stbte__ui.sx, stbte__ui.sy, mapx, mapy);
+                        stbte__activate(0);
+                     }
+                  }
+                  break;
+               case STBTE__rightdown:
+                  stbte__ui.has_selection = 0;
+                  break;
+            }
+         }
+         break;
+
+      // rect: the final stbte__fillrect argument is 1 for the left button and 0 for the right
+      case STBTE__tool_rect:
+         if (STBTE__IS_HOT(id)) {
+            switch (stbte__ui.event) {
+               case STBTE__leftdown:
+                  if (STBTE__INACTIVE())
+                     stbte__activate_map(mapx,mapy);
+                  break;
+               case STBTE__leftup:
+                  if (STBTE__IS_MAP_ACTIVE()) {
+                     stbte__fillrect(tm, stbte__ui.sx, stbte__ui.sy, mapx, mapy, 1);
+                     stbte__activate(0);
+                  }
+                  break;
+               case STBTE__rightdown:
+                  if (STBTE__INACTIVE())
+                     stbte__activate_map(mapx,mapy);
+                  break;
+               case STBTE__rightup:
+                  if (STBTE__IS_MAP_ACTIVE()) {
+                     stbte__fillrect(tm, stbte__ui.sx, stbte__ui.sy, mapx, mapy, 0);
+                     stbte__activate(0);
+                  }
+                  break;
+            }
+         }
+         break;
+
+
+      // eyedrop: a left click on a square invokes stbte__eyedrop for it
+      case STBTE__tool_eyedrop:
+         switch (stbte__ui.event) {
+            case STBTE__leftdown:
+               if (STBTE__IS_HOT(id) && STBTE__INACTIVE())
+                  stbte__eyedrop(tm,mapx,mapy);
+               break;
+         }
+         break;
+   }
+}
+
+// Enters paste mode, but only when the copy buffer holds something
+// (stbte__ui.has_copy). The tilemap argument is not read.
+static void stbte__start_paste(stbte_tilemap *tm)
+{
+   if (!stbte__ui.has_copy)
+      return;
+
+   stbte__ui.pasting = 1;
+   stbte__activate(STBTE__ID(STBTE__toolbarB,3));
+}
+
+// Lays out and handles the toolbar: one icon button per tool (the fill tool
+// is skipped), followed by the "cut", "copy" and "paste" text buttons. The
+// row is roughly centred in the region that starts at x0 with width w.
+static void stbte__toolbar(stbte_tilemap *tm, int x0, int y0, int w, int h)
+{
+   int tool;
+   int estimated_width = 13 * STBTE__num_tool + 8+8+ 120+4 - 30;
+   int x = x0 + w/2 - estimated_width/2;
+   int y = y0+1;
+
+   for (tool = 0; tool < STBTE__num_tool; ++tool) {
+      int lit, greyed = 0;
+
+      // the fill tool gets no button and takes up no space
+      if (tool == STBTE__tool_fill)
+         continue;
+
+      // extra gap before the undo group and before the show-grid group
+      if (tool == STBTE__tool_undo || tool == STBTE__tool_showgrid)
+          x += 8;
+
+      // lit when it is the current tool, or a toggle that is switched on
+      lit = (stbte__ui.tool == tool);
+      if (tool == STBTE__tool_showgrid && stbte__ui.show_grid)
+         lit = 1;
+      if (tool == STBTE__tool_showlinks && stbte__ui.show_links)
+         lit = 1;
+
+      #ifndef STBTE_ALLOW_LINK
+      if (tool == STBTE__tool_link || tool == STBTE__tool_showlinks)
+         greyed = 1;
+      #endif
+      if (tool == STBTE__tool_undo && !stbte__undo_available(tm))
+         greyed = 1;
+      if (tool == STBTE__tool_redo && !stbte__redo_available(tm))
+         greyed = 1;
+
+      if (stbte__button_icon(STBTE__ctoolbar_button, toolchar[tool], x, y, 13, STBTE__ID(STBTE__toolbarA, tool), lit, greyed)) {
+         if (tool == STBTE__tool_showlinks) {
+            stbte__ui.show_links = !stbte__ui.show_links;
+         } else if (tool == STBTE__tool_showgrid) {
+            // cycles through three grid states
+            stbte__ui.show_grid = (stbte__ui.show_grid+1)%3;
+         } else if (tool == STBTE__tool_undo) {
+            stbte__undo(tm);
+         } else if (tool == STBTE__tool_redo) {
+            stbte__redo(tm);
+         } else {
+            // an ordinary tool: make it current and drop any selection
+            if (tool == STBTE__tool_eyedrop)
+               stbte__ui.eyedrop_last_layer = tm->num_layers; // flush eyedropper state
+            stbte__ui.tool = tool;
+            stbte__ui.has_selection = 0;
+         }
+      }
+      x += 13;
+   }
+
+   // clipboard buttons: cut/copy need a selection, paste needs a copy buffer
+   x += 8;
+   if (stbte__button(STBTE__ctoolbar_button, "cut"  , x, y,10, 40, STBTE__ID(STBTE__toolbarB,0), 0, !stbte__ui.has_selection))
+      stbte__copy_cut(tm, 1);
+   x += 42;
+   if (stbte__button(STBTE__ctoolbar_button, "copy" , x, y, 5, 40, STBTE__ID(STBTE__toolbarB,1), 0, !stbte__ui.has_selection))
+      stbte__copy_cut(tm, 0);
+   x += 42;
+   if (stbte__button(STBTE__ctoolbar_button, "paste", x, y, 0, 40, STBTE__ID(STBTE__toolbarB,2), stbte__ui.pasting, !stbte__ui.has_copy))
+      stbte__start_paste(tm);
+}
+
+#define STBTE__TEXTCOLOR(n)  stbte__color_table[n][STBTE__text][STBTE__idle]
+
+// Shows a labelled integer and, when id is nonzero, a '+' and a '-' mini
+// button that adjust it (by 10 while shift is held, otherwise by 1).
+//
+//   label   printf-style format applied to (digits, val)
+//   x,y     position of the text
+//   val     current value
+//   digits  field width passed to the format
+//   id      base widget id for the two buttons; 0 makes the value read-only
+//
+// Returns val unchanged when id is 0; otherwise the adjusted value clamped
+// to the range 1..4096.
+static int stbte__info_value(const char *label, int x, int y, int val, int digits, int id)
+{
+   int step;
+
+   if (stbte__ui.event == STBTE__paint) {
+      char text[16];
+      // shift right by however much narrower than 9 the label's first glyph is
+      int off = 9-stbte__get_char_width(label[0]);
+      stbte__sprintf(text stbte__sizeof(text), label, digits, val);
+      stbte__draw_text_core(x+off,y, text, 999, STBTE__TEXTCOLOR(STBTE__cpanel),1);
+   }
+
+   if (!id)
+      return val;
+
+   step = stbte__ui.shift ? 10 : 1;
+
+   // the buttons sit to the right of the text field, 9 pixels apart
+   x += 9+7*digits+4;
+   if (stbte__minibutton(STBTE__cmapsize, x,y, '+', STBTE__ID2(id,1,0)))
+      val += step;
+   x += 9;
+   if (stbte__minibutton(STBTE__cmapsize, x,y, '-', STBTE__ID2(id,2,0)))
+      val -= step;
+
+   if (val < 1)
+      val = 1;
+   else if (val > 4096)
+      val = 4096;
+   return val;
+}
+
+// Info panel: editable map width/height, the map coordinates of the hovered
+// square, and a preview of the current brush tile.
+//
+//   tm     tilemap whose max_x/max_y may be changed by the +/- buttons
+//   x0,y0  top-left corner of the panel
+//   w,h    panel size (not read here)
+//
+// When the panel's mode is nonzero the paired values are laid out side by
+// side; otherwise they are stacked vertically.
+static void stbte__info(stbte_tilemap *tm, int x0, int y0, int w, int h)
+{
+   int mode = stbte__ui.panel[STBTE__panel_info].mode;
+   int s = 11+7*tm->digits+4+15; // horizontal advance between paired values
+   int x,y;
+   int in_region;
+
+   x = x0+2;
+   y = y0+2;
+   tm->max_x = stbte__info_value("w:%*d",x,y, tm->max_x, tm->digits, STBTE__ID(STBTE__info,0));
+   // stbte__info_value only limits the value to 4096; the dimensions are used
+   // as loop bounds over the per-square arrays, so they must not exceed the
+   // compile-time map capacity either.
+   // NOTE(review): assumes STBTE_MAX_TILEMAP_X/Y are the array bounds of
+   // tm->data / tm->props as declared earlier in this file - confirm.
+   if (tm->max_x > STBTE_MAX_TILEMAP_X)
+      tm->max_x = STBTE_MAX_TILEMAP_X;
+   if (mode)
+      x += s;
+   else
+      y += 11;
+   tm->max_y = stbte__info_value("h:%*d",x,y, tm->max_y, tm->digits, STBTE__ID(STBTE__info,1));
+   if (tm->max_y > STBTE_MAX_TILEMAP_Y)
+      tm->max_y = STBTE_MAX_TILEMAP_Y;
+   x = x0+2;
+   y += 11;
+   // the low 7 bits of a widget id hold its kind; for map squares, x is read
+   // from bits 19.. and y from bits 7..
+   in_region = (stbte__ui.hot_id & 127) == STBTE__map;
+   // when the mouse is not over the map only the bare label is shown
+   stbte__info_value(in_region ? "x:%*d" : "x:",x,y, (stbte__ui.hot_id>>19)&4095, tm->digits, 0);
+   if (mode)
+      x += s;
+   else
+      y += 11;
+   stbte__info_value(in_region ? "y:%*d" : "y:",x,y, (stbte__ui.hot_id>> 7)&4095, tm->digits, 0);
+   y += 15;
+   x = x0+2;
+   stbte__draw_text(x,y,"brush:",40,STBTE__TEXTCOLOR(STBTE__cpanel));
+   // a negative cur_tile means no brush tile is selected
+   if (tm->cur_tile >= 0)
+      STBTE_DRAW_TILE(x+43,y-3,tm->tiles[tm->cur_tile].id,1,0);
+}
+
+// Layers panel: one row per visible layer with a select button and
+// hide / lock / solo toggles, a scrollbar, and (unless STBTE_NO_PROPS is
+// defined) a button cycling the property-copy mode.
+//
+//   tm     tilemap whose layer state is displayed and edited
+//   x0,y0  top-left corner of the panel
+//   w,h    panel size
+static void stbte__layers(stbte_tilemap *tm, int x0, int y0, int w, int h)
+{
+   static const char *propmodes[3] = {
+      "default", "always", "never"
+   };
+   int num_rows;
+   int i, y, n;
+   int x1 = x0+w;
+   int y1 = y0+h;
+   int xoff = 20; // x offset of the H/L/S toggles from the left edge of a row
+
+   // with named layers, widen the name button up to what the region allows
+   if (tm->has_layer_names) {
+      int side = stbte__ui.panel[STBTE__panel_layers].side;
+      xoff = stbte__region[side].width - 42;
+      xoff = (xoff < tm->layername_width + 10 ? xoff : tm->layername_width + 10);
+   }
+
+   x0 += 2;
+   y0 += 5;
+   // without layer names, a "Layers" heading is drawn instead
+   if (!tm->has_layer_names) {
+      if (stbte__ui.event == STBTE__paint) {
+         stbte__draw_text(x0,y0, "Layers", w-4, STBTE__TEXTCOLOR(STBTE__cpanel));
+      }
+      y0 += 11;
+   }
+   num_rows = (y1-y0)/15;
+#ifndef STBTE_NO_PROPS
+   // reserve one row for the "prop:" line at the bottom
+   --num_rows;
+#endif
+   y = y0;
+   for (i=0; i < tm->num_layers; ++i) {
+      char text[3], *str = (char *) tm->layerinfo[i].name;
+      static char lockedchar[3] = { 'U', 'P', 'L' }; // glyph for each of the three lock states
+      int locked = tm->layerinfo[i].locked;
+      // while some layer is soloed, the hide/lock buttons of the others are disabled
+      int disabled = (tm->solo_layer >= 0 && tm->solo_layer != i);
+      // only rows inside the scrolled window are shown
+      if (i-tm->layer_scroll >= 0 && i-tm->layer_scroll < num_rows) {
+         // unnamed layers are labelled with their 1-based index
+         // NOTE(review): stbte__sizeof is defined elsewhere; it presumably
+         // expands to an optional size argument for stbte__sprintf - confirm
+         if (str == NULL)
+            stbte__sprintf(str=text stbte__sizeof(text), "%2d", i+1);
+         // clicking the current layer again deselects it (-1)
+         if (stbte__button(STBTE__clayer_button, str, x0,y,(i+1<10)*2,xoff-2, STBTE__ID(STBTE__layer,i), tm->cur_layer==i,0))
+            tm->cur_layer = (tm->cur_layer == i ? -1 : i);
+         if (stbte__layerbutton(x0+xoff +  0,y+1,'H',STBTE__ID(STBTE__hide,i), tm->layerinfo[i].hidden,disabled,STBTE__clayer_hide))
+            tm->layerinfo[i].hidden = !tm->layerinfo[i].hidden;
+         // the lock button cycles through the three lock states
+         if (stbte__layerbutton(x0+xoff + 12,y+1,lockedchar[locked],STBTE__ID(STBTE__lock,i), locked!=0,disabled,STBTE__clayer_lock))
+            tm->layerinfo[i].locked = (locked+1)%3;
+         // clicking the soloed layer again turns solo off (-1)
+         if (stbte__layerbutton(x0+xoff + 24,y+1,'S',STBTE__ID(STBTE__solo,i), tm->solo_layer==i,0,STBTE__clayer_solo))
+            tm->solo_layer = (tm->solo_layer == i ? -1 : i);
+         y += 15;
+      }
+   }
+   stbte__scrollbar(x1-4, y0,y-2, &tm->layer_scroll, 0, tm->num_layers, num_rows, STBTE__ID(STBTE__scrollbar_id, STBTE__layer));
+#ifndef STBTE_NO_PROPS
+   // "prop:" label followed by a button that cycles through the propmodes[] entries
+   n = stbte__text_width("prop:")+2;
+   stbte__draw_text(x0,y+2, "prop:", w, STBTE__TEXTCOLOR(STBTE__cpanel));
+   i = w - n - 4;
+   if (i > 50) i = 50; // cap the button width
+   if (stbte__button(STBTE__clayer_button, propmodes[tm->propmode], x0+n,y,0,i, STBTE__ID(STBTE__layer,256), 0,0))
+      tm->propmode = (tm->propmode+1)%3;
+#endif
+}
+
+// Category list panel: an "*ALL*" entry (shown only when scrolled to the
+// top) followed by one button per category, then a scrollbar. Clicking an
+// entry selects that category via stbte__choose_category (-1 meaning all).
+static void stbte__categories(stbte_tilemap *tm, int x0, int y0, int w, int h)
+{
+   const int row_h = 11;
+   int num_rows = h / row_h;
+   int bottom = y0+h;
+   int x = x0+2;
+   int y = y0+4;
+   int cat;
+
+   w -= 4;
+
+   if (tm->category_scroll == 0) {
+      if (stbte__category_button("*ALL*", x,y, w, STBTE__ID(STBTE__categories, 65535), tm->cur_category == -1))
+         stbte__choose_category(tm, -1);
+      y += row_h;
+   }
+
+   for (cat = 0; cat < tm->num_categories; ++cat) {
+      // row 0 belongs to "*ALL*", so category 'cat' lives on list row cat+1
+      int row = cat+1 - tm->category_scroll;
+      if (row < 0 || row >= num_rows)
+         continue;
+
+      // out of vertical room: stop here (the scrollbar is skipped as well)
+      if (y + 10 > bottom)
+         return;
+
+      if (stbte__category_button(tm->categories[cat], x,y,w, STBTE__ID(STBTE__categories,cat), tm->cur_category == cat))
+         stbte__choose_category(tm, cat);
+      y += row_h;
+   }
+
+   stbte__scrollbar(x0+w, y0+4, y0+h-4, &tm->category_scroll, 0, tm->num_categories+1, num_rows, STBTE__ID(STBTE__scrollbar_id, STBTE__categories));
+}
+
+// Draws and handles one entry of the tile palette.
+//
+//   tm    tilemap owning the tile list
+//   x,y   top-left corner of the palette cell
+//   slot  index into tm->tiles
+//
+// On paint: fills the cell background, draws the tile, and outlines the cell
+// if it is the current tile. On any other event: a click makes this slot the
+// current tile.
+static void stbte__tile_in_palette(stbte_tilemap *tm, int x, int y, int slot)
+{
+   stbte__tileinfo *t = &tm->tiles[slot];
+   int x0=x, y0=y, x1 = x+tm->palette_spacing_x - 1, y1 = y+tm->palette_spacing_y;
+   int id = STBTE__ID(STBTE__palette, slot); // UI widget id, NOT the tile id
+   stbte__hittest(x0,y0,x1,y1, id);
+   switch (stbte__ui.event) {
+      case STBTE__paint:
+         // the background covers the whole cell, so the vertical extent uses
+         // the vertical spacing
+         stbte__draw_rect(x,y,x+tm->palette_spacing_x-1,y+tm->palette_spacing_y-1, STBTE_COLOR_TILEPALETTE_BACKGROUND);
+         // the draw callback takes the tile's own id, not the widget id
+         STBTE_DRAW_TILE(x,y,t->id, slot == tm->cur_tile,0);
+         if (slot == tm->cur_tile)
+            stbte__draw_frame_delayed(x-1,y-1,x+tm->palette_spacing_x,y+tm->palette_spacing_y, STBTE_COLOR_TILEPALETTE_OUTLINE);
+         break;
+      default:
+         if (stbte__button_core(id))
+            tm->cur_tile = slot;
+         break;
+   }
+}
+
+// Tile palette panel: lays the tiles of the current category out in a grid,
+// shows the rows inside the scrolled window, and adds a vertical scrollbar.
+// Returns without drawing anything if no column fits in the panel width.
+static void stbte__palette_of_tiles(stbte_tilemap *tm, int x0, int y0, int w, int h)
+{
+   int slot;
+   int num_vis_rows = (h-6) / tm->palette_spacing_y;
+   int num_columns = (w-2-6) / tm->palette_spacing_x;
+   int num_total_rows;
+   int column = 0, row;
+   int x1 = x0+w, y1 = y0+h;
+
+   if (num_columns == 0)
+      return;
+
+   num_total_rows = (tm->cur_palette_count + num_columns-1) / num_columns; // ceil()
+
+   // start above the window by the scroll amount, so row 0 is the first visible one
+   row = -tm->palette_scroll;
+   for (slot = 0; slot < tm->num_tiles; ++slot) {
+      stbte__tileinfo *t = &tm->tiles[slot];
+
+      // a non-negative cur_category restricts the palette to that category
+      if (tm->cur_category >= 0 && t->category_id != tm->cur_category)
+         continue;
+
+      // only cells whose row lies inside the window are shown
+      if (row >= 0 && row < num_vis_rows)
+         stbte__tile_in_palette(tm,
+                                x0 + 2 + tm->palette_spacing_x * column,
+                                y0 + 6 + tm->palette_spacing_y * row,
+                                slot);
+
+      // advance to the next cell, wrapping at the end of a row
+      if (++column == num_columns) {
+         column = 0;
+         ++row;
+      }
+   }
+   stbte__flush_delay();
+   stbte__scrollbar(x1-4, y0+6, y1-2, &tm->palette_scroll, 0, num_total_rows, num_vis_rows, STBTE__ID(STBTE__scrollbar_id, STBTE__palette));
+}
+
+// Value a property had when the current slider / float edit began; used to
+// write a single undo record when the edit ends.
+static float stbte__saved;
+
+// Properties panel: when exactly one map square is selected, shows an editor
+// for each of its properties - a checkbox for bools, a slider for ints, and
+// a float control for floats. Property type, name and range come from the
+// user-supplied STBTE_PROP_* macros.
+//
+//   tm     tilemap being edited
+//   x0,y0  top-left corner of the panel
+//   w,h    panel size (h is not read here)
+static void stbte__props_panel(stbte_tilemap *tm, int x0, int y0, int w, int h)
+{
+   int x1 = x0+w;
+   int i;
+   int y = y0 + 5, x = x0+2;
+   int slider_width = 60;
+   int mx,my;
+   float *p;
+   short *data;
+   if (!stbte__is_single_selection())
+      return;
+   // the selected square and its property / tile arrays
+   mx = stbte__ui.select_x0;
+   my = stbte__ui.select_y0;
+   p = tm->props[my][mx];
+   data = tm->data[my][mx];
+   STBTE__NOTUSED(data); // the STBTE_PROP_* macros may ignore it
+   for (i=0; i < STBTE_MAX_PROPERTIES; ++i) {
+      unsigned int n = STBTE_PROP_TYPE(i, data, p);
+      // a type of 0 means this property slot is not shown
+      if (n) {
+         char *s = (char*) STBTE_PROP_NAME(i, data, p);
+         if (s == NULL) s = (char*) "";
+         switch (n & 3) {
+            case STBTE_PROP_bool: {
+               int flag = (int) p[i];
+               if (stbte__layerbutton(x,y, flag ? 'x' : ' ', STBTE__ID(STBTE__prop_flag,i), flag, 0, 2)) {
+                  // toggle, recording the old value in its own undo block
+                  stbte__begin_undo(tm);
+                  stbte__undo_record_prop_float(tm,mx,my,i,(float) flag);
+                  p[i] = (float) !flag;
+                  stbte__end_undo(tm);
+               }
+               stbte__draw_text(x+13,y+1,s,x1-(x+13)-2,STBTE__TEXTCOLOR(STBTE__cpanel));
+               y += 13;
+               break;
+            }
+            case STBTE_PROP_int: {
+               int a = (int) STBTE_PROP_MIN(i,data,p);
+               int b = (int) STBTE_PROP_MAX(i,data,p);
+               int v = (int) p[i] - a; // slider position, relative to the minimum
+               // snap a non-integer or out-of-range stored value into [a,b]
+               if (a+v != p[i] || v < 0 || v > b-a) {
+                  if (v < 0) v = 0;
+                  if (v > b-a) v = b-a;
+                  p[i] = (float) (a+v); // @TODO undo
+               }
+               switch (stbte__slider(x, slider_width, y+7, b-a, &v, STBTE__ID(STBTE__prop_int,i)))
+               {
+                  case STBTE__begin:
+                     // remember the pre-edit value for the undo record written at the end
+                     stbte__saved = p[i];
+                     // fallthrough
+                  case STBTE__change:
+                     p[i] = (float) (a+v); // @TODO undo
+                     break;
+                  case STBTE__end:
+                     // one undo record for the whole edit, and only if the value changed
+                     if (p[i] != stbte__saved) {
+                        stbte__begin_undo(tm);
+                        stbte__undo_record_prop_float(tm,mx,my,i,stbte__saved);
+                        stbte__end_undo(tm);
+                     }
+                     break;
+               }
+               stbte__draw_text(x+slider_width+2,y+2, s, x1-1-(x+slider_width+2), STBTE__TEXTCOLOR(STBTE__cpanel));
+               y += 12;
+               break;
+            }
+            case STBTE_PROP_float: {
+               float a = (float) STBTE_PROP_MIN(i, data,p);
+               float b = (float) STBTE_PROP_MAX(i, data,p);
+               float c = STBTE_PROP_FLOAT_SCALE(i, data, p);
+               float old;
+               // clamp an out-of-range stored value into [a,b]
+               if (p[i] < a || p[i] > b) {
+                  // @TODO undo
+                  if (p[i] < a) p[i] = a;
+                  if (p[i] > b) p[i] = b;
+               }
+               // the control writes p[i] directly, so keep the value from before the call
+               old = p[i];
+               switch (stbte__float_control(x, y, 50, a, b, c, "%8.4f", &p[i], STBTE__layer,STBTE__ID(STBTE__prop_float,i))) {
+                  case STBTE__begin:
+                     stbte__saved = old;
+                     break;
+                  case STBTE__end:
+                     // one undo record for the whole edit, and only if the value changed
+                     if (stbte__saved != p[i]) {
+                        stbte__begin_undo(tm);
+                        stbte__undo_record_prop_float(tm,mx,my,i, stbte__saved);
+                        stbte__end_undo(tm);
+                     }
+                     break;
+               }
+               stbte__draw_text(x+53,y+1, s, x1-1-(x+53), STBTE__TEXTCOLOR(STBTE__cpanel));
+               y += 12;
+               break;
+            }
+         }
+      }
+   }
+}
+
+static int stbte__cp_mode, stbte__cp_aspect, stbte__save, stbte__cp_altered; // picker's mode/aspect indices into stbte__color_table, idle color saved on tick, and "idle color was XORed to flash" flag
+#ifdef STBTE__COLORPICKER
+static int stbte__cp_state, stbte__cp_index, stbte__color_copy; // "OASD" toggle bitmask, color-state index being edited, color held by the 'C'/'P' buttons
+static void stbte__dump_colorstate(void)
+{
+   // Emits stbte__color_table to stdout as a C initializer that can be pasted back into the source.
+   int mode, aspect, state;
+   printf("static int stbte__color_table[STBTE__num_color_modes][STBTE__num_color_aspects][STBTE__num_color_states] =\n"
+          "{\n"
+          "   {\n");
+   for (mode = 0; mode < STBTE__num_color_modes; ++mode) {
+      int last_mode = (mode+1 >= STBTE__num_color_modes);
+      for (aspect = 0; aspect < STBTE__num_color_aspects; ++aspect) {
+         // one row per aspect: every state's color as 6-digit hex
+         printf("      { ");
+         for (state = 0; state < STBTE__num_color_states; ++state)
+            printf("0x%06x, ", stbte__color_table[mode][aspect][state]);
+         printf("},\n");
+      }
+      // close this mode's group; all but the last also open the next one
+      printf("%s", last_mode ? "   },\n" : "   }, {\n");
+   }
+   printf("};\n");
+}
+
+static void stbte__colorpicker(int x0, int y0, int w, int h) // debug panel: edits stbte__color_table[stbte__cp_mode][stbte__cp_aspect][stbte__cp_index]
+{
+   int x1 = x0+w, y1 = y0+h, x,y, i; // NOTE(review): y1 is computed but not read anywhere in this function
+   // (x,y) is a layout cursor that is advanced as each group of widgets is placed
+   x =  x0+2; y = y0+6;
+
+   y += 5;
+   x += 8;
+
+
+   {
+      int color = stbte__color_table[stbte__cp_mode][stbte__cp_aspect][stbte__cp_index];
+      int rgb[3];
+      if (stbte__cp_altered && stbte__cp_index == STBTE__idle)
+         color = stbte__save; // the idle entry may currently be XOR-flashed; use the value saved on tick instead
+      // 'C' remembers the current color, 'P' replaces the current color with the remembered one
+      if (stbte__minibutton(STBTE__cmapsize, x1-20,y+ 5, 'C', STBTE__ID2(STBTE__colorpick_id,4,0)))
+         stbte__color_copy = color;
+      if (stbte__minibutton(STBTE__cmapsize, x1-20,y+15, 'P', STBTE__ID2(STBTE__colorpick_id,4,1)))
+         color = stbte__color_copy;
+      // split into 8-bit channels; one slider (range 255) per channel
+      rgb[0] = color >> 16; rgb[1] = (color>>8)&255; rgb[2] = color & 255;
+      for (i=0; i < 3; ++i) {
+         if (stbte__slider(x+8,64, y, 255, rgb+i, STBTE__ID2(STBTE__colorpick_id,3,i)) > 0)
+            stbte__dump_colorstate(); // positive slider result: print the whole table to stdout
+         y += 15;
+      }
+      if (stbte__ui.event != STBTE__paint && stbte__ui.event != STBTE__tick)
+         stbte__color_table[stbte__cp_mode][stbte__cp_aspect][stbte__cp_index] = (rgb[0]<<16)|(rgb[1]<<8)|(rgb[2]); // on every other event, write the (possibly changed) channels back
+   }
+
+   y += 5;
+
+   // states
+   x = x0+2+35;
+   if (stbte__ui.event == STBTE__paint) {
+      static char *states[] = { "idle", "over", "down", "down&over", "selected", "selected&over", "disabled" };
+      stbte__draw_text(x, y+1, states[stbte__cp_index], x1-x-1, 0xffffff); // label for the state currently being edited
+   }
+
+   x = x0+24; y += 12;
+   // four toggle buttons labelled from "OASD", placed for bits 3 down to 0 of stbte__cp_state
+   for (i=3; i >= 0; --i) {
+      int state = 0 != (stbte__cp_state & (1 << i));
+      if (stbte__layerbutton(x,y, "OASD"[i], STBTE__ID2(STBTE__colorpick_id, 0,i), state,0, STBTE__clayer_button)) {
+         stbte__cp_state ^= (1 << i);
+         stbte__cp_index = stbte__state_to_index[0][0][0][stbte__cp_state]; // NOTE(review): last subscript receives the whole bitmask (up to 15) -- relies on the table's memory layout; confirm against its declaration
+      }
+      x += 16;
+   }
+   x = x0+2; y += 18;
+   // aspect selector: one button per label, the selected aspect is passed as the toggled flag
+   for (i=0; i < 3; ++i) {
+      static char *labels[] = { "Base", "Edge", "Text" };
+      if (stbte__button(STBTE__ctoolbar_button, labels[i], x,y,0,36, STBTE__ID2(STBTE__colorpick_id,1,i), stbte__cp_aspect==i,0))
+         stbte__cp_aspect = i;
+      x += 40;
+   }
+
+   y += 18;
+   x = x0+2;
+   // mode selector: one button per entry of stbte__color_names, stacked vertically
+   for (i=0; i < STBTE__num_color_modes; ++i) {
+      if (stbte__button(STBTE__ctoolbar_button, stbte__color_names[i], x, y, 0,80, STBTE__ID2(STBTE__colorpick_id,2,i), stbte__cp_mode == i,0))
+         stbte__cp_mode = i;
+      y += 12;
+   }
+
+   // make the currently selected aspect flash, unless we're actively dragging color slider etc
+   if (stbte__ui.event == STBTE__tick) {
+      stbte__save = stbte__color_table[stbte__cp_mode][stbte__cp_aspect][STBTE__idle]; // remember the unflashed idle color
+      if ((stbte__ui.active_id & 127) != STBTE__colorpick_id) { // low 7 bits of the active id are compared against the picker's id
+         if ((stbte__ui.ms_time & 2047) < 200) { // first 200 of every 2048 ms_time units
+            stbte__color_table[stbte__cp_mode][stbte__cp_aspect][STBTE__idle] ^= 0x1f1f1f;
+            stbte__cp_altered = 1; // stbte__editor_traverse() restores the entry from stbte__save at the end of a paint pass
+         }
+      }
+   }
+}
+#endif
+
+static void stbte__editor_traverse(stbte_tilemap *tm)
+{
+   int i,j,i0,j0,i1,j1,n;
+   // One immediate-mode pass for the current stbte__ui.event: map tiles first, then the panels, then the region expanders.
+   if (tm == NULL)
+      return;
+   if (stbte__ui.x0 == stbte__ui.x1 || stbte__ui.y0 == stbte__ui.y1)
+      return; // zero-width or zero-height display rectangle: nothing to traverse
+
+   stbte__prepare_tileinfo(tm);
+
+   stbte__compute_panel_locations(tm); // @OPTIMIZE: we don't need to recompute this every time
+
+   if (stbte__ui.event == STBTE__paint) {
+      // fill screen with border
+      stbte__draw_rect(stbte__ui.x0, stbte__ui.y0, stbte__ui.x1, stbte__ui.y1, STBTE_COLOR_TILEMAP_BORDER);
+      // fill tilemap with tilemap background
+      stbte__draw_rect(stbte__ui.x0 - tm->scroll_x, stbte__ui.y0 - tm->scroll_y,
+                       stbte__ui.x0 - tm->scroll_x + tm->spacing_x * tm->max_x,
+                       stbte__ui.y0 - tm->scroll_y + tm->spacing_y * tm->max_y, STBTE_COLOR_TILEMAP_BACKGROUND);
+   }
+
+   // step 1: traverse all the tilemap data...
+   // [i0,i1) x [j0,j1) is the tile range overlapping the display, padded by one tile and clamped to the map
+   i0 = (tm->scroll_x - tm->spacing_x) / tm->spacing_x;
+   j0 = (tm->scroll_y - tm->spacing_y) / tm->spacing_y;
+   i1 = (tm->scroll_x + stbte__ui.x1 - stbte__ui.x0) / tm->spacing_x + 1;
+   j1 = (tm->scroll_y + stbte__ui.y1 - stbte__ui.y0) / tm->spacing_y + 1;
+
+   if (i0 < 0) i0 = 0;
+   if (j0 < 0) j0 = 0;
+   if (i1 > tm->max_x) i1 = tm->max_x;
+   if (j1 > tm->max_y) j1 = tm->max_y;
+
+   if (stbte__ui.event == STBTE__paint) {
+      // draw all of layer 0, then all of layer 1, etc, instead of old
+      // way which drew entire stack of each tile at once
+      for (n=0; n < tm->num_layers; ++n) {
+         for (j=j0; j < j1; ++j) {
+            for (i=i0; i < i1; ++i) {
+               int x = stbte__ui.x0 + i * tm->spacing_x - tm->scroll_x;
+               int y = stbte__ui.y0 + j * tm->spacing_y - tm->scroll_y;
+               stbte__tile_paint(tm, x, y, i, j, n);
+            }
+         }
+         if (n == 0 && stbte__ui.show_grid == 1) {
+            int x = stbte__ui.x0 + i0 * tm->spacing_x - tm->scroll_x;
+            int y = stbte__ui.y0 + j0 * tm->spacing_y - tm->scroll_y;
+            for (i=i0; x < stbte__ui.x1 && i <= i1; ++i, x += tm->spacing_x) // i is the map column of x, so the i1 bound also holds when scrolled
+               stbte__draw_rect(x, stbte__ui.y0, x+1, stbte__ui.y1, STBTE_COLOR_GRID);
+            for (j=j0; y < stbte__ui.y1 && j <= j1; ++j, y += tm->spacing_y) // likewise j is the map row of y
+               stbte__draw_rect(stbte__ui.x0, y, stbte__ui.x1, y+1, STBTE_COLOR_GRID);
+         }
+      }
+   }
+
+   if (stbte__ui.event == STBTE__paint) {
+      // draw grid on top of everything except UI
+      if (stbte__ui.show_grid == 2) {
+         int x = stbte__ui.x0 + i0 * tm->spacing_x - tm->scroll_x;
+         int y = stbte__ui.y0 + j0 * tm->spacing_y - tm->scroll_y;
+         for (i=i0; x < stbte__ui.x1 && i <= i1; ++i, x += tm->spacing_x) // same column-indexed bound as the show_grid == 1 case
+            stbte__draw_rect(x, stbte__ui.y0, x+1, stbte__ui.y1, STBTE_COLOR_GRID);
+         for (j=j0; y < stbte__ui.y1 && j <= j1; ++j, y += tm->spacing_y)
+            stbte__draw_rect(stbte__ui.x0, y, stbte__ui.x1, y+1, STBTE_COLOR_GRID);
+      }
+   }
+
+   for (j=j0; j < j1; ++j) {
+      for (i=i0; i < i1; ++i) {
+         int x = stbte__ui.x0 + i * tm->spacing_x - tm->scroll_x;
+         int y = stbte__ui.y0 + j * tm->spacing_y - tm->scroll_y;
+         stbte__tile(tm, x, y, i, j);
+      }
+   }
+
+   if (stbte__ui.event == STBTE__paint) {
+      // draw the selection border
+      if (stbte__ui.has_selection) {
+         int x0,y0,x1,y1;
+         x0 = stbte__ui.x0 + (stbte__ui.select_x0    ) * tm->spacing_x - tm->scroll_x;
+         y0 = stbte__ui.y0 + (stbte__ui.select_y0    ) * tm->spacing_y - tm->scroll_y;
+         x1 = stbte__ui.x0 + (stbte__ui.select_x1 + 1) * tm->spacing_x - tm->scroll_x + 1;
+         y1 = stbte__ui.y0 + (stbte__ui.select_y1 + 1) * tm->spacing_y - tm->scroll_y + 1;
+         stbte__draw_frame(x0,y0,x1,y1, (stbte__ui.ms_time & 256 ? STBTE_COLOR_SELECTION_OUTLINE1 : STBTE_COLOR_SELECTION_OUTLINE2));
+      }
+
+      stbte__flush_delay(); // draw a dynamic link on top of the queued links
+
+      #ifdef STBTE_ALLOW_LINK
+      if (stbte__ui.linking && STBTE__IS_MAP_HOT()) {
+         int x0,y0,x1,y1;
+         int color;
+         int ex = ((stbte__ui.hot_id >> 19) & 4095); // tile column/row are unpacked from the hot id
+         int ey = ((stbte__ui.hot_id >>  7) & 4095);
+         x0 = stbte__ui.x0 + (stbte__ui.sx    ) * tm->spacing_x - tm->scroll_x + (tm->spacing_x>>1)+1;
+         y0 = stbte__ui.y0 + (stbte__ui.sy    ) * tm->spacing_y - tm->scroll_y + (tm->spacing_y>>1)+1;
+         x1 = stbte__ui.x0 + (ex              ) * tm->spacing_x - tm->scroll_x + (tm->spacing_x>>1)-1;
+         y1 = stbte__ui.y0 + (ey              ) * tm->spacing_y - tm->scroll_y + (tm->spacing_y>>1)-1;
+         if (STBTE_ALLOW_LINK(tm->data[stbte__ui.sy][stbte__ui.sx], tm->props[stbte__ui.sy][stbte__ui.sx], tm->data[ey][ex], tm->props[ey][ex]))
+            color = STBTE_LINK_COLOR_DRAWING;
+         else
+            color = STBTE_LINK_COLOR_DISALLOWED;
+         stbte__draw_link(x0,y0,x1,y1, color);
+      }
+      #endif
+   }
+   stbte__flush_delay();
+
+   // step 2: traverse the panels
+   for (i=0; i < STBTE__num_panel; ++i) {
+      stbte__panel *p = &stbte__ui.panel[i];
+      if (stbte__ui.event == STBTE__paint) {
+         stbte__draw_box(p->x0,p->y0,p->x0+p->width,p->y0+p->height, STBTE__cpanel, STBTE__idle);
+      }
+      // obscure tilemap data underneath panel
+      stbte__hittest(p->x0,p->y0,p->x0+p->width,p->y0+p->height, STBTE__ID2(STBTE__panel, i, 0));
+      switch (i) {
+         case STBTE__panel_toolbar:
+            if (stbte__ui.event == STBTE__paint)
+               stbte__draw_rect(p->x0,p->y0,p->x0+p->width,p->y0+p->height, stbte__color_table[STBTE__ctoolbar][STBTE__base][STBTE__idle]);
+            stbte__toolbar(tm,p->x0,p->y0,p->width,p->height);
+            break;
+         case STBTE__panel_info:
+            stbte__info(tm,p->x0,p->y0,p->width,p->height);
+            break;
+         case STBTE__panel_layers:
+            stbte__layers(tm,p->x0,p->y0,p->width,p->height);
+            break;
+         case STBTE__panel_categories:
+            stbte__categories(tm,p->x0,p->y0,p->width,p->height);
+            break;
+         case STBTE__panel_colorpick:
+#ifdef STBTE__COLORPICKER
+            stbte__colorpicker(p->x0,p->y0,p->width,p->height);
+#endif
+            break;
+         case STBTE__panel_tiles:
+            // erase boundary between categories and tiles if they're on same side
+            if (stbte__ui.event == STBTE__paint && p->side == stbte__ui.panel[STBTE__panel_categories].side)
+               stbte__draw_rect(p->x0+1,p->y0-1,p->x0+p->width-1,p->y0+1, stbte__color_table[STBTE__cpanel][STBTE__base][STBTE__idle]);
+            stbte__palette_of_tiles(tm,p->x0,p->y0,p->width,p->height);
+            break;
+         case STBTE__panel_props:
+            stbte__props_panel(tm,p->x0,p->y0,p->width,p->height);
+            break;
+      }
+      // draw the panel side selectors
+      for (j=0; j < 2; ++j) {
+         int result;
+         if (i == STBTE__panel_toolbar) continue;
+         result = stbte__microbutton(p->x0+p->width - 1 - 2*4 + 4*j,p->y0+2,3, STBTE__ID2(STBTE__panel, i, j+1), STBTE__cpanel_sider+j);
+         if (result) {
+            switch (j) {
+               case 0: p->side = result > 0 ? STBTE__side_left : STBTE__side_right; break;
+               case 1: p->delta_height += result; break;
+            }
+         }
+      }
+   }
+
+   if (stbte__ui.panel[STBTE__panel_categories].delta_height < -5) stbte__ui.panel[STBTE__panel_categories].delta_height = -5;
+   if (stbte__ui.panel[STBTE__panel_layers    ].delta_height < -5) stbte__ui.panel[STBTE__panel_layers    ].delta_height = -5;
+
+
+   // step 3: traverse the regions to place expander controls on them
+   for (i=0; i < 2; ++i) {
+      if (stbte__region[i].active) {
+         int x = stbte__region[i].x;
+         int width;
+         if (i == STBTE__side_left)
+            width =  stbte__ui.left_width , x += stbte__region[i].width + 1;
+         else
+            width = -stbte__ui.right_width, x -= 6; // right side is handed to the dragger negated and negated back below
+         if (stbte__microbutton_dragger(x, stbte__region[i].y+2, 5, STBTE__ID(STBTE__region,i), &width)) {
+            // if non-0, it is expanding, so retract it
+            if (stbte__region[i].retracted == 0.0)
+               stbte__region[i].retracted = 0.01f;
+            else
+               stbte__region[i].retracted = 0.0;
+         }
+         if (i == STBTE__side_left)
+            stbte__ui.left_width  =  width;
+         else
+            stbte__ui.right_width = -width;
+         if (stbte__ui.event == STBTE__tick) {
+            if (stbte__region[i].retracted && stbte__region[i].retracted < 1.0f) {
+               stbte__region[i].retracted += stbte__ui.dt*4;
+               if (stbte__region[i].retracted > 1)
+                  stbte__region[i].retracted = 1;
+            }
+         }
+      }
+   }
+
+   if (stbte__ui.event == STBTE__paint && stbte__ui.alert_msg) {
+      int w = stbte__text_width(stbte__ui.alert_msg);
+      int x = (stbte__ui.x0+stbte__ui.x1)/2;
+      int y = (stbte__ui.y0+stbte__ui.y1)*5/6;
+      stbte__draw_rect (x-w/2-4,y-8, x+w/2+4,y+8, 0x604020);
+      stbte__draw_frame(x-w/2-4,y-8, x+w/2+4,y+8, 0x906030);
+      stbte__draw_text (x-w/2,y-4, stbte__ui.alert_msg, w+1, 0xff8040);
+   }
+
+#ifdef STBTE_SHOW_CURSOR
+   if (stbte__ui.event == STBTE__paint)
+      stbte__draw_bitmap(stbte__ui.mx, stbte__ui.my, stbte__get_char_width(26), stbte__get_char_bitmap(26), 0xe0e0e0);
+#endif
+
+   if (stbte__ui.event == STBTE__tick && stbte__ui.alert_msg) {
+      stbte__ui.alert_timer -= stbte__ui.dt;
+      if (stbte__ui.alert_timer < 0) {
+         stbte__ui.alert_timer = 0;
+         stbte__ui.alert_msg = 0;
+      }
+   }
+   // undo the color picker's flash once it has been painted; only when a flash actually happened, so a stale or never-set stbte__save is not written into the table
+   if (stbte__ui.event == STBTE__paint && stbte__cp_altered) {
+      stbte__color_table[stbte__cp_mode][stbte__cp_aspect][STBTE__idle] = stbte__save;
+      stbte__cp_altered = 0;
+   }
+}
+
+static void stbte__do_event(stbte_tilemap *tm)
+{
+   int released, hot_col, hot_row;
+
+   // run the UI pass; whatever it left in next_hot_id becomes the hot id
+   stbte__ui.next_hot_id = 0;
+   stbte__editor_traverse(tm);
+   stbte__ui.hot_id = stbte__ui.next_hot_id;
+
+   // automatically cancel on mouse-up in case the object that triggered it
+   // doesn't exist anymore (not done while pasting)
+   released = (stbte__ui.event == STBTE__leftup || stbte__ui.event == STBTE__rightup);
+   if (stbte__ui.active_id && released && !stbte__ui.pasting) {
+      stbte__activate(0);
+      if (stbte__ui.undoing)
+         stbte__end_undo(tm);
+      stbte__ui.scrolling = 0;
+      stbte__ui.dragging = 0;
+      stbte__ui.linking = 0;
+   }
+
+   // we could do this stuff in the widgets directly, but it would keep recomputing
+   // the same thing on every tile, which seems dumb.
+
+   // the two 12-bit fields of the hot id that the map code uses as tile x/y
+   hot_col = (stbte__ui.hot_id >> 19) & 4095;
+   hot_row = (stbte__ui.hot_id >>  7) & 4095;
+
+   if (stbte__ui.pasting && STBTE__IS_MAP_HOT()) {
+      // compute pasting location based on last hot
+      stbte__ui.paste_x = hot_col - (stbte__ui.copy_width >> 1);
+      stbte__ui.paste_y = hot_row - (stbte__ui.copy_height >> 1);
+   }
+   if (stbte__ui.dragging && STBTE__IS_MAP_HOT()) {
+      stbte__ui.drag_dest_x = hot_col - stbte__ui.drag_offx;
+      stbte__ui.drag_dest_y = hot_row - stbte__ui.drag_offy;
+   }
+}
+
+static void stbte__set_event(int event, int x, int y)
+{
+   stbte__ui.dx = x - stbte__ui.last_mouse_x; // movement since the previous event
+   stbte__ui.dy = y - stbte__ui.last_mouse_y;
+   stbte__ui.accum_x += stbte__ui.dx;         // running totals of that movement
+   stbte__ui.accum_y += stbte__ui.dy;
+   stbte__ui.last_mouse_x = x;                // becomes "previous" for the next event
+   stbte__ui.last_mouse_y = y;
+   stbte__ui.mx    = x;                       // position associated with this event
+   stbte__ui.my    = y;
+   stbte__ui.event = event;
+}
+
+void stbte_draw(stbte_tilemap *tm) // public entry point: run one paint pass over the editor
+{
+   stbte__ui.event = STBTE__paint; // only the event is set here; the mouse fields of stbte__ui are left as they were
+   stbte__editor_traverse(tm);     // called directly rather than through stbte__do_event()
+}
+
+void stbte_mouse_move(stbte_tilemap *tm, int x, int y, int shifted, int scrollkey)
+{
+   // latch the modifier flags, then record the move and run it through the UI
+   stbte__ui.scrollkey = scrollkey;
+   stbte__ui.shift     = shifted;
+   stbte__set_event(STBTE__mousemove, x,y);
+   stbte__do_event(tm);
+}
+
+void stbte_mouse_button(stbte_tilemap *tm, int x, int y, int right, int down, int shifted, int scrollkey)
+{
+   static int events[2][2] = { { STBTE__leftup , STBTE__leftdown  },   // indexed [right][down]
+                               { STBTE__rightup, STBTE__rightdown } };
+   // 'right' and 'down' are flags: collapse any non-zero value to 1 so that e.g. a button bitmask cannot index past the table
+   stbte__set_event(events[right != 0][down != 0], x,y);
+   stbte__ui.shift = shifted;
+   stbte__ui.scrollkey = scrollkey;
+   stbte__do_event(tm);
+}
+
+void stbte_mouse_wheel(stbte_tilemap *tm, int x, int y, int vscroll)
+{
+   STBTE__NOTUSED(tm); STBTE__NOTUSED(x); STBTE__NOTUSED(y); STBTE__NOTUSED(vscroll); // not implemented yet -- need different way of hittesting; arguments are accepted and ignored
+}
+
+void stbte_action(stbte_tilemap *tm, enum stbte_action act) // public entry point: apply one editor command
+{
+   switch (act) { // no default case: a value not listed below does nothing
+      case STBTE_tool_select:      stbte__ui.tool = STBTE__tool_select;               break;
+      case STBTE_tool_brush:       stbte__ui.tool = STBTE__tool_brush;                break;
+      case STBTE_tool_erase:       stbte__ui.tool = STBTE__tool_erase;                break;
+      case STBTE_tool_rectangle:   stbte__ui.tool = STBTE__tool_rect;                 break;
+      case STBTE_tool_eyedropper:  stbte__ui.tool = STBTE__tool_eyedrop;              break;
+      case STBTE_tool_link:        stbte__ui.tool = STBTE__tool_link;                 break;
+      case STBTE_act_toggle_grid:  stbte__ui.show_grid = (stbte__ui.show_grid+1) % 3; break; // cycles 0 -> 1 -> 2 -> 0; the paint pass draws the grid after layer 0 for 1 and above all layers for 2
+      case STBTE_act_toggle_links: stbte__ui.show_links ^= 1;                         break;
+      case STBTE_act_undo:         stbte__undo(tm);                                   break;
+      case STBTE_act_redo:         stbte__redo(tm);                                   break;
+      case STBTE_act_cut:          stbte__copy_cut(tm, 1);                            break; // second argument selects cut (1) vs copy (0)
+      case STBTE_act_copy:         stbte__copy_cut(tm, 0);                            break;
+      case STBTE_act_paste:        stbte__start_paste(tm);                            break;
+      case STBTE_scroll_left:      tm->scroll_x -= tm->spacing_x;                     break; // scroll cases move by one tile spacing and are not clamped here
+      case STBTE_scroll_right:     tm->scroll_x += tm->spacing_x;                     break;
+      case STBTE_scroll_up:        tm->scroll_y -= tm->spacing_y;                     break;
+      case STBTE_scroll_down:      tm->scroll_y += tm->spacing_y;                     break;
+   }
+}
+
+void stbte_tick(stbte_tilemap *tm, float dt) // public entry point: run a tick event so time-based UI state advances by dt
+{
+   stbte__ui.event = STBTE__tick;
+   stbte__ui.dt    = dt; // presumably seconds, given the *1024 conversion into ms_time below -- confirm against the header docs
+   stbte__do_event(tm);
+   stbte__ui.ms_time += (int) (dt * 1024) + 1; // make sure if time is superfast it always updates a little
+}
+
+void stbte_mouse_sdl(stbte_tilemap *tm, const void *sdl_event, float xs, float ys, int xo, int yo)
+{
+#if defined(_SDL_H) || (defined(SDL_h_) && defined(SDL_MAJOR_VERSION) && SDL_MAJOR_VERSION == 2) // older SDL.h guards with _SDL_H, current SDL2 with SDL_h_; without the second test this body silently compiles to a no-op
+   SDL_Event *event = (SDL_Event *) sdl_event;
+   SDL_Keymod km = SDL_GetModState();
+   int shift = (km & KMOD_LCTRL) || (km & KMOD_RCTRL); // note: the editor's "shift" flag is driven by either Ctrl key
+   int scrollkey = 0 != SDL_GetKeyboardState(NULL)[SDL_SCANCODE_SPACE]; // space bar held
+   switch (event->type) { // mouse coordinates are mapped as scale*coord+offset; other event types are ignored
+      case SDL_MOUSEMOTION:
+         stbte_mouse_move(tm, (int) (xs*event->motion.x+xo), (int) (ys*event->motion.y+yo), shift, scrollkey);
+         break;
+      case SDL_MOUSEBUTTONUP: // any button other than the left one is reported as "right"
+         stbte_mouse_button(tm, (int) (xs*event->button.x+xo), (int) (ys*event->button.y+yo), event->button.button != SDL_BUTTON_LEFT, 0, shift, scrollkey);
+         break;
+      case SDL_MOUSEBUTTONDOWN:
+         stbte_mouse_button(tm, (int) (xs*event->button.x+xo), (int) (ys*event->button.y+yo), event->button.button != SDL_BUTTON_LEFT, 1, shift, scrollkey);
+         break;
+      case SDL_MOUSEWHEEL: // wheel events reuse the last recorded mouse position
+         stbte_mouse_wheel(tm, stbte__ui.mx, stbte__ui.my, event->wheel.y);
+         break;
+   }
+#else
+   STBTE__NOTUSED(tm);
+   STBTE__NOTUSED(sdl_event);
+   STBTE__NOTUSED(xs);
+   STBTE__NOTUSED(ys);
+   STBTE__NOTUSED(xo);
+   STBTE__NOTUSED(yo);
+#endif
+}
+
+#endif // STB_TILEMAP_EDITOR_IMPLEMENTATION
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_truetype.h b/lib/stb/stb_truetype.h
new file mode 100644
index 0000000..90a5c2e
--- /dev/null
+++ b/lib/stb/stb_truetype.h
@@ -0,0 +1,5079 @@
+// stb_truetype.h - v1.26 - public domain
+// authored from 2009-2021 by Sean Barrett / RAD Game Tools
+//
+// =======================================================================
+//
+//    NO SECURITY GUARANTEE -- DO NOT USE THIS ON UNTRUSTED FONT FILES
+//
+// This library does no range checking of the offsets found in the file,
+// meaning an attacker can use it to read arbitrary memory.
+//
+// =======================================================================
+//
+//   This library processes TrueType files:
+//        parse files
+//        extract glyph metrics
+//        extract glyph shapes
+//        render glyphs to one-channel bitmaps with antialiasing (box filter)
+//        render glyphs to one-channel SDF bitmaps (signed-distance field/function)
+//
+//   Todo:
+//        non-MS cmaps
+//        crashproof on bad data
+//        hinting? (no longer patented)
+//        cleartype-style AA?
+//        optimize: use simple memory allocator for intermediates
+//        optimize: build edge-list directly from curves
+//        optimize: rasterize directly from curves?
+//
+// ADDITIONAL CONTRIBUTORS
+//
+//   Mikko Mononen: compound shape support, more cmap formats
+//   Tor Andersson: kerning, subpixel rendering
+//   Dougall Johnson: OpenType / Type 2 font handling
+//   Daniel Ribeiro Maciel: basic GPOS-based kerning
+//
+//   Misc other:
+//       Ryan Gordon
+//       Simon Glass
+//       github:IntellectualKitty
+//       Imanol Celaya
+//       Daniel Ribeiro Maciel
+//
+//   Bug/warning reports/fixes:
+//       "Zer" on mollyrocket       Fabian "ryg" Giesen   github:NiLuJe
+//       Cass Everitt               Martins Mozeiko       github:aloucks
+//       stoiko (Haemimont Games)   Cap Petschulat        github:oyvindjam
+//       Brian Hook                 Omar Cornut           github:vassvik
+//       Walter van Niftrik         Ryan Griege
+//       David Gow                  Peter LaValle
+//       David Given                Sergey Popov
+//       Ivan-Assen Ivanov          Giumo X. Clanjor
+//       Anthony Pesch              Higor Euripedes
+//       Johan Duparc               Thomas Fields
+//       Hou Qiming                 Derek Vinyard
+//       Rob Loach                  Cort Stratton
+//       Kenney Phillis Jr.         Brian Costabile
+//       Ken Voskuil (kaesve)       Yakov Galka
+//
+// VERSION HISTORY
+//
+//   1.26 (2021-08-28) fix broken rasterizer
+//   1.25 (2021-07-11) many fixes
+//   1.24 (2020-02-05) fix warning
+//   1.23 (2020-02-02) query SVG data for glyphs; query whole kerning table (but only kern not GPOS)
+//   1.22 (2019-08-11) minimize missing-glyph duplication; fix kerning if both 'GPOS' and 'kern' are defined
+//   1.21 (2019-02-25) fix warning
+//   1.20 (2019-02-07) PackFontRange skips missing codepoints; GetScaleFontVMetrics()
+//   1.19 (2018-02-11) GPOS kerning, STBTT_fmod
+//   1.18 (2018-01-29) add missing function
+//   1.17 (2017-07-23) make more arguments const; doc fix
+//   1.16 (2017-07-12) SDF support
+//   1.15 (2017-03-03) make more arguments const
+//   1.14 (2017-01-16) num-fonts-in-TTC function
+//   1.13 (2017-01-02) support OpenType fonts, certain Apple fonts
+//   1.12 (2016-10-25) suppress warnings about casting away const with -Wcast-qual
+//   1.11 (2016-04-02) fix unused-variable warning
+//   1.10 (2016-04-02) user-defined fabs(); rare memory leak; remove duplicate typedef
+//   1.09 (2016-01-16) warning fix; avoid crash on outofmem; use allocation userdata properly
+//   1.08 (2015-09-13) document stbtt_Rasterize(); fixes for vertical & horizontal edges
+//   1.07 (2015-08-01) allow PackFontRanges to accept arrays of sparse codepoints;
+//                     variant PackFontRanges to pack and render in separate phases;
+//                     fix stbtt_GetFontOffsetForIndex (never worked for non-0 input?);
+//                     fixed an assert() bug in the new rasterizer
+//                     replace assert() with STBTT_assert() in new rasterizer
+//
+//   Full history can be found at the end of this file.
+//
+// LICENSE
+//
+//   See end of file for license information.
+//
+// USAGE
+//
+//   Include this file in whatever places need to refer to it. In ONE C/C++
+//   file, write:
+//      #define STB_TRUETYPE_IMPLEMENTATION
+//   before the #include of this file. This expands out the actual
+//   implementation into that C/C++ file.
+//
+//   To make the implementation private to the file that generates the implementation,
+//      #define STBTT_STATIC
+//
+//   Simple 3D API (don't ship this, but it's fine for tools and quick start)
+//           stbtt_BakeFontBitmap()               -- bake a font to a bitmap for use as texture
+//           stbtt_GetBakedQuad()                 -- compute quad to draw for a given char
+//
+//   Improved 3D API (more shippable):
+//           #include "stb_rect_pack.h"           -- optional, but you really want it
+//           stbtt_PackBegin()
+//           stbtt_PackSetOversampling()          -- for improved quality on small fonts
+//           stbtt_PackFontRanges()               -- packs and renders
+//           stbtt_PackEnd()
+//           stbtt_GetPackedQuad()
+//
+//   "Load" a font file from a memory buffer (you have to keep the buffer loaded)
+//           stbtt_InitFont()
+//           stbtt_GetFontOffsetForIndex()        -- indexing for TTC font collections
+//           stbtt_GetNumberOfFonts()             -- number of fonts for TTC font collections
+//
+//   Render a unicode codepoint to a bitmap
+//           stbtt_GetCodepointBitmap()           -- allocates and returns a bitmap
+//           stbtt_MakeCodepointBitmap()          -- renders into bitmap you provide
+//           stbtt_GetCodepointBitmapBox()        -- how big the bitmap must be
+//
+//   Character advance/positioning
+//           stbtt_GetCodepointHMetrics()
+//           stbtt_GetFontVMetrics()
+//           stbtt_GetFontVMetricsOS2()
+//           stbtt_GetCodepointKernAdvance()
+//
+//   Starting with version 1.06, the rasterizer was replaced with a new,
+//   faster and generally-more-precise rasterizer. The new rasterizer more
+//   accurately measures pixel coverage for anti-aliasing, except in the case
+//   where multiple shapes overlap, in which case it overestimates the AA pixel
+//   coverage. Thus, anti-aliasing of intersecting shapes may look wrong. If
+//   this turns out to be a problem, you can re-enable the old rasterizer with
+//        #define STBTT_RASTERIZER_VERSION 1
+//   which will incur about a 15% speed hit.
+//
+// ADDITIONAL DOCUMENTATION
+//
+//   Immediately after this block comment are a series of sample programs.
+//
+//   After the sample programs is the "header file" section. This section
+//   includes documentation for each API function.
+//
+//   Some important concepts to understand to use this library:
+//
+//      Codepoint
+//         Characters are defined by unicode codepoints, e.g. 65 is
+//         uppercase A, 231 is lowercase c with a cedilla, 0x307e is
+//         the hiragana for "ma".
+//
+//      Glyph
+//         A visual character shape (every codepoint is rendered as
+//         some glyph)
+//
+//      Glyph index
+//         A font-specific integer ID representing a glyph
+//
+//      Baseline
+//         Glyph shapes are defined relative to a baseline, which is the
+//         bottom of uppercase characters. Characters extend both above
+//         and below the baseline.
+//
+//      Current Point
+//         As you draw text to the screen, you keep track of a "current point"
+//         which is the origin of each character. The current point's vertical
+//         position is the baseline. Even "baked fonts" use this model.
+//
+//      Vertical Font Metrics
+//         The vertical qualities of the font, used to vertically position
+//         and space the characters. See docs for stbtt_GetFontVMetrics.
+//
+//      Font Size in Pixels or Points
+//         The preferred interface for specifying font sizes in stb_truetype
+//         is to specify how tall the font's vertical extent should be in pixels.
+//         If that sounds good enough, skip the next paragraph.
+//
+//         Most font APIs instead use "points", which are a common typographic
+//         measurement for describing font size, defined as 72 points per inch.
+//         stb_truetype provides a point API for compatibility. However, true
+//         "per inch" conventions don't make much sense on computer displays
+//         since different monitors have different number of pixels per
+//         inch. For example, Windows traditionally uses a convention that
+//         there are 96 pixels per inch, thus making 'inch' measurements have
+//         nothing to do with inches, and thus effectively defining a point to
+//         be 1.333 pixels. Additionally, the TrueType font data provides
+//         an explicit scale factor to scale a given font's glyphs to points,
+//         but the author has observed that this scale factor is often wrong
+//         for non-commercial fonts, thus making fonts scaled in points
+//         according to the TrueType spec incoherently sized in practice.
+//
+// DETAILED USAGE:
+//
+//  Scale:
+//    Select how high you want the font to be, in points or pixels.
+//    Call ScaleForPixelHeight or ScaleForMappingEmToPixels to compute
+//    a scale factor SF that will be used by all other functions.
+//
+//  Baseline:
+//    You need to select a y-coordinate that is the baseline of where
+//    your text will appear. Call GetFontBoundingBox to get the baseline-relative
+//    bounding box for all characters. SF*-y0 will be the distance in pixels
+//    that the worst-case character could extend above the baseline, so if
+//    you want the top edge of characters to appear at the top of the
+//    screen where y=0, then you would set the baseline to SF*-y0.
+//
+//  Current point:
+//    Set the current point where the first character will appear. The
+//    first character could extend left of the current point; this is font
+//    dependent. You can either choose a current point that is the leftmost
+//    point and hope, or add some padding, or check the bounding box or
+//    left-side-bearing of the first character to be displayed and set
+//    the current point based on that.
+//
+//  Displaying a character:
+//    Compute the bounding box of the character. It will contain signed values
+//    relative to <current_point, baseline>. I.e. if it returns x0,y0,x1,y1,
+//    then the character should be displayed in the rectangle from
+//    <current_point+SF*x0, baseline+SF*y0> to <current_point+SF*x1,baseline+SF*y1>.
+//
+//  Advancing for the next character:
+//    Call GlyphHMetrics, and compute 'current_point += SF * advance'.
+//
+//
+// ADVANCED USAGE
+//
+//   Quality:
+//
+//    - Use the functions with Subpixel at the end to allow your characters
+//      to have subpixel positioning. Since the font is anti-aliased, not
+//      hinted, this is very important for quality. (This is not possible with
+//      baked fonts.)
+//
+//    - Kerning is now supported, and if you're supporting subpixel rendering
+//      then kerning is worth using to give your text a polished look.
+//
+//   Performance:
+//
+//    - Convert Unicode codepoints to glyph indexes and operate on the glyphs;
+//      if you don't do this, stb_truetype is forced to do the conversion on
+//      every call.
+//
+//    - There are a lot of memory allocations. We should modify it to take
+//      a temp buffer and allocate from the temp buffer (without freeing),
+//      should help performance a lot.
+//
+// NOTES
+//
+//   The system uses the raw data found in the .ttf file without changing it
+//   and without building auxiliary data structures. This is a bit inefficient
+//   on little-endian systems (the data is big-endian), but assuming you're
+//   caching the bitmaps or glyph shapes this shouldn't be a big deal.
+//
+//   It appears to be very hard to programmatically determine what font a
+//   given file is in a general way. I provide an API for this, but I don't
+//   recommend it.
+//
+//
+// PERFORMANCE MEASUREMENTS FOR 1.06:
+//
+//                      32-bit     64-bit
+//   Previous release:  8.83 s     7.68 s
+//   Pool allocations:  7.72 s     6.34 s
+//   Inline sort     :  6.54 s     5.65 s
+//   New rasterizer  :  5.63 s     5.00 s
+
+//////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+////
+////  SAMPLE PROGRAMS
+////
+//
+//  Incomplete text-in-3d-api example, which draws quads properly aligned to be lossless.
+//  See "tests/truetype_demo_win32.c" for a complete version.
+#if 0
+#define STB_TRUETYPE_IMPLEMENTATION  // force following include to generate implementation
+#include "stb_truetype.h"
+
+unsigned char ttf_buffer[1<<20];
+unsigned char temp_bitmap[512*512];
+
+stbtt_bakedchar cdata[96]; // ASCII 32..126 is 95 glyphs
+GLuint ftex;
+
+void my_stbtt_initfont(void)
+{
+   fread(ttf_buffer, 1, 1<<20, fopen("c:/windows/fonts/times.ttf", "rb"));
+   stbtt_BakeFontBitmap(ttf_buffer,0, 32.0, temp_bitmap,512,512, 32,96, cdata); // no guarantee this fits!
+   // can free ttf_buffer at this point
+   glGenTextures(1, &ftex);
+   glBindTexture(GL_TEXTURE_2D, ftex);
+   glTexImage2D(GL_TEXTURE_2D, 0, GL_ALPHA, 512,512, 0, GL_ALPHA, GL_UNSIGNED_BYTE, temp_bitmap);
+   // can free temp_bitmap at this point
+   glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+}
+
+void my_stbtt_print(float x, float y, char *text)
+{
+   // assume orthographic projection with units = screen pixels, origin at top left
+   glEnable(GL_BLEND);
+   glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
+   glEnable(GL_TEXTURE_2D);
+   glBindTexture(GL_TEXTURE_2D, ftex);
+   glBegin(GL_QUADS);
+   while (*text) {
+      if (*text >= 32 && *text < 128) {
+         stbtt_aligned_quad q;
+         stbtt_GetBakedQuad(cdata, 512,512, *text-32, &x,&y,&q,1);//1=opengl & d3d10+,0=d3d9
+         glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y0);
+         glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y0);
+         glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y1);
+         glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y1);
+      }
+      ++text;
+   }
+   glEnd();
+}
+#endif
+//
+//
+//////////////////////////////////////////////////////////////////////////////
+//
+// Complete program (this compiles): get a single bitmap, print as ASCII art
+//
+#if 0
+#include <stdio.h>
+#define STB_TRUETYPE_IMPLEMENTATION  // force following include to generate implementation
+#include "stb_truetype.h"
+
+char ttf_buffer[1<<25];
+
+int main(int argc, char **argv)
+{
+   stbtt_fontinfo font;
+   unsigned char *bitmap;
+   int w,h,i,j,c = (argc > 1 ? atoi(argv[1]) : 'a'), s = (argc > 2 ? atoi(argv[2]) : 20);
+
+   fread(ttf_buffer, 1, 1<<25, fopen(argc > 3 ? argv[3] : "c:/windows/fonts/arialbd.ttf", "rb"));
+
+   stbtt_InitFont(&font, ttf_buffer, stbtt_GetFontOffsetForIndex(ttf_buffer,0));
+   bitmap = stbtt_GetCodepointBitmap(&font, 0,stbtt_ScaleForPixelHeight(&font, s), c, &w, &h, 0,0);
+
+   for (j=0; j < h; ++j) {
+      for (i=0; i < w; ++i)
+         putchar(" .:ioVM@"[bitmap[j*w+i]>>5]);
+      putchar('\n');
+   }
+   return 0;
+}
+#endif
+//
+// Output:
+//
+//     .ii.
+//    @@@@@@.
+//   V@Mio@@o
+//   :i.  V@V
+//     :oM@@M
+//   :@@@MM@M
+//   @@o  o@M
+//  :@@.  M@M
+//   @@@o@@@@
+//   :M@@V:@@.
+//
+//////////////////////////////////////////////////////////////////////////////
+//
+// Complete program: print "Hello World!" banner, with bugs
+//
+#if 0
+char buffer[24<<20];
+unsigned char screen[20][79];
+
+int main(int arg, char **argv)
+{
+   stbtt_fontinfo font;
+   int i,j,ascent,baseline,ch=0;
+   float scale, xpos=2; // leave a little padding in case the character extends left
+   char *text = "Heljo World!"; // intentionally misspelled to show 'lj' brokenness
+
+   fread(buffer, 1, 1000000, fopen("c:/windows/fonts/arialbd.ttf", "rb"));
+   stbtt_InitFont(&font, buffer, 0);
+
+   scale = stbtt_ScaleForPixelHeight(&font, 15);
+   stbtt_GetFontVMetrics(&font, &ascent,0,0);
+   baseline = (int) (ascent*scale);
+
+   while (text[ch]) {
+      int advance,lsb,x0,y0,x1,y1;
+      float x_shift = xpos - (float) floor(xpos);
+      stbtt_GetCodepointHMetrics(&font, text[ch], &advance, &lsb);
+      stbtt_GetCodepointBitmapBoxSubpixel(&font, text[ch], scale,scale,x_shift,0, &x0,&y0,&x1,&y1);
+      stbtt_MakeCodepointBitmapSubpixel(&font, &screen[baseline + y0][(int) xpos + x0], x1-x0,y1-y0, 79, scale,scale,x_shift,0, text[ch]);
+      // note that this stomps the old data, so where character boxes overlap (e.g. 'lj') it's wrong
+      // because this API is really for baking character bitmaps into textures. if you want to render
+      // a sequence of characters, you really need to render each bitmap to a temp buffer, then
+      // "alpha blend" that into the working buffer
+      xpos += (advance * scale);
+      if (text[ch+1])
+         xpos += scale*stbtt_GetCodepointKernAdvance(&font, text[ch],text[ch+1]);
+      ++ch;
+   }
+
+   for (j=0; j < 20; ++j) {
+      for (i=0; i < 78; ++i)
+         putchar(" .:ioVM@"[screen[j][i]>>5]);
+      putchar('\n');
+   }
+
+   return 0;
+}
+#endif
+
+
+//////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+////
+////   INTEGRATION WITH YOUR CODEBASE
+////
+////   The following sections allow you to supply alternate definitions
+////   of C library functions used by stb_truetype, e.g. if you don't
+////   link with the C runtime library.
+
+#ifdef STB_TRUETYPE_IMPLEMENTATION
+   // #define your own (u)stbtt_int8/16/32 before including to override this
+   #ifndef stbtt_uint8
+   typedef unsigned char   stbtt_uint8;
+   typedef signed   char   stbtt_int8;
+   typedef unsigned short  stbtt_uint16;
+   typedef signed   short  stbtt_int16;
+   typedef unsigned int    stbtt_uint32;
+   typedef signed   int    stbtt_int32;
+   #endif
+
+   typedef char stbtt__check_size32[sizeof(stbtt_int32)==4 ? 1 : -1]; // compile-time check: the array size becomes -1 (a compile error) unless stbtt_int32 is 4 bytes
+   typedef char stbtt__check_size16[sizeof(stbtt_int16)==2 ? 1 : -1]; // compile-time check: the array size becomes -1 (a compile error) unless stbtt_int16 is 2 bytes
+
+   // e.g. #define your own STBTT_ifloor/STBTT_iceil() to avoid math.h
+   #ifndef STBTT_ifloor
+   #include <math.h>
+   #define STBTT_ifloor(x)   ((int) floor(x))
+   #define STBTT_iceil(x)    ((int) ceil(x))
+   #endif
+
+   #ifndef STBTT_sqrt
+   #include <math.h>
+   #define STBTT_sqrt(x)      sqrt(x)
+   #define STBTT_pow(x,y)     pow(x,y)
+   #endif
+
+   #ifndef STBTT_fmod
+   #include <math.h>
+   #define STBTT_fmod(x,y)    fmod(x,y)
+   #endif
+
+   #ifndef STBTT_cos
+   #include <math.h>
+   #define STBTT_cos(x)       cos(x)
+   #define STBTT_acos(x)      acos(x)
+   #endif
+
+   #ifndef STBTT_fabs
+   #include <math.h>
+   #define STBTT_fabs(x)      fabs(x)
+   #endif
+
+   // #define your own functions "STBTT_malloc" / "STBTT_free" to avoid stdlib.h
+   #ifndef STBTT_malloc
+   #include <stdlib.h>
+   #define STBTT_malloc(x,u)  ((void)(u),malloc(x))
+   #define STBTT_free(x,u)    ((void)(u),free(x))
+   #endif
+
+   #ifndef STBTT_assert
+   #include <assert.h>
+   #define STBTT_assert(x)    assert(x)
+   #endif
+
+   #ifndef STBTT_strlen
+   #include <string.h>
+   #define STBTT_strlen(x)    strlen(x)
+   #endif
+
+   #ifndef STBTT_memcpy
+   #include <string.h>
+   #define STBTT_memcpy       memcpy
+   #define STBTT_memset       memset
+   #endif
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+////
+////   INTERFACE
+////
+////
+
+#ifndef __STB_INCLUDE_STB_TRUETYPE_H__
+#define __STB_INCLUDE_STB_TRUETYPE_H__
+
+#ifdef STBTT_STATIC
+#define STBTT_DEF static
+#else
+#define STBTT_DEF extern
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// private structure
+typedef struct
+{
+   unsigned char *data;
+   int cursor;
+   int size;
+} stbtt__buf;
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// TEXTURE BAKING API
+//
+// If you use this API, you only have to call two functions ever.
+//
+
+typedef struct
+{
+   unsigned short x0,y0,x1,y1; // coordinates of bbox in bitmap
+   float xoff,yoff,xadvance;
+} stbtt_bakedchar;
+
+STBTT_DEF int stbtt_BakeFontBitmap(const unsigned char *data, int offset,  // font location (use offset=0 for plain .ttf)
+                                float pixel_height,                     // height of font in pixels
+                                unsigned char *pixels, int pw, int ph,  // bitmap to be filled in
+                                int first_char, int num_chars,          // characters to bake
+                                stbtt_bakedchar *chardata);             // you allocate this, it's num_chars long
+// if return is positive, the first unused row of the bitmap
+// if return is negative, returns the negative of the number of characters that fit
+// if return is 0, no characters fit and no rows were used
+// This uses a very crappy packing.
+
+typedef struct
+{
+   float x0,y0,s0,t0; // top-left: x0,y0 is the vertex position, s0,t0 the texture coordinate (the sample above feeds them to glVertex2f / glTexCoord2f)
+   float x1,y1,s1,t1; // bottom-right: x1,y1 is the vertex position, s1,t1 the texture coordinate
+} stbtt_aligned_quad;
+
+STBTT_DEF void stbtt_GetBakedQuad(const stbtt_bakedchar *chardata, int pw, int ph,  // same data as above
+                               int char_index,             // character to display
+                               float *xpos, float *ypos,   // pointers to current position in screen pixel space
+                               stbtt_aligned_quad *q,      // output: quad to draw
+                               int opengl_fillrule);       // true if opengl fill rule; false if DX9 or earlier
+// Call GetBakedQuad with char_index = 'character - first_char', and it
+// creates the quad you need to draw and advances the current position.
+//
+// The coordinate system used assumes y increases downwards.
+//
+// Characters will extend both above and below the current position;
+// see discussion of "BASELINE" above.
+//
+// It's inefficient; you might want to c&p it and optimize it.
+
+STBTT_DEF void stbtt_GetScaledFontVMetrics(const unsigned char *fontdata, int index, float size, float *ascent, float *descent, float *lineGap);
+// Query the font vertical metrics without having to create a font first.
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// NEW TEXTURE BAKING API
+//
+// This provides options for packing multiple fonts into one atlas, not
+// perfectly but better than nothing.
+
+typedef struct
+{
+   unsigned short x0,y0,x1,y1; // coordinates of bbox in bitmap
+   float xoff,yoff,xadvance;
+   float xoff2,yoff2;
+} stbtt_packedchar;
+
+typedef struct stbtt_pack_context stbtt_pack_context;
+typedef struct stbtt_fontinfo stbtt_fontinfo;
+#ifndef STB_RECT_PACK_VERSION
+typedef struct stbrp_rect stbrp_rect;
+#endif
+
+STBTT_DEF int  stbtt_PackBegin(stbtt_pack_context *spc, unsigned char *pixels, int width, int height, int stride_in_bytes, int padding, void *alloc_context);
+// Initializes a packing context stored in the passed-in stbtt_pack_context.
+// Future calls using this context will pack characters into the bitmap passed
+// in here: a 1-channel bitmap that is width * height. stride_in_bytes is
+// the distance from one row to the next (or 0 to mean they are packed tightly
+// together). "padding" is the amount of padding to leave between each
+// character (normally you want '1' for bitmaps you'll use as textures with
+// bilinear filtering).
+//
+// Returns 0 on failure, 1 on success.
+
+STBTT_DEF void stbtt_PackEnd  (stbtt_pack_context *spc);
+// Cleans up the packing context and frees all memory.
+
+#define STBTT_POINT_SIZE(x)   (-(x))
+
+STBTT_DEF int  stbtt_PackFontRange(stbtt_pack_context *spc, const unsigned char *fontdata, int font_index, float font_size,
+                                int first_unicode_char_in_range, int num_chars_in_range, stbtt_packedchar *chardata_for_range);
+// Creates character bitmaps from the font_index'th font found in fontdata (use
+// font_index=0 if you don't know what that is). It creates num_chars_in_range
+// bitmaps for characters with unicode values starting at first_unicode_char_in_range
+// and increasing. Data for how to render them is stored in chardata_for_range;
+// pass these to stbtt_GetPackedQuad to get back renderable quads.
+//
+// font_size is the full height of the character from ascender to descender,
+// as computed by stbtt_ScaleForPixelHeight. To use a point size as computed
+// by stbtt_ScaleForMappingEmToPixels, wrap the point size in STBTT_POINT_SIZE()
+// and pass that result as 'font_size':
+//       ...,                  20 , ... // font max minus min y is 20 pixels tall
+//       ..., STBTT_POINT_SIZE(20), ... // 'M' is 20 pixels tall
+
+typedef struct
+{
+   float font_size;
+   int first_unicode_codepoint_in_range;  // if non-zero, then the chars are continuous, and this is the first codepoint
+   int *array_of_unicode_codepoints;       // if non-zero, then this is an array of unicode codepoints
+   int num_chars;
+   stbtt_packedchar *chardata_for_range; // output
+   unsigned char h_oversample, v_oversample; // don't set these, they're used internally
+} stbtt_pack_range;
+
+STBTT_DEF int  stbtt_PackFontRanges(stbtt_pack_context *spc, const unsigned char *fontdata, int font_index, stbtt_pack_range *ranges, int num_ranges);
+// Creates character bitmaps from multiple ranges of characters stored in
+// ranges. This will usually create a better-packed bitmap than multiple
+// calls to stbtt_PackFontRange. Note that you can call this multiple
+// times within a single PackBegin/PackEnd.
+
+STBTT_DEF void stbtt_PackSetOversampling(stbtt_pack_context *spc, unsigned int h_oversample, unsigned int v_oversample);
+// Oversampling a font increases the quality by allowing higher-quality subpixel
+// positioning, and is especially valuable at smaller text sizes.
+//
+// This function sets the amount of oversampling for all following calls to
+// stbtt_PackFontRange(s) or stbtt_PackFontRangesGatherRects for a given
+// pack context. The default (no oversampling) is achieved by h_oversample=1
+// and v_oversample=1. The total number of pixels required is
+// h_oversample*v_oversample larger than the default; for example, 2x2
+// oversampling requires 4x the storage of 1x1. For best results, render
+// oversampled textures with bilinear filtering. Look at the readme in
+// stb/tests/oversample for information about oversampled fonts
+//
+// To use with PackFontRangesGather etc., you must set it before the
+// call to PackFontRangesGatherRects.
+
+STBTT_DEF void stbtt_PackSetSkipMissingCodepoints(stbtt_pack_context *spc, int skip);
+// If skip != 0, this tells stb_truetype to skip any codepoints for which
+// there is no corresponding glyph. If skip=0, which is the default, then
+// codepoints without a glyph receive the font's "missing character" glyph,
+// typically an empty box by convention.
+
+STBTT_DEF void stbtt_GetPackedQuad(const stbtt_packedchar *chardata, int pw, int ph,  // same data as above
+                               int char_index,             // character to display
+                               float *xpos, float *ypos,   // pointers to current position in screen pixel space
+                               stbtt_aligned_quad *q,      // output: quad to draw
+                               int align_to_integer);
+
+STBTT_DEF int  stbtt_PackFontRangesGatherRects(stbtt_pack_context *spc, const stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects);
+STBTT_DEF void stbtt_PackFontRangesPackRects(stbtt_pack_context *spc, stbrp_rect *rects, int num_rects);
+STBTT_DEF int  stbtt_PackFontRangesRenderIntoRects(stbtt_pack_context *spc, const stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects);
+// Calling these functions in sequence is roughly equivalent to calling
+// stbtt_PackFontRanges(). If you want more control over the packing of multiple
+// fonts, or if you want to pack custom data into a font texture, take a look
+// at the source of stbtt_PackFontRanges() and create a custom version
+// using these functions, e.g. call GatherRects multiple times,
+// building up a single array of rects, then call PackRects once,
+// then call RenderIntoRects repeatedly. This may result in a
+// better packing than calling PackFontRanges multiple times
+// (or it may not).
+
+// this is an opaque structure that you shouldn't mess with which holds
+// all the context needed from PackBegin to PackEnd.
+struct stbtt_pack_context {
+   void *user_allocator_context;
+   void *pack_info;
+   int   width;
+   int   height;
+   int   stride_in_bytes;
+   int   padding;
+   int   skip_missing;
+   unsigned int   h_oversample, v_oversample;
+   unsigned char *pixels;
+   void  *nodes;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// FONT LOADING
+//
+//
+
+STBTT_DEF int stbtt_GetNumberOfFonts(const unsigned char *data);
+// This function will determine the number of fonts in a font file.  TrueType
+// collection (.ttc) files may contain multiple fonts, while TrueType font
+// (.ttf) files only contain one font. The number of fonts can be used for
+// indexing with stbtt_GetFontOffsetForIndex, where the index is between zero and one
+// less than the total fonts. If an error occurs, -1 is returned.
+
+STBTT_DEF int stbtt_GetFontOffsetForIndex(const unsigned char *data, int index);
+// Each .ttf/.ttc file may have more than one font. Each font has a sequential
+// index number starting from 0. Call this function to get the font offset for
+// a given index; it returns -1 if the index is out of range. A regular .ttf
+// file will only define one font and it will always be at offset 0, so it will
+// return '0' for index 0, and -1 for all other indices.
+
+// The following structure is defined publicly so you can declare one on
+// the stack or as a global or etc, but you should treat it as opaque.
+struct stbtt_fontinfo
+{
+   void           * userdata;
+   unsigned char  * data;              // pointer to .ttf file
+   int              fontstart;         // offset of start of font
+
+   int numGlyphs;                     // number of glyphs, needed for range checking
+
+   int loca,head,glyf,hhea,hmtx,kern,gpos,svg; // table locations as offset from start of .ttf
+   int index_map;                     // a cmap mapping for our chosen character encoding
+   int indexToLocFormat;              // format needed to map from glyph index to glyph
+
+   stbtt__buf cff;                    // cff font data
+   stbtt__buf charstrings;            // the charstring index
+   stbtt__buf gsubrs;                 // global charstring subroutines index
+   stbtt__buf subrs;                  // private charstring subroutines index
+   stbtt__buf fontdicts;              // array of font dicts
+   stbtt__buf fdselect;               // map from glyph to fontdict
+};
+
+STBTT_DEF int stbtt_InitFont(stbtt_fontinfo *info, const unsigned char *data, int offset);
+// Given an offset into the file that defines a font, this function builds
+// the necessary cached info for the rest of the system. You must allocate
+// the stbtt_fontinfo yourself, and stbtt_InitFont will fill it out. You don't
+// need to do anything special to free it, because the contents are pure
+// value data with no additional data structures. Returns 0 on failure.
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// CHARACTER TO GLYPH-INDEX CONVERSION
+
+STBTT_DEF int stbtt_FindGlyphIndex(const stbtt_fontinfo *info, int unicode_codepoint);
+// If you're going to perform multiple operations on the same character
+// and you want a speed-up, call this function with the character you're
+// going to process, then use glyph-based functions instead of the
+// codepoint-based functions.
+// Returns 0 if the character codepoint is not defined in the font.
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// CHARACTER PROPERTIES
+//
+
+STBTT_DEF float stbtt_ScaleForPixelHeight(const stbtt_fontinfo *info, float pixels);
+// computes a scale factor to produce a font whose "height" is 'pixels' tall.
+// Height is measured as the distance from the highest ascender to the lowest
+// descender; in other words, it's equivalent to calling stbtt_GetFontVMetrics
+// and computing:
+//       scale = pixels / (ascent - descent)
+// so if you prefer to measure height by the ascent only, use a similar calculation.
+
+STBTT_DEF float stbtt_ScaleForMappingEmToPixels(const stbtt_fontinfo *info, float pixels);
+// computes a scale factor to produce a font whose EM size is mapped to
+// 'pixels' tall. This is probably what traditional APIs compute, but
+// I'm not positive.
+
+STBTT_DEF void stbtt_GetFontVMetrics(const stbtt_fontinfo *info, int *ascent, int *descent, int *lineGap);
+// ascent is the coordinate above the baseline the font extends; descent
+// is the coordinate below the baseline the font extends (i.e. it is typically negative)
+// lineGap is the spacing between one row's descent and the next row's ascent...
+// so you should advance the vertical position by "*ascent - *descent + *lineGap"
+//   these are expressed in unscaled coordinates, so you must multiply by
+//   the scale factor for a given size
+
+STBTT_DEF int  stbtt_GetFontVMetricsOS2(const stbtt_fontinfo *info, int *typoAscent, int *typoDescent, int *typoLineGap);
+// analogous to GetFontVMetrics, but returns the "typographic" values from the OS/2
+// table (specific to MS/Windows TTF files).
+//
+// Returns 1 on success (table present), 0 on failure.
+
+STBTT_DEF void stbtt_GetFontBoundingBox(const stbtt_fontinfo *info, int *x0, int *y0, int *x1, int *y1);
+// the bounding box around all possible characters
+
+STBTT_DEF void stbtt_GetCodepointHMetrics(const stbtt_fontinfo *info, int codepoint, int *advanceWidth, int *leftSideBearing);
+// leftSideBearing is the offset from the current horizontal position to the left edge of the character
+// advanceWidth is the offset from the current horizontal position to the next horizontal position
+//   these are expressed in unscaled coordinates
+
+STBTT_DEF int  stbtt_GetCodepointKernAdvance(const stbtt_fontinfo *info, int ch1, int ch2);
+// an additional amount to add to the 'advance' value between ch1 and ch2
+
+STBTT_DEF int stbtt_GetCodepointBox(const stbtt_fontinfo *info, int codepoint, int *x0, int *y0, int *x1, int *y1);
+// Gets the bounding box of the visible part of the glyph, in unscaled coordinates
+
+STBTT_DEF void stbtt_GetGlyphHMetrics(const stbtt_fontinfo *info, int glyph_index, int *advanceWidth, int *leftSideBearing);
+STBTT_DEF int  stbtt_GetGlyphKernAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2);
+STBTT_DEF int  stbtt_GetGlyphBox(const stbtt_fontinfo *info, int glyph_index, int *x0, int *y0, int *x1, int *y1);
+// as above, but takes one or more glyph indices for greater efficiency
+
+typedef struct stbtt_kerningentry
+{
+   int glyph1; // use stbtt_FindGlyphIndex
+   int glyph2;
+   int advance;
+} stbtt_kerningentry;
+
+STBTT_DEF int  stbtt_GetKerningTableLength(const stbtt_fontinfo *info);
+STBTT_DEF int  stbtt_GetKerningTable(const stbtt_fontinfo *info, stbtt_kerningentry* table, int table_length);
+// Retrieves a complete list of all of the kerning pairs provided by the font
+// stbtt_GetKerningTable never writes more than table_length entries and returns how many entries it did write.
+// The table will be sorted by (a.glyph1 == b.glyph1)?(a.glyph2 < b.glyph2):(a.glyph1 < b.glyph1)
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// GLYPH SHAPES (you probably don't need these, but they have to go before
+// the bitmaps for C declaration-order reasons)
+//
+
+#ifndef STBTT_vmove // you can predefine these to use different values (but why?)
+   enum {
+      STBTT_vmove=1, // starts a contour (the "moveto" of the shape description below)
+      STBTT_vline,   // straight line from the previous endpoint to x,y (the "lineto" below)
+      STBTT_vcurve,  // quadratic bezier to x,y using cx,cy as control point (the "curveto" below)
+      STBTT_vcubic   // NOTE(review): presumably a cubic bezier that also uses cx1,cy1 -- confirm against the implementation
+   };
+#endif
+
+#ifndef stbtt_vertex // you can predefine this to use different values
+                   // (we share this with other code at RAD)
+   #define stbtt_vertex_type short // can't use stbtt_int16 because that's not visible in the header file
+   typedef struct
+   {
+      stbtt_vertex_type x,y,cx,cy,cx1,cy1; // x,y: segment endpoint; cx,cy: bezier control point; NOTE(review): cx1,cy1 presumably the second control point for STBTT_vcubic -- confirm
+      unsigned char type,padding; // NOTE(review): type presumably holds one of the STBTT_v* values above -- confirm
+   } stbtt_vertex;
+#endif
+
+STBTT_DEF int stbtt_IsGlyphEmpty(const stbtt_fontinfo *info, int glyph_index);
+// returns non-zero if nothing is drawn for this glyph
+
+STBTT_DEF int stbtt_GetCodepointShape(const stbtt_fontinfo *info, int unicode_codepoint, stbtt_vertex **vertices);
+STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, stbtt_vertex **vertices);
+// returns # of vertices and fills *vertices with the pointer to them
+//   these are expressed in "unscaled" coordinates
+//
+// The shape is a series of contours. Each one starts with
+// a STBTT_vmove (moveto), then consists of a series of mixed
+// STBTT_vline (lineto) and STBTT_vcurve (curveto) segments. A lineto
+// draws a line from previous endpoint to its x,y; a curveto
+// draws a quadratic bezier from previous endpoint to
+// its x,y, using cx,cy as the bezier control point.
+
+STBTT_DEF void stbtt_FreeShape(const stbtt_fontinfo *info, stbtt_vertex *vertices);
+// frees the data allocated above
+
+STBTT_DEF unsigned char *stbtt_FindSVGDoc(const stbtt_fontinfo *info, int gl);
+STBTT_DEF int stbtt_GetCodepointSVG(const stbtt_fontinfo *info, int unicode_codepoint, const char **svg);
+STBTT_DEF int stbtt_GetGlyphSVG(const stbtt_fontinfo *info, int gl, const char **svg);
+// fills svg with the character's SVG data.
+// returns data size or 0 if SVG not found.
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// BITMAP RENDERING
+//
+
+STBTT_DEF void stbtt_FreeBitmap(unsigned char *bitmap, void *userdata);
+// frees the bitmap allocated below
+
+STBTT_DEF unsigned char *stbtt_GetCodepointBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int codepoint, int *width, int *height, int *xoff, int *yoff);
+// allocates a large-enough single-channel 8bpp bitmap and renders the
+// specified character/glyph at the specified scale into it, with
+// antialiasing. 0 is no coverage (transparent), 255 is fully covered (opaque).
+// *width & *height are filled out with the width & height of the bitmap,
+// which is stored left-to-right, top-to-bottom.
+//
+// xoff/yoff are the offset in pixel space from the glyph origin to the top-left of the bitmap
+
+STBTT_DEF unsigned char *stbtt_GetCodepointBitmapSubpixel(const stbtt_fontinfo *info, float scale_x, float scale_y, float shift_x, float shift_y, int codepoint, int *width, int *height, int *xoff, int *yoff);
+// the same as stbtt_GetCodepointBitmap, but you can specify a subpixel
+// shift for the character
+
+STBTT_DEF void stbtt_MakeCodepointBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int codepoint);
+// the same as stbtt_GetCodepointBitmap, but you pass in storage for the bitmap
+// in the form of 'output', with row spacing of 'out_stride' bytes. the bitmap
+// is clipped to out_w/out_h bytes. Call stbtt_GetCodepointBitmapBox to get the
+// width and height and positioning info for it first.
+
+STBTT_DEF void stbtt_MakeCodepointBitmapSubpixel(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int codepoint);
+// same as stbtt_MakeCodepointBitmap, but you can specify a subpixel
+// shift for the character
+
+STBTT_DEF void stbtt_MakeCodepointBitmapSubpixelPrefilter(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int oversample_x, int oversample_y, float *sub_x, float *sub_y, int codepoint);
+// same as stbtt_MakeCodepointBitmapSubpixel, but prefiltering
+// is performed (see stbtt_PackSetOversampling)
+
+STBTT_DEF void stbtt_GetCodepointBitmapBox(const stbtt_fontinfo *font, int codepoint, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1);
+// get the bbox of the bitmap centered around the glyph origin; so the
+// bitmap width is ix1-ix0, height is iy1-iy0, and location to place
+// the bitmap top left is (leftSideBearing*scale,iy0).
+// (Note that the bitmap uses y-increases-down, but the shape uses
+// y-increases-up, so CodepointBitmapBox and CodepointBox are inverted.)
+
+STBTT_DEF void stbtt_GetCodepointBitmapBoxSubpixel(const stbtt_fontinfo *font, int codepoint, float scale_x, float scale_y, float shift_x, float shift_y, int *ix0, int *iy0, int *ix1, int *iy1);
+// same as stbtt_GetCodepointBitmapBox, but you can specify a subpixel
+// shift for the character
+
+// the following functions are equivalent to the above functions, but operate
+// on glyph indices instead of Unicode codepoints (for efficiency)
+STBTT_DEF unsigned char *stbtt_GetGlyphBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int glyph, int *width, int *height, int *xoff, int *yoff);
+STBTT_DEF unsigned char *stbtt_GetGlyphBitmapSubpixel(const stbtt_fontinfo *info, float scale_x, float scale_y, float shift_x, float shift_y, int glyph, int *width, int *height, int *xoff, int *yoff);
+STBTT_DEF void stbtt_MakeGlyphBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int glyph);
+STBTT_DEF void stbtt_MakeGlyphBitmapSubpixel(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int glyph);
+STBTT_DEF void stbtt_MakeGlyphBitmapSubpixelPrefilter(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int oversample_x, int oversample_y, float *sub_x, float *sub_y, int glyph);
+STBTT_DEF void stbtt_GetGlyphBitmapBox(const stbtt_fontinfo *font, int glyph, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1);
+STBTT_DEF void stbtt_GetGlyphBitmapBoxSubpixel(const stbtt_fontinfo *font, int glyph, float scale_x, float scale_y,float shift_x, float shift_y, int *ix0, int *iy0, int *ix1, int *iy1);
+
+
+// @TODO: don't expose this structure
+typedef struct
+{
+   int w,h,stride;          // NOTE(review): presumably width, height and row pitch of 'pixels' -- confirm in the rasterizer
+   unsigned char *pixels;   // pixel storage; stbtt_Rasterize documents its target as a 1-channel bitmap
+} stbtt__bitmap;
+
+// rasterize a shape with quadratic beziers into a bitmap
+STBTT_DEF void stbtt_Rasterize(stbtt__bitmap *result,        // 1-channel bitmap to draw into
+                               float flatness_in_pixels,     // allowable error of curve in pixels
+                               stbtt_vertex *vertices,       // array of vertices defining shape
+                               int num_verts,                // number of vertices in above array
+                               float scale_x, float scale_y, // scale applied to input vertices
+                               float shift_x, float shift_y, // translation applied to input vertices
+                               int x_off, int y_off,         // another translation applied to input
+                               int invert,                   // if non-zero, vertically flip shape
+                               void *userdata);              // context pointer for STBTT_malloc
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Signed Distance Function (or Field) rendering
+
+STBTT_DEF void stbtt_FreeSDF(unsigned char *bitmap, void *userdata);
+// frees the SDF bitmap allocated below
+
+STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float scale, int glyph, int padding, unsigned char onedge_value, float pixel_dist_scale, int *width, int *height, int *xoff, int *yoff);
+STBTT_DEF unsigned char * stbtt_GetCodepointSDF(const stbtt_fontinfo *info, float scale, int codepoint, int padding, unsigned char onedge_value, float pixel_dist_scale, int *width, int *height, int *xoff, int *yoff);
+// These functions compute a discretized SDF field for a single character, suitable for storing
+// in a single-channel texture, sampling with bilinear filtering, and testing whether the sample
+// is larger than some threshold to produce scalable fonts.
+//        info              --  the font
+//        scale             --  controls the size of the resulting SDF bitmap, same as it would be creating a regular bitmap
+//        glyph/codepoint   --  the character to generate the SDF for
+//        padding           --  extra "pixels" around the character which are filled with the distance to the character (not 0),
+//                                 which allows effects like bit outlines
+//        onedge_value      --  value 0-255 to test the SDF against to reconstruct the character (i.e. the isocontour of the character)
+//        pixel_dist_scale  --  what value the SDF should increase by when moving one SDF "pixel" away from the edge (on the 0..255 scale)
+//                                 if positive, > onedge_value is inside; if negative, < onedge_value is inside
+//        width,height      --  output width & height of the SDF bitmap (including padding)
+//        xoff,yoff         --  output origin of the character
+//        return value      --  a 2D array of bytes 0..255, width*height in size
+//
+// pixel_dist_scale & onedge_value are a scale & bias that allows you to make
+// optimal use of the limited 0..255 for your application, trading off precision
+// and special effects. SDF values outside the range 0..255 are clamped to 0..255.
+//
+// Example:
+//      scale = stbtt_ScaleForPixelHeight(22)
+//      padding = 5
+//      onedge_value = 180
+//      pixel_dist_scale = 180/5.0 = 36.0
+//
+//      This will create an SDF bitmap in which the character is about 22 pixels
+//      high but the whole bitmap is about 22+5+5=32 pixels high. To produce a filled
+//      shape, sample the SDF at each pixel and fill the pixel if the SDF value
+//      is greater than or equal to 180/255. (You'll actually want to antialias,
+//      which is beyond the scope of this example.) Additionally, you can compute
+//      offset outlines (e.g. to stroke the character border inside & outside,
+//      or only outside). For example, to fill outside the character up to 3 SDF
+//      pixels, you would compare against (180-36.0*3)/255 = 72/255. The above
+//      choice of variables maps a range from 5 pixels outside the shape to
+//      2 pixels inside the shape to 0..255; this is intended primarily for applying
+//      outside effects only (the interior range is needed to allow proper
+//      antialiasing of the font at *smaller* sizes)
+//
+// The function computes the SDF analytically at each SDF pixel, not by e.g.
+// building a higher-res bitmap and approximating it. In theory the quality
+// should be as high as possible for an SDF of this size & representation, but
+// unclear if this is true in practice (perhaps building a higher-res bitmap
+// and computing from that can allow drop-out prevention).
+//
+// The algorithm has not been optimized at all, so expect it to be slow
+// if computing lots of characters or very large sizes.
+
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Finding the right font...
+//
+// You should really just solve this offline, keep your own tables
+// of what font is what, and don't try to get it out of the .ttf file.
+// That's because getting it out of the .ttf file is really hard, because
+// the names in the file can appear in many possible encodings, in many
+// possible languages, and e.g. if you need a case-insensitive comparison,
+// the details of that depend on the encoding & language in a complex way
+// (actually underspecified in truetype, but also gigantic).
+//
+// But you can use the provided functions in two possible ways:
+//     stbtt_FindMatchingFont() will use *case-sensitive* comparisons on
+//             unicode-encoded names to try to find the font you want;
+//             you can run this before calling stbtt_InitFont()
+//
+//     stbtt_GetFontNameString() lets you get any of the various strings
+//             from the file yourself and do your own comparisons on them.
+//             You have to have called stbtt_InitFont() first.
+
+
+STBTT_DEF int stbtt_FindMatchingFont(const unsigned char *fontdata, const char *name, int flags);
+// returns the offset (not index) of the font that matches, or -1 if none
+//   if you use STBTT_MACSTYLE_DONTCARE, use a font name like "Arial Bold".
+//   if you use any other flag, use a font name like "Arial"; this checks
+//     the 'macStyle' header field; i don't know if fonts set this consistently
+#define STBTT_MACSTYLE_DONTCARE     0
+#define STBTT_MACSTYLE_BOLD         1
+#define STBTT_MACSTYLE_ITALIC       2
+#define STBTT_MACSTYLE_UNDERSCORE   4
+#define STBTT_MACSTYLE_NONE         8   // <= not same as 0, this makes us check the bitfield is 0
+
+STBTT_DEF int stbtt_CompareUTF8toUTF16_bigendian(const char *s1, int len1, const char *s2, int len2);
+// returns 1/0 whether the first string interpreted as utf8 is identical to
+// the second string interpreted as big-endian utf16... useful for strings from next func
+
+STBTT_DEF const char *stbtt_GetFontNameString(const stbtt_fontinfo *font, int *length, int platformID, int encodingID, int languageID, int nameID);
+// returns the string (which may be big-endian double byte, e.g. for unicode)
+// and puts the length in bytes in *length.
+//
+// some of the values for the IDs are below; for more see the truetype spec:
+//     http://developer.apple.com/textfonts/TTRefMan/RM06/Chap6name.html
+//     http://www.microsoft.com/typography/otspec/name.htm
+
+enum { // platformID values accepted by stbtt_GetFontNameString
+   STBTT_PLATFORM_ID_UNICODE   = 0,
+   STBTT_PLATFORM_ID_MAC       = 1,
+   STBTT_PLATFORM_ID_ISO       = 2,
+   STBTT_PLATFORM_ID_MICROSOFT = 3
+};
+
+enum { // encodingID values that pair with STBTT_PLATFORM_ID_UNICODE
+   STBTT_UNICODE_EID_UNICODE_1_0      = 0,
+   STBTT_UNICODE_EID_UNICODE_1_1      = 1,
+   STBTT_UNICODE_EID_ISO_10646        = 2,
+   STBTT_UNICODE_EID_UNICODE_2_0_BMP  = 3,
+   STBTT_UNICODE_EID_UNICODE_2_0_FULL = 4
+};
+
+enum { // encodingID values that pair with STBTT_PLATFORM_ID_MICROSOFT
+   STBTT_MS_EID_SYMBOL       = 0,
+   STBTT_MS_EID_UNICODE_BMP  = 1,
+   STBTT_MS_EID_SHIFTJIS     = 2,
+   STBTT_MS_EID_UNICODE_FULL = 10
+};
+
+enum { // encodingID values that pair with STBTT_PLATFORM_ID_MAC; same as Script Manager codes
+   STBTT_MAC_EID_ROMAN        = 0,   STBTT_MAC_EID_JAPANESE     = 1,
+   STBTT_MAC_EID_CHINESE_TRAD = 2,   STBTT_MAC_EID_KOREAN       = 3,
+   STBTT_MAC_EID_ARABIC       = 4,   STBTT_MAC_EID_HEBREW       = 5,
+   STBTT_MAC_EID_GREEK        = 6,   STBTT_MAC_EID_RUSSIAN      = 7
+};
+
+enum { // languageID for STBTT_PLATFORM_ID_MICROSOFT; same as LCID...
+       // problematic because there are e.g. 16 english LCIDs and 16 arabic LCIDs
+   STBTT_MS_LANG_ENGLISH     =0x0409,   STBTT_MS_LANG_ITALIAN     =0x0410,
+   STBTT_MS_LANG_CHINESE     =0x0804,   STBTT_MS_LANG_JAPANESE    =0x0411,
+   STBTT_MS_LANG_DUTCH       =0x0413,   STBTT_MS_LANG_KOREAN      =0x0412,
+   STBTT_MS_LANG_FRENCH      =0x040c,   STBTT_MS_LANG_RUSSIAN     =0x0419,
+   STBTT_MS_LANG_GERMAN      =0x0407,   STBTT_MS_LANG_SPANISH     =0x040a, // was 0x0409, which is the ENGLISH value
+   STBTT_MS_LANG_HEBREW      =0x040d,   STBTT_MS_LANG_SWEDISH     =0x041D
+};
+
+enum { // languageID values that pair with STBTT_PLATFORM_ID_MAC (listed in numeric order)
+   STBTT_MAC_LANG_ENGLISH      = 0,    STBTT_MAC_LANG_FRENCH             = 1,
+   STBTT_MAC_LANG_GERMAN       = 2,    STBTT_MAC_LANG_ITALIAN            = 3,
+   STBTT_MAC_LANG_DUTCH        = 4,    STBTT_MAC_LANG_SWEDISH            = 5,
+   STBTT_MAC_LANG_SPANISH      = 6,    STBTT_MAC_LANG_HEBREW             = 10,
+   STBTT_MAC_LANG_JAPANESE     = 11,   STBTT_MAC_LANG_ARABIC             = 12,
+   STBTT_MAC_LANG_CHINESE_TRAD = 19,   STBTT_MAC_LANG_KOREAN             = 23,
+   STBTT_MAC_LANG_RUSSIAN      = 32,   STBTT_MAC_LANG_CHINESE_SIMPLIFIED = 33
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STB_INCLUDE_STB_TRUETYPE_H__
+
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+////
+////   IMPLEMENTATION
+////
+////
+
+#ifdef STB_TRUETYPE_IMPLEMENTATION
+
+#ifndef STBTT_MAX_OVERSAMPLE
+#define STBTT_MAX_OVERSAMPLE   8
+#endif
+
+#if STBTT_MAX_OVERSAMPLE > 255
+#error "STBTT_MAX_OVERSAMPLE cannot be > 255"
+#endif
+
+typedef int stbtt__test_oversample_pow2[(STBTT_MAX_OVERSAMPLE & (STBTT_MAX_OVERSAMPLE-1)) == 0 ? 1 : -1]; // compile-time check: the array size is -1 (a compile error) unless STBTT_MAX_OVERSAMPLE has at most one bit set
+
+#ifndef STBTT_RASTERIZER_VERSION
+#define STBTT_RASTERIZER_VERSION 2
+#endif
+
+#ifdef _MSC_VER
+#define STBTT__NOTUSED(v)  (void)(v)           /* references v and discards the value */
+#else
+#define STBTT__NOTUSED(v)  (void)sizeof(v)     /* references v only inside sizeof */
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+//
+// stbtt__buf helpers to parse data from file
+//
+
+// Fetch the byte under the cursor and step past it; yields 0 (cursor unchanged) once the cursor has reached the end.
+static stbtt_uint8 stbtt__buf_get8(stbtt__buf *b) {
+   if (b->cursor < b->size)
+      return b->data[b->cursor++];
+   return 0;
+}
+
+// Look at the byte under the cursor without consuming it; yields 0 once the cursor has reached the end.
+static stbtt_uint8 stbtt__buf_peek8(stbtt__buf *b) {
+   if (b->cursor < b->size)
+      return b->data[b->cursor];
+   return 0;
+}
+
+// Move the cursor to absolute offset o. Offsets outside 0..size assert and leave the cursor parked at the end.
+static void stbtt__buf_seek(stbtt__buf *b, int o) {
+   STBTT_assert(!(o > b->size || o < 0));
+   b->cursor = (o >= 0 && o <= b->size) ? o : b->size;
+}
+
+// Move the cursor by o bytes relative to its current position; range handling is that of stbtt__buf_seek.
+static void stbtt__buf_skip(stbtt__buf *b, int o) {
+   stbtt__buf_seek(b, o + b->cursor);
+}
+
+// Read n bytes (asserted to be 1..4) from the cursor as one unsigned value, most significant byte first.
+static stbtt_uint32 stbtt__buf_get(stbtt__buf *b, int n) {
+   stbtt_uint32 result = 0;
+   int remaining = n;
+   STBTT_assert(n >= 1 && n <= 4);
+   while (remaining-- > 0)
+      result = (result << 8) | stbtt__buf_get8(b);
+   return result;
+}
+
+// Wrap 'size' bytes at p in a stbtt__buf whose cursor starts at 0; size is asserted to be below 0x40000000.
+static stbtt__buf stbtt__new_buf(const void *p, size_t size) {
+   stbtt__buf buf;
+   STBTT_assert(size < 0x40000000);
+   buf.cursor = 0;
+   buf.size = (int) size;
+   buf.data = (stbtt_uint8*) p;
+   return buf;
+}
+
+#define stbtt__buf_get16(b)  stbtt__buf_get((b), 2)   /* 2-byte read, most significant byte first */
+#define stbtt__buf_get32(b)  stbtt__buf_get((b), 4)   /* 4-byte read, most significant byte first */
+
+// Sub-buffer of s bytes starting at offset o of b, cursor at 0; an empty NULL buffer when that window does not fit inside b.
+static stbtt__buf stbtt__buf_range(const stbtt__buf *b, int o, int s) {
+   stbtt__buf sub = stbtt__new_buf(NULL, 0);
+   if (o >= 0 && s >= 0 && o <= b->size && s <= b->size - o) {
+      sub.data = b->data + o; sub.size = s;
+   }
+   return sub;
+}
+
+// Consume one CFF INDEX that starts at the cursor of b and hand it back, header included, as a
+// sub-buffer of b. The cursor of b ends up wherever the skips below leave it.
+static stbtt__buf stbtt__cff_get_index(stbtt__buf *b) {
+   const int index_start = b->cursor;
+   const int entries = stbtt__buf_get16(b);
+   if (entries != 0) {
+      int offsize = stbtt__buf_get8(b);
+      STBTT_assert(offsize >= 1 && offsize <= 4);
+      stbtt__buf_skip(b, offsize * entries);               // step over the first 'entries' offsets
+      stbtt__buf_skip(b, stbtt__buf_get(b, offsize) - 1);  // then skip (final offset - 1) bytes
+   }
+   return stbtt__buf_range(b, index_start, b->cursor - index_start);
+}
+
+// Decode one integer operand at the cursor; the lead byte picks the encoding. Lead bytes not covered below assert and yield 0.
+static stbtt_uint32 stbtt__cff_int(stbtt__buf *b) {
+   const int lead = stbtt__buf_get8(b);
+   if (lead == 28) return stbtt__buf_get16(b);                                           // value in the next 2 bytes
+   if (lead == 29) return stbtt__buf_get32(b);                                           // value in the next 4 bytes
+   if (lead >= 32 && lead <= 246) return lead - 139;                                     // value held in the lead byte
+   if (lead >= 247 && lead <= 250) return (lead - 247)*256 + stbtt__buf_get8(b) + 108;   // positive, one extra byte
+   if (lead >= 251 && lead <= 254) return -(lead - 251)*256 - stbtt__buf_get8(b) - 108;  // negative, one extra byte
+   STBTT_assert(0);
+   return 0;
+}
+
+// Step over one operand: lead byte 30 starts a nibble-packed run that ends at the first byte with a 0xF nibble; anything else is an integer.
+static void stbtt__cff_skip_operand(stbtt__buf *b) {
+   int b0 = stbtt__buf_peek8(b);
+   STBTT_assert(b0 >= 28);
+   if (b0 != 30) {
+      stbtt__cff_int(b);
+      return;
+   }
+   stbtt__buf_skip(b, 1);
+   while (b->cursor < b->size) {
+      int packed = stbtt__buf_get8(b);
+      if ((packed >> 4) == 0xF || (packed & 0xF) == 0xF) break;
+   }
+}
+
+// Scan DICT b from its start for operator 'key'; a two-byte operator (12, x) is matched as x|0x100.
+// Returns the bytes holding that operator's operands, or an empty range when the key is absent.
+static stbtt__buf stbtt__dict_get(stbtt__buf *b, int key) {
+   for (stbtt__buf_seek(b, 0); b->cursor < b->size; ) {
+      int operands_begin = b->cursor, operands_end, op;
+      while (stbtt__buf_peek8(b) >= 28)
+         stbtt__cff_skip_operand(b);
+      operands_end = b->cursor;
+      op = stbtt__buf_get8(b);
+      if (op == 12) op = 0x100 | stbtt__buf_get8(b);
+      if (op == key) return stbtt__buf_range(b, operands_begin, operands_end - operands_begin);
+   }
+   return stbtt__buf_range(b, 0, 0);
+}
+
+// Decode up to outcount integer operands of DICT entry 'key' into out[]; slots beyond the operands found are left untouched.
+static void stbtt__dict_get_ints(stbtt__buf *b, int key, int outcount, stbtt_uint32 *out) {
+   stbtt__buf args = stbtt__dict_get(b, key);
+   int filled = 0;
+   while (filled < outcount && args.cursor < args.size)
+      out[filled++] = stbtt__cff_int(&args);
+}
+
+// Entry count of INDEX b: the 16-bit value at its start. Leaves the cursor of b just past that count.
+static int stbtt__cff_index_count(stbtt__buf *b) {
+   stbtt__buf_seek(b, 0);
+   return (int) stbtt__buf_get16(b);
+}
+
+// Return entry i of INDEX b as a sub-buffer. b is passed by value, so the caller's cursor is not moved.
+static stbtt__buf stbtt__cff_index_get(stbtt__buf b, int i) {
+   int count, offsize, first, past;
+   stbtt__buf_seek(&b, 0);
+   count = stbtt__buf_get16(&b);
+   offsize = stbtt__buf_get8(&b);
+   STBTT_assert(i >= 0 && i < count);
+   STBTT_assert(offsize >= 1 && offsize <= 4);
+   stbtt__buf_skip(&b, i*offsize);
+   first = stbtt__buf_get(&b, offsize);   // offset entry i
+   past = stbtt__buf_get(&b, offsize);    // offset entry i+1
+   return stbtt__buf_range(&b, 2 + (count+1)*offsize + first, past - first);
+}
+
+//////////////////////////////////////////////////////////////////////////
+//
+// accessors to parse data from file
+//
+
+// on platforms that don't allow misaligned reads, if we want to allow
+// truetype fonts that aren't padded to alignment, define ALLOW_UNALIGNED_TRUETYPE
+
+#define ttBYTE(p)     (* (stbtt_uint8 *) (p))   /* the byte at p, read as stbtt_uint8 */
+#define ttCHAR(p)     (* (stbtt_int8 *) (p))    /* the byte at p, read as stbtt_int8 */
+#define ttFixed(p)    ttLONG(p)                 /* read through ttLONG, no further conversion */
+// Big-endian readers. The 32-bit ones widen each byte to stbtt_uint32 before shifting, so p[0] >= 0x80 never shifts into the sign bit of a promoted int.
+static stbtt_uint16 ttUSHORT(stbtt_uint8 *p) { return p[0]*256 + p[1]; }
+static stbtt_int16 ttSHORT(stbtt_uint8 *p)   { return p[0]*256 + p[1]; }
+static stbtt_uint32 ttULONG(stbtt_uint8 *p)  { return ((stbtt_uint32) p[0]<<24) + ((stbtt_uint32) p[1]<<16) + ((stbtt_uint32) p[2]<<8) + p[3]; }
+static stbtt_int32 ttLONG(stbtt_uint8 *p)    { return (stbtt_int32) (((stbtt_uint32) p[0]<<24) + ((stbtt_uint32) p[1]<<16) + ((stbtt_uint32) p[2]<<8) + p[3]); }
+
+#define stbtt_tag4(p,c0,c1,c2,c3) ((p)[0] == (c0) && (p)[1] == (c1) && (p)[2] == (c2) && (p)[3] == (c3)) /* the 4 bytes at p equal c0..c3 */
+#define stbtt_tag(p,str)           stbtt_tag4(p,str[0],str[1],str[2],str[3]) /* same test against the first 4 chars of str */
+
+// Recognise a single-font file by its first four bytes: yields 1 when they equal one of the
+// version tags listed below and 0 otherwise.
+static int stbtt__isfont(stbtt_uint8 *font) {
+   // check the version number
+   return stbtt_tag4(font, '1',0,0,0)    // TrueType 1
+       || stbtt_tag(font, "typ1")        // TrueType with type 1 font -- we don't support this!
+       || stbtt_tag(font, "OTTO")        // OpenType with CFF
+       || stbtt_tag4(font, 0,1,0,0)      // OpenType 1.0
+       || stbtt_tag(font, "true");       // Apple specification for TrueType fonts
+}
+
+// @OPTIMIZE: binary search
+// Walk the table directory of the font that starts at 'fontstart', comparing each record's
+// 4-byte tag with 'tag'. Yields the 32-bit value stored at record+8 for the first match, or 0
+// when no record matches.
+static stbtt_uint32 stbtt__find_table(stbtt_uint8 *data, stbtt_uint32 fontstart, const char *tag) {
+   stbtt_int32 remaining = ttUSHORT(data+fontstart+4);   // 16-bit record count
+   stbtt_uint32 record = fontstart + 12;                 // records are 16 bytes apart from here
+   for (; remaining > 0; --remaining, record += 16) {
+      if (stbtt_tag(data+record, tag))
+         return ttULONG(data+record+8);
+   }
+   return 0;
+}
+
+// Byte offset of font number 'index' inside a font file, or -1 when the data is not recognised or the index is out of range.
+// A plain font only has index 0 (offset 0); a "ttcf" collection with version 0x00010000 or 0x00020000 stores one 32-bit offset per font from +12.
+static int stbtt_GetFontOffsetForIndex_internal(unsigned char *font_collection, int index) {
+   // if it's just a font, there's only one valid index
+   if (stbtt__isfont(font_collection))
+      return index == 0 ? 0 : -1;
+   // check if it's a TTC
+   if (stbtt_tag(font_collection, "ttcf")) {
+      // version 1.0 or 2.0 header?
+      if (ttULONG(font_collection+4) == 0x00010000 || ttULONG(font_collection+4) == 0x00020000) {
+         stbtt_int32 n = ttLONG(font_collection+8);
+         if (index < 0 || index >= n)   // a negative index would read in front of the offset table
+            return -1;
+         return ttULONG(font_collection+12+index*4);
+      }
+   }
+   return -1;
+}
+
+// Count the fonts in a file: 1 for a plain font, the 32-bit count read at +8 for a "ttcf"
+// collection whose version field is 0x00010000 or 0x00020000, and 0 for anything else.
+static int stbtt_GetNumberOfFonts_internal(unsigned char *font_collection) {
+   stbtt_uint32 ttc_version;
+
+   if (stbtt__isfont(font_collection))
+      return 1;                                   // a lone font
+   if (!stbtt_tag(font_collection, "ttcf"))
+      return 0;                                   // neither a font nor a collection
+
+   ttc_version = ttULONG(font_collection+4);
+   if (ttc_version != 0x00010000 && ttc_version != 0x00020000)
+      return 0;                                   // collection version not handled
+   return ttLONG(font_collection+8);
+}
+
+// Find the subrs INDEX for a font DICT: key 18 gives {size, offset} of a dict inside cff, whose key 19 is an offset relative to that dict. Empty buffer if either is 0.
+static stbtt__buf stbtt__get_subrs(stbtt__buf cff, stbtt__buf fontdict) {
+   stbtt_uint32 priv[2] = { 0, 0 }, subrs_rel = 0;   // priv[0] = size, priv[1] = offset within cff
+   stbtt__buf private_dict;
+   stbtt__dict_get_ints(&fontdict, 18, 2, priv);
+   if (priv[0] == 0 || priv[1] == 0) return stbtt__new_buf(NULL, 0);
+   private_dict = stbtt__buf_range(&cff, priv[1], priv[0]);
+   stbtt__dict_get_ints(&private_dict, 19, 1, &subrs_rel);
+   if (subrs_rel == 0) return stbtt__new_buf(NULL, 0);
+   stbtt__buf_seek(&cff, priv[1] + subrs_rel);
+   return stbtt__cff_get_index(&cff);
+}
+
+// since most people won't use this, find this table the first time it's needed
+// A negative info->svg means "not looked up yet". The first call caches either the "SVG " table
+// offset plus the 32-bit value stored 2 bytes into that table, or 0 when the table is absent.
+static int stbtt__get_svg(stbtt_fontinfo *info) {
+   if (info->svg < 0) {
+      stbtt_uint32 table = stbtt__find_table(info->data, info->fontstart, "SVG ");
+      if (table == 0) {
+         info->svg = 0;
+      } else {
+         stbtt_uint32 rel = ttULONG(info->data + table + 2);
+         info->svg = table + rel;
+      }
+   }
+   return info->svg;
+}
+
+static int stbtt_InitFont_internal(stbtt_fontinfo *info, unsigned char *data, int fontstart)
+{
+   stbtt_uint32 cmap, t;
+   stbtt_int32 i,numTables;
+   // Fills *info for the font located at data+fontstart. Returns 1 on success, 0 when a required table, supported CFF data or a usable cmap encoding is missing.
+   info->data = data;
+   info->fontstart = fontstart;
+   info->cff = stbtt__new_buf(NULL, 0);
+   // the offsets below are 0 when a table is absent (stbtt__find_table returns 0 for "no match")
+   cmap = stbtt__find_table(data, fontstart, "cmap");       // required
+   info->loca = stbtt__find_table(data, fontstart, "loca"); // required
+   info->head = stbtt__find_table(data, fontstart, "head"); // required
+   info->glyf = stbtt__find_table(data, fontstart, "glyf"); // required
+   info->hhea = stbtt__find_table(data, fontstart, "hhea"); // required
+   info->hmtx = stbtt__find_table(data, fontstart, "hmtx"); // required
+   info->kern = stbtt__find_table(data, fontstart, "kern"); // not required
+   info->gpos = stbtt__find_table(data, fontstart, "GPOS"); // not required
+
+   if (!cmap || !info->head || !info->hhea || !info->hmtx)
+      return 0;
+   if (info->glyf) {
+      // required for truetype
+      if (!info->loca) return 0;
+   } else {
+      // initialization for CFF / Type2 fonts (OTF)
+      stbtt__buf b, topdict, topdictidx;
+      stbtt_uint32 cstype = 2, charstrings = 0, fdarrayoff = 0, fdselectoff = 0;
+      stbtt_uint32 cff;
+
+      cff = stbtt__find_table(data, fontstart, "CFF ");
+      if (!cff) return 0;
+
+      info->fontdicts = stbtt__new_buf(NULL, 0);
+      info->fdselect = stbtt__new_buf(NULL, 0);
+
+      // @TODO this should use size from table (not 512MB)
+      info->cff = stbtt__new_buf(data+cff, 512*1024*1024);
+      b = info->cff;
+
+      // read the header
+      stbtt__buf_skip(&b, 2);                   // first two header bytes are not examined
+      stbtt__buf_seek(&b, stbtt__buf_get8(&b)); // hdrsize
+
+      // @TODO the name INDEX could list multiple fonts,
+      // but we just use the first one.
+      stbtt__cff_get_index(&b);  // name INDEX
+      topdictidx = stbtt__cff_get_index(&b);
+      topdict = stbtt__cff_index_get(topdictidx, 0);
+      stbtt__cff_get_index(&b);  // string INDEX
+      info->gsubrs = stbtt__cff_get_index(&b);
+      // keys written as 0x100 | n are the two-byte operators (12, n), as matched by stbtt__dict_get
+      stbtt__dict_get_ints(&topdict, 17, 1, &charstrings);
+      stbtt__dict_get_ints(&topdict, 0x100 | 6, 1, &cstype);
+      stbtt__dict_get_ints(&topdict, 0x100 | 36, 1, &fdarrayoff);
+      stbtt__dict_get_ints(&topdict, 0x100 | 37, 1, &fdselectoff);
+      info->subrs = stbtt__get_subrs(b, topdict);
+
+      // we only support Type 2 charstrings
+      if (cstype != 2) return 0;
+      if (charstrings == 0) return 0;
+
+      if (fdarrayoff) {
+         // looks like a CID font
+         if (!fdselectoff) return 0;
+         stbtt__buf_seek(&b, fdarrayoff);
+         info->fontdicts = stbtt__cff_get_index(&b);
+         info->fdselect = stbtt__buf_range(&b, fdselectoff, b.size-fdselectoff);
+      }
+
+      stbtt__buf_seek(&b, charstrings);
+      info->charstrings = stbtt__cff_get_index(&b);
+   }
+
+   t = stbtt__find_table(data, fontstart, "maxp");
+   if (t)
+      info->numGlyphs = ttUSHORT(data+t+4);   // 16-bit value stored 4 bytes into maxp
+   else
+      info->numGlyphs = 0xffff;               // no maxp table: fall back to the largest 16-bit value
+
+   info->svg = -1;                            // negative = not looked up yet (resolved by stbtt__get_svg)
+
+   // find a cmap encoding table we understand *now* to avoid searching
+   // later. (todo: could make this installable)
+   // the same regardless of glyph.
+   numTables = ttUSHORT(data + cmap + 2);
+   info->index_map = 0;
+   for (i=0; i < numTables; ++i) {
+      stbtt_uint32 encoding_record = cmap + 4 + 8 * i;   // 8-byte records start 4 bytes into cmap
+      // find an encoding we understand:
+      switch(ttUSHORT(data+encoding_record)) {
+         case STBTT_PLATFORM_ID_MICROSOFT:
+            switch (ttUSHORT(data+encoding_record+2)) {
+               case STBTT_MS_EID_UNICODE_BMP:
+               case STBTT_MS_EID_UNICODE_FULL:
+                  // MS/Unicode
+                  info->index_map = cmap + ttULONG(data+encoding_record+4);
+                  break;
+            }
+            break;   // leaves the switch only; the loop goes on, so a later matching record overwrites index_map
+        case STBTT_PLATFORM_ID_UNICODE:
+            // Mac/iOS has these
+            // all the encodingIDs are unicode, so we don't bother to check it
+            info->index_map = cmap + ttULONG(data+encoding_record+4);
+            break;
+      }
+   }
+   if (info->index_map == 0)
+      return 0;
+
+   info->indexToLocFormat = ttUSHORT(data+info->head + 50);   // 16-bit value stored 50 bytes into head
+   return 1;
+}
+
+// Map a Unicode codepoint to a glyph index via the cmap subtable at info->index_map. Returns 0 when unmapped; formats other than 0, 4, 6, 12, 13 assert and return 0.
+STBTT_DEF int stbtt_FindGlyphIndex(const stbtt_fontinfo *info, int unicode_codepoint) {
+   stbtt_uint8 *data = info->data;
+   stbtt_uint32 index_map = info->index_map;
+
+   stbtt_uint16 format = ttUSHORT(data + index_map + 0);
+   if (format == 0) { // apple byte encoding
+      stbtt_int32 bytes = ttUSHORT(data + index_map + 2); // 16-bit value at +2; the byte-wide glyph ids start at +6
+      if (unicode_codepoint >= 0 && unicode_codepoint < bytes-6) // a negative codepoint would index in front of the array
+         return ttBYTE(data + index_map + 6 + unicode_codepoint);
+      return 0;
+   } else if (format == 6) { // one contiguous run of 'count' 16-bit glyph ids beginning at codepoint 'first'
+      stbtt_uint32 first = ttUSHORT(data + index_map + 6);
+      stbtt_uint32 count = ttUSHORT(data + index_map + 8);
+      if ((stbtt_uint32) unicode_codepoint >= first && (stbtt_uint32) unicode_codepoint < first+count)
+         return ttUSHORT(data + index_map + 10 + (unicode_codepoint - first)*2);
+      return 0;
+   } else if (format == 2) {
+      STBTT_assert(0); // @TODO: high-byte mapping for japanese/chinese/korean
+      return 0;
+   } else if (format == 4) { // standard mapping for windows fonts: binary search collection of ranges
+      stbtt_uint16 segcount = ttUSHORT(data+index_map+6) >> 1;
+      stbtt_uint16 searchRange = ttUSHORT(data+index_map+8) >> 1;
+      stbtt_uint16 entrySelector = ttUSHORT(data+index_map+10);
+      stbtt_uint16 rangeShift = ttUSHORT(data+index_map+12) >> 1;
+
+      // do a binary search of the segments
+      stbtt_uint32 endCount = index_map + 14;
+      stbtt_uint32 search = endCount;
+
+      if (unicode_codepoint > 0xffff)
+         return 0;
+
+      // they lie from endCount .. endCount + segCount
+      // but searchRange is the nearest power of two, so...
+      if (unicode_codepoint >= ttUSHORT(data + search + rangeShift*2))
+         search += rangeShift*2;
+
+      // now decrement to bias correctly to find smallest
+      search -= 2;
+      while (entrySelector) {
+         stbtt_uint16 end;
+         searchRange >>= 1;
+         end = ttUSHORT(data + search + searchRange*2);
+         if (unicode_codepoint > end)
+            search += searchRange*2;
+         --entrySelector;
+      }
+      search += 2;
+
+      {
+         stbtt_uint16 offset, start, last;
+         stbtt_uint16 item = (stbtt_uint16) ((search - endCount) >> 1);
+
+         start = ttUSHORT(data + index_map + 14 + segcount*2 + 2 + 2*item);
+         last = ttUSHORT(data + endCount + 2*item);
+         if (unicode_codepoint < start || unicode_codepoint > last)
+            return 0;
+
+         offset = ttUSHORT(data + index_map + 14 + segcount*6 + 2 + 2*item);
+         if (offset == 0) // no per-range glyph array: add the signed 16-bit delta, truncating the sum to 16 bits
+            return (stbtt_uint16) (unicode_codepoint + ttSHORT(data + index_map + 14 + segcount*4 + 2 + 2*item));
+         // otherwise 'offset' is counted from the address of the slot it was read from
+         return ttUSHORT(data + offset + (unicode_codepoint-start)*2 + index_map + 14 + segcount*6 + 2 + 2*item);
+      }
+   } else if (format == 12 || format == 13) { // 12-byte groups of (start_char, end_char, start_glyph) from +16
+      stbtt_uint32 ngroups = ttULONG(data+index_map+12);
+      stbtt_int32 low,high;
+      low = 0; high = (stbtt_int32)ngroups;
+      // Binary search the right group.
+      while (low < high) {
+         stbtt_int32 mid = low + ((high-low) >> 1); // rounds down, so low <= mid < high
+         stbtt_uint32 start_char = ttULONG(data+index_map+16+mid*12);
+         stbtt_uint32 end_char = ttULONG(data+index_map+16+mid*12+4);
+         if ((stbtt_uint32) unicode_codepoint < start_char)
+            high = mid;
+         else if ((stbtt_uint32) unicode_codepoint > end_char)
+            low = mid+1;
+         else {
+            stbtt_uint32 start_glyph = ttULONG(data+index_map+16+mid*12+8);
+            if (format == 12)
+               return start_glyph + unicode_codepoint-start_char;
+            else // format == 13
+               return start_glyph;
+         }
+      }
+      return 0; // not found
+   }
+   // @TODO
+   STBTT_assert(0);
+   return 0;
+}
+
+STBTT_DEF int stbtt_GetCodepointShape(const stbtt_fontinfo *info, int unicode_codepoint, stbtt_vertex **vertices) {
+   int glyph = stbtt_FindGlyphIndex(info, unicode_codepoint);   // codepoint -> glyph index
+   return stbtt_GetGlyphShape(info, glyph, vertices);           // the shape lookup itself works on glyph indices
+}
+
+// Fill in one vertex record; each coordinate is narrowed to stbtt_int16 as it is stored.
+static void stbtt_setvertex(stbtt_vertex *v, stbtt_uint8 type, stbtt_int32 x, stbtt_int32 y, stbtt_int32 cx, stbtt_int32 cy) {
+   v->cy = (stbtt_int16) cy;
+   v->cx = (stbtt_int16) cx;
+   v->y = (stbtt_int16) y;
+   v->x = (stbtt_int16) x;
+   v->type = type;
+}
+
+// Offset of a glyph's data, computed as info->glyf plus the entry read from the 'loca' table; only valid when info->cff is empty.
+// Returns -1 for an out-of-range (including negative) index, an indexToLocFormat of 2 or more, or a glyph whose start and end offsets coincide.
+static int stbtt__GetGlyfOffset(const stbtt_fontinfo *info, int glyph_index) {
+   int g1,g2;
+   STBTT_assert(!info->cff.size);
+
+   if (glyph_index < 0 || glyph_index >= info->numGlyphs) return -1; // glyph index out of range (negative would read in front of loca)
+   if (info->indexToLocFormat >= 2)    return -1; // unknown index->glyph map format
+
+   if (info->indexToLocFormat == 0) { // 16-bit entries, doubled to get the offset
+      g1 = info->glyf + ttUSHORT(info->data + info->loca + glyph_index * 2) * 2;
+      g2 = info->glyf + ttUSHORT(info->data + info->loca + glyph_index * 2 + 2) * 2;
+   } else {                           // 32-bit entries used as-is
+      g1 = info->glyf + ttULONG (info->data + info->loca + glyph_index * 4);
+      g2 = info->glyf + ttULONG (info->data + info->loca + glyph_index * 4 + 4);
+   }
+
+   return g1==g2 ? -1 : g1; // if length is 0, return -1
+}
+
+static int stbtt__GetGlyphInfoT2(const stbtt_fontinfo *info, int glyph_index, int *x0, int *y0, int *x1, int *y1);
+
+// Bounding box of a glyph as stored in the font. In the TrueType path each output pointer may be NULL
+// and 0 is returned when the glyph has no data; the CFF path delegates and always reports 1.
+STBTT_DEF int stbtt_GetGlyphBox(const stbtt_fontinfo *info, int glyph_index, int *x0, int *y0, int *x1, int *y1) {
+   if (!info->cff.size) {
+      int offset = stbtt__GetGlyfOffset(info, glyph_index);
+      if (offset < 0) return 0;
+      if (x0) *x0 = ttSHORT(info->data + offset + 2);
+      if (y0) *y0 = ttSHORT(info->data + offset + 4);
+      if (x1) *x1 = ttSHORT(info->data + offset + 6);
+      if (y1) *y1 = ttSHORT(info->data + offset + 8);
+   } else {
+      stbtt__GetGlyphInfoT2(info, glyph_index, x0, y0, x1, y1);
+   }
+   return 1;
+}
+
+STBTT_DEF int stbtt_GetCodepointBox(const stbtt_fontinfo *info, int codepoint, int *x0, int *y0, int *x1, int *y1) {
+   int glyph = stbtt_FindGlyphIndex(info, codepoint);   // codepoint -> glyph index
+   return stbtt_GetGlyphBox(info, glyph, x0, y0, x1, y1);
+}
+
+// Nonzero when the glyph has nothing to draw. CFF fonts: stbtt__GetGlyphInfoT2 returns 0.
+// TrueType fonts: the glyph has no data, or the signed 16-bit value at its start is 0.
+STBTT_DEF int stbtt_IsGlyphEmpty(const stbtt_fontinfo *info, int glyph_index) {
+   int offset;
+   if (info->cff.size)
+      return stbtt__GetGlyphInfoT2(info, glyph_index, NULL, NULL, NULL, NULL) == 0;
+   offset = stbtt__GetGlyfOffset(info, glyph_index);
+   if (offset < 0)
+      return 1;
+   return ttSHORT(info->data + offset) == 0;
+}
+
+// Append the closing vertex or vertices at vertices[num_vertices...] and return the new count. With start_off set: a curve to (sx,sy) via (scx,scy),
+// preceded when was_off is set by a curve to the midpoint of (cx,cy) and (scx,scy). Otherwise: a curve via (cx,cy) if was_off, else a line to (sx,sy).
+static int stbtt__close_shape(stbtt_vertex *vertices, int num_vertices, int was_off, int start_off,
+    stbtt_int32 sx, stbtt_int32 sy, stbtt_int32 scx, stbtt_int32 scy, stbtt_int32 cx, stbtt_int32 cy) {
+   stbtt_vertex *out = vertices + num_vertices;
+   if (start_off && was_off)
+      stbtt_setvertex(out++, STBTT_vcurve, (cx+scx)>>1, (cy+scy)>>1, cx,cy);
+   if (start_off)
+      stbtt_setvertex(out++, STBTT_vcurve, sx,sy,scx,scy);
+   else if (was_off)
+      stbtt_setvertex(out++, STBTT_vcurve, sx,sy,cx,cy);
+   else
+      stbtt_setvertex(out++, STBTT_vline, sx,sy,0,0);
+   return (int) (out - vertices);
+}
+
+// Builds the vertex list for the glyph located through stbtt__GetGlyfOffset().
+//
+// On success *pvertices receives an array obtained from STBTT_malloc and the
+// number of vertices in it is returned. Returns 0 with *pvertices == NULL when
+// the glyph offset is negative, when the contour count is zero, or when an
+// allocation fails.
+//
+// A positive contour count is decoded as a simple outline; a negative count is
+// treated as a compound glyph whose components are fetched through
+// stbtt_GetGlyphShape(), transformed, and concatenated.
+static int stbtt__GetGlyphShapeTT(const stbtt_fontinfo *info, int glyph_index, stbtt_vertex **pvertices)
+{
+   stbtt_int16 numberOfContours;
+   stbtt_uint8 *endPtsOfContours;
+   stbtt_uint8 *data = info->data;
+   stbtt_vertex *vertices=0;
+   int num_vertices=0;
+   int g = stbtt__GetGlyfOffset(info, glyph_index);
+
+   *pvertices = NULL;
+
+   if (g < 0) return 0;
+
+   numberOfContours = ttSHORT(data + g);
+
+   if (numberOfContours > 0) {
+      stbtt_uint8 flags=0,flagcount;
+      stbtt_int32 ins, i,j=0,m,n, next_move, was_off=0, off, start_off=0;
+      stbtt_int32 x,y,cx,cy,sx,sy, scx,scy;
+      stbtt_uint8 *points;
+      // the per-contour end-point indices start 10 bytes into the glyph record;
+      // a 16-bit length 'ins' follows them and that many bytes are skipped
+      // before the point data
+      endPtsOfContours = (data + g + 10);
+      ins = ttUSHORT(data + g + 10 + numberOfContours * 2);
+      points = data + g + 10 + numberOfContours * 2 + 2 + ins;
+
+      // total point count = last contour's end index + 1
+      n = 1+ttUSHORT(endPtsOfContours + numberOfContours*2-2);
+
+      m = n + 2*numberOfContours;  // a loose bound on how many vertices we might need
+      vertices = (stbtt_vertex *) STBTT_malloc(m * sizeof(vertices[0]), info->userdata);
+      if (vertices == 0)
+         return 0;
+
+      next_move = 0;
+      flagcount=0;
+
+      // in first pass, we load uninterpreted data into the allocated array
+      // above, shifted to the end of the array so we won't overwrite it when
+      // we create our final data starting from the front
+
+      off = m - n; // starting offset for uninterpreted data, regardless of how m ends up being calculated
+
+      // first load flags
+
+      // a flag byte with bit 8 set is followed by a count byte; that flag is
+      // then reused for the next 'flagcount' points
+      for (i=0; i < n; ++i) {
+         if (flagcount == 0) {
+            flags = *points++;
+            if (flags & 8)
+               flagcount = *points++;
+         } else
+            --flagcount;
+         vertices[off+i].type = flags;
+      }
+
+      // now load x coordinates
+      // flag bit 2: delta is a single byte whose sign is given by bit 16;
+      // otherwise bit 16 set means "no change" and clear means a 16-bit
+      // big-endian signed delta follows
+      x=0;
+      for (i=0; i < n; ++i) {
+         flags = vertices[off+i].type;
+         if (flags & 2) {
+            stbtt_int16 dx = *points++;
+            x += (flags & 16) ? dx : -dx; // ???
+         } else {
+            if (!(flags & 16)) {
+               x = x + (stbtt_int16) (points[0]*256 + points[1]);
+               points += 2;
+            }
+         }
+         vertices[off+i].x = (stbtt_int16) x;
+      }
+
+      // now load y coordinates
+      // same scheme as x, using flag bits 4 (single-byte delta) and 32 (sign / no change)
+      y=0;
+      for (i=0; i < n; ++i) {
+         flags = vertices[off+i].type;
+         if (flags & 4) {
+            stbtt_int16 dy = *points++;
+            y += (flags & 32) ? dy : -dy; // ???
+         } else {
+            if (!(flags & 32)) {
+               y = y + (stbtt_int16) (points[0]*256 + points[1]);
+               points += 2;
+            }
+         }
+         vertices[off+i].y = (stbtt_int16) y;
+      }
+
+      // now convert them to our format
+      // flag bit 1 set means the point is on-curve; clear means it is an
+      // off-curve control point
+      num_vertices=0;
+      sx = sy = cx = cy = scx = scy = 0;
+      for (i=0; i < n; ++i) {
+         flags = vertices[off+i].type;
+         x     = (stbtt_int16) vertices[off+i].x;
+         y     = (stbtt_int16) vertices[off+i].y;
+
+         if (next_move == i) {
+            // point i is the first point of a new contour: close the previous one first
+            if (i != 0)
+               num_vertices = stbtt__close_shape(vertices, num_vertices, was_off, start_off, sx,sy,scx,scy,cx,cy);
+
+            // now start the new one
+            start_off = !(flags & 1);
+            if (start_off) {
+               // if we start off with an off-curve point, then when we need to find a point on the curve
+               // where we can start, and we need to save some state for when we wraparound.
+               scx = x;
+               scy = y;
+               // NOTE(review): point i+1 is read here without checking that i+1 < n
+               if (!(vertices[off+i+1].type & 1)) {
+                  // next point is also a curve point, so interpolate an on-point curve
+                  sx = (x + (stbtt_int32) vertices[off+i+1].x) >> 1;
+                  sy = (y + (stbtt_int32) vertices[off+i+1].y) >> 1;
+               } else {
+                  // otherwise just use the next point as our start point
+                  sx = (stbtt_int32) vertices[off+i+1].x;
+                  sy = (stbtt_int32) vertices[off+i+1].y;
+                  ++i; // we're using point i+1 as the starting point, so skip it
+               }
+            } else {
+               sx = x;
+               sy = y;
+            }
+            stbtt_setvertex(&vertices[num_vertices++], STBTT_vmove,sx,sy,0,0);
+            was_off = 0;
+            // index of the first point of the following contour
+            next_move = 1 + ttUSHORT(endPtsOfContours+j*2);
+            ++j;
+         } else {
+            if (!(flags & 1)) { // if it's a curve
+               if (was_off) // two off-curve control points in a row means interpolate an on-curve midpoint
+                  stbtt_setvertex(&vertices[num_vertices++], STBTT_vcurve, (cx+x)>>1, (cy+y)>>1, cx, cy);
+               cx = x;
+               cy = y;
+               was_off = 1;
+            } else {
+               if (was_off)
+                  stbtt_setvertex(&vertices[num_vertices++], STBTT_vcurve, x,y, cx, cy);
+               else
+                  stbtt_setvertex(&vertices[num_vertices++], STBTT_vline, x,y,0,0);
+               was_off = 0;
+            }
+         }
+      }
+      // close the final contour
+      num_vertices = stbtt__close_shape(vertices, num_vertices, was_off, start_off, sx,sy,scx,scy,cx,cy);
+   } else if (numberOfContours < 0) {
+      // Compound shapes.
+      int more = 1;
+      stbtt_uint8 *comp = data + g + 10;
+      num_vertices = 0;
+      vertices = 0;
+      while (more) {
+         stbtt_uint16 flags, gidx;
+         int comp_num_verts = 0, i;
+         stbtt_vertex *comp_verts = 0, *tmp = 0;
+         // mtx starts as the identity: [0..3] is the 2x2 part, [4],[5] the offset
+         float mtx[6] = {1,0,0,1,0,0}, m, n;
+
+         flags = ttSHORT(comp); comp+=2;
+         gidx = ttSHORT(comp); comp+=2;
+
+         if (flags & 2) { // XY values
+            if (flags & 1) { // shorts
+               mtx[4] = ttSHORT(comp); comp+=2;
+               mtx[5] = ttSHORT(comp); comp+=2;
+            } else {
+               mtx[4] = ttCHAR(comp); comp+=1;
+               mtx[5] = ttCHAR(comp); comp+=1;
+            }
+         }
+         else {
+            // @TODO handle matching point
+            STBTT_assert(0);
+         }
+         // scale values are stored as 16-bit fixed point and divided by 16384 here
+         if (flags & (1<<3)) { // WE_HAVE_A_SCALE
+            mtx[0] = mtx[3] = ttSHORT(comp)/16384.0f; comp+=2;
+            mtx[1] = mtx[2] = 0;
+         } else if (flags & (1<<6)) { // WE_HAVE_AN_X_AND_YSCALE
+            mtx[0] = ttSHORT(comp)/16384.0f; comp+=2;
+            mtx[1] = mtx[2] = 0;
+            mtx[3] = ttSHORT(comp)/16384.0f; comp+=2;
+         } else if (flags & (1<<7)) { // WE_HAVE_A_TWO_BY_TWO
+            mtx[0] = ttSHORT(comp)/16384.0f; comp+=2;
+            mtx[1] = ttSHORT(comp)/16384.0f; comp+=2;
+            mtx[2] = ttSHORT(comp)/16384.0f; comp+=2;
+            mtx[3] = ttSHORT(comp)/16384.0f; comp+=2;
+         }
+
+         // Find transformation scales.
+         m = (float) STBTT_sqrt(mtx[0]*mtx[0] + mtx[1]*mtx[1]);
+         n = (float) STBTT_sqrt(mtx[2]*mtx[2] + mtx[3]*mtx[3]);
+
+         // Get indexed glyph.
+         // NOTE(review): this recurses through stbtt_GetGlyphShape; no depth limit is visible in this function
+         comp_num_verts = stbtt_GetGlyphShape(info, gidx, &comp_verts);
+         if (comp_num_verts > 0) {
+            // Transform vertices.
+            for (i = 0; i < comp_num_verts; ++i) {
+               stbtt_vertex* v = &comp_verts[i];
+               stbtt_vertex_type x,y;
+               x=v->x; y=v->y;
+               v->x = (stbtt_vertex_type)(m * (mtx[0]*x + mtx[2]*y + mtx[4]));
+               v->y = (stbtt_vertex_type)(n * (mtx[1]*x + mtx[3]*y + mtx[5]));
+               x=v->cx; y=v->cy;
+               v->cx = (stbtt_vertex_type)(m * (mtx[0]*x + mtx[2]*y + mtx[4]));
+               v->cy = (stbtt_vertex_type)(n * (mtx[1]*x + mtx[3]*y + mtx[5]));
+            }
+            // Append vertices.
+            // a fresh array sized for old + new vertices is allocated, both are
+            // copied in, and the two source arrays are freed
+            tmp = (stbtt_vertex*)STBTT_malloc((num_vertices+comp_num_verts)*sizeof(stbtt_vertex), info->userdata);
+            if (!tmp) {
+               if (vertices) STBTT_free(vertices, info->userdata);
+               if (comp_verts) STBTT_free(comp_verts, info->userdata);
+               return 0;
+            }
+            if (num_vertices > 0 && vertices) STBTT_memcpy(tmp, vertices, num_vertices*sizeof(stbtt_vertex));
+            STBTT_memcpy(tmp+num_vertices, comp_verts, comp_num_verts*sizeof(stbtt_vertex));
+            if (vertices) STBTT_free(vertices, info->userdata);
+            vertices = tmp;
+            STBTT_free(comp_verts, info->userdata);
+            num_vertices += comp_num_verts;
+         }
+         // More components ?
+         more = flags & (1<<5);
+      }
+   } else {
+      // numberOfContours == 0, do nothing
+   }
+
+   *pvertices = vertices;
+   return num_vertices;
+}
+
+// State shared by the stbtt__csctx_* helpers while a charstring is being run.
+typedef struct
+{
+   int bounds;                              // nonzero: stbtt__csctx_v only tracks min/max and writes no vertices
+   int started;                             // set to 1 by stbtt__track_vertex once the first point has been tracked
+   float first_x, first_y;                  // start of the current subpath, set by stbtt__csctx_rmove_to
+   float x, y;                              // current point
+   stbtt_int32 min_x, max_x, min_y, max_y;  // extents accumulated by stbtt__track_vertex
+
+   stbtt_vertex *pvertices;                 // output array written by stbtt__csctx_v when bounds == 0
+   int num_vertices;                        // incremented by stbtt__csctx_v for every vertex, in both modes
+} stbtt__csctx;
+
+// Positional initializer: sets 'bounds' and zeroes every other field, in declaration order.
+#define STBTT__CSCTX_INIT(bounds) {bounds,0, 0,0, 0,0, 0,0,0,0, NULL, 0}
+
+// Grows the bounding box stored in 'c' so that it contains the point (x,y).
+// The first point tracked initialises all four extents.
+static void stbtt__track_vertex(stbtt__csctx *c, stbtt_int32 x, stbtt_int32 y)
+{
+   if (!c->started) {
+      c->min_x = c->max_x = x;
+      c->min_y = c->max_y = y;
+      c->started = 1;
+      return;
+   }
+
+   if (x > c->max_x) c->max_x = x;
+   if (y > c->max_y) c->max_y = y;
+   if (x < c->min_x) c->min_x = x;
+   if (y < c->min_y) c->min_y = y;
+}
+
+// Records one vertex in the context. In bounds mode only the extents are
+// updated (control points are included for cubics); otherwise the vertex is
+// written into c->pvertices. The vertex counter advances in both modes.
+static void stbtt__csctx_v(stbtt__csctx *c, stbtt_uint8 type, stbtt_int32 x, stbtt_int32 y, stbtt_int32 cx, stbtt_int32 cy, stbtt_int32 cx1, stbtt_int32 cy1)
+{
+   if (!c->bounds) {
+      stbtt_vertex *out = &c->pvertices[c->num_vertices];
+      stbtt_setvertex(out, type, x, y, cx, cy);
+      out->cx1 = (stbtt_int16) cx1;
+      out->cy1 = (stbtt_int16) cy1;
+   } else {
+      stbtt__track_vertex(c, x, y);
+      if (type == STBTT_vcubic) {
+         stbtt__track_vertex(c, cx, cy);
+         stbtt__track_vertex(c, cx1, cy1);
+      }
+   }
+   c->num_vertices += 1;
+}
+
+// Closes the current subpath with a straight line back to its first point,
+// unless the current point already coincides with it.
+static void stbtt__csctx_close_shape(stbtt__csctx *ctx)
+{
+   if (ctx->first_x == ctx->x && ctx->first_y == ctx->y)
+      return; // already at the start point
+
+   stbtt__csctx_v(ctx, STBTT_vline, (int)ctx->first_x, (int)ctx->first_y, 0, 0, 0, 0);
+}
+
+// Relative move: closes the open subpath, offsets the current point by (dx,dy),
+// remembers it as the start of a new subpath and emits a move vertex.
+static void stbtt__csctx_rmove_to(stbtt__csctx *ctx, float dx, float dy)
+{
+   stbtt__csctx_close_shape(ctx);
+
+   ctx->x = ctx->x + dx;
+   ctx->y = ctx->y + dy;
+   ctx->first_x = ctx->x;
+   ctx->first_y = ctx->y;
+
+   stbtt__csctx_v(ctx, STBTT_vmove, (int)ctx->x, (int)ctx->y, 0, 0, 0, 0);
+}
+
+// Relative line: offsets the current point by (dx,dy) and emits a line vertex there.
+static void stbtt__csctx_rline_to(stbtt__csctx *ctx, float dx, float dy)
+{
+   int px, py;
+
+   ctx->x = ctx->x + dx;
+   ctx->y = ctx->y + dy;
+
+   px = (int)ctx->x;
+   py = (int)ctx->y;
+   stbtt__csctx_v(ctx, STBTT_vline, px, py, 0, 0, 0, 0);
+}
+
+// Relative cubic curve: each (dx,dy) pair is an offset from the previous point
+// in the chain current -> control 1 -> control 2 -> end. The end point becomes
+// the new current point and a cubic vertex is emitted.
+static void stbtt__csctx_rccurve_to(stbtt__csctx *ctx, float dx1, float dy1, float dx2, float dy2, float dx3, float dy3)
+{
+   float ctrl1_x, ctrl1_y, ctrl2_x, ctrl2_y;
+
+   ctrl1_x = ctx->x + dx1;
+   ctrl1_y = ctx->y + dy1;
+   ctrl2_x = ctrl1_x + dx2;
+   ctrl2_y = ctrl1_y + dy2;
+
+   ctx->x = ctrl2_x + dx3;
+   ctx->y = ctrl2_y + dy3;
+
+   stbtt__csctx_v(ctx, STBTT_vcubic,
+                  (int)ctx->x, (int)ctx->y,
+                  (int)ctrl1_x, (int)ctrl1_y,
+                  (int)ctrl2_x, (int)ctrl2_y);
+}
+
+// Fetches subroutine 'n' from the subroutine index 'idx'. The number is biased
+// by an amount that depends on how many entries the index holds; an empty
+// buffer is returned when the biased number falls outside the index.
+static stbtt__buf stbtt__get_subr(stbtt__buf idx, int n)
+{
+   int count = stbtt__cff_index_count(&idx);
+   int bias;
+
+   if (count < 1240)
+      bias = 107;
+   else if (count < 33900)
+      bias = 1131;
+   else
+      bias = 32768;
+
+   n += bias;
+   if (n >= 0 && n < count)
+      return stbtt__cff_index_get(idx, n);
+   return stbtt__new_buf(NULL, 0);
+}
+
+// Returns the subroutine index that applies to 'glyph_index' in a font that has
+// an FDSelect table (info->fdselect).
+//
+// The FDSelect data maps the glyph to an entry of info->fontdicts:
+//   format 0: one selector byte per glyph
+//   format 3: a list of [start, end) glyph ranges, each with a selector byte
+// The selected font dict is then handed to stbtt__get_subrs().
+//
+// Returns an empty buffer when no selector is found (unknown format, or the
+// glyph is not covered by any range).
+static stbtt__buf stbtt__cid_get_glyph_subrs(const stbtt_fontinfo *info, int glyph_index)
+{
+   stbtt__buf fdselect = info->fdselect;
+   int nranges, start, end, v, fmt, fdselector = -1, i;
+
+   stbtt__buf_seek(&fdselect, 0);
+   fmt = stbtt__buf_get8(&fdselect);
+   if (fmt == 0) {
+      // untested
+      stbtt__buf_skip(&fdselect, glyph_index);
+      fdselector = stbtt__buf_get8(&fdselect);
+   } else if (fmt == 3) {
+      nranges = stbtt__buf_get16(&fdselect);
+      start = stbtt__buf_get16(&fdselect);
+      for (i = 0; i < nranges; i++) {
+         v = stbtt__buf_get8(&fdselect);
+         end = stbtt__buf_get16(&fdselect);
+         if (glyph_index >= start && glyph_index < end) {
+            fdselector = v;
+            break;
+         }
+         // each range ends where the next one begins
+         start = end;
+      }
+   }
+   // Without a selector there is nothing to look up. The 'return' matters:
+   // falling through would index info->fontdicts with -1.
+   if (fdselector == -1) return stbtt__new_buf(NULL, 0);
+   return stbtt__get_subrs(info->cff, stbtt__cff_index_get(info->fontdicts, fdselector));
+}
+
+// Interprets the charstring of 'glyph_index' (taken from info->charstrings) and
+// feeds every move/line/curve it describes to the context 'c' through the
+// stbtt__csctx_* helpers.
+//
+// Returns 1 when an endchar operator (0x0E) is reached. Every error path
+// returns 0: STBTT__CSERR(s) expands to 0, so its message string only
+// documents the failure in the source and is discarded.
+static int stbtt__run_charstring(const stbtt_fontinfo *info, int glyph_index, stbtt__csctx *c)
+{
+   int in_header = 1, maskbits = 0, subr_stack_height = 0, sp = 0, v, i, b0;
+   int has_subrs = 0, clear_stack;
+   // s is the operand stack (sp = number of operands currently held)
+   float s[48];
+   // subr_stack holds the buffers to resume when a called subroutine returns
+   stbtt__buf subr_stack[10], subrs = info->subrs, b;
+   float f;
+
+#define STBTT__CSERR(s) (0)
+
+   // this currently ignores the initial width value, which isn't needed if we have hmtx
+   b = stbtt__cff_index_get(info->charstrings, glyph_index);
+   while (b.cursor < b.size) {
+      // i indexes into the operand stack for the operator being executed;
+      // most operators consume the whole stack, so it is cleared afterwards
+      // unless the case resets clear_stack
+      i = 0;
+      clear_stack = 1;
+      b0 = stbtt__buf_get8(&b);
+      switch (b0) {
+      // @TODO implement hinting
+      case 0x13: // hintmask
+      case 0x14: // cntrmask
+         if (in_header)
+            maskbits += (sp / 2); // implicit "vstem"
+         in_header = 0;
+         // the mask occupies one bit per stem counted so far, rounded up to whole bytes
+         stbtt__buf_skip(&b, (maskbits + 7) / 8);
+         break;
+
+      case 0x01: // hstem
+      case 0x03: // vstem
+      case 0x12: // hstemhm
+      case 0x17: // vstemhm
+         // stems are not used, only counted (two operands each) for the mask size
+         maskbits += (sp / 2);
+         break;
+
+      case 0x15: // rmoveto
+         in_header = 0;
+         if (sp < 2) return STBTT__CSERR("rmoveto stack");
+         stbtt__csctx_rmove_to(c, s[sp-2], s[sp-1]);
+         break;
+      case 0x04: // vmoveto
+         in_header = 0;
+         if (sp < 1) return STBTT__CSERR("vmoveto stack");
+         stbtt__csctx_rmove_to(c, 0, s[sp-1]);
+         break;
+      case 0x16: // hmoveto
+         in_header = 0;
+         if (sp < 1) return STBTT__CSERR("hmoveto stack");
+         stbtt__csctx_rmove_to(c, s[sp-1], 0);
+         break;
+
+      case 0x05: // rlineto
+         if (sp < 2) return STBTT__CSERR("rlineto stack");
+         for (; i + 1 < sp; i += 2)
+            stbtt__csctx_rline_to(c, s[i], s[i+1]);
+         break;
+
+      // hlineto/vlineto and vhcurveto/hvcurveto alternate horizontal and vertical
+      // starting from a different place.
+
+      case 0x07: // vlineto
+         if (sp < 1) return STBTT__CSERR("vlineto stack");
+         // jumps into the middle of the loop below so the first segment is vertical
+         goto vlineto;
+      case 0x06: // hlineto
+         if (sp < 1) return STBTT__CSERR("hlineto stack");
+         for (;;) {
+            if (i >= sp) break;
+            stbtt__csctx_rline_to(c, s[i], 0);
+            i++;
+      vlineto:
+            if (i >= sp) break;
+            stbtt__csctx_rline_to(c, 0, s[i]);
+            i++;
+         }
+         break;
+
+      case 0x1F: // hvcurveto
+         if (sp < 4) return STBTT__CSERR("hvcurveto stack");
+         // jumps into the middle of the loop below so the first curve starts horizontal
+         goto hvcurveto;
+      case 0x1E: // vhcurveto
+         if (sp < 4) return STBTT__CSERR("vhcurveto stack");
+         for (;;) {
+            // each curve takes four operands; when exactly five remain, the
+            // fifth supplies the final delta that is otherwise 0
+            if (i + 3 >= sp) break;
+            stbtt__csctx_rccurve_to(c, 0, s[i], s[i+1], s[i+2], s[i+3], (sp - i == 5) ? s[i + 4] : 0.0f);
+            i += 4;
+      hvcurveto:
+            if (i + 3 >= sp) break;
+            stbtt__csctx_rccurve_to(c, s[i], 0, s[i+1], s[i+2], (sp - i == 5) ? s[i+4] : 0.0f, s[i+3]);
+            i += 4;
+         }
+         break;
+
+      case 0x08: // rrcurveto
+         // NOTE(review): the message below names rcurveline although this is
+         // rrcurveto; harmless because STBTT__CSERR discards the string
+         if (sp < 6) return STBTT__CSERR("rcurveline stack");
+         for (; i + 5 < sp; i += 6)
+            stbtt__csctx_rccurve_to(c, s[i], s[i+1], s[i+2], s[i+3], s[i+4], s[i+5]);
+         break;
+
+      case 0x18: // rcurveline
+         // a run of curves (six operands each) followed by one line (two operands)
+         if (sp < 8) return STBTT__CSERR("rcurveline stack");
+         for (; i + 5 < sp - 2; i += 6)
+            stbtt__csctx_rccurve_to(c, s[i], s[i+1], s[i+2], s[i+3], s[i+4], s[i+5]);
+         if (i + 1 >= sp) return STBTT__CSERR("rcurveline stack");
+         stbtt__csctx_rline_to(c, s[i], s[i+1]);
+         break;
+
+      case 0x19: // rlinecurve
+         // a run of lines (two operands each) followed by one curve (six operands)
+         if (sp < 8) return STBTT__CSERR("rlinecurve stack");
+         for (; i + 1 < sp - 6; i += 2)
+            stbtt__csctx_rline_to(c, s[i], s[i+1]);
+         if (i + 5 >= sp) return STBTT__CSERR("rlinecurve stack");
+         stbtt__csctx_rccurve_to(c, s[i], s[i+1], s[i+2], s[i+3], s[i+4], s[i+5]);
+         break;
+
+      case 0x1A: // vvcurveto
+      case 0x1B: // hhcurveto
+         if (sp < 4) return STBTT__CSERR("(vv|hh)curveto stack");
+         // with an odd operand count the first operand is an extra delta that
+         // applies to the first curve only (f is reset to 0 after it)
+         f = 0.0;
+         if (sp & 1) { f = s[i]; i++; }
+         for (; i + 3 < sp; i += 4) {
+            if (b0 == 0x1B)
+               stbtt__csctx_rccurve_to(c, s[i], f, s[i+1], s[i+2], s[i+3], 0.0);
+            else
+               stbtt__csctx_rccurve_to(c, f, s[i], s[i+1], s[i+2], 0.0, s[i+3]);
+            f = 0.0;
+         }
+         break;
+
+      case 0x0A: // callsubr
+         // on the first local call, fonts with FDSelect data resolve the
+         // subroutine index that belongs to this glyph
+         if (!has_subrs) {
+            if (info->fdselect.size)
+               subrs = stbtt__cid_get_glyph_subrs(info, glyph_index);
+            has_subrs = 1;
+         }
+         // FALLTHROUGH
+      case 0x1D: // callgsubr
+         if (sp < 1) return STBTT__CSERR("call(g|)subr stack");
+         // the subroutine number is popped from the operand stack
+         v = (int) s[--sp];
+         if (subr_stack_height >= 10) return STBTT__CSERR("recursion limit");
+         // remember where to resume, then continue reading from the subroutine
+         subr_stack[subr_stack_height++] = b;
+         b = stbtt__get_subr(b0 == 0x0A ? subrs : info->gsubrs, v);
+         if (b.size == 0) return STBTT__CSERR("subr not found");
+         b.cursor = 0;
+         clear_stack = 0;
+         break;
+
+      case 0x0B: // return
+         if (subr_stack_height <= 0) return STBTT__CSERR("return outside subr");
+         b = subr_stack[--subr_stack_height];
+         clear_stack = 0;
+         break;
+
+      case 0x0E: // endchar
+         stbtt__csctx_close_shape(c);
+         return 1;
+
+      case 0x0C: { // two-byte escape
+         float dx1, dx2, dx3, dx4, dx5, dx6, dy1, dy2, dy3, dy4, dy5, dy6;
+         float dx, dy;
+         int b1 = stbtt__buf_get8(&b);
+         switch (b1) {
+         // @TODO These "flex" implementations ignore the flex-depth and resolution,
+         // and always draw beziers.
+         case 0x22: // hflex
+            if (sp < 7) return STBTT__CSERR("hflex stack");
+            dx1 = s[0];
+            dx2 = s[1];
+            dy2 = s[2];
+            dx3 = s[3];
+            dx4 = s[4];
+            dx5 = s[5];
+            dx6 = s[6];
+            stbtt__csctx_rccurve_to(c, dx1, 0, dx2, dy2, dx3, 0);
+            stbtt__csctx_rccurve_to(c, dx4, 0, dx5, -dy2, dx6, 0);
+            break;
+
+         case 0x23: // flex
+            if (sp < 13) return STBTT__CSERR("flex stack");
+            dx1 = s[0];
+            dy1 = s[1];
+            dx2 = s[2];
+            dy2 = s[3];
+            dx3 = s[4];
+            dy3 = s[5];
+            dx4 = s[6];
+            dy4 = s[7];
+            dx5 = s[8];
+            dy5 = s[9];
+            dx6 = s[10];
+            dy6 = s[11];
+            //fd is s[12]
+            stbtt__csctx_rccurve_to(c, dx1, dy1, dx2, dy2, dx3, dy3);
+            stbtt__csctx_rccurve_to(c, dx4, dy4, dx5, dy5, dx6, dy6);
+            break;
+
+         case 0x24: // hflex1
+            if (sp < 9) return STBTT__CSERR("hflex1 stack");
+            dx1 = s[0];
+            dy1 = s[1];
+            dx2 = s[2];
+            dy2 = s[3];
+            dx3 = s[4];
+            dx4 = s[5];
+            dx5 = s[6];
+            dy5 = s[7];
+            dx6 = s[8];
+            stbtt__csctx_rccurve_to(c, dx1, dy1, dx2, dy2, dx3, 0);
+            // the last vertical delta cancels the accumulated dy1+dy2+dy5
+            stbtt__csctx_rccurve_to(c, dx4, 0, dx5, dy5, dx6, -(dy1+dy2+dy5));
+            break;
+
+         case 0x25: // flex1
+            if (sp < 11) return STBTT__CSERR("flex1 stack");
+            dx1 = s[0];
+            dy1 = s[1];
+            dx2 = s[2];
+            dy2 = s[3];
+            dx3 = s[4];
+            dy3 = s[5];
+            dx4 = s[6];
+            dy4 = s[7];
+            dx5 = s[8];
+            dy5 = s[9];
+            // s[10] is the final delta along whichever axis moved more in total;
+            // the other axis gets the negated sum of its five deltas
+            dx6 = dy6 = s[10];
+            dx = dx1+dx2+dx3+dx4+dx5;
+            dy = dy1+dy2+dy3+dy4+dy5;
+            if (STBTT_fabs(dx) > STBTT_fabs(dy))
+               dy6 = -dy;
+            else
+               dx6 = -dx;
+            stbtt__csctx_rccurve_to(c, dx1, dy1, dx2, dy2, dx3, dy3);
+            stbtt__csctx_rccurve_to(c, dx4, dy4, dx5, dy5, dx6, dy6);
+            break;
+
+         default:
+            return STBTT__CSERR("unimplemented");
+         }
+      } break;
+
+      default:
+         // bytes below 32 other than 28 are operators not handled above
+         if (b0 != 255 && b0 != 28 && b0 < 32)
+            return STBTT__CSERR("reserved operator");
+
+         // push immediate
+         if (b0 == 255) {
+            // 32-bit value divided by 0x10000 (16.16 fixed point)
+            f = (float)(stbtt_int32)stbtt__buf_get32(&b) / 0x10000;
+         } else {
+            // un-read the byte so stbtt__cff_int can decode the whole integer
+            stbtt__buf_skip(&b, -1);
+            f = (float)(stbtt_int16)stbtt__cff_int(&b);
+         }
+         if (sp >= 48) return STBTT__CSERR("push stack overflow");
+         s[sp++] = f;
+         clear_stack = 0;
+         break;
+      }
+      if (clear_stack) sp = 0;
+   }
+   // ran off the end of the buffer without seeing endchar
+   return STBTT__CSERR("no endchar");
+
+#undef STBTT__CSERR
+}
+
+// Builds the vertex list for a glyph described by a charstring.
+//
+// The charstring is run twice: once in bounds mode to count the vertices, and
+// once more to write them into an array of exactly that size (avoids realloc).
+//
+// On success returns the vertex count and stores the STBTT_malloc'd array in
+// *pvertices. On failure (charstring error or allocation failure) returns 0
+// and sets *pvertices to NULL.
+static int stbtt__GetGlyphShapeT2(const stbtt_fontinfo *info, int glyph_index, stbtt_vertex **pvertices)
+{
+   stbtt__csctx count_ctx = STBTT__CSCTX_INIT(1);
+   stbtt__csctx output_ctx = STBTT__CSCTX_INIT(0);
+   if (stbtt__run_charstring(info, glyph_index, &count_ctx)) {
+      *pvertices = (stbtt_vertex*)STBTT_malloc(count_ctx.num_vertices*sizeof(stbtt_vertex), info->userdata);
+      // allocation failed: the output pass would write through a null pointer
+      if (*pvertices == NULL)
+         return 0;
+      output_ctx.pvertices = *pvertices;
+      if (stbtt__run_charstring(info, glyph_index, &output_ctx)) {
+         STBTT_assert(output_ctx.num_vertices == count_ctx.num_vertices);
+         return output_ctx.num_vertices;
+      }
+      // the output pass failed: release the array instead of leaking it
+      STBTT_free(*pvertices, info->userdata);
+   }
+   *pvertices = NULL;
+   return 0;
+}
+
+// Runs the glyph's charstring in bounds-only mode. Each non-NULL output pointer
+// receives the matching extent (or 0 if the charstring fails). Returns the
+// number of vertices the charstring produces, or 0 on failure.
+static int stbtt__GetGlyphInfoT2(const stbtt_fontinfo *info, int glyph_index, int *x0, int *y0, int *x1, int *y1)
+{
+   stbtt__csctx c = STBTT__CSCTX_INIT(1);
+
+   if (!stbtt__run_charstring(info, glyph_index, &c)) {
+      if (x0) *x0 = 0;
+      if (y0) *y0 = 0;
+      if (x1) *x1 = 0;
+      if (y1) *y1 = 0;
+      return 0;
+   }
+
+   if (x0) *x0 = c.min_x;
+   if (y0) *y0 = c.min_y;
+   if (x1) *x1 = c.max_x;
+   if (y1) *y1 = c.max_y;
+   return c.num_vertices;
+}
+
+// Returns the vertex list for a glyph, dispatching to the charstring (T2) path
+// when the font has CFF data and to the TrueType (TT) path otherwise.
+STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, stbtt_vertex **pvertices)
+{
+   if (info->cff.size)
+      return stbtt__GetGlyphShapeT2(info, glyph_index, pvertices);
+   return stbtt__GetGlyphShapeTT(info, glyph_index, pvertices);
+}
+
+// Reads the advance width and left side bearing of a glyph from the hmtx data.
+// Either output pointer may be NULL.
+//
+// The table starts with numOfLongHorMetrics 4-byte (advance, bearing) records.
+// Glyphs past that count reuse the last record's advance and take their bearing
+// from the 2-byte entries that follow the records.
+STBTT_DEF void stbtt_GetGlyphHMetrics(const stbtt_fontinfo *info, int glyph_index, int *advanceWidth, int *leftSideBearing)
+{
+   stbtt_uint16 long_count = ttUSHORT(info->data+info->hhea + 34);
+   stbtt_uint8 *hmtx = info->data + info->hmtx;
+   int advance_off, bearing_off;
+
+   if (glyph_index < long_count) {
+      advance_off = 4*glyph_index;
+      bearing_off = 4*glyph_index + 2;
+   } else {
+      advance_off = 4*(long_count-1);
+      bearing_off = 4*long_count + 2*(glyph_index - long_count);
+   }
+
+   if (advanceWidth)     *advanceWidth    = ttSHORT(hmtx + advance_off);
+   if (leftSideBearing)  *leftSideBearing = ttSHORT(hmtx + bearing_off);
+}
+
+// Returns the number of kerning pairs in the first subtable of the kern table,
+// or 0 when the font has no kern table or that subtable is not usable.
+STBTT_DEF int  stbtt_GetKerningTableLength(const stbtt_fontinfo *info)
+{
+   stbtt_uint8 *kern = info->data + info->kern;
+
+   // we only look at the first table. it must be 'horizontal' and format 0.
+   if (!info->kern                 // no kern table at all
+       || ttUSHORT(kern+2) < 1     // number of tables, need at least 1
+       || ttUSHORT(kern+8) != 1)   // horizontal flag must be set in format
+      return 0;
+
+   return ttUSHORT(kern+10);
+}
+
+// Copies up to 'table_length' kerning pairs from the first subtable of the kern
+// table into 'table'. Returns the number of entries written, or 0 when the font
+// has no kern table or that subtable is not usable.
+STBTT_DEF int stbtt_GetKerningTable(const stbtt_fontinfo *info, stbtt_kerningentry* table, int table_length)
+{
+   stbtt_uint8 *kern = info->data + info->kern;
+   int count, idx;
+
+   // we only look at the first table. it must be 'horizontal' and format 0.
+   if (!info->kern                 // no kern table at all
+       || ttUSHORT(kern+2) < 1     // number of tables, need at least 1
+       || ttUSHORT(kern+8) != 1)   // horizontal flag must be set in format
+      return 0;
+
+   // never write more entries than the caller has room for
+   count = ttUSHORT(kern+10);
+   if (count > table_length)
+      count = table_length;
+
+   // pairs are 6-byte records starting 18 bytes in: glyph1, glyph2, advance
+   for (idx = 0; idx < count; ++idx) {
+      stbtt_uint8 *pair = kern + 18 + (idx*6);
+      table[idx].glyph1  = ttUSHORT(pair);
+      table[idx].glyph2  = ttUSHORT(pair + 2);
+      table[idx].advance = ttSHORT(pair + 4);
+   }
+
+   return count;
+}
+
+// Looks up the kerning advance for the pair (glyph1, glyph2) in the first
+// subtable of the kern table. Returns 0 when there is no usable table or no
+// entry for the pair.
+static int stbtt__GetGlyphKernInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2)
+{
+   stbtt_uint8 *kern = info->data + info->kern;
+   stbtt_uint32 wanted, found;
+   int lo, hi, mid;
+
+   // we only look at the first table. it must be 'horizontal' and format 0.
+   if (!info->kern                 // no kern table at all
+       || ttUSHORT(kern+2) < 1     // number of tables, need at least 1
+       || ttUSHORT(kern+8) != 1)   // horizontal flag must be set in format
+      return 0;
+
+   // binary search over 6-byte records keyed by the combined 32-bit glyph pair
+   lo = 0;
+   hi = ttUSHORT(kern+10) - 1;
+   wanted = glyph1 << 16 | glyph2;
+   while (lo <= hi) {
+      stbtt_uint8 *pair;
+      mid = (lo + hi) >> 1;
+      pair = kern + 18 + (mid*6);
+      found = ttULONG(pair); // note: unaligned read
+      if (wanted == found)
+         return ttSHORT(pair + 4);
+      if (wanted < found)
+         hi = mid - 1;
+      else
+         lo = mid + 1;
+   }
+   return 0;
+}
+
+// Finds the coverage index of 'glyph' in a coverage table.
+//   format 1: sorted list of glyph ids; the index is the position in the list
+//   format 2: sorted list of (start, end, startCoverageIndex) ranges
+// Returns -1 when the glyph is not covered or the format is unsupported.
+static stbtt_int32 stbtt__GetCoverageIndex(stbtt_uint8 *coverageTable, int glyph)
+{
+   stbtt_uint16 format = ttUSHORT(coverageTable);
+   stbtt_uint16 count;
+   stbtt_uint8 *entries;
+   stbtt_int32 lo, hi;
+
+   if (format != 1 && format != 2)
+      return -1; // unsupported
+
+   // both formats store an entry count followed by the entry array
+   count = ttUSHORT(coverageTable + 2);
+   entries = coverageTable + 4;
+   lo = 0;
+   hi = count - 1;
+
+   if (format == 1) {
+      // Binary search over 2-byte glyph ids.
+      while (lo <= hi) {
+         stbtt_int32 mid = (lo + hi) >> 1;
+         int candidate = ttUSHORT(entries + 2 * mid);
+         if (glyph < candidate)
+            hi = mid - 1;
+         else if (glyph > candidate)
+            lo = mid + 1;
+         else
+            return mid;
+      }
+   } else {
+      // Binary search over 6-byte range records.
+      while (lo <= hi) {
+         stbtt_int32 mid = (lo + hi) >> 1;
+         stbtt_uint8 *record = entries + 6 * mid;
+         int first = ttUSHORT(record);
+         int last = ttUSHORT(record + 2);
+         if (glyph < first)
+            hi = mid - 1;
+         else if (glyph > last)
+            lo = mid + 1;
+         else
+            return ttUSHORT(record + 4) + glyph - first;
+      }
+   }
+
+   return -1;
+}
+
+// Returns the class assigned to 'glyph' by a class definition table.
+//   format 1: one class value per glyph in a contiguous glyph id range
+//   format 2: sorted list of (start, end, class) ranges
+// Glyphs the table does not mention get class 0; an unsupported format yields -1.
+static stbtt_int32  stbtt__GetGlyphClass(stbtt_uint8 *classDefTable, int glyph)
+{
+   stbtt_uint16 format = ttUSHORT(classDefTable);
+
+   if (format == 1) {
+      stbtt_uint16 first_glyph = ttUSHORT(classDefTable + 2);
+      stbtt_uint16 glyph_count = ttUSHORT(classDefTable + 4);
+      stbtt_uint8 *class_values = classDefTable + 6;
+
+      if (glyph >= first_glyph && glyph < first_glyph + glyph_count)
+         return (stbtt_int32)ttUSHORT(class_values + 2 * (glyph - first_glyph));
+
+      // "All glyphs not assigned to a class fall into class 0". (OpenType spec)
+      return 0;
+   }
+
+   if (format == 2) {
+      stbtt_uint16 range_count = ttUSHORT(classDefTable + 2);
+      stbtt_uint8 *ranges = classDefTable + 4;
+      stbtt_int32 lo = 0, hi = range_count - 1;
+
+      // Binary search over 6-byte range records.
+      while (lo <= hi) {
+         stbtt_int32 mid = (lo + hi) >> 1;
+         stbtt_uint8 *record = ranges + 6 * mid;
+         int first = ttUSHORT(record);
+         int last = ttUSHORT(record + 2);
+         if (glyph < first)
+            hi = mid - 1;
+         else if (glyph > last)
+            lo = mid + 1;
+         else
+            return (stbtt_int32)ttUSHORT(record + 4);
+      }
+
+      // "All glyphs not assigned to a class fall into class 0". (OpenType spec)
+      return 0;
+   }
+
+   return -1; // Unsupported definition type, return an error.
+}
+
+// Define to STBTT_assert(x) if you want to break on unimplemented formats.
+#define STBTT_GPOS_TODO_assert(x)
+
+// Looks up the advance adjustment for the glyph pair (glyph1, glyph2) in the
+// font's GPOS table.
+//
+// Only GPOS version 1.0 is read, only lookups of type 2 (pair adjustment) are
+// considered, and within those only subtables with valueFormat1 == 4 and
+// valueFormat2 == 0, in position format 1 (per-glyph pair sets) or 2 (class
+// pairs). Returns the signed 16-bit adjustment that was found, or 0 when the
+// font has no GPOS table, nothing matches, or an unsupported or malformed
+// subtable is met.
+static stbtt_int32 stbtt__GetGlyphGPOSInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2)
+{
+   stbtt_uint16 lookupListOffset;
+   stbtt_uint8 *lookupList;
+   stbtt_uint16 lookupCount;
+   stbtt_uint8 *data;
+   stbtt_int32 i, sti;
+
+   if (!info->gpos) return 0;
+
+   data = info->data + info->gpos;
+
+   if (ttUSHORT(data+0) != 1) return 0; // Major version 1
+   if (ttUSHORT(data+2) != 0) return 0; // Minor version 0
+
+   lookupListOffset = ttUSHORT(data+8);
+   lookupList = data + lookupListOffset;
+   lookupCount = ttUSHORT(lookupList);
+
+   for (i=0; i<lookupCount; ++i) {
+      stbtt_uint16 lookupOffset = ttUSHORT(lookupList + 2 + 2 * i);
+      stbtt_uint8 *lookupTable = lookupList + lookupOffset;
+
+      stbtt_uint16 lookupType = ttUSHORT(lookupTable);
+      stbtt_uint16 subTableCount = ttUSHORT(lookupTable + 4);
+      stbtt_uint8 *subTableOffsets = lookupTable + 6;
+      if (lookupType != 2) // Pair Adjustment Positioning Subtable
+         continue;
+
+      for (sti=0; sti<subTableCount; sti++) {
+         stbtt_uint16 subtableOffset = ttUSHORT(subTableOffsets + 2 * sti);
+         stbtt_uint8 *table = lookupTable + subtableOffset;
+         stbtt_uint16 posFormat = ttUSHORT(table);
+         stbtt_uint16 coverageOffset = ttUSHORT(table + 2);
+         // the first glyph must be covered by this subtable
+         stbtt_int32 coverageIndex = stbtt__GetCoverageIndex(table + coverageOffset, glyph1);
+         if (coverageIndex == -1) continue;
+
+         switch (posFormat) {
+            case 1: {
+               stbtt_int32 l, r, m;
+               int straw, needle;
+               stbtt_uint16 valueFormat1 = ttUSHORT(table + 4);
+               stbtt_uint16 valueFormat2 = ttUSHORT(table + 6);
+               if (valueFormat1 == 4 && valueFormat2 == 0) { // Support more formats?
+                  stbtt_int32 valueRecordPairSizeInBytes = 2;
+                  stbtt_uint16 pairSetCount = ttUSHORT(table + 8);
+                  stbtt_uint16 pairPosOffset;
+                  stbtt_uint8 *pairValueTable;
+                  stbtt_uint16 pairValueCount;
+                  stbtt_uint8 *pairValueArray;
+
+                  // Validate the index before it is used to read the pair set
+                  // offset, so a malformed font cannot cause a read past the
+                  // offset array.
+                  if (coverageIndex >= pairSetCount) return 0;
+
+                  pairPosOffset = ttUSHORT(table + 10 + 2 * coverageIndex);
+                  pairValueTable = table + pairPosOffset;
+                  pairValueCount = ttUSHORT(pairValueTable);
+                  pairValueArray = pairValueTable + 2;
+
+                  needle=glyph2;
+                  r=pairValueCount-1;
+                  l=0;
+
+                  // Binary search for the second glyph among this pair set's records.
+                  while (l <= r) {
+                     stbtt_uint16 secondGlyph;
+                     stbtt_uint8 *pairValue;
+                     m = (l + r) >> 1;
+                     pairValue = pairValueArray + (2 + valueRecordPairSizeInBytes) * m;
+                     secondGlyph = ttUSHORT(pairValue);
+                     straw = secondGlyph;
+                     if (needle < straw)
+                        r = m - 1;
+                     else if (needle > straw)
+                        l = m + 1;
+                     else {
+                        stbtt_int16 xAdvance = ttSHORT(pairValue + 2);
+                        return xAdvance;
+                     }
+                  }
+               } else
+                  return 0;
+               // second glyph not in this pair set: try the next subtable
+               break;
+            }
+
+            case 2: {
+               stbtt_uint16 valueFormat1 = ttUSHORT(table + 4);
+               stbtt_uint16 valueFormat2 = ttUSHORT(table + 6);
+               if (valueFormat1 == 4 && valueFormat2 == 0) { // Support more formats?
+                  stbtt_uint16 classDef1Offset = ttUSHORT(table + 8);
+                  stbtt_uint16 classDef2Offset = ttUSHORT(table + 10);
+                  int glyph1class = stbtt__GetGlyphClass(table + classDef1Offset, glyph1);
+                  int glyph2class = stbtt__GetGlyphClass(table + classDef2Offset, glyph2);
+
+                  stbtt_uint16 class1Count = ttUSHORT(table + 12);
+                  stbtt_uint16 class2Count = ttUSHORT(table + 14);
+                  stbtt_uint8 *class1Records, *class2Records;
+                  stbtt_int16 xAdvance;
+
+                  if (glyph1class < 0 || glyph1class >= class1Count) return 0; // malformed
+                  if (glyph2class < 0 || glyph2class >= class2Count) return 0; // malformed
+
+                  // the records form a class1Count x class2Count grid of
+                  // 2-byte values, indexed as [glyph1class][glyph2class]
+                  class1Records = table + 16;
+                  class2Records = class1Records + 2 * (glyph1class * class2Count);
+                  xAdvance = ttSHORT(class2Records + 2 * glyph2class);
+                  return xAdvance;
+               } else
+                  return 0;
+               break;
+            }
+
+            default:
+               return 0; // Unsupported position format
+         }
+      }
+   }
+
+   return 0;
+}
+
+STBTT_DEF int  stbtt_GetGlyphKernAdvance(const stbtt_fontinfo *info, int g1, int g2)
+{
+   // Kerning adjustment between glyph indices g1 and g2.
+   // When info->gpos is set the GPOS lookup is used exclusively; the kern
+   // table lookup is consulted only if info->gpos is not set.
+   if (info->gpos)
+      return stbtt__GetGlyphGPOSInfoAdvance(info, g1, g2);
+   if (info->kern)
+      return stbtt__GetGlyphKernInfoAdvance(info, g1, g2);
+
+   return 0; // neither table is available
+}
+
+STBTT_DEF int  stbtt_GetCodepointKernAdvance(const stbtt_fontinfo *info, int ch1, int ch2)
+{
+   if (info->kern || info->gpos) // only pay for the two codepoint->glyph lookups when some kerning table exists
+      return stbtt_GetGlyphKernAdvance(info, stbtt_FindGlyphIndex(info,ch1), stbtt_FindGlyphIndex(info,ch2));
+   return 0;
+}
+
+STBTT_DEF void stbtt_GetCodepointHMetrics(const stbtt_fontinfo *info, int codepoint, int *advanceWidth, int *leftSideBearing)
+{ // codepoint convenience wrapper around stbtt_GetGlyphHMetrics
+   stbtt_GetGlyphHMetrics(info, stbtt_FindGlyphIndex(info,codepoint), advanceWidth, leftSideBearing); // glyph index comes from stbtt_FindGlyphIndex; both output pointers are forwarded unchanged
+}
+
+STBTT_DEF void stbtt_GetFontVMetrics(const stbtt_fontinfo *info, int *ascent, int *descent, int *lineGap)
+{
+   // Reads three values via ttSHORT at offsets 4, 6 and 8 of the table
+   // located at info->hhea. Each output pointer is optional: pass 0 for
+   // any value that is not needed.
+   stbtt_uint8 *hhea = info->data + info->hhea;
+
+   if (ascent  != 0) *ascent  = ttSHORT(hhea + 4);
+   if (descent != 0) *descent = ttSHORT(hhea + 6);
+   if (lineGap != 0) *lineGap = ttSHORT(hhea + 8);
+}
+
+STBTT_DEF int  stbtt_GetFontVMetricsOS2(const stbtt_fontinfo *info, int *typoAscent, int *typoDescent, int *typoLineGap)
+{
+   // Looks up the "OS/2" table and, when it exists, reads the values at
+   // offsets 68, 70 and 72 into whichever output pointers are non-null.
+   // Returns 1 if the table was found, 0 otherwise (outputs left untouched).
+   int os2 = stbtt__find_table(info->data, info->fontstart, "OS/2");
+   if (os2) {
+      stbtt_uint8 *table = info->data + os2;
+      if (typoAscent  != 0) *typoAscent  = ttSHORT(table + 68);
+      if (typoDescent != 0) *typoDescent = ttSHORT(table + 70);
+      if (typoLineGap != 0) *typoLineGap = ttSHORT(table + 72);
+      return 1;
+   }
+   return 0;
+}
+
+STBTT_DEF void stbtt_GetFontBoundingBox(const stbtt_fontinfo *info, int *x0, int *y0, int *x1, int *y1)
+{
+   // Reads four values via ttSHORT at offsets 36, 38, 40 and 42 of the table
+   // located at info->head, in the order x0, y0, x1, y1.
+   // Any output pointer may be 0 to skip that value, which matches how
+   // stbtt_GetFontVMetrics and the bitmap-box getters treat their outputs;
+   // callers that pass four valid pointers see no change.
+   stbtt_uint8 *head = info->data + info->head;
+
+   if (x0) *x0 = ttSHORT(head + 36);
+   if (y0) *y0 = ttSHORT(head + 38);
+   if (x1) *x1 = ttSHORT(head + 40);
+   if (y1) *y1 = ttSHORT(head + 42);
+}
+
+STBTT_DEF float stbtt_ScaleForPixelHeight(const stbtt_fontinfo *info, float height)
+{
+   // Scale factor that maps (ascent - descent), read via ttSHORT from
+   // offsets 4 and 6 of the table at info->hhea, onto `height`.
+   stbtt_uint8 *hhea = info->data + info->hhea;
+   int ascent  = ttSHORT(hhea + 4);
+   int descent = ttSHORT(hhea + 6);
+   int fheight = ascent - descent;
+   return height / fheight;
+}
+
+STBTT_DEF float stbtt_ScaleForMappingEmToPixels(const stbtt_fontinfo *info, float pixels)
+{
+   // units-per-em is the unsigned 16-bit value read via ttUSHORT at offset 18
+   // of the table at info->head; the result scales one em to `pixels`
+   stbtt_uint8 *head = info->data + info->head;
+   int unitsPerEm = ttUSHORT(head + 18);
+   return pixels / unitsPerEm;
+}
+
+STBTT_DEF void stbtt_FreeShape(const stbtt_fontinfo *info, stbtt_vertex *v)
+{ // releases a vertex array through STBTT_free
+   STBTT_free(v, info->userdata); // info is only used to supply the allocator's userdata
+}
+
+STBTT_DEF stbtt_uint8 *stbtt_FindSVGDoc(const stbtt_fontinfo *info, int gl)
+{
+   // Scans the SVG document list for the first 12-byte record whose glyph
+   // range (two ttUSHORT values at record offsets 0 and 2, inclusive)
+   // contains gl. Returns a pointer to that record, or 0 when no record
+   // matches or no document list is available.
+   int i, numEntries;
+   stbtt_uint8 *svg_doc_list, *svg_docs;
+   int svg_offset = stbtt__get_svg((stbtt_fontinfo *) info);
+
+   // stbtt_GetGlyphSVG treats an svg offset of 0 as "no SVG data". With a
+   // non-positive offset the list pointer would land on (or before) the
+   // start of the font data and unrelated bytes would be parsed as records.
+   // NOTE(review): assumes stbtt__get_svg returns the same offset that is
+   // stored in info->svg -- confirm against its definition.
+   if (svg_offset <= 0)
+      return 0;
+
+   svg_doc_list = info->data + svg_offset;
+   numEntries = ttUSHORT(svg_doc_list);
+   svg_docs = svg_doc_list + 2; // records start right after the 16-bit entry count
+
+   for(i=0; i<numEntries; i++) {
+      stbtt_uint8 *svg_doc = svg_docs + (12 * i);
+      if ((gl >= ttUSHORT(svg_doc)) && (gl <= ttUSHORT(svg_doc + 2)))
+         return svg_doc;
+   }
+   return 0;
+}
+
+STBTT_DEF int stbtt_GetGlyphSVG(const stbtt_fontinfo *info, int gl, const char **svg)
+{
+   // On success stores in *svg a pointer into the font data for glyph gl's
+   // SVG document and returns the length read from the matching record;
+   // returns 0 (leaving *svg untouched) when info->svg is 0 or no record
+   // covers gl.
+   stbtt_uint8 *record;
+
+   if (info->svg == 0)
+      return 0;
+
+   record = stbtt_FindSVGDoc(info, gl);
+   if (record == NULL)
+      return 0;
+
+   // record + 4: document offset relative to info->svg; record + 8: length
+   *svg = (char *) info->data + info->svg + ttULONG(record + 4);
+   return ttULONG(record + 8);
+}
+
+STBTT_DEF int stbtt_GetCodepointSVG(const stbtt_fontinfo *info, int unicode_codepoint, const char **svg)
+{ // codepoint convenience wrapper around stbtt_GetGlyphSVG
+   return stbtt_GetGlyphSVG(info, stbtt_FindGlyphIndex(info, unicode_codepoint), svg); // glyph index comes from stbtt_FindGlyphIndex; the return value and *svg are whatever stbtt_GetGlyphSVG produces
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// antialiasing software rasterizer
+//
+
+STBTT_DEF void stbtt_GetGlyphBitmapBoxSubpixel(const stbtt_fontinfo *font, int glyph, float scale_x, float scale_y,float shift_x, float shift_y, int *ix0, int *iy0, int *ix1, int *iy1)
+{
+   // Computes the integer pixel box covered by a glyph once the box reported
+   // by stbtt_GetGlyphBox is scaled, shifted and negated in y. Every output
+   // pointer is optional. A glyph for which stbtt_GetGlyphBox reports no box
+   // yields (0,0,0,0).
+   int bx0=0,by0=0,bx1,by1; // =0 suppresses compiler warning
+
+   if (stbtt_GetGlyphBox(font, glyph, &bx0,&by0,&bx1,&by1)) {
+      // move to integral bboxes (treating pixels as little squares, what pixels get touched)?
+      // y is negated, so -by1 feeds iy0 and -by0 feeds iy1
+      if (ix0) *ix0 = STBTT_ifloor( bx0 * scale_x + shift_x);
+      if (iy0) *iy0 = STBTT_ifloor(-by1 * scale_y + shift_y);
+      if (ix1) *ix1 = STBTT_iceil ( bx1 * scale_x + shift_x);
+      if (iy1) *iy1 = STBTT_iceil (-by0 * scale_y + shift_y);
+      return;
+   }
+
+   // e.g. space character
+   if (ix0) *ix0 = 0;
+   if (iy0) *iy0 = 0;
+   if (ix1) *ix1 = 0;
+   if (iy1) *iy1 = 0;
+}
+
+STBTT_DEF void stbtt_GetGlyphBitmapBox(const stbtt_fontinfo *font, int glyph, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1)
+{ // same as stbtt_GetGlyphBitmapBoxSubpixel with no subpixel shift
+   stbtt_GetGlyphBitmapBoxSubpixel(font, glyph, scale_x, scale_y,0.0f,0.0f, ix0, iy0, ix1, iy1); // shift_x and shift_y are both 0.0f
+}
+
+STBTT_DEF void stbtt_GetCodepointBitmapBoxSubpixel(const stbtt_fontinfo *font, int codepoint, float scale_x, float scale_y, float shift_x, float shift_y, int *ix0, int *iy0, int *ix1, int *iy1)
+{ // codepoint convenience wrapper around stbtt_GetGlyphBitmapBoxSubpixel
+   stbtt_GetGlyphBitmapBoxSubpixel(font, stbtt_FindGlyphIndex(font,codepoint), scale_x, scale_y,shift_x,shift_y, ix0,iy0,ix1,iy1); // glyph index comes from stbtt_FindGlyphIndex; all other arguments are forwarded unchanged
+}
+
+STBTT_DEF void stbtt_GetCodepointBitmapBox(const stbtt_fontinfo *font, int codepoint, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1)
+{ // same as stbtt_GetCodepointBitmapBoxSubpixel with no subpixel shift
+   stbtt_GetCodepointBitmapBoxSubpixel(font, codepoint, scale_x, scale_y,0.0f,0.0f, ix0,iy0,ix1,iy1); // shift_x and shift_y are both 0.0f
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//  Rasterizer
+
+typedef struct stbtt__hheap_chunk
+{ // header of one chunk obtained by stbtt__hheap_alloc; the item storage follows it in the same allocation
+   struct stbtt__hheap_chunk *next; // next chunk in the list headed by stbtt__hheap.head (walked by stbtt__hheap_cleanup)
+} stbtt__hheap_chunk;
+
+typedef struct stbtt__hheap
+{ // allocator state: items are carved out of chunks and freed items are recycled through a free list
+   struct stbtt__hheap_chunk *head; // most recently allocated chunk; earlier chunks hang off ->next
+   void   *first_free; // singly linked list of freed items; the first bytes of each freed item hold the next pointer
+   int    num_remaining_in_head_chunk; // item slots in the head chunk that have not been handed out yet
+} stbtt__hheap;
+
+static void *stbtt__hheap_alloc(stbtt__hheap *hh, size_t size, void *userdata)
+{
+   // Returns storage for one item of `size` bytes, or NULL if a new chunk is
+   // needed and STBTT_malloc fails. Freed items are reused without looking
+   // at `size`.
+
+   // recycle a previously freed item if one is available
+   if (hh->first_free) {
+      void *item = hh->first_free;
+      hh->first_free = * (void **) item;
+      return item;
+   }
+
+   // head chunk exhausted (or no chunk yet): allocate a new one whose item
+   // count depends on the item size
+   if (hh->num_remaining_in_head_chunk == 0) {
+      int count;
+      stbtt__hheap_chunk *chunk;
+      if (size < 32)
+         count = 2000;
+      else if (size < 128)
+         count = 800;
+      else
+         count = 100;
+      chunk = (stbtt__hheap_chunk *) STBTT_malloc(sizeof(stbtt__hheap_chunk) + size * count, userdata);
+      if (chunk == NULL)
+         return NULL;
+      chunk->next = hh->head;
+      hh->head = chunk;
+      hh->num_remaining_in_head_chunk = count;
+   }
+
+   // slots are handed out from the end of the head chunk toward its start
+   --hh->num_remaining_in_head_chunk;
+   return (char *) (hh->head) + sizeof(stbtt__hheap_chunk) + size * hh->num_remaining_in_head_chunk;
+}
+
+static void stbtt__hheap_free(stbtt__hheap *hh, void *p)
+{
+   // push p onto the free list: the item's own first bytes store the old
+   // list head, then p becomes the new head
+   void **link = (void **) p;
+   *link = hh->first_free;
+   hh->first_free = p;
+}
+
+static void stbtt__hheap_cleanup(stbtt__hheap *hh, void *userdata)
+{
+   // releases every chunk in the list starting at hh->head; hh itself is
+   // not modified
+   stbtt__hheap_chunk *chunk, *following;
+   for (chunk = hh->head; chunk; chunk = following) {
+      following = chunk->next; // read the link before the chunk is released
+      STBTT_free(chunk, userdata);
+   }
+}
+
+typedef struct stbtt__edge { // one straight edge fed to the rasterizer; edge arrays are ordered by y0 (see STBTT__COMPARE)
+   float x0,y0, x1,y1; // endpoints: (x0,y0) and (x1,y1)
+   int invert; // selects the sign of the active edge's direction in stbtt__new_active (nonzero -> +1, zero -> -1)
+} stbtt__edge;
+
+
+typedef struct stbtt__active_edge
+{ // per-edge state while the edge intersects the scanlines being rasterized; layout depends on STBTT_RASTERIZER_VERSION
+   struct stbtt__active_edge *next; // next edge in the active list
+   #if STBTT_RASTERIZER_VERSION==1
+   int x,dx; // current x and per-scanline x step, both scaled by STBTT_FIX
+   float ey; // y at which the edge ends (copied from the edge's y1)
+   int direction; // +1 or -1 winding contribution; set to 0 when the edge is retired
+   #elif STBTT_RASTERIZER_VERSION==2
+   float fx,fdx,fdy; // fx: current x; fdx: x change per unit y; fdy: 1/fdx, or 0 when fdx is 0
+   float direction; // +1.0f or -1.0f; set to 0 when the edge is retired
+   float sy; // y at which the edge starts (copied from the edge's y0)
+   float ey; // y at which the edge ends (copied from the edge's y1)
+   #else
+   #error "Unrecognized value of STBTT_RASTERIZER_VERSION"
+   #endif
+} stbtt__active_edge;
+
+#if STBTT_RASTERIZER_VERSION == 1
+#define STBTT_FIXSHIFT   10 // number of fractional bits in the version-1 rasterizer's fixed-point x coordinates
+#define STBTT_FIX        (1 << STBTT_FIXSHIFT) // fixed-point representation of 1.0
+#define STBTT_FIXMASK    (STBTT_FIX-1) // mask selecting the fractional bits
+
+static stbtt__active_edge *stbtt__new_active(stbtt__hheap *hh, stbtt__edge *e, int off_x, float start_point, void *userdata)
+{ // version-1 variant: builds the fixed-point active-edge record for edge e, positioned at y == start_point; returns NULL if allocation fails
+   stbtt__active_edge *z = (stbtt__active_edge *) stbtt__hheap_alloc(hh, sizeof(*z), userdata);
+   float dxdy = (e->x1 - e->x0) / (e->y1 - e->y0); // x change per unit y along the edge
+   STBTT_assert(z != NULL);
+   if (!z) return z; // when the assert is compiled out, allocation failure is reported to the caller
+
+   // round dx down to avoid overshooting
+   if (dxdy < 0)
+      z->dx = -STBTT_ifloor(STBTT_FIX * -dxdy);
+   else
+      z->dx = STBTT_ifloor(STBTT_FIX * dxdy);
+
+   z->x = STBTT_ifloor(STBTT_FIX * e->x0 + z->dx * (start_point - e->y0)); // use z->dx so when we offset later it's by the same amount
+   z->x -= off_x * STBTT_FIX; // subtract the caller's x offset, converted to fixed point
+
+   z->ey = e->y1;
+   z->next = 0;
+   z->direction = e->invert ? 1 : -1; // winding contribution summed by stbtt__fill_active_edges
+   return z;
+}
+#elif STBTT_RASTERIZER_VERSION == 2
+static stbtt__active_edge *stbtt__new_active(stbtt__hheap *hh, stbtt__edge *e, int off_x, float start_point, void *userdata)
+{ // version-2 variant: builds the floating-point active-edge record for edge e, positioned at y == start_point; returns NULL if allocation fails
+   stbtt__active_edge *z = (stbtt__active_edge *) stbtt__hheap_alloc(hh, sizeof(*z), userdata);
+   float dxdy = (e->x1 - e->x0) / (e->y1 - e->y0); // x change per unit y along the edge
+   STBTT_assert(z != NULL);
+   //STBTT_assert(e->y0 <= start_point);
+   if (!z) return z; // when the assert is compiled out, allocation failure is reported to the caller
+   z->fdx = dxdy;
+   z->fdy = dxdy != 0.0f ? (1.0f/dxdy) : 0.0f; // y change per unit x; 0 stands in for a vertical edge
+   z->fx = e->x0 + dxdy * (start_point - e->y0); // x of the edge's line at y == start_point
+   z->fx -= off_x; // subtract the caller's x offset
+   z->direction = e->invert ? 1.0f : -1.0f; // sign applied to the coverage this edge contributes
+   z->sy = e->y0;
+   z->ey = e->y1;
+   z->next = 0;
+   return z;
+}
+#else
+#error "Unrecognized value of STBTT_RASTERIZER_VERSION"
+#endif
+
+#if STBTT_RASTERIZER_VERSION == 1
+// note: this routine clips fills that extend off the edges... ideally this
+// wouldn't happen, but it could happen if the truetype glyph bounding boxes
+// are wrong, or if the user supplies a too-small bitmap
+static void stbtt__fill_active_edges(unsigned char *scanline, int len, stbtt__active_edge *e, int max_weight)
+{ // adds coverage to scanline[0..len-1] for every span where the running sum of edge directions is nonzero; a fully covered pixel receives max_weight
+   // non-zero winding fill
+   int x0=0, w=0; // x0: fixed-point start of the span being accumulated; w: running winding sum
+
+   while (e) {
+      if (w == 0) {
+         // if we're currently at zero, we need to record the edge start point
+         x0 = e->x; w += e->direction;
+      } else {
+         int x1 = e->x; w += e->direction;
+         // if we went to zero, we need to draw
+         if (w == 0) {
+            int i = x0 >> STBTT_FIXSHIFT; // pixel containing the span start
+            int j = x1 >> STBTT_FIXSHIFT; // pixel containing the span end
+
+            if (i < len && j >= 0) { // skip spans lying entirely outside the scanline
+               if (i == j) {
+                  // x0,x1 are the same pixel, so compute combined coverage
+                  scanline[i] = scanline[i] + (stbtt_uint8) ((x1 - x0) * max_weight >> STBTT_FIXSHIFT);
+               } else {
+                  if (i >= 0) // add antialiasing for x0
+                     scanline[i] = scanline[i] + (stbtt_uint8) (((STBTT_FIX - (x0 & STBTT_FIXMASK)) * max_weight) >> STBTT_FIXSHIFT);
+                  else
+                     i = -1; // clip
+
+                  if (j < len) // add antialiasing for x1
+                     scanline[j] = scanline[j] + (stbtt_uint8) (((x1 & STBTT_FIXMASK) * max_weight) >> STBTT_FIXSHIFT);
+                  else
+                     j = len; // clip
+
+                  for (++i; i < j; ++i) // fill pixels between x0 and x1
+                     scanline[i] = scanline[i] + (stbtt_uint8) max_weight;
+               }
+            }
+         }
+      }
+
+      e = e->next;
+   }
+}
+
+static void stbtt__rasterize_sorted_edges(stbtt__bitmap *result, stbtt__edge *e, int n, int vsubsample, int off_x, int off_y, void *userdata)
+{
+   // Version-1 (vertically supersampled) rasterizer. Walks the edge array e
+   // (ordered by y0, with one writable extra slot at e[n] used as a sentinel)
+   // and writes result->h rows of result->w coverage bytes into
+   // result->pixels. Each output row accumulates vsubsample sub-scanlines,
+   // each contributing at most 255/vsubsample per pixel.
+   // If the temporary scanline buffer cannot be allocated the function
+   // returns without writing any pixels.
+   stbtt__hheap hh = { 0, 0, 0 };
+   stbtt__active_edge *active = NULL;
+   int y,j=0;
+   int max_weight = (255 / vsubsample);  // weight per vertical scanline
+   int s; // vertical subsample index
+   unsigned char scanline_data[512], *scanline;
+
+   if (result->w > 512) {
+      scanline = (unsigned char *) STBTT_malloc(result->w, userdata);
+      if (scanline == NULL)
+         return; // out of memory: nothing else has been allocated yet, so bail out instead of writing through NULL
+   } else
+      scanline = scanline_data;
+
+   y = off_y * vsubsample;
+   // sentinel: starts below the last sub-scanline so the insertion loop always stops
+   e[n].y0 = (off_y + result->h) * (float) vsubsample + 1;
+
+   while (j < result->h) {
+      STBTT_memset(scanline, 0, result->w);
+      for (s=0; s < vsubsample; ++s) {
+         // find center of pixel for this scanline
+         float scan_y = y + 0.5f;
+         stbtt__active_edge **step = &active;
+
+         // update all active edges;
+         // remove all active edges that terminate before the center of this scanline
+         while (*step) {
+            stbtt__active_edge * z = *step;
+            if (z->ey <= scan_y) {
+               *step = z->next; // delete from list
+               STBTT_assert(z->direction);
+               z->direction = 0;
+               stbtt__hheap_free(&hh, z);
+            } else {
+               z->x += z->dx; // advance to position for current scanline
+               step = &((*step)->next); // advance through list
+            }
+         }
+
+         // resort the list if needed
+         for(;;) {
+            int changed=0;
+            step = &active;
+            while (*step && (*step)->next) {
+               if ((*step)->x > (*step)->next->x) {
+                  stbtt__active_edge *t = *step;
+                  stbtt__active_edge *q = t->next;
+
+                  t->next = q->next;
+                  q->next = t;
+                  *step = q;
+                  changed = 1;
+               }
+               step = &(*step)->next;
+            }
+            if (!changed) break;
+         }
+
+         // insert all edges that start before the center of this scanline -- omit ones that also end on this scanline
+         while (e->y0 <= scan_y) {
+            if (e->y1 > scan_y) {
+               stbtt__active_edge *z = stbtt__new_active(&hh, e, off_x, scan_y, userdata);
+               if (z != NULL) {
+                  // find insertion point
+                  if (active == NULL)
+                     active = z;
+                  else if (z->x < active->x) {
+                     // insert at front
+                     z->next = active;
+                     active = z;
+                  } else {
+                     // find thing to insert AFTER
+                     stbtt__active_edge *p = active;
+                     while (p->next && p->next->x < z->x)
+                        p = p->next;
+                     // at this point, p->next->x is NOT < z->x
+                     z->next = p->next;
+                     p->next = z;
+                  }
+               }
+            }
+            ++e;
+         }
+
+         // now process all active edges in XOR fashion
+         if (active)
+            stbtt__fill_active_edges(scanline, result->w, active, max_weight);
+
+         ++y;
+      }
+      STBTT_memcpy(result->pixels + j * result->stride, scanline, result->w);
+      ++j;
+   }
+
+   stbtt__hheap_cleanup(&hh, userdata);
+
+   if (scanline != scanline_data)
+      STBTT_free(scanline, userdata);
+}
+
+#elif STBTT_RASTERIZER_VERSION == 2
+
+// the edge passed in here does not cross the vertical line at x or the vertical line at x+1
+// (i.e. it has already been clipped to those)
+static void stbtt__handle_clipped_edge(float *scanline, int x, stbtt__active_edge *e, float x0, float y0, float x1, float y1)
+{ // adds to scanline[x] the signed coverage that segment (x0,y0)-(x1,y1) contributes to pixel column x, after trimming the segment to the edge's [sy, ey] range
+   if (y0 == y1) return; // zero-height segment contributes nothing
+   STBTT_assert(y0 < y1);
+   STBTT_assert(e->sy <= e->ey);
+   if (y0 > e->ey) return; // segment lies entirely past the end of the edge
+   if (y1 < e->sy) return; // segment lies entirely before the start of the edge
+   if (y0 < e->sy) { // trim the start of the segment to the edge's start y
+      x0 += (x1-x0) * (e->sy - y0) / (y1-y0);
+      y0 = e->sy;
+   }
+   if (y1 > e->ey) { // trim the end of the segment to the edge's end y
+      x1 += (x1-x0) * (e->ey - y1) / (y1-y0);
+      y1 = e->ey;
+   }
+
+   if (x0 == x)
+      STBTT_assert(x1 <= x+1);
+   else if (x0 == x+1)
+      STBTT_assert(x1 >= x);
+   else if (x0 <= x)
+      STBTT_assert(x1 <= x);
+   else if (x0 >= x+1)
+      STBTT_assert(x1 >= x+1);
+   else
+      STBTT_assert(x1 >= x && x1 <= x+1);
+
+   if (x0 <= x && x1 <= x) // segment is at or left of the pixel: the whole height counts
+      scanline[x] += e->direction * (y1-y0);
+   else if (x0 >= x+1 && x1 >= x+1) // segment is at or right of the pixel: no contribution
+      ;
+   else {
+      STBTT_assert(x0 >= x && x0 <= x+1 && x1 >= x && x1 <= x+1);
+      scanline[x] += e->direction * (y1-y0) * (1-((x0-x)+(x1-x))/2); // coverage = 1 - average x position
+   }
+}
+
+static float stbtt__sized_trapezoid_area(float height, float top_width, float bottom_width)
+{ // area of a trapezoid from its height and its two parallel side lengths
+   STBTT_assert(top_width >= 0);
+   STBTT_assert(bottom_width >= 0);
+   return (top_width + bottom_width) / 2.0f * height; // mean width times height; height is not asserted and callers pass signed values
+}
+
+static float stbtt__position_trapezoid_area(float height, float tx0, float tx1, float bx0, float bx1)
+{ // trapezoid area from x extents: top side spans tx0..tx1, bottom side spans bx0..bx1
+   return stbtt__sized_trapezoid_area(height, tx1 - tx0, bx1 - bx0); // both widths must be non-negative (asserted by the callee)
+}
+
+static float stbtt__sized_triangle_area(float height, float width)
+{ // area of a right triangle with the given legs: height * width / 2
+   return height * width / 2; // the result carries the sign of its inputs; no range checks are made
+}
+
+static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill, int len, stbtt__active_edge *e, float y_top)
+{ // for the scanline spanning y_top..y_top+1, accumulates each active edge's signed coverage: scanline[x] gets the partial area inside pixel x, scanline_fill[x] the height applied to everything right of pixel x
+   float y_bottom = y_top+1;
+
+   while (e) {
+      // brute force every pixel
+
+      // compute intersection points with top & bottom
+      STBTT_assert(e->ey >= y_top);
+
+      if (e->fdx == 0) { // vertical edge: x is constant across the scanline
+         float x0 = e->fx;
+         if (x0 < len) {
+            if (x0 >= 0) {
+               stbtt__handle_clipped_edge(scanline,(int) x0,e, x0,y_top, x0,y_bottom);
+               stbtt__handle_clipped_edge(scanline_fill-1,(int) x0+1,e, x0,y_top, x0,y_bottom);
+            } else {
+               stbtt__handle_clipped_edge(scanline_fill-1,0,e, x0,y_top, x0,y_bottom);
+            }
+         }
+      } else {
+         float x0 = e->fx;
+         float dx = e->fdx;
+         float xb = x0 + dx; // x of the edge's line at y_bottom
+         float x_top, x_bottom;
+         float sy0,sy1;
+         float dy = e->fdy;
+         STBTT_assert(e->sy <= y_bottom && e->ey >= y_top);
+
+         // compute endpoints of line segment clipped to this scanline (if the
+         // line segment starts on this scanline). x0 is the intersection of the
+         // line with y_top, but that may be off the line segment.
+         if (e->sy > y_top) {
+            x_top = x0 + dx * (e->sy - y_top);
+            sy0 = e->sy;
+         } else {
+            x_top = x0;
+            sy0 = y_top;
+         }
+         if (e->ey < y_bottom) {
+            x_bottom = x0 + dx * (e->ey - y_top);
+            sy1 = e->ey;
+         } else {
+            x_bottom = xb;
+            sy1 = y_bottom;
+         }
+
+         if (x_top >= 0 && x_bottom >= 0 && x_top < len && x_bottom < len) {
+            // from here on, we don't have to range check x values
+
+            if ((int) x_top == (int) x_bottom) {
+               float height;
+               // simple case, only spans one pixel
+               int x = (int) x_top;
+               height = (sy1 - sy0) * e->direction;
+               STBTT_assert(x >= 0 && x < len);
+               scanline[x]      += stbtt__position_trapezoid_area(height, x_top, x+1.0f, x_bottom, x+1.0f);
+               scanline_fill[x] += height; // everything right of this pixel is filled
+            } else {
+               int x,x1,x2;
+               float y_crossing, y_final, step, sign, area;
+               // covers 2+ pixels
+               if (x_top > x_bottom) {
+                  // flip scanline vertically; signed area is the same
+                  float t;
+                  sy0 = y_bottom - (sy0 - y_top);
+                  sy1 = y_bottom - (sy1 - y_top);
+                  t = sy0, sy0 = sy1, sy1 = t;
+                  t = x_bottom, x_bottom = x_top, x_top = t;
+                  dx = -dx;
+                  dy = -dy;
+                  t = x0, x0 = xb, xb = t;
+               }
+               STBTT_assert(dy >= 0);
+               STBTT_assert(dx >= 0);
+
+               x1 = (int) x_top;
+               x2 = (int) x_bottom;
+               // compute the y at which the edge crosses the vertical line x = x1+1
+               y_crossing = y_top + dy * (x1+1 - x0);
+
+               // compute the y at which the edge crosses the vertical line x = x2
+               y_final = y_top + dy * (x2 - x0);
+
+               //           x1    x_top                            x2    x_bottom
+               //     y_top  +------|-----+------------+------------+--------|---+------------+
+               //            |            |            |            |            |            |
+               //            |            |            |            |            |            |
+               //       sy0  |      Txxxxx|............|............|............|............|
+               // y_crossing |            *xxxxx.......|............|............|............|
+               //            |            |     xxxxx..|............|............|............|
+               //            |            |     /-   xx*xxxx........|............|............|
+               //            |            | dy <       |    xxxxxx..|............|............|
+               //   y_final  |            |     \-     |          xx*xxx.........|............|
+               //       sy1  |            |            |            |   xxxxxB...|............|
+               //            |            |            |            |            |            |
+               //            |            |            |            |            |            |
+               //  y_bottom  +------------+------------+------------+------------+------------+
+               //
+               // goal is to measure the area covered by '.' in each pixel
+
+               // if x2 is right at the right edge of x1, y_crossing can blow up, github #1057
+               // @TODO: maybe test against sy1 rather than y_bottom?
+               if (y_crossing > y_bottom)
+                  y_crossing = y_bottom;
+
+               sign = e->direction;
+
+               // area of the rectangle covered from sy0..y_crossing
+               area = sign * (y_crossing-sy0);
+
+               // area of the triangle (x_top,sy0), (x1+1,sy0), (x1+1,y_crossing)
+               scanline[x1] += stbtt__sized_triangle_area(area, x1+1 - x_top);
+
+               // check if final y_crossing is blown up; no test case for this
+               if (y_final > y_bottom) {
+                  y_final = y_bottom;
+                  dy = (y_final - y_crossing ) / (x2 - (x1+1)); // if denom=0, y_final = y_crossing, so y_final <= y_bottom
+               }
+
+               // in second pixel, area covered by line segment found in first pixel
+               // is always a rectangle 1 wide * the height of that line segment; this
+               // is exactly what the variable 'area' stores. it also gets a contribution
+               // from the line segment within it. the THIRD pixel will get the first
+               // pixel's rectangle contribution, the second pixel's rectangle contribution,
+               // and its own contribution. the 'own contribution' is the same in every pixel except
+               // the leftmost and rightmost, a trapezoid that slides down in each pixel.
+               // the second pixel's contribution to the third pixel will be the
+               // rectangle 1 wide times the height change in the second pixel, which is dy.
+
+               step = sign * dy * 1; // dy is dy/dx, change in y for every 1 change in x,
+               // which multiplied by 1-pixel-width is how much pixel area changes for each step in x
+               // so the area advances by 'step' every time
+
+               for (x = x1+1; x < x2; ++x) {
+                  scanline[x] += area + step/2; // area of trapezoid is 1*step/2
+                  area += step;
+               }
+               STBTT_assert(STBTT_fabs(area) <= 1.01f); // accumulated error from area += step unless we round step down
+               STBTT_assert(sy1 > y_final-0.01f);
+
+               // area covered in the last pixel is the rectangle from all the pixels to the left,
+               // plus the trapezoid filled by the line segment in this pixel all the way to the right edge
+               scanline[x2] += area + sign * stbtt__position_trapezoid_area(sy1-y_final, (float) x2, x2+1.0f, x_bottom, x2+1.0f);
+
+               // the rest of the line is filled based on the total height of the line segment in this pixel
+               scanline_fill[x2] += sign * (sy1-sy0);
+            }
+         } else {
+            // if edge goes outside of box we're drawing, we require
+            // clipping logic. since this does not match the intended use
+            // of this library, we use a different, very slow brute
+            // force implementation
+            // note though that this does happen some of the time because
+            // x_top and x_bottom can be extrapolated at the top & bottom of
+            // the shape and actually lie outside the bounding box
+            int x;
+            for (x=0; x < len; ++x) {
+               // cases:
+               //
+               // there can be up to two intersections with the pixel. any intersection
+               // with left or right edges can be handled by splitting into two (or three)
+               // regions. intersections with top & bottom do not necessitate case-wise logic.
+               //
+               // the old way of doing this found the intersections with the left & right edges,
+               // then used some simple logic to produce up to three segments in sorted order
+               // from top-to-bottom. however, this had a problem: if an x edge was epsilon
+               // across the x border, then the corresponding y position might not be distinct
+               // from the other y segment, and it might be ignored as an empty segment. to avoid
+               // that, we need to explicitly produce segments based on x positions.
+
+               // rename variables to clearly-defined pairs
+               float y0 = y_top;
+               float x1 = (float) (x);
+               float x2 = (float) (x+1);
+               float x3 = xb;
+               float y3 = y_bottom;
+
+               // x = e->x + e->dx * (y-y_top)
+               // (y-y_top) = (x - e->x) / e->dx
+               // y = (x - e->x) / e->dx + y_top
+               float y1 = (x - x0) / dx + y_top;
+               float y2 = (x+1 - x0) / dx + y_top;
+
+               if (x0 < x1 && x3 > x2) {         // three segments descending down-right
+                  stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x1,y1);
+                  stbtt__handle_clipped_edge(scanline,x,e, x1,y1, x2,y2);
+                  stbtt__handle_clipped_edge(scanline,x,e, x2,y2, x3,y3);
+               } else if (x3 < x1 && x0 > x2) {  // three segments descending down-left
+                  stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x2,y2);
+                  stbtt__handle_clipped_edge(scanline,x,e, x2,y2, x1,y1);
+                  stbtt__handle_clipped_edge(scanline,x,e, x1,y1, x3,y3);
+               } else if (x0 < x1 && x3 > x1) {  // two segments across x, down-right
+                  stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x1,y1);
+                  stbtt__handle_clipped_edge(scanline,x,e, x1,y1, x3,y3);
+               } else if (x3 < x1 && x0 > x1) {  // two segments across x, down-left
+                  stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x1,y1);
+                  stbtt__handle_clipped_edge(scanline,x,e, x1,y1, x3,y3);
+               } else if (x0 < x2 && x3 > x2) {  // two segments across x+1, down-right
+                  stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x2,y2);
+                  stbtt__handle_clipped_edge(scanline,x,e, x2,y2, x3,y3);
+               } else if (x3 < x2 && x0 > x2) {  // two segments across x+1, down-left
+                  stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x2,y2);
+                  stbtt__handle_clipped_edge(scanline,x,e, x2,y2, x3,y3);
+               } else {  // one segment
+                  stbtt__handle_clipped_edge(scanline,x,e, x0,y0, x3,y3);
+               }
+            }
+         }
+      }
+      e = e->next;
+   }
+}
+
+// directly AA rasterize edges w/o supersampling
+//
+// Version-2 rasterizer. Walks the edge array e (ordered by y0, with one
+// writable extra slot at e[n] used as a sentinel) and writes result->h rows of
+// result->w coverage bytes into result->pixels. vsubsample is unused here.
+// If the temporary scanline buffer cannot be allocated the function returns
+// without writing any pixels.
+static void stbtt__rasterize_sorted_edges(stbtt__bitmap *result, stbtt__edge *e, int n, int vsubsample, int off_x, int off_y, void *userdata)
+{
+   stbtt__hheap hh = { 0, 0, 0 };
+   stbtt__active_edge *active = NULL;
+   int y,j=0, i;
+   float scanline_data[129], *scanline, *scanline2;
+
+   STBTT__NOTUSED(vsubsample);
+
+   // one buffer holds both accumulators: w floats of per-pixel coverage
+   // followed by w+1 floats of fill deltas (129 == 64*2+1 covers w <= 64)
+   if (result->w > 64) {
+      scanline = (float *) STBTT_malloc((result->w*2+1) * sizeof(float), userdata);
+      if (scanline == NULL)
+         return; // out of memory: nothing else has been allocated yet, so bail out instead of writing through NULL
+   } else
+      scanline = scanline_data;
+
+   scanline2 = scanline + result->w;
+
+   y = off_y;
+   // sentinel: starts below the last scanline so the insertion loop always stops
+   e[n].y0 = (float) (off_y + result->h) + 1;
+
+   while (j < result->h) {
+      // top and bottom y of this scanline
+      float scan_y_top    = y + 0.0f;
+      float scan_y_bottom = y + 1.0f;
+      stbtt__active_edge **step = &active;
+
+      STBTT_memset(scanline , 0, result->w*sizeof(scanline[0]));
+      STBTT_memset(scanline2, 0, (result->w+1)*sizeof(scanline[0]));
+
+      // update all active edges;
+      // remove all active edges that terminate before the top of this scanline
+      while (*step) {
+         stbtt__active_edge * z = *step;
+         if (z->ey <= scan_y_top) {
+            *step = z->next; // delete from list
+            STBTT_assert(z->direction);
+            z->direction = 0;
+            stbtt__hheap_free(&hh, z);
+         } else {
+            step = &((*step)->next); // advance through list
+         }
+      }
+
+      // insert all edges that start before the bottom of this scanline
+      while (e->y0 <= scan_y_bottom) {
+         if (e->y0 != e->y1) {
+            stbtt__active_edge *z = stbtt__new_active(&hh, e, off_x, scan_y_top, userdata);
+            if (z != NULL) {
+               if (j == 0 && off_y != 0) {
+                  if (z->ey < scan_y_top) {
+                     // this can happen due to subpixel positioning and some kind of fp rounding error i think
+                     z->ey = scan_y_top;
+                  }
+               }
+               STBTT_assert(z->ey >= scan_y_top); // if we get really unlucky a tiny bit of an edge can be out of bounds
+               // insert at front
+               z->next = active;
+               active = z;
+            }
+         }
+         ++e;
+      }
+
+      // now process all active edges
+      if (active)
+         stbtt__fill_active_edges_new(scanline, scanline2+1, result->w, active, scan_y_top);
+
+      {
+         // running sum of the fill deltas plus the per-pixel coverage gives
+         // the signed coverage; its magnitude is scaled to 0..255 and clamped
+         float sum = 0;
+         for (i=0; i < result->w; ++i) {
+            float k;
+            int m;
+            sum += scanline2[i];
+            k = scanline[i] + sum;
+            k = (float) STBTT_fabs(k)*255 + 0.5f;
+            m = (int) k;
+            if (m > 255) m = 255;
+            result->pixels[j*result->stride + i] = (unsigned char) m;
+         }
+      }
+      // advance all the edges
+      step = &active;
+      while (*step) {
+         stbtt__active_edge *z = *step;
+         z->fx += z->fdx; // advance to position for current scanline
+         step = &((*step)->next); // advance through list
+      }
+
+      ++y;
+      ++j;
+   }
+
+   stbtt__hheap_cleanup(&hh, userdata);
+
+   if (scanline != scanline_data)
+      STBTT_free(scanline, userdata);
+}
+#else
+#error "Unrecognized value of STBTT_RASTERIZER_VERSION"
+#endif
+
+#define STBTT__COMPARE(a,b)  ((a)->y0 < (b)->y0) // strict "less than" on the y0 field of two pointed-to edges; used by both edge sorts below
+
+static void stbtt__sort_edges_ins_sort(stbtt__edge *p, int n)
+{
+   // Insertion sort of p[0..n-1] into ascending y0 order. Elements whose y0
+   // compare equal keep their relative order, because shifting stops as soon
+   // as the held edge is not strictly less than its left neighbour.
+   int cur;
+   for (cur = 1; cur < n; ++cur) {
+      stbtt__edge held = p[cur];
+      int slot = cur;
+
+      // shift larger elements one place right until held's slot is found
+      while (slot > 0 && STBTT__COMPARE(&held, &p[slot-1])) {
+         p[slot] = p[slot-1];
+         --slot;
+      }
+
+      if (slot != cur)
+         p[slot] = held;
+   }
+}
+
+static void stbtt__sort_edges_quicksort(stbtt__edge *p, int n)
+{ // partially sorts p[0..n-1] by y0: partitions recursively but leaves any range of 12 or fewer elements unsorted for the insertion sort pass
+   /* threshold for transitioning to insertion sort */
+   while (n > 12) {
+      stbtt__edge t;
+      int c01,c12,c,m,i,j;
+
+      /* compute median of three */
+      m = n >> 1;
+      c01 = STBTT__COMPARE(&p[0],&p[m]);
+      c12 = STBTT__COMPARE(&p[m],&p[n-1]);
+      /* if 0 >= mid >= end, or 0 < mid < end, then use mid */
+      if (c01 != c12) {
+         /* otherwise, we'll need to swap something else to middle */
+         int z;
+         c = STBTT__COMPARE(&p[0],&p[n-1]);
+         /* 0>mid && mid<n:  0>n => n; 0<n => 0 */
+         /* 0<mid && mid>n:  0>n => 0; 0<n => n */
+         z = (c == c12) ? 0 : n-1;
+         t = p[z];
+         p[z] = p[m];
+         p[m] = t;
+      }
+      /* now p[m] is the median-of-three */
+      /* swap it to the beginning so it won't move around */
+      t = p[0];
+      p[0] = p[m];
+      p[m] = t;
+
+      /* partition loop */
+      i=1;
+      j=n-1;
+      for(;;) {
+         /* handling of equality is crucial here */
+         /* for sentinels & efficiency with duplicates */
+         for (;;++i) { /* scan right for an element not less than the pivot p[0] */
+            if (!STBTT__COMPARE(&p[i], &p[0])) break;
+         }
+         for (;;--j) { /* scan left for an element not greater than the pivot p[0] */
+            if (!STBTT__COMPARE(&p[0], &p[j])) break;
+         }
+         /* make sure we haven't crossed */
+         if (i >= j) break;
+         t = p[i];
+         p[i] = p[j];
+         p[j] = t;
+
+         ++i;
+         --j;
+      }
+      /* recurse on smaller side, iterate on larger */
+      if (j < (n-i)) {
+         stbtt__sort_edges_quicksort(p,j);
+         p = p+i;
+         n = n-i;
+      } else {
+         stbtt__sort_edges_quicksort(p+i, n-i);
+         n = j;
+      }
+   }
+}
+
+static void stbtt__sort_edges(stbtt__edge *p, int n)
+{  // Sort p[0..n) by ascending y0 in two passes.
+   stbtt__sort_edges_quicksort(p, n); // coarse pass: leaves short runs unsorted
+   stbtt__sort_edges_ins_sort(p, n);  // finishing pass over the whole array
+}
+
+typedef struct
+{
+   float x,y; // one vertex of a flattened contour, as written by stbtt__add_point
+} stbtt__point;
+
+static void stbtt__rasterize(stbtt__bitmap *result, stbtt__point *pts, int *wcount, int windings, float scale_x, float scale_y, float shift_x, float shift_y, int off_x, int off_y, int invert, void *userdata)
+{  // Turn 'windings' closed polylines (points in pts, per-contour lengths in wcount) into a y-sorted edge list and scan-convert it into 'result'.
+   float y_scale_inv = invert ? -scale_y : scale_y; // negate the y scale when 'invert' is set
+   stbtt__edge *e;
+   int n,i,j,k,m;
+#if STBTT_RASTERIZER_VERSION == 1
+   int vsubsample = result->h < 8 ? 15 : 5;
+#elif STBTT_RASTERIZER_VERSION == 2
+   int vsubsample = 1;
+#else
+   #error "Unrecognized value of STBTT_RASTERIZER_VERSION"
+#endif
+   // vsubsample should divide 255 evenly; otherwise we won't reach full opacity
+
+   // now we have to blow out the windings into explicit edge lists
+   n = 0;
+   for (i=0; i < windings; ++i)
+      n += wcount[i];
+
+   e = (stbtt__edge *) STBTT_malloc(sizeof(*e) * (n+1), userdata); // add an extra one as a sentinel
+   if (e == 0) return; // allocation failed: silently draw nothing
+   n = 0; // reused below as the count of edges actually emitted
+
+   m=0; // index of the current contour's first point within pts
+   for (i=0; i < windings; ++i) {
+      stbtt__point *p = pts + m;
+      m += wcount[i];
+      j = wcount[i]-1; // start with the closing edge: last point -> first point
+      for (k=0; k < wcount[i]; j=k++) {
+         int a=k,b=j;
+         // skip the edge if horizontal
+         if (p[j].y == p[k].y)
+            continue;
+         // add edge from j to k to the list
+         e[n].invert = 0;
+         if (invert ? p[j].y > p[k].y : p[j].y < p[k].y) {
+            e[n].invert = 1; // flag that the endpoints are taken in j,k order instead of k,j
+            a=j,b=k;
+         }
+         e[n].x0 = p[a].x * scale_x + shift_x;
+         e[n].y0 = (p[a].y * y_scale_inv + shift_y) * vsubsample;
+         e[n].x1 = p[b].x * scale_x + shift_x;
+         e[n].y1 = (p[b].y * y_scale_inv + shift_y) * vsubsample;
+         ++n;
+      }
+   }
+
+   // now sort the edges by their highest point (should snap to integer, and then by x)
+   //STBTT_sort(e, n, sizeof(e[0]), stbtt__edge_compare);
+   stbtt__sort_edges(e, n);
+
+   // now, traverse the scanlines and find the intersections on each scanline, use xor winding rule
+   stbtt__rasterize_sorted_edges(result, e, n, vsubsample, off_x, off_y, userdata);
+
+   STBTT_free(e, userdata);
+}
+
+static void stbtt__add_point(stbtt__point *points, int n, float x, float y)
+{  // Write (x,y) into slot n; with no array yet (the first, counting pass) do nothing.
+   if (points != 0) {
+      points[n].x = x, points[n].y = y;
+   }
+}
+
+// Recursively split the quadratic bezier (x0,y0)-(x1,y1)-(x2,y2) until flat enough, appending segment end points; always returns 1. @TODO warped to compensate for non-linear stretching
+static int stbtt__tesselate_curve(stbtt__point *points, int *num_points, float x0, float y0, float x1, float y1, float x2, float y2, float objspace_flatness_squared, int n)
+{  // 'n' is the current recursion depth; *num_points is advanced even when 'points' is NULL (counting pass).
+   // midpoint
+   float mx = (x0 + 2*x1 + x2)/4;
+   float my = (y0 + 2*y1 + y2)/4;
+   // versus directly drawn line
+   float dx = (x0+x2)/2 - mx;
+   float dy = (y0+y2)/2 - my;
+   if (n > 16) // 65536 segments on one curve better be enough!
+      return 1; // depth cap reached: this piece is dropped without emitting a point
+   if (dx*dx+dy*dy > objspace_flatness_squared) { // half-pixel error allowed... need to be smaller if AA
+      stbtt__tesselate_curve(points, num_points, x0,y0, (x0+x1)/2.0f,(y0+y1)/2.0f, mx,my, objspace_flatness_squared,n+1);
+      stbtt__tesselate_curve(points, num_points, mx,my, (x1+x2)/2.0f,(y1+y2)/2.0f, x2,y2, objspace_flatness_squared,n+1);
+   } else {
+      stbtt__add_point(points, *num_points,x2,y2); // flat enough: emit only the end point
+      *num_points = *num_points+1;
+   }
+   return 1;
+}
+
+static void stbtt__tesselate_cubic(stbtt__point *points, int *num_points, float x0, float y0, float x1, float y1, float x2, float y2, float x3, float y3, float objspace_flatness_squared, int n)
+{  // Recursively split the cubic bezier (x0,y0)..(x3,y3) until flat enough, appending segment end points; 'n' is the recursion depth.
+   // @TODO this "flatness" calculation is just made-up nonsense that seems to work well enough
+   float dx0 = x1-x0;
+   float dy0 = y1-y0;
+   float dx1 = x2-x1;
+   float dy1 = y2-y1;
+   float dx2 = x3-x2;
+   float dy2 = y3-y2;
+   float dx = x3-x0;
+   float dy = y3-y0;
+   float longlen = (float) (STBTT_sqrt(dx0*dx0+dy0*dy0)+STBTT_sqrt(dx1*dx1+dy1*dy1)+STBTT_sqrt(dx2*dx2+dy2*dy2)); // length of the control polygon
+   float shortlen = (float) STBTT_sqrt(dx*dx+dy*dy); // length of the straight chord from start to end
+   float flatness_squared = longlen*longlen-shortlen*shortlen;
+
+   if (n > 16) // 65536 segments on one curve better be enough!
+      return; // depth cap reached: this piece is dropped without emitting a point
+
+   if (flatness_squared > objspace_flatness_squared) {
+      float x01 = (x0+x1)/2; // midpoints of the three control-polygon legs
+      float y01 = (y0+y1)/2;
+      float x12 = (x1+x2)/2;
+      float y12 = (y1+y2)/2;
+      float x23 = (x2+x3)/2;
+      float y23 = (y2+y3)/2;
+
+      float xa = (x01+x12)/2; // midpoints of those midpoints
+      float ya = (y01+y12)/2;
+      float xb = (x12+x23)/2;
+      float yb = (y12+y23)/2;
+
+      float mx = (xa+xb)/2; // split point shared by the two halves
+      float my = (ya+yb)/2;
+
+      stbtt__tesselate_cubic(points, num_points, x0,y0, x01,y01, xa,ya, mx,my, objspace_flatness_squared,n+1);
+      stbtt__tesselate_cubic(points, num_points, mx,my, xb,yb, x23,y23, x3,y3, objspace_flatness_squared,n+1);
+   } else {
+      stbtt__add_point(points, *num_points,x3,y3); // flat enough: emit only the end point
+      *num_points = *num_points+1;
+   }
+}
+
+// returns the flattened point array (NULL if there are no contours or on allocation failure); contour count goes to *num_contours, per-contour point counts to *contour_lengths
+static stbtt__point *stbtt_FlattenCurves(stbtt_vertex *vertices, int num_verts, float objspace_flatness, int **contour_lengths, int *num_contours, void *userdata)
+{
+   stbtt__point *points=0;
+   int num_points=0;
+
+   float objspace_flatness_squared = objspace_flatness * objspace_flatness;
+   int i,n=0,start=0, pass;
+
+   // count how many "moves" there are to get the contour count
+   for (i=0; i < num_verts; ++i)
+      if (vertices[i].type == STBTT_vmove)
+         ++n;
+
+   *num_contours = n;
+   if (n == 0) return 0; // nothing allocated in this case; *contour_lengths is left untouched
+
+   *contour_lengths = (int *) STBTT_malloc(sizeof(**contour_lengths) * n, userdata);
+
+   if (*contour_lengths == 0) {
+      *num_contours = 0;
+      return 0;
+   }
+
+   // make two passes through the points so we don't need to realloc
+   for (pass=0; pass < 2; ++pass) { // pass 0 only counts (points is NULL), pass 1 stores
+      float x=0,y=0; // current pen position = end of the previous vertex
+      if (pass == 1) {
+         points = (stbtt__point *) STBTT_malloc(num_points * sizeof(points[0]), userdata);
+         if (points == NULL) goto error;
+      }
+      num_points = 0;
+      n= -1; // index of the contour being filled; -1 until the first move
+      for (i=0; i < num_verts; ++i) {
+         switch (vertices[i].type) {
+            case STBTT_vmove:
+               // start the next contour
+               if (n >= 0)
+                  (*contour_lengths)[n] = num_points - start;
+               ++n;
+               start = num_points;
+
+               x = vertices[i].x, y = vertices[i].y;
+               stbtt__add_point(points, num_points++, x,y);
+               break;
+            case STBTT_vline:
+               x = vertices[i].x, y = vertices[i].y;
+               stbtt__add_point(points, num_points++, x, y);
+               break;
+            case STBTT_vcurve:
+               stbtt__tesselate_curve(points, &num_points, x,y,
+                                        vertices[i].cx, vertices[i].cy,
+                                        vertices[i].x,  vertices[i].y,
+                                        objspace_flatness_squared, 0);
+               x = vertices[i].x, y = vertices[i].y;
+               break;
+            case STBTT_vcubic:
+               stbtt__tesselate_cubic(points, &num_points, x,y,
+                                        vertices[i].cx, vertices[i].cy,
+                                        vertices[i].cx1, vertices[i].cy1,
+                                        vertices[i].x,  vertices[i].y,
+                                        objspace_flatness_squared, 0);
+               x = vertices[i].x, y = vertices[i].y;
+               break;
+         }
+      }
+      (*contour_lengths)[n] = num_points - start; // close out the last contour
+   }
+
+   return points;
+error: // only reached when the point array could not be allocated
+   STBTT_free(points, userdata);
+   STBTT_free(*contour_lengths, userdata);
+   *contour_lengths = 0;
+   *num_contours = 0;
+   return NULL;
+}
+
+// Flatten the vertex list into polylines, scan-convert them into 'result', then free the temporary arrays.
+STBTT_DEF void stbtt_Rasterize(stbtt__bitmap *result, float flatness_in_pixels, stbtt_vertex *vertices, int num_verts, float scale_x, float scale_y, float shift_x, float shift_y, int x_off, int y_off, int invert, void *userdata)
+{
+   int *contour_sizes = NULL;
+   int contour_count  = 0;
+   float min_scale    = scale_y < scale_x ? scale_y : scale_x; // the smaller scale converts the pixel tolerance to object space
+   stbtt__point *outline = stbtt_FlattenCurves(vertices, num_verts, flatness_in_pixels / min_scale, &contour_sizes, &contour_count, userdata);
+   if (!outline) return; // no contours or allocation failure; stbtt_FlattenCurves hands back nothing to free then
+   stbtt__rasterize(result, outline, contour_sizes, contour_count, scale_x, scale_y, shift_x, shift_y, x_off, y_off, invert, userdata);
+   STBTT_free(contour_sizes, userdata);
+   STBTT_free(outline, userdata);
+}
+
+STBTT_DEF void stbtt_FreeBitmap(unsigned char *bitmap, void *userdata)
+{  // Release a bitmap by handing it, with the allocator context 'userdata', to STBTT_free.
+   STBTT_free(bitmap, userdata);
+}
+
+STBTT_DEF unsigned char *stbtt_GetGlyphBitmapSubpixel(const stbtt_fontinfo *info, float scale_x, float scale_y, float shift_x, float shift_y, int glyph, int *width, int *height, int *xoff, int *yoff)
+{  // Allocate and render a tightly sized 8-bit bitmap for 'glyph'; returns NULL when both scales are 0, the box is empty, or allocation fails.
+   int ix0,iy0,ix1,iy1;
+   stbtt__bitmap gbm;
+   stbtt_vertex *vertices;
+   int num_verts = stbtt_GetGlyphShape(info, glyph, &vertices);
+
+   if (scale_x == 0) scale_x = scale_y; // a zero scale on one axis means "same as the other axis"
+   if (scale_y == 0) {
+      if (scale_x == 0) { // both zero: nothing sensible to render (the output pointers are not written)
+         STBTT_free(vertices, info->userdata);
+         return NULL;
+      }
+      scale_y = scale_x;
+   }
+
+   stbtt_GetGlyphBitmapBoxSubpixel(info, glyph, scale_x, scale_y, shift_x, shift_y, &ix0,&iy0,&ix1,&iy1);
+
+   // now we get the size
+   gbm.w = (ix1 - ix0);
+   gbm.h = (iy1 - iy0);
+   gbm.pixels = NULL; // in case we error
+
+   if (width ) *width  = gbm.w; // every output pointer is optional
+   if (height) *height = gbm.h;
+   if (xoff  ) *xoff   = ix0;
+   if (yoff  ) *yoff   = iy0;
+
+   if (gbm.w && gbm.h) {
+      gbm.pixels = (unsigned char *) STBTT_malloc(gbm.w * gbm.h, info->userdata);
+      if (gbm.pixels) {
+         gbm.stride = gbm.w; // rows are packed with no padding
+
+         stbtt_Rasterize(&gbm, 0.35f, vertices, num_verts, scale_x, scale_y, shift_x, shift_y, ix0, iy0, 1, info->userdata);
+      }
+   }
+   STBTT_free(vertices, info->userdata);
+   return gbm.pixels; // allocated with STBTT_malloc and info->userdata
+}
+
+STBTT_DEF unsigned char *stbtt_GetGlyphBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int glyph, int *width, int *height, int *xoff, int *yoff)
+{  // Same as stbtt_GetGlyphBitmapSubpixel with a zero subpixel shift on both axes.
+   return stbtt_GetGlyphBitmapSubpixel(info, scale_x, scale_y, 0.0f, 0.0f, glyph, width, height, xoff, yoff);
+}
+
+STBTT_DEF void stbtt_MakeGlyphBitmapSubpixel(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int glyph)
+{  // Render 'glyph' into the caller-supplied buffer 'output' (out_w x out_h, rows out_stride bytes apart); allocates no bitmap itself.
+   int ix0,iy0;
+   stbtt_vertex *vertices;
+   int num_verts = stbtt_GetGlyphShape(info, glyph, &vertices);
+   stbtt__bitmap gbm;
+
+   stbtt_GetGlyphBitmapBoxSubpixel(info, glyph, scale_x, scale_y, shift_x, shift_y, &ix0,&iy0,0,0); // only the top-left corner is needed here
+   gbm.pixels = output;
+   gbm.w = out_w;
+   gbm.h = out_h;
+   gbm.stride = out_stride;
+
+   if (gbm.w && gbm.h) // skip rasterizing when either dimension is zero
+      stbtt_Rasterize(&gbm, 0.35f, vertices, num_verts, scale_x, scale_y, shift_x, shift_y, ix0,iy0, 1, info->userdata);
+
+   STBTT_free(vertices, info->userdata);
+}
+
+STBTT_DEF void stbtt_MakeGlyphBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int glyph)
+{  // Same as stbtt_MakeGlyphBitmapSubpixel with a zero subpixel shift on both axes.
+   stbtt_MakeGlyphBitmapSubpixel(info, output, out_w, out_h, out_stride, scale_x, scale_y, 0.0f,0.0f, glyph);
+}
+
+STBTT_DEF unsigned char *stbtt_GetCodepointBitmapSubpixel(const stbtt_fontinfo *info, float scale_x, float scale_y, float shift_x, float shift_y, int codepoint, int *width, int *height, int *xoff, int *yoff)
+{  // Codepoint front end: maps 'codepoint' to a glyph index, then forwards to stbtt_GetGlyphBitmapSubpixel.
+   return stbtt_GetGlyphBitmapSubpixel(info, scale_x, scale_y,shift_x,shift_y, stbtt_FindGlyphIndex(info,codepoint), width,height,xoff,yoff);
+}
+
+STBTT_DEF void stbtt_MakeCodepointBitmapSubpixelPrefilter(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int oversample_x, int oversample_y, float *sub_x, float *sub_y, int codepoint)
+{  // Codepoint front end: maps 'codepoint' to a glyph index, then forwards to stbtt_MakeGlyphBitmapSubpixelPrefilter.
+   stbtt_MakeGlyphBitmapSubpixelPrefilter(info, output, out_w, out_h, out_stride, scale_x, scale_y, shift_x, shift_y, oversample_x, oversample_y, sub_x, sub_y, stbtt_FindGlyphIndex(info,codepoint));
+}
+
+STBTT_DEF void stbtt_MakeCodepointBitmapSubpixel(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int codepoint)
+{  // Codepoint front end: maps 'codepoint' to a glyph index, then forwards to stbtt_MakeGlyphBitmapSubpixel.
+   stbtt_MakeGlyphBitmapSubpixel(info, output, out_w, out_h, out_stride, scale_x, scale_y, shift_x, shift_y, stbtt_FindGlyphIndex(info,codepoint));
+}
+
+STBTT_DEF unsigned char *stbtt_GetCodepointBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int codepoint, int *width, int *height, int *xoff, int *yoff)
+{  // Same as stbtt_GetCodepointBitmapSubpixel with a zero subpixel shift on both axes.
+   return stbtt_GetCodepointBitmapSubpixel(info, scale_x, scale_y, 0.0f,0.0f, codepoint, width,height,xoff,yoff);
+}
+
+STBTT_DEF void stbtt_MakeCodepointBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int codepoint)
+{  // Same as stbtt_MakeCodepointBitmapSubpixel with a zero subpixel shift on both axes.
+   stbtt_MakeCodepointBitmapSubpixel(info, output, out_w, out_h, out_stride, scale_x, scale_y, 0.0f,0.0f, codepoint);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// bitmap baking
+//
+// This is SUPER-CRAPPY packing to keep source code small
+
+static int stbtt_BakeFontBitmap_internal(unsigned char *data, int offset,  // font location (use offset=0 for plain .ttf)
+                                float pixel_height,                     // height of font in pixels
+                                unsigned char *pixels, int pw, int ph,  // bitmap to be filled in
+                                int first_char, int num_chars,          // characters to bake
+                                stbtt_bakedchar *chardata)
+{  // Returns -1 if the font cannot be initialised, -i if character i was the first that did not fit, else the first unused bitmap row.
+   float scale;
+   int x,y,bottom_y, i;
+   stbtt_fontinfo f;
+   f.userdata = NULL;
+   if (!stbtt_InitFont(&f, data, offset))
+      return -1;
+   STBTT_memset(pixels, 0, pw*ph); // background of 0 around pixels
+   x=y=1; // start one pixel in so every glyph keeps a zero border
+   bottom_y = 1;
+
+   scale = stbtt_ScaleForPixelHeight(&f, pixel_height);
+
+   for (i=0; i < num_chars; ++i) {
+      int advance, lsb, x0,y0,x1,y1,gw,gh;
+      int g = stbtt_FindGlyphIndex(&f, first_char + i);
+      stbtt_GetGlyphHMetrics(&f, g, &advance, &lsb);
+      stbtt_GetGlyphBitmapBox(&f, g, scale,scale, &x0,&y0,&x1,&y1);
+      gw = x1-x0;
+      gh = y1-y0;
+      if (x + gw + 1 >= pw)
+         y = bottom_y, x = 1; // advance to next row
+      if (y + gh + 1 >= ph) // check if it fits vertically AFTER potentially moving to next row
+         return -i; // NOTE: yields 0 when the very first character does not fit
+      STBTT_assert(x+gw < pw);
+      STBTT_assert(y+gh < ph);
+      stbtt_MakeGlyphBitmap(&f, pixels+x+y*pw, gw,gh,pw, scale,scale, g);
+      chardata[i].x0 = (stbtt_int16) x;
+      chardata[i].y0 = (stbtt_int16) y;
+      chardata[i].x1 = (stbtt_int16) (x + gw);
+      chardata[i].y1 = (stbtt_int16) (y + gh);
+      chardata[i].xadvance = scale * advance;
+      chardata[i].xoff     = (float) x0;
+      chardata[i].yoff     = (float) y0;
+      x = x + gw + 1; // leave a one-pixel gap before the next glyph
+      if (y+gh+1 > bottom_y)
+         bottom_y = y+gh+1;
+   }
+   return bottom_y;
+}
+
+STBTT_DEF void stbtt_GetBakedQuad(const stbtt_bakedchar *chardata, int pw, int ph, int char_index, float *xpos, float *ypos, stbtt_aligned_quad *q, int opengl_fillrule)
+{  // Fill 'q' with screen and texture coordinates for baked character 'char_index' at (*xpos,*ypos), then advance *xpos.
+   float d3d_bias = opengl_fillrule ? 0 : -0.5f; // half-pixel shift applied only when opengl_fillrule is 0
+   float ipw = 1.0f / pw, iph = 1.0f / ph; // reciprocals used to normalise texel coordinates
+   const stbtt_bakedchar *b = chardata + char_index;
+   int round_x = STBTT_ifloor((*xpos + b->xoff) + 0.5f); // adding 0.5 before the floor rounds to the nearest whole pixel
+   int round_y = STBTT_ifloor((*ypos + b->yoff) + 0.5f);
+
+   q->x0 = round_x + d3d_bias;
+   q->y0 = round_y + d3d_bias;
+   q->x1 = round_x + b->x1 - b->x0 + d3d_bias;
+   q->y1 = round_y + b->y1 - b->y0 + d3d_bias;
+
+   q->s0 = b->x0 * ipw;
+   q->t0 = b->y0 * iph;
+   q->s1 = b->x1 * ipw;
+   q->t1 = b->y1 * iph;
+
+   *xpos += b->xadvance; // *ypos is read but never modified
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// rectangle packing replacement routines if you don't have stb_rect_pack.h
+//
+
+#ifndef STB_RECT_PACK_VERSION
+
+typedef int stbrp_coord; // coordinate type of the fallback packer; only defined when STB_RECT_PACK_VERSION is not
+
+////////////////////////////////////////////////////////////////////////////////////
+//                                                                                //
+//                                                                                //
+// COMPILER WARNING ?!?!?                                                         //
+//                                                                                //
+//                                                                                //
+// if you get a compile warning due to these symbols being defined more than      //
+// once, move #include "stb_rect_pack.h" before #include "stb_truetype.h"         //
+//                                                                                //
+////////////////////////////////////////////////////////////////////////////////////
+
+typedef struct
+{
+   int width,height; // size of the packing area
+   int x,y,bottom_y; // cursor for the next rect, and the lowest y used so far (start of the next row)
+} stbrp_context;
+
+typedef struct
+{
+   unsigned char x; // placeholder: the fallback stbrp_init_target marks its node arguments as unused
+} stbrp_node;
+
+struct stbrp_rect
+{
+   stbrp_coord x,y; // position, written by stbrp_pack_rects
+   int id,w,h,was_packed; // w,h are the requested size; was_packed is set to 1 or 0 by stbrp_pack_rects
+};
+
+// Fallback replacement for the stb_rect_pack initialiser: remember the target
+// size and reset the packing cursor to the origin.
+static void stbrp_init_target(stbrp_context *con, int pw, int ph, stbrp_node *nodes, int num_nodes)
+{
+   STBTT__NOTUSED(nodes);     // this fallback packer ignores the node storage
+   STBTT__NOTUSED(num_nodes);
+   con->x = con->y = con->bottom_y = 0;
+   con->width  = pw;
+   con->height = ph;
+}
+
+static void stbrp_pack_rects(stbrp_context *con, stbrp_rect *rects, int num_rects)
+{  // Fallback row packer: place rects left to right in input order, wrapping rows; stop at the first one that does not fit.
+   int idx;
+   for (idx = 0; idx < num_rects; ++idx) {
+      stbrp_rect *r = &rects[idx];
+      if (con->x + r->w > con->width) {   // row is full: wrap to the start of the next row
+         con->x = 0;
+         con->y = con->bottom_y;
+      }
+      if (con->y + r->h > con->height)    // out of vertical space
+         break;
+      r->x = con->x;
+      r->y = con->y;
+      r->was_packed = 1;
+      con->x += r->w;
+      if (con->bottom_y < con->y + r->h)  // track the lowest edge reached so far
+         con->bottom_y = con->y + r->h;
+   }
+   for (; idx < num_rects; ++idx) rects[idx].was_packed = 0; // the failing rect and all after it stay unpacked
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// bitmap baking
+//
+// This is SUPER-AWESOME (tm Ryan Gordon) packing using stb_rect_pack.h. If
+// stb_rect_pack.h isn't available, it uses the BakeFontBitmap strategy.
+
+STBTT_DEF int stbtt_PackBegin(stbtt_pack_context *spc, unsigned char *pixels, int pw, int ph, int stride_in_bytes, int padding, void *alloc_context)
+{  // Initialise 'spc' for packing into a pw x ph bitmap; returns 1 on success, 0 if either allocation fails.
+   stbrp_context *context = (stbrp_context *) STBTT_malloc(sizeof(*context)            ,alloc_context);
+   int            num_nodes = pw - padding;
+   stbrp_node    *nodes   = (stbrp_node    *) STBTT_malloc(sizeof(*nodes  ) * num_nodes,alloc_context);
+
+   if (context == NULL || nodes == NULL) { // free whichever allocation succeeded; 'spc' is left untouched
+      if (context != NULL) STBTT_free(context, alloc_context);
+      if (nodes   != NULL) STBTT_free(nodes  , alloc_context);
+      return 0;
+   }
+
+   spc->user_allocator_context = alloc_context;
+   spc->width = pw;
+   spc->height = ph;
+   spc->pixels = pixels;
+   spc->pack_info = context;
+   spc->nodes = nodes;
+   spc->padding = padding;
+   spc->stride_in_bytes = stride_in_bytes != 0 ? stride_in_bytes : pw; // a stride of 0 means tightly packed rows
+   spc->h_oversample = 1;
+   spc->v_oversample = 1;
+   spc->skip_missing = 0;
+
+   stbrp_init_target(context, pw-padding, ph-padding, nodes, num_nodes);
+
+   if (pixels) // 'pixels' may be NULL, in which case nothing is cleared
+      STBTT_memset(pixels, 0, pw*ph); // background of 0 around pixels
+
+   return 1;
+}
+
+STBTT_DEF void stbtt_PackEnd  (stbtt_pack_context *spc)
+{  // Free the two allocations made by stbtt_PackBegin; the pixel buffer is not touched.
+   STBTT_free(spc->nodes    , spc->user_allocator_context);
+   STBTT_free(spc->pack_info, spc->user_allocator_context);
+}
+
+STBTT_DEF void stbtt_PackSetOversampling(stbtt_pack_context *spc, unsigned int h_oversample, unsigned int v_oversample)
+{  // Set the oversampling factors used for subsequent packing; each axis is validated independently.
+   STBTT_assert(h_oversample <= STBTT_MAX_OVERSAMPLE);
+   STBTT_assert(v_oversample <= STBTT_MAX_OVERSAMPLE);
+   if (h_oversample <= STBTT_MAX_OVERSAMPLE) // with asserts disabled, an out-of-range value is ignored and the old setting kept
+      spc->h_oversample = h_oversample;
+   if (v_oversample <= STBTT_MAX_OVERSAMPLE)
+      spc->v_oversample = v_oversample;
+}
+
+STBTT_DEF void stbtt_PackSetSkipMissingCodepoints(stbtt_pack_context *spc, int skip)
+{  // Store the flag that makes the gather step give every glyph-0 (missing) codepoint an empty rect.
+   spc->skip_missing = skip;
+}
+
+#define STBTT__OVER_MASK  (STBTT_MAX_OVERSAMPLE-1) // index mask for the prefilter ring buffers; assumes STBTT_MAX_OVERSAMPLE is a power of two - TODO confirm at its definition
+
+static void stbtt__h_prefilter(unsigned char *pixels, int w, int h, int stride_in_bytes, unsigned int kernel_width)
+{  // In-place horizontal box filter: each output pixel becomes the sum of the last kernel_width inputs of its row divided by kernel_width.
+   unsigned char buffer[STBTT_MAX_OVERSAMPLE]; // ring buffer of recent input pixels, indexed modulo STBTT__OVER_MASK+1
+   int safe_w = w - kernel_width; // last column that is read as input; later columns are expected to be zero
+   int j;
+   STBTT_memset(buffer, 0, STBTT_MAX_OVERSAMPLE); // suppress bogus warning from VS2013 -analyze
+   for (j=0; j < h; ++j) {
+      int i;
+      unsigned int total; // running sum of the pixels currently inside the window
+      STBTT_memset(buffer, 0, kernel_width);
+
+      total = 0;
+
+      // make kernel_width a constant in common cases so compiler can optimize out the divide
+      switch (kernel_width) {
+         case 2:
+            for (i=0; i <= safe_w; ++i) {
+               total += pixels[i] - buffer[i & STBTT__OVER_MASK]; // add the new pixel, drop the one leaving the window
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i]; // remember it so it is subtracted kernel_width steps later
+               pixels[i] = (unsigned char) (total / 2);
+            }
+            break;
+         case 3:
+            for (i=0; i <= safe_w; ++i) {
+               total += pixels[i] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i];
+               pixels[i] = (unsigned char) (total / 3);
+            }
+            break;
+         case 4:
+            for (i=0; i <= safe_w; ++i) {
+               total += pixels[i] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i];
+               pixels[i] = (unsigned char) (total / 4);
+            }
+            break;
+         case 5:
+            for (i=0; i <= safe_w; ++i) {
+               total += pixels[i] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i];
+               pixels[i] = (unsigned char) (total / 5);
+            }
+            break;
+         default:
+            for (i=0; i <= safe_w; ++i) {
+               total += pixels[i] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i];
+               pixels[i] = (unsigned char) (total / kernel_width);
+            }
+            break;
+      }
+
+      for (; i < w; ++i) { // tail: no new input is added, the window only drains
+         STBTT_assert(pixels[i] == 0);
+         total -= buffer[i & STBTT__OVER_MASK];
+         pixels[i] = (unsigned char) (total / kernel_width);
+      }
+
+      pixels += stride_in_bytes; // next row
+   }
+}
+
+static void stbtt__v_prefilter(unsigned char *pixels, int w, int h, int stride_in_bytes, unsigned int kernel_width)
+{  // In-place vertical box filter: each output pixel becomes the sum of the last kernel_width inputs of its column divided by kernel_width.
+   unsigned char buffer[STBTT_MAX_OVERSAMPLE]; // ring buffer of recent input pixels, indexed modulo STBTT__OVER_MASK+1
+   int safe_h = h - kernel_width; // last row that is read as input; later rows are expected to be zero
+   int j;
+   STBTT_memset(buffer, 0, STBTT_MAX_OVERSAMPLE); // suppress bogus warning from VS2013 -analyze
+   for (j=0; j < w; ++j) {
+      int i;
+      unsigned int total; // running sum of the pixels currently inside the window
+      STBTT_memset(buffer, 0, kernel_width);
+
+      total = 0;
+
+      // make kernel_width a constant in common cases so compiler can optimize out the divide
+      switch (kernel_width) {
+         case 2:
+            for (i=0; i <= safe_h; ++i) {
+               total += pixels[i*stride_in_bytes] - buffer[i & STBTT__OVER_MASK]; // add the new pixel, drop the one leaving the window
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i*stride_in_bytes]; // remember it so it is subtracted kernel_width steps later
+               pixels[i*stride_in_bytes] = (unsigned char) (total / 2);
+            }
+            break;
+         case 3:
+            for (i=0; i <= safe_h; ++i) {
+               total += pixels[i*stride_in_bytes] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i*stride_in_bytes];
+               pixels[i*stride_in_bytes] = (unsigned char) (total / 3);
+            }
+            break;
+         case 4:
+            for (i=0; i <= safe_h; ++i) {
+               total += pixels[i*stride_in_bytes] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i*stride_in_bytes];
+               pixels[i*stride_in_bytes] = (unsigned char) (total / 4);
+            }
+            break;
+         case 5:
+            for (i=0; i <= safe_h; ++i) {
+               total += pixels[i*stride_in_bytes] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i*stride_in_bytes];
+               pixels[i*stride_in_bytes] = (unsigned char) (total / 5);
+            }
+            break;
+         default:
+            for (i=0; i <= safe_h; ++i) {
+               total += pixels[i*stride_in_bytes] - buffer[i & STBTT__OVER_MASK];
+               buffer[(i+kernel_width) & STBTT__OVER_MASK] = pixels[i*stride_in_bytes];
+               pixels[i*stride_in_bytes] = (unsigned char) (total / kernel_width);
+            }
+            break;
+      }
+
+      for (; i < h; ++i) { // tail: no new input is added, the window only drains
+         STBTT_assert(pixels[i*stride_in_bytes] == 0);
+         total -= buffer[i & STBTT__OVER_MASK];
+         pixels[i*stride_in_bytes] = (unsigned char) (total / kernel_width);
+      }
+
+      pixels += 1; // next column
+   }
+}
+
+static float stbtt__oversample_shift(int oversample)
+{
+   float numer, denom;
+   // oversample == 0 would divide by zero below; report "no shift" instead
+   if (oversample == 0) return 0.0f;
+   // A box filter "oversample" pixels wide shifts phase by (oversample - 1)/2
+   // oversampled pixels; return the opposite shift, divided by "oversample".
+   numer = (float) -(oversample - 1);
+   denom = 2.0f * (float) oversample;
+   return numer / denom;
+}
+
+// rects array must be big enough to accommodate all characters in the given ranges; fills in each rect's w,h and returns how many rects were written
+STBTT_DEF int stbtt_PackFontRangesGatherRects(stbtt_pack_context *spc, const stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects)
+{
+   int i,j,k;
+   int missing_glyph_added = 0; // set once the first glyph-0 rect has been sized; shared across ALL ranges
+
+   k=0;
+   for (i=0; i < num_ranges; ++i) {
+      float fh = ranges[i].font_size;
+      float scale = fh > 0 ? stbtt_ScaleForPixelHeight(info, fh) : stbtt_ScaleForMappingEmToPixels(info, -fh); // a font_size that is not positive is negated and mapped via em size instead
+      ranges[i].h_oversample = (unsigned char) spc->h_oversample; // record the settings so the render step can reuse them
+      ranges[i].v_oversample = (unsigned char) spc->v_oversample;
+      for (j=0; j < ranges[i].num_chars; ++j) {
+         int x0,y0,x1,y1;
+         int codepoint = ranges[i].array_of_unicode_codepoints == NULL ? ranges[i].first_unicode_codepoint_in_range + j : ranges[i].array_of_unicode_codepoints[j];
+         int glyph = stbtt_FindGlyphIndex(info, codepoint);
+         if (glyph == 0 && (spc->skip_missing || missing_glyph_added)) {
+            rects[k].w = rects[k].h = 0; // empty rect: skipped, or a repeat of the already-sized missing glyph
+         } else {
+            stbtt_GetGlyphBitmapBoxSubpixel(info,glyph,
+                                            scale * spc->h_oversample,
+                                            scale * spc->v_oversample,
+                                            0,0,
+                                            &x0,&y0,&x1,&y1);
+            rects[k].w = (stbrp_coord) (x1-x0 + spc->padding + spc->h_oversample-1); // glyph box plus padding plus room for the filter to spread
+            rects[k].h = (stbrp_coord) (y1-y0 + spc->padding + spc->v_oversample-1);
+            if (glyph == 0)
+               missing_glyph_added = 1;
+         }
+         ++k;
+      }
+   }
+
+   return k;
+}
+
+STBTT_DEF void stbtt_MakeGlyphBitmapSubpixelPrefilter(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, float shift_x, float shift_y, int prefilter_x, int prefilter_y, float *sub_x, float *sub_y, int glyph)
+{  // Render 'glyph' into 'output', box-filter it by prefilter_x/prefilter_y, and report the resulting offsets through sub_x/sub_y.
+   stbtt_MakeGlyphBitmapSubpixel(info,
+                                 output,
+                                 out_w - (prefilter_x - 1), // render into a smaller area so the filter has room to spread
+                                 out_h - (prefilter_y - 1),
+                                 out_stride,
+                                 scale_x,
+                                 scale_y,
+                                 shift_x,
+                                 shift_y,
+                                 glyph);
+
+   if (prefilter_x > 1)
+      stbtt__h_prefilter(output, out_w, out_h, out_stride, prefilter_x);
+
+   if (prefilter_y > 1)
+      stbtt__v_prefilter(output, out_w, out_h, out_stride, prefilter_y);
+
+   *sub_x = stbtt__oversample_shift(prefilter_x); // written unconditionally: sub_x and sub_y must not be NULL
+   *sub_y = stbtt__oversample_shift(prefilter_y);
+}
+
+// rects array must be big enough to accommodate all characters in the given ranges; renders every packed rect and fills chardata_for_range; returns 1 if all succeeded, else 0
+STBTT_DEF int stbtt_PackFontRangesRenderIntoRects(stbtt_pack_context *spc, const stbtt_fontinfo *info, stbtt_pack_range *ranges, int num_ranges, stbrp_rect *rects)
+{
+   int i,j,k, return_value = 1;
+   const stbtt_packedchar *missing_glyph = NULL; // the rendered glyph-0 entry, in whichever range it occurred (a per-range index would be wrong in later ranges)
+   // save current values
+   int old_h_over = spc->h_oversample;
+   int old_v_over = spc->v_oversample;
+
+   k = 0;
+   for (i=0; i < num_ranges; ++i) {
+      float fh = ranges[i].font_size;
+      float scale = fh > 0 ? stbtt_ScaleForPixelHeight(info, fh) : stbtt_ScaleForMappingEmToPixels(info, -fh);
+      float recip_h,recip_v,sub_x,sub_y;
+      spc->h_oversample = ranges[i].h_oversample; // use the settings recorded for this range
+      spc->v_oversample = ranges[i].v_oversample;
+      recip_h = 1.0f / spc->h_oversample;
+      recip_v = 1.0f / spc->v_oversample;
+      sub_x = stbtt__oversample_shift(spc->h_oversample);
+      sub_y = stbtt__oversample_shift(spc->v_oversample);
+      for (j=0; j < ranges[i].num_chars; ++j) {
+         stbrp_rect *r = &rects[k];
+         if (r->was_packed && r->w != 0 && r->h != 0) {
+            stbtt_packedchar *bc = &ranges[i].chardata_for_range[j];
+            int advance, lsb, x0,y0,x1,y1;
+            int codepoint = ranges[i].array_of_unicode_codepoints == NULL ? ranges[i].first_unicode_codepoint_in_range + j : ranges[i].array_of_unicode_codepoints[j];
+            int glyph = stbtt_FindGlyphIndex(info, codepoint);
+            stbrp_coord pad = (stbrp_coord) spc->padding;
+
+            // pad on left and top
+            r->x += pad;
+            r->y += pad;
+            r->w -= pad;
+            r->h -= pad;
+            stbtt_GetGlyphHMetrics(info, glyph, &advance, &lsb);
+            stbtt_GetGlyphBitmapBox(info, glyph,
+                                    scale * spc->h_oversample,
+                                    scale * spc->v_oversample,
+                                    &x0,&y0,&x1,&y1);
+            stbtt_MakeGlyphBitmapSubpixel(info,
+                                          spc->pixels + r->x + r->y*spc->stride_in_bytes,
+                                          r->w - spc->h_oversample+1,
+                                          r->h - spc->v_oversample+1,
+                                          spc->stride_in_bytes,
+                                          scale * spc->h_oversample,
+                                          scale * spc->v_oversample,
+                                          0,0,
+                                          glyph);
+
+            if (spc->h_oversample > 1)
+               stbtt__h_prefilter(spc->pixels + r->x + r->y*spc->stride_in_bytes,
+                                  r->w, r->h, spc->stride_in_bytes,
+                                  spc->h_oversample);
+
+            if (spc->v_oversample > 1)
+               stbtt__v_prefilter(spc->pixels + r->x + r->y*spc->stride_in_bytes,
+                                  r->w, r->h, spc->stride_in_bytes,
+                                  spc->v_oversample);
+
+            bc->x0       = (stbtt_int16)  r->x;
+            bc->y0       = (stbtt_int16)  r->y;
+            bc->x1       = (stbtt_int16) (r->x + r->w);
+            bc->y1       = (stbtt_int16) (r->y + r->h);
+            bc->xadvance =                scale * advance;
+            bc->xoff     =       (float)  x0 * recip_h + sub_x;
+            bc->yoff     =       (float)  y0 * recip_v + sub_y;
+            bc->xoff2    =                (x0 + r->w) * recip_h + sub_x;
+            bc->yoff2    =                (y0 + r->h) * recip_v + sub_y;
+
+            if (glyph == 0)
+               missing_glyph = bc; // remember the entry itself so other ranges can copy it
+         } else if (spc->skip_missing) {
+            return_value = 0;
+         } else if (r->was_packed && r->w == 0 && r->h == 0 && missing_glyph != NULL) {
+            ranges[i].chardata_for_range[j] = *missing_glyph; // empty rect: reuse the already rendered missing glyph
+         } else {
+            return_value = 0; // if any fail, report failure
+         }
+
+         ++k;
+      }
+   }
+
+   // restore original values
+   spc->h_oversample = old_h_over;
+   spc->v_oversample = old_v_over;
+
+   return return_value;
+}
+
+// Second phase of the split packing API: hands the gathered glyph rectangles
+// to the rectangle packer stored in spc->pack_info.
+//   spc        pack context whose pack_info holds the packer state
+//   rects      rectangles to place
+//   num_rects  number of entries in rects
+STBTT_DEF void stbtt_PackFontRangesPackRects(stbtt_pack_context *spc, stbrp_rect *rects, int num_rects)
+{
+   stbrp_context *packer = (stbrp_context *) spc->pack_info;
+   stbrp_pack_rects(packer, rects, num_rects);
+}
+
+// Packs and renders every glyph described by `ranges` (taken from font number
+// `font_index` inside `fontdata`) into the atlas owned by `spc`.
+// Runs the three phases in order: gather rects, pack rects, render into rects.
+//   spc         pack context (atlas, allocator context, packer state)
+//   fontdata    raw font file contents
+//   font_index  index of the font inside the file/collection
+//   ranges      num_ranges range descriptors; their chardata arrays are filled in
+// Returns 1 on success. Returns 0 if the temporary rect array cannot be
+// allocated, if the font cannot be located or initialized, or if rendering
+// reports a failure.
+STBTT_DEF int stbtt_PackFontRanges(stbtt_pack_context *spc, const unsigned char *fontdata, int font_index, stbtt_pack_range *ranges, int num_ranges)
+{
+   stbtt_fontinfo info;
+   int i,j,n, font_offset, return_value;
+   stbrp_rect    *rects;
+
+   // flag all characters as NOT packed
+   for (i=0; i < num_ranges; ++i)
+      for (j=0; j < ranges[i].num_chars; ++j)
+         ranges[i].chardata_for_range[j].x0 =
+         ranges[i].chardata_for_range[j].y0 =
+         ranges[i].chardata_for_range[j].x1 =
+         ranges[i].chardata_for_range[j].y1 = 0;
+
+   // one rect per requested character across all ranges
+   n = 0;
+   for (i=0; i < num_ranges; ++i)
+      n += ranges[i].num_chars;
+
+   rects = (stbrp_rect *) STBTT_malloc(sizeof(*rects) * n, spc->user_allocator_context);
+   if (rects == NULL)
+      return 0;
+
+   info.userdata = spc->user_allocator_context;
+
+   // A negative offset or a failed init leaves `info` unusable; stop here
+   // instead of gathering/rendering glyphs from an uninitialized font.
+   font_offset = stbtt_GetFontOffsetForIndex(fontdata,font_index);
+   if (font_offset < 0 || !stbtt_InitFont(&info, fontdata, font_offset)) {
+      STBTT_free(rects, spc->user_allocator_context);
+      return 0;
+   }
+
+   n = stbtt_PackFontRangesGatherRects(spc, &info, ranges, num_ranges, rects);
+
+   stbtt_PackFontRangesPackRects(spc, rects, n);
+
+   return_value = stbtt_PackFontRangesRenderIntoRects(spc, &info, ranges, num_ranges, rects);
+
+   STBTT_free(rects, spc->user_allocator_context);
+   return return_value;
+}
+
+// Convenience wrapper around stbtt_PackFontRanges for one contiguous run of
+// codepoints: builds a single stbtt_pack_range on the stack and forwards it.
+// Returns whatever stbtt_PackFontRanges returns.
+STBTT_DEF int stbtt_PackFontRange(stbtt_pack_context *spc, const unsigned char *fontdata, int font_index, float font_size,
+            int first_unicode_codepoint_in_range, int num_chars_in_range, stbtt_packedchar *chardata_for_range)
+{
+   stbtt_pack_range single;
+
+   // contiguous run: no explicit codepoint list, just a start and a count
+   single.array_of_unicode_codepoints      = NULL;
+   single.first_unicode_codepoint_in_range = first_unicode_codepoint_in_range;
+   single.num_chars                        = num_chars_in_range;
+
+   single.font_size                        = font_size;
+   single.chardata_for_range               = chardata_for_range;
+
+   return stbtt_PackFontRanges(spc, fontdata, font_index, &single, 1);
+}
+
+// Reads the vertical metrics of font number `index` in `fontdata` and writes
+// them, multiplied by the scale factor derived from `size`, to *ascent,
+// *descent and *lineGap.
+// A positive `size` is passed to stbtt_ScaleForPixelHeight; otherwise -size is
+// passed to stbtt_ScaleForMappingEmToPixels.
+// The result of stbtt_InitFont is not checked here.
+STBTT_DEF void stbtt_GetScaledFontVMetrics(const unsigned char *fontdata, int index, float size, float *ascent, float *descent, float *lineGap)
+{
+   stbtt_fontinfo font;
+   int raw_ascent, raw_descent, raw_line_gap;
+   float factor;
+
+   stbtt_InitFont(&font, fontdata, stbtt_GetFontOffsetForIndex(fontdata, index));
+
+   if (size > 0)
+      factor = stbtt_ScaleForPixelHeight(&font, size);
+   else
+      factor = stbtt_ScaleForMappingEmToPixels(&font, -size);
+
+   stbtt_GetFontVMetrics(&font, &raw_ascent, &raw_descent, &raw_line_gap);
+
+   *ascent  = (float) raw_ascent   * factor;
+   *descent = (float) raw_descent  * factor;
+   *lineGap = (float) raw_line_gap * factor;
+}
+
+// Fills `q` with the screen-space corners (x0,y0,x1,y1) and texture
+// coordinates (s0,t0,s1,t1) for entry `char_index` of `chardata`, then
+// advances *xpos by that entry's xadvance.
+//   pw, ph            atlas width/height used to normalize texture coordinates
+//   xpos, ypos        current pen position; only *xpos is modified
+//   align_to_integer  when non-zero, the quad's top-left corner is rounded to
+//                     the nearest integer (floor(v + 0.5)) and its size kept
+STBTT_DEF void stbtt_GetPackedQuad(const stbtt_packedchar *chardata, int pw, int ph, int char_index, float *xpos, float *ypos, stbtt_aligned_quad *q, int align_to_integer)
+{
+   const stbtt_packedchar *entry = &chardata[char_index];
+   float inv_w = 1.0f / pw, inv_h = 1.0f / ph;
+
+   if (!align_to_integer) {
+      q->x0 = *xpos + entry->xoff;
+      q->y0 = *ypos + entry->yoff;
+      q->x1 = *xpos + entry->xoff2;
+      q->y1 = *ypos + entry->yoff2;
+   } else {
+      float snapped_x = (float) STBTT_ifloor((*xpos + entry->xoff) + 0.5f);
+      float snapped_y = (float) STBTT_ifloor((*ypos + entry->yoff) + 0.5f);
+      q->x0 = snapped_x;
+      q->y0 = snapped_y;
+      q->x1 = snapped_x + entry->xoff2 - entry->xoff;
+      q->y1 = snapped_y + entry->yoff2 - entry->yoff;
+   }
+
+   // atlas pixel coordinates -> normalized texture coordinates
+   q->s0 = entry->x0 * inv_w;
+   q->t0 = entry->y0 * inv_h;
+   q->s1 = entry->x1 * inv_w;
+   q->t1 = entry->y1 * inv_h;
+
+   *xpos += entry->xadvance;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// sdf computation
+//
+
+// Minimum / maximum of two values. These are macros, so each argument can be
+// evaluated more than once -- pass only side-effect-free expressions.
+#define STBTT_min(a,b)  ((a) < (b) ? (a) : (b))
+#define STBTT_max(a,b)  ((a) < (b) ? (b) : (a))
+
+// Intersects the line through `orig` with direction `ray` against the
+// quadratic bezier with control points q0, q1, q2.
+// Returns the number of intersections whose curve parameter s lies in [0,1]
+// (0, 1 or 2). For each reported hit k:
+//   hits[k][0]  offset of the hit from `orig`, projected onto `ray` and
+//               divided by |ray|^2 (negative means "behind" the origin)
+//   hits[k][1]  a*s+b, half the derivative of the curve's perpendicular
+//               component at the hit; only its sign is used by the caller
+static int stbtt__ray_intersect_bezier(float orig[2], float ray[2], float q0[2], float q1[2], float q2[2], float hits[2][2])
+{
+   // component of each point perpendicular to the ray direction
+   float q0perp = q0[1]*ray[0] - q0[0]*ray[1];
+   float q1perp = q1[1]*ray[0] - q1[0]*ray[1];
+   float q2perp = q2[1]*ray[0] - q2[0]*ray[1];
+   float roperp = orig[1]*ray[0] - orig[0]*ray[1];
+
+   // the curve meets the line where a*s^2 + 2*b*s + c = 0
+   float a = q0perp - 2*q1perp + q2perp;
+   float b = q1perp - q0perp;
+   float c = q0perp - roperp;
+
+   float s0 = 0., s1 = 0.;
+   int num_s = 0;
+
+   if (a != 0.0) {
+      // quadratic case: s = (-b +/- sqrt(b*b - a*c)) / a
+      float discr = b*b - a*c;
+      if (discr > 0.0) {
+         float rcpna = -1 / a;
+         float d = (float) STBTT_sqrt(discr);
+         s0 = (b+d) * rcpna;
+         s1 = (b-d) * rcpna;
+         if (s0 >= 0.0 && s0 <= 1.0)
+            num_s = 1;
+         if (d > 0.0 && s1 >= 0.0 && s1 <= 1.0) {
+            // if only the second root is valid, report it in slot 0
+            if (num_s == 0) s0 = s1;
+            ++num_s;
+         }
+      }
+   } else {
+      // 2*b*s + c = 0
+      // s = -c / (2*b)
+      // NOTE(review): b == 0 divides by zero here; with IEEE floats the
+      // resulting inf/NaN fails the range test below -- confirm for other targets
+      s0 = c / (-2 * b);
+      if (s0 >= 0.0 && s0 <= 1.0)
+         num_s = 1;
+   }
+
+   if (num_s == 0)
+      return 0;
+   else {
+      // ray direction divided by its squared length, used for the projections below
+      float rcp_len2 = 1 / (ray[0]*ray[0] + ray[1]*ray[1]);
+      float rayn_x = ray[0] * rcp_len2, rayn_y = ray[1] * rcp_len2;
+
+      float q0d =   q0[0]*rayn_x +   q0[1]*rayn_y;
+      float q1d =   q1[0]*rayn_x +   q1[1]*rayn_y;
+      float q2d =   q2[0]*rayn_x +   q2[1]*rayn_y;
+      float rod = orig[0]*rayn_x + orig[1]*rayn_y;
+
+      float q10d = q1d - q0d;
+      float q20d = q2d - q0d;
+      float q0rd = q0d - rod;
+
+      // evaluate the projected curve at s0 relative to the ray origin
+      hits[0][0] = q0rd + s0*(2.0f - 2.0f*s0)*q10d + s0*s0*q20d;
+      hits[0][1] = a*s0+b;
+
+      if (num_s > 1) {
+         hits[1][0] = q0rd + s1*(2.0f - 2.0f*s1)*q10d + s1*s1*q20d;
+         hits[1][1] = a*s1+b;
+         return 2;
+      } else {
+         return 1;
+      }
+   }
+}
+
+// Returns 1 when the two 2-component float vectors compare exactly equal in
+// both components, 0 otherwise.
+static int equal(float *a, float *b)
+{
+   if (a[0] != b[0])
+      return 0;
+   return a[1] == b[1];
+}
+
+// Computes the winding number of the point (x,y) with respect to the outline
+// in verts[0..nverts-1] by counting signed crossings of a horizontal ray.
+// Line segments are tested directly; curves go through
+// stbtt__ray_intersect_bezier unless a control point coincides with an end
+// point, in which case the curve is treated as a straight segment.
+// Returns the accumulated winding count (0 is treated as "outside" by the caller).
+// NOTE(review): line and curve vertices read verts[i-1], so this presumably
+// relies on the first vertex never being a line or curve -- confirm.
+static int stbtt__compute_crossings_x(float x, float y, int nverts, stbtt_vertex *verts)
+{
+   int i;
+   float orig[2], ray[2] = { 1, 0 };
+   float y_frac;
+   int winding = 0;
+
+   // make sure y never passes through a vertex of the shape
+   y_frac = (float) STBTT_fmod(y, 1.0f);
+   if (y_frac < 0.01f)
+      y += 0.01f;
+   else if (y_frac > 0.99f)
+      y -= 0.01f;
+
+   orig[0] = x;
+   orig[1] = y;
+
+   // test a ray from (-infinity,y) to (x,y)
+   for (i=0; i < nverts; ++i) {
+      if (verts[i].type == STBTT_vline) {
+         // segment from the previous vertex to this one, in truncated integer coordinates
+         int x0 = (int) verts[i-1].x, y0 = (int) verts[i-1].y;
+         int x1 = (int) verts[i  ].x, y1 = (int) verts[i  ].y;
+         if (y > STBTT_min(y0,y1) && y < STBTT_max(y0,y1) && x > STBTT_min(x0,x1)) {
+            // x coordinate where the segment crosses the scanline
+            float x_inter = (y - y0) / (y1 - y0) * (x1-x0) + x0;
+            if (x_inter < x)
+               winding += (y0 < y1) ? 1 : -1;
+         }
+      }
+      if (verts[i].type == STBTT_vcurve) {
+         int x0 = (int) verts[i-1].x , y0 = (int) verts[i-1].y ;
+         int x1 = (int) verts[i  ].cx, y1 = (int) verts[i  ].cy;
+         int x2 = (int) verts[i  ].x , y2 = (int) verts[i  ].y ;
+         // bounds of the three control points, used as a cheap rejection test
+         int ax = STBTT_min(x0,STBTT_min(x1,x2)), ay = STBTT_min(y0,STBTT_min(y1,y2));
+         int by = STBTT_max(y0,STBTT_max(y1,y2));
+         if (y > ay && y < by && x > ax) {
+            float q0[2],q1[2],q2[2];
+            float hits[2][2];
+            q0[0] = (float)x0;
+            q0[1] = (float)y0;
+            q1[0] = (float)x1;
+            q1[1] = (float)y1;
+            q2[0] = (float)x2;
+            q2[1] = (float)y2;
+            if (equal(q0,q1) || equal(q1,q2)) {
+               // control point coincides with an end point: handle as a straight segment
+               x0 = (int)verts[i-1].x;
+               y0 = (int)verts[i-1].y;
+               x1 = (int)verts[i  ].x;
+               y1 = (int)verts[i  ].y;
+               if (y > STBTT_min(y0,y1) && y < STBTT_max(y0,y1) && x > STBTT_min(x0,x1)) {
+                  float x_inter = (y - y0) / (y1 - y0) * (x1-x0) + x0;
+                  if (x_inter < x)
+                     winding += (y0 < y1) ? 1 : -1;
+               }
+            } else {
+               // count only hits behind the origin (negative ray offset); the
+               // sign of hits[k][1] gives the crossing direction
+               int num_hits = stbtt__ray_intersect_bezier(orig, ray, q0, q1, q2, hits);
+               if (num_hits >= 1)
+                  if (hits[0][0] < 0)
+                     winding += (hits[0][1] < 0 ? -1 : 1);
+               if (num_hits >= 2)
+                  if (hits[1][0] < 0)
+                     winding += (hits[1][1] < 0 ? -1 : 1);
+            }
+         }
+      }
+   }
+   return winding;
+}
+
+// Real cube root of x. Negative inputs are handled by taking the root of -x
+// and negating the result, so STBTT_pow never sees a negative base.
+static float stbtt__cuberoot( float x )
+{
+   if (x<0) {
+      float root_of_magnitude = (float) STBTT_pow(-x,1.0f/3.0f);
+      return -root_of_magnitude;
+   }
+   return (float) STBTT_pow( x,1.0f/3.0f);
+}
+
+// Solves the monic cubic x^3 + a*x^2 + b*x + c = 0 for its real roots.
+// Writes the roots to r and returns how many were written: 1 (only r[0] is
+// set) or 3 (r[0..2] are set), so r must have room for three floats.
+static int stbtt__solve_cubic(float a, float b, float c, float* r)
+{
+   // substitute x = t + s to get the depressed cubic t^3 + p*t + q = 0
+   float s = -a / 3;
+   float p = b - a*a / 3;
+   float q = a * (2*a*a - 9*b) / 27 + c;
+   float p3 = p*p*p;
+   // sign of d selects between the one-root and three-root branches
+   float d = q*q + 4*p3 / 27;
+   if (d >= 0) {
+      // single real root: sum of two cube roots
+      float z = (float) STBTT_sqrt(d);
+      float u = (-q + z) / 2;
+      float v = (-q - z) / 2;
+      u = stbtt__cuberoot(u);
+      v = stbtt__cuberoot(v);
+      r[0] = s + u + v;
+      return 1;
+   } else {
+      // three real roots: trigonometric form
+      float u = (float) STBTT_sqrt(-p/3);
+      float v = (float) STBTT_acos(-STBTT_sqrt(-27/p3) * q / 2) / 3; // p3 must be negative, since d is negative
+      float m = (float) STBTT_cos(v);
+      // cos(v - pi/2) scaled by a constant approximating sqrt(3)
+      float n = (float) STBTT_cos(v-3.141592/2)*1.732050808f;
+      r[0] = s + u * 2 * m;
+      r[1] = s - u * (m + n);
+      r[2] = s - u * (m - n);
+
+      //STBTT_assert( STBTT_fabs(((r[0]+a)*r[0]+b)*r[0]+c) < 0.05f);  // these asserts may not be safe at all scales, though they're in bezier t parameter units so maybe?
+      //STBTT_assert( STBTT_fabs(((r[1]+a)*r[1]+b)*r[1]+c) < 0.05f);
+      //STBTT_assert( STBTT_fabs(((r[2]+a)*r[2]+b)*r[2]+c) < 0.05f);
+      return 3;
+   }
+}
+
+// Builds a signed-distance-field bitmap for one glyph.
+//   info              font to read the glyph from; info->userdata is the allocator context
+//   scale             glyph-units-to-pixels scale; 0 yields NULL
+//   glyph             glyph index
+//   padding           extra pixels added on every side of the glyph's bitmap box
+//   onedge_value      byte value stored for samples lying exactly on the outline
+//   pixel_dist_scale  change in stored value per pixel of distance (inside is positive)
+//   width, height     optional outputs: bitmap size in pixels
+//   xoff, yoff        optional outputs: offset of the bitmap's top-left corner
+// Returns a w*h byte buffer allocated with STBTT_malloc (release it with
+// stbtt_FreeSDF), or NULL if scale is 0, the glyph's bitmap box is empty, or
+// an allocation fails.
+STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float scale, int glyph, int padding, unsigned char onedge_value, float pixel_dist_scale, int *width, int *height, int *xoff, int *yoff)
+{
+   float scale_x = scale, scale_y = scale;
+   int ix0,iy0,ix1,iy1;
+   int w,h;
+   unsigned char *data;
+
+   if (scale == 0) return NULL;
+
+   stbtt_GetGlyphBitmapBoxSubpixel(info, glyph, scale, scale, 0.0f,0.0f, &ix0,&iy0,&ix1,&iy1);
+
+   // if empty, return NULL
+   if (ix0 == ix1 || iy0 == iy1)
+      return NULL;
+
+   ix0 -= padding;
+   iy0 -= padding;
+   ix1 += padding;
+   iy1 += padding;
+
+   w = (ix1 - ix0);
+   h = (iy1 - iy0);
+
+   if (width ) *width  = w;
+   if (height) *height = h;
+   if (xoff  ) *xoff   = ix0;
+   if (yoff  ) *yoff   = iy0;
+
+   // invert for y-downwards bitmaps
+   scale_y = -scale_y;
+
+   {
+      // distance from singular values (in the same units as the pixel grid)
+      const float eps = 1./1024, eps2 = eps*eps;
+      int x,y,i,j;
+      float *precompute;
+      stbtt_vertex *verts;
+      int num_verts = stbtt_GetGlyphShape(info, glyph, &verts);
+      data = (unsigned char *) STBTT_malloc(w * h, info->userdata);
+      precompute = (float *) STBTT_malloc(num_verts * sizeof(float), info->userdata);
+
+      // Bail out cleanly on allocation failure instead of writing through NULL.
+      // precompute is only dereferenced when there are vertices, so a NULL
+      // coming back from a zero-sized request is not treated as an error.
+      if (data == NULL || (num_verts > 0 && precompute == NULL)) {
+         if (precompute) STBTT_free(precompute, info->userdata);
+         if (data) STBTT_free(data, info->userdata);
+         STBTT_free(verts, info->userdata);
+         return NULL;
+      }
+
+      // per-vertex reciprocal that does not depend on the sample point:
+      // 1/length for lines, 1/|q0 - 2*q1 + q2|^2 for curves, 0 when degenerate
+      for (i=0,j=num_verts-1; i < num_verts; j=i++) {
+         if (verts[i].type == STBTT_vline) {
+            float x0 = verts[i].x*scale_x, y0 = verts[i].y*scale_y;
+            float x1 = verts[j].x*scale_x, y1 = verts[j].y*scale_y;
+            float dist = (float) STBTT_sqrt((x1-x0)*(x1-x0) + (y1-y0)*(y1-y0));
+            precompute[i] = (dist < eps) ? 0.0f : 1.0f / dist;
+         } else if (verts[i].type == STBTT_vcurve) {
+            float x2 = verts[j].x *scale_x, y2 = verts[j].y *scale_y;
+            float x1 = verts[i].cx*scale_x, y1 = verts[i].cy*scale_y;
+            float x0 = verts[i].x *scale_x, y0 = verts[i].y *scale_y;
+            float bx = x0 - 2*x1 + x2, by = y0 - 2*y1 + y2;
+            float len2 = bx*bx + by*by;
+            if (len2 >= eps2)
+               precompute[i] = 1.0f / len2;
+            else
+               precompute[i] = 0.0f;
+         } else
+            precompute[i] = 0.0f;
+      }
+
+      for (y=iy0; y < iy1; ++y) {
+         for (x=ix0; x < ix1; ++x) {
+            float val;
+            float min_dist = 999999.0f;
+            // sample at the pixel center
+            float sx = (float) x + 0.5f;
+            float sy = (float) y + 0.5f;
+            float x_gspace = (sx / scale_x);
+            float y_gspace = (sy / scale_y);
+
+            int winding = stbtt__compute_crossings_x(x_gspace, y_gspace, num_verts, verts); // @OPTIMIZE: this could just be a rasterization, but needs to be line vs. non-tesselated curves so a new path
+
+            for (i=0; i < num_verts; ++i) {
+               float x0 = verts[i].x*scale_x, y0 = verts[i].y*scale_y;
+
+               if (verts[i].type == STBTT_vline && precompute[i] != 0.0f) {
+                  float x1 = verts[i-1].x*scale_x, y1 = verts[i-1].y*scale_y;
+
+                  // distance to the segment's end point
+                  float dist,dist2 = (x0-sx)*(x0-sx) + (y0-sy)*(y0-sy);
+                  if (dist2 < min_dist*min_dist)
+                     min_dist = (float) STBTT_sqrt(dist2);
+
+                  // coarse culling against bbox
+                  //if (sx > STBTT_min(x0,x1)-min_dist && sx < STBTT_max(x0,x1)+min_dist &&
+                  //    sy > STBTT_min(y0,y1)-min_dist && sy < STBTT_max(y0,y1)+min_dist)
+                  // perpendicular distance to the infinite line through the segment
+                  dist = (float) STBTT_fabs((x1-x0)*(y0-sy) - (y1-y0)*(x0-sx)) * precompute[i];
+                  STBTT_assert(i != 0);
+                  if (dist < min_dist) {
+                     // check position along line
+                     // x' = x0 + t*(x1-x0), y' = y0 + t*(y1-y0)
+                     // minimize (x'-sx)*(x'-sx)+(y'-sy)*(y'-sy)
+                     float dx = x1-x0, dy = y1-y0;
+                     float px = x0-sx, py = y0-sy;
+                     // minimize (px+t*dx)^2 + (py+t*dy)^2 = px*px + 2*px*dx*t + t^2*dx*dx + py*py + 2*py*dy*t + t^2*dy*dy
+                     // derivative: 2*px*dx + 2*py*dy + (2*dx*dx+2*dy*dy)*t, set to 0 and solve
+                     float t = -(px*dx + py*dy) / (dx*dx + dy*dy);
+                     if (t >= 0.0f && t <= 1.0f)
+                        min_dist = dist;
+                  }
+               } else if (verts[i].type == STBTT_vcurve) {
+                  float x2 = verts[i-1].x *scale_x, y2 = verts[i-1].y *scale_y;
+                  float x1 = verts[i  ].cx*scale_x, y1 = verts[i  ].cy*scale_y;
+                  float box_x0 = STBTT_min(STBTT_min(x0,x1),x2);
+                  float box_y0 = STBTT_min(STBTT_min(y0,y1),y2);
+                  float box_x1 = STBTT_max(STBTT_max(x0,x1),x2);
+                  float box_y1 = STBTT_max(STBTT_max(y0,y1),y2);
+                  // coarse culling against bbox to avoid computing cubic unnecessarily
+                  if (sx > box_x0-min_dist && sx < box_x1+min_dist && sy > box_y0-min_dist && sy < box_y1+min_dist) {
+                     int num=0;
+                     float ax = x1-x0, ay = y1-y0;
+                     float bx = x0 - 2*x1 + x2, by = y0 - 2*y1 + y2;
+                     float mx = x0 - sx, my = y0 - sy;
+                     float res[3] = {0.f,0.f,0.f};
+                     float px,py,t,it,dist2;
+                     float a_inv = precompute[i];
+                     if (a_inv == 0.0) { // if a_inv is 0, it's 2nd degree so use quadratic formula
+                        float a = 3*(ax*bx + ay*by);
+                        float b = 2*(ax*ax + ay*ay) + (mx*bx+my*by);
+                        float c = mx*ax+my*ay;
+                        if (STBTT_fabs(a) < eps2) { // if a is 0, it's linear
+                           if (STBTT_fabs(b) >= eps2) {
+                              res[num++] = -c/b;
+                           }
+                        } else {
+                           float discriminant = b*b - 4*a*c;
+                           if (discriminant < 0)
+                              num = 0;
+                           else {
+                              float root = (float) STBTT_sqrt(discriminant);
+                              res[0] = (-b - root)/(2*a);
+                              res[1] = (-b + root)/(2*a);
+                              num = 2; // don't bother distinguishing 1-solution case, as code below will still work
+                           }
+                        }
+                     } else {
+                        float b = 3*(ax*bx + ay*by) * a_inv; // could precompute this as it doesn't depend on sample point
+                        float c = (2*(ax*ax + ay*ay) + (mx*bx+my*by)) * a_inv;
+                        float d = (mx*ax+my*ay) * a_inv;
+                        num = stbtt__solve_cubic(b, c, d, res);
+                     }
+                     // distance to the curve's end point
+                     dist2 = (x0-sx)*(x0-sx) + (y0-sy)*(y0-sy);
+                     if (dist2 < min_dist*min_dist)
+                        min_dist = (float) STBTT_sqrt(dist2);
+
+                     // evaluate the curve at each candidate parameter inside [0,1]
+                     if (num >= 1 && res[0] >= 0.0f && res[0] <= 1.0f) {
+                        t = res[0], it = 1.0f - t;
+                        px = it*it*x0 + 2*t*it*x1 + t*t*x2;
+                        py = it*it*y0 + 2*t*it*y1 + t*t*y2;
+                        dist2 = (px-sx)*(px-sx) + (py-sy)*(py-sy);
+                        if (dist2 < min_dist * min_dist)
+                           min_dist = (float) STBTT_sqrt(dist2);
+                     }
+                     if (num >= 2 && res[1] >= 0.0f && res[1] <= 1.0f) {
+                        t = res[1], it = 1.0f - t;
+                        px = it*it*x0 + 2*t*it*x1 + t*t*x2;
+                        py = it*it*y0 + 2*t*it*y1 + t*t*y2;
+                        dist2 = (px-sx)*(px-sx) + (py-sy)*(py-sy);
+                        if (dist2 < min_dist * min_dist)
+                           min_dist = (float) STBTT_sqrt(dist2);
+                     }
+                     if (num >= 3 && res[2] >= 0.0f && res[2] <= 1.0f) {
+                        t = res[2], it = 1.0f - t;
+                        px = it*it*x0 + 2*t*it*x1 + t*t*x2;
+                        py = it*it*y0 + 2*t*it*y1 + t*t*y2;
+                        dist2 = (px-sx)*(px-sx) + (py-sy)*(py-sy);
+                        if (dist2 < min_dist * min_dist)
+                           min_dist = (float) STBTT_sqrt(dist2);
+                     }
+                  }
+               }
+            }
+            if (winding == 0)
+               min_dist = -min_dist;  // if outside the shape, value is negative
+            // map the signed distance to a byte, clamped to [0,255]
+            val = onedge_value + pixel_dist_scale * min_dist;
+            if (val < 0)
+               val = 0;
+            else if (val > 255)
+               val = 255;
+            data[(y-iy0)*w+(x-ix0)] = (unsigned char) val;
+         }
+      }
+      STBTT_free(precompute, info->userdata);
+      STBTT_free(verts, info->userdata);
+   }
+   return data;
+}
+
+// Codepoint flavor of stbtt_GetGlyphSDF: looks up the glyph index for
+// `codepoint` and forwards every other argument unchanged.
+STBTT_DEF unsigned char * stbtt_GetCodepointSDF(const stbtt_fontinfo *info, float scale, int codepoint, int padding, unsigned char onedge_value, float pixel_dist_scale, int *width, int *height, int *xoff, int *yoff)
+{
+   int glyph_index = stbtt_FindGlyphIndex(info, codepoint);
+   return stbtt_GetGlyphSDF(info, scale, glyph_index, padding, onedge_value, pixel_dist_scale,
+                            width, height, xoff, yoff);
+}
+
+// Releases an SDF bitmap by passing it to STBTT_free together with `userdata`.
+// `userdata` is presumably the allocator context the bitmap was allocated
+// with (info->userdata in stbtt_GetGlyphSDF) -- confirm at call sites.
+STBTT_DEF void stbtt_FreeSDF(unsigned char *bitmap, void *userdata)
+{
+   STBTT_free(bitmap, userdata);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// font name matching -- recommended not to use this
+//
+
+// Checks whether the UTF-8 string s1 (len1 bytes) starts with the big-endian
+// UTF-16 string s2 (len2 bytes).
+// Returns the number of UTF-8 bytes that matched the whole of s2, or -1 on a
+// mismatch, when s1 is too short, or when s2 contains a lone low surrogate or
+// a high surrogate without a following code unit.
+// Only complete 2-byte code units of s2 are consumed; a trailing odd byte is ignored.
+static stbtt_int32 stbtt__CompareUTF8toUTF16_bigendian_prefix(stbtt_uint8 *s1, stbtt_int32 len1, stbtt_uint8 *s2, stbtt_int32 len2)
+{
+   stbtt_int32 i=0;
+
+   // convert utf16 to utf8 and compare the results while converting.
+   // Require a full code unit per iteration: testing only `len2 != 0` lets an
+   // odd or negative length step past zero and read beyond the buffer.
+   while (len2 >= 2) {
+      stbtt_uint16 ch = s2[0]*256 + s2[1];
+      if (ch < 0x80) {
+         // 1-byte UTF-8 sequence
+         if (i >= len1) return -1;
+         if (s1[i++] != ch) return -1;
+      } else if (ch < 0x800) {
+         // 2-byte UTF-8 sequence
+         if (i+1 >= len1) return -1;
+         if (s1[i++] != 0xc0 + (ch >> 6)) return -1;
+         if (s1[i++] != 0x80 + (ch & 0x3f)) return -1;
+      } else if (ch >= 0xd800 && ch < 0xdc00) {
+         // high surrogate: needs a second code unit, giving a 4-byte UTF-8 sequence
+         stbtt_uint32 c;
+         stbtt_uint16 ch2;
+         if (len2 < 4) return -1; // truncated pair: s2[2] and s2[3] are out of bounds
+         ch2 = s2[2]*256 + s2[3];
+         if (i+3 >= len1) return -1;
+         c = ((ch - 0xd800) << 10) + (ch2 - 0xdc00) + 0x10000;
+         if (s1[i++] != 0xf0 + (c >> 18)) return -1;
+         if (s1[i++] != 0x80 + ((c >> 12) & 0x3f)) return -1;
+         if (s1[i++] != 0x80 + ((c >>  6) & 0x3f)) return -1;
+         if (s1[i++] != 0x80 + ((c      ) & 0x3f)) return -1;
+         s2 += 2; // plus another 2 below
+         len2 -= 2;
+      } else if (ch >= 0xdc00 && ch < 0xe000) {
+         // low surrogate without a preceding high surrogate
+         return -1;
+      } else {
+         // 3-byte UTF-8 sequence
+         if (i+2 >= len1) return -1;
+         if (s1[i++] != 0xe0 + (ch >> 12)) return -1;
+         if (s1[i++] != 0x80 + ((ch >> 6) & 0x3f)) return -1;
+         if (s1[i++] != 0x80 + ((ch     ) & 0x3f)) return -1;
+      }
+      s2 += 2;
+      len2 -= 2;
+   }
+   return i;
+}
+
+// Returns non-zero when the UTF-8 string s1 (len1 bytes) and the big-endian
+// UTF-16 string s2 (len2 bytes) are equal, i.e. when the prefix comparison
+// consumes exactly len1 bytes of s1.
+static int stbtt_CompareUTF8toUTF16_bigendian_internal(char *s1, int len1, char *s2, int len2)
+{
+   stbtt_int32 matched = stbtt__CompareUTF8toUTF16_bigendian_prefix((stbtt_uint8*) s1, len1, (stbtt_uint8*) s2, len2);
+   return matched == len1;
+}
+
+// Looks up a string in the font's "name" table by exact platform, encoding,
+// language and name ID. On a match, stores the string's byte length in *length
+// and returns a pointer into the font data; returns NULL when the table or a
+// matching record is missing.
+// The string is returned in whatever encoding was requested... but note that
+// 2-byte encodings will be BIG-ENDIAN... use stbtt_CompareUTF8toUTF16_bigendian()
+// to compare
+STBTT_DEF const char *stbtt_GetFontNameString(const stbtt_fontinfo *font, int *length, int platformID, int encodingID, int languageID, int nameID)
+{
+   stbtt_uint8 *data = font->data;
+   stbtt_uint32 name_table = stbtt__find_table(data, font->fontstart, "name");
+   stbtt_int32 record, num_records, storage;
+
+   if (!name_table)
+      return NULL;
+
+   num_records = ttUSHORT(data+name_table+2);
+   storage     = name_table + ttUSHORT(data+name_table+4);
+
+   // records are 12 bytes each and start 6 bytes into the table
+   for (record=0; record < num_records; ++record) {
+      stbtt_uint8 *entry = data + name_table + 6 + 12 * record;
+
+      if (platformID != ttUSHORT(entry+0)) continue;
+      if (encodingID != ttUSHORT(entry+2)) continue;
+      if (languageID != ttUSHORT(entry+4)) continue;
+      if (nameID     != ttUSHORT(entry+6)) continue;
+
+      *length = ttUSHORT(entry+8);
+      return (const char *) (data+storage+ttUSHORT(entry+10));
+   }
+   return NULL;
+}
+
+// Scans the "name" table at offset nm of fc for records whose name ID is
+// target_id and whose encoding is one of the Unicode encodings tested below,
+// and compares them against the UTF-8 string `name` (nlen bytes).
+// If the record immediately after a prefix-matching one has name ID next_id
+// and the same platform/encoding/language, `name` must be
+// "<target string> <next string>" (or just the target string when the next
+// string is empty); otherwise `name` must equal the target string exactly.
+// Returns 1 on a match, 0 otherwise.
+static int stbtt__matchpair(stbtt_uint8 *fc, stbtt_uint32 nm, stbtt_uint8 *name, stbtt_int32 nlen, stbtt_int32 target_id, stbtt_int32 next_id)
+{
+   stbtt_int32 i;
+   stbtt_int32 count = ttUSHORT(fc+nm+2);
+   stbtt_int32 stringOffset = nm + ttUSHORT(fc+nm+4);
+
+   for (i=0; i < count; ++i) {
+      // 12-byte records start 6 bytes into the table
+      stbtt_uint32 loc = nm + 6 + 12 * i;
+      stbtt_int32 id = ttUSHORT(fc+loc+6);
+      if (id == target_id) {
+         // find the encoding
+         stbtt_int32 platform = ttUSHORT(fc+loc+0), encoding = ttUSHORT(fc+loc+2), language = ttUSHORT(fc+loc+4);
+
+         // is this a Unicode encoding?
+         if (platform == 0 || (platform == 3 && encoding == 1) || (platform == 3 && encoding == 10)) {
+            stbtt_int32 slen = ttUSHORT(fc+loc+8);
+            stbtt_int32 off = ttUSHORT(fc+loc+10);
+
+            // check if there's a prefix match
+            stbtt_int32 matchlen = stbtt__CompareUTF8toUTF16_bigendian_prefix(name, nlen, fc+stringOffset+off,slen);
+            if (matchlen >= 0) {
+               // check for target_id+1 immediately following, with same encoding & language
+               // (the i+1 < count test keeps the reads at loc+12 inside the record array)
+               if (i+1 < count && ttUSHORT(fc+loc+12+6) == next_id && ttUSHORT(fc+loc+12) == platform && ttUSHORT(fc+loc+12+2) == encoding && ttUSHORT(fc+loc+12+4) == language) {
+                  slen = ttUSHORT(fc+loc+12+8);
+                  off = ttUSHORT(fc+loc+12+10);
+                  if (slen == 0) {
+                     // empty follow-up string: the target string alone must be the whole name
+                     if (matchlen == nlen)
+                        return 1;
+                  } else if (matchlen < nlen && name[matchlen] == ' ') {
+                     // skip the separating space and compare the remainder exactly
+                     ++matchlen;
+                     if (stbtt_CompareUTF8toUTF16_bigendian_internal((char*) (name+matchlen), nlen-matchlen, (char*)(fc+stringOffset+off),slen))
+                        return 1;
+                  }
+               } else {
+                  // if nothing immediately following
+                  if (matchlen == nlen)
+                     return 1;
+               }
+            }
+         }
+
+         // @TODO handle other encodings
+      }
+   }
+   return 0;
+}
+
+// Tests whether the font at `offset` inside `fc` matches the NUL-terminated
+// UTF-8 `name`.
+// When `flags` is non-zero, the low three bits of the head table's macStyle
+// field must equal (flags & 7) and only family-style name IDs are compared;
+// when it is zero, family+subfamily pairs are compared.
+// Returns 1 on a match. Returns 0 if the data is not a font, a required table
+// is missing, the style bits differ, or no name record matches.
+static int stbtt__matches(stbtt_uint8 *fc, stbtt_uint32 offset, stbtt_uint8 *name, stbtt_int32 flags)
+{
+   stbtt_int32 nlen = (stbtt_int32) STBTT_strlen((char *) name);
+   stbtt_uint32 nm,hd;
+   if (!stbtt__isfont(fc+offset)) return 0;
+
+   // check italics/bold/underline flags in macStyle...
+   if (flags) {
+      hd = stbtt__find_table(fc, offset, "head");
+      // without a head table there is no macStyle to compare; reading at
+      // fc+0+44 would test unrelated bytes
+      if (!hd) return 0;
+      if ((ttUSHORT(fc+hd+44) & 7) != (flags & 7)) return 0;
+   }
+
+   nm = stbtt__find_table(fc, offset, "name");
+   if (!nm) return 0;
+
+   if (flags) {
+      // if we checked the macStyle flags, then just check the family and ignore the subfamily
+      if (stbtt__matchpair(fc, nm, name, nlen, 16, -1))  return 1;
+      if (stbtt__matchpair(fc, nm, name, nlen,  1, -1))  return 1;
+      if (stbtt__matchpair(fc, nm, name, nlen,  3, -1))  return 1;
+   } else {
+      // try each (name ID, follow-up name ID) pair in turn
+      if (stbtt__matchpair(fc, nm, name, nlen, 16, 17))  return 1;
+      if (stbtt__matchpair(fc, nm, name, nlen,  1,  2))  return 1;
+      if (stbtt__matchpair(fc, nm, name, nlen,  3, -1))  return 1;
+   }
+
+   return 0;
+}
+
+// Walks the fonts of `font_collection` by increasing index and returns the
+// offset of the first one for which stbtt__matches succeeds. A negative offset
+// from stbtt_GetFontOffsetForIndex ends the search and is returned as-is.
+static int stbtt_FindMatchingFont_internal(unsigned char *font_collection, char *name_utf8, stbtt_int32 flags)
+{
+   stbtt_int32 index = 0;
+   for (;;) {
+      stbtt_int32 font_offset = stbtt_GetFontOffsetForIndex(font_collection, index);
+      if (font_offset < 0 || stbtt__matches((stbtt_uint8 *) font_collection, font_offset, (stbtt_uint8*) name_utf8, flags))
+         return font_offset;
+      ++index;
+   }
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+// Public const-correct entry point: casts away const on `data` and forwards
+// all arguments to stbtt_BakeFontBitmap_internal, returning its result.
+STBTT_DEF int stbtt_BakeFontBitmap(const unsigned char *data, int offset,
+                                float pixel_height, unsigned char *pixels, int pw, int ph,
+                                int first_char, int num_chars, stbtt_bakedchar *chardata)
+{
+   return stbtt_BakeFontBitmap_internal((unsigned char *) data, offset, pixel_height, pixels, pw, ph, first_char, num_chars, chardata);
+}
+
+// Public const-correct entry point: casts away const on `data` and forwards
+// to stbtt_GetFontOffsetForIndex_internal, returning its result.
+STBTT_DEF int stbtt_GetFontOffsetForIndex(const unsigned char *data, int index)
+{
+   return stbtt_GetFontOffsetForIndex_internal((unsigned char *) data, index);
+}
+
+// Public const-correct entry point: casts away const on `data` and forwards
+// to stbtt_GetNumberOfFonts_internal, returning its result.
+STBTT_DEF int stbtt_GetNumberOfFonts(const unsigned char *data)
+{
+   return stbtt_GetNumberOfFonts_internal((unsigned char *) data);
+}
+
+// Public const-correct entry point: casts away const on `data` and forwards
+// to stbtt_InitFont_internal, returning its result.
+STBTT_DEF int stbtt_InitFont(stbtt_fontinfo *info, const unsigned char *data, int offset)
+{
+   return stbtt_InitFont_internal(info, (unsigned char *) data, offset);
+}
+
+// Public const-correct entry point: casts away const on `fontdata` and `name`
+// and forwards to stbtt_FindMatchingFont_internal, returning its result.
+STBTT_DEF int stbtt_FindMatchingFont(const unsigned char *fontdata, const char *name, int flags)
+{
+   return stbtt_FindMatchingFont_internal((unsigned char *) fontdata, (char *) name, flags);
+}
+
+// Public const-correct entry point: casts away const on both strings and
+// forwards to stbtt_CompareUTF8toUTF16_bigendian_internal, which returns
+// non-zero when the UTF-8 string s1 equals the big-endian UTF-16 string s2.
+STBTT_DEF int stbtt_CompareUTF8toUTF16_bigendian(const char *s1, int len1, const char *s2, int len2)
+{
+   return stbtt_CompareUTF8toUTF16_bigendian_internal((char *) s1, len1, (char *) s2, len2);
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+
+#endif // STB_TRUETYPE_IMPLEMENTATION
+
+
+// FULL VERSION HISTORY
+//
+//   1.25 (2021-07-11) many fixes
+//   1.24 (2020-02-05) fix warning
+//   1.23 (2020-02-02) query SVG data for glyphs; query whole kerning table (but only kern not GPOS)
+//   1.22 (2019-08-11) minimize missing-glyph duplication; fix kerning if both 'GPOS' and 'kern' are defined
+//   1.21 (2019-02-25) fix warning
+//   1.20 (2019-02-07) PackFontRange skips missing codepoints; GetScaledFontVMetrics()
+//   1.19 (2018-02-11) OpenType GPOS kerning (horizontal only), STBTT_fmod
+//   1.18 (2018-01-29) add missing function
+//   1.17 (2017-07-23) make more arguments const; doc fix
+//   1.16 (2017-07-12) SDF support
+//   1.15 (2017-03-03) make more arguments const
+//   1.14 (2017-01-16) num-fonts-in-TTC function
+//   1.13 (2017-01-02) support OpenType fonts, certain Apple fonts
+//   1.12 (2016-10-25) suppress warnings about casting away const with -Wcast-qual
+//   1.11 (2016-04-02) fix unused-variable warning
+//   1.10 (2016-04-02) allow user-defined fabs() replacement
+//                     fix memory leak if fontsize=0.0
+//                     fix warning from duplicate typedef
+//   1.09 (2016-01-16) warning fix; avoid crash on outofmem; use alloc userdata for PackFontRanges
+//   1.08 (2015-09-13) document stbtt_Rasterize(); fixes for vertical & horizontal edges
+//   1.07 (2015-08-01) allow PackFontRanges to accept arrays of sparse codepoints;
+//                     allow PackFontRanges to pack and render in separate phases;
+//                     fix stbtt_GetFontOffsetForIndex (never worked for non-0 input?);
+//                     fixed an assert() bug in the new rasterizer
+//                     replace assert() with STBTT_assert() in new rasterizer
+//   1.06 (2015-07-14) performance improvements (~35% faster on x86 and x64 on test machine)
+//                     also more precise AA rasterizer, except if shapes overlap
+//                     remove need for STBTT_sort
+//   1.05 (2015-04-15) fix misplaced definitions for STBTT_STATIC
+//   1.04 (2015-04-15) typo in example
+//   1.03 (2015-04-12) STBTT_STATIC, fix memory leak in new packing, various fixes
+//   1.02 (2014-12-10) fix various warnings & compile issues w/ stb_rect_pack, C++
+//   1.01 (2014-12-08) fix subpixel position when oversampling to exactly match
+//                        non-oversampled; STBTT_POINT_SIZE for packed case only
+//   1.00 (2014-12-06) add new PackBegin etc. API, w/ support for oversampling
+//   0.99 (2014-09-18) fix multiple bugs with subpixel rendering (ryg)
+//   0.9  (2014-08-07) support certain mac/iOS fonts without an MS platformID
+//   0.8b (2014-07-07) fix a warning
+//   0.8  (2014-05-25) fix a few more warnings
+//   0.7  (2013-09-25) bugfix: subpixel glyph bug fixed in 0.5 had come back
+//   0.6c (2012-07-24) improve documentation
+//   0.6b (2012-07-20) fix a few more warnings
+//   0.6  (2012-07-17) fix warnings; added stbtt_ScaleForMappingEmToPixels,
+//                        stbtt_GetFontBoundingBox, stbtt_IsGlyphEmpty
+//   0.5  (2011-12-09) bugfixes:
+//                        subpixel glyph renderer computed wrong bounding box
+//                        first vertex of shape can be off-curve (FreeSans)
+//   0.4b (2011-12-03) fixed an error in the font baking example
+//   0.4  (2011-12-01) kerning, subpixel rendering (tor)
+//                    bugfixes for:
+//                        codepoint-to-glyph conversion using table fmt=12
+//                        codepoint-to-glyph conversion using table fmt=4
+//                        stbtt_GetBakedQuad with non-square texture (Zer)
+//                    updated Hello World! sample to use kerning and subpixel
+//                    fixed some warnings
+//   0.3  (2009-06-24) cmap fmt=12, compound shapes (MM)
+//                    userdata, malloc-from-userdata, non-zero fill (stb)
+//   0.2  (2009-03-11) Fix unsigned/signed char warnings
+//   0.1  (2009-03-09) First public release
+//
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/lib/stb/stb_voxel_render.h b/lib/stb/stb_voxel_render.h
new file mode 100644
index 0000000..2e7a372
--- /dev/null
+++ b/lib/stb/stb_voxel_render.h
@@ -0,0 +1,3807 @@
+// stb_voxel_render.h - v0.89 - Sean Barrett, 2015 - public domain
+//
+// This library helps render large-scale "voxel" worlds for games,
+// in this case, one with blocks that can have textures and that
+// can also be a few shapes other than cubes.
+//
+//    Video introduction:
+//       http://www.youtube.com/watch?v=2vnTtiLrV1w
+//
+//    Minecraft-viewer sample app (not very simple though):
+//       http://github.com/nothings/stb/tree/master/tests/caveview
+//
+// It works by creating triangle meshes. The library includes
+//
+//    - converter from dense 3D arrays of block info to vertex mesh
+//    - vertex & fragment shaders for the vertex mesh
+//    - assistance in setting up shader state
+//
+// For portability, none of the library code actually accesses
+// the 3D graphics API. (At the moment, it's not actually portable
+// since the shaders are GLSL only, but patches are welcome.)
+//
+// You have to do all the caching and tracking of vertex buffers
+// yourself. However, you could also try making a game with
+// a small enough world that it's fully loaded rather than
+// streaming. Currently the preferred vertex format is 20 bytes
+// per quad. There are designs to allow much more compact formats
+// with a slight reduction in shader features, but no roadmap
+// for actually implementing them.
+//
+//
+// USAGE
+//
+//   #define the symbol STB_VOXEL_RENDER_IMPLEMENTATION in *one*
+//   C/C++ file before the #include of this file; the implementation
+//   will be generated in that file.
+//
+//   If you define the symbol STB_VOXEL_RENDER_STATIC, then the
+//   implementation will be private to that file.
+//
+//
+// FEATURES
+//
+//   - you can choose textured blocks with the features below,
+//     or colored voxels with 2^24 colors and no textures.
+//
+//   - voxels are mostly just cubes, but there's support for
+//     half-height cubes and diagonal slopes, half-height
+//     diagonals, and even odder shapes especially for doing
+//     more-continuous "ground".
+//
+//   - texture coordinates are projections along one of the major
+//     axes, with the per-texture scaling.
+//
+//   - a number of aspects of the shader and the vertex format
+//     are configurable; the library generally takes care of
+//     coordinating the vertex format with the mesh for you.
+//
+//
+// FEATURES (SHADER PERSPECTIVE)
+//
+//   - vertices aligned on integer lattice, z on multiples of 0.5
+//   - per-vertex "lighting" or "ambient occlusion" value (6 bits)
+//   - per-vertex texture crossfade (3 bits)
+//
+//   - per-face texture #1 id (8-bit index into array texture)
+//   - per-face texture #2 id (8-bit index into second array texture)
+//   - per-face color (6-bit palette index, 2 bits of per-texture boolean enable)
+//   - per-face 5-bit normal for lighting calculations & texture coord computation
+//   - per-face 2-bit texture matrix rotation to rotate faces
+//
+//   - indexed-by-texture-id scale factor (separate for texture #1 and texture #2)
+//   - indexed-by-texture-#2-id blend mode (alpha composite or modulate/multiply);
+//     the first is good for decals, the second for detail textures, "light maps",
+//     etc; both modes are controlled by texture #2's alpha, scaled by the
+//     per-vertex texture crossfade and the per-face color (if enabled on texture #2);
+//     modulate/multiply multiplies by an extra factor of 2.0 so that if you
+//     make detail maps whose average brightness is 0.5 everything works nicely.
+//
+//   - ambient lighting: half-lambert directional plus constant, all scaled by vertex ao
+//   - face can be fullbright (emissive), controlled by per-face color
+//   - installable lighting, with default single-point-light
+//   - installable fog, with default hacked smoothstep
+//
+//  Note that all the variations of lighting selection and texture
+//  blending are run-time conditions in the shader, so they can be
+//  intermixed in a single mesh.
+//
+//
+// INTEGRATION ARC
+//
+//   The way to get this library to work from scratch is to do the following:
+//
+//      Step 1. define STBVOX_CONFIG_MODE to 0
+//
+//        This mode uses only vertex attributes and uniforms, and is easiest
+//        to get working. It requires 32 bytes per quad and limits the
+//        size of some tables to avoid hitting uniform limits.
+//
+//      Step 2. define STBVOX_CONFIG_MODE to 1
+//
+//        This requires using a texture buffer to store the quad data,
+//        reducing the size to 20 bytes per quad.
+//
+//      Step 3. define STBVOX_CONFIG_PREFER_TEXBUFFER
+//
+//        This causes some uniforms to be stored as texture buffers
+//        instead. This increases the size of some of those tables,
+//        and avoids a potential slow path (gathering non-uniform
+//        data from uniforms) on some hardware.
+//
+//   In the future I might add additional modes that have significantly
+//   smaller meshes but reduce features, down as small as 6 bytes per quad.
+//   See elsewhere in this file for a table of candidate modes. Switching
+//   to a mode will require changing some of your mesh creation code, but
+//   everything else should be seamless. (And I'd like to change the API
+//   so that mesh creation is data-driven the way the uniforms are, and
+//   then you wouldn't even have to change anything but the mode number.)
+//
+//
+// IMPROVEMENTS FOR SHIP-WORTHY PROGRAMS USING THIS LIBRARY
+//
+//   I currently tolerate a certain level of "bugginess" in this library.
+//
+//   I'm referring to things which look a little wrong (as long as they
+//   don't cause holes or cracks in the output meshes), or things which
+//   do not produce as optimal a mesh as possible. Notable examples:
+//
+//        -  incorrect lighting on slopes
+//        -  inefficient meshes for vheight blocks
+//
+//   I am willing to do the work to improve these things if someone is
+//   going to ship a substantial program that would be improved by them.
+//   (It need not be commercial, nor need it be a game.) I just didn't
+//   want to do the work up front if it might never be leveraged. So just
+//   submit a bug report as usual (github is preferred), but add a note
+//   that this is for a thing that is really going to ship. (That means
+//   you need to be far enough into the project that it's clear you're
+//   committed to it; not during early exploratory development.)
+//
+//
+// VOXEL MESH API
+//
+//   Context
+//
+//     To understand the API, make sure you first understand the feature set
+//     listed above.
+//
+//     Because the vertices are compact, they have very limited spatial
+//     precision. Thus a single mesh can only contain the data for a limited
+//     area. To make very large voxel maps, you'll need to build multiple
+//     vertex buffers. (But you want this anyway for frustum culling.)
+//
+//     Each generated mesh has three components:
+//             - vertex data (vertex buffer)
+//             - face data (optional, stored in texture buffer)
+//             - mesh transform (uniforms)
+//
+//     Once you've generated the mesh with this library, it's up to you
+//     to upload it to the GPU, to keep track of the state, and to render
+//     it.
+//
+//   Concept
+//
+//     The basic design is that you pass in one or more 3D arrays; each array
+//     is (typically) one-byte-per-voxel and contains information about one
+//     or more properties of each voxel.
+//
+//     Because there is so much per-vertex and per-face data possible
+//     in the output, and each voxel can have 6 faces and 8 vertices, it
+//     would require a very large data structure to describe all
+//     of the possibilities, and this would cause the mesh-creation
+//     process to be slow. Instead, the API provides multiple ways
+//     to express each property, some more compact, others less so;
+//     each such way has some limitations on what it can express.
+//
+//     Note that there are so many paths and combinations, not all of them
+//     have been tested. Just report bugs and I'll fix 'em.
+//
+//   Details
+//
+//     See the API documentation in the header-file section.
+//
+//
+// CONTRIBUTORS
+//
+//   Features             Porting            Bugfixes & Warnings
+//  Sean Barrett                          github:r-leyh   Jesus Fernandez
+//                                        Miguel Lechon   github:Arbeiterunfallversicherungsgesetz
+//                                        Thomas Frase    James Hofmann
+//                                        Stephen Olsen   github:guitarfreak
+//
+// VERSION HISTORY
+//
+//   0.89   (2020-02-02)  bugfix in sample code
+//   0.88   (2019-03-04)  fix warnings
+//   0.87   (2019-02-25)  fix warning
+//   0.86   (2019-02-07)  fix typos in comments
+//   0.85   (2017-03-03)  add block_selector (by guitarfreak)
+//   0.84   (2016-04-02)  fix GLSL syntax error on glModelView path
+//   0.83   (2015-09-13)  remove non-constant struct initializers to support more compilers
+//   0.82   (2015-08-01)  added input.packed_compact to store rot, vheight & texlerp efficiently
+//                        fix broken tex_overlay2
+//   0.81   (2015-05-28)  fix broken STBVOX_CONFIG_OPTIMIZED_VHEIGHT
+//   0.80   (2015-04-11)  fix broken STBVOX_CONFIG_ROTATION_IN_LIGHTING refactoring
+//                        change STBVOX_MAKE_LIGHTING to STBVOX_MAKE_LIGHTING_EXT so
+//                                    that header defs don't need to see config vars
+//                        add STBVOX_CONFIG_VHEIGHT_IN_LIGHTING and other vheight fixes
+//                        added documentation for vheight ("weird slopes")
+//   0.79   (2015-04-01)  fix the missing types from 0.78; fix string constants being const
+//   0.78   (2015-04-02)  bad "#else", compile as C++
+//   0.77   (2015-04-01)  documentation tweaks, rename config var to STB_VOXEL_RENDER_STATIC
+//   0.76   (2015-04-01)  typos, signed/unsigned shader issue, more documentation
+//   0.75   (2015-04-01)  initial release
+//
+//
+// HISTORICAL FOUNDATION
+//
+//   stb_voxel_render   20-byte quads   2015/01
+//   zmc engine         32-byte quads   2013/12
+//   zmc engine         96-byte quads   2011/10
+//
+//
+// LICENSE
+//
+//   See end of file for license information.
+
+#ifndef INCLUDE_STB_VOXEL_RENDER_H
+#define INCLUDE_STB_VOXEL_RENDER_H
+
+#include <stdlib.h>
+
+typedef struct stbvox_mesh_maker stbvox_mesh_maker;
+typedef struct stbvox_input_description stbvox_input_description;
+
+#ifdef STB_VOXEL_RENDER_STATIC   // STBVXDEC is the linkage prefix on every API declaration below
+#define STBVXDEC static          // implementation is private to the file that includes it
+#else
+#define STBVXDEC extern          // default when STB_VOXEL_RENDER_STATIC is not defined
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// CONFIGURATION MACROS
+//
+//  #define STBVOX_CONFIG_MODE <integer>           // REQUIRED
+//     Configures the overall behavior of stb_voxel_render. This
+//     can affect the shaders, the uniform info, and other things.
+//     (If you need more than one mode in the same app, you can
+//     use STB_VOXEL_RENDER_STATIC to create multiple versions
+//     in separate files, and then wrap them.)
+//
+//         Mode value       Meaning
+//             0               Textured blocks, 32-byte quads
+//             1               Textured blocks, 20-byte quads
+//            20               Untextured blocks, 32-byte quads
+//            21               Untextured blocks, 20-byte quads
+//
+//
+//  #define STBVOX_CONFIG_PRECISION_Z  <integer>   // OPTIONAL
+//     Defines the number of bits of fractional position for Z.
+//     Only 0 or 1 are valid. 1 is the default. If 0, then a
+//     single mesh has twice the legal Z range; e.g. in
+//     modes 0,1,20,21, Z in the mesh can extend to 511 instead
+//     of 255. However, half-height blocks cannot be used.
+//
+// All of the following are just #ifdef tested so need no values, and are optional.
+//
+//    STBVOX_CONFIG_BLOCKTYPE_SHORT
+//        use unsigned 16-bit values for 'blocktype' in the input instead of 8-bit values
+//
+//    STBVOX_CONFIG_OPENGL_MODELVIEW
+//        use the gl_ModelView matrix rather than the explicit uniform
+//
+//    STBVOX_CONFIG_HLSL
+//        NOT IMPLEMENTED! Define HLSL shaders instead of GLSL shaders
+//
+//    STBVOX_CONFIG_PREFER_TEXBUFFER
+//        Stores many of the uniform arrays in texture buffers instead,
+//        so they can be larger and may be more efficient on some hardware.
+//
+//    STBVOX_CONFIG_LIGHTING_SIMPLE
+//        Creates a simple lighting engine with a single point light source
+//        in addition to the default half-lambert ambient light.
+//
+//    STBVOX_CONFIG_LIGHTING
+//        Declares a lighting function hook; you must append a lighting function
+//        to the shader before compiling it:
+//            vec3 compute_lighting(vec3 pos, vec3 norm, vec3 albedo, vec3 ambient);
+//        'ambient' is the half-lambert ambient light with vertex ambient-occlusion applied
+//
+//    STBVOX_CONFIG_FOG_SMOOTHSTEP
+//        Defines a simple unrealistic fog system designed to maximize
+//        unobscured view distance while not looking too weird when things
+//        emerge from the fog. Configured using an extra array element
+//        in the STBVOX_UNIFORM_ambient uniform.
+//
+//    STBVOX_CONFIG_FOG
+//        Defines a fog function hook; you must append a fog function to
+//        the shader before compiling it:
+//            vec3 compute_fog(vec3 color, vec3 relative_pos, float fragment_alpha);
+//        "color" is the incoming pre-fogged color, fragment_alpha is the alpha value,
+//        and relative_pos is the vector from the point to the camera in worldspace
+//
+//    STBVOX_CONFIG_DISABLE_TEX2
+//        This disables all processing of texture 2 in the shader in case
+//        you don't use it. Eventually this could be replaced with a mode
+//        that omits the unused data entirely.
+//
+//    STBVOX_CONFIG_TEX1_EDGE_CLAMP
+//    STBVOX_CONFIG_TEX2_EDGE_CLAMP
+//        If you want to edge clamp the textures, instead of letting them wrap,
+//        set this flag. By default stb_voxel_render relies on texture wrapping
+//        to simplify texture coordinate generation. This flag forces it to do
+//        it correctly, although there can still be minor artifacts.
+//
+//    STBVOX_CONFIG_ROTATION_IN_LIGHTING
+//        Changes the meaning of the 'lighting' mesher input variable to also
+//        store the rotation; see later discussion.
+//
+//    STBVOX_CONFIG_VHEIGHT_IN_LIGHTING
+//        Changes the meaning of the 'lighting' mesher input variable to also
+//        store the vheight; see later discussion. Cannot use both this and
+//        the previous variable.
+//
+//    STBVOX_CONFIG_PREMULTIPLIED_ALPHA
+//        Adjusts the shader calculations on the assumption that tex1.rgba,
+//        tex2.rgba, and color.rgba all use premultiplied values, and that
+//        the output of the fragment shader should be premultiplied.
+//
+//    STBVOX_CONFIG_UNPREMULTIPLY
+//        Only meaningful if STBVOX_CONFIG_PREMULTIPLIED_ALPHA is defined.
+//        Changes the behavior described above so that the inputs are
+//        still premultiplied alpha, but the output of the fragment
+//        shader is not premultiplied alpha. This is needed when allowing
+//        non-unit alpha values but not doing alpha-blending (for example
+//        when alpha testing).
+//
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// MESHING
+//
+// A mesh represents a (typically) small chunk of a larger world.
+// Meshes encode coordinates using small integers, so those
+// coordinates must be relative to some base location.
+// All of the coordinates in the functions below use
+// these relative coordinates unless explicitly stated
+// otherwise.
+//
+// Input to the meshing step is documented further down
+
+STBVXDEC void stbvox_init_mesh_maker(stbvox_mesh_maker *mm);
+// Call this function to initialize a mesh-maker context structure
+// used to build meshes. You should have one context per thread
+// that's building meshes.
+
+STBVXDEC void stbvox_set_buffer(stbvox_mesh_maker *mm, int mesh, int slot, void *buffer, size_t len);
+// Call this to set the buffer into which stbvox will write the mesh
+// it creates. It can build more than one mesh in parallel (distinguished
+// by the 'mesh' parameter), and each mesh can be made up of more than
+// one buffer (distinguished by the 'slot' parameter).
+//
+// Multiple meshes are under your control; use the 'selector' input
+// variable to choose which mesh each voxel's vertices are written to.
+// For example, you can use this to generate separate meshes for opaque
+// and transparent data.
+//
+// You can query the number of slots by calling stbvox_get_buffer_count
+// described below. The meaning of the buffer for each slot depends
+// on STBVOX_CONFIG_MODE.
+//
+//   In mode 0 & mode 20, there is only one slot. The mesh data for that
+//   slot is two interleaved vertex attributes: attr_vertex, a single
+//   32-bit uint, and attr_face, a single 32-bit uint.
+//
+//   In mode 1 & mode 21, there are two slots. The first buffer should
+//   be four times as large as the second buffer. The first buffer
+//   contains a single vertex attribute: 'attr_vertex', a single 32-bit uint.
+//   The second buffer contains texture buffer data (an array of 32-bit uints)
+//   that will be accessed through the sampler identified by STBVOX_UNIFORM_face_data.
+
+STBVXDEC int stbvox_get_buffer_count(stbvox_mesh_maker *mm);
+// Returns the number of buffers needed per mesh as described above.
+
+STBVXDEC int stbvox_get_buffer_size_per_quad(stbvox_mesh_maker *mm, int slot);
+// Returns how much of a given buffer will get used per quad. This
+// allows you to choose correct relative sizes for each buffer, although
+// the values are fixed based on the configuration you've selected at
+// compile time, and the details are described in stbvox_set_buffer.
+
+STBVXDEC void stbvox_set_default_mesh(stbvox_mesh_maker *mm, int mesh);
+// Selects which mesh the mesher will output to (see previous function)
+// if the input doesn't specify a per-voxel selector. (I doubt this is
+// useful, but it's here just in case.)
+
+STBVXDEC stbvox_input_description *stbvox_get_input_description(stbvox_mesh_maker *mm);
+// This function call returns a pointer to the stbvox_input_description part
+// of stbvox_mesh_maker (which you should otherwise treat as opaque). You
+// zero this structure, then fill out the relevant pointers to the data
+// describing your voxel object/world.
+//
+// See further documentation at the description of stbvox_input_description below.
+
+STBVXDEC void stbvox_set_input_stride(stbvox_mesh_maker *mm, int x_stride_in_elements, int y_stride_in_elements);
+// This sets the stride between successive elements of the 3D arrays
+// in the stbvox_input_description. Z values are always stored consecutively.
+// (The preferred coordinate system for stbvox is X right, Y forwards, Z up.)
+
+STBVXDEC void stbvox_set_input_range(stbvox_mesh_maker *mm, int x0, int y0, int z0, int x1, int y1, int z1);
+// This sets the range of values in the 3D array for the voxels that
+// the mesh generator will convert. The lower values are inclusive,
+// the higher values are exclusive, so (0,0,0) to (16,16,16) generates
+// mesh data associated with voxels up to (15,15,15) but no higher.
+//
+// The mesh generator generates faces at the boundary between open space
+// and solid space but associates them with the solid space, so if (15,0,0)
+// is open and (16,0,0) is solid, then the mesh will contain the boundary
+// between them if x0 <= 16 and x1 > 16.
+//
+// Note that the mesh generator will access array elements 1 beyond the
+// limits set in these parameters. For example, if you set the limits
+// to be (0,0,0) and (16,16,16), then the generator will access all of
+// the voxels between (-1,-1,-1) and (16,16,16), including (16,16,16).
+// You may have to do pointer arithmetic to make it work.
+//
+// For example, caveview processes mesh chunks that are 32x32x16, but it
+// does this using input buffers that are 34x34x18.
+//
+// The lower limits are x0 >= 0, y0 >= 0, and z0 >= 0.
+//
+// The upper limits are mode dependent, but all the current methods are
+// limited to x1 < 127, y1 < 127, z1 < 255. Note that these are not
+// powers of two; if you want to use power-of-two chunks (to make
+// it efficient to decide which chunk a coordinate falls in), you're
+// limited to at most x1=64, y1=64, z1=128. For classic Minecraft-style
+// worlds with limited vertical extent, I recommend using a single
+// chunk for the entire height, which limits the height to 255 blocks
+// (one less than Minecraft), and only chunk the map in X & Y.
+
+STBVXDEC int stbvox_make_mesh(stbvox_mesh_maker *mm);
+// Call this function to create mesh data for the currently configured
+// set of input data. This appends to the currently configured mesh output
+// buffer. Returns 1 on success. If there is not enough room in the buffer,
+// it outputs as much as it can, and returns 0; you need to switch output
+// buffers (either by calling stbvox_set_buffer to set new buffers, or
+// by copying the data out and calling stbvox_reset_buffers), and then
+// call this function again without changing any of the input parameters.
+//
+// Note that this function appends; you can call it multiple times to
+// build a single mesh. For example, caveview uses chunks that are
+// 32x32x255, but builds the mesh for it by processing 32x32x16 at a time
+// (this is faster as it reuses the same 34x34x18 input buffers rather
+// than needing 34x34x257 input buffers).
+
+// Once you're done creating a mesh into a given buffer,
+// consider the following functions:
+
+STBVXDEC int stbvox_get_quad_count(stbvox_mesh_maker *mm, int mesh);
+// Returns the number of quads in the mesh currently generated by mm.
+// This is the sum of all consecutive stbvox_make_mesh runs appending
+// to the same buffer. 'mesh' distinguishes between the multiple user
+// meshes available via 'selector' or stbvox_set_default_mesh.
+//
+// Typically you use this function when you're done building the mesh
+// and want to record how to draw it.
+//
+// Note that there are no index buffers; the data stored in the buffers
+// should be drawn as quads (e.g. with GL_QUADS); if your API does not
+// support quads, you can create a single index buffer large enough to
+// draw your largest vertex buffer, and reuse it for every rendering.
+// (Note that if you use 32-bit indices, you'll use 24 bytes of bandwidth
+// per quad, more than the 20 bytes for the vertex/face mesh data.)
+
+STBVXDEC void stbvox_set_mesh_coordinates(stbvox_mesh_maker *mm, int x, int y, int z);
+// Sets the global coordinates for this chunk, such that (0,0,0) relative
+// coordinates will be at (x,y,z) in global coordinates.
+
+STBVXDEC void stbvox_get_bounds(stbvox_mesh_maker *mm, float bounds[2][3]);
+// Returns the bounds for the mesh in global coordinates. Use this
+// for e.g. frustum culling the mesh. @BUG: this just uses the
+// values from stbvox_set_input_range(), so if you build by
+// appending multiple values, this will be wrong, and you need to
+// set stbvox_set_input_range() to the full size. Someday this
+// will switch to tracking the actual bounds of the *mesh*, though.
+
+STBVXDEC void stbvox_get_transform(stbvox_mesh_maker *mm, float transform[3][3]);
+// Returns the 'transform' data for the shader uniforms. It is your
+// job to set this to the shader before drawing the mesh. It is the
+// only uniform that needs to change per-mesh. Note that it is not
+// a 3x3 matrix, but rather a scale to decode fixed point numbers as
+// floats, a translate from relative to global space, and a special
+// translation for texture coordinate generation that avoids
+// floating-point precision issues. @TODO: currently we add the
+// global translation to the vertex, then multiply by modelview,
+// but this means if camera location and vertex are far from the
+// origin, we lose precision. Need to make a special modelview with
+// the translation (or some of it) factored out to avoid this.
+
+STBVXDEC void stbvox_reset_buffers(stbvox_mesh_maker *mm);
+// Call this function if you're done with the current output buffer
+// but want to reuse it (e.g. you're done appending with
+// stbvox_make_mesh and you've copied the data out to your graphics API
+// so can reuse the buffer).
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RENDERING
+//
+
+STBVXDEC char *stbvox_get_vertex_shader(void);
+// Returns the (currently GLSL-only) vertex shader.
+
+STBVXDEC char *stbvox_get_fragment_shader(void);
+// Returns the (currently GLSL-only) fragment shader.
+// You can override the lighting and fogging calculations
+// by appending data to the end of these; see the #define
+// documentation for more information.
+
+STBVXDEC char *stbvox_get_fragment_shader_alpha_only(void);
+// Returns a slightly cheaper fragment shader that computes
+// alpha but not color. This is useful for e.g. a depth-only
+// pass when using alpha test.
+
+typedef struct stbvox_uniform_info stbvox_uniform_info;
+
+STBVXDEC int stbvox_get_uniform_info(stbvox_uniform_info *info, int uniform);
+// Gets the information about a uniform necessary for you to
+// set up each uniform with a minimal amount of explicit code.
+// See the sample code after the structure definition for stbvox_uniform_info,
+// further down in this header section.
+//
+// "uniform" is from the list immediately following. For many
+// of these, default values are provided which you can set.
+// Most values are shared for most draw calls; e.g. for stateful
+// APIs you can set most of the state only once. Only
+// STBVOX_UNIFORM_transform needs to change per draw call.
+//
+// STBVOX_UNIFORM_texscale
+//    64- or 128-long vec4 array. (128 only if STBVOX_CONFIG_PREFER_TEXBUFFER)
+//    x: scale factor to apply to texture #1. must be a power of two. 1.0 means 'face-sized'
+//    y: scale factor to apply to texture #2. must be a power of two. 1.0 means 'face-sized'
+//    z: blend mode indexed by texture #2. 0.0 is alpha compositing; 1.0 is multiplication.
+//    w: unused currently. @TODO use to support texture animation?
+//
+//    Texscale is indexed by the bottom 6 or 7 bits of the texture id; thus for
+//    example the texture at index 0 in the array and the texture in index 128 of
+//    the array must be scaled the same. This means that if you only have 64 or 128
+//    unique textures, they all get distinct values anyway; otherwise you have
+//    to group them in pairs or sets of four.
+//
+// STBVOX_UNIFORM_ambient
+//    4-long vec4 array:
+//      ambient[0].xyz   - negative of direction of a directional light for half-lambert
+//      ambient[1].rgb   - color of light scaled by NdotL (can be negative)
+//      ambient[2].rgb   - constant light added to above calculation;
+//                         effectively light ranges from ambient[2]-ambient[1] to ambient[2]+ambient[1]
+//      ambient[3].rgb   - fog color for STBVOX_CONFIG_FOG_SMOOTHSTEP
+//      ambient[3].a     - reciprocal of squared distance of farthest fog point (viewing distance)
+
+
+                               //  +----- has a default value
+                               //  |  +-- you should always use the default value
+enum                           //  V  V
+{                              //  ------------------------------------------------
+   STBVOX_UNIFORM_face_data,   //  n      the sampler with the face texture buffer
+   STBVOX_UNIFORM_transform,   //  n      the transform data from stbvox_get_transform
+   STBVOX_UNIFORM_tex_array,   //  n      an array of two texture samplers containing the two texture arrays
+   STBVOX_UNIFORM_texscale,    //  Y      a table of texture properties, see above
+   STBVOX_UNIFORM_color_table, //  Y      64 vec4 RGBA values; a default palette is provided; if A > 1.0, fullbright
+   STBVOX_UNIFORM_normals,     //  Y  Y   table of normals, internal-only
+   STBVOX_UNIFORM_texgen,      //  Y  Y   table of texgen vectors, internal-only
+   STBVOX_UNIFORM_ambient,     //  n      lighting & fog info, see above
+   STBVOX_UNIFORM_camera_pos,  //  Y      camera position in global voxel space (for lighting & fog)
+
+   STBVOX_UNIFORM_count,       //         number of uniform ids listed above (not itself a uniform)
+};
+
+enum   // uniform data types; presumably the values reported in stbvox_uniform_info.type (confirm)
+{
+   STBVOX_UNIFORM_TYPE_none,
+   STBVOX_UNIFORM_TYPE_sampler,
+   STBVOX_UNIFORM_TYPE_vec2,
+   STBVOX_UNIFORM_TYPE_vec3,      // e.g. 12 bytes per array element
+   STBVOX_UNIFORM_TYPE_vec4,
+};
+
+struct stbvox_uniform_info      // description of one shader uniform, obtained via stbvox_get_uniform_info()
+{
+   int type;                    // which type of uniform
+   int bytes_per_element;       // the size of each uniform array element (e.g. vec3 = 12 bytes)
+   int array_length;            // length of the uniform array
+   char *name;                  // name in the shader (what you look the uniform up by) @TODO use numeric binding
+   float *default_value;        // if not NULL, you can use this as the uniform pointer
+   int use_tex_buffer;          // if true, then the uniform is a sampler but the data can come from default_value
+};
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Uniform sample code
+//
+
+#if 0
+// Sample: run this once per frame before drawing all the meshes. It sets every uniform
+// except 'transform', which you still need to set separately for every mesh.
+void setup_uniforms(GLuint shader, float camera_pos[4], GLuint tex1, GLuint tex2)
+{
+   int i;
+   glUseProgram(shader); // so uniform binding works
+   for (i=0; i < STBVOX_UNIFORM_count; ++i) {
+      stbvox_uniform_info sui;
+      if (stbvox_get_uniform_info(&sui, i)) {
+         GLint loc = glGetUniformLocation(shader, sui.name);
+         if (loc != -1) { // -1: no active uniform of that name in this shader, so skip it
+            switch (i) {
+               case STBVOX_UNIFORM_camera_pos: // only needed for fog
+                  glUniform4fv(loc, sui.array_length, camera_pos);
+                  break;
+
+               case STBVOX_UNIFORM_tex_array: {
+                  GLint tex_unit[2] = { 0, 1 }; // your choice of samplers; GLint because glUniform1iv takes const GLint*
+                  glUniform1iv(loc, 2, tex_unit);
+
+                  glActiveTexture(GL_TEXTURE0 + tex_unit[0]); glBindTexture(GL_TEXTURE_2D_ARRAY, tex1);
+                  glActiveTexture(GL_TEXTURE0 + tex_unit[1]); glBindTexture(GL_TEXTURE_2D_ARRAY, tex2);
+                  glActiveTexture(GL_TEXTURE0); // reset to default
+                  break;
+               }
+
+               case STBVOX_UNIFORM_face_data:
+                  glUniform1i(loc, SAMPLER_YOU_WILL_BIND_PER_MESH_FACE_DATA_TO);
+                  break;
+
+               case STBVOX_UNIFORM_ambient:     // you definitely want to override this
+               case STBVOX_UNIFORM_color_table: // you might want to override this
+               case STBVOX_UNIFORM_texscale:    // you may want to override this
+                  glUniform4fv(loc, sui.array_length, sui.default_value);
+                  break;
+
+               case STBVOX_UNIFORM_normals:     // you never want to override this
+               case STBVOX_UNIFORM_texgen:      // you never want to override this
+                  glUniform3fv(loc, sui.array_length, sui.default_value);
+                  break;
+            }
+         }
+      }
+   }
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// INPUT TO MESHING
+//
+
+// Shapes of blocks that aren't always cubes
+enum
+{
+   STBVOX_GEOM_empty,
+   STBVOX_GEOM_knockout,  // creates a hole in the mesh
+   STBVOX_GEOM_solid,
+   STBVOX_GEOM_transp,    // solid geometry, but transparent contents so neighbors generate normally, unless same blocktype
+
+   // following 4 can be represented by vheight as well
+   STBVOX_GEOM_slab_upper,
+   STBVOX_GEOM_slab_lower,
+   STBVOX_GEOM_floor_slope_north_is_top,
+   STBVOX_GEOM_ceil_slope_north_is_bottom,
+
+   STBVOX_GEOM_floor_slope_north_is_top_as_wall_UNIMPLEMENTED,   // same as floor_slope above, but uses wall's texture & texture projection
+   STBVOX_GEOM_ceil_slope_north_is_bottom_as_wall_UNIMPLEMENTED,
+   STBVOX_GEOM_crossed_pair,    // corner-to-corner pairs, with normal vector bumped upwards
+   STBVOX_GEOM_force,           // like GEOM_transp, but faces visible even if neighbor is same type, e.g. minecraft fancy leaves
+
+   // these access vheight input (the explicit 12 equals the value the enum would reach implicitly here)
+   STBVOX_GEOM_floor_vheight_03 = 12,  // diagonal is SW-NE
+   STBVOX_GEOM_floor_vheight_12,       // diagonal is SE-NW
+   STBVOX_GEOM_ceil_vheight_03,
+   STBVOX_GEOM_ceil_vheight_12,
+
+   STBVOX_GEOM_count, // number of geom cases (16)
+};
+
+enum
+{
+   STBVOX_FACE_east,   // toward increasing x
+   STBVOX_FACE_north,  // toward increasing y
+   STBVOX_FACE_west,   // opposite of east
+   STBVOX_FACE_south,  // opposite of north
+   STBVOX_FACE_up,     // toward increasing z
+   STBVOX_FACE_down,   // opposite of up
+
+   STBVOX_FACE_count,  // number of faces listed above (6)
+};
+
+#ifdef STBVOX_CONFIG_BLOCKTYPE_SHORT
+typedef unsigned short stbvox_block_type;
+#else
+typedef unsigned char stbvox_block_type;
+#endif
+
+// 24-bit color
+typedef struct
+{
+   unsigned char r,g,b;
+} stbvox_rgb;
+
+#define STBVOX_COLOR_TEX1_ENABLE   64
+#define STBVOX_COLOR_TEX2_ENABLE  128
+
+// This is the data structure you fill out. Most of the arrays can be
+// NULL, except when one is required to get the value to index another.
+//
+// The compass system used in the following descriptions is:
+//     east means increasing x
+//     north means increasing y
+//     up means increasing z
+struct stbvox_input_description
+{
+   unsigned char lighting_at_vertices;
+   // The default is lighting values (i.e. ambient occlusion) are at block
+   // center, and the vertex light is gathered from those adjacent block
+   // centers that the vertex is facing. This makes smooth lighting
+   // consistent across adjacent faces with the same orientation.
+   //
+   // Setting this flag to non-zero gives you explicit control
+   // of light at each vertex, but now the lighting/ao will be
+   // shared by all vertices at the same point, even if they
+   // have different normals.
+
+   // these are mostly 3D maps you use to define your voxel world, using x_stride and y_stride
+   // note that for cache efficiency, you want to use the block_foo palettes as much as possible instead
+
+   stbvox_rgb *rgb;
+   // Indexed by 3D coordinate.
+   // 24-bit voxel color for STBVOX_CONFIG_MODE = 20 or 21 only
+
+   unsigned char *lighting;
+   // Indexed by 3D coordinate. The lighting value / ambient occlusion
+   // value that is used to define the vertex lighting values.
+   // The raw lighting values are defined at the center of blocks
+   // (or at vertex if 'lighting_at_vertices' is true).
+   //
+   // If the macro STBVOX_CONFIG_ROTATION_IN_LIGHTING is defined,
+   // then an additional 2-bit block rotation value is stored
+   // in this field as well.
+   //
+   // Encode with STBVOX_MAKE_LIGHTING_EXT(lighting,rot)--here
+   // 'lighting' should still be 8 bits, as the macro will
+   // discard the bottom bits automatically. Similarly, if
+   // using STBVOX_CONFIG_VHEIGHT_IN_LIGHTING, encode with
+   // STBVOX_MAKE_LIGHTING_EXT(lighting,vheight).
+   //
+   // (Rationale: rotation needs to be independent of blocktype,
+   // but is only 2 bits so doesn't want to be its own array.
+   // Lighting is the one thing that was likely to already be
+   // in use and that I could easily steal 2 bits from.)
+
+   stbvox_block_type *blocktype;
+   // Indexed by 3D coordinate. This is a core "block type" value, which is used
+   // to index into other arrays; essentially a "palette". This is much more
+   // memory-efficient and performance-friendly than storing the values explicitly,
+   // but only makes sense if the values are always synchronized.
+   //
+   // If a voxel's blocktype is 0, it is assumed to be empty (STBVOX_GEOM_empty),
+   // and no other blocktypes should be STBVOX_GEOM_empty. (Only if you do not
+   // have blocktypes should STBVOX_GEOM_empty ever be used.)
+   //
+   // Normally it is an unsigned byte, but you can override it to be
+   // a short if you have too many blocktypes.
+
+   unsigned char *geometry;
+   // Indexed by 3D coordinate. Contains the geometry type for the block.
+   // Also contains a 2-bit rotation for how the whole block is rotated.
+   // Also includes a 2-bit vheight value when using shared vheight values.
+   // See the separate vheight documentation.
+   // Encode with STBVOX_MAKE_GEOMETRY(geom, rot, vheight)
+
+   unsigned char *block_geometry;
+   // Array indexed by blocktype containing the geometry for this block, plus
+   // a 2-bit "simple rotation". Note rotation has limited use since it's not
+   // independent of blocktype.
+   //
+   // Encode with STBVOX_MAKE_GEOMETRY(geom,simple_rot,0)
+
+   unsigned char *block_tex1;
+   // Array indexed by blocktype containing the texture id for texture #1.
+
+   unsigned char (*block_tex1_face)[6];
+   // Array indexed by blocktype and face containing the texture id for texture #1.
+   // The N/E/S/W face choices can be rotated by one of the rotation selectors;
+   // The top & bottom face textures will rotate to match.
+   // Note that it only makes sense to use one of block_tex1 or block_tex1_face;
+   // this pattern repeats throughout and this notice is not repeated.
+
+   unsigned char *tex2;
+   // Indexed by 3D coordinate. Contains the texture id for texture #2
+   // to use on all faces of the block.
+
+   unsigned char *block_tex2;
+   // Array indexed by blocktype containing the texture id for texture #2.
+
+   unsigned char (*block_tex2_face)[6];
+   // Array indexed by blocktype and face containing the texture id for texture #2.
+   // The N/E/S/W face choices can be rotated by one of the rotation selectors;
+   // The top & bottom face textures will rotate to match.
+
+   unsigned char *color;
+   // Indexed by 3D coordinate. Contains the color for all faces of the block.
+   // The core color value is 0..63.
+   // Encode with STBVOX_MAKE_COLOR(color_number, tex1_enable, tex2_enable)
+
+   unsigned char *block_color;
+   // Array indexed by blocktype containing the color value to apply to the faces.
+   // The core color value is 0..63.
+   // Encode with STBVOX_MAKE_COLOR(color_number, tex1_enable, tex2_enable)
+
+   unsigned char (*block_color_face)[6];
+   // Array indexed by blocktype and face containing the color value to apply to that face.
+   // The core color value is 0..63.
+   // Encode with STBVOX_MAKE_COLOR(color_number, tex1_enable, tex2_enable)
+
+   unsigned char *block_texlerp;
+   // Array indexed by blocktype containing 3-bit scalar for texture #2 alpha
+   // (known throughout as 'texlerp'). This is constant over every face even
+   // though the property is potentially per-vertex.
+
+   unsigned char (*block_texlerp_face)[6];
+   // Array indexed by blocktype and face containing 3-bit scalar for texture #2 alpha.
+   // This is constant over the face even though the property is potentially per-vertex.
+
+   unsigned char *block_vheight;
+   // Array indexed by blocktype containing the vheight values for the
+   // top or bottom face of this block. These will rotate properly if the
+   // block is rotated. See discussion of vheight.
+   // Encode with STBVOX_MAKE_VHEIGHT(sw_height, se_height, nw_height, ne_height)
+
+   unsigned char *selector;
+   // Array indexed by 3D coordinates indicating which output mesh to select.
+
+   unsigned char *block_selector;
+   // Array indexed by blocktype indicating which output mesh to select.
+
+   unsigned char *side_texrot;
+   // Array indexed by 3D coordinates encoding 2-bit texture rotations for the
+   // faces on the E/N/W/S sides of the block.
+   // Encode with STBVOX_MAKE_SIDE_TEXROT(rot_e, rot_n, rot_w, rot_s)
+
+   unsigned char *block_side_texrot;
+   // Array indexed by blocktype encoding 2-bit texture rotations for the faces
+   // on the E/N/W/S sides of the block.
+   // Encode with STBVOX_MAKE_SIDE_TEXROT(rot_e, rot_n, rot_w, rot_s)
+
+   unsigned char *overlay;                 // index into palettes listed below
+   // Indexed by 3D coordinate. If 0, there is no overlay. If non-zero,
+   // it indexes into the arrays below and overrides the values
+   // defined by the blocktype.
+
+   unsigned char (*overlay_tex1)[6];
+   // Array indexed by overlay value and face, containing an override value
+   // for the texture id for texture #1. If 0, the value defined by blocktype
+   // is used.
+
+   unsigned char (*overlay_tex2)[6];
+   // Array indexed by overlay value and face, containing an override value
+   // for the texture id for texture #2. If 0, the value defined by blocktype
+   // is used.
+
+   unsigned char (*overlay_color)[6];
+   // Array indexed by overlay value and face, containing an override value
+   // for the face color. If 0, the value defined by blocktype is used.
+
+   unsigned char *overlay_side_texrot;
+   // Array indexed by overlay value, encoding 2-bit texture rotations for the faces
+   // on the E/N/W/S sides of the block.
+   // Encode with STBVOX_MAKE_SIDE_TEXROT(rot_e, rot_n, rot_w, rot_s)
+
+   unsigned char *rotate;
+   // Indexed by 3D coordinate. Allows independent rotation of several
+   // parts of the voxel, where by rotation I mean swapping textures
+   // and colors between E/N/S/W faces.
+   //    Block: rotates anything indexed by blocktype
+   //    Overlay: rotates anything indexed by overlay
+   //    EColor: rotates faces defined in ecolor_facemask
+   // Encode with STBVOX_MAKE_MATROT(block,overlay,ecolor)
+
+   unsigned char *tex2_for_tex1;
+   // Array indexed by tex1 containing the texture id for texture #2.
+   // You can use this if the two are always/almost-always strictly
+   // correlated (e.g. if tex2 is a detail texture for tex1), as it
+   // will be more efficient (touching fewer cache lines) than using
+   // e.g. block_tex2_face.
+
+   unsigned char *tex2_replace;
+   // Indexed by 3D coordinate. Specifies the texture id for texture #2
+   // to use on a single face of the voxel, which must be E/N/W/S (not U/D).
+   // The texture id is limited to 6 bits unless tex2_facemask is also
+   // defined (see below).
+   // Encode with STBVOX_MAKE_TEX2_REPLACE(tex2, face)
+
+   unsigned char *tex2_facemask;
+   // Indexed by 3D coordinate. Specifies which of the six faces should
+   // have their tex2 replaced by the value of tex2_replace. In this
+   // case, all 8 bits of tex2_replace are used as the texture id.
+   // Encode with STBVOX_MAKE_FACE_MASK(east,north,west,south,up,down)
+
+   unsigned char *extended_color;
+   // Indexed by 3D coordinate. Specifies a value that indexes into
+   // the ecolor arrays below (both of which must be defined).
+
+   unsigned char *ecolor_color;
+   // Indexed by extended_color value, specifies an optional override
+   // for the color value on some faces.
+   // Encode with STBVOX_MAKE_COLOR(color_number, tex1_enable, tex2_enable)
+
+   unsigned char *ecolor_facemask;
+   // Indexed by extended_color value, this specifies which faces the
+   // color in ecolor_color should be applied to. The faces can be
+   // independently rotated by the ecolor value of 'rotate', if it exists.
+   // Encode with STBVOX_MAKE_FACE_MASK(e,n,w,s,u,d)
+
+   unsigned char *color2;
+   // Indexed by 3D coordinates, specifies an alternative color to apply
+   // to some of the faces of the block.
+   // Encode with STBVOX_MAKE_COLOR(color_number, tex1_enable, tex2_enable)
+
+   unsigned char *color2_facemask;
+   // Indexed by 3D coordinates, specifies which faces should use the
+   // color defined in color2. No rotation value is applied.
+   // Encode with STBVOX_MAKE_FACE_MASK(e,n,w,s,u,d)
+
+   unsigned char *color3;
+   // Indexed by 3D coordinates, specifies an alternative color to apply
+   // to some of the faces of the block.
+   // Encode with STBVOX_MAKE_COLOR(color_number, tex1_enable, tex2_enable)
+
+   unsigned char *color3_facemask;
+   // Indexed by 3D coordinates, specifies which faces should use the
+   // color defined in color3. No rotation value is applied.
+   // Encode with STBVOX_MAKE_FACE_MASK(e,n,w,s,u,d)
+
+   unsigned char *texlerp_simple;
+   // Indexed by 3D coordinates, this is the smallest texlerp encoding
+   // that can do useful work. It consists of three values: baselerp,
+   // vertlerp, and face_vertlerp. Baselerp defines the value
+   // to use on all of the faces but one, from the STBVOX_TEXLERP_BASE
+   // values. face_vertlerp is one of the 6 face values (or STBVOX_FACE_NONE)
+   // which specifies the face should use the vertlerp values.
+   // Vertlerp defines a lerp value at every vertex of the mesh.
+   // Thus, one face can have per-vertex texlerp values, and those
+   // values are encoded in the space so that they will be shared
+   // by adjacent faces that also use vertlerp, allowing continuity
+   // (this is used for the "texture crossfade" bit of the release video).
+   // Encode with STBVOX_MAKE_TEXLERP_SIMPLE(baselerp, vertlerp, face_vertlerp)
+
+   // The following texlerp encodings are experimental and maybe not
+   // that useful.
+
+   unsigned char *texlerp;
+   // Indexed by 3D coordinates, this defines four values:
+   //   vertlerp is a lerp value at every vertex of the mesh (using STBVOX_TEXLERP_BASE values).
+   //   ud is the value to use on up and down faces, from STBVOX_TEXLERP_FACE values
+   //   ew is the value to use on east and west faces, from STBVOX_TEXLERP_FACE values
+   //   ns is the value to use on north and south faces, from STBVOX_TEXLERP_FACE values
+   // If any of ud, ew, or ns is STBVOX_TEXLERP_FACE_use_vert, then the
+   // vertlerp values for the vertices are gathered and used for those faces.
+   // Encode with STBVOX_MAKE_TEXLERP(ns,ew,ud,vertlerp)
+
+   unsigned short *texlerp_vert3;
+   // Indexed by 3D coordinates, this works with texlerp and
+   // provides a unique texlerp value for every direction at
+   // every vertex. The same rules of whether faces share values
+   // applies. The STBVOX_TEXLERP_FACE vertlerp value defined in
+   // texlerp is only used for the down direction. The values at
+   // each vertex in other directions are defined in this array,
+   // and each uses the STBVOX_TEXLERP3 values (i.e. full precision
+   // 3-bit texlerp values).
+   // Encode with STBVOX_MAKE_TEXLERP_VERT3(vertlerp_e,vertlerp_n,vertlerp_w,vertlerp_s,vertlerp_u)
+
+   unsigned short *texlerp_face3;          // e:3,n:3,w:3,s:3,u:2,d:2
+   // Indexed by 3D coordinates, this provides a compact way to
+   // fully specify the texlerp value independently for every face,
+   // but doesn't allow per-vertex variation. E/N/W/S values are
+   // encoded using STBVOX_TEXLERP3 values, whereas up and down
+   // use STBVOX_TEXLERP_SIMPLE values.
+   // Encode with STBVOX_MAKE_TEXLERP_FACE3(face_e,face_n,face_w,face_s,face_u,face_d)
+
+   unsigned char *vheight;                 // STBVOX_MAKE_VHEIGHT   -- sw:2, se:2, nw:2, ne:2, doesn't rotate
+   // Indexed by 3D coordinates, this defines the four
+   // vheight values to use if the geometry is STBVOX_GEOM_vheight*.
+   // See the vheight discussion.
+
+   unsigned char *packed_compact;
+   // Stores block rotation, vheight, and texlerp values:
+   //    block rotation: 2 bits
+   //    vertex vheight: 2 bits
+   //    use_texlerp   : 1 bit
+   //    vertex texlerp: 3 bits
+   // If STBVOX_CONFIG_UP_TEXLERP_PACKED is defined, then 'vertex texlerp' is
+   // used for up faces if use_texlerp is 1. If STBVOX_CONFIG_DOWN_TEXLERP_PACKED
+   // is defined, then 'vertex texlerp' is used for down faces if use_texlerp is 1.
+   // Note if those symbols are defined but packed_compact is NULL, the normal
+   // texlerp default will be used.
+   // Encode with STBVOX_MAKE_PACKED_COMPACT(rot, vheight, texlerp, use_texlerp)
+};
+// @OPTIMIZE allow specializing; build a single struct with all of the
+// 3D-indexed arrays combined so it's AoS instead of SoA for better
+// cache efficiency
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//  VHEIGHT DOCUMENTATION
+//
+//  "vheight" is the internal name for the special block types
+//  with sloped tops or bottoms. "vheight" stands for "vertex height".
+//
+//  Note that these blocks are very flexible (there are 256 of them,
+//  although at least 17 of them should never be used), but they
+//  also have a disadvantage that they generate extra invisible
+//  faces; the generator does not currently detect whether adjacent
+//  vheight blocks hide each others sides, so those side faces are
+//  always generated. For a continuous ground terrain, this means
+//  that you may generate 5x as many quads as needed. See notes
+//  on "improvements for shipping products" in the introduction.
+
+enum
+{
+   STBVOX_VERTEX_HEIGHT_0,
+   STBVOX_VERTEX_HEIGHT_half,
+   STBVOX_VERTEX_HEIGHT_1,
+   STBVOX_VERTEX_HEIGHT_one_and_a_half,
+};
+// These are the "vheight" values. Vheight stands for "vertex height".
+// The idea is that for a "floor vheight" block, you take a cube and
+// reposition the top-most vertices at various heights as specified by
+// the vheight values. Similarly, a "ceiling vheight" block takes a
+// cube and repositions the bottom-most vertices.
+//
+// A floor block only adjusts the top four vertices; the bottom four vertices
+// remain at the bottom of the block. The height values are 2 bits,
+// measured in halves of a block; so you can specify heights of 0/2,
+// 1/2, 2/2, or 3/2. 0 is the bottom of the block, 1 is halfway
+// up the block, 2 is the top of the block, and 3 is halfway up the
+// next block (and actually outside of the block). The value 3 is
+// actually legal for floor vheight (but not ceiling), and allows you to:
+//
+//     (A) have smoother terrain by having slopes that cross blocks,
+//         e.g. (1,1,3,3) is a regular-seeming slope halfway between blocks
+//     (B) make slopes steeper than 45-degrees, e.g. (0,0,3,3)
+//
+// (Because only z coordinates have half-block precision, and x&y are
+// limited to block corner precision, it's not possible to make these
+// things "properly" out of blocks, e.g. a half-slope block on its side
+// or a sloped block halfway between blocks that's made out of two blocks.)
+//
+// If you define STBVOX_CONFIG_OPTIMIZED_VHEIGHT, then the top face
+// (or bottom face for a ceiling vheight block) will be drawn as a
+// single quad even if the four vertex heights aren't planar, and a
+// single normal will be used over the entire quad. If you
+// don't define it, then if the top face is non-planar, it will be
+// split into two triangles, each with their own normal/lighting.
+// (Note that since all output from stb_voxel_render is quad meshes,
+// triangles are actually rendered as degenerate quads.) In this case,
+// the distinction between STBVOX_GEOM_floor_vheight_03 and
+// STBVOX_GEOM_floor_vheight_12 comes into play; the former introduces
+// an edge from the SW to NE corner (i.e. from <0,0,?> to <1,1,?>),
+// while the latter introduces an edge from the NW to SE corner
+// (i.e. from <0,1,?> to <1,0,?>.) For a "lazy mesh" look, use
+// exclusively _03 or _12. For a "classic mesh" look, alternate
+// _03 and _12 in a checkerboard pattern. For a "smoothest surface"
+// look, choose the edge based on actual vertex heights.
+//
+// The four vertex heights can come from several places. The simplest
+// encoding is to just use the 'vheight' parameter which stores four
+// explicit vertex heights for every block. This allows total independence,
+// but at the cost of the largest memory usage, 1 byte per 3D block.
+// Encode this with STBVOX_MAKE_VHEIGHT(vh_sw, vh_se, vh_nw, vh_ne).
+// These coordinates are absolute, not affected by block rotations.
+//
+// An alternative if you just want to encode some very specific block
+// types, not all the possibilities--say you just want half-height slopes,
+// so you want (0,0,1,1) and (1,1,2,2)--then you can use block_vheight
+// to specify them. The geometry rotation will cause block_vheight values
+// to be rotated (because it's as if you're just defining a type of
+// block). This value is also encoded with STBVOX_MAKE_VHEIGHT.
+//
+// If you want to save memory and you're creating a "continuous ground"
+// sort of effect, you can make each vertex of the lattice share the
+// vheight value; that is, two adjacent blocks that share a vertex will
+// always get the same vheight value for that vertex. Then you need to
+// store two bits of vheight for every block, which you do by storing it
+// as part of another data structure. Store the south-west vertex's vheight
+// with the block. You can either use the "geometry" mesh variable (it's
+// a parameter to STBVOX_MAKE_GEOMETRY) or you can store it in the
+// "lighting" mesh variable if you defined STBVOX_CONFIG_VHEIGHT_IN_LIGHTING,
+// using STBVOX_MAKE_LIGHTING_EXT(lighting,vheight).
+//
+// Note that if you start with a 2D height map and generate vheight data from
+// it, you don't necessarily store only one value per (x,y) coordinate,
+// as the same value may need to be set up at multiple z heights. For
+// example, if height(8,8) = 13.5, then you want the block at (8,8,13)
+// to store STBVOX_VERTEX_HEIGHT_half, and this will be used by blocks
+// at (7,7,13), (8,7,13), (7,8,13), and (8,8,13). However, if you're
+// allowing steep slopes, it might be the case that you have a block
+// at (7,7,12) which is supposed to stick up to 13.5; that means
+// you also need to store STBVOX_VERTEX_HEIGHT_one_and_a_half at (8,8,12).
+
+enum
+{
+   STBVOX_TEXLERP_FACE_0,
+   STBVOX_TEXLERP_FACE_half,
+   STBVOX_TEXLERP_FACE_1,
+   STBVOX_TEXLERP_FACE_use_vert,
+};
+
+enum
+{
+   STBVOX_TEXLERP_BASE_0,    // 0.0
+   STBVOX_TEXLERP_BASE_2_7,  // 2/7
+   STBVOX_TEXLERP_BASE_5_7,  // 5/7
+   STBVOX_TEXLERP_BASE_1     // 1.0
+};
+
+enum
+{
+   STBVOX_TEXLERP3_0_8,
+   STBVOX_TEXLERP3_1_8,
+   STBVOX_TEXLERP3_2_8,
+   STBVOX_TEXLERP3_3_8,
+   STBVOX_TEXLERP3_4_8,
+   STBVOX_TEXLERP3_5_8,
+   STBVOX_TEXLERP3_6_8,
+   STBVOX_TEXLERP3_7_8,
+};
+
+#define STBVOX_FACE_NONE  7
+
+#define STBVOX_BLOCKTYPE_EMPTY    0
+
+#if defined(STBVOX_CONFIG_BLOCKTYPE_SHORT) || defined(STBVOX_BLOCKTYPE_SHORT)  // CONFIG_ name is what widens stbvox_block_type; old name kept for compatibility
+#define STBVOX_BLOCKTYPE_HOLE  65535  // 0xFFFF, for unsigned short block types
+#else
+#define STBVOX_BLOCKTYPE_HOLE    255  // 0xFF, for unsigned char block types
+#endif
+
+#define STBVOX_MAKE_GEOMETRY(geom, rotate, vheight) ((geom) + (rotate)*16 + (vheight)*64)
+#define STBVOX_MAKE_VHEIGHT(v_sw, v_se, v_nw, v_ne) ((v_sw) + (v_se)*4 + (v_nw)*16 + (v_ne)*64)
+#define STBVOX_MAKE_MATROT(block, overlay, color)  ((block) + (overlay)*4 + (color)*64)
+#define STBVOX_MAKE_TEX2_REPLACE(tex2, tex2_replace_face) ((tex2) + ((tex2_replace_face) & 3)*64)
+#define STBVOX_MAKE_TEXLERP(ns2, ew2, ud2, vert)  ((ew2) + (ns2)*4 + (ud2)*16 + (vert)*64)
+#define STBVOX_MAKE_TEXLERP_SIMPLE(baselerp,vert,face)   ((vert)*32 + (face)*4 + (baselerp))
+#define STBVOX_MAKE_TEXLERP1(vert,e2,n2,w2,s2,u4,d2) STBVOX_MAKE_TEXLERP(s2, w2, d2, vert)
+#define STBVOX_MAKE_TEXLERP2(vert,e2,n2,w2,s2,u4,d2) ((u4)*16 + (n2)*4 + (s2))  // packs the up, north and south arguments; vert/e2/w2/d2 are not used here
+#define STBVOX_MAKE_FACE_MASK(e,n,w,s,u,d)  ((e)+(n)*2+(w)*4+(s)*8+(u)*16+(d)*32)
+#define STBVOX_MAKE_SIDE_TEXROT(e,n,w,s) ((e)+(n)*4+(w)*16+(s)*64)
+#define STBVOX_MAKE_COLOR(color,t1,t2) ((color)+(t1)*64+(t2)*128)
+#define STBVOX_MAKE_TEXLERP_VERT3(e,n,w,s,u)   ((e)+(n)*8+(w)*64+(s)*512+(u)*4096)
+#define STBVOX_MAKE_TEXLERP_FACE3(e,n,w,s,u,d) ((e)+(n)*8+(w)*64+(s)*512+(u)*4096+(d)*16384)
+#define STBVOX_MAKE_PACKED_COMPACT(rot, vheight, texlerp, use_texlerp) ((rot)+4*(vheight)+16*(use_texlerp)+32*(texlerp))  // rot:2, vheight:2, use_texlerp:1, texlerp:3
+
+#define STBVOX_MAKE_LIGHTING_EXT(lighting, rot)  (((lighting)&~3)+(rot))
+#define STBVOX_MAKE_LIGHTING(lighting)       (lighting)
+
+#ifndef STBVOX_MAX_MESHES
+#define STBVOX_MAX_MESHES      2           // opaque & transparent
+#endif
+
+#define STBVOX_MAX_MESH_SLOTS  3           // one vertex & two faces, or two vertex and one face
+
+
+// don't mess with this directly, it's just here so you can
+// declare stbvox_mesh_maker on the stack or as a global
+struct stbvox_mesh_maker
+{
+   stbvox_input_description input;
+   int cur_x, cur_y, cur_z;       // last unprocessed voxel if it splits into multiple buffers
+   int x0,y0,z0,x1,y1,z1;
+   int x_stride_in_bytes;
+   int y_stride_in_bytes;
+   int config_dirty;
+   int default_mesh;
+   unsigned int tags;
+
+   int cube_vertex_offset[6][4]; // this allows access per-vertex data stored block-centered (like texlerp, ambient)
+   int vertex_gather_offset[6][4];
+
+   int pos_x,pos_y,pos_z;
+   int full;
+
+   // computed from user input
+   char *output_cur   [STBVOX_MAX_MESHES][STBVOX_MAX_MESH_SLOTS];
+   char *output_end   [STBVOX_MAX_MESHES][STBVOX_MAX_MESH_SLOTS];
+   char *output_buffer[STBVOX_MAX_MESHES][STBVOX_MAX_MESH_SLOTS];
+   int   output_len   [STBVOX_MAX_MESHES][STBVOX_MAX_MESH_SLOTS];
+
+   // computed from config
+   int   output_size  [STBVOX_MAX_MESHES][STBVOX_MAX_MESH_SLOTS]; // per quad
+   int   output_step  [STBVOX_MAX_MESHES][STBVOX_MAX_MESH_SLOTS]; // per vertex or per face, depending
+   int   num_mesh_slots;
+
+   float default_tex_scale[128][2];
+};
+
+#endif //  INCLUDE_STB_VOXEL_RENDER_H
+
+
+#ifdef STB_VOXEL_RENDER_IMPLEMENTATION
+
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h> // memset
+
+// have to use our own names to avoid the _MSC_VER path having conflicting type names
+#ifndef _MSC_VER
+   #include <stdint.h>
+   typedef uint16_t stbvox_uint16;
+   typedef uint32_t stbvox_uint32;
+#else
+   typedef unsigned short stbvox_uint16;
+   typedef unsigned int   stbvox_uint32;
+#endif
+
+#ifdef _MSC_VER
+   #define STBVOX_NOTUSED(v)  (void)(v)
+#else
+   #define STBVOX_NOTUSED(v)  (void)sizeof(v)
+#endif
+
+
+
+#ifndef STBVOX_CONFIG_MODE  // the mode selects the vertex/face formats configured below, so it has no default
+#error "Must define STBVOX_CONFIG_MODE to select the mode"
+#endif
+
+#if defined(STBVOX_CONFIG_ROTATION_IN_LIGHTING) && defined(STBVOX_CONFIG_VHEIGHT_IN_LIGHTING)
+#error "Can't store both rotation and vheight in lighting"
+#endif
+
+
+// The following are candidate voxel modes. Only modes 0, 1, 20, and 21 are
+// currently implemented. Reducing the storage-per-quad further
+// shouldn't improve performance, although obviously it would allow you
+// to create larger worlds without streaming.
+//
+//
+//                      -----------  Two textures -----------       -- One texture --     ---- Color only ----
+//            Mode:     0     1     2     3     4     5     6        10    11    12      20    21    22    23    24
+// ============================================================================================================
+//  uses Tex Buffer     n     Y     Y     Y     Y     Y     Y         Y     Y     Y       n     Y     Y     Y     Y
+//   bytes per quad    32    20    14    12    10     6     6         8     8     4      32    20    10     6     4
+//       non-blocks   all   all   some  some  some slabs stairs     some  some  none    all   all  slabs slabs  none
+//             tex1   256   256   256   256   256   256   256       256   256   256       n     n     n     n     n
+//             tex2   256   256   256   256   256   256   128         n     n     n       n     n     n     n     n
+//           colors    64    64    64    64    64    64    64         8     n     n     2^24  2^24  2^24  2^24  256
+//        vertex ao     Y     Y     Y     Y     Y     n     n         Y     Y     n       Y     Y     Y     n     n
+//   vertex texlerp     Y     Y     Y     n     n     n     n         -     -     -       -     -     -     -     -
+//      x&y extents   127   127   128    64    64   128    64        64   128   128     127   127   128   128   128
+//        z extents   255   255   128    64?   64?   64    64        32    64   128     255   255   128    64   128
+
+// not sure why I only wrote down the above "result data" and didn't preserve
+// the vertex formats, but here I've tried to reconstruct the designs...
+//     mode # 3 is wrong, one byte too large, but that may have been an error originally
+
+//            Mode:     0     1     2     3     4     5     6        10    11    12      20    21    22    23    24
+// =============================================================================================================
+//   bytes per quad    32    20    14    12    10     6     6         8     8     4            20    10     6     4
+//
+//    vertex x bits     7     7     0     6     0     0     0         0     0     0             7     0     0     0
+//    vertex y bits     7     7     0     0     0     0     0         0     0     0             7     0     0     0
+//    vertex z bits     9     9     7     4     2     0     0         2     2     0             9     2     0     0
+//   vertex ao bits     6     6     6     6     6     0     0         6     6     0             6     6     0     0
+//  vertex txl bits     3     3     3     0     0     0     0         0     0     0            (3)    0     0     0
+//
+//   face tex1 bits    (8)    8     8     8     8     8     8         8     8     8
+//   face tex2 bits    (8)    8     8     8     8     8     7         -     -     -
+//  face color bits    (8)    8     8     8     8     8     8         3     0     0            24    24    24     8
+// face normal bits    (8)    8     8     8     6     4     7         4     4     3             8     3     4     3
+//      face x bits                 7     0     6     7     6         6     7     7             0     7     7     7
+//      face y bits                 7     6     6     7     6         6     7     7             0     7     7     7
+//      face z bits                 2     2     6     6     6         5     6     7             0     7     6     7
+
+
+#if STBVOX_CONFIG_MODE==0 || STBVOX_CONFIG_MODE==1
+
+   #define STBVOX_ICONFIG_VERTEX_32
+   #define STBVOX_ICONFIG_FACE1_1
+
+#elif STBVOX_CONFIG_MODE==20 || STBVOX_CONFIG_MODE==21
+
+   #define STBVOX_ICONFIG_VERTEX_32
+   #define STBVOX_ICONFIG_FACE1_1
+   #define STBVOX_ICONFIG_UNTEXTURED
+
+#else
+#error "Selected value of STBVOX_CONFIG_MODE is not supported"
+#endif
+
+#if STBVOX_CONFIG_MODE==0 || STBVOX_CONFIG_MODE==20
+#define STBVOX_ICONFIG_FACE_ATTRIBUTE
+#endif
+
+#ifndef STBVOX_CONFIG_HLSL
+// the fallback if all others are exhausted is GLSL
+#define STBVOX_ICONFIG_GLSL
+#endif
+
+#ifdef STBVOX_CONFIG_OPENGL_MODELVIEW
+#define STBVOX_ICONFIG_OPENGL_3_1_COMPATIBILITY
+#endif
+
+#if defined(STBVOX_ICONFIG_VERTEX_32)      // packs x:7 y:7 z:9 ao:6 texlerp:3 into 32 bits
+   typedef stbvox_uint32 stbvox_mesh_vertex;
+   #define stbvox_vertex_encode(x,y,z,ao,texlerp) \
+      ((stbvox_uint32) ((x)+((y)<<7)+((z)<<14)+((ao)<<23)+((texlerp)<<29)))
+#elif defined(STBVOX_ICONFIG_VERTEX_16_1)  // mode=2
+   typedef stbvox_uint16 stbvox_mesh_vertex;
+   #define stbvox_vertex_encode(x,y,z,ao,texlerp) \
+      ((stbvox_uint16) ((z)+((ao)<<7)+((texlerp)<<13)))
+#elif defined(STBVOX_ICONFIG_VERTEX_16_2)  // mode=3
+   typedef stbvox_uint16 stbvox_mesh_vertex;
+   #define stbvox_vertex_encode(x,y,z,ao,texlerp) \
+      ((stbvox_uint16) ((x)+((z)<<6)+((ao)<<10)))
+#elif defined(STBVOX_ICONFIG_VERTEX_8)     // NOTE(review): ao<<6 keeps only 2 bits of ao in 8 bits -- confirm layout before enabling
+   typedef unsigned char stbvox_mesh_vertex;
+   #define stbvox_vertex_encode(x,y,z,ao,texlerp) \
+      ((unsigned char) ((z)+((ao)<<6)))
+#else
+   #error "internal error, no vertex type"
+#endif
+
+#ifdef STBVOX_ICONFIG_FACE1_1
+   typedef struct
+   {
+      unsigned char tex1,tex2,color,face_info;
+   } stbvox_mesh_face;
+#else
+   #error "internal error, no face type"
+#endif
+
+
+// 20-byte quad format:
+//
+// per vertex:
+//
+//     x:7
+//     y:7
+//     z:9
+//     ao:6
+//     tex_lerp:3
+//
+// per face:
+//
+//     tex1:8
+//     tex2:8
+//     face:8
+//     color:8
+
+
+// Faces:
+//
+// Faces use the bottom 3 bits to choose the texgen
+// mode, and all the bits to choose the normal.
+// Thus the bottom 3 bits have to be:
+//      e, n, w, s, u, d, u, d
+//
+// These use compact names so tables are readable
+
+enum   // face/normal codes; values run 0..31 in four groups of eight
+{
+   STBVF_e,    // 0
+   STBVF_n,
+   STBVF_w,
+   STBVF_s,
+   STBVF_u,
+   STBVF_d,
+   STBVF_eu,
+   STBVF_ed,   // 7
+
+   STBVF_eu_wall,   // 8
+   STBVF_nu_wall,
+   STBVF_wu_wall,
+   STBVF_su_wall,
+   STBVF_ne_u,
+   STBVF_ne_d,
+   STBVF_nu,
+   STBVF_nd,        // 15
+
+   STBVF_ed_wall,   // 16
+   STBVF_nd_wall,
+   STBVF_wd_wall,
+   STBVF_sd_wall,
+   STBVF_nw_u,
+   STBVF_nw_d,
+   STBVF_wu,
+   STBVF_wd,        // 23
+
+   STBVF_ne_u_cross,   // 24
+   STBVF_nw_u_cross,
+   STBVF_sw_u_cross,
+   STBVF_se_u_cross,
+   STBVF_sw_u,
+   STBVF_sw_d,
+   STBVF_su,
+   STBVF_sd,           // 31
+
+   // @TODO we need more than 5 bits to encode the normal to fit the following
+   // so for now we use the right projection but the wrong normal
+   STBVF_se_u = STBVF_su,   // alias of 30
+   STBVF_se_d = STBVF_sd,   // alias of 31
+
+   STBVF_count,   // 32: one past the last alias value
+};
+
+/////////////////////////////////////////////////////////////////////////////
+//
+//    tables -- i'd prefer if these were at the end of the file, but: C++
+//
+
+static float stbvox_default_texgen[2][32][3] =   // default "texgen" uniform: 64 vec3s; the fragment shader reads [texprojid] as S and [texprojid+32] as T
+{
+   { {  0, 1,0 }, { 0, 0, 1 }, {  0,-1,0 }, { 0, 0,-1 },   // [0]: S vectors; texprojid = face_info & 31, so each row of 4 covers the 2 low (rotation) bits
+     { -1, 0,0 }, { 0, 0, 1 }, {  1, 0,0 }, { 0, 0,-1 },
+     {  0,-1,0 }, { 0, 0, 1 }, {  0, 1,0 }, { 0, 0,-1 },
+     {  1, 0,0 }, { 0, 0, 1 }, { -1, 0,0 }, { 0, 0,-1 },
+
+     {  1, 0,0 }, { 0, 1, 0 }, { -1, 0,0 }, { 0,-1, 0 },
+     { -1, 0,0 }, { 0,-1, 0 }, {  1, 0,0 }, { 0, 1, 0 },
+     {  1, 0,0 }, { 0, 1, 0 }, { -1, 0,0 }, { 0,-1, 0 },
+     { -1, 0,0 }, { 0,-1, 0 }, {  1, 0,0 }, { 0, 1, 0 },
+   },
+   { { 0, 0,-1 }, {  0, 1,0 }, { 0, 0, 1 }, {  0,-1,0 },   // [1]: T vectors, same indexing as [0]
+     { 0, 0,-1 }, { -1, 0,0 }, { 0, 0, 1 }, {  1, 0,0 },
+     { 0, 0,-1 }, {  0,-1,0 }, { 0, 0, 1 }, {  0, 1,0 },
+     { 0, 0,-1 }, {  1, 0,0 }, { 0, 0, 1 }, { -1, 0,0 },
+
+     { 0,-1, 0 }, {  1, 0,0 }, { 0, 1, 0 }, { -1, 0,0 },
+     { 0, 1, 0 }, { -1, 0,0 }, { 0,-1, 0 }, {  1, 0,0 },
+     { 0,-1, 0 }, {  1, 0,0 }, { 0, 1, 0 }, { -1, 0,0 },
+     { 0, 1, 0 }, { -1, 0,0 }, { 0,-1, 0 }, {  1, 0,0 },
+   },
+};
+
+#define STBVOX_RSQRT2   0.7071067811865f   // 1/sqrt(2): component of a unit normal tilted across two axes
+#define STBVOX_RSQRT3   0.5773502691896f   // 1/sqrt(3): component of a unit normal tilted across three axes
+
+static float stbvox_default_normals[32][3] =   // default "normal_table" uniform; the vertex shader indexes it with (facedata.w>>2) & 31
+{
+   { 1,0,0 },  // east
+   { 0,1,0 },  // north
+   { -1,0,0 }, // west
+   { 0,-1,0 }, // south
+   { 0,0,1 },  // up
+   { 0,0,-1 }, // down
+   {  STBVOX_RSQRT2,0, STBVOX_RSQRT2 }, // east & up
+   {  STBVOX_RSQRT2,0, -STBVOX_RSQRT2 }, // east & down
+
+   {  STBVOX_RSQRT2,0, STBVOX_RSQRT2 }, // east & up
+   { 0, STBVOX_RSQRT2, STBVOX_RSQRT2 }, // north & up
+   { -STBVOX_RSQRT2,0, STBVOX_RSQRT2 }, // west & up
+   { 0,-STBVOX_RSQRT2, STBVOX_RSQRT2 }, // south & up
+   {  STBVOX_RSQRT3, STBVOX_RSQRT3, STBVOX_RSQRT3 }, // ne & up
+   {  STBVOX_RSQRT3, STBVOX_RSQRT3,-STBVOX_RSQRT3 }, // ne & down
+   { 0, STBVOX_RSQRT2, STBVOX_RSQRT2 }, // north & up
+   { 0, STBVOX_RSQRT2, -STBVOX_RSQRT2 }, // north & down
+
+   {  STBVOX_RSQRT2,0, -STBVOX_RSQRT2 }, // east & down
+   { 0, STBVOX_RSQRT2, -STBVOX_RSQRT2 }, // north & down
+   { -STBVOX_RSQRT2,0, -STBVOX_RSQRT2 }, // west & down
+   { 0,-STBVOX_RSQRT2, -STBVOX_RSQRT2 }, // south & down
+   { -STBVOX_RSQRT3, STBVOX_RSQRT3, STBVOX_RSQRT3 }, // NW & up
+   { -STBVOX_RSQRT3, STBVOX_RSQRT3,-STBVOX_RSQRT3 }, // NW & down
+   { -STBVOX_RSQRT2,0, STBVOX_RSQRT2 }, // west & up
+   { -STBVOX_RSQRT2,0, -STBVOX_RSQRT2 }, // west & down
+
+   {  STBVOX_RSQRT3, STBVOX_RSQRT3,STBVOX_RSQRT3 }, // NE & up crossed
+   { -STBVOX_RSQRT3, STBVOX_RSQRT3,STBVOX_RSQRT3 }, // NW & up crossed
+   { -STBVOX_RSQRT3,-STBVOX_RSQRT3,STBVOX_RSQRT3 }, // SW & up crossed
+   {  STBVOX_RSQRT3,-STBVOX_RSQRT3,STBVOX_RSQRT3 }, // SE & up crossed
+   { -STBVOX_RSQRT3,-STBVOX_RSQRT3, STBVOX_RSQRT3 }, // SW & up
+   { -STBVOX_RSQRT3,-STBVOX_RSQRT3,-STBVOX_RSQRT3 }, // SW & down
+   { 0,-STBVOX_RSQRT2, STBVOX_RSQRT2 }, // south & up
+   { 0,-STBVOX_RSQRT2, -STBVOX_RSQRT2 }, // south & down
+};
+
+static float stbvox_default_texscale[128][4] =   // default "texscale" uniform; shaders use .x as tex1 scale, .y as tex2 scale, .z != 0 as the texblend flag
+{
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+   {1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},{1,1,0,0},
+};
+
+static unsigned char stbvox_default_palette_compact[64][3] =   // 64 byte triples (0..255); stbvox_build_default_palette scales them by 1/255 into stbvox_default_palette
+{
+   { 255,255,255 }, { 238,238,238 }, { 221,221,221 }, { 204,204,204 },
+   { 187,187,187 }, { 170,170,170 }, { 153,153,153 }, { 136,136,136 },
+   { 119,119,119 }, { 102,102,102 }, {  85, 85, 85 }, {  68, 68, 68 },
+   {  51, 51, 51 }, {  34, 34, 34 }, {  17, 17, 17 }, {   0,  0,  0 },
+   { 255,240,240 }, { 255,220,220 }, { 255,160,160 }, { 255, 32, 32 },
+   { 200,120,160 }, { 200, 60,150 }, { 220,100,130 }, { 255,  0,128 },
+   { 240,240,255 }, { 220,220,255 }, { 160,160,255 }, {  32, 32,255 },
+   { 120,160,200 }, {  60,150,200 }, { 100,130,220 }, {   0,128,255 },
+   { 240,255,240 }, { 220,255,220 }, { 160,255,160 }, {  32,255, 32 },
+   { 160,200,120 }, { 150,200, 60 }, { 130,220,100 }, { 128,255,  0 },
+   { 255,255,240 }, { 255,255,220 }, { 220,220,180 }, { 255,255, 32 },
+   { 200,160,120 }, { 200,150, 60 }, { 220,130,100 }, { 255,128,  0 },
+   { 255,240,255 }, { 255,220,255 }, { 220,180,220 }, { 255, 32,255 },
+   { 160,120,200 }, { 150, 60,200 }, { 130,100,220 }, { 128,  0,255 },
+   { 240,255,255 }, { 220,255,255 }, { 180,220,220 }, {  32,255,255 },
+   { 120,200,160 }, {  60,200,150 }, { 100,220,130 }, {   0,255,128 },
+};
+
+static float stbvox_default_ambient[4][4] =   // default "ambient" uniform; shader computes dot(normal, [0].xyz) * [1].xyz + [2].xyz
+{
+   { 0,0,1      ,0 }, // reversed lighting direction
+   { 0.5,0.5,0.5,0 }, // directional color
+   { 0.5,0.5,0.5,0 }, // constant color
+   { 0.5,0.5,0.5,1.0f/1000.0f/1000.0f }, // fog data for simple_fog: compute_fog blends toward .xyz and scales squared distance by .w
+};
+
+static float stbvox_default_palette[64][4];
+
+// Expands the compact byte palette into float RGBA: each channel divided by 255, alpha fixed at 1.
+static void stbvox_build_default_palette(void)
+{
+   int entry, chan;
+   for (entry=0; entry < 64; ++entry) {
+      for (chan=0; chan < 3; ++chan)
+         stbvox_default_palette[entry][chan] = stbvox_default_palette_compact[entry][chan] / 255.0f;
+      stbvox_default_palette[entry][3] = 1.0f;
+   }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Shaders
+//
+
+#if defined(STBVOX_ICONFIG_OPENGL_3_1_COMPATIBILITY)   // first matching branch wins; the macro is prepended to every shader string below
+   #define STBVOX_SHADER_VERSION "#version 150 compatibility\n"
+#elif defined(STBVOX_ICONFIG_OPENGL_3_0)
+   #define STBVOX_SHADER_VERSION "#version 130\n"
+#elif defined(STBVOX_ICONFIG_GLSL)
+   #define STBVOX_SHADER_VERSION "#version 150\n"
+#else
+   #define STBVOX_SHADER_VERSION ""   // no version line is emitted
+#endif
+
+static const char *stbvox_vertex_program =   // vertex shader source; the adjacent string literals below concatenate into one string
+{
+      STBVOX_SHADER_VERSION
+
+   #ifdef STBVOX_ICONFIG_FACE_ATTRIBUTE  // NOT TAG_face_sampled
+      "in uvec4 attr_face;\n"
+   #else
+      "uniform usamplerBuffer facearray;\n"
+   #endif
+
+   #ifdef STBVOX_ICONFIG_FACE_ARRAY_2
+      "uniform usamplerBuffer facearray2;\n"   // declared only; nothing in this shader reads it
+   #endif
+
+      // vertex input data
+      "in uint attr_vertex;\n"
+
+      // per-buffer data
+      "uniform vec3 transform[3];\n"   // [0] scale and [1] translate are used here; [2] is used by the fragment shaders
+
+      // per-frame data
+      "uniform vec4 camera_pos;\n"  // 4th value is used for arbitrary hacking
+
+      // to simplify things, we avoid using more than 256 uniform vectors
+      // in fragment shader to avoid possible 1024 component limit, so
+      // we access this table in the fragment shader.
+      "uniform vec3 normal_table[32];\n"
+
+      #ifndef STBVOX_CONFIG_OPENGL_MODELVIEW
+         "uniform mat4x4 model_view;\n"
+      #endif
+
+      // fragment output data
+      "flat out uvec4  facedata;\n"
+      "     out  vec3  voxelspace_pos;\n"
+      "     out  vec3  vnormal;\n"
+      "     out float  texlerp;\n"
+      "     out float  amb_occ;\n"
+
+      // @TODO handle the HLSL way to do this
+      "void main()\n"
+      "{\n"
+      #ifdef STBVOX_ICONFIG_FACE_ATTRIBUTE
+         "   facedata = attr_face;\n"
+      #else
+         "   int faceID = gl_VertexID >> 2;\n"   // four vertices per face
+         "   facedata   = texelFetch(facearray, faceID);\n"
+      #endif
+
+      // extract data for vertex (this decode matches the 32-bit stbvox_vertex_encode layout)
+      "   vec3 offset;\n"
+      "   offset.x = float( (attr_vertex       ) & 127u );\n"             // a[0..6]
+      "   offset.y = float( (attr_vertex >>  7u) & 127u );\n"             // a[7..13]
+      "   offset.z = float( (attr_vertex >> 14u) & 511u );\n"             // a[14..22]
+      "   amb_occ  = float( (attr_vertex >> 23u) &  63u ) / 63.0;\n"      // a[23..28]
+      "   texlerp  = float( (attr_vertex >> 29u)        ) /  7.0;\n"      // a[29..31]
+
+      "   vnormal = normal_table[(facedata.w>>2u) & 31u];\n"   // normal index sits above the two rotation bits of face_info
+      "   voxelspace_pos = offset * transform[0];\n"  // mesh-to-object scale
+      "   vec3 position  = voxelspace_pos + transform[1];\n"  // mesh-to-object translate
+
+      #ifdef STBVOX_DEBUG_TEST_NORMALS
+         "   if ((facedata.w & 28u) == 16u || (facedata.w & 28u) == 24u)\n"
+         "      position += vnormal.xyz * camera_pos.w;\n"
+      #endif
+
+      #ifndef STBVOX_CONFIG_OPENGL_MODELVIEW
+         "   gl_Position = model_view * vec4(position,1.0);\n"
+      #else
+         "   gl_Position = gl_ModelViewProjectionMatrix * vec4(position,1.0);\n"
+      #endif
+
+      "}\n"
+};
+
+
+static const char *stbvox_fragment_program =   // fragment shader source; the adjacent string literals below concatenate into one string
+{
+      STBVOX_SHADER_VERSION
+
+      // rlerp is lerp but with t on the left, like god intended
+      #if defined(STBVOX_ICONFIG_GLSL)
+         "#define rlerp(t,x,y) mix(x,y,t)\n"
+      #elif defined(STBVOX_CONFIG_HLSL)
+         "#define rlerp(t,x,y) lerp(x,y,t)\n"
+      #else
+         #error "need definition of rlerp()"
+      #endif
+
+
+      // vertex-shader output data
+      "flat in uvec4  facedata;\n"
+      "     in  vec3  voxelspace_pos;\n"
+      "     in  vec3  vnormal;\n"
+      "     in float  texlerp;\n"
+      "     in float  amb_occ;\n"
+
+      // per-buffer data
+      "uniform vec3 transform[3];\n"
+
+      // per-frame data
+      "uniform vec4 camera_pos;\n"  // 4th value is used for arbitrary hacking
+
+      // probably constant data
+      "uniform vec4 ambient[4];\n"
+
+      #ifndef STBVOX_ICONFIG_UNTEXTURED
+         // generally constant data
+         "uniform sampler2DArray tex_array[2];\n"
+
+         #ifdef STBVOX_CONFIG_PREFER_TEXBUFFER
+            "uniform samplerBuffer color_table;\n"
+            "uniform samplerBuffer texscale;\n"
+            "uniform samplerBuffer texgen;\n"
+         #else
+            "uniform vec4 color_table[64];\n"
+            "uniform vec4 texscale[64];\n" // instead of 128, to avoid running out of uniforms
+            "uniform vec3 texgen[64];\n"
+         #endif
+      #endif
+
+      "out vec4  outcolor;\n"
+
+      #if defined(STBVOX_CONFIG_LIGHTING) || defined(STBVOX_CONFIG_LIGHTING_SIMPLE)
+      "vec3 compute_lighting(vec3 pos, vec3 norm, vec3 albedo, vec3 ambient);\n"
+      #endif
+      #if defined(STBVOX_CONFIG_FOG) || defined(STBVOX_CONFIG_FOG_SMOOTHSTEP)
+      "vec3 compute_fog(vec3 color, vec3 relative_pos, float fragment_alpha);\n"
+      #endif
+
+      "void main()\n"
+      "{\n"
+      "   vec3 albedo;\n"
+      "   float fragment_alpha;\n"
+
+      #ifndef STBVOX_ICONFIG_UNTEXTURED
+         // unpack the values
+         "   uint tex1_id = facedata.x;\n"
+         "   uint tex2_id = facedata.y;\n"
+         "   uint texprojid = facedata.w & 31u;\n"
+         "   uint color_id  = facedata.z;\n"
+
+         #ifndef STBVOX_CONFIG_PREFER_TEXBUFFER
+            // load from uniforms / texture buffers
+            "   vec3 texgen_s = texgen[texprojid];\n"
+            "   vec3 texgen_t = texgen[texprojid+32u];\n"
+            "   float tex1_scale = texscale[tex1_id & 63u].x;\n"
+            "   vec4 color = color_table[color_id & 63u];\n"
+            #ifndef STBVOX_CONFIG_DISABLE_TEX2
+            "   vec4 tex2_props = texscale[tex2_id & 63u];\n"
+            #endif
+         #else
+            "   vec3 texgen_s = texelFetch(texgen, int(texprojid)).xyz;\n"
+            "   vec3 texgen_t = texelFetch(texgen, int(texprojid+32u)).xyz;\n"
+            "   float tex1_scale = texelFetch(texscale, int(tex1_id & 127u)).x;\n"
+            "   vec4 color = texelFetch(color_table, int(color_id & 63u));\n"
+            #ifndef STBVOX_CONFIG_DISABLE_TEX2
+            "   vec4 tex2_props = texelFetch(texscale, int(tex2_id & 127u));\n"  // fixed: look up tex2's own entry, as the uniform path above does
+            #endif
+         #endif
+
+         #ifndef STBVOX_CONFIG_DISABLE_TEX2
+         "   float tex2_scale = tex2_props.y;\n"
+         "   bool texblend_mode = tex2_props.z != 0.0;\n"
+         #endif
+         "   vec2 texcoord;\n"
+         "   vec3 texturespace_pos = voxelspace_pos + transform[2].xyz;\n"
+         "   texcoord.s = dot(texturespace_pos, texgen_s);\n"
+         "   texcoord.t = dot(texturespace_pos, texgen_t);\n"
+
+         "   vec2  texcoord_1 = tex1_scale * texcoord;\n"
+         #ifndef STBVOX_CONFIG_DISABLE_TEX2
+         "   vec2  texcoord_2 = tex2_scale * texcoord;\n"
+         #endif
+
+         #ifdef STBVOX_CONFIG_TEX1_EDGE_CLAMP
+         "   texcoord_1 = texcoord_1 - floor(texcoord_1);\n"
+         "   vec4 tex1 = textureGrad(tex_array[0], vec3(texcoord_1, float(tex1_id)), dFdx(tex1_scale*texcoord), dFdy(tex1_scale*texcoord));\n"
+         #else
+         "   vec4 tex1 = texture(tex_array[0], vec3(texcoord_1, float(tex1_id)));\n"
+         #endif
+
+         #ifndef STBVOX_CONFIG_DISABLE_TEX2
+         #ifdef STBVOX_CONFIG_TEX2_EDGE_CLAMP
+         "   texcoord_2 = texcoord_2 - floor(texcoord_2);\n"
+         "   vec4 tex2 = textureGrad(tex_array[1], vec3(texcoord_2, float(tex2_id)), dFdx(tex2_scale*texcoord), dFdy(tex2_scale*texcoord));\n"  // fixed: same array as the non-clamped path
+         #else
+         "   vec4 tex2 = texture(tex_array[1], vec3(texcoord_2, float(tex2_id)));\n"
+         #endif
+         #endif
+
+         "   bool emissive = (color.a > 1.0);\n"
+         "   color.a = min(color.a, 1.0);\n"
+
+         // recolor textures
+         "   if ((color_id &  64u) != 0u) tex1.rgba *= color.rgba;\n"
+         "   fragment_alpha = tex1.a;\n"
+         #ifndef STBVOX_CONFIG_DISABLE_TEX2
+            "   if ((color_id & 128u) != 0u) tex2.rgba *= color.rgba;\n"
+
+            #ifdef STBVOX_CONFIG_PREMULTIPLIED_ALPHA
+            "   tex2.rgba *= texlerp;\n"
+            #else
+            "   tex2.a *= texlerp;\n"
+            #endif
+
+            "   if (texblend_mode)\n"
+            "      albedo = tex1.xyz * rlerp(tex2.a, vec3(1.0,1.0,1.0), 2.0*tex2.xyz);\n"
+            "   else {\n"
+            #ifdef STBVOX_CONFIG_PREMULTIPLIED_ALPHA
+            "      albedo = (1.0-tex2.a)*tex1.xyz + tex2.xyz;\n"
+            #else
+            "      albedo = rlerp(tex2.a, tex1.xyz, tex2.xyz);\n"
+            #endif
+            "      fragment_alpha = tex1.a*(1-tex2.a)+tex2.a;\n"
+            "   }\n"
+         #else
+            "      albedo = tex1.xyz;\n"
+         #endif
+
+      #else // UNTEXTURED
+         "   vec4 color;"
+         "   color.xyz = vec3(facedata.xyz) / 255.0;\n"
+         "   bool emissive = false;\n"
+         "   albedo = color.xyz;\n"
+         "   fragment_alpha = 1.0;\n"
+      #endif
+
+      #ifdef STBVOX_ICONFIG_VARYING_VERTEX_NORMALS
+         // currently, there are no modes that trigger this path; idea is that there
+         // could be a couple of bits per vertex to perturb the normal to e.g. get curved look
+         "   vec3 normal = normalize(vnormal);\n"
+      #else
+         "   vec3 normal = vnormal;\n"
+      #endif
+
+      "   vec3 ambient_color = dot(normal, ambient[0].xyz) * ambient[1].xyz + ambient[2].xyz;\n"
+
+      "   ambient_color = clamp(ambient_color, 0.0, 1.0);"
+      "   ambient_color *= amb_occ;\n"
+
+      "   vec3 lit_color;\n"
+      "   if (!emissive)\n"
+      #if defined(STBVOX_ICONFIG_LIGHTING) || defined(STBVOX_CONFIG_LIGHTING) || defined(STBVOX_CONFIG_LIGHTING_SIMPLE)  // also honor the macro that gates the prototype
+         "      lit_color = compute_lighting(voxelspace_pos + transform[1], normal, albedo, ambient_color);\n"
+      #else
+         "      lit_color = albedo * ambient_color ;\n"
+      #endif
+      "   else\n"
+      "      lit_color = albedo;\n"
+
+      #if defined(STBVOX_ICONFIG_FOG) || defined(STBVOX_CONFIG_FOG) || defined(STBVOX_CONFIG_FOG_SMOOTHSTEP)  // also honor the macro that gates the prototype
+         "   vec3 dist = voxelspace_pos + (transform[1] - camera_pos.xyz);\n"
+         "   lit_color = compute_fog(lit_color, dist, fragment_alpha);\n"
+      #endif
+
+      #ifdef STBVOX_CONFIG_UNPREMULTIPLY
+      "   vec4 final_color = vec4(lit_color/fragment_alpha, fragment_alpha);\n"
+      #else
+      "   vec4 final_color = vec4(lit_color, fragment_alpha);\n"
+      #endif
+      "   outcolor = final_color;\n"
+      "}\n"
+
+   #ifdef STBVOX_CONFIG_LIGHTING_SIMPLE
+      "\n"
+      "uniform vec3 light_source[2];\n"
+      "vec3 compute_lighting(vec3 pos, vec3 norm, vec3 albedo, vec3 ambient)\n"
+      "{\n"
+      "   vec3 light_dir = light_source[0] - pos;\n"
+      "   float lambert = dot(light_dir, norm) / dot(light_dir, light_dir);\n"
+      "   vec3 diffuse = clamp(light_source[1] * clamp(lambert, 0.0, 1.0), 0.0, 1.0);\n"
+      "   return (diffuse + ambient) * albedo;\n"
+      "}\n"
+   #endif
+
+   #ifdef STBVOX_CONFIG_FOG_SMOOTHSTEP
+      "\n"
+      "vec3 compute_fog(vec3 color, vec3 relative_pos, float fragment_alpha)\n"
+      "{\n"
+      "   float f = dot(relative_pos,relative_pos)*ambient[3].w;\n"
+      //"   f = rlerp(f, -2,1);\n"
+      "   f = clamp(f, 0.0, 1.0);\n"
+      "   f = 3.0*f*f - 2.0*f*f*f;\n" // smoothstep
+      //"   f = f*f;\n"  // fade in more smoothly
+      #ifdef STBVOX_CONFIG_PREMULTIPLIED_ALPHA
+      "   return rlerp(f, color.xyz, ambient[3].xyz*fragment_alpha);\n"
+      #else
+      "   return rlerp(f, color.xyz, ambient[3].xyz);\n"
+      #endif
+      "}\n"
+   #endif
+};
+
+
+// still requires full alpha lookups, including tex2 if texblend is enabled
+static const char *stbvox_fragment_program_alpha_only =   // writes only alpha: outcolor = (0,0,0,fragment_alpha)
+{
+   STBVOX_SHADER_VERSION
+
+   // vertex-shader output data
+   "flat in uvec4  facedata;\n"
+   "     in  vec3  voxelspace_pos;\n"
+   "     in float  texlerp;\n"
+
+   // per-buffer data
+   "uniform vec3 transform[3];\n"
+
+   #ifndef STBVOX_ICONFIG_UNTEXTURED
+      // generally constant data
+      "uniform sampler2DArray tex_array[2];\n"
+
+      #ifdef STBVOX_CONFIG_PREFER_TEXBUFFER
+         "uniform samplerBuffer color_table;\n"  // fixed: main() reads color_table, so it has to be declared here too
+         "uniform samplerBuffer texscale;\n"
+         "uniform samplerBuffer texgen;\n"
+      #else
+         "uniform vec4 color_table[64];\n"  // fixed: main() reads color_table, so it has to be declared here too
+         "uniform vec4 texscale[64];\n" // instead of 128, to avoid running out of uniforms
+         "uniform vec3 texgen[64];\n"
+      #endif
+   #endif
+
+   "out vec4  outcolor;\n"
+
+   "void main()\n"
+   "{\n"
+   "   float fragment_alpha;\n"
+
+   #ifndef STBVOX_ICONFIG_UNTEXTURED
+      // unpack the values
+      "   uint tex1_id = facedata.x;\n"
+      "   uint tex2_id = facedata.y;\n"
+      "   uint texprojid = facedata.w & 31u;\n"
+      "   uint color_id  = facedata.z;\n"
+
+      #ifndef STBVOX_CONFIG_PREFER_TEXBUFFER
+         // load from uniforms / texture buffers
+         "   vec3 texgen_s = texgen[texprojid];\n"
+         "   vec3 texgen_t = texgen[texprojid+32u];\n"
+         "   float tex1_scale = texscale[tex1_id & 63u].x;\n"
+         "   vec4 color = color_table[color_id & 63u];\n"
+         "   vec4 tex2_props = texscale[tex2_id & 63u];\n"
+      #else
+         "   vec3 texgen_s = texelFetch(texgen, int(texprojid)).xyz;\n"
+         "   vec3 texgen_t = texelFetch(texgen, int(texprojid+32u)).xyz;\n"
+         "   float tex1_scale = texelFetch(texscale, int(tex1_id & 127u)).x;\n"
+         "   vec4 color = texelFetch(color_table, int(color_id & 63u));\n"
+         "   vec4 tex2_props = texelFetch(texscale, int(tex2_id & 127u));\n"
+      #endif
+
+      #ifndef STBVOX_CONFIG_DISABLE_TEX2
+      "   float tex2_scale = tex2_props.y;\n"
+      "   bool texblend_mode = tex2_props.z != 0.0;\n"  // fixed: was a float & bool expression; now the same test as the full fragment shader
+      #endif
+
+      "   color.a = min(color.a, 1.0);\n"
+
+      "   vec2 texcoord;\n"
+      "   vec3 texturespace_pos = voxelspace_pos + transform[2].xyz;\n"
+      "   texcoord.s = dot(texturespace_pos, texgen_s);\n"
+      "   texcoord.t = dot(texturespace_pos, texgen_t);\n"
+
+      "   vec2  texcoord_1 = tex1_scale * texcoord;\n"
+
+      #ifdef STBVOX_CONFIG_TEX1_EDGE_CLAMP
+      "   texcoord_1 = texcoord_1 - floor(texcoord_1);\n"
+      "   vec4 tex1 = textureGrad(tex_array[0], vec3(texcoord_1, float(tex1_id)), dFdx(tex1_scale*texcoord), dFdy(tex1_scale*texcoord));\n"
+      #else
+      "   vec4 tex1 = texture(tex_array[0], vec3(texcoord_1, float(tex1_id)));\n"
+      #endif
+
+      "   if ((color_id &  64u) != 0u) tex1.a *= color.a;\n"
+      "   fragment_alpha = tex1.a;\n"
+
+      #ifndef STBVOX_CONFIG_DISABLE_TEX2
+      "   if (!texblend_mode) {\n"
+         "      vec2 texcoord_2 = tex2_scale * texcoord;\n"  // fixed: declared inside the guard because tex2_scale only exists when tex2 is enabled
+         #ifdef STBVOX_CONFIG_TEX2_EDGE_CLAMP
+         "      texcoord_2 = texcoord_2 - floor(texcoord_2);\n"
+         "      vec4 tex2 = textureGrad(tex_array[1], vec3(texcoord_2, float(tex2_id)), dFdx(tex2_scale*texcoord), dFdy(tex2_scale*texcoord));\n"  // fixed: same array as the non-clamped path
+         #else
+         "      vec4 tex2 = texture(tex_array[1], vec3(texcoord_2, float(tex2_id)));\n"
+         #endif
+
+         "      tex2.a *= texlerp;\n"
+         "      if ((color_id & 128u) != 0u) tex2.rgba *= color.a;\n"
+         "      fragment_alpha = tex1.a*(1-tex2.a)+tex2.a;\n"
+         "}\n"
+      #endif
+
+   #else // UNTEXTURED
+      "   fragment_alpha = 1.0;\n"
+   #endif
+
+   "   outcolor = vec4(0.0, 0.0, 0.0, fragment_alpha);\n"
+   "}\n"
+};
+
+
+// Accessor for the built-in vertex shader source text.
+// The string is stored as const char *; the cast only matches the declared return type.
+STBVXDEC char *stbvox_get_vertex_shader(void)
+{ return (char *) stbvox_vertex_program; }
+
+// Accessor for the built-in full-color fragment shader source text.
+// The string is stored as const char *; the cast only matches the declared return type.
+STBVXDEC char *stbvox_get_fragment_shader(void)
+{ return (char *) stbvox_fragment_program; }
+
+// Accessor for the built-in alpha-only fragment shader source text.
+// The string is stored as const char *; the cast only matches the declared return type.
+STBVXDEC char *stbvox_get_fragment_shader_alpha_only(void)
+{ return (char *) stbvox_fragment_program_alpha_only; }
+
+static float stbvox_dummy_transform[3][3];   // zero-initialized placeholder used as the default for "transform" and "camera_pos" in stbvox_uniforms
+
+#ifdef STBVOX_CONFIG_PREFER_TEXBUFFER
+#define STBVOX_TEXBUF 1   // last field of the texscale/color_table/texgen rows in stbvox_uniforms
+#else
+#define STBVOX_TEXBUF 0
+#endif
+
+static stbvox_uniform_info stbvox_uniforms[] =   // columns look like: type, bytes per element, array length, shader name, default data, texbuffer flag — NOTE(review): confirm against the struct declaration
+{
+   { STBVOX_UNIFORM_TYPE_sampler  ,  4,   1, (char*) "facearray"    , 0                           },
+   { STBVOX_UNIFORM_TYPE_vec3     , 12,   3, (char*) "transform"    , stbvox_dummy_transform[0]   },
+   { STBVOX_UNIFORM_TYPE_sampler  ,  4,   2, (char*) "tex_array"    , 0                           },
+   { STBVOX_UNIFORM_TYPE_vec4     , 16, 128, (char*) "texscale"     , stbvox_default_texscale[0] , STBVOX_TEXBUF },   // 128 listed here; the non-texbuffer shaders declare texscale[64]
+   { STBVOX_UNIFORM_TYPE_vec4     , 16,  64, (char*) "color_table"  , stbvox_default_palette[0]  , STBVOX_TEXBUF },
+   { STBVOX_UNIFORM_TYPE_vec3     , 12,  32, (char*) "normal_table" , stbvox_default_normals[0]   },
+   { STBVOX_UNIFORM_TYPE_vec3     , 12,  64, (char*) "texgen"       , stbvox_default_texgen[0][0], STBVOX_TEXBUF },   // 64 = the two 32-entry halves of stbvox_default_texgen
+   { STBVOX_UNIFORM_TYPE_vec4     , 16,   4, (char*) "ambient"      , stbvox_default_ambient[0]   },
+   { STBVOX_UNIFORM_TYPE_vec4     , 16,   1, (char*) "camera_pos"   , stbvox_dummy_transform[0]   },
+};
+
+// Copies the table entry for `uniform` into *info and returns 1; returns 0 (leaving *info alone) when the index is out of range.
+STBVXDEC int stbvox_get_uniform_info(stbvox_uniform_info *info, int uniform)
+{
+   int in_range = (uniform >= 0 && uniform < STBVOX_UNIFORM_count);
+   if (in_range)
+      *info = stbvox_uniforms[uniform];
+   return in_range;
+}
+
+#define STBVOX_GET_GEO(geom_data)  ((geom_data) & 15)
+
+typedef struct   // four 2-bit rotation amounts consumed by stbvox_compute_mesh_face_value
+{
+   unsigned char block:2;     // rotates the face index used for per-blocktype face tables
+   unsigned char overlay:2;   // rotates the face index used for overlay tables
+   unsigned char facerot:2;   // default texture rotation placed in the low bits of face_info
+   unsigned char ecolor:2;    // rotates the face index tested against the color facemasks
+} stbvox_rotate;
+
+typedef struct   // three byte-sized coordinates
+{
+   unsigned char x,y,z;
+} stbvox_pos;
+
+static unsigned char stbvox_rotate_face[6][4] =   // [face][rotation] -> rotated face index
+{
+   { 0,1,2,3 },   // faces 0..3 cycle among themselves: (face + rotation) & 3
+   { 1,2,3,0 },
+   { 2,3,0,1 },
+   { 3,0,1,2 },
+   { 4,4,4,4 },   // faces 4 and 5 map to themselves for every rotation
+   { 5,5,5,5 },
+};
+
+#define STBVOX_ROTATE(x,r)   stbvox_rotate_face[x][r] // (((x)+(r))&3)
+
+// Builds the 4-byte face record (tex1, tex2, color, face_info) for one face of the voxel at input
+// offset v_off. rot supplies the per-voxel rotations, face is the unrotated face index, and normal
+// is stored in face_info above the 2-bit texture rotation. Later sources override earlier ones.
+stbvox_mesh_face stbvox_compute_mesh_face_value(stbvox_mesh_maker *mm, stbvox_rotate rot, int face, int v_off, int normal)
+{
+   stbvox_mesh_face face_data = { 0 };
+   stbvox_block_type bt = mm->input.blocktype[v_off];
+   unsigned char bt_face = STBVOX_ROTATE(face, rot.block);   // face as seen by the rotated block
+   int facerot = rot.facerot;                                // default rotation; may be replaced below
+   #ifdef STBVOX_ICONFIG_UNTEXTURED
+   if (mm->input.rgb) {
+      face_data.tex1  = mm->input.rgb[v_off].r;   // untextured mode reuses the three bytes as r,g,b
+      face_data.tex2  = mm->input.rgb[v_off].g;
+      face_data.color = mm->input.rgb[v_off].b;
+      face_data.face_info = (normal<<2);
+      return face_data;
+   }
+   #else
+   unsigned char color_face;
+   if (mm->input.color)
+      face_data.color = mm->input.color[v_off];
+
+   if (mm->input.block_tex1)
+      face_data.tex1 = mm->input.block_tex1[bt];
+   else if (mm->input.block_tex1_face)
+      face_data.tex1 = mm->input.block_tex1_face[bt][bt_face];
+   else
+      face_data.tex1 = bt;   // with no table, the block type itself is the texture id
+
+   if (mm->input.block_tex2)
+      face_data.tex2 = mm->input.block_tex2[bt];
+   else if (mm->input.block_tex2_face)
+      face_data.tex2 = mm->input.block_tex2_face[bt][bt_face];
+
+   if (mm->input.block_color) {
+      unsigned char mcol = mm->input.block_color[bt];
+      if (mcol)   // zero means "no override"
+         face_data.color = mcol;
+   } else if (mm->input.block_color_face) {
+      unsigned char mcol = mm->input.block_color_face[bt][bt_face];
+      if (mcol)
+         face_data.color = mcol;
+   }
+
+   if (face <= STBVOX_FACE_south) {   // texture rotation tables only apply to the four side faces
+      if (mm->input.side_texrot)
+         facerot = mm->input.side_texrot[v_off] >> (2 * face);
+      else if (mm->input.block_side_texrot)
+         facerot = mm->input.block_side_texrot[bt] >> (2 * bt_face);   // fixed: indexed by block type like the other block_* tables (was v_off)
+   }
+
+   if (mm->input.overlay) {
+      int over_face = STBVOX_ROTATE(face, rot.overlay);
+      unsigned char over = mm->input.overlay[v_off];
+      if (over) {   // overlay 0 means "no overlay"
+         if (mm->input.overlay_tex1) {
+            unsigned char rep1 = mm->input.overlay_tex1[over][over_face];
+            if (rep1)
+               face_data.tex1 = rep1;
+         }
+         if (mm->input.overlay_tex2) {
+            unsigned char rep2 = mm->input.overlay_tex2[over][over_face];
+            if (rep2)
+               face_data.tex2 = rep2;
+         }
+         if (mm->input.overlay_color) {
+            unsigned char rep3 = mm->input.overlay_color[over][over_face];
+            if (rep3)
+               face_data.color = rep3;
+         }
+
+         if (mm->input.overlay_side_texrot && face <= STBVOX_FACE_south)
+            facerot = mm->input.overlay_side_texrot[over] >> (2*over_face);
+      }
+   }
+
+   if (mm->input.tex2_for_tex1)
+      face_data.tex2 = mm->input.tex2_for_tex1[face_data.tex1];
+   if (mm->input.tex2)
+      face_data.tex2 = mm->input.tex2[v_off];
+   if (mm->input.tex2_replace) {
+      if (mm->input.tex2_facemask[v_off] & (1 << face))   // one mask bit per face
+         face_data.tex2 = mm->input.tex2_replace[v_off];
+   }
+
+   color_face = STBVOX_ROTATE(face, rot.ecolor);
+   if (mm->input.extended_color) {
+      unsigned char ec = mm->input.extended_color[v_off];
+      if (mm->input.ecolor_facemask[ec] & (1 << color_face))
+         face_data.color = mm->input.ecolor_color[ec];
+   }
+
+   if (mm->input.color2) {
+      if (mm->input.color2_facemask[v_off] & (1 << color_face))
+         face_data.color = mm->input.color2[v_off];
+      if (mm->input.color3 && (mm->input.color3_facemask[v_off] & (1 << color_face)))
+         face_data.color = mm->input.color3[v_off];
+   }
+   #endif
+   // fixed: the shifted table bytes still carry other faces' rotations, so keep only the low 2 bits
+   face_data.face_info = (normal<<2) + (facerot & 3);
+   return face_data;
+}
+
+// these are the types of faces each block can have
+enum
+{
+   STBVOX_FT_none    ,   // 0
+   STBVOX_FT_upper   ,
+   STBVOX_FT_lower   ,
+   STBVOX_FT_solid   ,
+   STBVOX_FT_diag_012,
+   STBVOX_FT_diag_023,
+   STBVOX_FT_diag_013,
+   STBVOX_FT_diag_123,
+   STBVOX_FT_force   , // can't be covered up, used for internal faces, also hides nothing
+   STBVOX_FT_partial , // only covered by solid, never covers anything else
+
+   STBVOX_FT_count       // 10: number of face types above
+};
+
+static unsigned char stbvox_face_lerp[6] = { 0,2,0,2,4,4 };                 // six entries — presumably indexed by face; TODO confirm at the use sites
+static unsigned char stbvox_vert3_lerp[5] = { 0,3,6,9,12 };                 // steps of 3
+static unsigned char stbvox_vert_lerp_for_face_lerp[4] = { 0, 4, 7, 7 };    // last two entries are both 7
+static unsigned char stbvox_face3_lerp[6] = { 0,3,6,9,12,14 };              // steps of 3, except the final entry (14)
+static unsigned char stbvox_vert_lerp_for_simple[4] = { 0,2,5,7 };
+static unsigned char stbvox_face3_updown[8] = { 0,2,5,7,0,2,5,7 }; // ignore top bit: entries 4..7 repeat entries 0..3
+
+// vertex offsets for face vertices: [face][corner] -> { x, y, z }, each 0 or 1
+static unsigned char stbvox_vertex_vector[6][4][3] =
+{
+   { { 1,0,1 }, { 1,1,1 }, { 1,1,0 }, { 1,0,0 } }, // east
+   { { 1,1,1 }, { 0,1,1 }, { 0,1,0 }, { 1,1,0 } }, // north
+   { { 0,1,1 }, { 0,0,1 }, { 0,0,0 }, { 0,1,0 } }, // west
+   { { 0,0,1 }, { 1,0,1 }, { 1,0,0 }, { 0,0,0 } }, // south
+   { { 0,1,1 }, { 1,1,1 }, { 1,0,1 }, { 0,0,1 } }, // up
+   { { 0,0,0 }, { 1,0,0 }, { 1,1,0 }, { 0,1,0 } }, // down
+};
+
+// stbvox_vertex_vector, but read coordinates as binary numbers, zyx (value = z*4 + y*2 + x)
+static unsigned char stbvox_vertex_selector[6][4] =
+{
+   { 5,7,3,1 },   // east
+   { 7,6,2,3 },   // north
+   { 6,4,0,2 },   // west
+   { 4,5,1,0 },   // south
+   { 6,7,5,4 },   // up
+   { 0,1,3,2 },   // down
+};
+
+static stbvox_mesh_vertex stbvox_vmesh_delta_normal[6][4] =   // [face][corner]: the stbvox_vertex_vector offsets in packed form, ao and texlerp 0
+{
+   {  stbvox_vertex_encode(1,0,1,0,0) ,
+      stbvox_vertex_encode(1,1,1,0,0) ,
+      stbvox_vertex_encode(1,1,0,0,0) ,
+      stbvox_vertex_encode(1,0,0,0,0)  },
+   {  stbvox_vertex_encode(1,1,1,0,0) ,
+      stbvox_vertex_encode(0,1,1,0,0) ,
+      stbvox_vertex_encode(0,1,0,0,0) ,
+      stbvox_vertex_encode(1,1,0,0,0)  },
+   {  stbvox_vertex_encode(0,1,1,0,0) ,
+      stbvox_vertex_encode(0,0,1,0,0) ,
+      stbvox_vertex_encode(0,0,0,0,0) ,
+      stbvox_vertex_encode(0,1,0,0,0)  },
+   {  stbvox_vertex_encode(0,0,1,0,0) ,
+      stbvox_vertex_encode(1,0,1,0,0) ,
+      stbvox_vertex_encode(1,0,0,0,0) ,
+      stbvox_vertex_encode(0,0,0,0,0)  },
+   {  stbvox_vertex_encode(0,1,1,0,0) ,
+      stbvox_vertex_encode(1,1,1,0,0) ,
+      stbvox_vertex_encode(1,0,1,0,0) ,
+      stbvox_vertex_encode(0,0,1,0,0)  },
+   {  stbvox_vertex_encode(0,0,0,0,0) ,
+      stbvox_vertex_encode(1,0,0,0,0) ,
+      stbvox_vertex_encode(1,1,0,0,0) ,
+      stbvox_vertex_encode(0,1,0,0,0)  }
+};
+
+static stbvox_mesh_vertex stbvox_vmesh_pre_vheight[6][4] =   // same x,y as stbvox_vmesh_delta_normal but z is 0 for every corner
+{
+   {  stbvox_vertex_encode(1,0,0,0,0) ,
+      stbvox_vertex_encode(1,1,0,0,0) ,
+      stbvox_vertex_encode(1,1,0,0,0) ,
+      stbvox_vertex_encode(1,0,0,0,0)  },
+   {  stbvox_vertex_encode(1,1,0,0,0) ,
+      stbvox_vertex_encode(0,1,0,0,0) ,
+      stbvox_vertex_encode(0,1,0,0,0) ,
+      stbvox_vertex_encode(1,1,0,0,0)  },
+   {  stbvox_vertex_encode(0,1,0,0,0) ,
+      stbvox_vertex_encode(0,0,0,0,0) ,
+      stbvox_vertex_encode(0,0,0,0,0) ,
+      stbvox_vertex_encode(0,1,0,0,0)  },
+   {  stbvox_vertex_encode(0,0,0,0,0) ,
+      stbvox_vertex_encode(1,0,0,0,0) ,
+      stbvox_vertex_encode(1,0,0,0,0) ,
+      stbvox_vertex_encode(0,0,0,0,0)  },
+   {  stbvox_vertex_encode(0,1,0,0,0) ,
+      stbvox_vertex_encode(1,1,0,0,0) ,
+      stbvox_vertex_encode(1,0,0,0,0) ,
+      stbvox_vertex_encode(0,0,0,0,0)  },
+   {  stbvox_vertex_encode(0,0,0,0,0) ,
+      stbvox_vertex_encode(1,0,0,0,0) ,
+      stbvox_vertex_encode(1,1,0,0,0) ,
+      stbvox_vertex_encode(0,1,0,0,0)  }
+};
+
+static stbvox_mesh_vertex stbvox_vmesh_delta_half_z[6][4] =   // same corners as stbvox_vmesh_delta_normal, with z given as 0 or 2 instead of 0 or 1
+{
+   { stbvox_vertex_encode(1,0,2,0,0) ,
+     stbvox_vertex_encode(1,1,2,0,0) ,
+     stbvox_vertex_encode(1,1,0,0,0) ,
+     stbvox_vertex_encode(1,0,0,0,0)  },
+   { stbvox_vertex_encode(1,1,2,0,0) ,
+     stbvox_vertex_encode(0,1,2,0,0) ,
+     stbvox_vertex_encode(0,1,0,0,0) ,
+     stbvox_vertex_encode(1,1,0,0,0)  },
+   { stbvox_vertex_encode(0,1,2,0,0) ,
+     stbvox_vertex_encode(0,0,2,0,0) ,
+     stbvox_vertex_encode(0,0,0,0,0) ,
+     stbvox_vertex_encode(0,1,0,0,0)  },
+   { stbvox_vertex_encode(0,0,2,0,0) ,
+     stbvox_vertex_encode(1,0,2,0,0) ,
+     stbvox_vertex_encode(1,0,0,0,0) ,
+     stbvox_vertex_encode(0,0,0,0,0)  },
+   { stbvox_vertex_encode(0,1,2,0,0) ,
+     stbvox_vertex_encode(1,1,2,0,0) ,
+     stbvox_vertex_encode(1,0,2,0,0) ,
+     stbvox_vertex_encode(0,0,2,0,0)  },
+   { stbvox_vertex_encode(0,0,0,0,0) ,
+     stbvox_vertex_encode(1,0,0,0,0) ,
+     stbvox_vertex_encode(1,1,0,0,0) ,
+     stbvox_vertex_encode(0,1,0,0,0)  }
+};
+
+// Corner offsets for the "crossed pair" geometry. The first four rows are
+// vertical quads spanning opposite corners of the block: (1,0)-(0,1) and
+// (1,1)-(0,0) in x,y, each listed in both winding directions, with z running
+// from 2 at the top to 0 at the bottom. The last two rows are ordinary
+// top/bottom quads (see the comment in the table).
+static stbvox_mesh_vertex stbvox_vmesh_crossed_pair[6][4] =
+{
+   { stbvox_vertex_encode(1,0,2,0,0) ,
+     stbvox_vertex_encode(0,1,2,0,0) ,
+     stbvox_vertex_encode(0,1,0,0,0) ,
+     stbvox_vertex_encode(1,0,0,0,0)  },
+   { stbvox_vertex_encode(1,1,2,0,0) ,
+     stbvox_vertex_encode(0,0,2,0,0) ,
+     stbvox_vertex_encode(0,0,0,0,0) ,
+     stbvox_vertex_encode(1,1,0,0,0)  },
+   { stbvox_vertex_encode(0,1,2,0,0) ,
+     stbvox_vertex_encode(1,0,2,0,0) ,
+     stbvox_vertex_encode(1,0,0,0,0) ,
+     stbvox_vertex_encode(0,1,0,0,0)  },
+   { stbvox_vertex_encode(0,0,2,0,0) ,
+     stbvox_vertex_encode(1,1,2,0,0) ,
+     stbvox_vertex_encode(1,1,0,0,0) ,
+     stbvox_vertex_encode(0,0,0,0,0)  },
+   // not used, so we leave it non-degenerate to make sure it doesn't get gen'd accidentally
+   { stbvox_vertex_encode(0,1,2,0,0) ,
+     stbvox_vertex_encode(1,1,2,0,0) ,
+     stbvox_vertex_encode(1,0,2,0,0) ,
+     stbvox_vertex_encode(0,0,2,0,0)  },
+   { stbvox_vertex_encode(0,0,0,0,0) ,
+     stbvox_vertex_encode(1,0,0,0,0) ,
+     stbvox_vertex_encode(1,1,0,0,0) ,
+     stbvox_vertex_encode(0,1,0,0,0)  }
+};
+
+// dimensions of stbvox_hasface: one row per geometry slot, one column per rotation
+#define STBVOX_MAX_GEOM     16
+#define STBVOX_NUM_ROTATION  4
+
+// this is used to determine if a face is ever generated at all
+//
+// Indexed [geometry][rotation]. Each entry is a 6-bit mask (63 = all six bits
+// set, 0 = none); the trailing comment on each row names the geometry.
+// NOTE(review): presumably bit i corresponds to face i of the STBVOX_FACE_*
+// enum -- confirm where the mask is tested.
+static unsigned char stbvox_hasface[STBVOX_MAX_GEOM][STBVOX_NUM_ROTATION] =
+{
+   { 0,0,0,0 }, // empty
+   { 0,0,0,0 }, // knockout
+   { 63,63,63,63 }, // solid
+   { 63,63,63,63 }, // transp
+   { 63,63,63,63 }, // slab
+   { 63,63,63,63 }, // slab
+   { 1|2|4|48, 8|1|2|48, 4|8|1|48, 2|4|8|48, }, // floor slopes
+   { 1|2|4|48, 8|1|2|48, 4|8|1|48, 2|4|8|48, }, // ceil slopes
+   { 47,47,47,47 }, // wall-projected diagonal with down face
+   { 31,31,31,31 }, // wall-projected diagonal with up face
+   { 63,63,63,63 }, // crossed-pair has special handling, but avoid early-out
+   { 63,63,63,63 }, // force
+   { 63,63,63,63 }, // vheight
+   { 63,63,63,63 }, // vheight
+   { 63,63,63,63 }, // vheight
+   { 63,63,63,63 }, // vheight
+};
+
+// this determines which face type above is visible on each side of the geometry
+//
+// One row per geometry (STBVOX_GEOM_count rows), six STBVOX_FT_* values per row.
+// The first row lists a single 0, so its remaining entries are zero-initialized.
+// NOTE(review): the uncommented rows presumably follow the same geometry order
+// as stbvox_hasface (slabs, floor/ceil slopes, wall-projected diagonals) --
+// confirm against the STBVOX_GEOM_* enum.
+static unsigned char stbvox_facetype[STBVOX_GEOM_count][6] =
+{
+   { 0, },  // STBVOX_GEOM_empty
+   { STBVOX_FT_solid, STBVOX_FT_solid, STBVOX_FT_solid, STBVOX_FT_solid, STBVOX_FT_solid, STBVOX_FT_solid }, // knockout
+   { STBVOX_FT_solid, STBVOX_FT_solid, STBVOX_FT_solid, STBVOX_FT_solid, STBVOX_FT_solid, STBVOX_FT_solid }, // solid
+   { STBVOX_FT_force, STBVOX_FT_force, STBVOX_FT_force, STBVOX_FT_force, STBVOX_FT_force, STBVOX_FT_force }, // transp
+
+   { STBVOX_FT_upper, STBVOX_FT_upper, STBVOX_FT_upper, STBVOX_FT_upper, STBVOX_FT_solid, STBVOX_FT_force },
+   { STBVOX_FT_lower, STBVOX_FT_lower, STBVOX_FT_lower, STBVOX_FT_lower, STBVOX_FT_force, STBVOX_FT_solid },
+   { STBVOX_FT_diag_123, STBVOX_FT_solid, STBVOX_FT_diag_023, STBVOX_FT_none, STBVOX_FT_force, STBVOX_FT_solid },
+   { STBVOX_FT_diag_012, STBVOX_FT_solid, STBVOX_FT_diag_013, STBVOX_FT_none, STBVOX_FT_solid, STBVOX_FT_force },
+
+   { STBVOX_FT_diag_123, STBVOX_FT_solid, STBVOX_FT_diag_023, STBVOX_FT_force, STBVOX_FT_none, STBVOX_FT_solid },
+   { STBVOX_FT_diag_012, STBVOX_FT_solid, STBVOX_FT_diag_013, STBVOX_FT_force, STBVOX_FT_solid, STBVOX_FT_none },
+   { STBVOX_FT_force, STBVOX_FT_force, STBVOX_FT_force, STBVOX_FT_force, 0,0 }, // crossed pair
+   { STBVOX_FT_force, STBVOX_FT_force, STBVOX_FT_force, STBVOX_FT_force, STBVOX_FT_force, STBVOX_FT_force }, // GEOM_force
+
+   { STBVOX_FT_partial,STBVOX_FT_partial,STBVOX_FT_partial,STBVOX_FT_partial, STBVOX_FT_force, STBVOX_FT_solid }, // floor vheight, all neighbors forced
+   { STBVOX_FT_partial,STBVOX_FT_partial,STBVOX_FT_partial,STBVOX_FT_partial, STBVOX_FT_force, STBVOX_FT_solid }, // floor vheight, all neighbors forced
+   { STBVOX_FT_partial,STBVOX_FT_partial,STBVOX_FT_partial,STBVOX_FT_partial, STBVOX_FT_solid, STBVOX_FT_force }, // ceil vheight, all neighbors forced
+   { STBVOX_FT_partial,STBVOX_FT_partial,STBVOX_FT_partial,STBVOX_FT_partial, STBVOX_FT_solid, STBVOX_FT_force }, // ceil vheight, all neighbors forced
+};
+
+// This table indicates what normal to use for the "up" face of a sloped geom
+// @TODO this could be done with math given the current arrangement of the enum, but let's not require it
+//
+// Four entries, one per rotation value 0..3, each an STBVF_* normal code.
+// Note the east/west order here (wu at 1, eu at 3) is the opposite of the
+// ceiling table's (ed at 1, wd at 3); see the @TODO on the second entry.
+static unsigned char stbvox_floor_slope_for_rot[4] =
+{
+   STBVF_su,
+   STBVF_wu, // @TODO: why is this reversed from what it should be? this is a north-is-up face, so slope should be south&up
+   STBVF_nu,
+   STBVF_eu,
+};
+
+// Ceiling counterpart of stbvox_floor_slope_for_rot: four entries, one per
+// rotation value 0..3, each a downward-facing STBVF_*d normal code.
+static unsigned char stbvox_ceil_slope_for_rot[4] =
+{
+   STBVF_sd,
+   STBVF_ed,
+   STBVF_nd,
+   STBVF_wd,
+};
+
+// this table indicates whether, for each pair of types above, a face is visible.
+// each value indicates whether a given type is visible for all neighbor types
+//
+// One entry per STBVOX_FT_* face type. Within an entry, bit (5 + t) is set when
+// the face is visible against a neighbor of face type t. Each value is built as
+// the complement of an "obscured by" mask, shifted left 5 and truncated to 16 bits.
+static unsigned short stbvox_face_visible[STBVOX_FT_count] =
+{
+   // we encode the table by listing which cases cause *obscuration*, and bitwise inverting that
+   // table is pre-shifted by 5 to save a shift when it's accessed
+   (unsigned short) ((~0x07ffu                                          )<<5),  // none is completely obscured by everything
+   (unsigned short) ((~((1u<<STBVOX_FT_solid) | (1<<STBVOX_FT_upper)   ))<<5),  // upper
+   (unsigned short) ((~((1u<<STBVOX_FT_solid) | (1<<STBVOX_FT_lower)   ))<<5),  // lower
+   (unsigned short) ((~((1u<<STBVOX_FT_solid)                          ))<<5),  // solid is completely obscured only by solid
+   (unsigned short) ((~((1u<<STBVOX_FT_solid) | (1<<STBVOX_FT_diag_013)))<<5),  // diag012 matches diag013
+   (unsigned short) ((~((1u<<STBVOX_FT_solid) | (1<<STBVOX_FT_diag_123)))<<5),  // diag023 matches diag123
+   (unsigned short) ((~((1u<<STBVOX_FT_solid) | (1<<STBVOX_FT_diag_012)))<<5),  // diag013 matches diag012
+   (unsigned short) ((~((1u<<STBVOX_FT_solid) | (1<<STBVOX_FT_diag_023)))<<5),  // diag123 matches diag023
+   (unsigned short) ((~0u                                               )<<5),  // force is always rendered regardless, always forces neighbor
+   (unsigned short) ((~((1u<<STBVOX_FT_solid)                          ))<<5),  // partial is completely obscured only by solid
+};
+
+// the vertex heights of the block types, in binary vertex order (zyx):
+// lower: SW, SE, NW, NE; upper: SW, SE, NW, NE
+//
+// Eight rows of eight encoded vertices; only the z field of each entry is
+// non-zero (values 0, 1 or 2). STBVOX_HEIGHTS is a local helper that expands
+// eight heights into one row; it stays defined after the table.
+// NOTE(review): rows are presumably indexed by the first eight geometry
+// values (the last four rows read as upper slab, lower slab, floor slope,
+// ceiling slope) -- confirm against the STBVOX_GEOM_* enum.
+static stbvox_mesh_vertex stbvox_geometry_vheight[8][8] =
+{
+   #define STBVOX_HEIGHTS(a,b,c,d,e,f,g,h) \
+     { stbvox_vertex_encode(0,0,a,0,0),  \
+       stbvox_vertex_encode(0,0,b,0,0),  \
+       stbvox_vertex_encode(0,0,c,0,0),  \
+       stbvox_vertex_encode(0,0,d,0,0),  \
+       stbvox_vertex_encode(0,0,e,0,0),  \
+       stbvox_vertex_encode(0,0,f,0,0),  \
+       stbvox_vertex_encode(0,0,g,0,0),  \
+       stbvox_vertex_encode(0,0,h,0,0) }
+
+   STBVOX_HEIGHTS(0,0,0,0, 2,2,2,2),
+   STBVOX_HEIGHTS(0,0,0,0, 2,2,2,2),
+   STBVOX_HEIGHTS(0,0,0,0, 2,2,2,2),
+   STBVOX_HEIGHTS(0,0,0,0, 2,2,2,2),
+   STBVOX_HEIGHTS(1,1,1,1, 2,2,2,2),
+   STBVOX_HEIGHTS(0,0,0,0, 1,1,1,1),
+   STBVOX_HEIGHTS(0,0,0,0, 0,0,2,2),
+   STBVOX_HEIGHTS(2,2,0,0, 2,2,2,2),
+};
+
+// rotate vertices defined as [z][y][x] coords
+//
+// Indexed [vertex][rotation]; the result is the vertex index after rotation.
+// Column 0 is the identity. Vertices 0-3 (z=0) only map among themselves and
+// vertices 4-7 (z=1) likewise, so the rotation never changes z.
+static unsigned char stbvox_rotate_vertex[8][4] =
+{
+   { 0,1,3,2 }, // zyx=000
+   { 1,3,2,0 }, // zyx=001
+   { 2,0,1,3 }, // zyx=010
+   { 3,2,0,1 }, // zyx=011
+   { 4,5,7,6 }, // zyx=100
+   { 5,7,6,4 }, // zyx=101
+   { 6,4,5,7 }, // zyx=110
+   { 7,6,4,5 }, // zyx=111
+};
+
+#ifdef STBVOX_CONFIG_OPTIMIZED_VHEIGHT
+// optimized vheight generates a single normal over the entire face, even if it's not planar
+//
+// A 4x4x4x4 lookup of STBVF_* up-facing normal codes, compiled only when
+// STBVOX_CONFIG_OPTIMIZED_VHEIGHT is defined.
+// NOTE(review): presumably indexed by the four corner heights (0..3 each);
+// the index order is not visible here -- confirm at the use site.
+static unsigned char stbvox_optimized_face_up_normal[4][4][4][4] =
+{
+   {
+      {
+         { STBVF_u   , STBVF_ne_u, STBVF_ne_u, STBVF_ne_u, },
+         { STBVF_nw_u, STBVF_nu  , STBVF_nu  , STBVF_ne_u, },
+         { STBVF_nw_u, STBVF_nu  , STBVF_nu  , STBVF_nu  , },
+         { STBVF_nw_u, STBVF_nw_u, STBVF_nu  , STBVF_nu  , },
+      },{
+         { STBVF_su  , STBVF_eu  , STBVF_eu  , STBVF_ne_u, },
+         { STBVF_u   , STBVF_ne_u, STBVF_ne_u, STBVF_ne_u, },
+         { STBVF_nw_u, STBVF_nu  , STBVF_nu  , STBVF_ne_u, },
+         { STBVF_nw_u, STBVF_nu  , STBVF_nu  , STBVF_nu  , },
+      },{
+         { STBVF_eu  , STBVF_eu  , STBVF_eu  , STBVF_eu  , },
+         { STBVF_su  , STBVF_eu  , STBVF_eu  , STBVF_ne_u, },
+         { STBVF_u   , STBVF_ne_u, STBVF_ne_u, STBVF_ne_u, },
+         { STBVF_nw_u, STBVF_nu  , STBVF_nu  , STBVF_ne_u, },
+      },{
+         { STBVF_eu  , STBVF_eu  , STBVF_eu  , STBVF_eu  , },
+         { STBVF_eu  , STBVF_eu  , STBVF_eu  , STBVF_eu  , },
+         { STBVF_su  , STBVF_eu  , STBVF_eu  , STBVF_ne_u, },
+         { STBVF_u   , STBVF_ne_u, STBVF_ne_u, STBVF_ne_u, },
+      },
+   },{
+      {
+         { STBVF_sw_u, STBVF_u   , STBVF_ne_u, STBVF_ne_u, },
+         { STBVF_wu  , STBVF_nw_u, STBVF_nu  , STBVF_nu  , },
+         { STBVF_wu  , STBVF_nw_u, STBVF_nu  , STBVF_nu  , },
+         { STBVF_nw_u, STBVF_nw_u, STBVF_nw_u, STBVF_nu  , },
+      },{
+         { STBVF_su  , STBVF_su  , STBVF_eu  , STBVF_eu  , },
+         { STBVF_sw_u, STBVF_u   , STBVF_ne_u, STBVF_ne_u, },
+         { STBVF_wu  , STBVF_nw_u, STBVF_nu  , STBVF_nu  , },
+         { STBVF_wu  , STBVF_nw_u, STBVF_nu  , STBVF_nu  , },
+      },{
+         { STBVF_su  , STBVF_eu  , STBVF_eu  , STBVF_eu  , },
+         { STBVF_su  , STBVF_su  , STBVF_eu  , STBVF_eu  , },
+         { STBVF_sw_u, STBVF_u   , STBVF_ne_u, STBVF_ne_u, },
+         { STBVF_wu  , STBVF_nw_u, STBVF_nu  , STBVF_nu  , },
+      },{
+         { STBVF_su  , STBVF_eu  , STBVF_eu  , STBVF_eu  , },
+         { STBVF_su  , STBVF_eu  , STBVF_eu  , STBVF_eu  , },
+         { STBVF_su  , STBVF_su  , STBVF_eu  , STBVF_eu  , },
+         { STBVF_sw_u, STBVF_u   , STBVF_ne_u, STBVF_ne_u, },
+      },
+   },{
+      {
+         { STBVF_sw_u, STBVF_sw_u, STBVF_u   , STBVF_ne_u, },
+         { STBVF_wu  , STBVF_wu  , STBVF_nw_u, STBVF_nu  , },
+         { STBVF_wu  , STBVF_wu  , STBVF_nw_u, STBVF_nu  , },
+         { STBVF_wu  , STBVF_nw_u, STBVF_nw_u, STBVF_nw_u, },
+      },{
+         { STBVF_su  , STBVF_su  , STBVF_su  , STBVF_eu  , },
+         { STBVF_sw_u, STBVF_sw_u, STBVF_u   , STBVF_ne_u, },
+         { STBVF_wu  , STBVF_wu  , STBVF_nw_u, STBVF_nu  , },
+         { STBVF_wu  , STBVF_wu  , STBVF_nw_u, STBVF_nu  , },
+      },{
+         { STBVF_su  , STBVF_su  , STBVF_eu  , STBVF_eu  , },
+         { STBVF_su  , STBVF_su  , STBVF_su  , STBVF_eu  , },
+         { STBVF_sw_u, STBVF_sw_u, STBVF_u   , STBVF_ne_u, },
+         { STBVF_wu  , STBVF_wu  , STBVF_nw_u, STBVF_nu  , },
+      },{
+         { STBVF_su  , STBVF_su  , STBVF_eu  , STBVF_eu  , },
+         { STBVF_su  , STBVF_su  , STBVF_eu  , STBVF_eu  , },
+         { STBVF_su  , STBVF_su  , STBVF_su  , STBVF_eu  , },
+         { STBVF_sw_u, STBVF_sw_u, STBVF_u   , STBVF_ne_u, },
+      },
+   },{
+      {
+         { STBVF_sw_u, STBVF_sw_u, STBVF_sw_u, STBVF_u   , },
+         { STBVF_sw_u, STBVF_wu  , STBVF_wu  , STBVF_nw_u, },
+         { STBVF_wu  , STBVF_wu  , STBVF_wu  , STBVF_nw_u, },
+         { STBVF_wu  , STBVF_wu  , STBVF_nw_u, STBVF_nw_u, },
+      },{
+         { STBVF_sw_u, STBVF_su  , STBVF_su  , STBVF_su  , },
+         { STBVF_sw_u, STBVF_sw_u, STBVF_sw_u, STBVF_u   , },
+         { STBVF_sw_u, STBVF_wu  , STBVF_wu  , STBVF_nw_u, },
+         { STBVF_wu  , STBVF_wu  , STBVF_wu  , STBVF_nw_u, },
+      },{
+         { STBVF_su  , STBVF_su  , STBVF_su  , STBVF_eu  , },
+         { STBVF_sw_u, STBVF_su  , STBVF_su  , STBVF_su  , },
+         { STBVF_sw_u, STBVF_sw_u, STBVF_sw_u, STBVF_u   , },
+         { STBVF_sw_u, STBVF_wu  , STBVF_wu  , STBVF_nw_u, },
+      },{
+         { STBVF_su  , STBVF_su  , STBVF_su  , STBVF_eu  , },
+         { STBVF_su  , STBVF_su  , STBVF_su  , STBVF_eu  , },
+         { STBVF_sw_u, STBVF_su  , STBVF_su  , STBVF_su  , },
+         { STBVF_sw_u, STBVF_sw_u, STBVF_sw_u, STBVF_u   , },
+      },
+   },
+};
+#else
+// which normal to use for a given vheight that's planar
+// @TODO: this table was constructed by hand and may have bugs
+//
+// Per the row comments, the indices are [nw][se][sw] heights (0..3 each); the
+// fourth corner is implied by planarity as ne = se+nw-sw. Each row comment lists
+// sw,se,nw,ne for the four entries of that row. Entries are 0 where the implied
+// ne height falls outside 0..3.
+//                                 nw se sw
+static unsigned char stbvox_planar_face_up_normal[4][4][4] =
+{
+   {                                                      // sw,se,nw,ne;  ne = se+nw-sw
+      { STBVF_u   , 0         , 0         , 0          }, //  0,0,0,0; 1,0,0,-1; 2,0,0,-2; 3,0,0,-3;
+      { STBVF_u   , STBVF_u   , 0         , 0          }, //  0,1,0,1; 1,1,0, 0; 2,1,0,-1; 3,1,0,-2;
+      { STBVF_wu  , STBVF_nw_u, STBVF_nu  , 0          }, //  0,2,0,2; 1,2,0, 1; 2,2,0, 0; 3,2,0,-1;
+      { STBVF_wu  , STBVF_nw_u, STBVF_nw_u, STBVF_nu   }, //  0,3,0,3; 1,3,0, 2; 2,3,0, 1; 3,3,0, 0;
+   },{
+      { STBVF_u   , STBVF_u   , 0         , 0          }, //  0,0,1,1; 1,0,1, 0; 2,0,1,-1; 3,0,1,-2;
+      { STBVF_sw_u, STBVF_u   , STBVF_ne_u, 0          }, //  0,1,1,2; 1,1,1, 1; 2,1,1, 0; 3,1,1,-1;
+      { STBVF_sw_u, STBVF_u   , STBVF_u   , STBVF_ne_u }, //  0,2,1,3; 1,2,1, 2; 2,2,1, 1; 3,2,1, 0;
+      { 0         , STBVF_wu  , STBVF_nw_u, STBVF_nu   }, //  0,3,1,4; 1,3,1, 3; 2,3,1, 2; 3,3,1, 1;
+   },{
+      { STBVF_su  , STBVF_se_u, STBVF_eu  , 0          }, //  0,0,2,2; 1,0,2, 1; 2,0,2, 0; 3,0,2,-1;
+      { STBVF_sw_u, STBVF_u   , STBVF_u   , STBVF_ne_u }, //  0,1,2,3; 1,1,2, 2; 2,1,2, 1; 3,1,2, 0;
+      { 0         , STBVF_sw_u, STBVF_u   , STBVF_ne_u }, //  0,2,2,4; 1,2,2, 3; 2,2,2, 2; 3,2,2, 1;
+      { 0         , 0         , STBVF_u   , STBVF_u    }, //  0,3,2,5; 1,3,2, 4; 2,3,2, 3; 3,3,2, 2;
+   },{
+      { STBVF_su  , STBVF_se_u, STBVF_se_u, STBVF_eu   }, //  0,0,3,3; 1,0,3, 2; 2,0,3, 1; 3,0,3, 0;
+      { 0         , STBVF_su  , STBVF_se_u, STBVF_eu   }, //  0,1,3,4; 1,1,3, 3; 2,1,3, 2; 3,1,3, 1;
+      { 0         , 0         , STBVF_u   , STBVF_u    }, //  0,2,3,5; 1,2,3, 4; 2,2,3, 3; 3,2,3, 2;
+      { 0         , 0         , 0         , STBVF_u    }, //  0,3,3,6; 1,3,3, 5; 2,3,3, 4; 3,3,3, 3;
+   }
+};
+
+// these tables were constructed automatically using a variant of the code
+// below; however, they seem wrong, so who knows
+//
+// Normal for the triangle through corners 0,1,2 of a split vheight face.
+// Looked up as [ht[2]][ht[1]][ht[0]], where ht[] holds corner heights in the
+// order SW,SE,NW,NE, i.e. indices are [nw][se][sw].
+static unsigned char stbvox_face_up_normal_012[4][4][4] =
+{
+   {
+      { STBVF_u   , STBVF_ne_u, STBVF_ne_u, STBVF_ne_u, },
+      { STBVF_wu  , STBVF_nu  , STBVF_ne_u, STBVF_ne_u, },
+      { STBVF_wu  , STBVF_nw_u, STBVF_nu  , STBVF_ne_u, },
+      { STBVF_wu  , STBVF_nw_u, STBVF_nw_u, STBVF_nu  , },
+   },{
+      { STBVF_su  , STBVF_eu  , STBVF_ne_u, STBVF_ne_u, },
+      { STBVF_sw_u, STBVF_u   , STBVF_ne_u, STBVF_ne_u, },
+      { STBVF_sw_u, STBVF_wu  , STBVF_nu  , STBVF_ne_u, },
+      { STBVF_sw_u, STBVF_wu  , STBVF_nw_u, STBVF_nu  , },
+   },{
+      { STBVF_su  , STBVF_eu  , STBVF_eu  , STBVF_ne_u, },
+      { STBVF_sw_u, STBVF_su  , STBVF_eu  , STBVF_ne_u, },
+      { STBVF_sw_u, STBVF_sw_u, STBVF_u   , STBVF_ne_u, },
+      { STBVF_sw_u, STBVF_sw_u, STBVF_wu  , STBVF_nu  , },
+   },{
+      { STBVF_su  , STBVF_su  , STBVF_eu  , STBVF_eu  , },
+      { STBVF_sw_u, STBVF_su  , STBVF_eu  , STBVF_eu  , },
+      { STBVF_sw_u, STBVF_sw_u, STBVF_su  , STBVF_eu  , },
+      { STBVF_sw_u, STBVF_sw_u, STBVF_sw_u, STBVF_u   , },
+   }
+};
+
+// Normal for the triangle through corners 0,1,3 of a split vheight face.
+// Looked up as [ht[3]][ht[1]][ht[0]], where ht[] holds corner heights in the
+// order SW,SE,NW,NE, i.e. indices are [ne][se][sw].
+static unsigned char stbvox_face_up_normal_013[4][4][4] =
+{
+   {
+      { STBVF_u   , STBVF_eu  , STBVF_eu  , STBVF_eu  , },
+      { STBVF_nw_u, STBVF_nu  , STBVF_ne_u, STBVF_ne_u, },
+      { STBVF_nw_u, STBVF_nw_u, STBVF_nu  , STBVF_ne_u, },
+      { STBVF_nw_u, STBVF_nw_u, STBVF_nw_u, STBVF_nu  , },
+   },{
+      { STBVF_su  , STBVF_eu  , STBVF_eu  , STBVF_eu  , },
+      { STBVF_wu  , STBVF_u   , STBVF_eu  , STBVF_eu  , },
+      { STBVF_nw_u, STBVF_nw_u, STBVF_nu  , STBVF_ne_u, },
+      { STBVF_nw_u, STBVF_nw_u, STBVF_nw_u, STBVF_nu  , },
+   },{
+      { STBVF_su  , STBVF_su  , STBVF_su  , STBVF_eu  , },
+      { STBVF_sw_u, STBVF_su  , STBVF_eu  , STBVF_eu  , },
+      { STBVF_wu  , STBVF_wu  , STBVF_u   , STBVF_eu  , },
+      { STBVF_nw_u, STBVF_nw_u, STBVF_nw_u, STBVF_nu  , },
+   },{
+      { STBVF_su  , STBVF_su  , STBVF_su  , STBVF_eu  , },
+      { STBVF_sw_u, STBVF_su  , STBVF_su  , STBVF_su  , },
+      { STBVF_sw_u, STBVF_sw_u, STBVF_su  , STBVF_eu  , },
+      { STBVF_wu  , STBVF_wu  , STBVF_wu  , STBVF_u   , },
+   }
+};
+
+// Normal for the triangle through corners 0,2,3 of a split vheight face.
+// Looked up as [ht[3]][ht[2]][ht[0]], where ht[] holds corner heights in the
+// order SW,SE,NW,NE, i.e. indices are [ne][nw][sw].
+static unsigned char stbvox_face_up_normal_023[4][4][4] =
+{
+   {
+      { STBVF_u   , STBVF_nu  , STBVF_nu  , STBVF_nu  , },
+      { STBVF_eu  , STBVF_eu  , STBVF_ne_u, STBVF_ne_u, },
+      { STBVF_su  , STBVF_eu  , STBVF_eu  , STBVF_ne_u, },
+      { STBVF_eu  , STBVF_eu  , STBVF_eu  , STBVF_eu  , },
+   },{
+      { STBVF_wu  , STBVF_nw_u, STBVF_nw_u, STBVF_nw_u, },
+      { STBVF_su  , STBVF_u   , STBVF_nu  , STBVF_nu  , },
+      { STBVF_su  , STBVF_eu  , STBVF_eu  , STBVF_ne_u, },
+      { STBVF_su  , STBVF_su  , STBVF_eu  , STBVF_eu  , },
+   },{
+      { STBVF_wu  , STBVF_nw_u, STBVF_nw_u, STBVF_nw_u, },
+      { STBVF_sw_u, STBVF_wu  , STBVF_nw_u, STBVF_nw_u, },
+      { STBVF_su  , STBVF_su  , STBVF_u   , STBVF_nu  , },
+      { STBVF_su  , STBVF_su  , STBVF_eu  , STBVF_eu  , },
+   },{
+      { STBVF_wu  , STBVF_nw_u, STBVF_nw_u, STBVF_nw_u, },
+      { STBVF_sw_u, STBVF_wu  , STBVF_nw_u, STBVF_nw_u, },
+      { STBVF_sw_u, STBVF_sw_u, STBVF_wu  , STBVF_nw_u, },
+      { STBVF_su  , STBVF_su  , STBVF_su  , STBVF_u   , },
+   }
+};
+
+// Normal for the triangle through corners 1,2,3 of a split vheight face.
+// Looked up as [ht[3]][ht[2]][ht[1]], where ht[] holds corner heights in the
+// order SW,SE,NW,NE, i.e. indices are [ne][nw][se].
+static unsigned char stbvox_face_up_normal_123[4][4][4] =
+{
+   {
+      { STBVF_u   , STBVF_nu  , STBVF_nu  , STBVF_nu  , },
+      { STBVF_eu  , STBVF_ne_u, STBVF_ne_u, STBVF_ne_u, },
+      { STBVF_eu  , STBVF_ne_u, STBVF_ne_u, STBVF_ne_u, },
+      { STBVF_eu  , STBVF_ne_u, STBVF_ne_u, STBVF_ne_u, },
+   },{
+      { STBVF_sw_u, STBVF_wu  , STBVF_nw_u, STBVF_nw_u, },
+      { STBVF_su  , STBVF_u   , STBVF_nu  , STBVF_nu  , },
+      { STBVF_eu  , STBVF_eu  , STBVF_ne_u, STBVF_ne_u, },
+      { STBVF_eu  , STBVF_eu  , STBVF_ne_u, STBVF_ne_u, },
+   },{
+      { STBVF_sw_u, STBVF_sw_u, STBVF_wu  , STBVF_nw_u, },
+      { STBVF_sw_u, STBVF_sw_u, STBVF_wu  , STBVF_nw_u, },
+      { STBVF_su  , STBVF_su  , STBVF_u   , STBVF_nu  , },
+      { STBVF_su  , STBVF_eu  , STBVF_eu  , STBVF_ne_u, },
+   },{
+      { STBVF_sw_u, STBVF_sw_u, STBVF_sw_u, STBVF_wu  , },
+      { STBVF_sw_u, STBVF_sw_u, STBVF_sw_u, STBVF_wu  , },
+      { STBVF_sw_u, STBVF_sw_u, STBVF_sw_u, STBVF_wu  , },
+      { STBVF_su  , STBVF_su  , STBVF_su  , STBVF_u   , },
+   }
+};
+#endif
+
+// Reserve room for one quad in output mesh `mesh` and record its face word.
+//
+// vertices[0..3] receive pointers to four consecutive vertex slots in output
+// buffer 0, spaced output_step[mesh][0] bytes apart; the buffer cursor is
+// advanced past them. `face` is then stored either right after each vertex
+// (STBVOX_ICONFIG_FACE_ATTRIBUTE) or once into output buffer 1, whose cursor
+// advances by 4 bytes. No bounds checking is done here.
+void stbvox_get_quad_vertex_pointer(stbvox_mesh_maker *mm, int mesh, stbvox_mesh_vertex **vertices, stbvox_mesh_face face)
+{
+   char *cursor = mm->output_cur[mesh][0];
+   const int stride = mm->output_step[mesh][0];
+   int corner;
+
+   // carve four vertex slots out of the vertex buffer
+   for (corner = 0; corner < 4; ++corner) {
+      vertices[corner] = (stbvox_mesh_vertex *) cursor;
+      cursor += stride;
+   }
+   mm->output_cur[mesh][0] = cursor;
+
+   // emit the per-face data
+   #ifdef STBVOX_ICONFIG_FACE_ATTRIBUTE
+      // interleaved layout: the face word sits directly behind each vertex
+      for (corner = 0; corner < 4; ++corner)
+         *(stbvox_mesh_face *) (vertices[corner]+1) = face;
+   #else
+      // separate layout: one face word per quad in the second buffer
+      *(stbvox_mesh_face *) mm->output_cur[mesh][1] = face;
+      mm->output_cur[mesh][1] += 4;
+   #endif
+}
+
+// Emit one quad for `face` of the voxel at input offset `v_off` into output
+// mesh `mesh`.
+//
+//   rot, normal - passed with face and v_off to stbvox_compute_mesh_face_value()
+//                 to build the per-face data word; rot.block is also used to
+//                 rotate the face index for block_texlerp_face
+//   pos         - not used by this function
+//   vertbase    - encoded base vertex of the voxel, added to every corner
+//   face_coord  - four encoded corner offsets for this quad
+//
+// Each output vertex is the sum vertbase + face_coord[i] + lighting + texlerp.
+// The texlerp source is the first available of: packed_compact (only for
+// faces enabled by the STBVOX_CONFIG_*_TEXLERP_PACKED options), block_texlerp,
+// block_texlerp_face, texlerp_face3, texlerp_simple, texlerp (+texlerp_vert3),
+// else a constant 7. Lighting is per-vertex, a four-sample gather, or a
+// constant 63 when no lighting input is provided.
+void stbvox_make_mesh_for_face(stbvox_mesh_maker *mm, stbvox_rotate rot, int face, int v_off, stbvox_pos pos, stbvox_mesh_vertex vertbase, stbvox_mesh_vertex *face_coord, unsigned char mesh, int normal)
+{
+   stbvox_mesh_face face_data = stbvox_compute_mesh_face_value(mm,rot,face,v_off, normal);
+
+   // still need to compute ao & texlerp for each vertex
+
+   // first compute texlerp into p1
+   stbvox_mesh_vertex p1[4] = { 0 };
+
+   #if defined(STBVOX_CONFIG_DOWN_TEXLERP_PACKED) && defined(STBVOX_CONFIG_UP_TEXLERP_PACKED)
+      #define STBVOX_USE_PACKED(f) ((f) == STBVOX_FACE_up || (f) == STBVOX_FACE_down)
+   #elif defined(STBVOX_CONFIG_UP_TEXLERP_PACKED)
+      #define STBVOX_USE_PACKED(f) ((f) == STBVOX_FACE_up                           )
+   #elif defined(STBVOX_CONFIG_DOWN_TEXLERP_PACKED)
+      #define STBVOX_USE_PACKED(f) (                         (f) == STBVOX_FACE_down)
+   #endif
+
+   #if defined(STBVOX_CONFIG_DOWN_TEXLERP_PACKED) || defined(STBVOX_CONFIG_UP_TEXLERP_PACKED)
+   if (STBVOX_USE_PACKED(face)) {
+      // bit 4 (value 16) of this voxel's packed byte gates the packed texlerp;
+      // without it (or without packed data at all) use the default value
+      if (!mm->input.packed_compact || 0==(mm->input.packed_compact[v_off]&16))
+         goto set_default;
+      // per-vertex texlerp lives in the bits from 5 upward of each corner's packed byte
+      p1[0] = (mm->input.packed_compact[v_off + mm->cube_vertex_offset[face][0]] >> 5);
+      p1[1] = (mm->input.packed_compact[v_off + mm->cube_vertex_offset[face][1]] >> 5);
+      p1[2] = (mm->input.packed_compact[v_off + mm->cube_vertex_offset[face][2]] >> 5);
+      p1[3] = (mm->input.packed_compact[v_off + mm->cube_vertex_offset[face][3]] >> 5);
+      p1[0] = stbvox_vertex_encode(0,0,0,0,p1[0]);
+      p1[1] = stbvox_vertex_encode(0,0,0,0,p1[1]);
+      p1[2] = stbvox_vertex_encode(0,0,0,0,p1[2]);
+      p1[3] = stbvox_vertex_encode(0,0,0,0,p1[3]);
+      goto skip;
+   }
+   #endif
+
+   if (mm->input.block_texlerp) {
+      // one texlerp value per block type
+      stbvox_block_type bt = mm->input.blocktype[v_off];
+      unsigned char val = mm->input.block_texlerp[bt];
+      p1[0] = p1[1] = p1[2] = p1[3] = stbvox_vertex_encode(0,0,0,0,val);
+   } else if (mm->input.block_texlerp_face) {
+      // one texlerp value per block type and (block-rotated) face
+      stbvox_block_type bt = mm->input.blocktype[v_off];
+      unsigned char bt_face = STBVOX_ROTATE(face, rot.block);
+      unsigned char val = mm->input.block_texlerp_face[bt][bt_face];
+      p1[0] = p1[1] = p1[2] = p1[3] = stbvox_vertex_encode(0,0,0,0,val);
+   } else if (mm->input.texlerp_face3) {
+      // 3-bit per-face field of this voxel; up/down faces are remapped through a table
+      unsigned char val = (mm->input.texlerp_face3[v_off] >> stbvox_face3_lerp[face]) & 7;
+      if (face >= STBVOX_FACE_up)
+         val = stbvox_face3_updown[val];
+      p1[0] = p1[1] = p1[2] = p1[3] = stbvox_vertex_encode(0,0,0,0,val);
+   } else if (mm->input.texlerp_simple) {
+      unsigned char val = mm->input.texlerp_simple[v_off];
+      unsigned char lerp_face = (val >> 2) & 7;
+      if (lerp_face == face) {
+         // the selected face takes per-vertex values from the top 3 bits of each corner
+         p1[0] = (mm->input.texlerp_simple[v_off + mm->cube_vertex_offset[face][0]] >> 5) & 7;
+         p1[1] = (mm->input.texlerp_simple[v_off + mm->cube_vertex_offset[face][1]] >> 5) & 7;
+         p1[2] = (mm->input.texlerp_simple[v_off + mm->cube_vertex_offset[face][2]] >> 5) & 7;
+         p1[3] = (mm->input.texlerp_simple[v_off + mm->cube_vertex_offset[face][3]] >> 5) & 7;
+         p1[0] = stbvox_vertex_encode(0,0,0,0,p1[0]);
+         p1[1] = stbvox_vertex_encode(0,0,0,0,p1[1]);
+         p1[2] = stbvox_vertex_encode(0,0,0,0,p1[2]);
+         p1[3] = stbvox_vertex_encode(0,0,0,0,p1[3]);
+      } else {
+         // every other face uses the 2-bit base value of this voxel
+         unsigned char base = stbvox_vert_lerp_for_simple[val&3];
+         p1[0] = p1[1] = p1[2] = p1[3] = stbvox_vertex_encode(0,0,0,0,base);
+      }
+   } else if (mm->input.texlerp) {
+      unsigned char facelerp = (mm->input.texlerp[v_off] >> stbvox_face_lerp[face]) & 3;
+      if (facelerp == STBVOX_TEXLERP_FACE_use_vert) {
+         // Per-vertex lookups must be relative to this voxel (v_off), like
+         // every other per-voxel input read in this function. Indexing by
+         // cube_vertex_offset alone sampled entries next to the start of the
+         // array regardless of which voxel was being meshed.
+         if (mm->input.texlerp_vert3 && face != STBVOX_FACE_down) {
+            unsigned char shift = stbvox_vert3_lerp[face];
+            p1[0] = (mm->input.texlerp_vert3[v_off + mm->cube_vertex_offset[face][0]] >> shift) & 7;
+            p1[1] = (mm->input.texlerp_vert3[v_off + mm->cube_vertex_offset[face][1]] >> shift) & 7;
+            p1[2] = (mm->input.texlerp_vert3[v_off + mm->cube_vertex_offset[face][2]] >> shift) & 7;
+            p1[3] = (mm->input.texlerp_vert3[v_off + mm->cube_vertex_offset[face][3]] >> shift) & 7;
+         } else {
+            p1[0] = stbvox_vert_lerp_for_simple[mm->input.texlerp[v_off + mm->cube_vertex_offset[face][0]]>>6];
+            p1[1] = stbvox_vert_lerp_for_simple[mm->input.texlerp[v_off + mm->cube_vertex_offset[face][1]]>>6];
+            p1[2] = stbvox_vert_lerp_for_simple[mm->input.texlerp[v_off + mm->cube_vertex_offset[face][2]]>>6];
+            p1[3] = stbvox_vert_lerp_for_simple[mm->input.texlerp[v_off + mm->cube_vertex_offset[face][3]]>>6];
+         }
+         p1[0] = stbvox_vertex_encode(0,0,0,0,p1[0]);
+         p1[1] = stbvox_vertex_encode(0,0,0,0,p1[1]);
+         p1[2] = stbvox_vertex_encode(0,0,0,0,p1[2]);
+         p1[3] = stbvox_vertex_encode(0,0,0,0,p1[3]);
+      } else {
+         p1[0] = p1[1] = p1[2] = p1[3] = stbvox_vertex_encode(0,0,0,0,stbvox_vert_lerp_for_face_lerp[facelerp]);
+      }
+   } else {
+      #if defined(STBVOX_CONFIG_UP_TEXLERP_PACKED) || defined(STBVOX_CONFIG_DOWN_TEXLERP_PACKED)
+      set_default:
+      #endif
+      p1[0] = p1[1] = p1[2] = p1[3] = stbvox_vertex_encode(0,0,0,0,7); // @TODO make this configurable
+   }
+
+   #if defined(STBVOX_CONFIG_UP_TEXLERP_PACKED) || defined(STBVOX_CONFIG_DOWN_TEXLERP_PACKED)
+   skip:
+   #endif
+
+   // now compute lighting and store to vertices
+   {
+      stbvox_mesh_vertex *mv[4];
+      stbvox_get_quad_vertex_pointer(mm, mesh, mv, face_data);
+
+      if (mm->input.lighting) {
+         // @TODO: lighting at block centers, but not gathered, instead constant-per-face
+         if (mm->input.lighting_at_vertices) {
+            // lighting is stored per vertex: use the low 6 bits directly
+            int i;
+            for (i=0; i < 4; ++i) {
+               *mv[i] = vertbase + face_coord[i]
+                          + stbvox_vertex_encode(0,0,0,mm->input.lighting[v_off + mm->cube_vertex_offset[face][i]] & 63,0)
+                          + p1[i];
+            }
+         } else {
+            unsigned char *amb = &mm->input.lighting[v_off];
+            int i,j;
+            // When the low 2 bits of lighting carry other data they are masked
+            // off. The rounding constants keep the 6-bit result in range for
+            // 8-bit samples: (4*252+8)>>4 == 63 and (4*255+2)>>4 == 63.
+            #if defined(STBVOX_CONFIG_ROTATION_IN_LIGHTING) || defined(STBVOX_CONFIG_VHEIGHT_IN_LIGHTING)
+            #define STBVOX_GET_LIGHTING(light) ((light) & ~3)
+            #define STBVOX_LIGHTING_ROUNDOFF   8
+            #else
+            #define STBVOX_GET_LIGHTING(light) (light)
+            #define STBVOX_LIGHTING_ROUNDOFF   2
+            #endif
+
+            for (i=0; i < 4; ++i) {
+               // for each vertex, gather from the four neighbor blocks it's facing
+               unsigned char *vamb = &amb[mm->cube_vertex_offset[face][i]];
+               int total=0;
+               for (j=0; j < 4; ++j)
+                  total += STBVOX_GET_LIGHTING(vamb[mm->vertex_gather_offset[face][j]]);
+               *mv[i] = vertbase + face_coord[i]
+                          + stbvox_vertex_encode(0,0,0,(total+STBVOX_LIGHTING_ROUNDOFF)>>4,0)
+                          + p1[i];
+                          // >> 4 is because:
+                          //   >> 2 to divide by 4 to get average over 4 samples
+                          //   >> 2 because input is 8 bits, output is 6 bits
+            }
+
+            // @TODO: note that gathering baked *lighting*
+            // is different from gathering baked ao; baked ao can count
+            // solid blocks as 0 ao, but baked lighting wants average
+            // of non-blocked--not take average & treat blocked as 0. And
+            // we can't bake the right value into the solid blocks
+            // because they can have different lighting values on
+            // different sides. So we need to actually gather and
+            // then divide by 0..4 (which we can do with a table-driven
+            // multiply, or have an 'if' for the 3 case)
+
+         }
+      } else {
+         // no lighting input: every vertex gets the maximum 6-bit value
+         vertbase += stbvox_vertex_encode(0,0,0,63,0);
+         *mv[0] = vertbase + face_coord[0] + p1[0];
+         *mv[1] = vertbase + face_coord[1] + p1[1];
+         *mv[2] = vertbase + face_coord[2] + p1[2];
+         *mv[3] = vertbase + face_coord[3] + p1[3];
+      }
+   }
+}
+
+// get opposite-facing normal & texgen for opposite face, used to map up-facing vheight data to down-facing data
+//
+// Indexed by an STBVF_* code; each non-zero entry is the code pointing the
+// opposite way (e.g. the first row maps to w,s,e,n,d,u,wd,wu). In every row the
+// up/down pair swaps: the *_u slot maps to the opposite *_d and the *_d slot to
+// the opposite *_u. The last row previously listed STBVF_ne_d in both slots,
+// breaking that pattern; the second slot is corrected to STBVF_ne_u.
+// The first four entries of rows 2-4 are left as 0 (no reverse provided).
+static unsigned char stbvox_reverse_face[STBVF_count] =
+{
+   STBVF_w, STBVF_s, STBVF_e, STBVF_n, STBVF_d   , STBVF_u   , STBVF_wd, STBVF_wu,
+         0,       0,       0,       0, STBVF_sw_d, STBVF_sw_u, STBVF_sd, STBVF_su,
+         0,       0,       0,       0, STBVF_se_d, STBVF_se_u, STBVF_ed, STBVF_eu,
+         0,       0,       0,       0, STBVF_ne_d, STBVF_ne_u, STBVF_nd, STBVF_nu
+};
+
+#ifndef STBVOX_CONFIG_OPTIMIZED_VHEIGHT
+// render non-planar quads by splitting into two triangles, rendering each as a degenerate quad
+// Render a non-planar quad as two triangles split along the 1-2 diagonal.
+// Each triangle is emitted as a degenerate quad (first corner repeated as the
+// fourth) with its own normal, looked up from the corner heights in ht[]
+// (ordered SW,SE,NW,NE). For the down face both normals are reversed.
+static void stbvox_make_12_split_mesh_for_face(stbvox_mesh_maker *mm, stbvox_rotate rot, int face, int v_off, stbvox_pos pos, stbvox_mesh_vertex vertbase, stbvox_mesh_vertex *face_coord, unsigned char mesh, unsigned char *ht)
+{
+   unsigned char n012 = stbvox_face_up_normal_012[ht[2]][ht[1]][ht[0]];
+   unsigned char n123 = stbvox_face_up_normal_123[ht[3]][ht[2]][ht[1]];
+   stbvox_mesh_vertex tri[4];
+
+   if (face == STBVOX_FACE_down) {
+      n012 = stbvox_reverse_face[n012];
+      n123 = stbvox_reverse_face[n123];
+   }
+
+   // the floor side face_coord is stored in order NW,NE,SE,SW, but ht[] is stored SW,SE,NW,NE
+
+   // first triangle: face_coord 2,3,0 (corner 2 closes the degenerate quad)
+   tri[0] = face_coord[2];
+   tri[3] = face_coord[2];
+   tri[1] = face_coord[3];
+   tri[2] = face_coord[0];
+   stbvox_make_mesh_for_face(mm, rot, face, v_off, pos, vertbase, tri, mesh, n012);
+
+   // second triangle: face_coord 2,0,1, reusing the shared first/last corner
+   tri[1] = face_coord[0];
+   tri[2] = face_coord[1];
+   stbvox_make_mesh_for_face(mm, rot, face, v_off, pos, vertbase, tri, mesh, n123);
+}
+
+// Render a non-planar quad as two triangles split along the 0-3 diagonal.
+// Each triangle is emitted as a degenerate quad (first corner repeated as the
+// fourth) with its own normal, looked up from the corner heights in ht[]
+// (ordered SW,SE,NW,NE). For the down face both normals are reversed.
+static void stbvox_make_03_split_mesh_for_face(stbvox_mesh_maker *mm, stbvox_rotate rot, int face, int v_off, stbvox_pos pos, stbvox_mesh_vertex vertbase, stbvox_mesh_vertex *face_coord, unsigned char mesh, unsigned char *ht)
+{
+   unsigned char n013 = stbvox_face_up_normal_013[ht[3]][ht[1]][ht[0]];
+   unsigned char n023 = stbvox_face_up_normal_023[ht[3]][ht[2]][ht[0]];
+   stbvox_mesh_vertex tri[4];
+
+   if (face == STBVOX_FACE_down) {
+      n013 = stbvox_reverse_face[n013];
+      n023 = stbvox_reverse_face[n023];
+   }
+
+   // first triangle: face_coord 1,2,3 (corner 1 closes the degenerate quad)
+   tri[0] = face_coord[1];
+   tri[3] = face_coord[1];
+   tri[1] = face_coord[2];
+   tri[2] = face_coord[3];
+   stbvox_make_mesh_for_face(mm, rot, face, v_off, pos, vertbase, tri, mesh, n013);
+
+   // second triangle: face_coord 1,3,0, reusing the shared first/last corner
+   tri[1] = face_coord[3];
+   tri[2] = face_coord[0];
+   stbvox_make_mesh_for_face(mm, rot, face, v_off, pos, vertbase, tri, mesh, n023);  // this one is correct!
+}
+#endif
+
+// Default for STBVOX_CONFIG_PRECISION_Z when the user has not defined it:
+// the number of bits pos.z is shifted left by when a block's base vertex
+// is encoded.
+#ifndef STBVOX_CONFIG_PRECISION_Z
+#define STBVOX_CONFIG_PRECISION_Z 1
+#endif
+
+// simple case for mesh generation: we have only solid and empty blocks
+// Emits a quad for each face of the voxel at `v_off` whose neighboring
+// blocktype is 0, in the order up, down, north, south, east, west.
+// `vmesh` supplies four corner offsets per face, starting at vmesh+4*face.
+// The output mesh comes from input.selector, else input.block_selector, else
+// mm->default_mesh. If that mesh's vertex buffer cannot hold six more
+// output_size[mesh][0]-sized entries, mm->full is set and nothing is emitted.
+static void stbvox_make_mesh_for_block(stbvox_mesh_maker *mm, stbvox_pos pos, int v_off, stbvox_mesh_vertex *vmesh)
+{
+   unsigned char *center = &mm->input.blocktype[v_off];
+   stbvox_mesh_vertex origin = stbvox_vertex_encode(pos.x, pos.y, pos.z << STBVOX_CONFIG_PRECISION_Z , 0,0);
+   stbvox_rotate rot = { 0,0,0,0 };
+   unsigned char quarter_turns = 0;
+   unsigned char mesh;
+   int side_face[4];
+   int side_step[4];
+   int i;
+
+   // pick the output mesh
+   if (mm->input.selector)
+      mesh = mm->input.selector[v_off];
+   else if (mm->input.block_selector)
+      mesh = mm->input.block_selector[mm->input.blocktype[v_off]];
+   else
+      mesh = mm->default_mesh;
+
+   // check if we're going off the end
+   if (mm->output_cur[mesh][0] + mm->output_size[mesh][0]*6 > mm->output_end[mesh][0]) {
+      mm->full = 1;
+      return;
+   }
+
+   // rotation comes from the low 2 bits of lighting (if so configured),
+   // overridden by the low 2 bits of packed_compact when that input exists
+   #ifdef STBVOX_CONFIG_ROTATION_IN_LIGHTING
+   quarter_turns = mm->input.lighting[v_off] & 3;
+   #endif
+
+   if (mm->input.packed_compact)
+      quarter_turns = mm->input.packed_compact[v_off] & 3;
+
+   // top and bottom: neighbors are at +1 / -1 in the blocktype array; only
+   // facerot is set for these two faces, and the bottom uses the negated rotation
+   if (center[ 1]==0) {
+      rot.facerot = quarter_turns;
+      stbvox_make_mesh_for_face(mm, rot, STBVOX_FACE_up  , v_off, pos, origin, vmesh+4*STBVOX_FACE_up, mesh, STBVOX_FACE_up);
+   }
+   if (center[-1]==0) {
+      rot.facerot = (-quarter_turns) & 3;
+      stbvox_make_mesh_for_face(mm, rot, STBVOX_FACE_down, v_off, pos, origin, vmesh+4*STBVOX_FACE_down, mesh, STBVOX_FACE_down);
+   }
+
+   // side faces use the block/overlay/ecolor rotations and no face rotation
+   if (mm->input.rotate) {
+      unsigned char packed = mm->input.rotate[v_off];
+      rot.block   = (packed >> 0) & 3;
+      rot.overlay = (packed >> 2) & 3;
+      //rot.tex2    = (packed >> 4) & 3;
+      rot.ecolor  = (packed >> 6) & 3;
+   } else {
+      rot.block   = quarter_turns;
+      rot.overlay = quarter_turns;
+      rot.ecolor  = quarter_turns;
+   }
+   rot.facerot = 0;
+
+   // north/south neighbors are one y stride away, east/west one x stride
+   side_face[0] = STBVOX_FACE_north;  side_step[0] =  mm->y_stride_in_bytes;
+   side_face[1] = STBVOX_FACE_south;  side_step[1] = -mm->y_stride_in_bytes;
+   side_face[2] = STBVOX_FACE_east;   side_step[2] =  mm->x_stride_in_bytes;
+   side_face[3] = STBVOX_FACE_west;   side_step[3] = -mm->x_stride_in_bytes;
+
+   for (i=0; i < 4; ++i) {
+      if (center[side_step[i]]==0)
+         stbvox_make_mesh_for_face(mm, rot, side_face[i], v_off, pos, origin, vmesh+4*side_face[i], mesh, side_face[i]);
+   }
+}
+
+// complex case for mesh generation: we have lots of different
+// block types, and we don't want to generate faces of blocks
+// if they're hidden by neighbors.
+//
+// we use lots of tables to determine this: we have a table
+// which tells us what face type is generated for each type of
+// geometry, and then a table that tells us whether that type
+// is hidden by a neighbor.
+//
+// Parameters:
+//    mm     mesh maker holding the input arrays and the output buffers
+//    pos    block position, encoded into every emitted vertex
+//    v_off  offset of this block inside the input arrays; neighbors are
+//           read at v_off +/- 1, +/- x_stride_in_bytes, +/- y_stride_in_bytes
+//
+// Sets mm->full and returns early when the output buffer of the selected
+// mesh cannot hold the quads of one more block.
+static void stbvox_make_mesh_for_block_with_geo(stbvox_mesh_maker *mm, stbvox_pos pos, int v_off)
+{
+   int ns_off = mm->y_stride_in_bytes;
+   int ew_off = mm->x_stride_in_bytes;
+   int visible_faces, visible_base;
+   unsigned char mesh;
+
+   // first gather the geometry info for this block and all neighbors
+   // neighbor index order: 0=+ew (east), 1=+ns (north), 2=-ew (west),
+   // 3=-ns (south), 4=+1 (up), 5=-1 (down)
+
+   unsigned char bt, nbt[6];
+   unsigned char geo, ngeo[6];
+   unsigned char rot, nrot[6];
+
+   bt = mm->input.blocktype[v_off];
+   nbt[0] = mm->input.blocktype[v_off + ew_off];
+   nbt[1] = mm->input.blocktype[v_off + ns_off];
+   nbt[2] = mm->input.blocktype[v_off - ew_off];
+   nbt[3] = mm->input.blocktype[v_off - ns_off];
+   nbt[4] = mm->input.blocktype[v_off +      1];
+   nbt[5] = mm->input.blocktype[v_off -      1];
+   if (mm->input.geometry) {
+      int i;
+      geo = mm->input.geometry[v_off];
+      ngeo[0] = mm->input.geometry[v_off + ew_off];
+      ngeo[1] = mm->input.geometry[v_off + ns_off];
+      ngeo[2] = mm->input.geometry[v_off - ew_off];
+      ngeo[3] = mm->input.geometry[v_off - ns_off];
+      ngeo[4] = mm->input.geometry[v_off +      1];
+      ngeo[5] = mm->input.geometry[v_off -      1];
+
+      // per-voxel geometry byte: low 4 bits are the geometry, bits 4-5 the rotation
+      rot = (geo >> 4) & 3;
+      geo &= 15;
+      for (i=0; i < 6; ++i) {
+         nrot[i] = (ngeo[i] >> 4) & 3;
+         ngeo[i] &= 15;
+      }
+   } else {
+      int i;
+      assert(mm->input.block_geometry);
+      geo = mm->input.block_geometry[bt];
+      for (i=0; i < 6; ++i)
+         ngeo[i] = mm->input.block_geometry[nbt[i]];
+      if (mm->input.selector) {
+         #ifndef STBVOX_CONFIG_ROTATION_IN_LIGHTING
+         if (mm->input.packed_compact == NULL) {
+            rot     = (mm->input.selector[v_off         ] >> 4) & 3;
+            nrot[0] = (mm->input.selector[v_off + ew_off] >> 4) & 3;
+            nrot[1] = (mm->input.selector[v_off + ns_off] >> 4) & 3;
+            nrot[2] = (mm->input.selector[v_off - ew_off] >> 4) & 3;
+            nrot[3] = (mm->input.selector[v_off - ns_off] >> 4) & 3;
+            nrot[4] = (mm->input.selector[v_off +      1] >> 4) & 3;
+            nrot[5] = (mm->input.selector[v_off -      1] >> 4) & 3;
+         }
+         #endif
+      } else {
+         #ifndef STBVOX_CONFIG_ROTATION_IN_LIGHTING
+         if (mm->input.packed_compact == NULL) {
+            rot = (geo>>4)&3;
+            geo &= 15;
+            for (i=0; i < 6; ++i) {
+               nrot[i] = (ngeo[i]>>4)&3;
+               ngeo[i] &= 15;
+            }
+         }
+         #endif
+      }
+   }
+
+   #ifndef STBVOX_CONFIG_ROTATION_IN_LIGHTING
+   if (mm->input.packed_compact) {
+      // the block's own rotation lives at v_off, exactly like the neighbor
+      // reads below; indexing by 'rot' read an unrelated voxel and could
+      // use 'rot' before it had been assigned
+      rot = mm->input.packed_compact[v_off] & 3;
+      nrot[0] = mm->input.packed_compact[v_off + ew_off] & 3;
+      nrot[1] = mm->input.packed_compact[v_off + ns_off] & 3;
+      nrot[2] = mm->input.packed_compact[v_off - ew_off] & 3;
+      nrot[3] = mm->input.packed_compact[v_off - ns_off] & 3;
+      nrot[4] = mm->input.packed_compact[v_off +      1] & 3;
+      nrot[5] = mm->input.packed_compact[v_off -      1] & 3;
+   }
+   #else
+   rot = mm->input.lighting[v_off] & 3;
+   nrot[0] = (mm->input.lighting[v_off + ew_off]) & 3;
+   nrot[1] = (mm->input.lighting[v_off + ns_off]) & 3;
+   nrot[2] = (mm->input.lighting[v_off - ew_off]) & 3;
+   nrot[3] = (mm->input.lighting[v_off - ns_off]) & 3;
+   nrot[4] = (mm->input.lighting[v_off +      1]) & 3;
+   nrot[5] = (mm->input.lighting[v_off -      1]) & 3;
+   #endif
+
+   if (geo == STBVOX_GEOM_transp) {
+      // transparency has a special rule: if the blocktype is the same,
+      // and the faces are compatible, then can hide them; otherwise,
+      // force them on
+      // Note that this means we don't support any transparent shapes other
+      // than solid blocks, since detecting them is too complicated. If
+      // you wanted to do something like minecraft water, you probably
+      // should just do that with a separate renderer anyway. (We don't
+      // support transparency sorting so you need to use alpha test
+      // anyway)
+      int i;
+      for (i=0; i < 6; ++i)
+         if (nbt[i] != bt) {
+            nbt[i] = 0;
+            ngeo[i] = STBVOX_GEOM_empty;
+         } else
+            ngeo[i] = STBVOX_GEOM_solid;
+      geo = STBVOX_GEOM_solid;
+   }
+
+   // now compute the face visibility
+   visible_base = stbvox_hasface[geo][rot];
+   // @TODO: assert(visible_base != 0); // we should have early-outted earlier in this case
+   visible_faces = 0;
+
+   // now, for every face that might be visible, check if neighbor hides it
+   if (visible_base & (1 << STBVOX_FACE_east)) {
+      int  type = stbvox_facetype[ geo   ][(STBVOX_FACE_east+ rot   )&3];
+      int ntype = stbvox_facetype[ngeo[0]][(STBVOX_FACE_west+nrot[0])&3];
+      visible_faces |= ((stbvox_face_visible[type]) >> (ntype + 5 - STBVOX_FACE_east)) & (1 << STBVOX_FACE_east);
+   }
+   if (visible_base & (1 << STBVOX_FACE_north)) {
+      int  type = stbvox_facetype[ geo   ][(STBVOX_FACE_north+ rot   )&3];
+      int ntype = stbvox_facetype[ngeo[1]][(STBVOX_FACE_south+nrot[1])&3];
+      visible_faces |= ((stbvox_face_visible[type]) >> (ntype + 5 - STBVOX_FACE_north)) & (1 << STBVOX_FACE_north);
+   }
+   if (visible_base & (1 << STBVOX_FACE_west)) {
+      int  type = stbvox_facetype[ geo   ][(STBVOX_FACE_west+ rot   )&3];
+      int ntype = stbvox_facetype[ngeo[2]][(STBVOX_FACE_east+nrot[2])&3];
+      visible_faces |= ((stbvox_face_visible[type]) >> (ntype + 5 - STBVOX_FACE_west)) & (1 << STBVOX_FACE_west);
+   }
+   if (visible_base & (1 << STBVOX_FACE_south)) {
+      int  type = stbvox_facetype[ geo   ][(STBVOX_FACE_south+ rot   )&3];
+      int ntype = stbvox_facetype[ngeo[3]][(STBVOX_FACE_north+nrot[3])&3];
+      visible_faces |= ((stbvox_face_visible[type]) >> (ntype + 5 - STBVOX_FACE_south)) & (1 << STBVOX_FACE_south);
+   }
+   if (visible_base & (1 << STBVOX_FACE_up)) {
+      int  type = stbvox_facetype[ geo   ][STBVOX_FACE_up];
+      int ntype = stbvox_facetype[ngeo[4]][STBVOX_FACE_down];
+      visible_faces |= ((stbvox_face_visible[type]) >> (ntype + 5 - STBVOX_FACE_up)) & (1 << STBVOX_FACE_up);
+   }
+   if (visible_base & (1 << STBVOX_FACE_down)) {
+      int  type = stbvox_facetype[ geo   ][STBVOX_FACE_down];
+      int ntype = stbvox_facetype[ngeo[5]][STBVOX_FACE_up];
+      visible_faces |= ((stbvox_face_visible[type]) >> (ntype + 5 - STBVOX_FACE_down)) & (1 << STBVOX_FACE_down);
+   }
+
+   // from here on a forced block is meshed exactly like a solid one
+   if (geo == STBVOX_GEOM_force)
+      geo = STBVOX_GEOM_solid;
+
+   assert((geo == STBVOX_GEOM_crossed_pair) ? (visible_faces == 15) : 1);
+
+   // now we finally know for sure which faces are getting generated
+   if (visible_faces == 0)
+      return;
+
+   mesh = mm->default_mesh;
+   if (mm->input.selector)
+      mesh = mm->input.selector[v_off];
+   else if (mm->input.block_selector)
+      mesh = mm->input.block_selector[bt];
+
+   if (geo <= STBVOX_GEOM_ceil_slope_north_is_bottom) {
+      // this is the simple case, we can just use regular block gen with special vmesh calculated with vheight
+      stbvox_mesh_vertex basevert;
+      stbvox_mesh_vertex vmesh[6][4];
+      stbvox_rotate rotate = { 0,0,0,0 };
+      unsigned char simple_rot = rot;
+      int i;
+      // we only need to do this for the displayed faces, but it's easier
+      // to just do it up front; @OPTIMIZE check if it's faster to do it
+      // for visible faces only
+      for (i=0; i < 6*4; ++i) {
+         int vert = stbvox_vertex_selector[0][i];
+         vert = stbvox_rotate_vertex[vert][rot];
+         vmesh[0][i] = stbvox_vmesh_pre_vheight[0][i]
+                     + stbvox_geometry_vheight[geo][vert];
+      }
+
+      basevert = stbvox_vertex_encode(pos.x, pos.y, pos.z << STBVOX_CONFIG_PRECISION_Z, 0,0);
+      if (mm->input.selector) {
+         mesh = mm->input.selector[v_off];
+      } else if (mm->input.block_selector)
+         mesh = mm->input.block_selector[bt];
+
+
+      // check if we're going off the end
+      if (mm->output_cur[mesh][0] + mm->output_size[mesh][0]*6 > mm->output_end[mesh][0]) {
+         mm->full = 1;
+         return;
+      }
+
+      if (geo >= STBVOX_GEOM_floor_slope_north_is_top) {
+         if (visible_faces & (1 << STBVOX_FACE_up)) {
+            int normal = geo == STBVOX_GEOM_floor_slope_north_is_top ? stbvox_floor_slope_for_rot[simple_rot] : STBVOX_FACE_up;
+            rotate.facerot = simple_rot;
+            stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_up  , v_off, pos, basevert, vmesh[STBVOX_FACE_up], mesh, normal);
+         }
+         if (visible_faces & (1 << STBVOX_FACE_down)) {
+            int normal = geo == STBVOX_GEOM_ceil_slope_north_is_bottom ? stbvox_ceil_slope_for_rot[simple_rot] : STBVOX_FACE_down;
+            // derive from simple_rot directly so the result does not depend
+            // on whether the up face was emitted (matches
+            // stbvox_make_mesh_for_block)
+            rotate.facerot = (-simple_rot) & 3;
+            stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_down, v_off, pos, basevert, vmesh[STBVOX_FACE_down], mesh, normal);
+         }
+      } else {
+         if (visible_faces & (1 << STBVOX_FACE_up)) {
+            rotate.facerot = simple_rot;
+            stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_up  , v_off, pos, basevert, vmesh[STBVOX_FACE_up], mesh, STBVOX_FACE_up);
+         }
+         if (visible_faces & (1 << STBVOX_FACE_down)) {
+            // see above: independent of whether the up face was emitted
+            rotate.facerot = (-simple_rot) & 3;
+            stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_down, v_off, pos, basevert, vmesh[STBVOX_FACE_down], mesh, STBVOX_FACE_down);
+         }
+      }
+
+      if (mm->input.rotate) {
+         unsigned char val = mm->input.rotate[v_off];
+         rotate.block   = (val >> 0) & 3;
+         rotate.overlay = (val >> 2) & 3;
+         //rotate.tex2    = (val >> 4) & 3;
+         rotate.ecolor  = (val >> 6) & 3;
+      } else {
+         rotate.block = rotate.overlay = rotate.ecolor = simple_rot;
+      }
+
+      rotate.facerot = 0;
+
+      if (visible_faces & (1 << STBVOX_FACE_north))
+         stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_north, v_off, pos, basevert, vmesh[STBVOX_FACE_north], mesh, STBVOX_FACE_north);
+      if (visible_faces & (1 << STBVOX_FACE_south))
+         stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_south, v_off, pos, basevert, vmesh[STBVOX_FACE_south], mesh, STBVOX_FACE_south);
+      if (visible_faces & (1 << STBVOX_FACE_east))
+         stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_east , v_off, pos, basevert, vmesh[STBVOX_FACE_east ], mesh, STBVOX_FACE_east);
+      if (visible_faces & (1 << STBVOX_FACE_west))
+         stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_west , v_off, pos, basevert, vmesh[STBVOX_FACE_west ], mesh, STBVOX_FACE_west);
+   }
+   if (geo >= STBVOX_GEOM_floor_vheight_03) {
+      // this case can also be generated with regular block gen with special vmesh,
+      // except:
+      //     if we want to generate middle diagonal for 'weird' blocks
+      //     it's more complicated to detect neighbor matchups
+      stbvox_mesh_vertex vmesh[6][4];
+      stbvox_mesh_vertex cube[8];
+      stbvox_mesh_vertex basevert;
+      stbvox_rotate rotate = { 0,0,0,0 };
+      unsigned char simple_rot = rot;
+      unsigned char ht[4];
+      int extreme;
+
+      // extract the heights
+      #ifdef STBVOX_CONFIG_VHEIGHT_IN_LIGHTING
+      ht[0] = mm->input.lighting[v_off              ] & 3;
+      ht[1] = mm->input.lighting[v_off+ew_off       ] & 3;
+      ht[2] = mm->input.lighting[v_off       +ns_off] & 3;
+      ht[3] = mm->input.lighting[v_off+ew_off+ns_off] & 3;
+      #else
+      if (mm->input.vheight) {
+         unsigned char v =  mm->input.vheight[v_off];
+         ht[0] = (v >> 0) & 3;
+         ht[1] = (v >> 2) & 3;
+         ht[2] = (v >> 4) & 3;
+         ht[3] = (v >> 6) & 3;
+      } else if (mm->input.block_vheight) {
+         unsigned char v = mm->input.block_vheight[bt];
+         unsigned char raw[4];
+         int i;
+
+         raw[0] = (v >> 0) & 3;
+         raw[1] = (v >> 2) & 3;
+         raw[2] = (v >> 4) & 3;
+         raw[3] = (v >> 6) & 3;
+
+         for (i=0; i < 4; ++i)
+            ht[i] = raw[stbvox_rotate_vertex[i][rot]];
+      } else if (mm->input.packed_compact) {
+         ht[0] = (mm->input.packed_compact[v_off              ] >> 2) & 3;
+         ht[1] = (mm->input.packed_compact[v_off+ew_off       ] >> 2) & 3;
+         ht[2] = (mm->input.packed_compact[v_off       +ns_off] >> 2) & 3;
+         ht[3] = (mm->input.packed_compact[v_off+ew_off+ns_off] >> 2) & 3;
+      } else if (mm->input.geometry) {
+         ht[0] = mm->input.geometry[v_off              ] >> 6;
+         ht[1] = mm->input.geometry[v_off+ew_off       ] >> 6;
+         ht[2] = mm->input.geometry[v_off       +ns_off] >> 6;
+         ht[3] = mm->input.geometry[v_off+ew_off+ns_off] >> 6;
+      } else {
+         assert(0);
+      }
+      #endif
+
+      // flag whether any sides go off the top of the block, which means
+      // our visible_faces test was wrong
+      extreme = (ht[0] == 3 || ht[1] == 3 || ht[2] == 3 || ht[3] == 3);
+
+      // cube[0..3] are the bottom corners, cube[4..7] the top corners;
+      // ceiling variants vary the bottom heights, floor variants the top
+      if (geo >= STBVOX_GEOM_ceil_vheight_03) {
+         cube[0] = stbvox_vertex_encode(0,0,ht[0],0,0);
+         cube[1] = stbvox_vertex_encode(0,0,ht[1],0,0);
+         cube[2] = stbvox_vertex_encode(0,0,ht[2],0,0);
+         cube[3] = stbvox_vertex_encode(0,0,ht[3],0,0);
+         cube[4] = stbvox_vertex_encode(0,0,2,0,0);
+         cube[5] = stbvox_vertex_encode(0,0,2,0,0);
+         cube[6] = stbvox_vertex_encode(0,0,2,0,0);
+         cube[7] = stbvox_vertex_encode(0,0,2,0,0);
+      } else {
+         cube[0] = stbvox_vertex_encode(0,0,0,0,0);
+         cube[1] = stbvox_vertex_encode(0,0,0,0,0);
+         cube[2] = stbvox_vertex_encode(0,0,0,0,0);
+         cube[3] = stbvox_vertex_encode(0,0,0,0,0);
+         cube[4] = stbvox_vertex_encode(0,0,ht[0],0,0);
+         cube[5] = stbvox_vertex_encode(0,0,ht[1],0,0);
+         cube[6] = stbvox_vertex_encode(0,0,ht[2],0,0);
+         cube[7] = stbvox_vertex_encode(0,0,ht[3],0,0);
+      }
+      if (!mm->input.vheight && mm->input.block_vheight) {
+         // @TODO: support block vheight here, I've forgotten what needs to be done specially
+      }
+
+      // build vertex mesh
+      {
+         int i;
+         for (i=0; i < 6*4; ++i) {
+            int vert = stbvox_vertex_selector[0][i];
+            vmesh[0][i] = stbvox_vmesh_pre_vheight[0][i]
+                        + cube[vert];
+         }
+      }
+
+      basevert = stbvox_vertex_encode(pos.x, pos.y, pos.z << STBVOX_CONFIG_PRECISION_Z, 0,0);
+      // check if we're going off the end
+      if (mm->output_cur[mesh][0] + mm->output_size[mesh][0]*6 > mm->output_end[mesh][0]) {
+         mm->full = 1;
+         return;
+      }
+
+      // @TODO generate split faces
+      if (visible_faces & (1 << STBVOX_FACE_up)) {
+         if (geo >= STBVOX_GEOM_ceil_vheight_03)
+            // flat
+            stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_up  , v_off, pos, basevert, vmesh[STBVOX_FACE_up], mesh, STBVOX_FACE_up);
+         else {
+         #ifndef STBVOX_CONFIG_OPTIMIZED_VHEIGHT
+            // check if it's non-planar
+            if (cube[5] + cube[6] != cube[4] + cube[7]) {
+               // not planar, split along diagonal and make degenerate quads
+               if (geo == STBVOX_GEOM_floor_vheight_03)
+                  stbvox_make_03_split_mesh_for_face(mm, rotate, STBVOX_FACE_up, v_off, pos, basevert, vmesh[STBVOX_FACE_up], mesh, ht);
+               else
+                  stbvox_make_12_split_mesh_for_face(mm, rotate, STBVOX_FACE_up, v_off, pos, basevert, vmesh[STBVOX_FACE_up], mesh, ht);
+            } else
+               stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_up  , v_off, pos, basevert, vmesh[STBVOX_FACE_up], mesh, stbvox_planar_face_up_normal[ht[2]][ht[1]][ht[0]]);
+         #else
+            stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_up  , v_off, pos, basevert, vmesh[STBVOX_FACE_up], mesh, stbvox_optimized_face_up_normal[ht[3]][ht[2]][ht[1]][ht[0]]);
+         #endif
+         }
+      }
+      if (visible_faces & (1 << STBVOX_FACE_down)) {
+         if (geo < STBVOX_GEOM_ceil_vheight_03)
+            // flat
+            stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_down, v_off, pos, basevert, vmesh[STBVOX_FACE_down], mesh, STBVOX_FACE_down);
+         else {
+         #ifndef STBVOX_CONFIG_OPTIMIZED_VHEIGHT
+            // check if it's non-planar
+            if (cube[1] + cube[2] != cube[0] + cube[3]) {
+               // not planar, split along diagonal and make degenerate quads
+               if (geo == STBVOX_GEOM_ceil_vheight_03)
+                  stbvox_make_03_split_mesh_for_face(mm, rotate, STBVOX_FACE_down, v_off, pos, basevert, vmesh[STBVOX_FACE_down], mesh, ht);
+               else
+                  stbvox_make_12_split_mesh_for_face(mm, rotate, STBVOX_FACE_down, v_off, pos, basevert, vmesh[STBVOX_FACE_down], mesh, ht);
+            } else
+               stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_down, v_off, pos, basevert, vmesh[STBVOX_FACE_down], mesh, stbvox_reverse_face[stbvox_planar_face_up_normal[ht[2]][ht[1]][ht[0]]]);
+         #else
+            stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_down, v_off, pos, basevert, vmesh[STBVOX_FACE_down], mesh, stbvox_reverse_face[stbvox_optimized_face_up_normal[ht[3]][ht[2]][ht[1]][ht[0]]]);
+         #endif
+         }
+      }
+
+      if (mm->input.rotate) {
+         unsigned char val = mm->input.rotate[v_off];
+         rotate.block   = (val >> 0) & 3;
+         rotate.overlay = (val >> 2) & 3;
+         //rotate.tex2    = (val >> 4) & 3;
+         rotate.ecolor  = (val >> 6) & 3;
+      } else if (mm->input.selector) {
+         rotate.block = rotate.overlay = rotate.ecolor = simple_rot;
+      }
+
+      // a side face is also emitted when one of its two top corners has
+      // height 3, since the neighbor test above did not account for that
+      if ((visible_faces & (1 << STBVOX_FACE_north)) || (extreme && (ht[2] == 3 || ht[3] == 3)))
+         stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_north, v_off, pos, basevert, vmesh[STBVOX_FACE_north], mesh, STBVOX_FACE_north);
+      if ((visible_faces & (1 << STBVOX_FACE_south)) || (extreme && (ht[0] == 3 || ht[1] == 3)))
+         stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_south, v_off, pos, basevert, vmesh[STBVOX_FACE_south], mesh, STBVOX_FACE_south);
+      if ((visible_faces & (1 << STBVOX_FACE_east)) || (extreme && (ht[1] == 3 || ht[3] == 3)))
+         stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_east , v_off, pos, basevert, vmesh[STBVOX_FACE_east ], mesh, STBVOX_FACE_east);
+      if ((visible_faces & (1 << STBVOX_FACE_west)) || (extreme && (ht[0] == 3 || ht[2] == 3)))
+         stbvox_make_mesh_for_face(mm, rotate, STBVOX_FACE_west , v_off, pos, basevert, vmesh[STBVOX_FACE_west ], mesh, STBVOX_FACE_west);
+   }
+
+   if (geo == STBVOX_GEOM_crossed_pair) {
+      // this can be generated with a special vmesh
+      // (the locals below intentionally shadow the outer 'rot' and 'mesh')
+      stbvox_mesh_vertex basevert = stbvox_vertex_encode(pos.x, pos.y, pos.z << STBVOX_CONFIG_PRECISION_Z , 0,0);
+      unsigned char simple_rot=0;
+      stbvox_rotate rot = { 0,0,0,0 };
+      unsigned char mesh = mm->default_mesh;
+      if (mm->input.selector) {
+         mesh = mm->input.selector[v_off];
+         simple_rot = mesh >> 4;
+         mesh &= 15;
+      }
+      if (mm->input.block_selector) {
+         mesh = mm->input.block_selector[bt];
+      }
+
+      // check if we're going off the end (a crossed pair emits four quads)
+      if (mm->output_cur[mesh][0] + mm->output_size[mesh][0]*4 > mm->output_end[mesh][0]) {
+         mm->full = 1;
+         return;
+      }
+
+      if (mm->input.rotate) {
+         unsigned char val = mm->input.rotate[v_off];
+         rot.block   = (val >> 0) & 3;
+         rot.overlay = (val >> 2) & 3;
+         //rot.tex2    = (val >> 4) & 3;
+         rot.ecolor  = (val >> 6) & 3;
+      } else if (mm->input.selector) {
+         rot.block = rot.overlay = rot.ecolor = simple_rot;
+      }
+      rot.facerot = 0;
+
+      stbvox_make_mesh_for_face(mm, rot, STBVOX_FACE_north, v_off, pos, basevert, stbvox_vmesh_crossed_pair[STBVOX_FACE_north], mesh, STBVF_ne_u_cross);
+      stbvox_make_mesh_for_face(mm, rot, STBVOX_FACE_south, v_off, pos, basevert, stbvox_vmesh_crossed_pair[STBVOX_FACE_south], mesh, STBVF_sw_u_cross);
+      stbvox_make_mesh_for_face(mm, rot, STBVOX_FACE_east , v_off, pos, basevert, stbvox_vmesh_crossed_pair[STBVOX_FACE_east ], mesh, STBVF_se_u_cross);
+      stbvox_make_mesh_for_face(mm, rot, STBVOX_FACE_west , v_off, pos, basevert, stbvox_vmesh_crossed_pair[STBVOX_FACE_west ], mesh, STBVF_nw_u_cross);
+   }
+
+
+   // @TODO
+   // STBVOX_GEOM_floor_slope_north_is_top_as_wall,
+   // STBVOX_GEOM_ceil_slope_north_is_bottom_as_wall,
+}
+
+// Walks one vertical column of the input from z0 up to (not including)
+// mm->z1 and meshes every non-empty block that has at least one of its six
+// neighbors exposed. What counts as "exposed" depends on which geometry
+// input is supplied. When the output fills up, the z at which that happened
+// is stored in mm->cur_z and the walk stops.
+static void stbvox_make_mesh_for_column(stbvox_mesh_maker *mm, int x, int y, int z0)
+{
+   int north = mm->y_stride_in_bytes;
+   int east  = mm->x_stride_in_bytes;
+   int column_base = x * mm->x_stride_in_bytes + y * mm->y_stride_in_bytes;
+   unsigned char *types = mm->input.blocktype + column_base;
+   stbvox_pos pos;
+   int z;
+
+   pos.x = x;
+   pos.y = y;
+   pos.z = 0;
+
+   if (mm->input.geometry) {
+      // per-voxel geometry: a neighbor hides us only if it has both a
+      // non-zero blocktype and a non-zero geometry value
+      unsigned char *voxel_geo = mm->input.geometry + column_base;
+      for (z=z0; z < mm->z1; ++z) {
+         if (!bt_is_set_placeholder_never_used_guard(0) && 0) {}
+      }
+   }
+}
+
+// Lazily (re)computes the per-mesh output layout. Does nothing unless
+// mm->config_dirty is set; afterwards the flag is cleared. The sizes and
+// steps written here depend on whether STBVOX_ICONFIG_FACE_ATTRIBUTE is
+// defined: one output slot in that case, two otherwise.
+static void stbvox_bring_up_to_date(stbvox_mesh_maker *mm)
+{
+   int m;
+
+   if (!mm->config_dirty)
+      return;
+
+   #ifdef STBVOX_ICONFIG_FACE_ATTRIBUTE
+      mm->num_mesh_slots = 1;
+      for (m=0; m < STBVOX_MAX_MESHES; ++m) {
+         mm->output_size[m][0] = 32;
+         mm->output_step[m][0] = 8;
+      }
+   #else
+      mm->num_mesh_slots = 2;
+      for (m=0; m < STBVOX_MAX_MESHES; ++m) {
+         mm->output_size[m][0] = 16;
+         mm->output_step[m][0] = 4;
+         mm->output_size[m][1] = 4;
+         mm->output_step[m][1] = 4;
+      }
+   #endif
+
+   mm->config_dirty = 0;
+}
+
+// Meshes the configured input range column by column. Returns 1 once every
+// column has been processed, or 0 if an output buffer filled up first; in
+// that case mm->cur_x / cur_y / cur_z record where to resume, and calling
+// again continues from there.
+int stbvox_make_mesh(stbvox_mesh_maker *mm)
+{
+   int col_x, col_y;
+   int resuming;
+
+   stbvox_bring_up_to_date(mm);
+   mm->full = 0;
+
+   // any cursor past its range start means an earlier call stopped part-way
+   resuming = (mm->cur_x > mm->x0 || mm->cur_y > mm->y0 || mm->cur_z > mm->z0);
+   if (resuming) {
+      // finish the interrupted column from the saved z ...
+      stbvox_make_mesh_for_column(mm, mm->cur_x, mm->cur_y, mm->cur_z);
+      if (mm->full)
+         return 0;
+      // ... then the rest of that x row, each column from the bottom
+      for (++mm->cur_y; mm->cur_y < mm->y1; ++mm->cur_y) {
+         stbvox_make_mesh_for_column(mm, mm->cur_x, mm->cur_y, mm->z0);
+         if (mm->full)
+            return 0;
+      }
+      ++mm->cur_x;
+   }
+
+   for (col_x = mm->cur_x; col_x < mm->x1; ++col_x) {
+      for (col_y = mm->y0; col_y < mm->y1; ++col_y) {
+         stbvox_make_mesh_for_column(mm, col_x, col_y, mm->z0);
+         if (!mm->full)
+            continue;
+         // remember the column; cur_z was stored by the column walker
+         mm->cur_x = col_x;
+         mm->cur_y = col_y;
+         return 0;
+      }
+   }
+   return 1;
+}
+
+// Zeroes the whole mesh maker, builds the default palette, and marks the
+// configuration dirty so the output layout is computed on first use.
+void stbvox_init_mesh_maker(stbvox_mesh_maker *mm)
+{
+   memset(mm, 0, sizeof *mm);
+   stbvox_build_default_palette();
+
+   mm->default_mesh = 0;
+   mm->config_dirty = 1;
+}
+
+// Returns the number of output slots per mesh (mm->num_mesh_slots), after
+// making sure the configuration-derived layout is current.
+int stbvox_get_buffer_count(stbvox_mesh_maker *mm)
+{
+   int slot_count;
+   stbvox_bring_up_to_date(mm);
+   slot_count = mm->num_mesh_slots;
+   return slot_count;
+}
+
+// Returns the number of bytes one quad occupies in output slot n.
+//
+// The layout is refreshed first, mirroring stbvox_get_buffer_count: the
+// sizes are only filled in by stbvox_bring_up_to_date, so without this a
+// call made right after stbvox_init_mesh_maker would read the zeroed table
+// and report 0.
+//
+// n is not range-checked; the caller must pass a valid slot index.
+int stbvox_get_buffer_size_per_quad(stbvox_mesh_maker *mm, int n)
+{
+   stbvox_bring_up_to_date(mm);
+   return mm->output_size[0][n];
+}
+
+// Clears the write cursor and the buffer pointer of every (mesh, slot) pair.
+//
+// The two tables are walked with one index per dimension. Flattening them
+// through [0][i] with i running past the inner bound is undefined behaviour
+// in C, even though the storage is contiguous.
+void stbvox_reset_buffers(stbvox_mesh_maker *mm)
+{
+   int mesh, slot;
+   for (mesh=0; mesh < STBVOX_MAX_MESHES; ++mesh) {
+      for (slot=0; slot < STBVOX_MAX_MESH_SLOTS; ++slot) {
+         mm->output_cur   [mesh][slot] = 0;
+         mm->output_buffer[mesh][slot] = 0;
+      }
+   }
+}
+
+// Registers 'buffer' (len bytes) as the output for the given mesh and slot
+// and resets that slot's write cursor to the start of the buffer.
+//
+// In builds with assertions enabled, every slot of this mesh that already
+// has a buffer is checked to hold the same number of quads as this one.
+void stbvox_set_buffer(stbvox_mesh_maker *mm, int mesh, int slot, void *buffer, size_t len)
+{
+   char *start = (char *) buffer;
+   int other;
+
+   stbvox_bring_up_to_date(mm);
+
+   mm->output_buffer[mesh][slot] = start;
+   mm->output_cur   [mesh][slot] = start;
+   mm->output_len   [mesh][slot] = (int) len;
+   mm->output_end   [mesh][slot] = start + len;
+
+   for (other=0; other < STBVOX_MAX_MESH_SLOTS; ++other) {
+      if (!mm->output_buffer[mesh][other])
+         continue;
+      assert(mm->output_len[mesh][other] / mm->output_size[mesh][other] == mm->output_len[mesh][slot] / mm->output_size[mesh][slot]);
+   }
+}
+
+// Chooses the mesh index used for blocks when neither a per-voxel selector
+// nor a per-blocktype selector is supplied in the input.
+void stbvox_set_default_mesh(stbvox_mesh_maker *mm, int mesh)
+{
+   mm->default_mesh = mesh;
+}
+
+// Returns how many quads have been written for 'mesh': the number of bytes
+// written to slot 0 divided by that slot's per-quad size.
+int stbvox_get_quad_count(stbvox_mesh_maker *mm, int mesh)
+{
+   const char *first   = mm->output_buffer[mesh][0];
+   const char *written = mm->output_cur[mesh][0];
+   return (int) ((written - first) / mm->output_size[mesh][0]);
+}
+
+// Returns a pointer to the input description embedded in the mesh maker, so
+// the caller can fill in its fields directly.
+stbvox_input_description *stbvox_get_input_description(stbvox_mesh_maker *mm)
+{
+   stbvox_input_description *desc = &mm->input;
+   return desc;
+}
+
+// Sets the box of blocks to mesh and rewinds the resume cursor to its start.
+// The mesh loops run x from x0 up to (not including) x1, and likewise for
+// y and z.
+void stbvox_set_input_range(stbvox_mesh_maker *mm, int x0, int y0, int z0, int x1, int y1, int z1)
+{
+   // x extent and cursor
+   mm->x0 = x0;
+   mm->x1 = x1;
+   mm->cur_x = x0;
+
+   // y extent and cursor
+   mm->y0 = y0;
+   mm->y1 = y1;
+   mm->cur_y = y0;
+
+   // z extent and cursor
+   mm->z0 = z0;
+   mm->z1 = z1;
+   mm->cur_z = z0;
+
+   // @TODO validate that this range is representable in this mode
+}
+
+// Fills a 3x3 table describing how to place the mesh:
+//    transform[0]  per-axis scale (z is 0.5 when STBVOX_CONFIG_PRECISION_Z is 1)
+//    transform[1]  translation, taken from the mesh coordinates
+//    transform[2]  texture-projection translation: mesh coordinates masked to 0..255
+void stbvox_get_transform(stbvox_mesh_maker *mm, float transform[3][3])
+{
+   float *scale      = transform[0];
+   float *translate  = transform[1];
+   float *tex_offset = transform[2];
+
+   scale[0] = 1.0f;
+   scale[1] = 1.0f;
+   #if STBVOX_CONFIG_PRECISION_Z==1
+   scale[2] = 0.5f;
+   #else
+   scale[2] = 1.0f;
+   #endif
+
+   translate[0] = (float) (mm->pos_x);
+   translate[1] = (float) (mm->pos_y);
+   translate[2] = (float) (mm->pos_z);
+
+   tex_offset[0] = (float) (mm->pos_x & 255); // @TODO depends on max texture scale
+   tex_offset[1] = (float) (mm->pos_y & 255);
+   tex_offset[2] = (float) (mm->pos_z & 255);
+}
+
+// Writes the bounds of the input range offset by the mesh coordinates:
+// bounds[0] receives pos + (x0,y0,z0) and bounds[1] receives pos + (x1,y1,z1).
+void stbvox_get_bounds(stbvox_mesh_maker *mm, float bounds[2][3])
+{
+   float *lo = bounds[0];
+   float *hi = bounds[1];
+
+   lo[0] = (float) (mm->pos_x + mm->x0);
+   hi[0] = (float) (mm->pos_x + mm->x1);
+
+   lo[1] = (float) (mm->pos_y + mm->y0);
+   hi[1] = (float) (mm->pos_y + mm->y1);
+
+   lo[2] = (float) (mm->pos_z + mm->z0);
+   hi[2] = (float) (mm->pos_z + mm->z1);
+}
+
+// Stores the mesh coordinates that stbvox_get_transform and
+// stbvox_get_bounds later report as the translation / bounds offset.
+void stbvox_set_mesh_coordinates(stbvox_mesh_maker *mm, int x, int y, int z)
+{
+   mm->pos_z = z;
+   mm->pos_y = y;
+   mm->pos_x = x;
+}
+
+// Records the x and y strides of the input arrays and precomputes, for each
+// of the 4 corners of each of the 6 faces, two array offsets derived from
+// stbvox_vertex_vector:
+//    cube_vertex_offset    offset of the corner itself
+//    vertex_gather_offset  the same with 1 subtracted from each component
+void stbvox_set_input_stride(stbvox_mesh_maker *mm, int x_stride_in_bytes, int y_stride_in_bytes)
+{
+   int face, corner;
+
+   mm->x_stride_in_bytes = x_stride_in_bytes;
+   mm->y_stride_in_bytes = y_stride_in_bytes;
+
+   for (face=0; face < 6; ++face) {
+      for (corner=0; corner < 4; ++corner) {
+         mm->cube_vertex_offset[face][corner] =
+              stbvox_vertex_vector[face][corner][0] * mm->x_stride_in_bytes
+            + stbvox_vertex_vector[face][corner][1] * mm->y_stride_in_bytes
+            + stbvox_vertex_vector[face][corner][2];
+
+         mm->vertex_gather_offset[face][corner] =
+              (stbvox_vertex_vector[face][corner][0]-1) * mm->x_stride_in_bytes
+            + (stbvox_vertex_vector[face][corner][1]-1) * mm->y_stride_in_bytes
+            + (stbvox_vertex_vector[face][corner][2]-1);
+      }
+   }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+//
+//    offline computation of tables
+//
+
+#if 0
+// compute optimized vheight table
+//
+// Names for the normal slots the offline generator is allowed to pick; a 0
+// entry marks a slot that find_best_normal skips. Each name is printed with
+// an "STBVF_" prefix by main() below.
+static char *normal_names[32] =
+{
+   0,0,0,0,"u   ",0, "eu  ",0,
+   0,0,0,0,"ne_u",0, "nu  ",0,
+   0,0,0,0,"nw_u",0, "wu  ",0,
+   0,0,0,0,"sw_u",0, "su  ",0,
+};
+
+// Returns the name of the named slot whose entry in stbvox_default_normals
+// has the largest dot product with (x,y,z). Only strictly positive dot
+// products can win; if none does, slot 4 is returned.
+static char *find_best_normal(float x, float y, float z)
+{
+   int winner = 4;
+   float winner_dot = 0;
+   int slot;
+   for (slot=0; slot < 32; ++slot) {
+      float dot;
+      if (!normal_names[slot])
+         continue;
+      dot = x * stbvox_default_normals[slot][0] + y * stbvox_default_normals[slot][1] + z * stbvox_default_normals[slot][2];
+      if (dot > winner_dot) {
+         winner_dot = dot;
+         winner = slot;
+      }
+   }
+   return normal_names[winner];
+}
+
+// Offline generator: for every combination of the four corner heights
+// (0..3 each) prints the best-matching normal name, four entries per row,
+// with sw varying fastest and ne slowest.
+int main(int argc, char **argv)
+{
+   int ne,nw,se,sw;
+   for (ne=0; ne < 4; ++ne)
+      for (nw=0; nw < 4; ++nw)
+         for (se=0; se < 4; ++se) {
+            printf("        { ");
+            for (sw=0; sw < 4; ++sw) {
+               float dx = (float) (nw + sw - ne - se);
+               float dy = (float) (sw + se - nw - ne);
+               float dz = 2;
+               printf("STBVF_%s, ", find_best_normal(dx,dy,dz));
+            }
+            printf("},\n");
+         }
+   return 0;
+}
+#endif
+
+// @TODO
+//
+//   - test API for texture rotation on side faces
+//   - API for texture rotation on top & bottom
+//   - better culling of vheight faces with vheight neighbors
+//   - better culling of non-vheight faces with vheight neighbors
+//   - gather vertex lighting from slopes correctly
+//   - better support texture edge_clamp: currently if you fall
+//     exactly on 1.0 you get wrapped incorrectly; this is rare, but
+//     can be avoided (compute texcoords in vertex shader, offset towards
+//     center before modding; needs 2 bits per vertex to know offset direction)
+//   - other mesh modes (10,6,4-byte quads)
+//
+//
+// With TexBuffer for the fixed vertex data, we can actually do
+// minecrafty non-blocks like stairs -- we still probably only
+// want 256 or so, so we can't do the equivalent of all the vheight
+// combos, but that's ok. The 256 includes baked rotations, but only
+// some of them need it, and lots of block types share some faces.
+//
+// mode 5 (6 bytes):   mode 6 (6 bytes)
+//   x:7                x:6
+//   y:7                y:6
+//   z:6                z:6
+//   tex1:8             tex1:8
+//   tex2:8             tex2:7
+//   color:8            color:8
+//   face:4             face:7
+//
+//
+//  side faces (all x4)        top&bottom faces (2x)    internal faces (1x)
+//     1  regular                1 regular
+//     2  slabs                                             2
+//     8  stairs                 4 stairs                  16
+//     4  diag side                                         8
+//     4  upper diag side                                   8
+//     4  lower diag side                                   8
+//                                                          4 crossed pairs
+//
+//    23*4                   +   5*4                    +  46
+//  == 92 + 20 + 46 = 158
+//
+//   Must drop 30 of them to fit in 7 bits:
+//       ceiling half diagonals: 16+8 = 24
+//   Need to get rid of 6 more.
+//       ceiling diagonals: 8+4 = 12
+//   This brings it to 122, so can add a crossed-pair variant.
+//       (diagonal and non-diagonal, or randomly offset)
+//   Or carpet, which would be 5 more.
+//
+//
+// Mode 4 (10 bytes):
+//  v:  z:2,light:6
+//  f:  x:6,y:6,z:7, t1:8,t2:8,c:8,f:5
+//
+// Mode ? (10 bytes)
+//  v:  xyz:5 (27 values), light:3
+//  f:  x:7,y:7,z:6, t1:8,t2:8,c:8,f:4
+// (v:  x:2,y:2,z:2,light:2)
+
+#endif // STB_VOXEL_RENDER_IMPLEMENTATION
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/