template <typename T> 
	T reduce2(T v) {
		/* pre: ((short*)&v)[i] < 100 for all i
		 *  post: 
		 *     ((char*)&v)[2i] = ((short*)&v)[i] / 10
		 *     ((char*)&v)[2i + 1] = ((short*)&v)[i] % 10
		 *     
		 *     That is, we split each short in v into its ones and tens digits
		 */
 
		/* t < 100 --> (t * 410) >> 12 == t / 10
		 *			&& (t * 410) < 0x10000
		 * 
		 * For the least order short that's all we need, for the others the
		 * shift doesn't drop the remainder so we mask those out
		 */
		T k = ((v * 410) >> 12) & 0x000F000F000F000Full;
 
		/*
		 * Then just subtract out the tens digit to get the ones digit and
		 * shift them into the right place
		 */
		return (((v - k * 10) << 8) + k);
	}
 
	template <typename T>
	T reduce4(T v) {
		/* pre: ((unsigned*)&v)[i] < 10000 for all i
		 *
		 *  preReduce2: 
		 *     ((short*)&v)[2i] = ((unsigned*)&v)[i] / 100
		 *     ((short*)&v)[2i + 1] = ((unsigned*)&v)[i] % 100
		 *     
		 *     That is, we split each int in v into its one/ten and hundred/thousand 
		 *     digit pairs. Put them into the corresponding short positions and then
		 *     call reduce2 to finish the splitting
		 */
 
		/* This is basically the same as reduce2 with different constants
		 */
		T k = ((v * 10486) >> 20) & 0xFF000000FFull;
		return reduce2(((v - k * 100) << 16) + (k));
	}
 
	typedef unsigned long long ull;
	inline ull reduce8(ull v) {
		/* pre: v  < 100000000
		 *
		 *  preReduce4: 
		 *     ((unsigned*)&v)[0] = v / 10000
		 *     ((unsigned*)&v)[1] = v % 10000
		 *
		 *     This should be familiar now, split v into the two 4-digit segments,
		 *     put them in the right place, and let reduce4 continue the splitting
		 */
 
		/* Again, use the same method as reduce4 and reduce2 with correctly tailored constants
		 */
		ull k = ((v * 3518437209u) >> 45);
		return reduce4(((v - k * 10000) << 32) + (k));
	}
 
	template <typename T>
	std::string itostr(T o) {
		/*
		 * Use of this union is not strictly compliant, but, really,
		 * who cares? This is just for fun :)
		 *
		 * Our ones digit will be in str[15]
		 *
		 * We don't actually need the first 6 bytes, but w/e
		 */
		union {
			char str[16];
			unsigned short u2[8];
			unsigned u4[4];
			unsigned long long u8[2];
		};
 
		/* Technically should be "... ? unsigned(~0) + 1 ..." to ensure correct behavior I think */
		/* Tends to compile to: v = (o ^ (o >> 31)) - (o >> 31); */
		unsigned v = o < 0 ? ~o + 1 : o;
 
		/* We want u2[3] = v / 100000000 ... that is, the first 2 bytes of the decimal rep */
 
		/* This is the same as in reduce8, that is divide by 10000. So u8[0] = v / 10000 */
		u8[0] = (ull(v) * 3518437209u) >> 45;
 
		/* Now we want u2[3] = u8[0] / 10000.
		 * If we added " >> 48 " to the end of the calculation below we would get u8[0] = u8[0] / 10000
		 * Note though that in little endian byte ordering u2[3] is the top 2 bytes of u8[0]
		 * and 64 - 16 == 48... Aha, We've got what we want, the rest of u8[0] is junk, but we don't care
		 */
		u8[0] = (u8[0] * 28147497672ull);
 
		/* Then just subtract out those digits from v so that u8[1] now holds
		 * the low 8 decimal digits of v
		 */
		u8[1] = v - u2[3] * 100000000;
 
		/* Split u8[1] into its 8 decimal digits */
		u8[1] = reduce8(u8[1]);
 
		/* f will point to the high order non-zero char */
		char* f;
 
		/* branch post: f is at the correct short (but not necessarily the correct byte) */
		if (u2[3]) {
			/* split the top two digits into their respective chars */
			u2[3] = reduce2(u2[3]);
			f = str + 6;
		} else {
			/* a sort of binary search on first non-zero digit */
			unsigned short* k = u4[2] ? u2 + 4 : u2 + 6;
			f = *k ? (char*)k : (char*)(k + 1);
		}
		/* update f to its final position */
		if (!*f) f++;
 
		/* '0' == 0x30 and i < 10 --> i <= 0xF ... that is, i | 0x30 = 'i' *
		 * Note that we could do u8[0] |= ... u8[1] |= ... but the corresponding
		 * x86-64 operation cannot use a 64 bit immediate value whereas the
		 * 32 bit 'or' can use a 32 bit immediate.
		 */
		u4[1] |= 0x30303030;
		u4[2] |= 0x30303030;
		u4[3] |= 0x30303030;
 
		/* Add the negative sign... note that o is just the original parameter passed */
		if (o < 0) *--f = '-';
 
		/* gcc basically forwards this to std::string(f, str + 16)
		 * but msvc handles it way more efficiently
		 */
		return std::string(f, (str + 16) - f);
	}