template T reduce2(T v) { /* pre: ((short*)&v)[i] < 100 for all i * post: * ((char*)&v)[2i] = ((short*)&v)[i] / 10 * ((char*)&v)[2i + 1] = ((short*)&v)[i] % 10 * * That is, we split each short in v into its ones and tens digits */ /* t < 100 --> (t * 410) >> 12 == t / 10 * && (t * 410) < 0x10000 * * For the least order short that's all we need, for the others the * shift doesn't drop the remainder so we mask those out */ T k = ((v * 410) >> 12) & 0x000F000F000F000Full; /* * Then just subtract out the tens digit to get the ones digit and * shift them into the right place */ return (((v - k * 10) << 8) + k); } template T reduce4(T v) { /* pre: ((unsigned*)&v)[i] < 10000 for all i * * preReduce2: * ((short*)&v)[2i] = ((unsigned*)&v)[i] / 100 * ((short*)&v)[2i + 1] = ((unsigned*)&v)[i] % 100 * * That is, we split each int in v into its one/ten and hundred/thousand * digit pairs. Put them into the corresponding short positions and then * call reduce2 to finish the splitting */ /* This is basically the same as reduce2 with different constants */ T k = ((v * 10486) >> 20) & 0xFF000000FFull; return reduce2(((v - k * 100) << 16) + (k)); } typedef unsigned long long ull; inline ull reduce8(ull v) { /* pre: v < 100000000 * * preReduce4: * ((unsigned*)&v)[0] = v / 10000 * ((unsigned*)&v)[1] = v % 10000 * * This should be familiar now, split v into the two 4-digit segments, * put them in the right place, and let reduce4 continue the splitting */ /* Again, use the same method as reduce4 and reduce2 with correctly tailored constants */ ull k = ((v * 3518437209u) >> 45); return reduce4(((v - k * 10000) << 32) + (k)); } template std::string itostr(T o) { /* * Use of this union is not strictly compliant, but, really, * who cares? This is just for fun :) * * Our ones digit will be in str[15] * * We don't actually need the first 6 bytes, but w/e */ union { char str[16]; unsigned short u2[8]; unsigned u4[4]; unsigned long long u8[2]; }; /* Technically should be "... ? unsigned(~0) + 1 ..." to ensure correct behavior I think */ /* Tends to compile to: v = (o ^ (o >> 31)) - (o >> 31); */ unsigned v = o < 0 ? ~o + 1 : o; /* We want u2[3] = v / 100000000 ... that is, the first 2 bytes of the decimal rep */ /* This is the same as in reduce8, that is divide by 10000. So u8[0] = v / 10000 */ u8[0] = (ull(v) * 3518437209u) >> 45; /* Now we want u2[3] = u8[0] / 10000. * If we added " >> 48 " to the end of the calculation below we would get u8[0] = u8[0] / 10000 * Note though that in little endian byte ordering u2[3] is the top 2 bytes of u8[0] * and 64 - 16 == 48... Aha, We've got what we want, the rest of u8[0] is junk, but we don't care */ u8[0] = (u8[0] * 28147497672ull); /* Then just subtract out those digits from v so that u8[1] now holds * the low 8 decimal digits of v */ u8[1] = v - u2[3] * 100000000; /* Split u8[1] into its 8 decimal digits */ u8[1] = reduce8(u8[1]); /* f will point to the high order non-zero char */ char* f; /* branch post: f is at the correct short (but not necessarily the correct byte) */ if (u2[3]) { /* split the top two digits into their respective chars */ u2[3] = reduce2(u2[3]); f = str + 6; } else { /* a sort of binary search on first non-zero digit */ unsigned short* k = u4[2] ? u2 + 4 : u2 + 6; f = *k ? (char*)k : (char*)(k + 1); } /* update f to its final position */ if (!*f) f++; /* '0' == 0x30 and i < 10 --> i <= 0xF ... that is, i | 0x30 = 'i' * * Note that we could do u8[0] |= ... u8[1] |= ... but the corresponding * x86-64 operation cannot use a 64 bit immediate value whereas the * 32 bit 'or' can use a 32 bit immediate. */ u4[1] |= 0x30303030; u4[2] |= 0x30303030; u4[3] |= 0x30303030; /* Add the negative sign... note that o is just the original parameter passed */ if (o < 0) *--f = '-'; /* gcc basically forwards this to std::string(f, str + 16) * but msvc handles it way more efficiently */ return std::string(f, (str + 16) - f); }