#include <unordered_map>
#include <vector>
#include <algorithm>

#include <chrono>

#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cmath>

template<typename T> struct cast_to_uint
{
	// This is identical to std::hash, but it optimizes differently!
	constexpr std::size_t operator()(T const* k) const
	{
		return reinterpret_cast<size_t>(k);
	}
};

template<typename T, unsigned int N> struct right_shift
{
	// This actually makes sense for N = 2 or N = 3, since the low 3 bits
	// are always zero, for larger N this only discards entropy.
	// Surprisingly, this is not at all a bad hash, most of the time
	// among the fastest three!
	constexpr std::size_t operator()(T const* k) const
	{
		return (size_t) k>>N;
	}
};

template<typename T> struct simple_xorshift
{
	// Marsaglia PRNG used to hash bits, large shifts used
	// to fill the constant low zero-bits with entropy
	// Mixes bits very well considering it's about 2x the speed of murmur,
	// but no overall advantage in hash table performance
	std::size_t operator()(T const* k) const
	{
		unsigned int x = (unsigned int) k;
		x ^= x >> 11;
		x ^= x << 7;
		x ^= x >> 12;
		return x;
	}
};

template<typename T> struct double_xorshift
{
	// Same as above, but combining two different generators
	// Sadly 50% slower on x86 due to lack of registers
	// (amd64 implementation should be +/- 1 clock tick of simple version)
	std::size_t operator()(T const* k) const
	{
		unsigned int x = (unsigned int) k;
		unsigned int y = x;
		x ^= x << 14;
		x ^= x >> 13;
		x ^= x << 15;
		y ^= y << 11;
		y ^= y >> 19;
		y ^= y << 8;

		return x+y;
	}
};

template<typename T> struct djb2
{
	// standard implementation,
	// hardcoded input size = 4
	std::size_t operator()(T const* k) const
	{
		unsigned int x = (unsigned int) k;
		unsigned int h = 5381;
		h = h*33 + (x     & 0xff);
		h = h*33 + (x>>8  & 0xff);
		h = h*33 + (x>>16 & 0xff);
		h = h*33 + (x>>24 & 0xff);
		return h;
	}
};

template<typename T> struct djb2_mod
{
	// Same as above, except for using larger seed so
	// overflow happens earlier (since only 4 bytes of input)
	std::size_t operator()(T const* k) const
	{
		unsigned int x = (unsigned int) k;
		unsigned int h = 715225739;
		h = h*33 + (x     & 0xff);
		h = h*33 + (x>>8  & 0xff);
		h = h*33 + (x>>16 & 0xff);
		h = h*33 + (x>>24 & 0xff);
		return h;
	}
};

template<typename T> struct sdbm
{
	// Standard implementation, hardcoded input size = 4
	std::size_t operator()(T const* k) const
	{
		unsigned int x = (unsigned int) k;
		unsigned int h = 0;

		h = h * 65599 + (x     & 0xff);
		h = h * 65599 + (x>>8  & 0xff);
		h = h * 65599 + (x>>16 & 0xff);
		h = h * 65599 + (x>>24 & 0xff);
		return h;
	}
};

template<typename T> struct yet_another_lc
{
	// Same design as djb and sdbm, using different prime multiplier.
	// This is the last one of about 50 different primes beween 7 and 17 bits
	// that I've tried. None truly performed any better (though some did worse).
	// This proves the common knowledge that finding a good (or better) hash function
	// is very hard.
	std::size_t operator()(T const* k) const
	{
		unsigned int x = (unsigned int) k;
		unsigned int h = 174440041;

		h = h * 257 + (x     & 0xff);
		h = h * 257 + (x>>8  & 0xff);
		h = h * 257 + (x>>16 & 0xff);
		h = h * 257 + (x>>24 & 0xff);
		return h;
	}
};

template<typename T> struct murmur2
{
	// Hardcoded input size = 4
	std::size_t operator()(T const* k) const
	{
		const unsigned int m = 0x5bd1e995;

		uint32_t h = 4;

		unsigned int d = (unsigned int) k;

		d *= m;
		d ^= d >> 24;
		d *= m;

		h *= m;
		h ^= d;


		h ^= h >> 13;
		h *= m;
		h ^= h >> 15;

		return h;
	}
};

template<typename T> struct murmur3
{
	// Hardcoded input size = 4.
	// Omitted xor with input size in the final mix step, as that's a constant,
	// and this only flips a single bit (waste of CPU cycles).

	template<int r> constexpr uint32_t rotl(uint32_t x) { return (x << r) | (x >> (32 - r)); }

	unsigned int fmix32( unsigned int h ) const
	{
		h ^= h >> 16;
		h *= 0x85ebca6b;
		h ^= h >> 13;
		h *= 0xc2b2ae35;
		h ^= h >> 16;
		return h;
	}

	std::size_t operator()(T const* k) const
	{
		constexpr unsigned int c1 = 0xcc9e2d51;
		constexpr unsigned int c2 = 0x1b873593;

		unsigned int h = (unsigned int) k;

		h *= c1;
		h = rotl<15>(h);
		h *= c2;

		h = rotl<13>(h);
		h = h*5+0xe6546b64;

		return fmix32(h);
	}
};

// Copy-paste as posted on SO
template<typename Tval>
struct MyTemplatePointerHash1 {
    size_t operator()(const Tval* val) const {
        static const size_t shift = (size_t)log2(1 + sizeof(Tval));
        return (size_t)(val) >> shift;
    }
};

// Copy-paste as posted on SO
struct BasileStarynkevitch {
      size_t operator()(const void* val) const {
         uintptr_t ad = (uintptr_t) val;
         return (size_t) ((13*ad) ^ (ad >> 15));
      }
  };



template<typename H> void benchmark_hash(const char* name)
{
	using namespace std::chrono;

	printf("%-25s ", name);
	H hash;

	{
		auto start = system_clock::now();

		int unused = 0;
		for(unsigned int i = 0; i < 4*1000*1000*1000; ++i)
			unused += hash((int*)i);
		__attribute__ (( unused )) volatile int optimizer_sink = unused;

		auto stop = system_clock::now();
		printf("%-10lld\n", duration_cast<milliseconds>(stop - start).count());
	}
}


template<typename T, typename H> void test(std::vector<T*> const& vec, unsigned int bucket_count, const char* name)
{
	using namespace std::chrono;

	printf("%-25s ", name);

	{
		auto start = system_clock::now();

		for(unsigned int iterations = 0; iterations < 50; ++iterations)
		{
			std::unordered_map<T*, int, H> map(bucket_count);
			for(auto& elem : vec) map.insert(std::pair<T*, int>(elem, 1));
		}
		auto stop = system_clock::now();
		printf("%-10lld ", duration_cast<milliseconds>(stop - start).count());
	}

	{
		std::unordered_map<T*, int, H> map(bucket_count);

		for(auto& elem : vec) map.insert(std::pair<T*, int>(elem, 1));

		auto start = system_clock::now();

		int unused = 0;
		for(unsigned int i = 0; i < 100*1000*1000; ++i)
		{
			auto it = map.find(vec[(rand() ^ (rand()<<16)) % vec.size()]);
			unused += it->second;
		}
		__attribute__ (( unused )) volatile int optimizer_sink = unused;

		auto stop = system_clock::now();
		printf("%-10lld ", duration_cast<milliseconds>(stop - start).count());
	}


	puts("");
	fflush(stdout);
}


template<typename T> void do_tests(unsigned int num_elements, bool shuffle)
{
	puts  ("--------------------------------");
	printf("sizeof(T)       = %u\n", sizeof(T));
	printf("sizeof(T*)      = %u\n", sizeof(T*));
	printf("insertion order = %s\n\n", (shuffle ? "random" : "sequential"));
	printf("dataset size    = %8u elements\n", num_elements);

	std::vector<T*> vec;
	vec.reserve(num_elements);

	for(unsigned int i = 0; i < num_elements; ++i)
		vec.push_back(new T);

	for(auto& elem : vec)
		delete elem; // don't actually need these, only need the addresses

	if(shuffle) std::random_shuffle(vec.begin(), vec.end());

	unsigned int seed = time(0); // random, but identical access pattern for all tests

	printf("%-25s %-10s %-10s\n", "name", "insert", "access");

	srand(seed);  test<T, std::hash<T*>>             (vec, num_elements, "std::hash");
	srand(seed);  test<T, cast_to_uint<T>>           (vec, num_elements, "reinterpret_cast");
	srand(seed);  test<T, djb2<T>>                   (vec, num_elements, "djb2");
	srand(seed);  test<T, djb2_mod<T>>               (vec, num_elements, "djb2_mod");
	srand(seed);  test<T, sdbm<T>>                   (vec, num_elements, "sdbm");
	srand(seed);  test<T, yet_another_lc<T>>         (vec, num_elements, "yet_another_lc");
	srand(seed);  test<T, murmur2<T>>                (vec, num_elements, "murmur2");
	srand(seed);  test<T, murmur3<T>>                (vec, num_elements, "murmur3");
	srand(seed);  test<T, simple_xorshift<T>>        (vec, num_elements, "simple_xorshift");
	srand(seed);  test<T, double_xorshift<T>>        (vec, num_elements, "double_xorshift");
	srand(seed);  test<T, right_shift<T, 2>>         (vec, num_elements, "right_shift_2");
	srand(seed);  test<T, right_shift<T, 3>>         (vec, num_elements, "right_shift_3");
	srand(seed);  test<T, right_shift<T, 4>>         (vec, num_elements, "right_shift_4");
	srand(seed);  test<T, right_shift<T, 5>>         (vec, num_elements, "right_shift_5");
	srand(seed);  test<T, right_shift<T, 8>>         (vec, num_elements, "right_shift_8");
	srand(seed);  test<T, right_shift<T, 12>>        (vec, num_elements, "right_shift_12");
	srand(seed);  test<T, MyTemplatePointerHash1<T>> (vec, num_elements, "MyTemplatePointerHash1");
	srand(seed);  test<T, BasileStarynkevitch>       (vec, num_elements, "BasileStarynkevitch");
	puts("");
}



#include <windows.h>

struct object64   { char x[64]; };
struct object256  { char x[256]; };
struct object1024 { char x[1024]; };

int main()
{
	timeBeginPeriod(1);

	puts("Benchmarking hash funcs...");
	benchmark_hash<std::hash<int*>>             ("std::hash");
	benchmark_hash<cast_to_uint<int>>           ("reinterpret_cast");
	benchmark_hash<djb2<int>>                   ("djb2");
	benchmark_hash<djb2_mod<int>>               ("djb2_mod");
	benchmark_hash<sdbm<int>>                   ("sdbm");
	benchmark_hash<yet_another_lc<int>>         ("yet_another_lc");
	benchmark_hash<murmur2<int>>                ("murmur2");
	benchmark_hash<murmur3<int>>                ("murmur3");
	benchmark_hash<simple_xorshift<int>>        ("simple_xorshift");
	benchmark_hash<double_xorshift<int>>        ("double_xorshift");
	benchmark_hash<right_shift<int, 2>>         ("right_shift_2");
	benchmark_hash<right_shift<int, 3>>         ("right_shift_3");
	benchmark_hash<right_shift<int, 4>>         ("right_shift_4");
	benchmark_hash<right_shift<int, 5>>         ("right_shift_5");
	benchmark_hash<right_shift<int, 8>>         ("right_shift_8");
	benchmark_hash<right_shift<int, 12>>        ("right_shift_12");
	benchmark_hash<MyTemplatePointerHash1<int>> ("MyTemplatePointerHash1");
	benchmark_hash<BasileStarynkevitch>         ("BasileStarynkevitch");

	puts("");

	do_tests<int>(50000, false);
	do_tests<int>(50000, true);
	do_tests<int>(1000000, false);
	do_tests<int>(1000000, true);


	do_tests<object64>(50000, false);
	do_tests<object64>(50000, true);
	do_tests<object64>(1000000, false);
	do_tests<object64>(1000000, true);

	do_tests<object256>(50000, false);
	do_tests<object256>(50000, true);
	do_tests<object256>(1000000, false);
	do_tests<object256>(1000000, true);

	do_tests<object1024>(50000, false);
	do_tests<object1024>(50000, true);
	do_tests<object1024>(1000000, false);
	do_tests<object1024>(1000000, true);


	timeEndPeriod(1);

    return 0;
}
