#include <stdint.h>
#include <cstring>

/**
 * Fixed-size raw byte buffer.
 *
 * A plain aggregate wrapping a `char` array of `BytesCount` elements, so it
 * can be brace-initialized (`rawdata<4> r = { 1, 2, 3, 4 };`) and passed or
 * returned by value.
 *
 * @tparam BytesCount number of bytes stored in the buffer.
 */
template<int BytesCount>
struct rawdata
{
    char _[ BytesCount ];

    /**
     * Mutable access to one byte.
     *
     * @param index position of the byte; not bounds-checked.
     * @return reference to the byte at `index`.
     */
    char& operator[](int index)
    {
        return _[ index ];
    }

    /**
     * Read-only access to one byte, so that const objects and const
     * references to a buffer can be indexed as well.
     *
     * @param index position of the byte; not bounds-checked.
     * @return const reference to the byte at `index`.
     */
    const char& operator[](int index) const
    {
        return _[ index ];
    }
};

//...

/**
 * Swaps the two 4-byte halves of an 8-byte buffer using 32-bit word copies.
 *
 * Bytes 0..3 of the input end up in bytes 4..7 of the result and vice versa;
 * the byte order inside each half is left untouched.
 *
 * @param p buffer to swap (taken by value, the caller's copy is not modified).
 * @return a new buffer with the two halves exchanged.
 */
rawdata<8> genSWAP32(rawdata<8> p)
{
    rawdata<8> res = p;
    uint32_t first;
    uint32_t second;

    // Go through memcpy rather than casting the char storage to uint32_t*:
    // dereferencing such a pointer breaks the strict-aliasing rule, and the
    // char array is not guaranteed to be aligned for 32-bit access.
    // Compilers turn these fixed-size copies into plain register moves.
    std::memcpy(&first, &res[0], sizeof first);
    std::memcpy(&second, &res[4], sizeof second);
    std::memcpy(&res[0], &second, sizeof second);
    std::memcpy(&res[4], &first, sizeof first);
    return res;
}

/**
 * Swaps the two 4-byte halves of an 8-byte buffer one byte at a time.
 *
 * @param p buffer to swap (taken by value).
 * @return a buffer holding bytes 4..7 of `p` followed by bytes 0..3 of `p`.
 */
rawdata<8> genSWAP(rawdata<8> p)
{
    const int total = 8;
    const int half = 4;

    rawdata<8> swapped;
    // Every output byte comes from the position half a buffer further on,
    // wrapping around at the end.
    for (int dst = 0; dst < total; ++dst)
        swapped[ dst ] = p[ (dst + half) % total ];
    return swapped;
}


#include <iostream>
#include <chrono>

/**
 * Streams every byte of a buffer as a decimal integer, each one preceded by
 * the separator " - ".
 *
 * @param os stream to write to.
 * @param p  buffer to print (taken by value).
 * @return `os`, to allow chaining.
 */
template<int n>
std::ostream& operator<<(std::ostream& os, rawdata<n> p)
{
    for (char byte : p._)
    {
        // Convert to int so the numeric value is printed, not the character.
        os << " - " << static_cast<int>(byte);
    }
    return os;
}

/**
 * Prints the result of both swap implementations for a sample buffer, then
 * times N back-to-back calls of each one and reports the durations in
 * microseconds together with their ratio.
 */
int main()
{
    using steady = std::chrono::steady_clock;
    using std::chrono::duration_cast;
    using std::chrono::microseconds;

    // Check that both functions output the same thing
    rawdata<8> test = { 10, 20, 30, 40, 50, 60, 70, 80 };
    rawdata<8> t1 = genSWAP(test);
    rawdata<8> t2 = genSWAP32(test);
    std::cout << "test" << test << "\n";
    std::cout << "t1  " << t1 << "\n";
    std::cout << "t2  " << t2 << "\n";

    // Perf check: each iteration feeds the previous result back in.
    const uint64_t N = 10000000;
    rawdata<8> val = test;

    const steady::time_point swapBegin = steady::now();
    for (uint64_t t = 0; t < N; t++)
        val = genSWAP(val);
    const steady::time_point swapEnd = steady::now();
    const uint64_t timeSWAP = duration_cast<microseconds>(swapEnd - swapBegin).count();

    const steady::time_point swap32Begin = steady::now();
    for (uint64_t t = 0; t < N; t++)
        val = genSWAP32(val);
    const steady::time_point swap32End = steady::now();
    const uint64_t timeSWAP32 = duration_cast<microseconds>(swap32End - swap32Begin).count();

    std::cout << "SWAP  : " << timeSWAP << "µs\n";
    std::cout << "SWAP32: " << timeSWAP32 << "µs\n";
    std::cout << "SWAP32 is " << (timeSWAP / static_cast<float>(timeSWAP32)) << "x faster than SWAP\n";
    // Printing the final value keeps the timed loops' result in use.
    std::cout << "\n\n(unused" << val << ")\n";
}
