#include <iostream>
#include <emmintrin.h>

using namespace std;

// Eight signed 16-bit test samples: exactly one 128-bit SSE register's worth.
short input[8] = {
    0,
    -1,
    2048,
    -2048,
    0x7fff,                             // largest signed 16-bit value
    -0x8000,                            // smallest signed 16-bit value
    static_cast<short>(2048 * 3.1415),  // 6433.792 truncated to 6433
    static_cast<short>(2048 * 2.7182),  // 5566.8736 truncated to 5566
};

// Destination buffer with one float slot per entry of `input`.
float output[8] = {};



// Converts the eight signed 16-bit samples in `input` to floats equal to
// sample / 2048, writes them to `output`, and prints them on one line.
//
// The conversion builds each float's bit pattern directly instead of using an
// integer-to-float instruction:
//   1. Adding 0x8000 (mod 2^16) maps a signed sample x to u = x + 32768, an
//      unsigned 16-bit value.
//   2. Placing 0x4580 in the upper 16 bits gives the bit pattern 0x4580uuuu.
//      0x45800000 is 4096.0f, where one mantissa step is 2^-11 = 1/2048, so
//      the pattern reads as 4096 + u / 2048.
//   3. Subtracting (2^23 + 2^15) / 2048 = 4096 + 16 leaves (u - 32768) / 2048,
//      i.e. x / 2048. That value is exactly representable, so no rounding
//      occurs.
int main() {
    // Unaligned load: no assumption is made about the alignment of `input`.
    const __m128i samples =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(input));

    // -0x8000 fits in a short and has the same 16-bit pattern as 0x8000, so the
    // wrap-around add is unchanged while the argument stays in range for the
    // `short` parameter (0x8000 itself would be converted implicitly).
    const __m128i sign_bias = _mm_set1_epi16(-0x8000);
    const __m128i as_unsigned = _mm_add_epi16(samples, sign_bias);

    // Upper 16 bits of 4096.0f (= float(1 << 23) / 2048.f), one copy per lane.
    const __m128i float_high_half = _mm_set1_epi16(0x4580);

    // Interleave so that every 32-bit lane holds (0x4580 << 16) | sample.
    const __m128i low_bits = _mm_unpacklo_epi16(as_unsigned, float_high_half);
    const __m128i high_bits = _mm_unpackhi_epi16(as_unsigned, float_high_half);

    // Offset that removes both the 4096 base and the 0x8000 sign bias.
    const __m128 offset =
        _mm_set_ps1(static_cast<float>((1 << 23) + (1 << 15)) / 2048.f);

    // Reinterpret the lanes as floats and subtract the offset.
    const __m128 low_floats = _mm_sub_ps(_mm_castsi128_ps(low_bits), offset);
    const __m128 high_floats = _mm_sub_ps(_mm_castsi128_ps(high_bits), offset);

    // Samples 0-3 come from the low unpack, samples 4-7 from the high unpack.
    _mm_storeu_ps(output, low_floats);
    _mm_storeu_ps(output + 4, high_floats);

    for (const float value : output)
        std::cout << value << " ";
    std::cout << "\n";
}