#include <emmintrin.h>
#include <tmmintrin.h>

#define AZI_FMT_RGB_P16 0x22272
#define AZI_FMT_RGBA_I8 0x3210F4
#define AZI_FMT_RGBA_I16 0x6420F8

typedef struct azi_image {
    unsigned int width, height, format;
    union {
        unsigned short int* pixels;
        unsigned int* rgba;
    };
} azi_image;

#define SET16(a, b) a = (__v8hu)_mm_setr_epi16(b, b, b, b, b, b, b, b)
#define LOAD(a, b) a = (__v8hu)_mm_load_si128((__m128i*)(b))
#define UPLO(a, b) a = (__v8hu)_mm_unpacklo_epi8((__m128i)b, (__m128i)regZero)
#define UPHI(a, b) a = (__v8hu)_mm_unpackhi_epi8((__m128i)b, (__m128i)regZero)
#define ADD(a, b) a = (__v8hu)_mm_add_epi16((__m128i)a, (__m128i)b)
#define MUL(a, b) a = (__v8hu)_mm_mullo_epi16((__m128i)a, (__m128i)b)
#define SRL16I(a, b) a = (__v8hu)_mm_srli_epi16((__m128i)a, (__m128i)b);

void azi_rgba_pr_fir3_line_sse2(azi_image* image,
    unsigned char* data,
    double src_left,
    double src_top,
    double src_right,
    double src_bottom,
    double dst_left,
    double dst_top,
    double scale,
    int width)
{
    int dst_left_int = ceil(dst_left);
    int dst_right_int = floor(dst_left + scale * (src_right - src_left) + 0.1);
    int dst_top_int = ceil(dst_top);
    int dst_bottom_int = floor(dst_top + scale * (src_bottom - src_top) + 0.1);

    long long src_fixp_x_begin = src_left * 0x100000000l;
    long long src_fixp_y = src_top * 0x100000000l;
    long long src_step = 0x100000000l / scale;

    if(src_step * (dst_bottom_int - dst_top_int) >> 32 >= image->height)
        src_step--;
    else if(src_step * (dst_right_int - dst_left_int) >> 32 >= image->width)
        src_step--;

    int linesize = 4 * width;
    data += linesize * dst_top_int;

    int line_pitch = (image->width - 1 | 3) + 1;
    int x4begin = src_fixp_x_begin >> 34 << 2;
    unsigned int* line0 = image->rgba + ((int)src_top * line_pitch);
    unsigned int *line1 = line0, *line2 = line0, *line3 = line0;

    // MM registers
    __v8hu regX, regY, regZero, reg0, reg1, regK0, regK1, regK2, regK3, regS, regKC, regKP, reg2, reg3, regLC, regLI;

    SET16(regZero, 0);

    double flK0;
    if(src_step >= 0x300000000) {
        flK0 = 0.0;
    } else if(src_step <= 0x180000000) {
        flK0 = 1.0;
    } else {
        flK0 = (0x300000000 - src_step) / (double)0x180000000;
        // flK0 = sqrt(flK0);
    }
    double flK1 = 0.5 * (1. - flK0);

    short shKP = 256. * flK1 + 0.5;
    short shKC = 256 - shKP - shKP; // corrected

    SET16(regKC, shKC);
    SET16(regKP, shKP);

    for(int y = src_top; y < src_bottom; y++) {
        long long src_fixp_x = src_fixp_x_begin;
        int dst_x = dst_left_int;

        int r = src_right;
        for(int x = r; x < (r - 1 | 3) + 1; x++)
            line0[x] = line0[r - 1];

        if(src_fixp_y >> 32 == y) {

            double flKV = (double)(src_fixp_y & 0xFFFFFFFFll) / 0x100000000ll;

            double k0kV = flK0 * flKV, k1kV = flK1 * flKV, k0 = k1kV, k1 = k0kV + flK1 - k1kV, k2 = k1kV + flK0 - k0kV,
                   k3 = flK1 - k1kV;

            short shK0 = 256. * k0 + 0.5, shK1 = 256. * k1 + 0.5, shK2 = 256. * k2 + 0.5, shK3 = 256. * k3 + 0.5;
            shK1 = 256 - shK0 - shK2 - shK3; // corrected

            SET16(regK0, shK0);
            SET16(regK1, shK1);
            SET16(regK2, shK2);
            SET16(regK3, shK3);

            LOAD(regLC, line0 + x4begin);
            UPLO(regLC, regLC);
            regLI = _mm_slli_si128(regLC, 8);
            regLI = _mm_srli_si128(regLI, 8);
            regLC = _mm_slli_epi16(regLC, 8);

            for(int x = x4begin; x < src_right; x += 4) {

                // VERTICAL FILTERING
                LOAD(regX, line0 + x);
                UPLO(reg0, regX);
                MUL(reg0, regK0);
                UPHI(reg1, regX);
                MUL(reg1, regK0);

                LOAD(regX, line1 + x);
                UPLO(regY, regX);
                MUL(regY, regK1);
                ADD(reg0, regY);
                UPHI(regY, regX);
                MUL(regY, regK1);
                ADD(reg1, regY);

                LOAD(regX, line2 + x);
                UPLO(regY, regX);
                MUL(regY, regK2);
                ADD(reg0, regY);
                UPHI(regY, regX);
                MUL(regY, regK2);
                ADD(reg1, regY);

                LOAD(regX, line3 + x);
                UPLO(regY, regX);
                MUL(regY, regK3);
                ADD(reg0, regY);
                UPHI(regY, regX);
                MUL(regY, regK3);
                ADD(reg1, regY);

                // SRL16I(reg0, 8);
                // SRL16I(reg1, 8);

                // vertically filtered 4 pixels in this registers now

                // !!! use extract and insert intrinsics?

                regX = reg0;
                reg2 = _mm_slli_si128(reg0, 8);
                reg3 = _mm_slli_si128(reg1, 8);
                regX = _mm_srli_si128(regX, 8);

                reg3 = _mm_or_si128(reg3, regX);

                regX = regLC;

                // [regX, reg0] shifted 2 now

                regLC = _mm_srli_si128(regLC, 8);
                reg2 = _mm_or_si128(reg2, regLC);

                // [reg2, reg3] shifted 1 now

                regLC = reg1; // remembered next regLC

                // [reg0, reg1] unshifted
                // [regX, reg0] shifted 2
                // [reg2, reg3] shifted 1
                reg0 = _mm_srli_epi16(reg0, 8);
                reg1 = _mm_srli_epi16(reg1, 8);
                reg2 = _mm_srli_epi16(reg2, 8);
                reg3 = _mm_srli_epi16(reg3, 8);
                regX = _mm_srli_epi16(regX, 8);

                reg1 = _mm_add_epi16(reg1, reg0);
                reg0 = _mm_add_epi16(reg0, regX);

                reg0 = _mm_mullo_epi16(reg0, regKP);
                reg1 = _mm_mullo_epi16(reg1, regKP);

                reg2 = _mm_mullo_epi16(reg2, regKC);
                reg3 = _mm_mullo_epi16(reg3, regKC);

                reg2 = _mm_add_epi16(reg0, reg2);
                reg3 = _mm_add_epi16(reg1, reg3);

                // [reg2, reg3] our result

                reg2 = _mm_srli_epi16(reg2, 8);
                reg3 = _mm_srli_epi16(reg3, 8);

                while(src_fixp_x >> 34 == x >> 2) {

                    int index = (src_fixp_x >> 32) - x;

                    switch(index) {
                    case 0:
                        reg0 = _mm_slli_si128(reg2, 8);
                        reg0 = _mm_or_si128(reg0, regLI);
                        break;
                    case 1:
                        reg0 = reg2;
                        break;
                    case 2:
                        reg0 = _mm_srli_si128(reg2, 8);
                        reg1 = _mm_slli_si128(reg3, 8);
                        reg0 = _mm_or_si128(reg0, reg1);
                        break;
                    case 3:
                        reg0 = reg3;
                        break;
                    }

                    unsigned long long llKI = (src_fixp_x >> 24) & 0xFF;
                    unsigned long long llK1 = 256 - llKI;
                    llKI |= llKI << 16;
                    llKI |= llKI << 32;
                    llK1 |= llK1 << 16;
                    llK1 |= llK1 << 32;

                    regX = _mm_cvtsi64_si128(llKI);
                    regY = _mm_cvtsi64_si128(llK1);
                    regX = _mm_slli_si128(regX, 8);
                    reg1 = _mm_or_si128(regX, regY);

                    reg0 = _mm_mullo_epi16(reg0, reg1);
                    reg1 = _mm_srli_si128(reg0, 8);
                    reg0 = _mm_add_epi16(reg0, reg1);

                    /*/ alpha blending
                    unsigned long d = *((unsigned int*)(data + dst_x * 4));
                    reg1 = _mm_cvtsi64_si128(d);
                    reg1 = _mm_unpacklo_epi8(reg1, regZero);
                    unsigned long alpha = _mm_cvtsi128_si64(reg0);
                    alpha = alpha >> 48 & 0xFFFF;
                    alpha = alpha * 65536 / 65280 / 256;
                    unsigned long not_alpha = 256 - alpha;
                    alpha |= alpha << 16;
                    alpha |= alpha << 32;
                    not_alpha |= not_alpha << 16;
                    not_alpha |= not_alpha << 32;
                    regX = _mm_cvtsi64_si128(alpha);
                    regY = _mm_cvtsi64_si128(not_alpha);
                    //
                    reg0 = _mm_srli_epi16(reg0, 8);
                    reg0 = _mm_mullo_epi16(reg0, regX);
                    reg1 = _mm_mullo_epi16(reg1, regY);
                    reg0 = _mm_add_epi16(reg0, reg1);
                     */
                    reg0 = _mm_srli_epi16(reg0, 8);
                    //*

                    // reg0 = _mm_srli_epi16(reg0, 8);
                    reg0 = _mm_packus_epi16(reg0, reg0);

                    unsigned int color = _mm_cvtsi128_si32(reg0);
                    *((unsigned int*)(data + dst_x * 4)) = color | 0xFF000000u;
                    //*((unsigned int*)(data + dst_x * 4)) = 0x80808080u;

                    /*if(color < 0xFF000000u) {
                        printf("color 0x%8.8X\n", color);
                        return;
                    }*/

                    dst_x++;
                    src_fixp_x += src_step;
                }
                regLI = _mm_srli_si128(reg3, 8);
            }
        }
        // break;
        line3 = line2;
        line2 = line1;
        line1 = line0;
        line0 += line_pitch;
        if(src_fixp_y >> 32 > y)
            continue;
        src_fixp_y += src_step;
        data += linesize;
    }
}
