#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

typedef unsigned long long uint64;

#define C_ASSERT(expr) extern char CAssertExtern[(expr)?1:-1]
C_ASSERT(sizeof(uint64) * CHAR_BIT == 64);

void mul64full(uint64 prod[2], uint64 a, uint64 b)
{
  uint64 p0, p1, p2, p3;

  p0 = (a & 0xFFFFFFFF) *
       (b & 0xFFFFFFFF);

  p1 = (a & 0xFFFFFFFF) *
       (b >> 32);

  p2 = (b & 0xFFFFFFFF) *
       (a >> 32);

  p3 = (a >> 32) *
       (b >> 32);

  prod[0] = p0;
  prod[1] = p3;

  if ((p1 << 32) > 0xFFFFFFFFFFFFFFFFULL - prod[0])
    prod[1]++;
  prod[0] += p1 << 32;
  prod[1] += p1 >> 32;

  if ((p2 << 32) > 0xFFFFFFFFFFFFFFFFULL - prod[0])
    prod[1]++;
  prod[0] += p2 << 32;
  prod[1] += p2 >> 32;
}

void mul128short(uint64 prod[2], const uint64 a[2], const uint64 b[2])
{
  uint64 p0[2], p1[2], p2[2];

  mul64full(p0, a[0], b[0]);
  mul64full(p1, a[0], b[1]);
  mul64full(p2, a[1], b[0]);

  prod[0] = p0[0];
  prod[1] = p0[1] + p1[0] + p2[0];
}

int lessThan128(const uint64 a[2], const uint64 b[2])
{
  return (a[1] < b[1]) || ((a[1] == b[1]) && (a[0] < b[0]));
}

int div256x128(uint64 qr[4], const uint64 dividend[4], const uint64 divisor[2])
{
  int i;

  if (!lessThan128(dividend + 2, divisor))
    return 0; // overflow

  qr[0] = dividend[0];
  qr[1] = dividend[1];
  qr[2] = dividend[2];
  qr[3] = dividend[3];

  for (i = 0; i < 128; i++)
  {
    if (qr[3] >= 0x8000000000000000ULL)
    {
      qr[3] = (qr[3] << 1) | (qr[2] >> 63);
      qr[2] = (qr[2] << 1) | (qr[1] >> 63);
      qr[1] = (qr[1] << 1) | (qr[0] >> 63);
      qr[0] <<= 1;

      if (qr[2] < divisor[0])
        qr[3]--;
      qr[2] -= divisor[0];
      qr[3] -= divisor[1];

      qr[0] |= 1;
    }
    else
    {
      qr[3] = (qr[3] << 1) | (qr[2] >> 63);
      qr[2] = (qr[2] << 1) | (qr[1] >> 63);
      qr[1] = (qr[1] << 1) | (qr[0] >> 63);
      qr[0] <<= 1;

      if (!lessThan128(qr + 2, divisor))
      {
        if (qr[2] < divisor[0])
          qr[3]--;
        qr[2] -= divisor[0];
        qr[3] -= divisor[1];

        qr[0] |= 1;
      }
    }
  }

  return 1;
}

void randfill(void* p, size_t s)
{
  unsigned char* pc = p;
  while (s--)
    *pc++ = rand();
}

int main(void)
{
  int i;

  srand(time(NULL));

  for (i = 0; i < 100; i++)
  {
    uint64 qr[4], dividend[4], divisor[2];

    // divisor, 64 bits only
    randfill(&divisor[0], sizeof(divisor[0]));
    divisor[1] = 0;

    if (divisor[0] != 0)
    {
      uint64 dividend2[2];

      // dividend, 128 bits only
      randfill(&dividend[0], sizeof(dividend[0]) * 2);
      dividend[3] = dividend[2] = 0;

      // divide
      div256x128(qr, dividend, divisor);

      // test via multiplication and addition
      mul128short(dividend2, qr, divisor);
      if (dividend2[0] + qr[2] < dividend2[0])
        dividend2[1]++;
      dividend2[0] += qr[2];
      dividend2[1] += qr[3];

      printf("0x%016llX%016llX / 0x%016llX = 0x%016llX%016llX:0x%016llX%016llX\n",
             dividend[1], dividend[0], divisor[0], qr[1], qr[0], qr[3], qr[2]);

      if (dividend[0] != dividend2[0] ||
          dividend[1] != dividend2[1])
      {
        printf("0x%016llX%016llX - restored dividend - ERROR\n",
               dividend2[1], dividend2[0]);
        exit(-1);
      }
    }
  }

  return 0;
}
