// bench.c
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
#include <stdarg.h>
#include <pthread.h>

// Worker callback executed once per worker slot by run_ncore().
//   index - this worker's slot number, 0 <= index < count
//   count - total number of worker slots in the run
//   data  - opaque pointer the caller passed to run_ncore()
// The returned int is stored in the slot's run_ncore_t.rc.
typedef int (*core_fn)(int index,int count,void* data);

// Per-slot bookkeeping record used by run_ncore(); one record per worker.
typedef struct {
	pthread_t thread;         // handle written by pthread_create (slot 0 runs on the calling thread and never sets it)
	int index;                // slot number passed to the callback
	int count;                // total number of slots passed to the callback
	void *data;               // opaque caller pointer passed to the callback
	core_fn core;             // the callback itself
	int rc;                   // value returned by the callback
	int trc;                  // pthread_create result for this slot (0 = thread started, or slot 0)
	struct timespec ts_start; // common reference timestamp all offsets are measured from
	double t_begin;           // seconds after ts_start at which the callback was entered
	double t_end;             // seconds after ts_start at which the callback returned
} run_ncore_t;

// Reports "FATAL_ERROR: <msg>" on stderr and ends the process with exit
// status 1. The int return type exists only so a call can sit inside an
// expression such as `failed && fatal_error("...")`; control never returns.
static int fatal_error(const char *msg)
{
	fprintf(stderr,
		"FATAL_ERROR: %s\n",
		msg);
	exit(1);
}
// Returns the time elapsed since prm->ts_start, in seconds, using the
// current CLOCK_REALTIME reading. Terminates via fatal_error() if the clock
// cannot be read.
static double run_ncore_ts(run_ncore_t *prm)
{
	struct timespec now;
	time_t sec;
	long nsec;

	if (clock_gettime(CLOCK_REALTIME, &now) != 0) {
		fatal_error("clock_gettime");
	}

	sec = now.tv_sec - prm->ts_start.tv_sec;
	nsec = now.tv_nsec - prm->ts_start.tv_nsec;
	if (nsec < 0) {
		// borrow one second so the nanosecond part stays in [0, 1e9)
		sec--;
		nsec += 1000000000;
	}
	return sec + 1e-9 * nsec;
}
// Body of one worker: records the entry time, runs the callback and records
// the exit time, all inside the run_ncore_t record passed as `ctx`.
// Has the pthread start-routine signature; run_ncore() also calls it directly
// for slot 0. Always returns NULL — the callback's result is kept in prm->rc.
static void* run_ncore_main(void* ctx) {
	run_ncore_t *prm=(run_ncore_t*)ctx;
	prm->t_begin=run_ncore_ts(prm);
	prm->rc=prm->core(prm->index,prm->count,prm->data);
	prm->t_end=run_ncore_ts(prm);
	return NULL;
}
// When non-zero, run_ncore() prints the thread-creation time, the total
// working time and each worker's start/done offsets to stdout after a run.
int run_ncore_en_stat=0;
// Runs `core` once per worker slot on `count` workers and times the run.
// Slots 1..count-1 each get their own pthread; slot 0 is executed on the
// calling thread once all other threads have been launched.
//   count - number of worker slots, must be >= 1
//   core  - callback invoked as core(index,count,data) for every slot
//   data  - opaque pointer forwarded unchanged to every callback
//   dt    - optional out parameter: seconds from the reference timestamp
//           (taken just before the threads are created) until every started
//           worker has been joined; written only when 0 is returned
// Returns 0 on success, 1 if count<1, 2 if the bookkeeping array cannot be
// allocated, 3 if any pthread_create or pthread_join call failed (workers
// that did start still run to completion, but *dt is left untouched).
int run_ncore(int count,core_fn core,void *data,double *dt) {
	run_ncore_t *cores;
	int i,nerr=0;
	struct timespec ts[1];
	double ts_start, ts_done;

	if (count<1) return 1; // invalid argument
	// calloc zero-fills every record and checks count*sizeof for overflow
	cores=calloc((size_t)count,sizeof *cores);
	if (!cores) return 2; // unable to allocate memory
	if (clock_gettime(CLOCK_REALTIME,ts)) fatal_error("clock_gettime");
	for(i=0;i<count;i++) {
		cores[i].index=i;
		cores[i].count=count;
		cores[i].data=data;
		cores[i].core=core;
		cores[i].ts_start=*ts;
		// slot 0 is run by the calling thread below, so it gets no thread
		cores[i].trc=i>0 ? pthread_create(&cores[i].thread,NULL,run_ncore_main,&cores[i]) : 0;
		if (cores[i].trc) nerr++;
	}
	ts_start=run_ncore_ts(cores); // time spent creating the threads
	run_ncore_main(cores);
	for(i=1;i<count;i++) {
		// only threads that were actually created can be joined
		if (cores[i].trc==0 && pthread_join(cores[i].thread,NULL)) nerr++;
	}
	ts_done=run_ncore_ts(cores);
	if (run_ncore_en_stat) {
		printf("creating threads dt=%.3fms\n",ts_start*1e3);
		printf("working time ts=%.3fms\n",ts_done*1e3);
		for(i=0;i<count;i++) {
			printf("%d. start=%.3fms done=%.6fms\n",i,
				cores[i].t_begin*1e3,cores[i].t_end*1e3);
		}
	}
	free(cores);
	if (nerr) return 3; // has problems
	if (dt) *dt=ts_done;
	return 0;
}

// Buffer descriptor shared by the memtest1_* workers, which receive it
// through the opaque `data` pointer of their callback.
typedef struct {
	void *buf;       // buffer obtained from malloc by memtest1_init(); 0 if that failed or after memtest1_done()
	size_t buf_size; // size in bytes that was requested for buf
} memtest1_t;

// Allocates a buf_size-byte buffer with malloc and records both the pointer
// and the requested size in *t. Returns 0 when the allocation produced a
// non-null pointer and 1 otherwise (t->buf_size is stored either way).
int memtest1_init(memtest1_t *t,size_t buf_size)
{
	void *mem = malloc(buf_size);

	t->buf = mem;
	t->buf_size = buf_size;
	if (mem == NULL) {
		return 1;
	}
	return 0;
}
// Releases the buffer owned by *t and clears the pointer, so calling this
// again on the same descriptor does nothing. buf_size is left as it was.
void memtest1_done(memtest1_t *t)
{
	// free(NULL) is a no-op, so no separate null check is needed
	free(t->buf);
	t->buf = NULL;
}

// core_fn worker: fills this worker's slice of the shared buffer with 0xFF
// bytes using memset.
//   index/count - slot number and slot total; the buffer is cut into `count`
//                 consecutive slices and slot `index` gets the index-th one
//   data        - pointer to the memtest1_t describing the buffer
// Slice boundaries are rounded up to a multiple of 64 bytes (presumably to
// keep workers on separate cache lines — confirm) and then clamped to the
// buffer size, so a buffer whose size is not a multiple of 64 is never
// written past its end. Always returns 0.
int memtest1_memset(int index,int count,void *data) {
	memtest1_t *mt=(memtest1_t*)data;
	size_t total=mt->buf_size;
	size_t h,t;
	h=(size_t)index*total/(size_t)count;     h=(h+63)/64*64;
	t=((size_t)index+1)*total/(size_t)count; t=(t+63)/64*64;
	// rounding up may step beyond the end of the buffer; pull it back
	if (h>total) h=total;
	if (t>total) t=total;
	memset((char*)mt->buf+h,255,t-h);
	return 0;
}
// Fills memory starting at `data` with copies of the 8-byte word `value`.
//   data  - destination; must be writable for the rounded-up length
//   value - word stored into every 8-byte cell (NOT a byte pattern:
//           value 255 stores 0x00000000000000FF, not eight 0xFF bytes)
//   size  - length in bytes; rounded UP to a whole number of 8-byte words,
//           so up to 7 bytes beyond `size` are written when size%8 != 0
// On x86-64 with GCC/Clang the fill is a single `rep stosq`; on any other
// target a plain store loop is used.
void my_memset8(void* data,size_t value,size_t size) {
	size=(size+7)>>3; // bytes -> number of 8-byte words
#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
	// rep stosq advances RDI and counts RCX down to zero, so both must be
	// read-write operands; the "memory" clobber tells the compiler that the
	// destination buffer has been modified behind its back.
	__asm__ volatile ("rep stosq"
		: "+D"(data), "+c"(size)
		: "a"(value)
		: "memory"
	);
#else
	unsigned long long *dst=(unsigned long long*)data;
	while (size--) *dst++=value;
#endif
}
// core_fn worker: fills this worker's slice of the shared buffer with the
// 8-byte word 255 using my_memset8().
//   index/count - slot number and slot total; the buffer is cut into `count`
//                 consecutive slices and slot `index` gets the index-th one
//   data        - pointer to the memtest1_t describing the buffer
// Slice boundaries are rounded up to a multiple of 64 bytes and then clamped
// to the buffer size rounded DOWN to a multiple of 8. my_memset8() rounds its
// length up to whole 8-byte words, so keeping every slice length a multiple
// of 8 guarantees nothing is written past the end of the buffer (a tail of
// fewer than 8 bytes is simply left untouched). Always returns 0.
int memtest1_memset8(int index,int count,void *data) {
	memtest1_t *mt=(memtest1_t*)data;
	size_t limit=mt->buf_size & ~(size_t)7;
	size_t h,t;
	h=(size_t)index*mt->buf_size/(size_t)count;     h=(h+63)/64*64;
	t=((size_t)index+1)*mt->buf_size/(size_t)count; t=(t+63)/64*64;
	if (h>limit) h=limit;
	if (t>limit) t=limit;
	my_memset8((char*)mt->buf+h,255,t-h);
	return 0;
}
// core_fn worker: copies the upper half of this worker's slice of the shared
// buffer onto the lower half using memmove.
//   index/count - slot number and slot total; the buffer is cut into `count`
//                 consecutive slices and slot `index` gets the index-th one
//   data        - pointer to the memtest1_t describing the buffer
// Slice boundaries are rounded up to a multiple of 64 bytes and then clamped
// to the buffer size so a buffer whose size is not a multiple of 64 is never
// accessed past its end. With m the slice midpoint, m-h bytes are copied from
// offset m to offset h; the source range ends at 2m-h, which never exceeds
// the slice end. Always returns 0.
int memtest1_memmove(int index,int count,void *data) {
	memtest1_t *mt=(memtest1_t*)data;
	size_t total=mt->buf_size;
	size_t h,t,m;
	h=(size_t)index*total/(size_t)count;     h=(h+63)/64*64;
	t=((size_t)index+1)*total/(size_t)count; t=(t+63)/64*64;
	if (h>total) h=total;
	if (t>total) t=total;
	m=(h+t)/2;
	memmove((char*)mt->buf+h,(char*)mt->buf+m,m-h);
	return 0;
}
// core_fn worker: copies the upper half of this worker's slice of the shared
// buffer onto the lower half using memcpy.
//   index/count - slot number and slot total; the buffer is cut into `count`
//                 consecutive slices and slot `index` gets the index-th one
//   data        - pointer to the memtest1_t describing the buffer
// Slice boundaries are rounded up to a multiple of 64 bytes and then clamped
// to the buffer size so a buffer whose size is not a multiple of 64 is never
// accessed past its end. With m the slice midpoint, m-h bytes are copied from
// offset m to offset h; source [m,2m-h) and destination [h,m) do not overlap,
// which is what memcpy requires. Always returns 0.
int memtest1_memcpy(int index,int count,void *data) {
	memtest1_t *mt=(memtest1_t*)data;
	size_t total=mt->buf_size;
	size_t h,t,m;
	h=(size_t)index*total/(size_t)count;     h=(h+63)/64*64;
	t=((size_t)index+1)*total/(size_t)count; t=(t+63)/64*64;
	if (h>total) h=total;
	if (t>total) t=total;
	m=(h+t)/2;
	memcpy((char*)mt->buf+h,(char*)mt->buf+m,m-h);
	return 0;
}

// Binary size units in bytes: KB=2^10, MB=2^20, GB=2^30.
enum { KB=1024, MB=1024*KB, GB=1024*MB };
// Benchmarks a buffer-filling worker on a freshly allocated `size`-byte
// buffer and prints the throughput for 1..n workers.
//   n    - largest worker count to try
//   size - buffer size in bytes
//   core - worker callback run through run_ncore() with the buffer descriptor
//   name - label printed in the header line
// An extra single-worker pass is run first and reported separately as
// "first time one thread" (presumably to absorb the cost of touching the
// freshly allocated pages — confirm). Throughput is size/dt in units of MB
// per second. If the buffer cannot be allocated the test is skipped; a
// run_ncore() failure skips only that line, because dt is not valid then.
void test_write(int n,size_t size,core_fn core,const char* name) {
	int i; double dt,rate;
	memtest1_t mt[1];
	if (memtest1_init(mt,size)) {
		fprintf(stderr,"%s: unable to allocate %.3fMb\n",name,(double)size/MB);
		return;
	}
	printf("%s %.3fMb\n",name,(double)size/MB);
	for(i=0;i<=n;i++) {
		int rc=run_ncore(i<1?1:i,core,mt,&dt);
		if (rc) {
			// run_ncore() leaves dt unwritten on failure
			fprintf(stderr,"\t%d\trun_ncore failed, rc=%d\n",i,rc);
			continue;
		}
		rate=size/dt;
		if (i==0)printf("\tfirst time one thread %.1fMb/s\n",rate/MB);
		else printf("\t%d\t%.1fMb/s\t%.2fms\n",i,rate/MB,dt*1e3);
	}
	memtest1_done(mt);
}

// Benchmarks a copying worker on a freshly allocated buffer of 2*size bytes
// and prints the throughput for 1..n workers.
//   n    - largest worker count to try
//   size - number of bytes counted as copied per run; the buffer is twice
//          this size (intended for workers such as memtest1_memcpy, which
//          copy one half of each slice onto the other half)
//   core - worker callback run through run_ncore() with the buffer descriptor
//   name - label printed in the header line
// An extra single-worker pass is run first and reported separately as
// "first time one thread". Throughput is size/dt in units of MB per second.
// If the buffer cannot be allocated the test is skipped; a run_ncore()
// failure skips only that line, because dt is not valid then.
void test_copy(int n,size_t size,core_fn core,const char* name) {
	int i; double dt,rate;
	memtest1_t mt[1];
	if (memtest1_init(mt,2*size)) {
		fprintf(stderr,"%s: unable to allocate %.3fMb\n",name,2.0*(double)size/MB);
		return;
	}
	printf("%s %.3fMb->%.3fMb\n",name,(double)size/MB,(double)size/MB);
	for(i=0;i<=n;i++) {
		int rc=run_ncore(i<1?1:i,core,mt,&dt);
		if (rc) {
			// run_ncore() leaves dt unwritten on failure
			fprintf(stderr,"\t%d\trun_ncore failed, rc=%d\n",i,rc);
			continue;
		}
		rate=size/dt;
		if (i==0)printf("\tfirst time one thread %.1fMb/s\n",rate/MB);
		else printf("\t%d\t%.1fMb/s\t%.2fms\n",i,rate/MB,dt*1e3);
	}
	memtest1_done(mt);
}

// Parameters of the threading speedup test, passed to test0_core() through
// the opaque `data` pointer.
typedef struct {
	size_t count; // total number of test0_core0() evaluations, divided among the workers
} test0_t;

// Fixed-cost CPU kernel for the speedup test: XOR-folds (step - x) for
// step = 0..1899 into an int and returns it. The value only serves to give
// each call a data-dependent result; the amount of work does not depend on x.
int test0_core0(size_t x)
{
	enum { ROUNDS = 1900 };
	int acc = 0;
	int step = 0;

	while (step < ROUNDS) {
		acc ^= step - x;
		step++;
	}
	return acc;
}
// core_fn worker for the speedup test: evaluates test0_core0(i) for every i
// in this worker's share of [0, ts->count) and returns the sum.
//   index/count - slot number and slot total; slot `index` handles
//                 i in [index*N/count, (index+1)*N/count) with N = ts->count
//   data        - pointer to the test0_t holding the total count
// The sum is accumulated in unsigned arithmetic so that it wraps around
// instead of triggering signed-overflow undefined behaviour when the total
// grows beyond INT_MAX; the returned value is the wrapped sum.
int test0_core(int index,int count,void* data) {
	size_t h,t,i; unsigned acc=0;
	test0_t *ts=(test0_t*)data;

	h=(size_t)index*ts->count/(size_t)count;
	t=((size_t)index+1)*ts->count/(size_t)count;
	for(i=h;i<t;i++) {
		acc+=(unsigned)test0_core0(i);
	}
	return (int)acc;
}

// Threading speedup test: runs test0_core over `count` work items with
// 1..n workers and prints, for each worker count i, the processing rate
// (items per second, in millions), the elapsed time, and alpha.
//   n     - largest worker count to try
//   count - total number of work items per run
// With k = rate(1)/rate(i), alpha = (k*i-1)/(i-1) is the serial fraction
// implied by the measured speedup (0 means perfect scaling); it is reported
// as 0 for i==1 and whenever the single-worker baseline is unavailable.
// A run_ncore() failure skips that line, because dt is not valid then.
void test0(int n,size_t count) {
	test0_t ts[1]; int i;
	double dt,rate,rate1=0,alpha,k;
	ts->count=count;
	printf("threading speedup test\n");
	for(i=1;i<=n;i++) {
		int rc=run_ncore(i,test0_core,ts,&dt);
		if (rc) {
			// run_ncore() leaves dt unwritten on failure
			fprintf(stderr,"\t%d\trun_ncore failed, rc=%d\n",i,rc);
			continue;
		}
		rate=count/dt;
		if (i==1) rate1=rate;
		k=rate1/rate;
		alpha=(i>1 && rate1>0) ? (k*i-1)/(i-1) : 0;
		printf("\t%d\t%.1fMHz/s\t%.2fms\t%.4f\n",i,rate/1000000,dt*1e3,alpha);
	}
}

// Entry point. Optional argv[1] is the largest worker count to benchmark
// (default 8, clamped to 1..256; non-numeric input counts as 0 and therefore
// becomes 1). Runs the threading speedup test followed by the memset,
// rep stosq, memcpy and memmove throughput tests on 512 MB of data each.
int main(int argc,char** argv) {
	int n=8; enum { N=512, M=200000 };
	if (argc>1) {
		// strtol saturates on out-of-range input instead of invoking the
		// undefined behaviour atoi has there; the clamp then bounds it
		long v=strtol(argv[1],NULL,10);
		if (v<1) v=1;
		if (v>256) v=256;
		n=(int)v;
	}
	// set run_ncore_en_stat=1 here to print per-thread timing details
	test0(n,M);
	test_write(n,N*MB,memtest1_memset,"memset");
	test_write(n,N*MB,memtest1_memset8,"rep stosq");
	test_copy(n,N*MB,memtest1_memcpy,"memcpy");
	test_copy(n,N*MB,memtest1_memmove,"memmove");
	return 0;
}

