#include "stdafx.h"
#include <stdio.h>
#include <tchar.h>
#include <windows.h>

// Bundle of the four values used by the benchmark. The member types and
// order mirror the parameter list of foo2 (int, int*, long, char*), so the
// same data can be passed either as one struct or as four separate arguments.
struct FOO{
	int a;      // first value
	int *b;     // second value (pointer)
	long c;     // third value
	char *d;    // fourth value (pointer to char)
};

// Benchmark target: takes all four values by value as a single FOO.
// The format string is empty, so printf writes nothing; the four members are
// still passed to it as (ignored) extra arguments.
// __declspec(noinline) asks the compiler not to inline this function, so the
// call sequence at the call site is what gets measured.
__declspec(noinline)
void __stdcall foo1(FOO h){
	printf("", h.a, h.b, h.c, h.d);
}

// Benchmark target: takes the same four values as foo1, but as four separate
// parameters instead of one FOO.
// The format string is empty, so printf writes nothing; the parameters are
// still passed to it as (ignored) extra arguments.
// __declspec(noinline) asks the compiler not to inline this function.
__declspec(noinline)
void __stdcall foo2(int a, int *b, long c, char *d){
	printf("", a, b, c, d);
}

// Function-pointer type with foo1's signature (one FOO by value, __stdcall).
// Used in _tmain to call foo2 through a foo1-shaped pointer.
typedef void (__stdcall *FOO1)(FOO h);
// Function-pointer type with a four-argument, foo2-like signature.
// NOTE(review): the second parameter is void* here while foo2 declares int* —
// confirm whether the difference is intentional.
typedef void (__stdcall *FOO2)(int a, void *b, long c, char *d);

double QPFInit(){
	LARGE_INTEGER li;
	QueryPerformanceFrequency(&li);
	return (double)li.QuadPart;
}

double QPT(){
	static double qpf = QPFInit();
	LARGE_INTEGER li;
	QueryPerformanceCounter(&li);
	return li.QuadPart / qpf;
}

// Number of func() calls that bench times together as one sample (inner loop).
static const size_t cnt1 = 10000;
// Number of samples bench collects (outer loop); also the length of times[].
static const size_t cnt2 = 10000;
// Per-call duration of each sample; filled and then sorted in place by bench.
static double times[cnt2];

// qsort comparator for doubles, ascending order.
// v1, v2: pointers to the two double values being compared.
// Returns -1 if *v1 < *v2, 1 if *v1 > *v2, and 0 otherwise (which includes
// the case where either value is NaN, since both comparisons are then false).
int compare(const void *v1, const void *v2){
	const double lhs = *(const double*)v1;
	const double rhs = *(const double*)v2;
	// Each comparison yields 0 or 1, so the difference is -1, 0 or 1.
	return (lhs > rhs) - (lhs < rhs);
}

// Times func and prints "<name>: <result>" on one line.
//
// Collects cnt2 samples; each sample is the elapsed QPT() time of cnt1
// back-to-back calls to func, divided by cnt1 (i.e. time per call). The
// samples are sorted and only the middle half (from index cnt2/4, cnt2/2
// entries) is averaged, so the smallest and largest quarters are discarded.
//
// name: label printed before the result.
// func: function to call repeatedly.
// Returns the averaged per-call time that was printed.
double bench(const char *name, void (*func)()){
	printf("%s: ", name);

	for(size_t sample = 0; sample < cnt2; ++sample){
		const double began = QPT();
		for(size_t call = 0; call < cnt1; ++call){
			func();
		}
		const double ended = QPT();
		times[sample] = (ended - began) / cnt1;
	}

	qsort(times, cnt2, sizeof times[0], compare);

	// Average the middle half of the sorted samples.
	const size_t half = cnt2 / 2;
	const size_t first = cnt2 / 4;
	const size_t last = first + half;
	double mean = 0;
	for(size_t i = first; i < last; ++i){
		mean += times[i];
	}
	mean /= half;

	printf("%g\n", mean);
	return mean;
}

// Writable backing storage for h.d. FOO::d is a non-const char*, and C++11
// no longer allows a string literal to convert to char* (MSVC rejects it
// under /Zc:strictStrings and /permissive-), so the text lives in an array.
static char h_text[] = "efd";

// The single argument set shared by all three benchmarked calls.
// h.b points at h's own first member.
FOO h = { 6, &h.a, 8, h_text, };

// Baseline benchmark body: calls printf directly with the same empty format
// string and the same four values that foo1/foo2 forward to it.
void printf_test(){
	printf("", h.a, h.b, h.c, h.d);
}

// Benchmark body: passes the global h to foo1 by value as one struct.
void foo1_test(){
	foo1(h);
}

// Benchmark body: passes the four members of the global h to foo2 as
// separate arguments.
void foo2_test(){
	foo2(h.a, h.b, h.c, h.d);
}

// Entry point: runs the three benchmarks in order (printf, foo1, foo2).
// bench prints each result itself; the returned times are stored in locals
// that are not used afterwards.
int _tmain(int argc, _TCHAR* argv[]){
	
	auto printf_time = bench("printf", printf_test);
	auto foo1_time = bench("foo1", foo1_test);
	auto foo2_time = bench("foo2", foo2_test);

	// Toggle for the experiment recorded in the notes at the end of this file:
	// foo2 is called through a pointer cast to foo1's signature (FOO1).
	//foo1(h);
	((FOO1)foo2)(h);	// with this line present, foo2 gets merged into foo1

	//foo2(h.a, h.b, h.c, h.d);
	//((FOO2)foo1)(h.a, h.b, h.c, h.d);

	return 0;
}

/*

---------------------------------------------------------------------------
Release build
((FOO1)foo2)(h);	// with this line present, foo2 is merged into foo1


// Body of the merged foo1/foo2 function
__declspec(noinline)
void __stdcall foo2(int a, int *b, long c, char *d){
00401000 55                   push        ebp  
00401001 8B EC                mov         ebp,esp  
	printf("", a, b, c, d);
00401003 8B 45 14             mov         eax,dword ptr [ebp+14h]  
00401006 8B 4D 10             mov         ecx,dword ptr [ebp+10h]  
00401009 8B 55 0C             mov         edx,dword ptr [ebp+0Ch]  
0040100C 50                   push        eax  
0040100D 8B 45 08             mov         eax,dword ptr [h]  
00401010 51                   push        ecx  
00401011 52                   push        edx  
00401012 50                   push        eax  
00401013 68 F8 20 40 00       push        offset string "" (4020F8h)  
00401018 FF 15 A4 20 40 00    call        dword ptr [__imp__printf (4020A4h)]  
0040101E 83 C4 14             add         esp,14h  
}
00401021 5D                   pop         ebp  
00401022 C2 10 00             ret         10h 


// Call to foo1
	foo1(h);
004011D0 8B 0D 18 30 40 00    mov         ecx,dword ptr [h (403018h)]  
004011D6 8B 15 1C 30 40 00    mov         edx,dword ptr [h+4 (40301Ch)]  
004011DC 83 EC 10             sub         esp,10h  
004011DF 8B C4                mov         eax,esp  
004011E1 89 08                mov         dword ptr [eax],ecx  
004011E3 8B 0D 20 30 40 00    mov         ecx,dword ptr [h+8 (403020h)]  
004011E9 89 50 04             mov         dword ptr [eax+4],edx  
004011EC 8B 15 24 30 40 00    mov         edx,dword ptr [h+0Ch (403024h)]  
004011F2 89 48 08             mov         dword ptr [eax+8],ecx  
004011F5 89 50 0C             mov         dword ptr [eax+0Ch],edx  
004011F8 E8 03 FE FF FF       call        foo1 (401000h)  


// Call to foo2
	foo2(h.a, h.b, h.c, h.d);
00401200 A1 24 30 40 00       mov         eax,dword ptr [h+0Ch (403024h)]  
00401205 8B 0D 20 30 40 00    mov         ecx,dword ptr [h+8 (403020h)]  
0040120B 8B 15 1C 30 40 00    mov         edx,dword ptr [h+4 (40301Ch)]  
00401211 50                   push        eax  
00401212 A1 18 30 40 00       mov         eax,dword ptr [h (403018h)]  
00401217 51                   push        ecx  
00401218 52                   push        edx  
00401219 50                   push        eax  
0040121A E8 E1 FD FF FF       call        foo1 (401000h)  


printf: 1.1886e-007
foo1: 1.20982e-007
foo2: 1.21645e-007

The foo1 call, which stores its arguments on the stack with mov, is faster (by a small margin in the figures above).


---------------------------------------------------------------------------
Release build
//((FOO1)foo2)(h);	// without this line, foo2 is not merged into foo1


// foo1, not merged
__declspec(noinline)
void __stdcall foo1(FOO h){
00401000 55                   push        ebp  
00401001 8B EC                mov         ebp,esp  
	printf("", h.a, h.b, h.c, h.d);
00401003 8B 45 14             mov         eax,dword ptr [ebp+14h]  
00401006 8B 4D 10             mov         ecx,dword ptr [ebp+10h]  
00401009 8B 55 0C             mov         edx,dword ptr [ebp+0Ch]  
0040100C 50                   push        eax  
0040100D 8B 45 08             mov         eax,dword ptr [h]  
00401010 51                   push        ecx  
00401011 52                   push        edx  
00401012 50                   push        eax  
00401013 68 F8 20 40 00       push        offset string "" (4020F8h)  
00401018 FF 15 A4 20 40 00    call        dword ptr [__imp__printf (4020A4h)]  
0040101E 83 C4 14             add         esp,14h  
}
00401021 5D                   pop         ebp  
00401022 C2 10 00             ret         10h  


// foo2, not merged
__declspec(noinline)
void __stdcall foo2(int a, int *b, long c, char *d){
00401030 55                   push        ebp  
00401031 8B EC                mov         ebp,esp  
	printf("", a, b, c, d);
00401033 50                   push        eax  
00401034 8B 45 08             mov         eax,dword ptr [a]  
00401037 51                   push        ecx  
00401038 52                   push        edx  
00401039 50                   push        eax  
0040103A 68 F8 20 40 00       push        offset string "" (4020F8h)  
0040103F FF 15 A4 20 40 00    call        dword ptr [__imp__printf (4020A4h)]  
00401045 83 C4 14             add         esp,14h  
}
00401048 5D                   pop         ebp  
00401049 C2 04 00             ret         4  


// Call to foo1
	foo1(h);
004011F0 8B 0D 18 30 40 00    mov         ecx,dword ptr [h (403018h)]  
004011F6 8B 15 1C 30 40 00    mov         edx,dword ptr [h+4 (40301Ch)]  
004011FC 83 EC 10             sub         esp,10h  
004011FF 8B C4                mov         eax,esp  
00401201 89 08                mov         dword ptr [eax],ecx  
00401203 8B 0D 20 30 40 00    mov         ecx,dword ptr [h+8 (403020h)]  
00401209 89 50 04             mov         dword ptr [eax+4],edx  
0040120C 8B 15 24 30 40 00    mov         edx,dword ptr [h+0Ch (403024h)]  
00401212 89 48 08             mov         dword ptr [eax+8],ecx  
00401215 89 50 0C             mov         dword ptr [eax+0Ch],edx  
00401218 E8 E3 FD FF FF       call        foo1 (401000h)  


// Call to foo2
	foo2(h.a, h.b, h.c, h.d);
00401220 A1 18 30 40 00       mov         eax,dword ptr [h (403018h)]  
00401225 8B 0D 20 30 40 00    mov         ecx,dword ptr [h+8 (403020h)]  
0040122B 8B 15 1C 30 40 00    mov         edx,dword ptr [h+4 (40301Ch)]  
00401231 50                   push        eax  
00401232 A1 24 30 40 00       mov         eax,dword ptr [h+0Ch (403024h)]  
00401237 E8 F4 FD FF FF       call        foo2 (401030h)  


printf: 1.1888e-007
foo1: 1.21176e-007
foo2: 1.13223e-007

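foo1 and foo2 are different functions here, so this does not benchmark the calling sequence alone.
The foo2 function itself is faster; for some reason it is even faster than the direct printf call.
(Note: in the listing above, foo2 pushes eax/ecx/edx without loading them and ends with "ret 4", and its call site pushes only one argument, so the compiler appears to have passed three of the arguments in registers.)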


---------------------------------------------------------------------------
Debug build

printf: 1.94619e-007
foo1: 2.12508e-007
foo2: 2.10874e-007

foo1 and foo2 are different functions here, so this does not benchmark the calling sequence.

*/
