#include <windows.h>
#include <stdio.h>
class StopWatch {
LONGLONG start_time,lap_time,freq;
public:
StopWatch() {
freq = 0;
}
~StopWatch() {
}
bool Setup() {
if (! QueryPerformanceFrequency((LARGE_INTEGER*)&freq)) {
fprintf(stderr, "このハードウェアはパフォーマンスカウンタをサポートしていない為、計測できません。\n");
return false;
}
return true;
}
void Start() {
QueryPerformanceCounter((LARGE_INTEGER*)&start_time);
}
double LapTime() {
QueryPerformanceCounter((LARGE_INTEGER*)&lap_time);
return (double)(lap_time-start_time)/(double)freq;
}
};
class CpuFreq {
StopWatch stopwatch;
double freq;
public:
CpuFreq() {
}
~CpuFreq() {
}
bool Setup() {
if (!stopwatch.Setup()) {
return false;
}
bool except_flag = false;
__try {
__asm {
rdtsc
}
} __except (true) {
except_flag = true;
}
if (except_flag) {
fprintf(stderr, "このCPUはrdtsc命令をサポートしていない為、計測できません。\n");
return false;
}
HANDLE process = GetCurrentProcess();
int i;
double maxf=0.0;
for (i=0; i<5; ++i) {
Sleep(10);
SetPriorityClass(process, REALTIME_PRIORITY_CLASS);
LONGLONG startcount, lapcount;
double laptime;
stopwatch.Start();
__asm {
rdtsc
mov dword ptr startcount, eax
mov dword ptr startcount+4, edx
}
while ((laptime=stopwatch.LapTime()) < 0.1);
__asm {
rdtsc
mov dword ptr lapcount, eax
mov dword ptr lapcount+4, edx
}
SetPriorityClass(process, NORMAL_PRIORITY_CLASS);
double f = (double)(lapcount-startcount) / laptime;
if (f > maxf) maxf = f;
}
freq = maxf;
return true;
}
double GetFreq() {
return freq;
}
};
// NOTE(review): presumably CPUID(1).EDX feature bits (23 = MMX, 25 = SSE, 26 = SSE2);
// they are not referenced by the code visible in this file.
#define MMX_FLAG 0x00800000
#define SSE_FLAG 0x02000000
#define SSE2_FLAG 0x04000000
// Size in bytes of the largest window that is measured.
#define BUFSIZE (32*1024*1024)
// Backing storage: BUFSIZE bytes plus 256 bytes of slack for alignment.
int buf[(BUFSIZE+256)/4];
// buf rounded up to the next 256-byte boundary. The arithmetic is done in size_t
// rather than int so that the address is not truncated where pointers are wider than int.
int *bufa = (int*)((((size_t)buf)+255)&~(size_t)255);
// Fills the first `size` bytes of bufa with a fixed pattern.
//   mode 0: every 32-bit word is set to -1
//   mode 1: every float is set to -1.0f
//   mode 2: every double is set to -1.0
// Any other mode leaves the buffer untouched.
void mk_buf(int size, int mode) {
	const int words = size / 4;
	if (mode == 0) {
		int *dst = bufa;
		for (int k = 0; k < words; ++k) {
			dst[k] = -1;
		}
	} else if (mode == 1) {
		float *dst = (float*)bufa;
		for (int k = 0; k < words; ++k) {
			dst[k] = -1.0f;
		}
	} else if (mode == 2) {
		// One double spans two 32-bit words.
		double *dst = (double*)bufa;
		for (int k = 0; k * 2 < words; ++k) {
			dst[k] = -1.0;
		}
	}
}
// Times 32-bit integer loads over the first `range` bytes of bufa.
//
// count: number of outer passes. Each pass runs the inner loop 1024*1024/(4*8*2)
//        times with 16 four-byte loads per iteration, i.e. 1 MiB of loads per pass.
//        The counter is decremented before it is tested, so count must be >= 1.
// range: size in bytes of the window that is read. Offsets are wrapped with
//        AND (range-1), which is only a true modulo when range is a power of two.
//
// Result: rdtsc value after the loops minus the rdtsc value before them. There is no
// return statement; the asm block leaves the 64-bit difference in EDX:EAX, the register
// pair used for 64-bit return values on x86 MSVC.
LONGLONG read_int(int count,int range) {
// LONGLONG c;   (disabled: the start timestamp is kept at [esp] instead)
__asm {
// esi = base address of the aligned buffer
mov esi,bufa
// ebx = range-1, the wrap mask applied to both offsets
mov ebx,range
sub ebx,1
// ecx = outer pass counter (read before ebp is repurposed)
mov ecx,count
// ebp is the destination of every load below, so it is saved here and restored at the end.
// NOTE(review): presumably the timestamp lives on the stack because named locals
// would be addressed through ebp - confirm against the generated code.
push ebp
sub esp,8
// start timestamp -> [esp] (low half), [esp+4] (high half)
rdtsc
mov dword ptr [esp],eax
mov dword ptr [esp+4],edx
// edx and edi are two byte offsets advancing by 64 per iteration. edi starts at -32 so
// that it becomes 32 after its first add: edx covers bytes 0..31 and edi bytes 32..63
// of every 64-byte step.
xor edx,edx
mov edi,-4*8
loop1:
// eax = inner iterations for one 1 MiB pass
mov eax,1024*1024/(4*8*2)
align 16
loop2:
mov ebp,[esi+edx+4*0]
mov ebp,[esi+edx+4*1]
// advance and wrap edi in between the edx loads
add edi,4*8*2
and edi,ebx
mov ebp,[esi+edx+4*2]
mov ebp,[esi+edx+4*3]
mov ebp,[esi+edx+4*4]
mov ebp,[esi+edx+4*5]
mov ebp,[esi+edx+4*6]
mov ebp,[esi+edx+4*7]
mov ebp,[esi+edi+4*0]
mov ebp,[esi+edi+4*1]
// advance and wrap edx in between the edi loads
add edx,4*8*2
and edx,ebx
mov ebp,[esi+edi+4*2]
mov ebp,[esi+edi+4*3]
mov ebp,[esi+edi+4*4]
mov ebp,[esi+edi+4*5]
mov ebp,[esi+edi+4*6]
mov ebp,[esi+edi+4*7]
sub eax,1
jne loop2
sub ecx,1
jne loop1
// edx:eax = end timestamp - start timestamp (the return value)
rdtsc
sub eax,dword ptr [esp]
sbb edx,dword ptr [esp+4]
// release the timestamp slot and restore ebp
add esp,8
pop ebp
}
}
// Times 64-bit x87 loads (fld qword) over the first `range` bytes of bufa.
//
// count: number of outer passes. Each pass runs the inner loop 1024*1024/(8*8*2)
//        times with 16 eight-byte loads per iteration, i.e. 1 MiB of loads per pass.
//        The counter is decremented before it is tested, so count must be >= 1.
// range: size in bytes of the window that is read. Offsets are wrapped with
//        AND (range-1), which is only a true modulo when range is a power of two.
//
// Result: rdtsc value after the loops minus the rdtsc value before them. There is no
// return statement; the asm block leaves the 64-bit difference in EDX:EAX, the register
// pair used for 64-bit return values on x86 MSVC.
LONGLONG read_x87(int count,int range) {
// start timestamp
LONGLONG c;
__asm {
// esi = buffer base, ebx = range-1 (wrap mask)
mov esi,bufa
mov ebx,range
sub ebx,1
// c = start timestamp
rdtsc
mov dword ptr c,eax
mov dword ptr c+4,edx
// edx and edi are two byte offsets advancing by 128 per iteration. edi starts at -64 so
// that it becomes 64 after its first add: edx covers bytes 0..63 and edi bytes 64..127
// of every 128-byte step.
xor edx,edx
mov edi,-8*8
// ecx = outer pass counter
mov ecx,count
loop1:
// eax = inner iterations for one 1 MiB pass
mov eax,1024*1024/(8*8*2)
align 16
loop2:
// each fld is followed by fstp st(0), which pops the value just pushed, so the
// x87 register stack stays balanced
fld qword ptr [esi+edx+8*0]
fstp st(0)
fld qword ptr [esi+edx+8*1]
fstp st(0)
// advance and wrap edi in between the edx loads
add edi,8*8*2
and edi,ebx
fld qword ptr [esi+edx+8*2]
fstp st(0)
fld qword ptr [esi+edx+8*3]
fstp st(0)
fld qword ptr [esi+edx+8*4]
fstp st(0)
fld qword ptr [esi+edx+8*5]
fstp st(0)
fld qword ptr [esi+edx+8*6]
fstp st(0)
fld qword ptr [esi+edx+8*7]
fstp st(0)
fld qword ptr [esi+edi+8*0]
fstp st(0)
fld qword ptr [esi+edi+8*1]
fstp st(0)
// advance and wrap edx in between the edi loads
add edx,8*8*2
and edx,ebx
fld qword ptr [esi+edi+8*2]
fstp st(0)
fld qword ptr [esi+edi+8*3]
fstp st(0)
fld qword ptr [esi+edi+8*4]
fstp st(0)
fld qword ptr [esi+edi+8*5]
fstp st(0)
fld qword ptr [esi+edi+8*6]
fstp st(0)
fld qword ptr [esi+edi+8*7]
fstp st(0)
sub eax,1
jne loop2
sub ecx,1
jne loop1
// edx:eax = end timestamp - start timestamp (the return value)
rdtsc
sub eax,dword ptr c
sbb edx,dword ptr c+4
}
}
// Times 64-bit MMX loads (movq mm0) over the first `range` bytes of bufa.
//
// count: number of outer passes. Each pass runs the inner loop 1024*1024/(8*8*2)
//        times with 16 eight-byte loads per iteration, i.e. 1 MiB of loads per pass.
//        The counter is decremented before it is tested, so count must be >= 1.
// range: size in bytes of the window that is read. Offsets are wrapped with
//        AND (range-1), which is only a true modulo when range is a power of two.
//
// Result: rdtsc value after the loops minus the rdtsc value before them. There is no
// return statement; the asm block leaves the 64-bit difference in EDX:EAX, the register
// pair used for 64-bit return values on x86 MSVC.
LONGLONG read_mmx(int count,int range) {
// start timestamp
LONGLONG c;
__asm {
// esi = buffer base, ebx = range-1 (wrap mask)
mov esi,bufa
mov ebx,range
sub ebx,1
// c = start timestamp
rdtsc
mov dword ptr c,eax
mov dword ptr c+4,edx
// edx and edi are two byte offsets advancing by 128 per iteration. edi starts at -64 so
// that it becomes 64 after its first add: edx covers bytes 0..63 and edi bytes 64..127
// of every 128-byte step.
xor edx,edx
mov edi,-8*8
// ecx = outer pass counter
mov ecx,count
loop1:
// eax = inner iterations for one 1 MiB pass
mov eax,1024*1024/(8*8*2)
align 16
loop2:
movq mm0,[esi+edx+8*0]
movq mm0,[esi+edx+8*1]
// advance and wrap edi in between the edx loads
add edi,8*8*2
and edi,ebx
movq mm0,[esi+edx+8*2]
movq mm0,[esi+edx+8*3]
movq mm0,[esi+edx+8*4]
movq mm0,[esi+edx+8*5]
movq mm0,[esi+edx+8*6]
movq mm0,[esi+edx+8*7]
movq mm0,[esi+edi+8*0]
movq mm0,[esi+edi+8*1]
// advance and wrap edx in between the edi loads
add edx,8*8*2
and edx,ebx
movq mm0,[esi+edi+8*2]
movq mm0,[esi+edi+8*3]
movq mm0,[esi+edi+8*4]
movq mm0,[esi+edi+8*5]
movq mm0,[esi+edi+8*6]
movq mm0,[esi+edi+8*7]
sub eax,1
jne loop2
sub ecx,1
jne loop1
// edx:eax = end timestamp - start timestamp (the return value)
rdtsc
sub eax,dword ptr c
sbb edx,dword ptr c+4
// leave MMX state so that x87 code can run afterwards; issued after the second
// rdtsc, so it is outside the timed interval
emms
}
}
// Times 128-bit SSE loads (movaps xmm0) over the first `range` bytes of bufa.
//
// count: number of outer passes. Each pass runs the inner loop 1024*1024/(16*8*2)
//        times with 16 sixteen-byte loads per iteration, i.e. 1 MiB of loads per pass.
//        The counter is decremented before it is tested, so count must be >= 1.
// range: size in bytes of the window that is read. Offsets are wrapped with
//        AND (range-1), which is only a true modulo when range is a power of two.
//
// Result: rdtsc value after the loops minus the rdtsc value before them. There is no
// return statement; the asm block leaves the 64-bit difference in EDX:EAX, the register
// pair used for 64-bit return values on x86 MSVC.
LONGLONG read_sse(int count,int range) {
// start timestamp
LONGLONG c;
__asm {
// esi = buffer base, ebx = range-1 (wrap mask)
mov esi,bufa
mov ebx,range
sub ebx,1
// c = start timestamp
rdtsc
mov dword ptr c,eax
mov dword ptr c+4,edx
// edx and edi are two byte offsets advancing by 256 per iteration. edi starts at -128 so
// that it becomes 128 after its first add: edx covers bytes 0..127 and edi bytes 128..255
// of every 256-byte step.
xor edx,edx
mov edi,-16*8
// ecx = outer pass counter
mov ecx,count
loop1:
// eax = inner iterations for one 1 MiB pass
mov eax,1024*1024/(16*8*2)
align 16
loop2:
// movaps is the aligned form: every address used here is bufa plus a multiple of 16
movaps xmm0,[esi+edx+16*0]
movaps xmm0,[esi+edx+16*1]
// advance and wrap edi in between the edx loads
add edi,16*8*2
and edi,ebx
movaps xmm0,[esi+edx+16*2]
movaps xmm0,[esi+edx+16*3]
movaps xmm0,[esi+edx+16*4]
movaps xmm0,[esi+edx+16*5]
movaps xmm0,[esi+edx+16*6]
movaps xmm0,[esi+edx+16*7]
movaps xmm0,[esi+edi+16*0]
movaps xmm0,[esi+edi+16*1]
// advance and wrap edx in between the edi loads
add edx,16*8*2
and edx,ebx
movaps xmm0,[esi+edi+16*2]
movaps xmm0,[esi+edi+16*3]
movaps xmm0,[esi+edi+16*4]
movaps xmm0,[esi+edi+16*5]
movaps xmm0,[esi+edi+16*6]
movaps xmm0,[esi+edi+16*7]
sub eax,1
jne loop2
sub ecx,1
jne loop1
// edx:eax = end timestamp - start timestamp (the return value)
rdtsc
sub eax,dword ptr c
sbb edx,dword ptr c+4
}
}
// Times 256-bit AVX loads (vmovaps ymm0) over the first `range` bytes of bufa.
//
// count: number of outer passes. Each pass runs the inner loop 1024*1024/(32*8*2)
//        times with 16 thirty-two-byte loads per iteration, i.e. 1 MiB of loads per pass.
//        The counter is decremented before it is tested, so count must be >= 1.
// range: size in bytes of the window that is read. Offsets are wrapped with
//        AND (range-1), which is only a true modulo when range is a power of two.
//
// Result: rdtsc value after the loops minus the rdtsc value before them. There is no
// return statement; the asm block leaves the 64-bit difference in EDX:EAX, the register
// pair used for 64-bit return values on x86 MSVC.
LONGLONG read_avx(int count,int range) {
	// start timestamp
	LONGLONG c;
	__asm {
		// esi = buffer base, ebx = range-1 (wrap mask)
		mov esi,bufa
		mov ebx,range
		sub ebx,1

		// c = start timestamp
		rdtsc
		mov dword ptr c,eax
		mov dword ptr c+4,edx

		// edx and edi are two byte offsets advancing by 512 per iteration. edi starts at
		// -256 so that it becomes 256 after its first add: edx covers bytes 0..255 and
		// edi bytes 256..511 of every 512-byte step.
		xor edx,edx
		mov edi,-32*8

		// ecx = outer pass counter
		mov ecx,count
		loop1:
			// eax = inner iterations for one 1 MiB pass
			mov eax,1024*1024/(32*8*2)
			align 16
			loop2:
				// vmovaps is the aligned form: every address is bufa plus a multiple of 32
				vmovaps ymm0,[esi+edx+32*0]
				vmovaps ymm0,[esi+edx+32*1]
				// advance and wrap edi in between the edx loads
				add edi,32*8*2
				and edi,ebx
				vmovaps ymm0,[esi+edx+32*2]
				vmovaps ymm0,[esi+edx+32*3]
				vmovaps ymm0,[esi+edx+32*4]
				vmovaps ymm0,[esi+edx+32*5]
				vmovaps ymm0,[esi+edx+32*6]
				vmovaps ymm0,[esi+edx+32*7]

				vmovaps ymm0,[esi+edi+32*0]
				vmovaps ymm0,[esi+edi+32*1]
				// advance and wrap edx in between the edi loads
				add edx,32*8*2
				and edx,ebx
				vmovaps ymm0,[esi+edi+32*2]
				vmovaps ymm0,[esi+edi+32*3]
				vmovaps ymm0,[esi+edi+32*4]
				vmovaps ymm0,[esi+edi+32*5]
				vmovaps ymm0,[esi+edi+32*6]
				vmovaps ymm0,[esi+edi+32*7]

			sub eax,1
			jne loop2

		sub ecx,1
		jne loop1

		// edx:eax = end timestamp - start timestamp (the return value)
		rdtsc
		sub eax,dword ptr c
		sbb edx,dword ptr c+4

		// Clear the upper halves of the ymm registers so that legacy SSE code executed
		// after this function (including the movaps loop in read_sse) does not pay the
		// AVX/SSE transition penalty. Issued after the second rdtsc, so it is outside
		// the timed interval; it does not modify eax, edx or the flags.
		vzeroupper
	}
}
// Entry point: calibrates the time-stamp counter rate, then prints a table of load
// bandwidth for each kernel (columns) over window sizes from 1 KB to BUFSIZE, doubling
// each row. Every kernel pass reads 1 MiB, so rate / (counts per pass) is MB per second.
//
// Declared as int main() (void main is a compiler extension); the exit status is 0 on
// every path, as before.
int main() {
	printf("load 帯域 計測ツール v0.4+(AVXサポートCPU専用)\n");
	Sleep(1000);

	CpuFreq cpufreq;
	if (! cpufreq.Setup()) {
		return 0;
	}
	printf("CPU動作クロック : %.1f MHz\n", cpufreq.GetFreq() / 1000000.0);

	HANDLE process = GetCurrentProcess();

	// The three tables are sized by the same constant so that they cannot drift apart:
	// too many initializers is a compile error, too few are zero-filled.
	enum { NUM_TESTS = 5 };
	LONGLONG (*test_func[NUM_TESTS])(int,int) = {read_int, read_x87, read_mmx, read_sse, read_avx};
	const char *test_name[NUM_TESTS] = {" Int32bit"," Float64bit", " MMX64bit", " SSE128bit", " AVX256bit"};
	// mk_buf fill mode per kernel. The table used to have only four entries, so the AVX
	// column read past its end; AVX now uses the same float fill as SSE.
	int test_mode[NUM_TESTS] = {0,2,0,1,1};

	printf("\n");
	printf("アクセス範囲 ");
	for (int f=0; f<NUM_TESTS; ++f) {
		printf(" %s", test_name[f]);
	}
	printf("\n");

	for (int size=1024; size<=BUFSIZE; size+=size) {
		printf(" %5dKB :", size/1024);
		for (int f=0; f<NUM_TESTS; ++f) {
			mk_buf(size, test_mode[f]);
			// Best (smallest) count per pass over five measurements.
			double minclk = 100000000000.0;
			for (int i=0; i<5; ++i) {
				double clk;
				int count = 10;
				// Double the pass count until one timed run lasts at least 0.1 s.
				for (;;) {
					Sleep(10);
					SetPriorityClass(process, REALTIME_PRIORITY_CLASS);
					test_func[f](1, size);	// untimed warm-up pass
					clk = (double)test_func[f](count, size);
					SetPriorityClass(process, NORMAL_PRIORITY_CLASS);
					if (clk/cpufreq.GetFreq() >= 0.1) break;
					count *= 2;
				}
				clk /= count;
				if (clk < minclk) minclk = clk;
			}
			printf(" %6.0f MB/S", cpufreq.GetFreq()/minclk);
		}
		printf("\n");
	}

	// NOTE(review): fflush on an input stream is not defined by the C standard; this
	// relies on the Microsoft CRT behaviour of discarding buffered input.
	fflush(stdin);
	fprintf(stderr, "\n終了します。Enterキーを押してください : ");
	scanf("%*c");
	return 0;
}
#include <windows.h>
#include <stdio.h>

class StopWatch {
	LONGLONG start_time,lap_time,freq;
public:
	StopWatch() {
		freq = 0;
	}
	~StopWatch() {
	}
	bool Setup() {
		if (! QueryPerformanceFrequency((LARGE_INTEGER*)&freq)) {
			fprintf(stderr, "このハードウェアはパフォーマンスカウンタをサポートしていない為、計測できません。\n");
			return false;
		}
		return true;
	}
	void Start() {
		QueryPerformanceCounter((LARGE_INTEGER*)&start_time);
	}
	double LapTime() {
		QueryPerformanceCounter((LARGE_INTEGER*)&lap_time);
		return (double)(lap_time-start_time)/(double)freq;
	}
};

class CpuFreq {
	StopWatch stopwatch;
	double freq;
public:
	CpuFreq() {
	}
	~CpuFreq() {
	}
	bool Setup() {
		if (!stopwatch.Setup()) {
			return false;
		}
		bool except_flag = false;
		__try {
			__asm {
				rdtsc
			}
		} __except (true) {
			except_flag = true;
		}
		if (except_flag) {
			fprintf(stderr, "このCPUはrdtsc命令をサポートしていない為、計測できません。\n");
			return false;
		}
		
		HANDLE process = GetCurrentProcess();
		int i;
		double maxf=0.0;
		for (i=0; i<5; ++i) {
			Sleep(10);
			SetPriorityClass(process, REALTIME_PRIORITY_CLASS);
		
			LONGLONG startcount, lapcount;
			double laptime;
		
			stopwatch.Start();

			__asm {
				rdtsc
				mov dword ptr startcount, eax
				mov dword ptr startcount+4, edx
			}

			while ((laptime=stopwatch.LapTime()) < 0.1);

			__asm {
				rdtsc
				mov dword ptr lapcount, eax
				mov dword ptr lapcount+4, edx
			}
		
			SetPriorityClass(process, NORMAL_PRIORITY_CLASS);

			double f = (double)(lapcount-startcount) / laptime;
			if (f > maxf) maxf = f;
		}
		freq = maxf;
		
		return true;
	}
	double GetFreq() {
		return freq;
	}
};


// NOTE(review): presumably CPUID(1).EDX feature bits (23 = MMX, 25 = SSE, 26 = SSE2);
// they are not referenced by the code visible in this file.
#define MMX_FLAG	0x00800000
#define SSE_FLAG	0x02000000
#define SSE2_FLAG	0x04000000


// Size in bytes of the largest window that is measured.
#define BUFSIZE (32*1024*1024)

// Backing storage: BUFSIZE bytes plus 256 bytes of slack for alignment.
int buf[(BUFSIZE+256)/4];
// buf rounded up to the next 256-byte boundary. The arithmetic is done in size_t
// rather than int so that the address is not truncated where pointers are wider than int.
int *bufa = (int*)((((size_t)buf)+255)&~(size_t)255);

// Fills the first `size` bytes of bufa with a fixed pattern.
//   mode 0: every 32-bit word is set to -1
//   mode 1: every float is set to -1.0f
//   mode 2: every double is set to -1.0
// Any other mode leaves the buffer untouched.
void mk_buf(int size, int mode) {
	const int words = size / 4;
	if (mode == 0) {
		int *dst = bufa;
		for (int k = 0; k < words; ++k) {
			dst[k] = -1;
		}
	} else if (mode == 1) {
		float *dst = (float*)bufa;
		for (int k = 0; k < words; ++k) {
			dst[k] = -1.0f;
		}
	} else if (mode == 2) {
		// One double spans two 32-bit words.
		double *dst = (double*)bufa;
		for (int k = 0; k * 2 < words; ++k) {
			dst[k] = -1.0;
		}
	}
}

// Times 32-bit integer loads over the first `range` bytes of bufa.
//
// count: number of outer passes. Each pass runs the inner loop 1024*1024/(4*8*2)
//        times with 16 four-byte loads per iteration, i.e. 1 MiB of loads per pass.
//        The counter is decremented before it is tested, so count must be >= 1.
// range: size in bytes of the window that is read. Offsets are wrapped with
//        AND (range-1), which is only a true modulo when range is a power of two.
//
// Result: rdtsc value after the loops minus the rdtsc value before them. There is no
// return statement; the asm block leaves the 64-bit difference in EDX:EAX, the register
// pair used for 64-bit return values on x86 MSVC.
LONGLONG read_int(int count,int range) {
//	LONGLONG c;   (disabled: the start timestamp is kept at [esp] instead)
	__asm {
		// esi = base address of the aligned buffer
		mov esi,bufa

		// ebx = range-1, the wrap mask applied to both offsets
		mov ebx,range
		sub ebx,1

		// ecx = outer pass counter (read before ebp is repurposed)
		mov ecx,count

		// ebp is the destination of every load below, so it is saved here and restored
		// at the end.
		// NOTE(review): presumably the timestamp lives on the stack because named locals
		// would be addressed through ebp - confirm against the generated code.
		push ebp
		sub esp,8

		// start timestamp -> [esp] (low half), [esp+4] (high half)
		rdtsc
		mov dword ptr [esp],eax
		mov dword ptr [esp+4],edx

		// edx and edi are two byte offsets advancing by 64 per iteration. edi starts at
		// -32 so that it becomes 32 after its first add: edx covers bytes 0..31 and edi
		// bytes 32..63 of every 64-byte step.
		xor edx,edx
		mov edi,-4*8
		
		loop1:
			// eax = inner iterations for one 1 MiB pass
			mov eax,1024*1024/(4*8*2)
			align 16
			loop2:
				mov ebp,[esi+edx+4*0]
				mov ebp,[esi+edx+4*1]
				// advance and wrap edi in between the edx loads
				add edi,4*8*2
				and edi,ebx
				mov ebp,[esi+edx+4*2]
				mov ebp,[esi+edx+4*3]
				mov ebp,[esi+edx+4*4]
				mov ebp,[esi+edx+4*5]
				mov ebp,[esi+edx+4*6]
				mov ebp,[esi+edx+4*7]

				mov ebp,[esi+edi+4*0]
				mov ebp,[esi+edi+4*1]
				// advance and wrap edx in between the edi loads
				add edx,4*8*2
				and edx,ebx
				mov ebp,[esi+edi+4*2]
				mov ebp,[esi+edi+4*3]
				mov ebp,[esi+edi+4*4]
				mov ebp,[esi+edi+4*5]
				mov ebp,[esi+edi+4*6]
				mov ebp,[esi+edi+4*7]
	
			sub eax,1
			jne loop2
		
		sub ecx,1
		jne loop1

		// edx:eax = end timestamp - start timestamp (the return value)
		rdtsc
		sub eax,dword ptr [esp]
		sbb edx,dword ptr [esp+4]

		// release the timestamp slot and restore ebp
		add esp,8
		pop ebp
	}
}

// Times 64-bit x87 loads (fld qword) over the first `range` bytes of bufa.
//
// count: number of outer passes. Each pass runs the inner loop 1024*1024/(8*8*2)
//        times with 16 eight-byte loads per iteration, i.e. 1 MiB of loads per pass.
//        The counter is decremented before it is tested, so count must be >= 1.
// range: size in bytes of the window that is read. Offsets are wrapped with
//        AND (range-1), which is only a true modulo when range is a power of two.
//
// Result: rdtsc value after the loops minus the rdtsc value before them. There is no
// return statement; the asm block leaves the 64-bit difference in EDX:EAX, the register
// pair used for 64-bit return values on x86 MSVC.
LONGLONG read_x87(int count,int range) {
	// start timestamp
	LONGLONG c;
	__asm {
		// esi = buffer base, ebx = range-1 (wrap mask)
		mov esi,bufa
		mov ebx,range
		sub ebx,1

		// c = start timestamp
		rdtsc
		mov dword ptr c,eax
		mov dword ptr c+4,edx

		// edx and edi are two byte offsets advancing by 128 per iteration. edi starts at
		// -64 so that it becomes 64 after its first add: edx covers bytes 0..63 and edi
		// bytes 64..127 of every 128-byte step.
		xor edx,edx
		mov edi,-8*8
		
		// ecx = outer pass counter
		mov ecx,count
		loop1:
			// eax = inner iterations for one 1 MiB pass
			mov eax,1024*1024/(8*8*2)
			align 16
			loop2:
				// each fld is followed by fstp st(0), which pops the value just pushed,
				// so the x87 register stack stays balanced
				fld qword ptr [esi+edx+8*0]
				fstp st(0)
				fld qword ptr [esi+edx+8*1]
				fstp st(0)
				// advance and wrap edi in between the edx loads
				add edi,8*8*2
				and edi,ebx
				fld qword ptr [esi+edx+8*2]
				fstp st(0)
				fld qword ptr [esi+edx+8*3]
				fstp st(0)
				fld qword ptr [esi+edx+8*4]
				fstp st(0)
				fld qword ptr [esi+edx+8*5]
				fstp st(0)
				fld qword ptr [esi+edx+8*6]
				fstp st(0)
				fld qword ptr [esi+edx+8*7]
				fstp st(0)

				fld qword ptr [esi+edi+8*0]
				fstp st(0)
				fld qword ptr [esi+edi+8*1]
				fstp st(0)
				// advance and wrap edx in between the edi loads
				add edx,8*8*2
				and edx,ebx
				fld qword ptr [esi+edi+8*2]
				fstp st(0)
				fld qword ptr [esi+edi+8*3]
				fstp st(0)
				fld qword ptr [esi+edi+8*4]
				fstp st(0)
				fld qword ptr [esi+edi+8*5]
				fstp st(0)
				fld qword ptr [esi+edi+8*6]
				fstp st(0)
				fld qword ptr [esi+edi+8*7]
				fstp st(0)
			
			sub eax,1
			jne loop2
		
		sub ecx,1
		jne loop1

		// edx:eax = end timestamp - start timestamp (the return value)
		rdtsc
		sub eax,dword ptr c
		sbb edx,dword ptr c+4

	}
}

// Times 64-bit MMX loads (movq mm0) over the first `range` bytes of bufa.
//
// count: number of outer passes. Each pass runs the inner loop 1024*1024/(8*8*2)
//        times with 16 eight-byte loads per iteration, i.e. 1 MiB of loads per pass.
//        The counter is decremented before it is tested, so count must be >= 1.
// range: size in bytes of the window that is read. Offsets are wrapped with
//        AND (range-1), which is only a true modulo when range is a power of two.
//
// Result: rdtsc value after the loops minus the rdtsc value before them. There is no
// return statement; the asm block leaves the 64-bit difference in EDX:EAX, the register
// pair used for 64-bit return values on x86 MSVC.
LONGLONG read_mmx(int count,int range) {
	// start timestamp
	LONGLONG c;
	__asm {
		// esi = buffer base, ebx = range-1 (wrap mask)
		mov esi,bufa
		mov ebx,range
		sub ebx,1

		// c = start timestamp
		rdtsc
		mov dword ptr c,eax
		mov dword ptr c+4,edx

		// edx and edi are two byte offsets advancing by 128 per iteration. edi starts at
		// -64 so that it becomes 64 after its first add: edx covers bytes 0..63 and edi
		// bytes 64..127 of every 128-byte step.
		xor edx,edx
		mov edi,-8*8
		
		// ecx = outer pass counter
		mov ecx,count
		loop1:
			// eax = inner iterations for one 1 MiB pass
			mov eax,1024*1024/(8*8*2)
			align 16
			loop2:
				movq mm0,[esi+edx+8*0]
				movq mm0,[esi+edx+8*1]
				// advance and wrap edi in between the edx loads
				add edi,8*8*2
				and edi,ebx
				movq mm0,[esi+edx+8*2]
				movq mm0,[esi+edx+8*3]
				movq mm0,[esi+edx+8*4]
				movq mm0,[esi+edx+8*5]
				movq mm0,[esi+edx+8*6]
				movq mm0,[esi+edx+8*7]

				movq mm0,[esi+edi+8*0]
				movq mm0,[esi+edi+8*1]
				// advance and wrap edx in between the edi loads
				add edx,8*8*2
				and edx,ebx
				movq mm0,[esi+edi+8*2]
				movq mm0,[esi+edi+8*3]
				movq mm0,[esi+edi+8*4]
				movq mm0,[esi+edi+8*5]
				movq mm0,[esi+edi+8*6]
				movq mm0,[esi+edi+8*7]
			
			sub eax,1
			jne loop2
		
		sub ecx,1
		jne loop1

		// edx:eax = end timestamp - start timestamp (the return value)
		rdtsc
		sub eax,dword ptr c
		sbb edx,dword ptr c+4

		// leave MMX state so that x87 code can run afterwards; issued after the second
		// rdtsc, so it is outside the timed interval
		emms

	}
}

// Times 128-bit SSE loads (movaps xmm0) over the first `range` bytes of bufa.
//
// count: number of outer passes. Each pass runs the inner loop 1024*1024/(16*8*2)
//        times with 16 sixteen-byte loads per iteration, i.e. 1 MiB of loads per pass.
//        The counter is decremented before it is tested, so count must be >= 1.
// range: size in bytes of the window that is read. Offsets are wrapped with
//        AND (range-1), which is only a true modulo when range is a power of two.
//
// Result: rdtsc value after the loops minus the rdtsc value before them. There is no
// return statement; the asm block leaves the 64-bit difference in EDX:EAX, the register
// pair used for 64-bit return values on x86 MSVC.
LONGLONG read_sse(int count,int range) {
	// start timestamp
	LONGLONG c;
	__asm {
		// esi = buffer base, ebx = range-1 (wrap mask)
		mov esi,bufa
		mov ebx,range
		sub ebx,1

		// c = start timestamp
		rdtsc
		mov dword ptr c,eax
		mov dword ptr c+4,edx

		// edx and edi are two byte offsets advancing by 256 per iteration. edi starts at
		// -128 so that it becomes 128 after its first add: edx covers bytes 0..127 and
		// edi bytes 128..255 of every 256-byte step.
		xor edx,edx
		mov edi,-16*8
		
		// ecx = outer pass counter
		mov ecx,count
		loop1:
			// eax = inner iterations for one 1 MiB pass
			mov eax,1024*1024/(16*8*2)
			align 16
			loop2:
				// movaps is the aligned form: every address is bufa plus a multiple of 16
				movaps xmm0,[esi+edx+16*0]
				movaps xmm0,[esi+edx+16*1]
				// advance and wrap edi in between the edx loads
				add edi,16*8*2
				and edi,ebx
				movaps xmm0,[esi+edx+16*2]
				movaps xmm0,[esi+edx+16*3]
				movaps xmm0,[esi+edx+16*4]
				movaps xmm0,[esi+edx+16*5]
				movaps xmm0,[esi+edx+16*6]
				movaps xmm0,[esi+edx+16*7]

				movaps xmm0,[esi+edi+16*0]
				movaps xmm0,[esi+edi+16*1]
				// advance and wrap edx in between the edi loads
				add edx,16*8*2
				and edx,ebx
				movaps xmm0,[esi+edi+16*2]
				movaps xmm0,[esi+edi+16*3]
				movaps xmm0,[esi+edi+16*4]
				movaps xmm0,[esi+edi+16*5]
				movaps xmm0,[esi+edi+16*6]
				movaps xmm0,[esi+edi+16*7]
			
			sub eax,1
			jne loop2
		
		sub ecx,1
		jne loop1

		// edx:eax = end timestamp - start timestamp (the return value)
		rdtsc
		sub eax,dword ptr c
		sbb edx,dword ptr c+4

	}
}


// Times 256-bit AVX loads (vmovaps ymm0) over the first `range` bytes of bufa.
//
// count: number of outer passes. Each pass runs the inner loop 1024*1024/(32*8*2)
//        times with 16 thirty-two-byte loads per iteration, i.e. 1 MiB of loads per pass.
//        The counter is decremented before it is tested, so count must be >= 1.
// range: size in bytes of the window that is read. Offsets are wrapped with
//        AND (range-1), which is only a true modulo when range is a power of two.
//
// Result: rdtsc value after the loops minus the rdtsc value before them. There is no
// return statement; the asm block leaves the 64-bit difference in EDX:EAX, the register
// pair used for 64-bit return values on x86 MSVC.
LONGLONG read_avx(int count,int range) {
	// start timestamp
	LONGLONG c;
	__asm {
		// esi = buffer base, ebx = range-1 (wrap mask)
		mov esi,bufa
		mov ebx,range
		sub ebx,1

		// c = start timestamp
		rdtsc
		mov dword ptr c,eax
		mov dword ptr c+4,edx

		// edx and edi are two byte offsets advancing by 512 per iteration. edi starts at
		// -256 so that it becomes 256 after its first add: edx covers bytes 0..255 and
		// edi bytes 256..511 of every 512-byte step.
		xor edx,edx
		mov edi,-32*8

		// ecx = outer pass counter
		mov ecx,count
		loop1:
			// eax = inner iterations for one 1 MiB pass
			mov eax,1024*1024/(32*8*2)
			align 16
			loop2:
				// vmovaps is the aligned form: every address is bufa plus a multiple of 32
				vmovaps ymm0,[esi+edx+32*0]
				vmovaps ymm0,[esi+edx+32*1]
				// advance and wrap edi in between the edx loads
				add edi,32*8*2
				and edi,ebx
				vmovaps ymm0,[esi+edx+32*2]
				vmovaps ymm0,[esi+edx+32*3]
				vmovaps ymm0,[esi+edx+32*4]
				vmovaps ymm0,[esi+edx+32*5]
				vmovaps ymm0,[esi+edx+32*6]
				vmovaps ymm0,[esi+edx+32*7]

				vmovaps ymm0,[esi+edi+32*0]
				vmovaps ymm0,[esi+edi+32*1]
				// advance and wrap edx in between the edi loads
				add edx,32*8*2
				and edx,ebx
				vmovaps ymm0,[esi+edi+32*2]
				vmovaps ymm0,[esi+edi+32*3]
				vmovaps ymm0,[esi+edi+32*4]
				vmovaps ymm0,[esi+edi+32*5]
				vmovaps ymm0,[esi+edi+32*6]
				vmovaps ymm0,[esi+edi+32*7]

			sub eax,1
			jne loop2

		sub ecx,1
		jne loop1

		// edx:eax = end timestamp - start timestamp (the return value)
		rdtsc
		sub eax,dword ptr c
		sbb edx,dword ptr c+4

		// Clear the upper halves of the ymm registers so that legacy SSE code executed
		// after this function (including the movaps loop in read_sse) does not pay the
		// AVX/SSE transition penalty. Issued after the second rdtsc, so it is outside
		// the timed interval; it does not modify eax, edx or the flags.
		vzeroupper

	}
}

// Entry point: calibrates the time-stamp counter rate, then prints a table of load
// bandwidth for each kernel (columns) over window sizes from 1 KB to BUFSIZE, doubling
// each row. Every kernel pass reads 1 MiB, so rate / (counts per pass) is MB per second.
//
// Declared as int main() (void main is a compiler extension); the exit status is 0 on
// every path, as before.
int main() {
	printf("load 帯域 計測ツール v0.4+(AVXサポートCPU専用)\n");

	Sleep(1000);

	CpuFreq cpufreq;
	if (! cpufreq.Setup()) {
		return 0;
	}
	printf("CPU動作クロック : %.1f MHz\n", cpufreq.GetFreq() / 1000000.0);

	HANDLE process = GetCurrentProcess();

	// The three tables are sized by the same constant so that they cannot drift apart:
	// too many initializers is a compile error, too few are zero-filled.
	enum { NUM_TESTS = 5 };
	LONGLONG (*test_func[NUM_TESTS])(int,int) = {read_int, read_x87, read_mmx, read_sse, read_avx};
	const char *test_name[NUM_TESTS] = {"  Int32bit"," Float64bit", "   MMX64bit", "  SSE128bit", "  AVX256bit"};
	// mk_buf fill mode per kernel. The table used to have only four entries, so the AVX
	// column read past its end; AVX now uses the same float fill as SSE.
	int test_mode[NUM_TESTS] = {0,2,0,1,1};

	printf("\n");
	printf("ｱｸｾｽ範囲   ");

	for (int f=0; f<NUM_TESTS; ++f) {
		printf(" %s", test_name[f]);
	}

	printf("\n");
	for (int size=1024; size<=BUFSIZE; size+=size) {

		printf(" %5dKB :", size/1024);

		for (int f=0; f<NUM_TESTS; ++f) {

			mk_buf(size, test_mode[f]);

			// Best (smallest) count per pass over five measurements.
			double minclk = 100000000000.0;
			for (int i=0; i<5; ++i) {
				double clk;
				int count = 10;
				// Double the pass count until one timed run lasts at least 0.1 s.
				for (;;) {
					Sleep(10);
					SetPriorityClass(process, REALTIME_PRIORITY_CLASS);

					test_func[f](1, size);	// untimed warm-up pass
					clk = (double)test_func[f](count, size);

					SetPriorityClass(process, NORMAL_PRIORITY_CLASS);

					if (clk/cpufreq.GetFreq() >= 0.1) break;

					count *= 2;
				}
				clk /= count;
				if (clk < minclk) minclk = clk;
			}

			printf(" %6.0f MB/S", cpufreq.GetFreq()/minclk);
		}

		printf("\n");
	}

	// NOTE(review): fflush on an input stream is not defined by the C standard; this
	// relies on the Microsoft CRT behaviour of discarding buffered input.
	fflush(stdin);
	fprintf(stderr, "\n終了します。Enterキーを押してください : ");
	scanf("%*c");
	return 0;
}
