float和double一样快吗?

文 号

821317

6 回复 / 1213 浏览


acmilan1 年前 -2016-06-11 11:42821317

我记得世纪之初的时候,某本古老的书上有这么一句话,大概是这个意思,无论是float还是double,在CPU内部都是转换为80位浮点数运算的,因此float和double其实是一样快的。

但是时代变化太快,这句话现在还对不对呢?写了个程序验证一下。使用的是Visual C++ 2015 Update 2,编译为x64架构。为了避免调试器的干扰,直接使用Ctrl+F5运行。

程序如下:

// realspeed.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"

#include <stdio.h>
#include <windows.h>

// Number of elements in every benchmark array (1048576 = 2^20).
#define veclen 1048576

// Single-precision data: vec1 and vec2 are the multiplication operands,
// vec3 receives the products. Kept at file scope rather than as locals of main.
float vec1[veclen];
float vec2[veclen];
float vec3[veclen];

// Double-precision counterparts with the same roles (dvec1 * dvec2 -> dvec3).
double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];

// Benchmark entry point: times element-wise multiplication over veclen
// elements, first for the float arrays and then for the double arrays.
// Each variant is measured 10 times and every measurement is printed as the
// difference between two QueryPerformanceCounter readings.
// The printed values are raw counter ticks; this function never calls
// QueryPerformanceFrequency, so they are not converted to seconds.
// Always returns 0.
int main()
{
	// Counter readings taken immediately before and after each timed loop.
	// NOTE(review): QueryPerformanceCounter is called through a cast to
	// LARGE_INTEGER*; declaring these as LARGE_INTEGER and reading .QuadPart
	// would avoid the pointer cast - confirm before changing.
	ULONGLONG tk1, tk2;

	// Fill the operands with 2.0 and clear the result arrays before timing.
	for (int i = 0; i < veclen; i++)
	{
		vec1[i] = vec2[i] = 2.0f;
		vec3[i] = 0.0f;
		dvec1[i] = dvec2[i] = 2.0;
		dvec3[i] = 0.0;
	}
	
	// Single-precision measurements.
	printf("float:\n");
	for (int i = 0; i < 10; i++)
	{
		QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
		// The inner index deliberately or accidentally shadows the outer
		// repetition counter `i`; the outer loop is unaffected.
		for (int i = 0; i < veclen; i++)
		{
			vec3[i] = vec1[i] * vec2[i];
		}
		QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
		// NOTE(review): %lld is a signed conversion while tk2 - tk1 has type
		// ULONGLONG (presumably unsigned 64-bit); %llu would match - confirm.
		printf("ticks: %lld\n", tk2 - tk1);
	}

	// Double-precision measurements, same structure as the float section.
	printf("double:\n");
	for (int i = 0; i < 10; i++)
	{
		QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
		// Inner `i` shadows the outer repetition counter here as well.
		for (int i = 0; i < veclen; i++)
		{
			dvec3[i] = dvec1[i] * dvec2[i];
		}
		QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
		printf("ticks: %lld\n", tk2 - tk1);
	}
	
    return 0;
}

Debug下Ctrl+F5直接运行:

265948

Release下Ctrl+F5直接运行:

265949

可以看到,在Release编译下,float比double快得多,而在Debug编译下则几乎没有差别。这是为什么呢?在这里我们设置了个断点,进行一下反汇编——

Debug下的反汇编:

// realspeed.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"

#include <stdio.h>
#include <windows.h>

#define veclen 1048576

float vec1[veclen];
float vec2[veclen];
float vec3[veclen];

double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];

int main()
{
00007FF65F6717D0  push        rbp  
00007FF65F6717D2  push        rdi  
00007FF65F6717D3  sub         rsp,1C8h  
00007FF65F6717DA  lea         rbp,[rsp+20h]  
00007FF65F6717DF  mov         rdi,rsp  
00007FF65F6717E2  mov         ecx,72h  
00007FF65F6717E7  mov         eax,0CCCCCCCCh  
// realspeed.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"

#include <stdio.h>
#include <windows.h>

#define veclen 1048576

float vec1[veclen];
float vec2[veclen];
float vec3[veclen];

double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];

int main()
{
00007FF65F6717EC  rep stos    dword ptr [rdi]  
	ULONGLONG tk1, tk2;

	for (int i = 0; i < veclen; i++)
00007FF65F6717EE  mov         dword ptr [rbp+44h],0  
00007FF65F6717F5  jmp         main+2Fh (07FF65F6717FFh)  
00007FF65F6717F7  mov         eax,dword ptr [rbp+44h]  
00007FF65F6717FA  inc         eax  
00007FF65F6717FC  mov         dword ptr [rbp+44h],eax  
00007FF65F6717FF  cmp         dword ptr [rbp+44h],100000h  
00007FF65F671806  jge         main+0C7h (07FF65F671897h)  
	{
		vec1[i] = vec2[i] = 2.0f;
00007FF65F67180C  movsxd      rax,dword ptr [rbp+44h]  
00007FF65F671810  lea         rcx,[vec2 (07FF65FA7C170h)]  
00007FF65F671817  movss       xmm0,dword ptr [__real@40000000 (07FF65F679D1Ch)]  
00007FF65F67181F  movss       dword ptr [rcx+rax*4],xmm0  
00007FF65F671824  movsxd      rax,dword ptr [rbp+44h]  
00007FF65F671828  lea         rcx,[vec1 (07FF65F67C170h)]  
00007FF65F67182F  movss       xmm0,dword ptr [__real@40000000 (07FF65F679D1Ch)]  
00007FF65F671837  movss       dword ptr [rcx+rax*4],xmm0  
		vec3[i] = 0.0f;
00007FF65F67183C  movsxd      rax,dword ptr [rbp+44h]  
00007FF65F671840  lea         rcx,[vec3 (07FF65FE7C170h)]  
00007FF65F671847  xorps       xmm0,xmm0  
00007FF65F67184A  movss       dword ptr [rcx+rax*4],xmm0  
		dvec1[i] = dvec2[i] = 2.0;
00007FF65F67184F  movsxd      rax,dword ptr [rbp+44h]  
00007FF65F671853  lea         rcx,[dvec2 (07FF660A7C170h)]  
00007FF65F67185A  movsd       xmm0,mmword ptr [__real@4000000000000000 (07FF65F679D20h)]  
00007FF65F671862  movsd       mmword ptr [rcx+rax*8],xmm0  
00007FF65F671867  movsxd      rax,dword ptr [rbp+44h]  
00007FF65F67186B  lea         rcx,[dvec1 (07FF66027C170h)]  
00007FF65F671872  movsd       xmm0,mmword ptr [__real@4000000000000000 (07FF65F679D20h)]  
00007FF65F67187A  movsd       mmword ptr [rcx+rax*8],xmm0  
		dvec3[i] = 0.0;
00007FF65F67187F  movsxd      rax,dword ptr [rbp+44h]  
00007FF65F671883  lea         rcx,[dvec3 (07FF66127C170h)]  
00007FF65F67188A  xorps       xmm0,xmm0  
00007FF65F67188D  movsd       mmword ptr [rcx+rax*8],xmm0  
	}
00007FF65F671892  jmp         main+27h (07FF65F6717F7h)  
	
	printf("float:\n");
00007FF65F671897  lea         rcx,[string "float:\n" (07FF65F679CF0h)]  
00007FF65F67189E  call        printf (07FF65F6711CCh)  
	for (int i = 0; i < 10; i++)
00007FF65F6718A3  mov         dword ptr [rbp+64h],0  
00007FF65F6718AA  jmp         main+0E4h (07FF65F6718B4h)  
00007FF65F6718AC  mov         eax,dword ptr [rbp+64h]  
00007FF65F6718AF  inc         eax  
00007FF65F6718B1  mov         dword ptr [rbp+64h],eax  
00007FF65F6718B4  cmp         dword ptr [rbp+64h],0Ah  
00007FF65F6718B8  jge         main+186h (07FF65F671956h)  
	{
		QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
00007FF65F6718BE  lea         rcx,[tk1]  
00007FF65F6718C2  call        qword ptr [__imp_QueryPerformanceCounter (07FF6621B3000h)]  
		for (int i = 0; i < veclen; i++)
00007FF65F6718C8  mov         dword ptr [rbp+84h],0  
00007FF65F6718D2  jmp         main+112h (07FF65F6718E2h)  
00007FF65F6718D4  mov         eax,dword ptr [rbp+84h]  
00007FF65F6718DA  inc         eax  
00007FF65F6718DC  mov         dword ptr [rbp+84h],eax  
00007FF65F6718E2  cmp         dword ptr [rbp+84h],100000h  
00007FF65F6718EC  jge         main+15Ah (07FF65F67192Ah)  
		{
			vec3[i] = vec1[i] * vec2[i];
00007FF65F6718EE  movsxd      rax,dword ptr [rbp+84h]  
00007FF65F6718F5  lea         rcx,[vec1 (07FF65F67C170h)]  
00007FF65F6718FC  movsxd      rdx,dword ptr [rbp+84h]  
00007FF65F671903  lea         r8,[vec2 (07FF65FA7C170h)]  
00007FF65F67190A  movss       xmm0,dword ptr [rcx+rax*4]  
00007FF65F67190F  mulss       xmm0,dword ptr [r8+rdx*4]  
00007FF65F671915  movsxd      rax,dword ptr [rbp+84h]  
00007FF65F67191C  lea         rcx,[vec3 (07FF65FE7C170h)]  
00007FF65F671923  movss       dword ptr [rcx+rax*4],xmm0  
		}
00007FF65F671928  jmp         main+104h (07FF65F6718D4h)  
		QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
00007FF65F67192A  lea         rcx,[tk2]  
00007FF65F67192E  call        qword ptr [__imp_QueryPerformanceCounter (07FF6621B3000h)]  
		printf("ticks: %lld\n", tk2 - tk1);
00007FF65F671934  mov         rax,qword ptr [tk1]  
00007FF65F671938  mov         rcx,qword ptr [tk2]  
00007FF65F67193C  sub         rcx,rax  
00007FF65F67193F  mov         rax,rcx  
00007FF65F671942  mov         rdx,rax  
00007FF65F671945  lea         rcx,[string "ticks: %lld\n" (07FF65F679D00h)]  
00007FF65F67194C  call        printf (07FF65F6711CCh)  
	}
00007FF65F671951  jmp         main+0DCh (07FF65F6718ACh)  

	printf("double:\n");
00007FF65F671956  lea         rcx,[string "double:\n" (07FF65F679D10h)]  
00007FF65F67195D  call        printf (07FF65F6711CCh)  
	for (int i = 0; i < 10; i++)
00007FF65F671962  mov         dword ptr [rbp+0A4h],0  
00007FF65F67196C  jmp         main+1ACh (07FF65F67197Ch)  
00007FF65F67196E  mov         eax,dword ptr [rbp+0A4h]  
00007FF65F671974  inc         eax  
00007FF65F671976  mov         dword ptr [rbp+0A4h],eax  
00007FF65F67197C  cmp         dword ptr [rbp+0A4h],0Ah  
00007FF65F671983  jge         main+251h (07FF65F671A21h)  
	{
		QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
00007FF65F671989  lea         rcx,[tk1]  
00007FF65F67198D  call        qword ptr [__imp_QueryPerformanceCounter (07FF6621B3000h)]  
		for (int i = 0; i < veclen; i++)
00007FF65F671993  mov         dword ptr [rbp+0C4h],0  
00007FF65F67199D  jmp         main+1DDh (07FF65F6719ADh)  
00007FF65F67199F  mov         eax,dword ptr [rbp+0C4h]  
00007FF65F6719A5  inc         eax  
00007FF65F6719A7  mov         dword ptr [rbp+0C4h],eax  
00007FF65F6719AD  cmp         dword ptr [rbp+0C4h],100000h  
00007FF65F6719B7  jge         main+225h (07FF65F6719F5h)  
		{
			dvec3[i] = dvec1[i] * dvec2[i];
00007FF65F6719B9  movsxd      rax,dword ptr [rbp+0C4h]  
00007FF65F6719C0  lea         rcx,[dvec1 (07FF66027C170h)]  
00007FF65F6719C7  movsxd      rdx,dword ptr [rbp+0C4h]  
00007FF65F6719CE  lea         r8,[dvec2 (07FF660A7C170h)]  
00007FF65F6719D5  movsd       xmm0,mmword ptr [rcx+rax*8]  
00007FF65F6719DA  mulsd       xmm0,mmword ptr [r8+rdx*8]  
00007FF65F6719E0  movsxd      rax,dword ptr [rbp+0C4h]  
00007FF65F6719E7  lea         rcx,[dvec3 (07FF66127C170h)]  
00007FF65F6719EE  movsd       mmword ptr [rcx+rax*8],xmm0  
		}
00007FF65F6719F3  jmp         main+1CFh (07FF65F67199Fh)  
		QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
00007FF65F6719F5  lea         rcx,[tk2]  
00007FF65F6719F9  call        qword ptr [__imp_QueryPerformanceCounter (07FF6621B3000h)]  
		printf("ticks: %lld\n", tk2 - tk1);
00007FF65F6719FF  mov         rax,qword ptr [tk1]  
00007FF65F671A03  mov         rcx,qword ptr [tk2]  
00007FF65F671A07  sub         rcx,rax  
00007FF65F671A0A  mov         rax,rcx  
00007FF65F671A0D  mov         rdx,rax  
00007FF65F671A10  lea         rcx,[string "ticks: %lld\n" (07FF65F679D00h)]  
00007FF65F671A17  call        printf (07FF65F6711CCh)  
	}
00007FF65F671A1C  jmp         main+19Eh (07FF65F67196Eh)  
	
    return 0;
00007FF65F671A21  xor         eax,eax  
}
00007FF65F671A23  mov         edi,eax  
00007FF65F671A25  lea         rcx,[rbp-20h]  
00007FF65F671A29  lea         rdx,[__xt_z+220h (07FF65F679CC0h)]  
00007FF65F671A30  call        _RTC_CheckStackVars (07FF65F671136h)  
00007FF65F671A35  mov         eax,edi  
00007FF65F671A37  lea         rsp,[rbp+1A8h]  
00007FF65F671A3E  pop         rdi  
00007FF65F671A3F  pop         rbp  
00007FF65F671A40  ret  

Release下的反汇编:

// realspeed.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"

#include <stdio.h>
#include <windows.h>

#define veclen 1048576

float vec1[veclen];
float vec2[veclen];
float vec3[veclen];

double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];

int main()
{
00007FF6A9EF1070  mov         qword ptr [rsp+18h],rbx  
00007FF6A9EF1075  push        rbp  
00007FF6A9EF1076  push        rsi  
00007FF6A9EF1077  push        rdi  
00007FF6A9EF1078  push        r12  
00007FF6A9EF107A  push        r13  
00007FF6A9EF107C  push        r14  
00007FF6A9EF107E  push        r15  
00007FF6A9EF1080  sub         rsp,20h  
	ULONGLONG tk1, tk2;

	for (int i = 0; i < veclen; i++)
	{
		vec1[i] = vec2[i] = 2.0f;
00007FF6A9EF1084  mov         eax,40000000h  
00007FF6A9EF1089  lea         r12,[vec2 (07FF6A9EF3620h)]  
00007FF6A9EF1090  mov         rdi,r12  
00007FF6A9EF1093  lea         r13,[vec1 (07FF6AAAF3620h)]  
00007FF6A9EF109A  mov         ecx,100000h  
		vec3[i] = 0.0f;
00007FF6A9EF109F  lea         r15,[vec3 (07FF6AB6F3620h)]  
00007FF6A9EF10A6  rep stos    dword ptr [rdi]  
00007FF6A9EF10A8  mov         rdi,r13  
		dvec1[i] = dvec2[i] = 2.0;
00007FF6A9EF10AB  lea         r14,[dvec2 (07FF6AAEF3620h)]  
00007FF6A9EF10B2  mov         ecx,100000h  
00007FF6A9EF10B7  lea         rbp,[dvec1 (07FF6AA2F3620h)]  
00007FF6A9EF10BE  rep stos    dword ptr [rdi]  
00007FF6A9EF10C0  xor         eax,eax  
		dvec3[i] = 0.0;
00007FF6A9EF10C2  lea         rsi,[dvec3 (07FF6ABAF3620h)]  
00007FF6A9EF10C9  mov         rdi,r15  
00007FF6A9EF10CC  mov         ecx,100000h  
00007FF6A9EF10D1  rep stos    dword ptr [rdi]  
00007FF6A9EF10D3  mov         rax,4000000000000000h  
00007FF6A9EF10DD  mov         rdi,r14  
00007FF6A9EF10E0  mov         ecx,100000h  
00007FF6A9EF10E5  rep stos    qword ptr [rdi]  
00007FF6A9EF10E8  mov         rdi,rbp  
00007FF6A9EF10EB  mov         ecx,100000h  
00007FF6A9EF10F0  rep stos    qword ptr [rdi]  
00007FF6A9EF10F3  xor         eax,eax  
00007FF6A9EF10F5  mov         rdi,rsi  
00007FF6A9EF10F8  mov         ecx,100000h  
00007FF6A9EF10FD  rep stos    qword ptr [rdi]  
	}
	
	printf("float:\n");
00007FF6A9EF1100  lea         rcx,[string "float:\n" (07FF6A9EF2210h)]  
00007FF6A9EF1107  call        printf (07FF6A9EF1010h)  
00007FF6A9EF110C  mov         ebx,0Ah  
00007FF6A9EF1111  mov         edi,ebx  
	for (int i = 0; i < 10; i++)
	{
		QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
00007FF6A9EF1113  lea         rcx,[tk1]  
00007FF6A9EF1118  call        qword ptr [__imp_QueryPerformanceCounter (07FF6A9EF2000h)]  
		for (int i = 0; i < veclen; i++)
00007FF6A9EF111E  xor         eax,eax  
00007FF6A9EF1120  mov         ecx,20000h  
00007FF6A9EF1125  nop         word ptr [rax+rax]  
		{
			vec3[i] = vec1[i] * vec2[i];
00007FF6A9EF1130  movups      xmm0,xmmword ptr [rax+r13]  
00007FF6A9EF1135  movups      xmm1,xmmword ptr [rax+r12]  
00007FF6A9EF113A  lea         rax,[rax+20h]  
00007FF6A9EF113E  mulps       xmm1,xmm0  
00007FF6A9EF1141  movups      xmm0,xmmword ptr [rax+r13-10h]  
00007FF6A9EF1147  movups      xmmword ptr [rax+r15-20h],xmm1  
00007FF6A9EF114D  movups      xmm1,xmmword ptr [rax+r12-10h]  
00007FF6A9EF1153  mulps       xmm1,xmm0  
00007FF6A9EF1156  movups      xmmword ptr [rax+r15-10h],xmm1  
00007FF6A9EF115C  sub         rcx,1  
00007FF6A9EF1160  jne         main+0C0h (07FF6A9EF1130h)  
		}
		QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
00007FF6A9EF1162  lea         rcx,[tk2]  
00007FF6A9EF1167  call        qword ptr [__imp_QueryPerformanceCounter (07FF6A9EF2000h)]  
		printf("ticks: %lld\n", tk2 - tk1);
00007FF6A9EF116D  mov         rdx,qword ptr [tk2]  
		printf("ticks: %lld\n", tk2 - tk1);
00007FF6A9EF1172  lea         rcx,[string "ticks: %lld\n" (07FF6A9EF2218h)]  
00007FF6A9EF1179  sub         rdx,qword ptr [tk1]  
00007FF6A9EF117E  call        printf (07FF6A9EF1010h)  
00007FF6A9EF1183  sub         rdi,1  
00007FF6A9EF1187  jne         main+0A3h (07FF6A9EF1113h)  
	}

	printf("double:\n");
00007FF6A9EF1189  lea         rcx,[string "double:\n" (07FF6A9EF2228h)]  
00007FF6A9EF1190  call        printf (07FF6A9EF1010h)  
	for (int i = 0; i < 10; i++)
	{
		QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
00007FF6A9EF1195  lea         rcx,[tk1]  
00007FF6A9EF119A  call        qword ptr [__imp_QueryPerformanceCounter (07FF6A9EF2000h)]  
		for (int i = 0; i < veclen; i++)
00007FF6A9EF11A0  xor         eax,eax  
00007FF6A9EF11A2  mov         ecx,40000h  
00007FF6A9EF11A7  nop         word ptr [rax+rax]  
		{
			dvec3[i] = dvec1[i] * dvec2[i];
00007FF6A9EF11B0  movups      xmm0,xmmword ptr [rax+rbp]  
00007FF6A9EF11B4  movups      xmm1,xmmword ptr [rax+r14]  
00007FF6A9EF11B9  lea         rax,[rax+20h]  
00007FF6A9EF11BD  mulpd       xmm1,xmm0  
00007FF6A9EF11C1  movups      xmm0,xmmword ptr [rax+r14-10h]  
00007FF6A9EF11C7  movups      xmmword ptr [rax+rsi-20h],xmm1  
00007FF6A9EF11CC  movups      xmm1,xmmword ptr [rax+rbp-10h]  
00007FF6A9EF11D1  mulpd       xmm1,xmm0  
		{
			dvec3[i] = dvec1[i] * dvec2[i];
00007FF6A9EF11D5  movups      xmmword ptr [rax+rsi-10h],xmm1  
00007FF6A9EF11DA  sub         rcx,1  
00007FF6A9EF11DE  jne         main+140h (07FF6A9EF11B0h)  
		}
		QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
00007FF6A9EF11E0  lea         rcx,[tk2]  
00007FF6A9EF11E5  call        qword ptr [__imp_QueryPerformanceCounter (07FF6A9EF2000h)]  
		printf("ticks: %lld\n", tk2 - tk1);
00007FF6A9EF11EB  mov         rdx,qword ptr [tk2]  
00007FF6A9EF11F0  lea         rcx,[string "ticks: %lld\n" (07FF6A9EF2218h)]  
00007FF6A9EF11F7  sub         rdx,qword ptr [tk1]  
00007FF6A9EF11FC  call        printf (07FF6A9EF1010h)  
00007FF6A9EF1201  sub         rbx,1  
00007FF6A9EF1205  jne         main+125h (07FF6A9EF1195h)  
	}
	
    return 0;
00007FF6A9EF1207  xor         eax,eax  
}
00007FF6A9EF1209  mov         rbx,qword ptr [rsp+70h]  
00007FF6A9EF120E  add         rsp,20h  
00007FF6A9EF1212  pop         r15  
00007FF6A9EF1214  pop         r14  
00007FF6A9EF1216  pop         r13  
00007FF6A9EF1218  pop         r12  
00007FF6A9EF121A  pop         rdi  
00007FF6A9EF121B  pop         rsi  
00007FF6A9EF121C  pop         rbp  
00007FF6A9EF121D  ret  

可以看到,现在早已过了x87 FPU的年代,编译器并没有使用FPU指令,而是使用的SSE指令。

Debug编译下,为了调试方便,将每一个循环都完整表现出来了(循环计数为100000h,即1048576),并且使用了movss/mulss和movsd/mulsd这两组标量指令,速度当然差不多。

而Release编译下,则将循环计数精简为20000h(131072=1048576/8)和40000h(262144=1048576/4),并且使用了movups/mulps和movups/mulpd这两组矢量指令,每次循环内进行2次运算,总计进行40000h(262144=1048576/4)和80000h(524288=1048576/2)次运算。由于SSE寄存器是固定的128位宽,每次只能放置4个32位宽的float或2个64位宽的double数据,因此使用float的话,运算次数只有元素个数的1/4,而使用double的话,运算次数则是元素个数的1/2。

结论就是:对于标量运算,float和double没有显著差别,而对于矢量运算,float比double要快。

因此,在计算量庞大的图形运算中,通常使用float而不是double以提高运算速度。

[修改于 1 年前 - 2016-06-11 12:32:47]


acmilan1 年前 -2016-06-11 15:00821328

上面演示的是四则运算,编译器自然有充足的弹性进行优化。那如果是像exp、log这样的math.h函数,编译器怎么优化呢?比如下面的代码:

// realspeed.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"

#include <stdio.h>
#include <windows.h>
#include <math.h>

// Number of elements in every benchmark array (1048576 = 2^20).
#define veclen 1048576

// Single-precision data: vec1 is the logf() input and vec3 receives the
// results. vec2 is initialised by main but not read by the timed loop.
float vec1[veclen];
float vec2[veclen];
float vec3[veclen];

// Double-precision data: dvec1 is the log() input and dvec3 receives the
// results. dvec2 is initialised by main but not read by the timed loop.
double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];

// Benchmark entry point: times a math-library call applied to every element,
// logf() on the float array and then log() on the double array.
// Each variant is measured 10 times and every measurement is printed as the
// difference between two QueryPerformanceCounter readings.
// The printed values are raw counter ticks; this function never calls
// QueryPerformanceFrequency, so they are not converted to seconds.
// Always returns 0.
int main()
{
	// Counter readings taken immediately before and after each timed loop.
	// NOTE(review): QueryPerformanceCounter is called through a cast to
	// LARGE_INTEGER*; declaring these as LARGE_INTEGER and reading .QuadPart
	// would avoid the pointer cast - confirm before changing.
	ULONGLONG tk1, tk2;

	// Fill the inputs with 2.0 and clear the result arrays before timing.
	// vec2 and dvec2 are written here but not used by the loops below.
	for (int i = 0; i < veclen; i++)
	{
		vec1[i] = vec2[i] = 2.0f;
		vec3[i] = 0.0f;
		dvec1[i] = dvec2[i] = 2.0;
		dvec3[i] = 0.0;
	}
	
	// Single-precision measurements.
	printf("float:\n");
	for (int i = 0; i < 10; i++)
	{
		QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
		// The inner index shadows the outer repetition counter `i`;
		// the outer loop is unaffected.
		for (int i = 0; i < veclen; i++)
		{
			// Multiplication variant, disabled in favour of logf():
			//vec3[i] = vec1[i] * vec2[i];
			vec3[i] = logf(vec1[i]);
		}
		QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
		// NOTE(review): %lld is a signed conversion while tk2 - tk1 has type
		// ULONGLONG (presumably unsigned 64-bit); %llu would match - confirm.
		printf("ticks: %lld\n", tk2 - tk1);
	}

	// Double-precision measurements, same structure as the float section.
	printf("double:\n");
	for (int i = 0; i < 10; i++)
	{
		QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
		// Inner `i` shadows the outer repetition counter here as well.
		for (int i = 0; i < veclen; i++)
		{
			// Multiplication variant, disabled in favour of log():
			//dvec3[i] = dvec1[i] * dvec2[i];
			dvec3[i] = log(dvec1[i]);
		}
		QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
		printf("ticks: %lld\n", tk2 - tk1);
	}
	
    return 0;
}

我们看一下Release反汇编就知道了。Release反汇编如下:

// realspeed.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"

#include <stdio.h>
#include <windows.h>
#include <math.h>

#define veclen 1048576

float vec1[veclen];
float vec2[veclen];
float vec3[veclen];

double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];

int main()
{
00007FF78CE81070  mov         qword ptr [rsp+18h],rbx  
00007FF78CE81075  push        rbp  
00007FF78CE81076  push        rsi  
00007FF78CE81077  push        rdi  
00007FF78CE81078  push        r12  
00007FF78CE8107A  push        r13  
00007FF78CE8107C  push        r14  
00007FF78CE8107E  push        r15  
00007FF78CE81080  sub         rsp,20h  
	ULONGLONG tk1, tk2;

	for (int i = 0; i < veclen; i++)
	{
		vec1[i] = vec2[i] = 2.0f;
00007FF78CE81084  mov         eax,40000000h  
00007FF78CE81089  lea         rdi,[vec2 (07FF78CE84630h)]  
00007FF78CE81090  mov         ecx,100000h  
00007FF78CE81095  lea         r15,[vec1 (07FF78DA84630h)]  
00007FF78CE8109C  rep stos    dword ptr [rdi]  
00007FF78CE8109E  mov         rdi,r15  
		vec3[i] = 0.0f;
00007FF78CE810A1  lea         r14,[vec3 (07FF78E684630h)]  
00007FF78CE810A8  mov         ecx,100000h  
		dvec1[i] = dvec2[i] = 2.0;
00007FF78CE810AD  lea         r13,[dvec1 (07FF78D284630h)]  
00007FF78CE810B4  rep stos    dword ptr [rdi]  
00007FF78CE810B6  xor         eax,eax  
		dvec3[i] = 0.0;
00007FF78CE810B8  lea         r12,[dvec3 (07FF78EA84630h)]  
00007FF78CE810BF  mov         rdi,r14  
00007FF78CE810C2  mov         ecx,100000h  
00007FF78CE810C7  rep stos    dword ptr [rdi]  
00007FF78CE810C9  mov         rax,4000000000000000h  
00007FF78CE810D3  lea         rdi,[dvec2 (07FF78DE84630h)]  
00007FF78CE810DA  mov         ecx,100000h  
00007FF78CE810DF  rep stos    qword ptr [rdi]  
00007FF78CE810E2  mov         rdi,r13  
00007FF78CE810E5  mov         ecx,100000h  
00007FF78CE810EA  rep stos    qword ptr [rdi]  
00007FF78CE810ED  xor         eax,eax  
00007FF78CE810EF  mov         rdi,r12  
00007FF78CE810F2  mov         ecx,100000h  
00007FF78CE810F7  rep stos    qword ptr [rdi]  
	}
	
	printf("float:\n");
00007FF78CE810FA  lea         rcx,[string "float:\n" (07FF78CE83210h)]  
00007FF78CE81101  call        printf (07FF78CE81010h)  
00007FF78CE81106  mov         ebp,0Ah  
00007FF78CE8110B  mov         esi,ebp  
00007FF78CE8110D  nop         dword ptr [rax]  
	for (int i = 0; i < 10; i++)
	{
		QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
00007FF78CE81110  lea         rcx,[tk1]  
00007FF78CE81115  call        qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)]  
		for (int i = 0; i < veclen; i++)
00007FF78CE8111B  xor         edi,edi  
00007FF78CE8111D  mov         ebx,40000h  
		{
			//vec3[i] = vec1[i] * vec2[i];
			vec3[i] = logf(vec1[i]);
00007FF78CE81122  movups      xmm0,xmmword ptr [rdi+r15]  
00007FF78CE81127  call        __vdecl_logf4 (07FF78CE81EF0h)  
00007FF78CE8112C  movups      xmmword ptr [rdi+r14],xmm0  
00007FF78CE81131  lea         rdi,[rdi+10h]  
00007FF78CE81135  sub         rbx,1  
00007FF78CE81139  jne         main+0B2h (07FF78CE81122h)  
		}
		QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
00007FF78CE8113B  lea         rcx,[tk2]  
00007FF78CE81140  call        qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)]  
		printf("ticks: %lld\n", tk2 - tk1);
00007FF78CE81146  mov         rdx,qword ptr [tk2]  
00007FF78CE8114B  lea         rcx,[string "ticks: %lld\n" (07FF78CE83218h)]  
00007FF78CE81152  sub         rdx,qword ptr [tk1]  
00007FF78CE81157  call        printf (07FF78CE81010h)  
00007FF78CE8115C  sub         rsi,1  
00007FF78CE81160  jne         main+0A0h (07FF78CE81110h)  
	}

	printf("double:\n");
00007FF78CE81162  lea         rcx,[string "double:\n" (07FF78CE83228h)]  
	}

	printf("double:\n");
00007FF78CE81169  call        printf (07FF78CE81010h)  
00007FF78CE8116E  xchg        ax,ax  
	for (int i = 0; i < 10; i++)
	{
		QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
00007FF78CE81170  lea         rcx,[tk1]  
00007FF78CE81175  call        qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)]  
		for (int i = 0; i < veclen; i++)
00007FF78CE8117B  xor         edi,edi  
00007FF78CE8117D  mov         ebx,80000h  
		{
			//dvec3[i] = dvec1[i] * dvec2[i];
			dvec3[i] = log(dvec1[i]);
00007FF78CE81182  movups      xmm0,xmmword ptr [rdi+r13]  
00007FF78CE81187  call        __vdecl_log2 (07FF78CE81EE0h)  
00007FF78CE8118C  movups      xmmword ptr [rdi+r12],xmm0  
00007FF78CE81191  lea         rdi,[rdi+10h]  
00007FF78CE81195  sub         rbx,1  
00007FF78CE81199  jne         main+112h (07FF78CE81182h)  
		}
		QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
00007FF78CE8119B  lea         rcx,[tk2]  
00007FF78CE811A0  call        qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)]  
		printf("ticks: %lld\n", tk2 - tk1);
00007FF78CE811A6  mov         rdx,qword ptr [tk2]  
00007FF78CE811AB  lea         rcx,[string "ticks: %lld\n" (07FF78CE83218h)]  
00007FF78CE811B2  sub         rdx,qword ptr [tk1]  
00007FF78CE811B7  call        printf (07FF78CE81010h)  
00007FF78CE811BC  sub         rbp,1  
00007FF78CE811C0  jne         main+100h (07FF78CE81170h)  
	}
	
    return 0;
00007FF78CE811C2  xor         eax,eax  
}
00007FF78CE811C4  mov         rbx,qword ptr [rsp+70h]  
00007FF78CE811C9  add         rsp,20h  
00007FF78CE811CD  pop         r15  
00007FF78CE811CF  pop         r14  
00007FF78CE811D1  pop         r13  
00007FF78CE811D3  pop         r12  
00007FF78CE811D5  pop         rdi  
00007FF78CE811D6  pop         rsi  
00007FF78CE811D7  pop         rbp  
00007FF78CE811D8  ret  

可以看到,编译器并没有调用logf和log函数,而是调用了__vdecl_logf4和__vdecl_log2函数(函数名末尾的4和2表示每次调用处理一个128位寄存器中的4个float或2个double,__vdecl_log2并不是以2为底的对数)。因此,即使是使用了math.h中的数学函数,仍然可以实现矢量运算优化。

[修改于 1 年前 - 2016-06-11 15:06:49]


novakon1 年前 -2016-06-12 16:55821380

提一点点小建议:ASM片段实在是太长了,截取重点段会更利于阅读。


Yip_19912 个月前 -2017-05-24 09:46834182
学到一招  DDD

云中子35291 个月前 -2017-06-19 18:28835062
没记错debug版本里面还有一个彩蛋……
程序会在传递参数的时候隐式将float转换成double,送给函数做完再拿回来……
这就是为何printf的时候不用区分%f和%lf而scanf时候需要的原因

小俊12 天前 -2017-07-11 06:23836245
这个测试还不算完整,只能证明没有vectorization的情况下,double和float的throughput是一样的,但不能证明它们的latency也是一样。

acmilan7 天前 -2017-07-16 10:58836577
引用 小俊:
这个测试还不算完整,只能证明没有vectorization的情况下,double和float的throughput是一样的,但不能证明它们的latency也是一样。
这里不做准确的延迟测试,只是为了表明float向量化之后的吞吐量增加到4倍,而double只增加到2倍,因此float更有利于大吞吐量但精度要求不高的运算。float和double在没有向量化之前速度相同,不是要证明的重点,只是对现象的粗略的表述。

[修改于 7 天前 - 2017-07-16 11:05:11]


返回 软件综合
返回 本页顶部

想参与大家的讨论?现在就 登录 或者 注册


nkc Development Server https://github.com/lovetheory/nkc2

科创研究院 (c)2005-2017

蜀ICP备11004945号-2 川公网安备51010802000058号