X86处理器包含两种类型的浮点数寄存器。第一种使用8个浮点寄存器组成浮点寄存器栈，另一种为向量寄存器(XMM,YMM)，它们对于单双精度的处理是不同的。本文将讨论两种模式下的浮点数计算速度问题。

```float a,b,c;
c=a*b;

fld         dword ptr [a]  //将a加载到浮点栈顶,即ST(0)=a;
fmul        dword ptr [b]  //将栈顶元素与b相乘，结果仍存于栈顶,即ST(0)=ST(0)*b
fstp        dword ptr [c]  //将栈顶元素弹出并保存于c,即c=ST(0),POP();```
```double  a,b,c;
c=a*b;

fld         qword ptr [a]
fmul        qword ptr [b]
fstp        qword ptr [c]  ```

Single precision division, square root and mathematical functions are calculated faster than double precision when the XMM registers are used, while the speed of addition, subtraction, multiplication, etc. is still the same regardless of precision on most processors (when vector operations are not used).

```float SqrtfloatV1(float *A,const int len)
{
float fSum=0;
for (int i=0;i<len;i++)
{
fSum+=sqrt(A[i]);
}
return fSum;
}

double SqrtdoubleV1(double *A,const int len)
{
double dSum=0;
for(int i=0;i<len;i++)
{
dSum+=sqrt(A[i]);
}
return dSum;
}```

```double  a,b;
int c;
c=a*b;

fld         qword ptr [a]
fmul        qword ptr [b]
call        @ILT+(__ftol2_sse) (0EC10CDh) //调用函数_ftol2_sse实现浮点数到整数的转换
mov         dword ptr [c],eax  ```

```double  a,b;
int c;
c=a*b;

movsd       xmm0,mmword ptr [a]
mulsd       xmm0,mmword ptr [b]
cvttsd2si   eax,xmm0  //由cvttsd2si指令直接实现浮点数到整数的转换(截断取整)
mov         dword ptr [c],eax  ```

```float AddfloatV1(float *A,const int len)
{
int iSum=0;
for (int i=0;i<len;i++)
{
iSum+=A[i];//转成整数再求和
}
return (float)iSum;
}

float AddfloatV2(float *A,const int len)
{
float fSum=0;
for (int i=0;i<len;i++)
{
fSum+=A[i];
}
return fSum;
}```

(1)float与double混用(默认的浮点常量为double)

```float a,b;
a=b*1.2;

movd        xmm0,dword ptr [b]
cvtps2pd    xmm0,xmm0
mulsd       xmm0,mmword ptr [__real@3ff3333333333333 (13F646790h)]
cvtsd2ss    xmm0,xmm0
movss       dword ptr [a],xmm0  ```

(2)纯float

```float a,b;
a=b*1.2f;

movss       xmm0,dword ptr [b]
mulss       xmm0,dword ptr [__real@3f99999a (13F84678Ch)]
movss       dword ptr [a],xmm0  ```

```float MulfloatV1(float *A,const int len)
{
float fSum=0;
for (int i=0;i<len;i++)
{
fSum+=A[i]*1.2f;
}
return fSum;
}

float MulfloatV2(float *A,const int len)
{
float fSum=0;
for (int i=0;i<len;i++)
{
fSum+=A[i]*1.2;//默认的浮点常数是double
}
return fSum;
}```

```#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include "Timing.h"
/*
 * Number of elements in each benchmark buffer.
 * An enum constant is used because a `const int` is not a constant
 * expression in C and cannot size a file-scope array.
 * NOTE(review): the original numeric value was lost from the listing;
 * 1024 is a placeholder -- adjust to the intended workload size.
 */
enum { BUFSIZE = 1024 };
float buf[BUFSIZE];   /* single-precision input data */
double buf2[BUFSIZE]; /* double-precision input data */

// Compare float vs. double speed in 64-bit builds
float SqrtfloatV1(float *A,const int len);
double SqrtdoubleV1(double *A,const int len);

// Measure the cost of float-to-integer conversion
float AddfloatV1(float *A,const int len);
float AddfloatV2(float *A,const int len);

// Measure the cost of mixing float and double in 64-bit builds
float MulfloatV2(float *A,const int len);
float MulfloatV1(float *A,const int len);

/*
 * Benchmark driver.
 *
 * Fills buf/buf2 with the same pseudo-random values, then runs each kernel
 * `testloop` times between startTiming()/stopTiming() and prints the last
 * result together with the value returned by stopTiming() (labelled "ms"
 * in the output; presumably elapsed milliseconds -- confirm in Timing.h).
 *
 * Every result is stored in a volatile variable so the stores stay
 * observable side effects.
 */
int main(void)
{
    /*
     * NOTE(review): the original repeat count was lost from the listing;
     * 10000 is a placeholder -- adjust to get stable timings.
     */
    const int testloop = 10000;
    double interval;

    srand((unsigned)time(NULL));
    for (int i = 0; i < BUFSIZE; i++)
    {
        /* Masking with 0x3f keeps the low six bits, i.e. values 0..63. */
        buf[i] = (float)(rand() & 0x3f);
        buf2[i] = (double)(buf[i]);
    }

    /*-----------------------------------------------------------------*/
    /* float vs. double square root in 64-bit builds (the author notes */
    /* no noticeable difference in 32-bit builds).                     */
    volatile float result1 = 0;
    startTiming();
    for (int i = 0; i < testloop; i++)
    {
        result1 = SqrtfloatV1(buf, BUFSIZE);
    }
    interval = stopTiming();
    printf("SqrtfloatV1:\t%f,\t%lfms\n", result1, interval);

    volatile double result2 = 0;
    startTiming();
    for (int i = 0; i < testloop; i++)
    {
        result2 = SqrtdoubleV1(buf2, BUFSIZE);
    }
    interval = stopTiming();
    printf("SqrtdoubleV1:\t%lf,\t%lfms\n", result2, interval);

    /*-----------------------------------------------------------------*/
    /* Float-to-integer conversion cost (the author notes a noticeable */
    /* difference in both 32-bit and 64-bit builds).                   */
    /* The loop bodies and report lines were empty in the listing; the */
    /* calls below follow the pattern of the other four measurements.  */
    volatile float result3 = 0;
    startTiming();
    for (int i = 0; i < testloop; i++)
    {
        result3 = AddfloatV1(buf, BUFSIZE);
    }
    interval = stopTiming();
    printf("AddfloatV1:\t%f,\t%lfms\n", result3, interval);

    volatile float result4 = 0;
    startTiming();
    for (int i = 0; i < testloop; i++)
    {
        result4 = AddfloatV2(buf, BUFSIZE);
    }
    interval = stopTiming();
    printf("AddfloatV2:\t%f,\t%lfms\n", result4, interval);

    /*-----------------------------------------------------------------*/
    /* Mixing float and double in 64-bit builds (the author notes no   */
    /* difference in 32-bit builds, where both are handled uniformly). */
    volatile float result5 = 0;
    startTiming();
    for (int i = 0; i < testloop; i++)
    {
        result5 = MulfloatV1(buf, BUFSIZE);
    }
    interval = stopTiming();
    printf("MulfloatV1:\t%f,\t%lfms\n", result5, interval);

    volatile float result6 = 0;
    startTiming();
    for (int i = 0; i < testloop; i++)
    {
        result6 = MulfloatV2(buf, BUFSIZE);
    }
    interval = stopTiming();
    printf("MulfloatV2:\t%f,\t%lfms\n", result6, interval);

    return 0;
}

float SqrtfloatV1(float *A,const int len)
{
float fSum=;
for (int i=;i<len;i++)
{
fSum+=sqrt(A[i]);
}
return fSum;
}

double SqrtdoubleV1(double *A,const int len)
{
double dSum=;
for(int i=;i<len;i++)
{
dSum+=sqrt(A[i]);
}
return dSum;
}

/*
 * Sums A[0..len-1] through an int accumulator and returns it as float.
 *
 * Each iteration evaluates (float)iSum + A[i] and truncates the result
 * toward zero when it is stored back into the int, so there is one
 * float-to-int conversion per element -- the operation this benchmark
 * is meant to time. Fractional parts of the running sum are discarded
 * at every step. Returns 0 when len <= 0.
 */
float AddfloatV1(float *A,const int len)
{
    int iSum = 0;
    for (int i = 0; i < len; i++)
    {
        iSum += A[i]; /* float sum truncated back to int on every pass */
    }
    return (float)iSum;
}

/*
 * Returns the single-precision sum of A[0..len-1], accumulated in float
 * with no integer conversion. Returns 0 when len <= 0.
 */
float AddfloatV2(float *A,const int len)
{
    float fSum = 0.0f;
    for (int i = 0; i < len; i++)
    {
        fSum += A[i];
    }
    return fSum;
}

/*
 * Returns the sum of A[i] * 1.2f over A[0..len-1].
 * The constant carries the `f` suffix, so each product is computed in
 * float with no float/double mixing. Returns 0 when len <= 0.
 */
float MulfloatV1(float *A,const int len)
{
    float fSum = 0.0f;
    for (int i = 0; i < len; i++)
    {
        fSum += A[i] * 1.2f;
    }
    return fSum;
}

float MulfloatV2(float *A,const int len)
{
float fSum=0;
for (int i=0;i<len;i++)
{
fSum+=A[i]*1.2;//默认的浮点常数是double
}
return fSum;
}```

