/* The routine below uses the AMD 3DNow! pfmul instruction to multiply two float vectors. */
#include <stdio.h>
// mul_vec: element-wise product of two float vectors using AMD 3DNow!.
//
// Each loop iteration loads one 8-byte quadword (two packed floats) from f1,
// multiplies it with the matching quadword of f2 via pfmul, and stores the
// product back into f2. f1 is only read.
//
//   f1 - first operand vector (read only)
//   f2 - second operand vector; overwritten with the products
//   n  - number of 8-byte quadwords (i.e. pairs of floats) to process,
//        NOT the number of floats; n <= 0 does nothing
//
// The function is naked: the compiler emits no prologue/epilogue, so the
// arguments are read straight from the stack relative to %esp and the
// routine returns with its own "ret". eax, edx, ecx and mm0 are used as
// scratch registers.
void __declspec(naked) mul_vec(float * f1, float * f2, int n)
{
_asm("movl 4(%esp),%eax");             // eax = f1 address (memory)
_asm("movl 8(%esp),%edx");             // edx = f2 address (memory)
_asm("cmpl $0,12(%esp)");              // nothing to do?
// Signed test: a negative count must also bail out, otherwise the
// "jne" loop below would run until ecx wraps around to n.
_asm("jle pfexit");
_asm("femms");                         // enter MMX/3DNow! state
_asm("xorl %ecx,%ecx");                // ecx = quadword index, starts at 0
_asm("lpf:");
_asm("movq (%eax,%ecx,8),%mm0");       // load 8 bytes (two floats) of f1
_asm("pfmul (%edx,%ecx,8),%mm0");      // mul (mem*mm0)->mm0
_asm("movq %mm0,(%edx,%ecx,8)");       // store result back in f2
_asm("incl %ecx");
_asm("cmpl 12(%esp),%ecx");            // processed all n quadwords?
_asm("jne lpf");
_asm("pfexit:");
_asm("femms");                         // leave MMX state so x87 code works again
_asm("ret");
}
/*
 * Demo driver: builds two 16-element float vectors, multiplies them with
 * mul_vec() (the products overwrite the second vector), then prints one
 * row per element: first operand, the integer product expression, result.
 */
int main(void)
{
    float lhs[16];
    float rhs[16];
    int idx = 0;
    int row;

    /* lhs = 1, 2, ..., 16 and rhs = 2, 4, ..., 32 */
    while (idx < 16) {
        int ordinal = idx + 1;

        lhs[idx] = (float)ordinal;
        rhs[idx] = (float)(2 * ordinal);
        idx++;
    }

    /* mul_vec counts 8-byte quadwords (two floats each), hence the /8. */
    mul_vec(lhs, rhs, (int)(sizeof(lhs) / 8));

    for (row = 1; row <= 16; row++) {
        printf("%f\t%d*%d\t%f\n", lhs[row - 1], row, 2 * row, rhs[row - 1]);
    }

    return 0;
}