Assembler with LCC-Win32

The routine below uses the AMD 3DNow! pfmul instruction to multiply two float vectors element by element, storing the products in the second vector.

Mul_Vec

#include <stdio.h>

// mul_vec - element-wise product of two float vectors using AMD 3DNow!.
//
// Each loop iteration handles one 8-byte chunk (two packed floats):
// it loads the chunk from f1, multiplies it by the matching chunk of f2
// and writes the products back into f2. f1 is only read.
//
//   f1  first operand vector (read only)
//   f2  second operand vector; overwritten with the products
//   n   number of 8-byte float PAIRS to process, not the number of floats
//       (main passes sizeof(f1)/8). With n == 0 no element is touched.
//
// The function is declared naked, so the body fetches its arguments
// directly from the stack at 4/8/12(%esp) and finishes with an explicit
// ret. Nothing is pushed inside the body, so those offsets stay valid
// for the whole loop.
//
// NOTE(review): the loop ends only when the counter EQUALS n, and the
// guard in front of it only rejects n == 0. A negative n would therefore
// keep iterating beyond both arrays - confirm callers never pass n < 0.
// NOTE(review): there is no check that the CPU supports 3DNow!.
void __declspec(naked) mul_vec(float * f1, float * f2, int n)
{
    _asm("movl 4(%esp),%eax");         // eax = f1 (first stack argument)
    _asm("movl 8(%esp),%edx");         // edx = f2 (second stack argument)
    _asm("cmpl $0,12(%esp)");          // n == 0 ?
    _asm("je pfexit");                 // yes: skip the loop entirely
    _asm("femms");                     // 3DNow! fast switch of the MMX/x87 state before using %mm0
    _asm("xorl %ecx,%ecx");            // ecx = 0, index of the current 8-byte pair
_asm("lpf:");
    _asm("movq (%eax,%ecx,8),%mm0");   // mm0 = the two floats at f1 + 8*ecx
    _asm("pfmul (%edx,%ecx,8),%mm0");  // mm0 *= the two floats at f2 + 8*ecx (packed multiply)
    _asm("movq %mm0,(%edx,%ecx,8)");   // write both products back to f2 + 8*ecx
    _asm("incl %ecx");                 // next pair
    _asm("cmpl 12(%esp),%ecx");        // compare the index with n (re-read from the stack)
    _asm("jne lpf");                   // loop until index == n
_asm("pfexit:");
    _asm("femms");                     // leave the MMX state again; also executed on the n == 0 path
    _asm("ret");                       // naked function: explicit return, no stack adjustment here
}

/*
 * Demo driver: fills two 16-element float vectors with k+1 and 2*(k+1),
 * multiplies them with mul_vec (products end up in the second vector),
 * then prints one line per element: first operand, the integer factors
 * as "a*b", and the computed product.
 */
int main(void)
{
    enum { VEC_LEN = 16 };
    float lhs[VEC_LEN];
    float rhs[VEC_LEN];
    int k;
    int v;

    k = 0;
    while (k < VEC_LEN) {
        v = k + 1;
        lhs[k] = (float)k + 1;
        rhs[k] = (float)2 * v;
        ++k;
    }

    /* mul_vec counts in pairs of floats (8 bytes per step), not in floats. */
    mul_vec(lhs, rhs, sizeof(lhs) / (2 * sizeof(float)));

    for (k = 0; k < VEC_LEN; ++k) {
        v = k + 1;
        printf("%f\t%d*%d\t%f\n", lhs[k], v, 2 * v, rhs[k]);
    }
    return 0;
}

Back to main page