#include <stdio.h>
/*
 * mmx_packed_add - add two byte buffers element-wise with the MMX PADDB
 * instruction, storing the sums back into the second buffer.
 *
 * For every byte in the first 8*n bytes: b[i] = (b[i] + a[i]) mod 256.
 * PADDB treats each 64-bit operand as a vector of eight independent bytes
 * and adds them lane by lane with wrap-around (no saturation, no carry
 * between lanes).
 *
 *   a  source bytes (only read); must provide at least 8*n bytes
 *   b  destination bytes, updated in place; must provide at least 8*n bytes
 *   n  number of 8-byte groups (quadwords) to process; a value <= 0 is a
 *      no-op. Bytes beyond the last full group are not touched, so the
 *      caller must handle any tail shorter than 8 bytes.
 *
 * The function is naked: the compiler emits no prologue/epilogue, so the
 * arguments are fetched relative to %esp (return address at 0(%esp)) and
 * the body must issue its own `ret`. The plain `ret` leaves the arguments
 * on the stack for the caller to remove. Registers written: %eax, %ecx,
 * %edx, %mm0, %mm1.
 *
 * The assembly strings use AT&T operand order: source first, destination
 * second.
 */
void __declspec(naked) mmx_packed_add(unsigned char * a, unsigned char * b, int n)
{
    _asm("movl 4(%esp),%eax");   // a: source address
    _asm("movl 8(%esp),%edx");   // b: destination address
    _asm("movl 12(%esp),%ecx");  // n: number of 8-byte groups
    _asm("cmpl $0,%ecx");
    _asm("jle lexit");           // zero or negative count: nothing to do
    _asm("lpp:");
    _asm("movq (%eax),%mm0");    // load next 8 bytes of source
    _asm("movq (%edx),%mm1");    // load next 8 bytes of destination
    _asm("paddb %mm0,%mm1");     // mm1 += mm0, byte by byte, wrapping
    _asm("movq %mm1,(%edx)");    // store the sums back into destination
    // advance both addresses by one 8-byte group
    _asm("addl $8,%eax");
    _asm("addl $8,%edx");
    // decl sets ZF when the count reaches zero, so no separate compare is
    // needed as long as nothing between here and the branch alters flags
    _asm("decl %ecx");
    _asm("jne lpp");
    // The MMX registers alias the x87 register stack; emms marks them
    // empty again so floating-point code in the caller works afterwards.
    _asm("emms");
    _asm("lexit:");
    _asm("ret");
}
/*
 * Demo driver: adds a 16-byte source buffer into a 16-byte destination
 * buffer with mmx_packed_add() and prints the destination bytes as
 * space-separated decimal numbers.
 */
int main(void)
{
    /* 16 bytes per buffer, i.e. two 8-byte groups for the MMX routine. */
    unsigned char src[16] = {
        120, 120, 120, 120, 120, 120, 120, 120,
        120, 120, 120, 120, 120, 120, 120, 120
    };
    unsigned char dest[16] = {
        17, 17, 17, 17, 17, 17, 17, 17,
        17, 17, 17, 17, 17, 17, 17, 17
    };
    const unsigned char *cursor = dest;
    const unsigned char *const stop = dest + 16;

    /* The routine counts in 8-byte groups, not bytes: 16 / 8 == 2. */
    mmx_packed_add(src, dest, sizeof(src) / 8);

    /* Walk the destination buffer and print each byte followed by a space. */
    while (cursor != stop) {
        printf("%d ", *cursor);
        ++cursor;
    }

    return 0;
}