The routine below implements memset using the MMX registers. It compares favorably with the run-time library's memset. The destination pointer is checked for 8-byte alignment and a short byte loop adjusts it where necessary.
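For reference, the same strategy reads roughly like this in portable C; a minimal sketch, assuming plain 4-byte stores stand in for the 8-byte MMX stores (the name mem_set_sketch is mine):

#include <stddef.h>
#include <stdint.h>

void mem_set_sketch(void *dest, int set, size_t n)
{
    unsigned char *p = dest;
    unsigned char b = (unsigned char)set;
    uint32_t pattern = b * 0x01010101u; // fill byte in all four lanes

    if (n >= 16) {
        while ((uintptr_t)p & 3) { // store bytes until p is aligned
            *p++ = b;
            n--;
        }
        while (n >= 4) { // aligned 4-byte stores
            *(uint32_t *)p = pattern;
            p += 4;
            n -= 4;
        }
    }
    while (n--) // remaining tail bytes
        *p++ = b;
}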
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <intrinsics.h>
void __declspec(naked) mem_set(void * dest, int set, size_t nBytes)
{
_asm("pushl %edi");
_asm("pushl %ebx");
_asm("movl 12(%esp),%edi"); // dest
_asm("movl 16(%esp),%ebx"); // get the set value
// if nBytes is less than 16 (which also covers zero) go
// straight to the final byte loop
_asm("cmpl $16, 20(%esp)"); // 20(%esp) = nBytes
_asm("jb L8check"); // unsigned compare, since nBytes is a size_t
_asm("andl $0x000000ff,%ebx"); // mask out unwanted bits (if any)
// fill all four bytes of ebx with the value by shifting and or-ing
_asm("movl %ebx,%ecx");
_asm("shll $8,%ebx");
// shift and or so that ebx
_asm("orl %ecx,%ebx"); // is filled with the set byte
_asm("movl %ebx,%ecx");
_asm("shll $16,%ebx");
_asm("orl %ecx,%ebx");
_asm("movl 20(%esp),%eax");
_asm("movl %ebx,16(%esp)"); // store at original place
_asm("movl %ebx,20(%esp)"); // store at original place + 4
_asm("movq 16(%esp),%mm0"); // all eight bytes from memory into mm0
_asm("movl %eax,20(%esp)"); // restore stack value
// align dest on an 8-byte boundary; misaligned movq stores
// seriously hurt performance
_asm("movl %edi,%ecx");
_asm("andl $7, %ecx");
_asm("jpo L64check");
// jump if no parity
_asm("movl $8,%eax");
_asm("subl %ecx,%eax");
_asm("movl %eax,%ecx");
_asm("subl %ecx,20(%esp)"); // adjust total number of bytes
_asm("align:");
_asm("movb %bx, (%edi)");
_asm("inc %edi");
_asm("dec %ecx");
_asm("cmp $0,%ecx");
_asm("jne align");
_asm("L64check:");
_asm("movl 20(%esp),%ecx"); // total number of bytes
_asm("shrl $6,%ecx");
_asm("movl %ecx,%eax"); // tmp
_asm("jecxz L32check");
_asm("shll $6,%eax");
_asm("subl %eax,20(%esp)"); // new value of num bytes (orig - 64's)
_asm("L64:");
_asm("jecxz L32check"); // jump to next section if finished
_asm("dec %ecx");
_asm("movq %mm0,0(%edi)");
_asm("movq %mm0,8(%edi)");
_asm("movq %mm0,16(%edi)");
_asm("movq %mm0,24(%edi)");
_asm("movq %mm0,32(%edi)");
_asm("movq %mm0,40(%edi)");
_asm("movq %mm0,48(%edi)");
_asm("movq %mm0,56(%edi)");
_asm("addl $64,%edi"); // add 64 each loop
_asm("jmp L64");
_asm("L32check:");
_asm("movl 20(%esp),%ecx");
_asm("shrl $5,%ecx");
_asm("movl %ecx,%eax");
_asm("jecxz L16check");
_asm("shll $5,%eax");
_asm("subl %eax,20(%esp)");
_asm("L32:"); // can only be one 32's to copy, no loop
_asm("dec %ecx");
_asm("movq %mm0,0(%edi)");
_asm("movq %mm0,8(%edi)");
_asm("movq %mm0,16(%edi)");
_asm("movq %mm0,24(%edi)");
_asm("addl $32,%edi");
_asm("L16check:");
_asm("movl 20(%esp),%ecx");
_asm("shrl $4,%ecx");
_asm("movl %ecx,%eax");
_asm("jecxz L8check");
_asm("shll $4,%eax");
_asm("subl %eax,20(%esp)");
_asm("L16:"); // can only be one 16's to copy
_asm("dec %ecx");
_asm("movq %mm0,0(%edi)");
_asm("movq %mm0,8(%edi)");
_asm("addl $16,%edi");
_asm("L8check:"); // copy the last remaining bytes
_asm("movl 20(%esp),%ecx");
_asm("L8:");
_asm("jecxz end1");
_asm("movb %bx, (%edi)");
_asm("inc %edi");
_asm("dec %ecx");
_asm("cmp $0,%ecx");
_asm("jne L8");
_asm("end1:");
_asm("popl %ebx");
_asm("popl %edi");
_asm("emms");
_asm("ret");
}
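Before timing the routine it is worth checking it against the library memset for a range of sizes and offsets. A minimal sketch, assuming the mem_set above is in scope (the helper name, buffer size and sentinel byte are mine):

static int check(size_t n, size_t off)
{
    static char a[0x9000], b[0x9000];
    memset(a, 0xAA, sizeof a); // sentinel pattern, so overruns show up too
    memset(b, 0xAA, sizeof b);
    memset(a + off, 0x53, n);  // reference
    mem_set(b + off, 0x53, n); // routine under test
    return memcmp(a, b, sizeof a) == 0;
}

Calling check(NUM, OFF) plus a few small and misaligned cases before the benchmark loops gives some confidence that both loops are measuring the same work.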
int main(void)
{
long long t2,t1,t0;
int i;
#define NUM 0x8113
#define OFF 3
char * dest = malloc(NUM+12);
if (dest == NULL)
    return 1;
memset(dest, 0x3456, NUM); // only the low byte (0x56) is stored
t2 = 0;
for(i = 0; i<10000; i++){
t0 = _rdtsc();
memset(dest+OFF, 0x6753, NUM);
t1 = _rdtsc();
t2 += t1 - t0;
}
xprintf("cycles for memset %lld\n", t2/i);
printf("%d\n", dest[NUM-OFF]);
memset(dest, 0, NUM);
t2 = 0;
for(i = 0; i<10000; i++){
t0 = _rdtsc();
mem_set(dest+OFF, 0x6753, NUM);
t1 = _rdtsc();
t2 += t1 - t0;
}
xprintf("cycles for mem_set %lld\n", t2/i);
printf("%d\n", dest[NUM-OFF]);
free(dest);
return 0;
}