The routine below implements memset using the MMX registers. It compares favorably with the run-time library's memset. The destination pointer is checked for 8-byte alignment and a short byte loop adjusts it where necessary.
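For reference, the same strategy reads roughly like this in portable C; a minimal sketch, assuming plain 4-byte stores stand in for the 8-byte MMX stores (the name mem_set_sketch is mine):

#include <stddef.h>
#include <stdint.h>

void mem_set_sketch(void *dest, int set, size_t n)
{
    unsigned char *p = dest;
    unsigned char b = (unsigned char)set;
    uint32_t pattern = b * 0x01010101u; // fill byte in all four lanes

    if (n >= 16) {
        while ((uintptr_t)p & 3) { // store bytes until p is aligned
            *p++ = b;
            n--;
        }
        while (n >= 4) { // aligned 4-byte stores
            *(uint32_t *)p = pattern;
            p += 4;
            n -= 4;
        }
    }
    while (n--) // remaining tail bytes
        *p++ = b;
}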
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <intrinsics.h>
void __declspec(naked) mem_set(void * dest, int set, size_t nBytes)
{
_asm("pushl %edi");
_asm("pushl %ebx");
_asm("movl 12(%esp),%edi"); // dest
_asm("movl 16(%esp),%ebx"); // get the set value
// if nBytes is less than 16 (which also covers zero) go
// straight to the final byte loop
_asm("cmpl $16, 20(%esp)"); // 20(%esp) = nBytes
_asm("jb L8check"); // unsigned compare, since nBytes is a size_t
_asm("andl $0x000000ff,%ebx"); // mask out unwanted bits (if any)
// fill all four bytes of ebx with the value by shifting and or-ing
_asm("movl %ebx,%ecx");
_asm("shll $8,%ebx");
// shift and or so that ebx
_asm("orl %ecx,%ebx"); // is filled with the set byte
_asm("movl %ebx,%ecx");
_asm("shll $16,%ebx");
_asm("orl %ecx,%ebx");
_asm("movl 20(%esp),%eax");
_asm("movl %ebx,16(%esp)"); // store at original place
_asm("movl %ebx,20(%esp)"); // store at original place + 4
_asm("movq 16(%esp),%mm0"); // all eight bytes from memory into mm0
_asm("movl %eax,20(%esp)"); // restore stack value
// align dest on an 8-byte boundary; misaligned movq stores
// seriously hurt performance
_asm("movl %edi,%ecx");
_asm("andl $7, %ecx");
_asm("jpo L64check");
// jump if no parity
_asm("movl $8,%eax");
_asm("subl %ecx,%eax");
_asm("movl %eax,%ecx");
_asm("subl %ecx,20(%esp)"); // adjust total number of bytes
_asm("align:");
_asm("movb %bx, (%edi)");
_asm("inc %edi");
_asm("dec %ecx");
_asm("cmp $0,%ecx");
_asm("jne align");
_asm("L64check:");
_asm("movl 20(%esp),%ecx"); // total number of bytes
_asm("shrl $6,%ecx");
_asm("movl %ecx,%eax"); // tmp
_asm("jecxz L32check");
_asm("shll $6,%eax");
_asm("subl %eax,20(%esp)"); // new value of num bytes (orig - 64's)
_asm("L64:");
_asm("jecxz L32check"); // jump to next section if finished
_asm("dec %ecx");
_asm("movq %mm0,0(%edi)");
_asm("movq %mm0,8(%edi)");
_asm("movq %mm0,16(%edi)");
_asm("movq %mm0,24(%edi)");
_asm("movq %mm0,32(%edi)");
_asm("movq %mm0,40(%edi)");
_asm("movq %mm0,48(%edi)");
_asm("movq %mm0,56(%edi)");
_asm("addl $64,%edi"); // add 64 each loop
_asm("jmp L64");
_asm("L32check:");
_asm("movl 20(%esp),%ecx");
_asm("shrl $5,%ecx");
_asm("movl %ecx,%eax");
_asm("jecxz L16check");
_asm("shll $5,%eax");
_asm("subl %eax,20(%esp)");
_asm("L32:"); // can only be one 32's to copy, no loop
_asm("dec %ecx");
_asm("movq %mm0,0(%edi)");
_asm("movq %mm0,8(%edi)");
_asm("movq %mm0,16(%edi)");
_asm("movq %mm0,24(%edi)");
_asm("addl $32,%edi");
_asm("L16check:");
_asm("movl 20(%esp),%ecx");
_asm("shrl $4,%ecx");
_asm("movl %ecx,%eax");
_asm("jecxz L8check");
_asm("shll $4,%eax");
_asm("subl %eax,20(%esp)");
_asm("L16:"); // can only be one 16's to copy
_asm("dec %ecx");
_asm("movq %mm0,0(%edi)");
_asm("movq %mm0,8(%edi)");
_asm("addl $16,%edi");
_asm("L8check:"); // copy the last remaining bytes
_asm("movl 20(%esp),%ecx");
_asm("L8:");
_asm("jecxz end1");
_asm("movb %bx, (%edi)");
_asm("inc %edi");
_asm("dec %ecx");
_asm("cmp $0,%ecx");
_asm("jne L8");
_asm("end1:");
_asm("popl %ebx");
_asm("popl %edi");
_asm("emms");
_asm("ret");
}
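Before timing the routine it is worth checking it against the library memset for a range of sizes and offsets. A minimal sketch, assuming the mem_set above is in scope (the helper name, buffer size and sentinel byte are mine):

static int check(size_t n, size_t off)
{
    static char a[0x9000], b[0x9000];
    memset(a, 0xAA, sizeof a); // sentinel pattern, so overruns show up too
    memset(b, 0xAA, sizeof b);
    memset(a + off, 0x53, n);  // reference
    mem_set(b + off, 0x53, n); // routine under test
    return memcmp(a, b, sizeof a) == 0;
}

Calling check(NUM, OFF) plus a few small and misaligned cases before the benchmark loops gives some confidence that both loops are measuring the same work.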
int main(void)
{
long long t2,t1,t0;
int i;
#define NUM 0x8113
#define OFF 3
char * dest = malloc(NUM+12);
if (dest == NULL)
    return 1;
memset(dest, 0x3456, NUM); // only the low byte (0x56) is stored
t2 = 0;
for(i = 0; i<10000; i++){
t0 = _rdtsc();
memset(dest+OFF, 0x6753, NUM);
t1 = _rdtsc();
t2 += t1 - t0;
}
xprintf("cycles for memset %lld\n", t2/i);
printf("%d\n", dest[NUM-OFF]);
memset(dest, 0, NUM);
t2 = 0;
for(i = 0; i<10000; i++){
t0 = _rdtsc();
mem_set(dest+OFF, 0x6753, NUM);
t1 = _rdtsc();
t2 += t1 - t0;
}
xprintf("cycles for mem_set %lld\n", t2/i);
printf("%d\n", dest[NUM-OFF]);
free(dest);
return 0;
}