MemCpy

_asm("l64:");
    _asm("jecxz l32check");           // jump to next section if finished
    _asm("dec %ecx");
    _asm("movq 0(%esi),%mm0");        // load each 64 bytes from source
    _asm("movq 8(%esi),%mm1");
    _asm("movq 16(%esi),%mm2");
    _asm("movq 24(%esi),%mm3");
    _asm("movq 32(%esi),%mm4");
    _asm("movq 40(%esi),%mm5");
    _asm("movq 48(%esi),%mm6");
    _asm("movq 56(%esi),%mm7");
    _asm("movq %mm0,0(%edi)");
    _asm("movq %mm1,8(%edi)");
    _asm("movq %mm2,16(%edi)");
    _asm("movq %mm3,24(%edi)");
    _asm("movq %mm4,32(%edi)");
    _asm("movq %mm5,40(%edi)");
    _asm("movq %mm6,48(%edi)");
    _asm("movq %mm7,56(%edi)");
    _asm("addl $64,%esi");
    _asm("addl $64,%edi");            // add 64 each loop to edi & esi
    _asm("jmp l64");

_asm("l32check:");
    _asm("movl 20(%esp),%ecx");
    _asm("shrl $5,%ecx");
    _asm("movl %ecx,%eax");
    _asm("jecxz l16check");
    _asm("shll $5,%eax");
    _asm("subl %eax,20(%esp)");

_asm("l32:");                         // can only be one 32's to copy, no loop
    _asm("dec %ecx");
    _asm("movq 0(%esi),%mm0");
    _asm("movq 8(%esi),%mm1");
    _asm("movq 16(%esi),%mm2");
    _asm("movq 24(%esi),%mm3");
    _asm("movq %mm0,0(%edi)");
    _asm("movq %mm1,8(%edi)");
    _asm("movq %mm2,16(%edi)");
    _asm("movq %mm3,24(%edi)");
    _asm("addl $32,%esi");
    _asm("addl $32,%edi");

_asm("l16check:");
    _asm("movl 20(%esp),%ecx");
    _asm("shrl $4,%ecx");
    _asm("movl %ecx,%eax");
    _asm("jecxz l8check");
    _asm("shll $4,%eax");
    _asm("subl %eax,20(%esp)");

_asm("l16:"); // can only be one 16's to copy
    _asm("dec %ecx");
    _asm("movq 0(%esi),%mm0");
    _asm("movq 8(%esi),%mm1");
    _asm("movq %mm0,0(%edi)");
    _asm("movq %mm1,8(%edi)");
    _asm("addl $16,%esi");
    _asm("addl $16,%edi");

_asm("l8check:");                     // copy the last remaining bytes
    _asm("movl 20(%esp),%ecx");
    _asm("jecxz end");
    _asm("rep");
    _asm("movsb");

_asm("end:");
    _asm("popl %esi");
    _asm("popl %edi");

    _asm("emms");                       // required to reset registers
    _asm("ret");
}

/*
 * Benchmark driver: runs the library memcpy and then mem_cpy 10000 times
 * each over the same pair of heap buffers, using source and destination
 * pointers offset by OFF bytes from the allocation start, and prints the
 * average _rdtsc() delta per call (reported as "cycles") followed by one
 * copied byte as a sanity check.
 *
 * Returns 0 on success, 1 if either buffer cannot be allocated.
 */
int main(void)
{
    #define NUM 0x8113
    #define OFF 3

    long long t2,t1,t0;
    char * src = malloc(NUM+12); // extra for messing offset alignment tests
    char * dest = malloc(NUM+12);
    int i;

    /* Bail out cleanly instead of dereferencing NULL in memset/memcpy. */
    if (src == NULL || dest == NULL) {
        fprintf(stderr, "out of memory\n");
        free(src);
        free(dest);
        return 1;
    }

    /*
     * Initialise the whole allocation, not just NUM bytes: the copies below
     * read src+OFF .. src+OFF+NUM-1, which extends OFF bytes past NUM.
     */
    memset(src, 3, NUM+12);
    memset(dest, 0, NUM+12);

    /* Pass 1: library memcpy. */
    t2 = 0;
    for(i = 0; i<10000; i++){
        t0 = _rdtsc();
        memcpy(dest+OFF, src+OFF, NUM);
        t1 = _rdtsc();
        t2 += t1 - t0;
    }
    /* i equals the iteration count here, so t2/i is the per-call average. */
    xprintf("cycles for memcpy %lld\n", t2/i);
    /* dest[NUM-OFF] lies inside the copied range; expect the fill value 3. */
    printf("%d\n", dest[NUM-OFF]);

    /* Clear the destination so the second check cannot pass on stale data. */
    memset(dest, 0, NUM+12);

    /* Pass 2: mem_cpy under test, same buffers and offsets. */
    t2 = 0;
    for(i = 0; i<10000; i++){
        t0 = _rdtsc();
        mem_cpy(dest+OFF, src+OFF, NUM);
        t1 = _rdtsc();
        t2 += t1 - t0;
    }
    xprintf("cycles for mem_cpy %lld\n", t2/i);
    printf("%d\n", dest[NUM-OFF]);

    free(src);
    free(dest);
    return 0;
}
}