Assembler with LCC-Win32 (2 bugs fixed 9th June 2004)
The routine below performs a memory copy using MMX registers. It compares favorably with the run-time library's memcpy. The alignment of the source pointer is checked and adjusted where necessary.
#include <string.h>
#include <malloc.h>
#include <stdio.h>
#include <intrinsics.h>
void __declspec(naked) mem_cpy(void * dest, void * src, size_t nBytes)
{
_asm("pushl %edi");
_asm("pushl %esi");
_asm("movl 12(%esp),%edi");
// dest
_asm("movl 16(%esp),%esi");
// src
// if num bytes is very small go straight to the last section
// this will also handle zero num bytes
_asm("cmpl $16, 20(%esp)");
_asm("jl l8check");
// align src on 8 boundary. If memory is non-aligned it will seriously
// affect performance, one cannot align both src & dest
_asm("movl %esi,%ecx");
_asm("cld");
_asm("andl $7, %ecx");
_asm("jpe l64check"); // jump if even parity
_asm("movl $8,%eax");
_asm("subl %ecx,%eax");
_asm("movl %eax,%ecx");
_asm("rep");
_asm("movsb");
// do the copy
_asm("subl %eax,20(%esp)"); // new value of num bytes [orig - align bytes]
_asm("l64check:");
_asm("movl 20(%esp),%ecx");
// total number of bytes
_asm("shrl $6,%ecx");
_asm("movl %ecx,%eax"); // tmp
_asm("jecxz l32check");
_asm("shll $6,%eax");
_asm("subl %eax,20(%esp)");
// new value of num bytes (orig - 64's)