Earlier we looked at using asm when compiling with the optimise switch on, this required a different approach to gaining access to a functions passed arguments than when compiling normally.
When using the 'declare special' modifier [naked] it is also necessary to arrange things slightly differently. As with optimised code, a naked function finds its first argument at offset 4(%esp); the further difference is that the compiler generates no prologue or epilogue code.
Normally this would mean you would not have a working function, because no arguments could be accessed and nothing could be returned to the calling function. However, because we are working in assembler we can do it ourselves. The function below uses the naked keyword in its declaration -- not only are the args fetched relative to ESP, but we also have to explicitly use the instruction 'ret'. Without the _asm("ret") instruction the program flow will be lost and a probable blue screen will be the result.
The function discovers if a floating point value has its sign bit set, if yes it returns 1, if not it returns zero. The value is first loaded onto the FP stack then the fxam instruction is used which examines the value and sets various bits in the FP status register.
The FP status register is stored in AX with fstsw, bit 9 is tested and if there is no carry, jump to label nosign1 where EAX is cleared before being returned. If there is a carry bit, movl $1 into EAX and jump to exit1 where the FP stack is pop-ed and then 'ret' returns us to the calling function which receives the value in EAX.
#include <stdio.h>
int __declspec(naked) sign_bit(double a)
{
    // Reports whether the sign bit of `a` is set: returns 1 if it is,
    // 0 if it is not. The function is naked, so the argument is read
    // straight from 4(%esp) and the explicit 'ret' below is mandatory.
    _asm("fldl 4(%esp)");   // push a onto the FP stack
    _asm("fxam");           // examine st(0); the sign lands in status bit 9
    _asm("fstsw %ax");      // AX = FP status word
    _asm("fstp %st(0)");    // pop a again; the status is already safe in AX
    _asm("shrl $9, %eax");  // bring bit 9 down to bit 0
    _asm("andl $1, %eax");  // discard every other bit: EAX is now 0 or 1
    _asm("ret");            // result is returned in EAX
}
int main(void)
{
    // Run sign_bit() on a negative sample and print what it returns.
    const double sample = -2.345;
    int negative = sign_bit(sample);

    printf("%d\n", negative);
    return 0;
}
As there is no epilogue or prologue code generated by the compiler with __declspec(naked) there is no change between optimised and not optimised so it's the best option for asm routines for use with LCC-Win32. A few things need to be taken care of by the programmer however.
For a simple routine one can just pass parameters from the stack as shown above.
int __declspec(naked) sign_bit(double a)
{
_asm("fldl 4(%esp)");
For more demanding routines that require local variables one needs to set up one's own stack frame. A typical way is to push the frame pointer (EBP) onto the stack, copy the current stack pointer (ESP) into EBP, and then subtract from ESP the number of bytes required for the local variables. Then push the registers the routine is going to use, so that they can be restored before returning.
_asm("pushl
%ebp"); // EBP onto the stack
_asm("movl %esp,%ebp"); // copy ESP into EBP
_asm("subl $20,%esp"); // make some room for 5 variables on the stack
_asm("pushl %ebx"); // save regs
_asm("pushl %esi");
_asm("pushl %edi");
One can now store values on the stack by mov value, x(%esp)
Before exiting the routine one must clean up properly.
Always cleanup in reverse order
_asm("popl
%edi"); // restore regs
_asm("popl %esi");
_asm("popl %ebx");
_asm("addl $20,%esp"); // clean up stack
_asm("popl %ebp"); // and pop EBP
_asm("ret");
This approach can be seen in the cipher code (which has been updated since its original posting) listed on the main assembler page.
If your asm routine does many jumps (loops) some advantage can be gained by aligning labels on an even boundary. For example, open the debugger's 'machine instructions' window and look at the label's address - it might be 4014A1
The label is not an instruction but only a place marker. The jump address in the following example is where the mov instruction is, which could be on an odd boundary.
The only way at present to effect alignment with lcc-win32 asm code is to pad the instructions with a nop (no operation) instruction. The nop instruction actually takes one cycle of the CPU's time, so adding more than one of them to align on a 4-byte boundary is counterproductive. Using one nop to move the instruction from an odd to an even address does make a difference. With the example below, when the mov (or any other) instruction falls on an odd boundary, try adding one nop and see the difference.
_asm("stl2:");
_asm("nop");                // one byte of padding
_asm("movl (%ecx),%eax");   // new address on an even boundary
Although for efficiency one should pass a pointer to a structure you may one day need to pass the whole structure. The example below uses __declspec(naked) so that we are in control of the stack frame and as with normal variables, (their value is passed on the stack) a structure is passed on the stack.
#include <stdio.h>
// Five-int aggregate used by bigone() to demonstrate passing and
// returning a whole structure by value.
typedef struct{
int a;
int b;
int c;
int d;
int e;
}STRUCT;
STRUCT __declspec(naked) bigone(STRUCT st)
{
// Receives a STRUCT by value, copies it into a local frame, overwrites
// all five members there and copies the result to the return slot whose
// address is found at 8(%ebp).
//
// Frame layout once the prologue has run:
//   12(%ebp) .. 28(%ebp)   the STRUCT passed by the caller (5 ints)
//    8(%ebp)               address the returned STRUCT is written to
//    4(%ebp)               return address
//    0(%ebp)               saved EBP
//  -20(%ebp) .. -4(%ebp)   local copy of the STRUCT
//  -24(%ebp), -28(%ebp)    saved ESI, saved EDI
//
// The 20 bytes for the local copy are reserved BEFORE ESI and EDI are
// pushed. Without that reservation the local copy would occupy the same
// stack slots as the saved registers and the caller's ESI/EDI would come
// back overwritten.
_asm("pushl %ebp");             // save
_asm("movl %esp, %ebp");        // copy stack pointer to frame pointer
_asm("subl $20, %esp");         // reserve room for 5 ints
_asm("pushl %esi");             // save
_asm("pushl %edi");             // save
_asm("leal -20(%ebp),%edi");    // destination: our local copy
_asm("leal 12(%ebp),%esi");     // source: the STRUCT passed by the caller
_asm("movl $5,%ecx");           // number of ints to copy
_asm("rep");
_asm("movsl");                  // copy 5 ints from the arguments to the frame
_asm("movl $17, -4(%ebp)");     // 5th member of STRUCT (e)
_asm("movl $27, -8(%ebp)");     // 4th member of STRUCT (d)
_asm("movl $37, -12(%ebp)");    // 3rd member of STRUCT (c)
_asm("movl $47, -16(%ebp)");    // 2nd member of STRUCT (b)
_asm("movl $57, -20(%ebp)");    // 1st member of STRUCT (a)
_asm("movl 8(%ebp),%edi");      // destination: the return slot
_asm("leal -20(%ebp),%esi");    // source: our local copy
_asm("movl $5,%ecx");           // 5 ints to copy
_asm("rep");
_asm("movsl");                  // do it
_asm("popl %edi");              // clean up in reverse order
_asm("popl %esi");
_asm("addl $20, %esp");         // release the local copy
_asm("popl %ebp");
_asm("ret");
}
int main(void)
{
    // Pass a STRUCT by value and print the STRUCT that bigone() returns.
    STRUCT input = { 3, 4, 5, 6, 7};
    STRUCT result;

    result = bigone(input);
    printf ("%d %d %d %d %d \n", result.a, result.b, result.c, result.d, result.e);
    return 0;
}
First EBP is pushed and then the stack pointer (ESP) is copied to EBP.
ESI & EDI are also pushed as they will be used to copy the struct from the calling stack to our frame stack using the string instruction movsl.
The instruction leal computes the address of EBP -20 and loads that address into EDI, the next leal instruction computes the address of EBP + 12.
EBP -20 is our stack frame used to store the struct, EBP + 12 holds the address where the passed struct resides. Mov 5 into ECX so that we copy 5 ints and the instructions rep & movsl will do the copying.
Then a few movl instructions change the values in the struct members, next we have to copy the struct back to the return stack. The address for the return stack is at 8(%ebp) and our stack is at -20(%ebp), rep & movsl will do the copying.
Look in the debug CPU windows to see how the stack changes as you step through each assembler line.
A naked function can also return a 64-bit value. The following example shows this by adding two long longs and returning the 8-byte integer in EAX (low part) and EDX (high part).
#include <stdio.h>
long long __declspec(naked)
add_ll(long long x, long long y)
{
// Returns x + y. The function is naked, so the arguments are read
// relative to ESP: x occupies 4(%esp)/8(%esp) and y 12(%esp)/16(%esp),
// low part first. The 64-bit result is returned in EAX (low part) and
// EDX (high part).
//
// The carry out of the low-part addition is fed into the high-part
// addition by 'adcl', so no compare, branch or label is needed. The
// 'movl' between the loads does not touch the flags, but 'adcl' must
// directly follow 'addl' with no flag-changing instruction in between.
_asm("movl 4(%esp), %eax");     // low part of x
_asm("movl 8(%esp), %edx");     // high part of x
_asm("addl 12(%esp), %eax");    // add the two low parts, carry flag = overflow
_asm("adcl 16(%esp), %edx");    // add the two high parts plus the carry
_asm("ret");                    // combined answer in EAX & EDX
}
int main(void)
{
// Adds two values whose low parts overflow 32 bits, so the carry into
// the high part is exercised; the expected output is 0000004110000002.
long long a = 0x0000002080000001L, b = 0x0000002090000001L;
printf("%.16llx\n", add_ll(a, b));
return 0;
}
Data can also be stored in an assembler data segment. This is not necessary in a C program, but as it can be done, here it is; you may also need to do it with an assembler like NASM one day. The data is stored at the end of this listing, just below the code. Place these data segments outside of functions, otherwise it will not work.
#include <stdio.h>
char * __declspec(naked) getstring(void)
{
    // Returns the address of the string stored at label d3.
    _asm("movl $d3, %eax");     // EAX = address of d3
    _asm("ret");
}
long __declspec(naked) getlong(void)
{
    // Returns the 32-bit value stored at label d1.
    _asm("movl d1, %eax");      // EAX = contents of d1
    _asm("ret");
}
short __declspec(naked) getshort(void)
{
// Returns the 16-bit value stored at label d2.
// The load is a sign-extending 16-bit read, so exactly two bytes are
// fetched from d2 and the whole of EAX holds a well-defined value.
_asm("leal d2, %ecx");          // ECX = address of d2
_asm("movswl (%ecx), %eax");    // EAX = sign-extended 16-bit value at d2
_asm("ret");
}
int main(void)
{
    // Fetch each item from the assembler data segment, then print them.
    char *text = getstring();
    long wide = getlong();
    short narrow = getshort();

    printf("%s\n", text);
    printf("%x\n", wide);
    printf("%x\n", narrow);
    return 0;
}
// Here is the data stored as you would in an assembler.
// These statements must stay outside of any function.
_asm(".data");              // marks the data area
_asm(".align 2");           // padding that ensures an even boundary for the data
_asm("d1:");                // label that enables accessing the data
_asm(".long 0x3409820");    // 32-bit value read by getlong()
_asm(".align 2");
_asm("d2:");
_asm(".short 0x20");        // 16-bit value read by getshort()
_asm("d3:");
_asm(".byte 83,111,109,109,101,114,116,105,109,101,0"); // null terminated string "Sommertime"
You can copy memory faster than the run-time library's memcpy() if your memory size is a multiple of 64 with the following routine. MMX registers are 8 bytes wide, but one cannot copy from memory to memory: first load 8 bytes into an MMX register and then store them in memory from that register.
The asm instruction LEA can be used for fast additions + multiply.
_asm("movl 4(%esp), %edx");         // base into edx
_asm("movl 8(%esp), %ecx");         // index into ecx
_asm("leal (%edx,%ecx,4), %eax");   // base + (index * 4) into eax
This is particularly useful when calculating offsets for arrays but can equally be used for normal calculations. LEA (load effective address) does not access memory it only calculates.
Here are some examples.
_asm("leal (%eax,%eax,2),%eax"); // eax + (2*eax) = 3*eax
_asm("leal (%eax,%eax,8),%eax"); // eax + (8*eax) = 9*eax
_asm("leal (%ecx,%eax,8),%eax"); // ecx + (8*eax)
Using this method to multiply numbers is much faster than using MUL.
In primer1 I showed certain differences between the Intel syntax and AT&T's syntax, here are some more instructions that show the differences between the two.
Intel Syntax | AT&T Syntax
inst [base+index*scale+disp] | inst disp(base,index,scale)
mov eax,[ebx+20h] | movl 0x20(%ebx),%eax
add eax,[ebx+ecx*2h] | addl (%ebx,%ecx,0x2),%eax
lea eax,[ebx+ecx] | leal (%ebx,%ecx),%eax
sub eax,[ebx+ecx*4h-20h] | subl -0x20(%ebx,%ecx,0x4),%eax
As of the 7th Sep 2002 one can compile assembler as an *.asm file with LCC-Win32.
Copy these two functions (see below) and create a file called something.asm. Add the file to your project and compile - you of course will need the prototypes somewhere the compiler can see them.
Unfortunately the debugger does not extend to working with asm files yet, maybe one day it will.
Note: Comments at the top of the code can be created with one semi-colon, comments within the body of code need two semi-colons.
;-------------------------------------------------------
;
; Prototypes for mem_cpy & mem_set
;
; void mem_cpy(void * dest, void * src, size_t nBytes);
; void mem_set(void * dest, int set, size_t nBytes);
;
;-------------------------------------------------------
;-------------------------------------------------------
; void mem_cpy(void * dest, void * src, size_t nBytes);
;
; Copies nBytes bytes from src to dest.
; After the two pushes below the arguments are found at
;   12(%esp) = dest, 16(%esp) = src, 20(%esp) = nBytes
; and 20(%esp) is used as the running count of bytes
; still to be copied.
; Uses EAX, ECX, mm0-mm7; saves and restores ESI, EDI.
;-------------------------------------------------------
.type _mem_cpy,function
_mem_cpy:
pushl %edi
pushl %esi
movl 12(%esp),%edi ;; dest
movl 16(%esp),%esi ;; src
cld ;; every rep movsb below has to copy upwards
cmpl $16,20(%esp)
jb l8check ;; unsigned compare, nBytes is a size_t
movl %esi,%ecx
andl $7, %ecx ;; offset of src inside its 8 byte block
jz l64check ;; src is already 8 byte aligned
movl $8,%eax
subl %ecx,%eax
movl %eax,%ecx ;; 1..7 bytes bring src up to alignment
subl %ecx,20(%esp) ;; and they come off the total
rep
movsb
l64check:
movl 20(%esp),%ecx
shrl $6,%ecx ;; number of whole 64 byte blocks
movl %ecx,%eax
jecxz l32check
shll $6,%eax
subl %eax,20(%esp) ;; bytes left after the 64 byte blocks
l64:
jecxz l32check
decl %ecx
movq 0(%esi),%mm0
movq 8(%esi),%mm1
movq 16(%esi),%mm2
movq 24(%esi),%mm3
movq 32(%esi),%mm4
movq 40(%esi),%mm5
movq 48(%esi),%mm6
movq 56(%esi),%mm7
movq %mm0,0(%edi)
movq %mm1,8(%edi)
movq %mm2,16(%edi)
movq %mm3,24(%edi)
movq %mm4,32(%edi)
movq %mm5,40(%edi)
movq %mm6,48(%edi)
movq %mm7,56(%edi)
addl $64,%esi
addl $64,%edi
jmp l64
l32check:
movl 20(%esp),%ecx ;; fewer than 64 bytes are left here
shrl $5,%ecx
movl %ecx,%eax
jecxz l16check
shll $5,%eax
subl %eax,20(%esp)
l32:
movq 0(%esi),%mm0 ;; at most one 32 byte block
movq 8(%esi),%mm1
movq 16(%esi),%mm2
movq 24(%esi),%mm3
movq %mm0,0(%edi)
movq %mm1,8(%edi)
movq %mm2,16(%edi)
movq %mm3,24(%edi)
addl $32,%esi
addl $32,%edi
l16check:
movl 20(%esp),%ecx ;; fewer than 32 bytes are left here
shrl $4,%ecx
movl %ecx,%eax
jecxz l8check
shll $4,%eax
subl %eax,20(%esp)
l16:
movq 0(%esi),%mm0 ;; at most one 16 byte block
movq 8(%esi),%mm1
movq %mm0,0(%edi)
movq %mm1,8(%edi)
addl $16,%esi
addl $16,%edi
l8check:
movl 20(%esp),%ecx ;; whatever is left goes byte by byte
jecxz end
rep
movsb
end:
popl %esi
popl %edi
emms ;; leave the FPU usable after the MMX moves
ret
.globl _mem_cpy
;----------------------------------------------------
; void mem_set(void * dest, int set, size_t nBytes);
;
; Stores the low byte of set into nBytes bytes at dest.
; After the two pushes below the arguments are found at
;   12(%esp) = dest, 16(%esp) = set, 20(%esp) = nBytes
; and 20(%esp) is used as the running count of bytes
; still to be written. The argument slots at 16(%esp)
; and 20(%esp) are borrowed briefly to build mm0.
; Uses EAX, ECX, mm0; saves and restores EDI, EBX.
;----------------------------------------------------
.type _mem_set,function
_mem_set:
pushl %edi
pushl %ebx
movl 12(%esp),%edi ;; dest
movl 16(%esp),%ebx ;; set, only BL is ever stored
cmpl $16, 20(%esp)
jb L8check ;; unsigned compare, nBytes is a size_t
andl $0x000000ff,%ebx
movl %ebx,%ecx
shll $8,%ebx
orl %ecx,%ebx ;; fill byte now in the low 2 bytes
movl %ebx,%ecx
shll $16,%ebx
orl %ecx,%ebx ;; fill byte now in all 4 bytes
movl 20(%esp),%eax ;; park nBytes while its slot is borrowed
movl %ebx,16(%esp)
movl %ebx,20(%esp)
movq 16(%esp),%mm0 ;; mm0 = 8 copies of the fill byte
movl %eax,20(%esp) ;; put nBytes back
movl %edi,%ecx
andl $7, %ecx ;; offset of dest inside its 8 byte block
jz L64check ;; dest is already 8 byte aligned
movl $8,%eax
subl %ecx,%eax
movl %eax,%ecx ;; 1..7 bytes bring dest up to alignment
subl %ecx,20(%esp) ;; adjust total number of bytes
Lalign:
movb %bl, (%edi)
incl %edi
decl %ecx
jnz Lalign
L64check:
movl 20(%esp),%ecx
shrl $6,%ecx ;; number of whole 64 byte blocks
movl %ecx,%eax
jecxz L32check
shll $6,%eax
subl %eax,20(%esp) ;; bytes left after the 64 byte blocks
L64:
jecxz L32check
decl %ecx
movq %mm0,0(%edi)
movq %mm0,8(%edi)
movq %mm0,16(%edi)
movq %mm0,24(%edi)
movq %mm0,32(%edi)
movq %mm0,40(%edi)
movq %mm0,48(%edi)
movq %mm0,56(%edi)
addl $64,%edi
jmp L64
L32check:
movl 20(%esp),%ecx ;; fewer than 64 bytes are left here
shrl $5,%ecx
movl %ecx,%eax
jecxz L16check
shll $5,%eax
subl %eax,20(%esp)
L32:
movq %mm0,0(%edi) ;; at most one 32 byte block
movq %mm0,8(%edi)
movq %mm0,16(%edi)
movq %mm0,24(%edi)
addl $32,%edi
L16check:
movl 20(%esp),%ecx ;; fewer than 32 bytes are left here
shrl $4,%ecx
movl %ecx,%eax
jecxz L8check
shll $4,%eax
subl %eax,20(%esp)
L16:
movq %mm0,0(%edi) ;; at most one 16 byte block
movq %mm0,8(%edi)
addl $16,%edi
L8check:
movl 20(%esp),%ecx ;; whatever is left goes byte by byte
jecxz end1
L8:
movb %bl, (%edi)
incl %edi
decl %ecx
jnz L8
end1:
popl %ebx
popl %edi
emms ;; leave the FPU usable after the MMX moves
ret
.globl _mem_set