Assembler with LCC-Win32

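Although the text below says that one can use a different number of iterations, I have found that the code as written only works with 32 iterations (the decipher routines start from a hard-coded sum of 0xC6EF3720, which is delta * 32, so that value would have to change as well).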

The assembly listing supplied was for 16 bit, so I made it 32 bit. There's no doubt the code could be made faster if one spent time on it.

It may not be obvious what one of the instructions in this code does. It is an extended MOV instruction that takes a base address, an index and a scale value. It is a very useful instruction, especially when accessing an array.

_asm("movl (%eax,%edx,4),%esi");

base,index,scale, dest

The effective address copied from is ( base + ( index * scale ) ). For an array of longs one would use a scale value of 4, for an array of shorts one would use a scale value of 2, and so on.

John

You can visit the site here http://vader.brad.ac.uk/tea/tea.shtml

Cipher

#include <stdio.h>

/************************************************

    The Tiny Encryption Algorithm (TEA) by David Wheeler and
    Roger Needham of the Cambridge Computer Laboratory.

    Placed in the Public Domain by David Wheeler and Roger Needham.

    **** ANSI C VERSION (New Variant) ****

    Notes:

    TEA is a Feistel cipher with XOR and addition as the
    non-linear mixing functions.

    Takes 64 bits of data in v[0] and v[1]. Returns 64 bits of
    data in w[0] and w[1]. Takes 128 bits of key in k[0] - k[3].

    TEA can be operated in any of the modes of DES. Cipher Block
    Chaining is, for example, simple to implement.

    n is the number of iterations. 32 is ample, 16 is sufficient,
    as few as eight may be OK. The algorithm achieves good
    dispersion after six iterations. The iteration count can be
    made variable if required.

    Note this is optimised for 32-bit CPUs with fast shift
    capabilities. It can very easily be ported to assembly
    language on most CPUs.

    delta is chosen to be the real part of (the golden ratio
    Sqrt(5/4) - 1/2 ~ 0.618034 multiplied by 2^32).

    This version has been amended to foil two weaknesses
    identified by David A. Wagner (daw@cs.berkeley.edu):
    1) effective key length of old-variant TEA was 126 not 128 bits
    2) a related key attack was possible although impractical.

************************************************/

//------------------------------------------------------
// encipher: encrypt one 64-bit block with 32 rounds of the
// new-variant TEA round function.
//
//   v - input block:  v[0] and v[1], 32 significant bits each
//   w - output block: w[0] and w[1] receive the ciphertext
//   k - 128-bit key:  k[0] .. k[3], 32 significant bits each
//
// v is read completely before w is written, so v and w may
// point at the same array.
//
// All arithmetic is reduced modulo 2^32 explicitly, so the
// result is the same whether unsigned long is 32 bits wide
// or wider; on a 32-bit unsigned long the masks are no-ops.
//------------------------------------------------------
void encipher(unsigned long * v,unsigned long * w, unsigned long * k)
{
    const unsigned long mask  = 0xFFFFFFFFUL; // keep 32 bits
    const unsigned long delta = 0x9E3779B9UL;
    unsigned long v0 = v[0] & mask;
    unsigned long v1 = v[1] & mask;
    unsigned long sum = 0;
    unsigned int round;

    for (round = 0; round < 32; round++) {
        // v0/v1 are masked before each right shift so no bits above
        // bit 31 can leak down into the 32-bit result.
        v0 = (v0 + ((((v1 << 4) ^ (v1 >> 5)) + v1) ^ (sum + k[sum & 3]))) & mask;
        sum = (sum + delta) & mask;
        v1 = (v1 + ((((v0 << 4) ^ (v0 >> 5)) + v0) ^ (sum + k[(sum >> 11) & 3]))) & mask;
    }

    w[0] = v0;
    w[1] = v1;
}

//------------------------------------------------------
// decipher: decrypt one 64-bit block, undoing the 32 rounds
// applied by encipher().
//
//   v - input block:  v[0] and v[1] hold the ciphertext
//   w - output block: w[0] and w[1] receive the plaintext
//   k - 128-bit key:  k[0] .. k[3], 32 significant bits each
//
// v is read completely before w is written, so v and w may
// point at the same array.
//
// All arithmetic is reduced modulo 2^32 explicitly, so the
// result is the same whether unsigned long is 32 bits wide
// or wider; on a 32-bit unsigned long the masks are no-ops.
//------------------------------------------------------
void decipher(unsigned long * v,unsigned long * w, unsigned long * k)
{
    const unsigned long mask  = 0xFFFFFFFFUL; // keep 32 bits
    const unsigned long delta = 0x9E3779B9UL;
    unsigned long v0 = v[0] & mask;
    unsigned long v1 = v[1] & mask;
    // Starting sum is delta * 32 (delta << 5) modulo 2^32; it must
    // match the number of rounds run below.
    unsigned long sum = 0xC6EF3720UL;
    unsigned int round;

    for (round = 0; round < 32; round++) {
        // The rounds are undone in reverse order: v1 first, then v0.
        v1 = (v1 - ((((v0 << 4) ^ (v0 >> 5)) + v0) ^ (sum + k[(sum >> 11) & 3]))) & mask;
        sum = (sum - delta) & mask;
        v0 = (v0 - ((((v1 << 4) ^ (v1 >> 5)) + v1) ^ (sum + k[sum & 3]))) & mask;
    }

    w[0] = v0;
    w[1] = v1;
}

void __declspec(naked) encipher_asm(unsigned long *v,unsigned long *w,unsigned long *k)
{
    //------------------------------------------------------
    // Hand-written 32-round encipher; same computation as encipher().
    // The function is naked, so the prologue and epilogue are
    // written out by hand below.
    //
    // Arguments, relative to the frame pointer (assumes the
    // caller pushes them right-to-left on the stack -- confirm
    // against the compiler's calling convention):
    //   &v =  8(%ebp)    &w = 12(%ebp)    &k = 16(%ebp)
    //   (4(%ebp) is the return address, 0(%ebp) the saved ebp)
    // Locals, relative to the stack pointer inside the loop:
    //   v0    =  4(%esp)    v1  =  8(%esp)
    //   delta = 12(%esp)    sum = 16(%esp)
    // ecx counts the remaining rounds.
    //
    // The callee-saved registers are pushed BEFORE the locals are
    // reserved, so the local slots never overlap the saved copies
    // of ebx/esi/edi.
    //------------------------------------------------------
    _asm("pushl %ebp");                // save caller's frame pointer
    _asm("movl %esp,%ebp");            // set up our frame
    _asm("pushl %ebx");                // save callee-saved regs
    _asm("pushl %esi");
    _asm("pushl %edi");
    _asm("subl $20,%esp");             // room for the locals, below the saved regs
    _asm("movl 8(%ebp),%ebx");         // address of v -> ebx
    _asm("movl (%ebx),%edi");          // v[0] -> edi
    _asm("movl %edi,4(%esp)");         // v0 -> 4(%esp)
    _asm("movl 4(%ebx),%edi");         // v[1] -> edi
    _asm("movl %edi,8(%esp)");         // v1 -> 8(%esp)
    _asm("movl $0x9e3779b9,12(%esp)"); // delta -> 12(%esp)
    _asm("movl $32,%ecx");             // 32 rounds in ecx
    _asm("movl $0,16(%esp)");          // sum = 0 in 16(%esp)
_asm("stl1:");
    _asm("jecxz endl1");               // leave the loop when ecx is zero
    _asm("decl %ecx");
    //------------------------------------------------------
    // v0 += ((v1<<4 ^ v1>>5) + v1) ^ (sum + k[sum&3]);
    //------------------------------------------------------
    _asm("movl 8(%esp),%eax");         // v1 -> eax
    _asm("movl %eax,%ebx");            // v1 -> ebx
    _asm("shll $4,%eax");              // v1 << 4
    _asm("shrl $5,%ebx");              // v1 >> 5
    _asm("xorl %eax,%ebx");            // (v1<<4 ^ v1>>5)
    _asm("addl 8(%esp),%ebx");         // (v1<<4 ^ v1>>5) + v1
    _asm("movl 16(%esp),%edx");        // sum -> edx
    _asm("andl $3,%edx");              // sum&3
    _asm("movl 16(%ebp),%eax");        // address of k -> eax
    _asm("movl (%eax,%edx,4),%esi");   // k[sum&3] -> esi
    _asm("addl 16(%esp),%esi");        // (sum + k[sum&3])
    _asm("xorl %ebx,%esi");            // ((v1<<4 ^ v1>>5) + v1) ^ (sum + k[sum&3])
    _asm("addl %esi,4(%esp)");         // v0 += the value above
    //------------------------------------------------------
    // sum += delta;
    //------------------------------------------------------
    _asm("movl 12(%esp),%eax");        // delta -> eax
    _asm("addl %eax,16(%esp)");        // sum += delta
    //------------------------------------------------------
    // v1 += ((v0<<4 ^ v0>>5) + v0) ^ (sum + k[sum>>11 & 3]);
    //------------------------------------------------------
    _asm("movl 4(%esp),%eax");         // v0 -> eax
    _asm("movl %eax,%ebx");            // v0 -> ebx
    _asm("shll $4,%eax");              // v0 << 4
    _asm("shrl $5,%ebx");              // v0 >> 5
    _asm("xorl %eax,%ebx");            // (v0<<4 ^ v0>>5)
    _asm("addl 4(%esp),%ebx");         // (v0<<4 ^ v0>>5) + v0
    _asm("movl 16(%esp),%edx");        // sum -> edx
    _asm("shrl $11,%edx");             // sum >> 11
    _asm("andl $3,%edx");              // sum >> 11 & 3
    _asm("movl 16(%ebp),%eax");        // address of k -> eax
    _asm("movl (%eax,%edx,4),%esi");   // k[sum>>11 & 3] -> esi
    _asm("addl 16(%esp),%esi");        // (sum + k[sum>>11 & 3])
    _asm("xorl %ebx,%esi");            // ((v0<<4 ^ v0>>5) + v0) ^ (sum + k[sum>>11 & 3])
    _asm("addl %esi,8(%esp)");         // v1 += the value above
    _asm("jmp stl1");
_asm("endl1:");
    //------------------------------------------------------
    // w[0] = v0; w[1] = v1;
    //------------------------------------------------------
    _asm("movl 12(%ebp),%ebx");        // address of w[0] -> ebx
    _asm("movl 4(%esp),%esi");         // v0 -> esi
    _asm("movl %esi,(%ebx)");          // v0 -> w[0]
    _asm("addl $4,%ebx");              // address of w[1] -> ebx
    _asm("movl 8(%esp),%esi");         // v1 -> esi
    _asm("movl %esi,(%ebx)");          // v1 -> w[1]
    _asm("addl $20,%esp");             // release the locals
    _asm("popl %edi");                 // restore regs (reverse of the pushes)
    _asm("popl %esi");
    _asm("popl %ebx");
    _asm("popl %ebp");
    _asm("ret");
}

void __declspec(naked) decipher_asm(unsigned long *v,unsigned long *w,unsigned long *k)
{
    //------------------------------------------------------
    // Hand-written 32-round decipher; same computation as decipher().
    // The function is naked, so the prologue and epilogue are
    // written out by hand below.
    //
    // Arguments, relative to the frame pointer (assumes the
    // caller pushes them right-to-left on the stack -- confirm
    // against the compiler's calling convention):
    //   &v =  8(%ebp)    &w = 12(%ebp)    &k = 16(%ebp)
    //   (4(%ebp) is the return address, 0(%ebp) the saved ebp)
    // Locals, relative to the stack pointer inside the loop:
    //   v0    =  4(%esp)    v1  =  8(%esp)
    //   delta = 12(%esp)    sum = 16(%esp)
    // ecx counts the remaining rounds.
    //
    // The callee-saved registers are pushed BEFORE the locals are
    // reserved, so the local slots never overlap the saved copies
    // of ebx/esi/edi.
    //------------------------------------------------------
    _asm("pushl %ebp");                // save caller's frame pointer
    _asm("movl %esp,%ebp");            // set up our frame
    _asm("pushl %ebx");                // save callee-saved regs
    _asm("pushl %esi");
    _asm("pushl %edi");
    _asm("subl $20,%esp");             // room for the locals, below the saved regs
    _asm("movl 8(%ebp),%ebx");         // address of v -> ebx
    _asm("movl (%ebx),%edi");          // v[0] -> edi
    _asm("movl %edi,4(%esp)");         // v0 -> 4(%esp)
    _asm("movl 4(%ebx),%edi");         // v[1] -> edi
    _asm("movl %edi,8(%esp)");         // v1 -> 8(%esp)
    _asm("movl $0x9e3779b9,12(%esp)"); // delta -> 12(%esp)
    _asm("movl $32,%ecx");             // 32 rounds in ecx
    _asm("movl $0xC6EF3720,16(%esp)"); // sum = delta * 32 in 16(%esp)
_asm("stl2:");
    _asm("jecxz endl2");               // leave the loop when ecx is zero
    _asm("decl %ecx");
    //-------------------------------------------------------
    // v1 -= ((v0<<4 ^ v0>>5) + v0) ^ (sum + k[(sum>>11) & 3]);
    //-------------------------------------------------------
    _asm("movl 4(%esp),%eax");         // v0 -> eax
    _asm("movl %eax,%ebx");            // v0 -> ebx
    _asm("shll $4,%eax");              // v0 << 4
    _asm("shrl $5,%ebx");              // v0 >> 5
    _asm("xorl %eax,%ebx");            // (v0<<4 ^ v0>>5)
    _asm("addl 4(%esp),%ebx");         // (v0<<4 ^ v0>>5) + v0
    _asm("movl 16(%esp),%edx");        // sum -> edx
    _asm("shrl $11,%edx");             // sum >> 11
    _asm("andl $3,%edx");              // sum >> 11 & 3
    _asm("movl 16(%ebp),%eax");        // address of k -> eax
    _asm("movl (%eax,%edx,4),%esi");   // k[sum>>11 & 3] -> esi
    _asm("addl 16(%esp),%esi");        // (sum + k[sum>>11 & 3])
    _asm("xorl %ebx,%esi");            // ((v0<<4 ^ v0>>5) + v0) ^ (sum + k[sum>>11 & 3])
    _asm("subl %esi,8(%esp)");         // v1 -= the value above
    //------------------------------------------------------
    // sum -= delta;
    //------------------------------------------------------
    _asm("movl 12(%esp),%eax");        // delta -> eax
    _asm("subl %eax,16(%esp)");        // sum -= delta
    //------------------------------------------------------
    // v0 -= ((v1<<4 ^ v1>>5) + v1) ^ (sum + k[sum&3]);
    //------------------------------------------------------
    _asm("movl 8(%esp),%eax");         // v1 -> eax
    _asm("movl %eax,%ebx");            // v1 -> ebx
    _asm("shll $4,%eax");              // v1 << 4
    _asm("shrl $5,%ebx");              // v1 >> 5
    _asm("xorl %eax,%ebx");            // (v1<<4 ^ v1>>5)
    _asm("addl 8(%esp),%ebx");         // (v1<<4 ^ v1>>5) + v1
    _asm("movl 16(%esp),%edx");        // sum -> edx
    _asm("andl $3,%edx");              // sum&3
    _asm("movl 16(%ebp),%eax");        // address of k -> eax
    _asm("movl (%eax,%edx,4),%esi");   // k[sum&3] -> esi
    _asm("addl 16(%esp),%esi");        // (sum + k[sum&3])
    _asm("xorl %ebx,%esi");            // ((v1<<4 ^ v1>>5) + v1) ^ (sum + k[sum&3])
    _asm("subl %esi,4(%esp)");         // v0 -= the value above
    _asm("jmp stl2");
_asm("endl2:");
    //------------------------------------------------------
    // w[0] = v0; w[1] = v1;
    //------------------------------------------------------
    _asm("movl 12(%ebp),%ebx");        // address of w[0] -> ebx
    _asm("movl 4(%esp),%esi");         // v0 -> esi
    _asm("movl %esi,(%ebx)");          // v0 -> w[0]
    _asm("addl $4,%ebx");              // address of w[1] -> ebx
    _asm("movl 8(%esp),%esi");         // v1 -> esi
    _asm("movl %esi,(%ebx)");          // v1 -> w[1]
    _asm("addl $20,%esp");             // release the locals
    _asm("popl %edi");                 // restore regs (reverse of the pushes)
    _asm("popl %esi");
    _asm("popl %ebx");
    _asm("popl %ebp");
    _asm("ret");
}

//------------------------------------------------------
// Demo driver: encipher and decipher one fixed block with
// the C routines, then with the assembler routines, and
// print each result in hex so the two can be compared.
//------------------------------------------------------
int main(void)
{
    // The arrays are unsigned long because every cipher routine takes
    // unsigned long *; passing unsigned int arrays is a pointer type
    // mismatch even where the two types have the same size.
    unsigned long data[2] = {0x12345678UL, 0x12345678UL};
    unsigned long key[4] = {0x0fea5734UL, 0xa4f4e678UL, 0x19d673abUL, 0x64c834b3UL};
    unsigned long ret[2];

    encipher( data, ret, key); // encipher data, returned in ret
    printf(" c encipher %lx %lx\n\n", ret[0], ret[1]);

    decipher( ret, data, key); // decipher ret, returned in data
    printf(" c decipher %lx %lx\n\n", data[0], data[1]);

    encipher_asm( data, ret, key); // encipher_asm data, returned in ret
    printf("asm encipher %lx %lx\n\n", ret[0], ret[1]);

    decipher_asm( ret, data, key); // decipher_asm ret, returned in data
    printf("asm decipher %lx %lx\n", data[0], data[1]);

    return 0;
}

Back to main page