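Although the text below says that one can use a different number of iterations, I have found that it only works with 32 iterations (the decipher routines start from the hard-coded sum 0xC6EF3720, which is delta * 32, so that constant would have to change along with the iteration count).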
The assembly listing supplied was for 16 bit, so I made it 32 bit.
There's no doubt the code could be made faster if one spent time on it.
It may not be obvious what one of the instructions in this code does. It is an extended MOV instruction that takes a base address, an index and a scale value. A very useful instruction, especially when accessing an array.
_asm("movl (%eax,%edx,4),%esi");
base,index,scale, dest
The effective address copied from is ( base + ( index * scale ) ). For an array of longs one would use a scale value of 4, for an array of shorts one would use a scale value of 2, and so on.
John
You can visit the site here http://vader.brad.ac.uk/tea/tea.shtml
#include <stdio.h>
/************************************************
The Tiny Encryption Algorithm (TEA) by David Wheeler and
Roger Needham of the Cambridge Computer Laboratory.
Placed in the Public Domain by David Wheeler and Roger Needham.
**** ANSI C VERSION (New Variant) ****
Notes:
TEA is a Feistel cipher with XOR and addition as the
non-linear mixing functions.
Takes 64 bits of data in v[0] and v[1]. Returns 64 bits of
data in w[0] and w[1]. Takes 128 bits of key in k[0] - k[3].
TEA can be operated in any of the modes of DES. Cipher Block
Chaining is, for example, simple to implement.
n is the number of iterations. 32 is ample, 16 is sufficient,
as few as eight may be OK. The algorithm achieves good
dispersion after six iterations. The iteration count can be
made variable if required.
Note this is optimised for 32-bit CPUs with fast shift
capabilities. It can very easily be ported to assembly
language on most CPUs.
delta is chosen to be the integer part of (the golden ratio
Sqrt(5/4) - 1/2 ~ 0.618034 multiplied by 2^32), i.e. 0x9E3779B9.
This version has been amended to foil two weaknesses
identified by David A. Wagner (daw@cs.berkeley.edu):
1) effective key length of old-variant TEA was 126 not 128 bits
2) a related key attack was possible although impractical.
************************************************/
/*
 * Encrypt one 64-bit block with the new-variant TEA (XTEA), 32 rounds.
 *
 *   v  input block: two 32-bit words in v[0] and v[1]
 *   w  output block: receives the two 32-bit cipher words (may alias v)
 *   k  128-bit key as four 32-bit words k[0]..k[3]
 *
 * Only the low 32 bits of every input word are used. All intermediate
 * values are reduced modulo 2^32 so the result is the same whether
 * unsigned long is 32 or 64 bits wide.
 */
void encipher(unsigned long * v,unsigned long * w, unsigned long * k)
{
    const unsigned long mask = 0xFFFFFFFFUL;   /* keep arithmetic modulo 2^32 */
    const unsigned long delta = 0x9E3779B9UL;
    unsigned long v0 = v[0] & mask;
    unsigned long v1 = v[1] & mask;
    unsigned long sum = 0;
    unsigned int round;

    for (round = 0; round < 32; round++) {
        /* v1 is already masked, so v1 >> 5 never drags in bits above bit 31 */
        v0 = (v0 + ((((v1 << 4) ^ (v1 >> 5)) + v1)
                    ^ (sum + k[sum & 3]))) & mask;
        sum = (sum + delta) & mask;
        v1 = (v1 + ((((v0 << 4) ^ (v0 >> 5)) + v0)
                    ^ (sum + k[(sum >> 11) & 3]))) & mask;
    }

    w[0] = v0;
    w[1] = v1;
}
/*
 * Decrypt one 64-bit block produced by the 32-round new-variant TEA (XTEA).
 *
 *   v  input block: two 32-bit cipher words in v[0] and v[1]
 *   w  output block: receives the two 32-bit plain words (may alias v)
 *   k  128-bit key as four 32-bit words k[0]..k[3]
 *
 * Only the low 32 bits of every input word are used. All intermediate
 * values are reduced modulo 2^32 so the result is the same whether
 * unsigned long is 32 or 64 bits wide.
 */
void decipher(unsigned long * v,unsigned long * w, unsigned long * k)
{
    const unsigned long mask = 0xFFFFFFFFUL;   /* keep arithmetic modulo 2^32 */
    const unsigned long delta = 0x9E3779B9UL;
    unsigned long v0 = v[0] & mask;
    unsigned long v1 = v[1] & mask;
    /* Starting sum is (delta * 32) mod 2^32; in general delta * rounds. */
    unsigned long sum = 0xC6EF3720UL;
    unsigned int round;

    for (round = 0; round < 32; round++) {
        /* Undo the encipher steps in reverse order. */
        v1 = (v1 - ((((v0 << 4) ^ (v0 >> 5)) + v0)
                    ^ (sum + k[(sum >> 11) & 3]))) & mask;
        sum = (sum - delta) & mask;
        v0 = (v0 - ((((v1 << 4) ^ (v1 >> 5)) + v1)
                    ^ (sum + k[sum & 3]))) & mask;
    }

    w[0] = v0;
    w[1] = v1;
}
void __declspec(naked) encipher_asm(unsigned long *v,unsigned long *w,unsigned long *k)
{
    //------------------------------------------------------
    // Hand-written 32-bit x86 version of encipher(): 32 rounds of
    // new-variant TEA on v[0],v[1] with key k[0..3], result in w[0],w[1].
    // The function is naked, so prologue and epilogue are written here.
    //
    // Arguments: &v = 8(%ebp)   &w = 12(%ebp)   &k = 16(%ebp)
    // Locals:    v0 = 4(%esp)   v1 = 8(%esp)
    //            delta = 12(%esp)   sum = 16(%esp)
    //
    // NOTE(review): assumes the three pointers are passed on the stack
    // and removed by the caller (the routine ends in a plain ret).
    //
    // ebx/esi/edi are pushed BEFORE the local area is reserved so that
    // the %esp-relative locals cannot overlap the saved registers.
    //------------------------------------------------------
    _asm("pushl %ebp");                 // save caller's frame pointer
    _asm("movl %esp,%ebp");             // ebp -> saved ebp, return addr at 4(%ebp)
    _asm("pushl %ebx");                 // save regs
    _asm("pushl %esi");
    _asm("pushl %edi");
    _asm("subl $20,%esp");              // make room for the locals
    _asm("movl 8(%ebp),%ebx");          // address of v -> ebx
    _asm("movl (%ebx),%edi");           // v[0] -> edi
    _asm("movl %edi,4(%esp)");          // v0 -> 4(%esp)
    _asm("movl 4(%ebx),%edi");          // v[1] -> edi
    _asm("movl %edi,8(%esp)");          // v1 -> 8(%esp)
    _asm("movl $0x9e3779b9,12(%esp)");  // delta -> 12(%esp)
    _asm("movl $32,%ecx");              // 32 loops in ecx
    _asm("movl $0,16(%esp)");           // sum = 0 in 16(%esp)
    _asm("stl1:");
    _asm("jecxz endl1");                // is ecx zero?
    _asm("decl %ecx");
    //------------------------------------------------------
    // v0 += ((v1<<4 ^ v1>>5) + v1) ^ (sum + k[sum&3]);
    //------------------------------------------------------
    _asm("movl 8(%esp),%eax");          // v1 -> eax
    _asm("movl %eax,%ebx");             // v1 -> ebx
    _asm("shll $4,%eax");               // v1 << 4
    _asm("shrl $5,%ebx");               // v1 >> 5
    _asm("xorl %eax,%ebx");             // (v1<<4 ^ v1>>5)
    _asm("addl 8(%esp),%ebx");          // (v1<<4 ^ v1>>5) + v1
    _asm("movl 16(%esp),%edx");         // sum -> edx
    _asm("andl $3,%edx");               // sum&3
    _asm("movl 16(%ebp),%eax");         // address of k -> eax
    _asm("movl (%eax,%edx,4),%esi");    // k[sum&3] -> esi
    _asm("addl 16(%esp),%esi");         // (sum + k[sum&3])
    _asm("xorl %ebx,%esi");             // ((v1<<4 ^ v1>>5) + v1) ^ (sum + k[sum&3])
    _asm("addl %esi,4(%esp)");          // v0 += ...
    //------------------------------------------------------
    // sum += delta;
    //------------------------------------------------------
    _asm("movl 12(%esp),%eax");         // delta -> eax
    _asm("addl %eax,16(%esp)");         // sum += delta
    //------------------------------------------------------
    // v1 += ((v0<<4 ^ v0>>5) + v0) ^ (sum + k[sum>>11 & 3]);
    //------------------------------------------------------
    _asm("movl 4(%esp),%eax");          // v0 -> eax
    _asm("movl %eax,%ebx");             // v0 -> ebx
    _asm("shll $4,%eax");               // v0 << 4
    _asm("shrl $5,%ebx");               // v0 >> 5
    _asm("xorl %eax,%ebx");             // (v0<<4 ^ v0>>5)
    _asm("addl 4(%esp),%ebx");          // (v0<<4 ^ v0>>5) + v0
    _asm("movl 16(%esp),%edx");         // sum -> edx
    _asm("shrl $11,%edx");              // sum >> 11
    _asm("andl $3,%edx");               // sum>>11 & 3
    _asm("movl 16(%ebp),%eax");         // address of k -> eax
    _asm("movl (%eax,%edx,4),%esi");    // k[sum>>11 & 3] -> esi
    _asm("addl 16(%esp),%esi");         // (sum + k[sum>>11 & 3])
    _asm("xorl %ebx,%esi");             // ((v0<<4 ^ v0>>5) + v0) ^ (sum + k[sum>>11 & 3])
    _asm("addl %esi,8(%esp)");          // v1 += ...
    _asm("jmp stl1");
    _asm("endl1:");
    //------------------------------------------------------
    // w[0] = v0; w[1] = v1;
    //------------------------------------------------------
    _asm("movl 12(%ebp),%ebx");         // address of w[0] -> ebx
    _asm("movl 4(%esp),%esi");          // v0 -> esi
    _asm("movl %esi,(%ebx)");           // v0 -> w[0]
    _asm("addl $4,%ebx");               // address of w[1] -> ebx
    _asm("movl 8(%esp),%esi");          // v1 -> esi
    _asm("movl %esi,(%ebx)");           // v1 -> w[1]
    _asm("addl $20,%esp");              // release the locals
    _asm("popl %edi");                  // restore regs
    _asm("popl %esi");
    _asm("popl %ebx");
    _asm("popl %ebp");
    _asm("ret");
}
void __declspec(naked) decipher_asm(unsigned long *v,unsigned long *w,unsigned long *k)
{
    //------------------------------------------------------
    // Hand-written 32-bit x86 version of decipher(): undoes 32 rounds of
    // new-variant TEA on v[0],v[1] with key k[0..3], result in w[0],w[1].
    // The function is naked, so prologue and epilogue are written here.
    //
    // Arguments: &v = 8(%ebp)   &w = 12(%ebp)   &k = 16(%ebp)
    // Locals:    v0 = 4(%esp)   v1 = 8(%esp)
    //            delta = 12(%esp)   sum = 16(%esp)
    //
    // NOTE(review): assumes the three pointers are passed on the stack
    // and removed by the caller (the routine ends in a plain ret).
    //
    // ebx/esi/edi are pushed BEFORE the local area is reserved so that
    // the %esp-relative locals cannot overlap the saved registers.
    //------------------------------------------------------
    _asm("pushl %ebp");                 // save caller's frame pointer
    _asm("movl %esp,%ebp");             // ebp -> saved ebp, return addr at 4(%ebp)
    _asm("pushl %ebx");                 // save regs
    _asm("pushl %esi");
    _asm("pushl %edi");
    _asm("subl $20,%esp");              // make room for the locals
    _asm("movl 8(%ebp),%ebx");          // address of v -> ebx
    _asm("movl (%ebx),%edi");           // v[0] -> edi
    _asm("movl %edi,4(%esp)");          // v0 -> 4(%esp)
    _asm("movl 4(%ebx),%edi");          // v[1] -> edi
    _asm("movl %edi,8(%esp)");          // v1 -> 8(%esp)
    _asm("movl $0x9e3779b9,12(%esp)");  // delta -> 12(%esp)
    _asm("movl $32,%ecx");              // 32 loops in ecx
    _asm("movl $0xC6EF3720,16(%esp)");  // sum = delta*32 (mod 2^32) in 16(%esp)
    _asm("stl2:");
    _asm("jecxz endl2");                // is ecx zero?
    _asm("decl %ecx");
    //-------------------------------------------------------
    // v1 -= ((v0<<4 ^ v0>>5) + v0) ^ (sum + k[(sum>>11) & 3]);
    //-------------------------------------------------------
    _asm("movl 4(%esp),%eax");          // v0 -> eax
    _asm("movl %eax,%ebx");             // v0 -> ebx
    _asm("shll $4,%eax");               // v0 << 4
    _asm("shrl $5,%ebx");               // v0 >> 5
    _asm("xorl %eax,%ebx");             // (v0<<4 ^ v0>>5)
    _asm("addl 4(%esp),%ebx");          // (v0<<4 ^ v0>>5) + v0
    _asm("movl 16(%esp),%edx");         // sum -> edx
    _asm("shrl $11,%edx");              // sum >> 11
    _asm("andl $3,%edx");               // sum>>11 & 3
    _asm("movl 16(%ebp),%eax");         // address of k -> eax
    _asm("movl (%eax,%edx,4),%esi");    // k[sum>>11 & 3] -> esi
    _asm("addl 16(%esp),%esi");         // (sum + k[sum>>11 & 3])
    _asm("xorl %ebx,%esi");             // ((v0<<4 ^ v0>>5) + v0) ^ (sum + k[sum>>11 & 3])
    _asm("subl %esi,8(%esp)");          // v1 -= ...
    //------------------------------------------------------
    // sum -= delta;
    //------------------------------------------------------
    _asm("movl 12(%esp),%eax");         // delta -> eax
    _asm("subl %eax,16(%esp)");         // sum -= delta
    //------------------------------------------------------
    // v0 -= ((v1<<4 ^ v1>>5) + v1) ^ (sum + k[sum&3]);
    //------------------------------------------------------
    _asm("movl 8(%esp),%eax");          // v1 -> eax
    _asm("movl %eax,%ebx");             // v1 -> ebx
    _asm("shll $4,%eax");               // v1 << 4
    _asm("shrl $5,%ebx");               // v1 >> 5
    _asm("xorl %eax,%ebx");             // (v1<<4 ^ v1>>5)
    _asm("addl 8(%esp),%ebx");          // (v1<<4 ^ v1>>5) + v1
    _asm("movl 16(%esp),%edx");         // sum -> edx
    _asm("andl $3,%edx");               // sum&3
    _asm("movl 16(%ebp),%eax");         // address of k -> eax
    _asm("movl (%eax,%edx,4),%esi");    // k[sum&3] -> esi
    _asm("addl 16(%esp),%esi");         // (sum + k[sum&3])
    _asm("xorl %ebx,%esi");             // ((v1<<4 ^ v1>>5) + v1) ^ (sum + k[sum&3])
    _asm("subl %esi,4(%esp)");          // v0 -= ...
    _asm("jmp stl2");
    _asm("endl2:");
    //------------------------------------------------------
    // w[0] = v0; w[1] = v1;
    //------------------------------------------------------
    _asm("movl 12(%ebp),%ebx");         // address of w[0] -> ebx
    _asm("movl 4(%esp),%esi");          // v0 -> esi
    _asm("movl %esi,(%ebx)");           // v0 -> w[0]
    _asm("addl $4,%ebx");               // address of w[1] -> ebx
    _asm("movl 8(%esp),%esi");          // v1 -> esi
    _asm("movl %esi,(%ebx)");           // v1 -> w[1]
    _asm("addl $20,%esp");              // release the locals
    _asm("popl %edi");                  // restore regs
    _asm("popl %esi");
    _asm("popl %ebx");
    _asm("popl %ebp");
    _asm("ret");
}
/*
 * Demo driver: enciphers a fixed block with the C routine, deciphers it
 * again, then repeats the round trip with the assembly routines, printing
 * both words in hex after each step.
 *
 * The buffers are unsigned long because that is the element type all four
 * cipher routines take; passing unsigned int arrays would be an
 * incompatible pointer type.
 */
int main(void)
{
    unsigned long data[2] = {0x12345678UL, 0x12345678UL};
    unsigned long key[4] = {0x0fea5734UL, 0xa4f4e678UL, 0x19d673abUL, 0x64c834b3UL};
    unsigned long ret[2] = {0, 0};

    encipher(data, ret, key);       /* encipher data, returned in ret */
    printf(" c encipher %lx %lx\n\n", ret[0], ret[1]);

    decipher(ret, data, key);       /* decipher ret, returned in data */
    printf(" c decipher %lx %lx\n\n", data[0], data[1]);

    encipher_asm(data, ret, key);   /* encipher_asm data, returned in ret */
    printf("asm encipher %lx %lx\n\n", ret[0], ret[1]);

    decipher_asm(ret, data, key);   /* decipher_asm ret, returned in data */
    printf("asm decipher %lx %lx\n", data[0], data[1]);

    return 0;
}