; www.pudn.com > aes-src-22-11-16.zip > aes_x86_v2.asm


 
; --------------------------------------------------------------------------- 
; Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved. 
; 
; LICENSE TERMS 
; 
; The free distribution and use of this software in both source and binary 
; form is allowed (with or without changes) provided that: 
; 
;   1. distributions of this source code include the above copyright 
;      notice, this list of conditions and the following disclaimer; 
; 
;   2. distributions in binary form include the above copyright 
;      notice, this list of conditions and the following disclaimer 
;      in the documentation and/or other associated materials; 
; 
;   3. the copyright holder's name is not used to endorse products 
;      built using this software without specific written permission. 
; 
; ALTERNATIVELY, provided that this notice is retained in full, this product 
; may be distributed under the terms of the GNU General Public License (GPL), 
; in which case the provisions of the GPL apply INSTEAD OF those given above. 
; 
; DISCLAIMER 
; 
; This software is provided 'as is' with no explicit or implied warranties 
; in respect of its properties, including, but not limited to, correctness 
; and/or fitness for purpose. 
; --------------------------------------------------------------------------- 
; Issue 09/09/2006 
 
; This code requires either ASM_X86_V2 or ASM_X86_V2C to be set in aesopt.h
; and the same define to be set here as well. If ASM_X86_V2C is set this file
; requires the C files aeskey.c and aestab.c for support.
 
; An AES implementation for x86 processors using the YASM (or NASM) assembler. 
; This is a full assembler implementation covering encryption, decryption and 
; key scheduling. It uses 2k bytes of tables but its encryption and decryption 
; performance is very close to that obtained using large tables.  Key schedule 
; expansion is slower for both encryption and decryption but this is likely to 
; be offset by the much smaller load that this version places on the processor 
; cache. I acknowledge the contribution made by Daniel Bernstein to aspects of 
; the design of the AES round function used here. 
; 
; This code provides the standard AES block size (128 bits, 16 bytes) and the 
; three standard AES key sizes (128, 192 and 256 bits). It has the same call 
; interface as my C implementation. The ebx, esi, edi and ebp registers are
; preserved across calls but eax, ecx and edx and the arithmetic status flags
; are not.  Although this is a full assembler implementation, it can be used
; in conjunction with my C code which provides faster key scheduling using 
; large tables. In this case aeskey.c should be compiled with ASM_X86_V2C 
; defined.  It is also important that the defines below match those used in the 
; C code.  This code uses the VC++ register saving conventions; if it is used
; with another compiler, conventions for using and saving registers may need 
; to be checked (and calling conventions).  The YASM command line for the VC++ 
; custom build step is: 
; 
;    yasm -Xvc -f win32 -D <Z> -o "$(TargetDir)\$(InputName).obj" "$(InputPath)"
;
; where <Z> is ASM_X86_V2 or ASM_X86_V2C.  The calling interfaces are:
; 
;     AES_RETURN aes_encrypt(const unsigned char in_blk[], 
;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]); 
; 
;     AES_RETURN aes_decrypt(const unsigned char in_blk[], 
;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]); 
; 
;     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
;                                            const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
;                                            const aes_decrypt_ctx cx[1]);
;
;     AES_RETURN aes_encrypt_key(const unsigned char key[],
;                           unsigned int len, const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt_key(const unsigned char key[],
;                           unsigned int len, const aes_decrypt_ctx cx[1]);
;
; where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
; either bits or bytes.
; 
; Comment in/out the following lines to obtain the desired subroutines. These 
; selections MUST match those in the C header file aes.h 
 
; Key sizes for which fixed-size key schedule routines are assembled.  Defining
; AES_VAR also switches on all three sizes (see the block after "End of user
; defines") and assembles the variable-length aes_encrypt_key/aes_decrypt_key
; dispatchers.
%define AES_128                 ; define if AES with 128 bit keys is needed
%define AES_192                 ; define if AES with 192 bit keys is needed
%define AES_256                 ; define if AES with 256 bit keys is needed
%define AES_VAR                 ; define if a variable key size is needed
%define ENCRYPTION              ; define if encryption is needed
%define DECRYPTION              ; define if decryption is needed
%define AES_REV_DKS             ; define if key decryption schedule is reversed

; The assembler key schedule routines are only built when ASM_X86_V2C has NOT
; been defined (for example with -D on the assembler command line).
%ifndef ASM_X86_V2C
%define ENCRYPTION_KEY_SCHEDULE ; define if encryption key expansion is needed
%define DECRYPTION_KEY_SCHEDULE ; define if decryption key expansion is needed
%endif
 
; The encryption key schedule has the following in memory layout where N is the 
; number of rounds (10, 12 or 14): 
; 
; lo: | input key (round 0)  |  ; each round is four 32-bit words 
;     | encryption round 1   | 
;     | encryption round 2   | 
;     .... 
;     | encryption round N-1 | 
; hi: | encryption round N   | 
; 
; The decryption key schedule is normally set up so that it has the same 
; layout as above by actually reversing the order of the encryption key 
; schedule in memory (this happens when AES_REV_DKS is set): 
; 
; lo: | decryption round 0   | =              | encryption round N   | 
;     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ] 
;     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ] 
;     ....                       .... 
;     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ] 
; hi: | decryption round N   | =              | input key (round 0)  | 
; 
; with rounds except the first and last modified using inv_mix_column() 
; But if AES_REV_DKS is NOT set the order of keys is left as it is for 
; encryption so that it has to be accessed in reverse when used for 
; decryption (although the inverse mix column modifications are done) 
; 
; lo: | decryption round 0   | =              | input key (round 0)  | 
;     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ] 
;     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ] 
;     ....                       .... 
;     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] 
; hi: | decryption round N   | =              | encryption round N   | 
; 
; This layout is faster when the assembler key scheduling provided here 
; is used. 
; 
; The DLL interface must use the _stdcall convention in which the number
; of bytes of parameter space is added after an @ to the subroutine's name.
; We must also remove our parameters from the stack before return (see
; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.
 
;%define DLL_EXPORT 
 
; End of user defines 
 
; A variable key size build needs every fixed key size, so AES_VAR switches on
; whichever of AES_128, AES_192 and AES_256 has not been selected already.

%ifdef AES_VAR
  %ifndef AES_128
    %define AES_128
  %endif
  %ifndef AES_192
    %define AES_192
  %endif
  %ifndef AES_256
    %define AES_256
  %endif
%endif

; KS_LENGTH is the size of the key schedule in 32-bit words, sized for the
; largest key that has been enabled: 4 * (rounds + 1) = 60, 52 or 44.  AES_VAR
; needs no test of its own here because it has just forced AES_256 on.

%ifdef AES_256
  %define KS_LENGTH     60
%elifdef AES_192
  %define KS_LENGTH     52
%else
  %define KS_LENGTH     44
%endif
 
; These macros implement stack based local variables 
 
; save slot, reg : store reg in the 32-bit stack local numbered 'slot'
%macro  save 2
    mov     [esp + %1*4], %2
%endmacro
 
; restore reg, slot : reload reg from the 32-bit stack local numbered 'slot'
%macro  restore 2
    mov     %1, [esp + %2*4]
%endmacro
 
; the DLL has to implement the _stdcall calling interface on return 
; In this case we have to take our parameters (3 4-byte pointers) 
; off the stack 
 
; default size in bytes of the parameter area (three 4-byte parameters)
%define parms 12

; do_name name [,bytes]
;
; Open a 32-byte aligned global entry point.  In a DLL_EXPORT build the label
; is decorated as name@bytes and exported as well; otherwise the plain name is
; used.
%macro  do_name 1-2 parms
    align 32
%ifdef DLL_EXPORT
    global  %1@%2
    export  %1@%2
%1@%2:
%else
    global  %1
%1:
%endif
%endmacro
 
; do_call name [,bytes]
;
; Call a routine that was declared with do_name.  In a DLL_EXPORT build the
; decorated name is called and nothing more is done here (do_exit emits
; 'ret bytes' in that build); otherwise 'bytes' of pushed parameters are
; dropped after the call returns.
%macro  do_call 1-2 parms
%ifdef DLL_EXPORT
    call    %1@%2
%else
    call    %1
    add     esp,%2
%endif
%endmacro
 
; do_exit [bytes]
;
; Return to the caller: a plain 'ret' normally, or 'ret bytes' in a DLL_EXPORT
; build so that the parameters are removed from the stack on return.
%macro  do_exit  0-1 parms
%ifndef DLL_EXPORT
    ret
%else
    ret %1
%endif
%endmacro
 
; finite field multiplies by {02}, {04} and {08}
;
; These are assembly-time expressions used only to generate the tables.  Each
; one shifts x left and then, for every bit that was shifted out past bit 7,
; xors in the correspondingly shifted value 0x11b (the constant these macros
; use as the field polynomial), which clears that overflow bit again.
; The argument is not parenthesised inside the macros; every use in this file
; passes a simple byte constant.

%define f2(x)   ((x<<1)^(((x>>7)&1)*0x11b))
%define f4(x)   ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
%define f8(x)   ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

; finite field multiplies required in table generation
; (built by xoring the power-of-two multiples: {03}={02}^{01}, {09}={08}^{01},
;  {0b}={08}^{02}^{01}, {0d}={08}^{04}^{01}, {0e}={08}^{04}^{02})

%define f3(x)   (f2(x) ^ x)
%define f9(x)   (f8(x) ^ x)
%define fb(x)   (f8(x) ^ f2(x) ^ x)
%define fd(x)   (f8(x) ^ f4(x) ^ x)
%define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
 
; Accessors for the encryption table enc_tab, where x is a register holding a
; zero-extended byte index.  Each table entry is the 8 bytes
;
;       0, s, s, 3s, 2s, s, s, 3s           (s = the S-box value)
;
; so a dword read at byte offsets 4, 3, 2 and 1 within the entry gives, from
; low byte to high byte:
;
;   etab_0 : 2s,  s,  s, 3s
;   etab_1 : 3s, 2s,  s,  s
;   etab_2 :  s, 3s, 2s,  s
;   etab_3 :  s,  s, 3s, 2s

%define etab_0(x)   [enc_tab+4+8*x]
%define etab_1(x)   [enc_tab+3+8*x]
%define etab_2(x)   [enc_tab+2+8*x]
%define etab_3(x)   [enc_tab+1+8*x]
%define etab_b(x)   byte [enc_tab+1+8*x] ; used with movzx for 0x000000xx
%define etab_w(x)   word [enc_tab+8*x]   ; used with movzx for 0x0000xx00

; btab_n reads a dword in which byte n is the plain S-box value (entry byte 6);
; the other bytes must be masked off by the user.  These are only referenced
; by the alternative last round macros that are disabled with '%if 1'.
; NOTE(review): btab_0 reads up to offset 6+3 of an 8 byte entry, so for the
; last table entry it would read 2 bytes beyond enc_tab - confirm before
; enabling the alternative macros.

%define btab_0(x)   [enc_tab+6+8*x]
%define btab_1(x)   [enc_tab+5+8*x]
%define btab_2(x)   [enc_tab+4+8*x]
%define btab_3(x)   [enc_tab+3+8*x]
 
; ROUND FUNCTION.  Build column[2] on ESI and column[3] on EDI that have the
; round keys pre-loaded. Build column[0] in EBP and column[1] in EBX.
;
; Macro parameters:
;
;   first   name of the 'xor table value into register' macro
;           (nr_xor for normal rounds, lr_xor for the last round)
;   second  name of the 'move table value into register' macro
;           (nr_mov for normal rounds, lr_mov for the last round)
;
;   both are invoked as:  op  dest, source byte register, table/byte
;   position (0..3), scratch register
;
; Input:
;
;   EAX     column[0]
;   EBX     column[1]
;   ECX     column[2]
;   EDX     column[3]
;   ESI     column key[round][2]
;   EDI     column key[round][3]
;   EBP     scratch
;
; Output:
;
;   EBP     column[0]   unkeyed
;   EBX     column[1]   unkeyed
;   ESI     column[2]   keyed
;   EDI     column[3]   keyed
;   EAX     scratch
;   ECX     scratch
;   EDX     scratch
;
; With input column c and byte b written c.b (b = 0 is the low byte), the
; sixteen table operations below compute:
;
;   out[0] = T0[0.0] ^ T1[1.1] ^ T2[2.2] ^ T3[3.3]
;   out[1] = T0[1.0] ^ T1[2.1] ^ T2[3.2] ^ T3[0.3]
;   out[2] = T0[2.0] ^ T1[3.1] ^ T2[0.2] ^ T3[1.3]    (xored onto ESI)
;   out[3] = T0[3.0] ^ T1[0.1] ^ T2[1.2] ^ T3[2.3]    (xored onto EDI)
;
; Only the low two bytes of EAX, EBX, ECX and EDX can be addressed as byte
; registers, so the rotates, shifts and the and/or merge move the high bytes
; down once the low bytes have been used.  The order of the lines matters:
; each register is only reused as a destination or as scratch after all of the
; input bytes it held have been consumed.

%macro rnd_fun 2

    rol     ebx,16                  ; bl,bh = column[1] bytes 2,3
    %1      esi, cl, 0, ebp
    %1      esi, dh, 1, ebp
    %1      esi, bh, 3, ebp
    %1      edi, dl, 0, ebp
    %1      edi, ah, 1, ebp
    %1      edi, bl, 2, ebp
    %2      ebp, al, 0, ebp         ; start output column[0] in ebp
    shr     ebx,16                  ; ebx = low half of column[1]
    and     eax,0xffff0000
    or      eax,ebx                 ; eax = column[0] high : column[1] low
    shr     edx,16                  ; dl,dh = column[3] bytes 2,3
    %1      ebp, ah, 1, ebx
    %1      ebp, dh, 3, ebx
    %2      ebx, dl, 2, ebx         ; start output column[1] in ebx
    %1      ebx, ch, 1, edx
    %1      ebx, al, 0, edx
    shr     eax,16                  ; al,ah = column[0] bytes 2,3
    shr     ecx,16                  ; cl,ch = column[2] bytes 2,3
    %1      ebp, cl, 2, edx
    %1      edi, ch, 3, edx
    %1      esi, al, 2, edx
    %1      ebx, ah, 3, edx

%endmacro
 
; Basic MOV and XOR Operations for normal rounds
;
; Parameters (both macros):
;   1 : destination register
;   2 : byte register holding the table index
;   3 : table number 0..3 (selects etab_0 .. etab_3)
;   4 : scratch register that receives the zero-extended index; it may be the
;       same register as the destination in nr_mov
;
; nr_xor xors the selected 32-bit table value into the destination and
; nr_mov loads it into the destination.

%macro  nr_xor  4
    movzx   %4,%2
    xor     %1,etab_%3(%4)
%endmacro

%macro  nr_mov  4
    movzx   %4,%2
    mov     %1,etab_%3(%4)
%endmacro
 
; Basic MOV and XOR Operations for last round
;
; These take the same four parameters as nr_xor/nr_mov but use only the plain
; S-box byte: it is looked up, moved to byte position 'parameter 3' (0..3) of
; a 32-bit word and then xored into (lr_xor) or loaded into (lr_mov) the
; destination register.
;
; The first pair below is the one that is assembled ('%if 1'): a byte load
; with movzx followed by a shift.  The second pair is an alternative that
; loads a dword through btab_n and masks out the wanted byte.

%if 1

    %macro  lr_xor  4
        movzx   %4,%2
        movzx   %4,etab_b(%4)
    %if %3 != 0
        shl     %4,8*%3
    %endif
        xor     %1,%4
    %endmacro

    %macro  lr_mov  4
        movzx   %4,%2
        movzx   %1,etab_b(%4)
    %if %3 != 0
        shl     %1,8*%3
    %endif
    %endmacro

%else       ; less effective but worth leaving as an option

    %macro  lr_xor  4
        movzx   %4,%2
        mov     %4,btab_%3(%4)
        and     %4,0x000000ff << 8 * %3
        xor     %1,%4
    %endmacro

    %macro  lr_mov  4
        movzx   %4,%2
        mov     %1,btab_%3(%4)
        and     %1,0x000000ff << 8 * %3
    %endmacro

%endif
 
; Apply S-Box to the 4 bytes in a 32-bit word and rotate left 3 byte positions
;
;   %1 : output is xored into this register
;   %2 : input: a => eax; b => ebx; c => ecx; d => edx
;   %3 : scratch register
;
; Byte mapping (input byte -> output byte position): 1 -> 0, 0 -> 3, 3 -> 2,
; 2 -> 1.  The input register is destroyed: it is shifted right by 16 so that
; its two high bytes can be reached through the byte registers.

%macro l3s_col 3
    lr_xor  %1,%2h,0,%3
    lr_xor  %1,%2l,3,%3
    shr     e%2x,16
    lr_xor  %1,%2h,2,%3
    lr_xor  %1,%2l,1,%3
%endmacro
 
; offsets to parameters
;
; in_blk, out_blk and ctx are offsets from esp on entry to aes_encrypt and
; aes_decrypt (offset 0 holds the return address).  Both routines then lower
; esp by stk_spc bytes, so inside them a parameter is at
; [esp + offset + stk_spc].  Of the stk_spc bytes, [esp] is local slot 0 of
; the save/restore macros and [esp+4] .. [esp+16] hold the saved edi, esi,
; ebx and ebp.

in_blk  equ     4   ; input byte array address parameter
out_blk equ     8   ; output byte array address parameter
ctx     equ    12   ; AES context structure
stk_spc equ    20   ; stack space
 
%ifdef  ENCRYPTION 
 
%define ENCRYPTION_TABLE 
 
; One normal encryption round.
;
; Entry:  eax, ebx, ecx, edx = state columns 0..3
;         ebp = address of the round key used by the previous step
; Exit:   eax, ebx, ecx, edx = new state columns 0..3 with the round key added
;         ebp = address of this round's key (16 bytes further on)
;         esi, edi are overwritten; local stack slot 0 holds a copy of ebp
;
; rnd_fun needs ebp as a working register, so the key pointer is parked in
; stack slot 0 across it.  Key words 2 and 3 are loaded into esi/edi before
; rnd_fun (which xors onto them); key words 0 and 1 are xored in afterwards.

%macro enc_round 0

    add     ebp,16
    save    0,ebp
    mov     esi,[ebp+8]
    mov     edi,[ebp+12]

    rnd_fun nr_xor, nr_mov

    mov     eax,ebp
    mov     ecx,esi
    mov     edx,edi
    restore ebp,0
    xor     eax,[ebp]
    xor     ebx,[ebp+4]

%endmacro
 
; The final encryption round, built from the last round table operations
; lr_xor and lr_mov.
;
; Entry:  eax, ebx, ecx, edx = state columns 0..3
;         ebp = address of the round key used by the previous round
; Exit:   eax, ebx, esi, edi = output columns 0..3 with the round key added
;         (columns 2 and 3 are left in esi and edi rather than being copied
;         to ecx and edx); ebp = address of the last round key

%macro enc_last_round 0

    add     ebp,16
    save    0,ebp
    mov     esi,[ebp+8]
    mov     edi,[ebp+12]

    rnd_fun lr_xor, lr_mov

    mov     eax,ebp
    restore ebp,0
    xor     eax,[ebp]
    xor     ebx,[ebp+4]

%endmacro
 
    section .text align=32 
 
; AES Encryption Subroutine
;
;   AES_RETURN aes_encrypt(const unsigned char in_blk[],
;                 unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
;
; In:     [esp+ 4] in_blk   address of the 16 byte input block
;         [esp+ 8] out_blk  address of the 16 byte output block
;         [esp+12] ctx      address of the key schedule; the byte at offset
;                           4*KS_LENGTH must hold 16 * number of rounds
; Out:    eax = 0 on success, or -1 (nothing written to out_blk) if that
;         byte is not 10*16, 12*16 or 14*16
; Saved:  ebx, esi, edi and ebp; ecx, edx and the flags are not preserved
; Stack:  stk_spc bytes of locals (slot 0 plus the four saved registers)

    do_name _aes_encrypt,12

    sub     esp,stk_spc
    mov     [esp+ 4],edi
    mov     [esp+ 8],esi
    mov     [esp+12],ebx
    mov     [esp+16],ebp

; fetch the input block and add the first round key, one column at a time

    mov     esi,[esp+in_blk+stk_spc] ; input pointer
    mov     ebp,[esp+ctx+stk_spc]    ; key pointer
    movzx   edi,byte [ebp+4*KS_LENGTH]

    mov     eax,[esi   ]
    xor     eax,[ebp   ]
    mov     ebx,[esi+ 4]
    xor     ebx,[ebp+ 4]
    mov     ecx,[esi+ 8]
    xor     ecx,[ebp+ 8]
    mov     edx,[esi+12]
    xor     edx,[ebp+12]

; select the entry point that matches the number of rounds

    cmp     edi,10*16
    je      .rounds10
    cmp     edi,12*16
    je      .rounds12
    cmp     edi,14*16
    je      .rounds14
    mov     eax,-1                   ; unrecognised key schedule length
    jmp     .exit

; the longer schedules run their extra rounds first and then fall through

.rounds14:
%rep 2
    enc_round
%endrep
.rounds12:
%rep 2
    enc_round
%endrep
.rounds10:
%rep 9
    enc_round
%endrep
    enc_last_round

; store the result, which the last round leaves in eax, ebx, esi and edi

    mov     edx,[esp+out_blk+stk_spc]
    mov     [edx],eax
    mov     [edx+4],ebx
    mov     [edx+8],esi
    mov     [edx+12],edi
    xor     eax,eax

.exit:
    mov     edi,[esp+ 4]
    mov     esi,[esp+ 8]
    mov     ebx,[esp+12]
    mov     ebp,[esp+16]
    add     esp,stk_spc
    do_exit 12
 
%endif 
 
; f_key n, size : generate one block of the encryption key schedule
;
;   n     index of the block being generated (0 for the first block that
;         follows the key itself)
;   size  bytes per block: 16, 24 or 32 for 128, 192 or 256 bit keys
;
; Entry:  ebp = address at which block 0 is written (key schedule + size)
;         esi, edi, ecx, edx = words 0..3 of the previous block
;         eax = last word of the previous block
;         rc_val = assembly-time round constant for this block
; Exit:   esi, edi, ecx, edx = words 0..3 of the new block
;         eax = last word written for the new block
;         ebx is used as scratch; rc_val has been stepped on with f2()
;
; Word 0 is the old word 0 xored with the S-box substituted, byte rotated
; previous word (l3s_col) and the round constant; words 1..3 are each the old
; word xored with the new word before it.  For 24 byte blocks words 4 and 5
; are produced the same way from memory, except in block 7 where they are not
; generated.  For 32 byte blocks word 4 is the old word 4 xored with the
; S-box substitution of the new word 3 - the 'rol eax,8' cancels the byte
; rotation built into l3s_col - followed by words 5..7, except in block 6
; where words 4..7 are not generated.

%macro f_key 2

    l3s_col esi,a,ebx
    xor     esi,rc_val

    mov     [ebp+%1*%2],esi
    xor     edi,esi
    mov     [ebp+%1*%2+4],edi
    xor     ecx,edi
    mov     [ebp+%1*%2+8],ecx
    xor     edx,ecx
    mov     [ebp+%1*%2+12],edx
    mov     eax,edx

%if %2 == 24

%if %1 < 7
    xor     eax,[ebp+%1*%2+16-%2]
    mov     [ebp+%1*%2+16],eax
    xor     eax,[ebp+%1*%2+20-%2]
    mov     [ebp+%1*%2+20],eax
%endif

%elif %2 == 32

%if %1 < 6
    rol     eax,8
    push    edx
    mov     edx,[ebp+%1*%2+16-%2]
    l3s_col edx,a,ebx
    mov     eax,edx
    pop     edx
    mov     [ebp+%1*%2+16],eax
    xor     eax,[ebp+%1*%2+20-%2]
    mov     [ebp+%1*%2+20],eax
    xor     eax,[ebp+%1*%2+24-%2]
    mov     [ebp+%1*%2+24],eax
    xor     eax,[ebp+%1*%2+28-%2]
    mov     [ebp+%1*%2+28],eax
%endif

%endif

%assign rc_val f2(rc_val)

%endmacro
 
%ifdef ENCRYPTION_KEY_SCHEDULE 
 
%ifdef  AES_128

%ifndef ENCRYPTION_TABLE
%define ENCRYPTION_TABLE
%endif

%assign rc_val  1

; 128 bit encryption key schedule
;
; In:     [esp+4] key   address of the 16 byte key
;         [esp+8] ctx   address of the key schedule to be filled in
; Out:    eax = 0; 44 words of key schedule are written and 10*16 is stored
;         at offset 4*KS_LENGTH for aes_encrypt/aes_decrypt to read
; Saved:  ebx, esi, edi and ebp; ecx, edx and the flags are not preserved

    do_name _aes_encrypt_key128,8

    push    ebp
    push    ebx
    push    esi
    push    edi

    mov     ebp,[esp+24]            ; ctx (four saved registers + return address)
    mov     [ebp+4*KS_LENGTH],dword 10*16
    mov     ebx,[esp+20]            ; key

    mov     esi,[ebx]               ; copy the key in as round key 0, keeping
    mov     [ebp],esi               ; the words in esi, edi, ecx, edx for f_key
    mov     edi,[ebx+4]
    mov     [ebp+4],edi
    mov     ecx,[ebx+8]
    mov     [ebp+8],ecx
    mov     edx,[ebx+12]
    mov     [ebp+12],edx
    add     ebp,16                  ; f_key writes relative to the end of the key
    mov     eax,edx                 ; eax = last key word, as f_key expects

    f_key   0,16        ; 11 * 4 = 44 unsigned longs
    f_key   1,16        ; 4 + 4 * 10 generated = 44
    f_key   2,16
    f_key   3,16
    f_key   4,16
    f_key   5,16
    f_key   6,16
    f_key   7,16
    f_key   8,16
    f_key   9,16

    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    xor     eax,eax
    do_exit  8

%endif
 
%ifdef  AES_192

%ifndef ENCRYPTION_TABLE
%define ENCRYPTION_TABLE
%endif

%assign rc_val  1

; 192 bit encryption key schedule
;
; In:     [esp+4] key   address of the 24 byte key
;         [esp+8] ctx   address of the key schedule to be filled in
; Out:    eax = 0; 52 words of key schedule are written and 12*16 is stored
;         at offset 4*KS_LENGTH for aes_encrypt/aes_decrypt to read
; Saved:  ebx, esi, edi and ebp; ecx, edx and the flags are not preserved

    do_name _aes_encrypt_key192,8

    push    ebp
    push    ebx
    push    esi
    push    edi

    mov     ebp,[esp+24]            ; ctx (four saved registers + return address)
    mov     [ebp+4*KS_LENGTH],dword 12 * 16
    mov     ebx,[esp+20]            ; key

    mov     esi,[ebx]               ; copy the key in, keeping words 0..3 in
    mov     [ebp],esi               ; esi, edi, ecx, edx for f_key
    mov     edi,[ebx+4]
    mov     [ebp+4],edi
    mov     ecx,[ebx+8]
    mov     [ebp+8],ecx
    mov     edx,[ebx+12]
    mov     [ebp+12],edx
    mov     eax,[ebx+16]
    mov     [ebp+16],eax
    mov     eax,[ebx+20]            ; eax is left holding the last key word
    mov     [ebp+20],eax
    add     ebp,24                  ; f_key writes relative to the end of the key

    f_key   0,24        ; 13 * 4 = 52 unsigned longs
    f_key   1,24        ; 6 key words + 7 * 6 + 4 generated = 52
    f_key   2,24        ; (f_key leaves out words 4 and 5 of block 7)
    f_key   3,24
    f_key   4,24
    f_key   5,24
    f_key   6,24
    f_key   7,24

    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    xor     eax,eax
    do_exit  8

%endif
 
%ifdef  AES_256

%ifndef ENCRYPTION_TABLE
%define ENCRYPTION_TABLE
%endif

%assign rc_val  1

; 256 bit encryption key schedule
;
; In:     [esp+4] key   address of the 32 byte key
;         [esp+8] ctx   address of the key schedule to be filled in
; Out:    eax = 0; 60 words of key schedule are written and 14*16 is stored
;         at offset 4*KS_LENGTH for aes_encrypt/aes_decrypt to read
; Saved:  ebx, esi, edi and ebp; ecx, edx and the flags are not preserved

    do_name _aes_encrypt_key256,8

    push    ebp
    push    ebx
    push    esi
    push    edi

    mov     ebp,[esp+24]            ; ctx (four saved registers + return address)
    mov     [ebp+4*KS_LENGTH],dword 14 * 16
    mov     ebx,[esp+20]            ; key

    mov     esi,[ebx]               ; copy the key in, keeping words 0..3 in
    mov     [ebp],esi               ; esi, edi, ecx, edx for f_key
    mov     edi,[ebx+4]
    mov     [ebp+4],edi
    mov     ecx,[ebx+8]
    mov     [ebp+8],ecx
    mov     edx,[ebx+12]
    mov     [ebp+12],edx
    mov     eax,[ebx+16]
    mov     [ebp+16],eax
    mov     eax,[ebx+20]
    mov     [ebp+20],eax
    mov     eax,[ebx+24]
    mov     [ebp+24],eax
    mov     eax,[ebx+28]            ; eax is left holding the last key word
    mov     [ebp+28],eax
    add     ebp,32                  ; f_key writes relative to the end of the key

    f_key   0,32        ; 15 * 4 = 60 unsigned longs
    f_key   1,32        ; 8 key words + 6 * 8 + 4 generated = 60
    f_key   2,32        ; (f_key leaves out words 4..7 of block 6)
    f_key   3,32
    f_key   4,32
    f_key   5,32
    f_key   6,32

    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    xor     eax,eax
    do_exit  8

%endif
 
%ifdef  AES_VAR

%ifndef ENCRYPTION_TABLE
%define ENCRYPTION_TABLE
%endif

; Variable length encryption key schedule
;
; In:     [esp+ 4] key   address of the key bytes
;         [esp+ 8] len   key length, given in bits (128, 192, 256) or in
;                        bytes (16, 24, 32)
;         [esp+12] ctx   address of the key schedule to be filled in
; Out:    eax = whatever the fixed size routine returned, or -1 when the
;         length is not one of the six accepted values
;
; The key and context addresses are pushed before the length is examined so
; that each branch only has to make its call; the error path discards them.

    do_name _aes_encrypt_key,12

    mov     ecx,[esp+4]             ; key
    mov     eax,[esp+8]             ; len
    mov     edx,[esp+12]            ; ctx
    push    edx
    push    ecx

    cmp     eax,128
    je      .key128
    cmp     eax,16
    je      .key128

    cmp     eax,192
    je      .key192
    cmp     eax,24
    je      .key192

    cmp     eax,256
    je      .key256
    cmp     eax,32
    je      .key256

    add     esp,8                   ; unsupported length: drop the two pushes
    mov     eax,-1
    do_exit 12

.key128:
    do_call _aes_encrypt_key128,8
    do_exit 12
.key192:
    do_call _aes_encrypt_key192,8
    do_exit 12
.key256:
    do_call _aes_encrypt_key256,8
    do_exit 12

%endif
 
%endif 
 
%ifdef ENCRYPTION_TABLE 
 
; S-box data - 256 entries
;
; Each S-box value s is expanded by u8() into the 8 bytes
;
;       0, s, s, 3s, 2s, s, s, 3s
;
; giving a 2048 byte table.  The etab_n accessors read overlapping dwords at
; byte offsets 4, 3, 2 and 1 of an entry, so this single table supplies all
; four byte rotations of the column (2s, s, s, 3s) as well as the plain S-box
; byte (etab_b, offset 1).

    section .data align=32
    align 32

%define u8(x)   0, x, x, f3(x), f2(x), x, x, f3(x)

enc_tab:
    db  u8(0x63),u8(0x7c),u8(0x77),u8(0x7b),u8(0xf2),u8(0x6b),u8(0x6f),u8(0xc5)
    db  u8(0x30),u8(0x01),u8(0x67),u8(0x2b),u8(0xfe),u8(0xd7),u8(0xab),u8(0x76)
    db  u8(0xca),u8(0x82),u8(0xc9),u8(0x7d),u8(0xfa),u8(0x59),u8(0x47),u8(0xf0)
    db  u8(0xad),u8(0xd4),u8(0xa2),u8(0xaf),u8(0x9c),u8(0xa4),u8(0x72),u8(0xc0)
    db  u8(0xb7),u8(0xfd),u8(0x93),u8(0x26),u8(0x36),u8(0x3f),u8(0xf7),u8(0xcc)
    db  u8(0x34),u8(0xa5),u8(0xe5),u8(0xf1),u8(0x71),u8(0xd8),u8(0x31),u8(0x15)
    db  u8(0x04),u8(0xc7),u8(0x23),u8(0xc3),u8(0x18),u8(0x96),u8(0x05),u8(0x9a)
    db  u8(0x07),u8(0x12),u8(0x80),u8(0xe2),u8(0xeb),u8(0x27),u8(0xb2),u8(0x75)
    db  u8(0x09),u8(0x83),u8(0x2c),u8(0x1a),u8(0x1b),u8(0x6e),u8(0x5a),u8(0xa0)
    db  u8(0x52),u8(0x3b),u8(0xd6),u8(0xb3),u8(0x29),u8(0xe3),u8(0x2f),u8(0x84)
    db  u8(0x53),u8(0xd1),u8(0x00),u8(0xed),u8(0x20),u8(0xfc),u8(0xb1),u8(0x5b)
    db  u8(0x6a),u8(0xcb),u8(0xbe),u8(0x39),u8(0x4a),u8(0x4c),u8(0x58),u8(0xcf)
    db  u8(0xd0),u8(0xef),u8(0xaa),u8(0xfb),u8(0x43),u8(0x4d),u8(0x33),u8(0x85)
    db  u8(0x45),u8(0xf9),u8(0x02),u8(0x7f),u8(0x50),u8(0x3c),u8(0x9f),u8(0xa8)
    db  u8(0x51),u8(0xa3),u8(0x40),u8(0x8f),u8(0x92),u8(0x9d),u8(0x38),u8(0xf5)
    db  u8(0xbc),u8(0xb6),u8(0xda),u8(0x21),u8(0x10),u8(0xff),u8(0xf3),u8(0xd2)
    db  u8(0xcd),u8(0x0c),u8(0x13),u8(0xec),u8(0x5f),u8(0x97),u8(0x44),u8(0x17)
    db  u8(0xc4),u8(0xa7),u8(0x7e),u8(0x3d),u8(0x64),u8(0x5d),u8(0x19),u8(0x73)
    db  u8(0x60),u8(0x81),u8(0x4f),u8(0xdc),u8(0x22),u8(0x2a),u8(0x90),u8(0x88)
    db  u8(0x46),u8(0xee),u8(0xb8),u8(0x14),u8(0xde),u8(0x5e),u8(0x0b),u8(0xdb)
    db  u8(0xe0),u8(0x32),u8(0x3a),u8(0x0a),u8(0x49),u8(0x06),u8(0x24),u8(0x5c)
    db  u8(0xc2),u8(0xd3),u8(0xac),u8(0x62),u8(0x91),u8(0x95),u8(0xe4),u8(0x79)
    db  u8(0xe7),u8(0xc8),u8(0x37),u8(0x6d),u8(0x8d),u8(0xd5),u8(0x4e),u8(0xa9)
    db  u8(0x6c),u8(0x56),u8(0xf4),u8(0xea),u8(0x65),u8(0x7a),u8(0xae),u8(0x08)
    db  u8(0xba),u8(0x78),u8(0x25),u8(0x2e),u8(0x1c),u8(0xa6),u8(0xb4),u8(0xc6)
    db  u8(0xe8),u8(0xdd),u8(0x74),u8(0x1f),u8(0x4b),u8(0xbd),u8(0x8b),u8(0x8a)
    db  u8(0x70),u8(0x3e),u8(0xb5),u8(0x66),u8(0x48),u8(0x03),u8(0xf6),u8(0x0e)
    db  u8(0x61),u8(0x35),u8(0x57),u8(0xb9),u8(0x86),u8(0xc1),u8(0x1d),u8(0x9e)
    db  u8(0xe1),u8(0xf8),u8(0x98),u8(0x11),u8(0x69),u8(0xd9),u8(0x8e),u8(0x94)
    db  u8(0x9b),u8(0x1e),u8(0x87),u8(0xe9),u8(0xce),u8(0x55),u8(0x28),u8(0xdf)
    db  u8(0x8c),u8(0xa1),u8(0x89),u8(0x0d),u8(0xbf),u8(0xe6),u8(0x42),u8(0x68)
    db  u8(0x41),u8(0x99),u8(0x2d),u8(0x0f),u8(0xb0),u8(0x54),u8(0xbb),u8(0x16)
 
%endif 
 
%ifdef  DECRYPTION 
 
%define DECRYPTION_TABLE 
 
; Accessors for the decryption table dec_tab, where x is a register holding a
; zero-extended byte index.  Each table entry is the 8 bytes
;
;       Es, 9s, Ds, Bs, Es, 9s, Ds, s       (s = the inverse S-box value)
;
; so the dwords read at byte offsets 0, 3, 2 and 1 are, low byte first:
;
;   dtab_0 : Es, 9s, Ds, Bs
;   dtab_1 : Bs, Es, 9s, Ds
;   dtab_2 : Ds, Bs, Es, 9s
;   dtab_3 : 9s, Ds, Bs, Es
;
; and dtab_x is the plain inverse S-box byte at offset 7.

%define dtab_0(x)   [dec_tab+  8*x]
%define dtab_1(x)   [dec_tab+3+8*x]
%define dtab_2(x)   [dec_tab+2+8*x]
%define dtab_3(x)   [dec_tab+1+8*x]
%define dtab_x(x)   byte [dec_tab+7+8*x]
 
; INVERSE ROUND FUNCTION.  The two macro parameters are the names of the
; 'xor table value into register' and 'move table value into register'
; macros (ni_xor/ni_mov for normal rounds, li_xor/li_mov for the last round),
; invoked as:  op  dest, source byte register, table/byte position, scratch
;
; Input:
;
;   EAX, EBX, ECX, EDX  columns 0..3
;   ESI, EDI            round key words 2 and 3
;   EBP                 scratch
;
; Output:
;
;   EAX     column[0]   unkeyed
;   EBP     column[1]   unkeyed
;   ESI     column[2]   keyed
;   EDI     column[3]   keyed
;   EBX, ECX, EDX       scratch
;
; With input column c and byte b written c.b (b = 0 is the low byte):
;
;   out[0] = D0[0.0] ^ D1[3.1] ^ D2[2.2] ^ D3[1.3]
;   out[1] = D0[1.0] ^ D1[0.1] ^ D2[3.2] ^ D3[2.3]
;   out[2] = D0[2.0] ^ D1[1.1] ^ D2[0.2] ^ D3[3.3]    (xored onto ESI)
;   out[3] = D0[3.0] ^ D1[2.1] ^ D2[1.2] ^ D3[0.3]    (xored onto EDI)
;
; As in rnd_fun the order of the lines matters: a register is only reused as
; a destination or as scratch once the input bytes it held have been used.

%macro irn_fun 2

    rol eax,16                      ; al,ah = column[0] bytes 2,3
    %1      esi, cl, 0, ebp
    %1      esi, bh, 1, ebp
    %1      esi, al, 2, ebp
    %1      edi, dl, 0, ebp
    %1      edi, ch, 1, ebp
    %1      edi, ah, 3, ebp
    %2      ebp, bl, 0, ebp         ; start output column[1] in ebp
    shr     eax,16                  ; eax = low half of column[0]
    and     ebx,0xffff0000
    or      ebx,eax                 ; ebx = column[1] high : column[0] low
    shr     ecx,16                  ; cl,ch = column[2] bytes 2,3
    %1      ebp, bh, 1, eax
    %1      ebp, ch, 3, eax
    %2      eax, cl, 2, ecx         ; start output column[0] in eax
    %1      eax, bl, 0, ecx
    %1      eax, dh, 1, ecx
    shr     ebx,16                  ; bl,bh = column[1] bytes 2,3
    shr     edx,16                  ; dl,dh = column[3] bytes 2,3
    %1      esi, dh, 3, ecx
    %1      ebp, dl, 2, ecx
    %1      eax, bh, 3, ecx
    %1      edi, bl, 2, ecx

%endmacro
 
; Basic MOV and XOR Operations for normal rounds
;
; Parameters (both macros):
;   1 : destination register
;   2 : byte register holding the table index
;   3 : table number 0..3 (selects dtab_0 .. dtab_3)
;   4 : scratch register that receives the zero-extended index; it may be the
;       same register as the destination in ni_mov
;
; ni_xor xors the selected 32-bit table value into the destination and
; ni_mov loads it into the destination.

%macro  ni_xor  4
    movzx   %4,%2
    xor     %1,dtab_%3(%4)
%endmacro

%macro  ni_mov  4
    movzx   %4,%2
    mov     %1,dtab_%3(%4)
%endmacro
 
; Basic MOV and XOR Operations for last round
;
; These take the same four parameters as ni_xor/ni_mov but use only the plain
; inverse S-box byte (dtab_x): it is looked up, shifted to byte position
; 'parameter 3' (0..3) and then xored into (li_xor) or loaded into (li_mov)
; the destination register.

%macro  li_xor  4
    movzx   %4,%2
    movzx   %4,dtab_x(%4)
%if %3 != 0
    shl     %4,8*%3
%endif
    xor     %1,%4
%endmacro

%macro  li_mov  4
    movzx   %4,%2
    movzx   %1,dtab_x(%4)
%if %3 != 0
    shl     %1,8*%3
%endif
%endmacro
 
; One normal decryption round.
;
; Entry:  eax, ebx, ecx, edx = state columns 0..3
;         ebp = address of the round key used by the previous step
; Exit:   eax, ebx, ecx, edx = new state columns 0..3 with the round key added
;         ebp = address of this round's key
;         esi, edi are overwritten; local stack slot 0 holds a copy of ebp
;
; The key pointer moves up through the schedule when AES_REV_DKS is defined
; (the schedule was stored reversed) and down through it otherwise.  ebp is
; parked in stack slot 0 because irn_fun uses it as a working register.

%macro dec_round 0

%ifdef AES_REV_DKS
    add     ebp,16
%else
    sub     ebp,16
%endif
    save    0,ebp
    mov     esi,[ebp+8]
    mov     edi,[ebp+12]

    irn_fun ni_xor, ni_mov

    mov     ebx,ebp
    mov     ecx,esi
    mov     edx,edi
    restore ebp,0
    xor     eax,[ebp]
    xor     ebx,[ebp+4]

%endmacro
 
; The final decryption round, built from the last round table operations
; li_xor and li_mov.
;
; Entry:  eax, ebx, ecx, edx = state columns 0..3
;         ebp = address of the round key used by the previous round
; Exit:   eax, ebx, esi, edi = output columns 0..3 with the round key added
;         (columns 2 and 3 are left in esi and edi rather than being copied
;         to ecx and edx); ebp = address of the last round key used

%macro dec_last_round 0

%ifdef AES_REV_DKS
    add     ebp,16
%else
    sub     ebp,16
%endif
    save    0,ebp
    mov     esi,[ebp+8]
    mov     edi,[ebp+12]

    irn_fun li_xor, li_mov

    mov     ebx,ebp
    restore ebp,0
    xor     eax,[ebp]
    xor     ebx,[ebp+4]

%endmacro
 
    section .text align=32 
 
; AES Decryption Subroutine
;
;   AES_RETURN aes_decrypt(const unsigned char in_blk[],
;                 unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
;
; In:     [esp+ 4] in_blk   address of the 16 byte input block
;         [esp+ 8] out_blk  address of the 16 byte output block
;         [esp+12] ctx      address of the decryption key schedule; the byte
;                           at offset 4*KS_LENGTH must hold 16 * rounds
; Out:    eax = 0 on success, or -1 (nothing written to out_blk) if that
;         byte is not 10*16, 12*16 or 14*16
; Saved:  ebx, esi, edi and ebp; ecx, edx and the flags are not preserved
; Stack:  stk_spc bytes of locals (slot 0 plus the four saved registers)

    do_name _aes_decrypt,12

    sub     esp,stk_spc
    mov     [esp+16],ebp
    mov     [esp+12],ebx
    mov     [esp+ 8],esi
    mov     [esp+ 4],edi

; input four columns and xor in first round key
; (esi is not advanced past the block: it is reloaded by the first round, or
; restored on the error path, before it is read again)

    mov     esi,[esp+in_blk+stk_spc] ; input pointer
    mov     eax,[esi   ]
    mov     ebx,[esi+ 4]
    mov     ecx,[esi+ 8]
    mov     edx,[esi+12]

    mov     ebp,[esp+ctx+stk_spc]    ; key pointer
    movzx   edi,byte[ebp+4*KS_LENGTH]
%ifndef  AES_REV_DKS        ; if decryption key schedule is not reversed
    lea     ebp,[ebp+edi] ; we have to access it from the top down
%endif
    xor     eax,[ebp   ]  ; key schedule
    xor     ebx,[ebp+ 4]
    xor     ecx,[ebp+ 8]
    xor     edx,[ebp+12]

; determine the number of rounds

    cmp     edi,10*16
    je      .rounds10
    cmp     edi,12*16
    je      .rounds12
    cmp     edi,14*16
    je      .rounds14
    mov     eax,-1                   ; unrecognised key schedule length
    jmp     .exit

; the longer schedules run their extra rounds first and then fall through

.rounds14:
    dec_round
    dec_round
.rounds12:
    dec_round
    dec_round
.rounds10:
    dec_round
    dec_round
    dec_round
    dec_round
    dec_round
    dec_round
    dec_round
    dec_round
    dec_round
    dec_last_round

; move final values to the output array.

    mov     ebp,[esp+out_blk+stk_spc]
    mov     [ebp],eax
    mov     [ebp+4],ebx
    mov     [ebp+8],esi
    mov     [ebp+12],edi
    xor     eax,eax

.exit:
    mov     ebp,[esp+16]
    mov     ebx,[esp+12]
    mov     esi,[esp+ 8]
    mov     edi,[esp+ 4]
    add     esp,stk_spc
    do_exit 12
 
%endif 
 
; Apply the inverse mix column transformation to one 32-bit key word.
;
; In:     edx = word to transform
; Out:    eax = transformed word
; Uses:   ebx as scratch; edx is destroyed (shifted right by 16)
;
; dec_tab holds the inverse mix column multiples of the INVERSE S-box value
; of its index.  Each input byte is therefore first mapped through the forward
; S-box (etab_b), which cancels the inverse S-box that dec_tab has built in,
; and the four table words dtab_0 .. dtab_3 are then xored together.

%macro  inv_mix_col 0

    movzx   ebx,dl
    movzx   ebx,etab_b(ebx)
    mov     eax,dtab_0(ebx)
    movzx   ebx,dh
    shr     edx,16
    movzx   ebx,etab_b(ebx)
    xor     eax,dtab_1(ebx)
    movzx   ebx,dl
    movzx   ebx,etab_b(ebx)
    xor     eax,dtab_2(ebx)
    movzx   ebx,dh
    movzx   ebx,etab_b(ebx)
    xor     eax,dtab_3(ebx)

%endmacro
 
%ifdef DECRYPTION_KEY_SCHEDULE 
 
%ifdef AES_128

%ifndef DECRYPTION_TABLE
%define DECRYPTION_TABLE
%endif

; 128 bit decryption key schedule
;
; In:     [esp+4] key   address of the 16 byte key
;         [esp+8] ctx   address of the key schedule to be filled in
; Out:    eax = 0 (set in the shared tail at dec_end)
; Saved:  ebx, esi, edi and ebp (restored at dec_end)
;
; The encryption key schedule is generated first and then the inverse mix
; column transformation is applied to every round key except the first and
; the last.  The four words of round key 1 are transformed one by one.  For
; the later round keys three of the four words of an encryption round key
; ek[r] obey
;
;       ek[r][n] = ek[r][n-1] ^ ek[r-1][n]      for n = 1..3
;
; and, as inv_mix_col is linear in GF(256), the decryption round keys obey
;
;       dk[r][n] = dk[r][n-1] ^ dk[r-1][n]
;
; so only one inverse mix column operation (n = 0) is needed for each four
; word cycle.  Control then continues at dec_end with esi = address of the
; last round key on leaving the loop and edi = the same address.

    do_name _aes_decrypt_key128,8

    push    ebp
    push    ebx
    push    esi
    push    edi
    mov     eax,[esp+24]    ; context
    mov     edx,[esp+20]    ; key
    push    eax
    push    edx
    do_call _aes_encrypt_key128,8   ; generate expanded encryption key
    mov     eax,10*16
    mov     esi,[esp+24]    ; pointer to first round key
    lea     edi,[esi+eax]   ; pointer to last round key
    add     esi,32          ; esi = round key 2

    mov     edx,[esi-16]    ; transform the four words of round key 1
    inv_mix_col
    mov     [esi-16],eax
    mov     edx,[esi-12]
    inv_mix_col
    mov     [esi-12],eax
    mov     edx,[esi-8]
    inv_mix_col
    mov     [esi-8],eax
    mov     edx,[esi-4]
    inv_mix_col
    mov     [esi-4],eax

.next_round:
    mov     edx,[esi]       ; word 0 needs the full transformation
    inv_mix_col
    mov     [esi],eax
    xor     eax,[esi-12]    ; words 1..3 follow from the recurrence
    mov     [esi+4],eax
    xor     eax,[esi-8]
    mov     [esi+8],eax
    xor     eax,[esi-4]
    mov     [esi+12],eax
    add     esi,16
    cmp     edi,esi
    ja      .next_round     ; unsigned compare: esi and edi are addresses
    jmp     dec_end

%endif
 
%ifdef AES_192

%ifndef DECRYPTION_TABLE
%define DECRYPTION_TABLE
%endif

; 192 bit decryption key schedule
;
; In:     [esp+4] key   address of the 24 byte key
;         [esp+8] ctx   address of the key schedule to be filled in
; Out:    eax = 0 (set in the shared tail at dec_end)
; Saved:  ebx, esi, edi and ebp (restored at dec_end)
;
; The encryption key schedule is generated first and then the inverse mix
; column transformation is applied to every round key except the first and
; the last.  The first 6 words are the key, of which the top 2 words are part
; of the second round key and hence need to be modified.  After this a
; further six values are transformed directly, prior to using the more
; efficient technique based on
;
;       dk[r][n] = dk[r][n-1] ^ dk[r-1][n]
;
; for n = 1 .. 5, where the key expansion cycle is now 6 words long.
;
; The expanded key is 13 * 4 = 52 32-bit words of which 11 * 4 = 44 have to
; be modified.  Eight are done before the loop so 36 are left - hence exactly
; 6 loops of six.  Control then continues at dec_end with edi = address of
; the last round key.

    do_name _aes_decrypt_key192,8

    push    ebp
    push    ebx
    push    esi
    push    edi
    mov     eax,[esp+24]    ; context
    mov     edx,[esp+20]    ; key
    push    eax
    push    edx
    do_call _aes_encrypt_key192,8   ; generate expanded encryption key
    mov     eax,12*16
    mov     esi,[esp+24]    ; first round key
    lea     edi,[esi+eax]   ; last round key
    add     esi,48          ; esi = word 12 of the schedule

    mov     edx,[esi-32]    ; words 4 and 5: the part of the key that lies
    inv_mix_col             ; in round key 1
    mov     [esi-32],eax
    mov     edx,[esi-28]
    inv_mix_col
    mov     [esi-28],eax

    mov     edx,[esi-24]    ; words 6 .. 11: one complete six word cycle
    inv_mix_col
    mov     [esi-24],eax
    mov     edx,[esi-20]
    inv_mix_col
    mov     [esi-20],eax
    mov     edx,[esi-16]
    inv_mix_col
    mov     [esi-16],eax
    mov     edx,[esi-12]
    inv_mix_col
    mov     [esi-12],eax
    mov     edx,[esi-8]
    inv_mix_col
    mov     [esi-8],eax
    mov     edx,[esi-4]
    inv_mix_col
    mov     [esi-4],eax

.next_cycle:
    mov     edx,[esi]       ; word 0 needs the full transformation
    inv_mix_col
    mov     [esi],eax
    xor     eax,[esi-20]    ; words 1..5 follow from the recurrence
    mov     [esi+4],eax
    xor     eax,[esi-16]
    mov     [esi+8],eax
    xor     eax,[esi-12]
    mov     [esi+12],eax
    xor     eax,[esi-8]
    mov     [esi+16],eax
    xor     eax,[esi-4]
    mov     [esi+20],eax
    add     esi,24
    cmp     edi,esi
    ja      .next_cycle     ; unsigned compare: esi and edi are addresses
    jmp     dec_end

%endif
 
%ifdef AES_256

%ifndef DECRYPTION_TABLE
%define DECRYPTION_TABLE
%endif

; 256 bit decryption key schedule
;
; In:     [esp+4] key   address of the 32 byte key
;         [esp+8] ctx   address of the key schedule to be filled in
; Out:    eax = 0 (set in the shared tail at dec_end)
; Saved:  ebx, esi, edi and ebp (restored at dec_end)
;
; The encryption key schedule is generated first and then the inverse mix
; column transformation is applied to every round key except the first and
; the last.  The primary key is 8 words, of which the top four require
; modification.  The encryption key expansion cycle is eight words long so
; one complete block of eight is then transformed word by word.  The
; remaining rounds are speeded up with
;
;       dk[r][n] = dk[r][n-1] ^ dk[r-1][n]
;
; but note that there is one extra inverse mix column operation in each cycle
; as the 256 bit key has an extra non-linear step for the midway element.
;
; The expanded key is 15 * 4 = 60 32-bit words of which 52 need to be
; modified.  Twelve are done before the loop so 40 are left - which means
; exactly 5 loops of 8.  This routine falls through into dec_end with
; edi = address of the last round key.

    do_name _aes_decrypt_key256,8

    push    ebp
    push    ebx
    push    esi
    push    edi
    mov     eax,[esp+24]    ; context
    mov     edx,[esp+20]    ; key
    push    eax
    push    edx
    do_call _aes_encrypt_key256,8   ; generate expanded encryption key
    mov     eax,14*16
    mov     esi,[esp+24]    ; first round key
    lea     edi,[esi+eax]   ; last round key
    add     esi,64          ; esi = word 16 of the schedule

    mov     edx,[esi-48]    ; words 4 .. 7: the top half of the key
    inv_mix_col
    mov     [esi-48],eax
    mov     edx,[esi-44]
    inv_mix_col
    mov     [esi-44],eax
    mov     edx,[esi-40]
    inv_mix_col
    mov     [esi-40],eax
    mov     edx,[esi-36]
    inv_mix_col
    mov     [esi-36],eax

    mov     edx,[esi-32]    ; words 8 .. 15: one complete eight word cycle
    inv_mix_col
    mov     [esi-32],eax
    mov     edx,[esi-28]
    inv_mix_col
    mov     [esi-28],eax
    mov     edx,[esi-24]
    inv_mix_col
    mov     [esi-24],eax
    mov     edx,[esi-20]
    inv_mix_col
    mov     [esi-20],eax
    mov     edx,[esi-16]
    inv_mix_col
    mov     [esi-16],eax
    mov     edx,[esi-12]
    inv_mix_col
    mov     [esi-12],eax
    mov     edx,[esi-8]
    inv_mix_col
    mov     [esi-8],eax
    mov     edx,[esi-4]
    inv_mix_col
    mov     [esi-4],eax

.next_cycle:
    mov     edx,[esi]       ; word 0 needs the full transformation
    inv_mix_col
    mov     [esi],eax
    xor     eax,[esi-28]    ; words 1..3 follow from the recurrence
    mov     [esi+4],eax
    xor     eax,[esi-24]
    mov     [esi+8],eax
    xor     eax,[esi-20]
    mov     [esi+12],eax
    mov     edx,[esi+16]    ; word 4 (the midway element) is transformed
    inv_mix_col             ; in full as well
    mov     [esi+16],eax
    xor     eax,[esi-12]    ; words 5..7 follow from the recurrence
    mov     [esi+20],eax
    xor     eax,[esi-8]
    mov     [esi+24],eax
    xor     eax,[esi-4]
    mov     [esi+28],eax
    add     esi,32
    cmp     edi,esi
    ja      .next_cycle     ; unsigned compare: esi and edi are addresses

%endif
 
; Shared tail of the three fixed size decryption key schedule routines.
;
; Entry:  edi = address of the last round key
;         [esp+24] = address of the key schedule (the routines' ctx parameter,
;         above the four saved registers and the return address)
; Exit:   returns to the caller of the key schedule routine with eax = 0 and
;         ebx, esi, edi and ebp restored

dec_end:

%ifdef AES_REV_DKS

; Reverse the order of the round keys by swapping 16 byte round keys, working
; inwards from both ends until the two pointers meet or cross.

    mov     esi,[esp+24]    ; esi = first round key
.swap:
    mov     eax,[esi]       ; exchange words 0 and 1
    mov     ebx,[esi+4]
    mov     ebp,[edi]
    mov     edx,[edi+4]
    mov     [esi],ebp
    mov     [esi+4],edx
    mov     [edi],eax
    mov     [edi+4],ebx

    mov     eax,[esi+8]     ; exchange words 2 and 3
    mov     ebx,[esi+12]
    mov     ebp,[edi+8]
    mov     edx,[edi+12]
    mov     [esi+8],ebp
    mov     [esi+12],edx
    mov     [edi+8],eax
    mov     [edi+12],ebx

    add     esi,16
    sub     edi,16
    cmp     edi,esi
    ja      .swap           ; unsigned compare: esi and edi are addresses

%endif

    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    xor     eax,eax
    do_exit  8
 
%ifdef AES_VAR

; Variable length decryption key schedule
;
; In:     [esp+ 4] key   address of the key bytes
;         [esp+ 8] len   key length, given in bits (128, 192, 256) or in
;                        bytes (16, 24, 32)
;         [esp+12] ctx   address of the key schedule to be filled in
; Out:    eax = whatever the fixed size routine returned, or -1 when the
;         length is not one of the six accepted values
;
; The key and context addresses are pushed before the length is examined so
; that each branch only has to make its call; the error path discards them.

    do_name _aes_decrypt_key,12

    mov     ecx,[esp+4]             ; key
    mov     eax,[esp+8]             ; len
    mov     edx,[esp+12]            ; ctx
    push    edx
    push    ecx

    cmp     eax,128
    je      .key128
    cmp     eax,16
    je      .key128

    cmp     eax,192
    je      .key192
    cmp     eax,24
    je      .key192

    cmp     eax,256
    je      .key256
    cmp     eax,32
    je      .key256

    add     esp,8                   ; unsupported length: drop the two pushes
    mov     eax,-1
    do_exit 12

.key128:
    do_call _aes_decrypt_key128,8
    do_exit 12
.key192:
    do_call _aes_decrypt_key192,8
    do_exit 12
.key256:
    do_call _aes_decrypt_key256,8
    do_exit 12

%endif
 
%endif 
 
%ifdef DECRYPTION_TABLE 
 
; Inverse S-box data - 256 entries
;
; Each inverse S-box value s is expanded by v8() into the 8 bytes
;
;       Es, 9s, Ds, Bs, Es, 9s, Ds, s
;
; giving a 2048 byte table.  The dtab_n accessors read overlapping dwords at
; byte offsets 0, 3, 2 and 1 of an entry, so this single table supplies all
; four byte rotations of the column (Es, 9s, Ds, Bs); the plain inverse S-box
; byte is at offset 7 (dtab_x).

    section .data align=32
    align 32

%define v8(x)   fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x

dec_tab:
    db  v8(0x52),v8(0x09),v8(0x6a),v8(0xd5),v8(0x30),v8(0x36),v8(0xa5),v8(0x38)
    db  v8(0xbf),v8(0x40),v8(0xa3),v8(0x9e),v8(0x81),v8(0xf3),v8(0xd7),v8(0xfb)
    db  v8(0x7c),v8(0xe3),v8(0x39),v8(0x82),v8(0x9b),v8(0x2f),v8(0xff),v8(0x87)
    db  v8(0x34),v8(0x8e),v8(0x43),v8(0x44),v8(0xc4),v8(0xde),v8(0xe9),v8(0xcb)
    db  v8(0x54),v8(0x7b),v8(0x94),v8(0x32),v8(0xa6),v8(0xc2),v8(0x23),v8(0x3d)
    db  v8(0xee),v8(0x4c),v8(0x95),v8(0x0b),v8(0x42),v8(0xfa),v8(0xc3),v8(0x4e)
    db  v8(0x08),v8(0x2e),v8(0xa1),v8(0x66),v8(0x28),v8(0xd9),v8(0x24),v8(0xb2)
    db  v8(0x76),v8(0x5b),v8(0xa2),v8(0x49),v8(0x6d),v8(0x8b),v8(0xd1),v8(0x25)
    db  v8(0x72),v8(0xf8),v8(0xf6),v8(0x64),v8(0x86),v8(0x68),v8(0x98),v8(0x16)
    db  v8(0xd4),v8(0xa4),v8(0x5c),v8(0xcc),v8(0x5d),v8(0x65),v8(0xb6),v8(0x92)
    db  v8(0x6c),v8(0x70),v8(0x48),v8(0x50),v8(0xfd),v8(0xed),v8(0xb9),v8(0xda)
    db  v8(0x5e),v8(0x15),v8(0x46),v8(0x57),v8(0xa7),v8(0x8d),v8(0x9d),v8(0x84)
    db  v8(0x90),v8(0xd8),v8(0xab),v8(0x00),v8(0x8c),v8(0xbc),v8(0xd3),v8(0x0a)
    db  v8(0xf7),v8(0xe4),v8(0x58),v8(0x05),v8(0xb8),v8(0xb3),v8(0x45),v8(0x06)
    db  v8(0xd0),v8(0x2c),v8(0x1e),v8(0x8f),v8(0xca),v8(0x3f),v8(0x0f),v8(0x02)
    db  v8(0xc1),v8(0xaf),v8(0xbd),v8(0x03),v8(0x01),v8(0x13),v8(0x8a),v8(0x6b)
    db  v8(0x3a),v8(0x91),v8(0x11),v8(0x41),v8(0x4f),v8(0x67),v8(0xdc),v8(0xea)
    db  v8(0x97),v8(0xf2),v8(0xcf),v8(0xce),v8(0xf0),v8(0xb4),v8(0xe6),v8(0x73)
    db  v8(0x96),v8(0xac),v8(0x74),v8(0x22),v8(0xe7),v8(0xad),v8(0x35),v8(0x85)
    db  v8(0xe2),v8(0xf9),v8(0x37),v8(0xe8),v8(0x1c),v8(0x75),v8(0xdf),v8(0x6e)
    db  v8(0x47),v8(0xf1),v8(0x1a),v8(0x71),v8(0x1d),v8(0x29),v8(0xc5),v8(0x89)
    db  v8(0x6f),v8(0xb7),v8(0x62),v8(0x0e),v8(0xaa),v8(0x18),v8(0xbe),v8(0x1b)
    db  v8(0xfc),v8(0x56),v8(0x3e),v8(0x4b),v8(0xc6),v8(0xd2),v8(0x79),v8(0x20)
    db  v8(0x9a),v8(0xdb),v8(0xc0),v8(0xfe),v8(0x78),v8(0xcd),v8(0x5a),v8(0xf4)
    db  v8(0x1f),v8(0xdd),v8(0xa8),v8(0x33),v8(0x88),v8(0x07),v8(0xc7),v8(0x31)
    db  v8(0xb1),v8(0x12),v8(0x10),v8(0x59),v8(0x27),v8(0x80),v8(0xec),v8(0x5f)
    db  v8(0x60),v8(0x51),v8(0x7f),v8(0xa9),v8(0x19),v8(0xb5),v8(0x4a),v8(0x0d)
    db  v8(0x2d),v8(0xe5),v8(0x7a),v8(0x9f),v8(0x93),v8(0xc9),v8(0x9c),v8(0xef)
    db  v8(0xa0),v8(0xe0),v8(0x3b),v8(0x4d),v8(0xae),v8(0x2a),v8(0xf5),v8(0xb0)
    db  v8(0xc8),v8(0xeb),v8(0xbb),v8(0x3c),v8(0x83),v8(0x53),v8(0x99),v8(0x61)
    db  v8(0x17),v8(0x2b),v8(0x04),v8(0x7e),v8(0xba),v8(0x77),v8(0xd6),v8(0x26)
    db  v8(0xe1),v8(0x69),v8(0x14),v8(0x63),v8(0x55),v8(0x21),v8(0x0c),v8(0x7d)
 
%endif 
 
    end