www.pudn.com > aes-src-22-11-16.zip > aes_amd64.asm


 
; --------------------------------------------------------------------------- 
; Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved. 
; 
; LICENSE TERMS 
; 
; The free distribution and use of this software in both source and binary 
; form is allowed (with or without changes) provided that: 
; 
;   1. distributions of this source code include the above copyright 
;      notice, this list of conditions and the following disclaimer; 
; 
;   2. distributions in binary form include the above copyright 
;      notice, this list of conditions and the following disclaimer 
;      in the documentation and/or other associated materials; 
; 
;   3. the copyright holder's name is not used to endorse products 
;      built using this software without specific written permission. 
; 
; ALTERNATIVELY, provided that this notice is retained in full, this product 
; may be distributed under the terms of the GNU General Public License (GPL), 
; in which case the provisions of the GPL apply INSTEAD OF those given above. 
; 
; DISCLAIMER 
; 
; This software is provided 'as is' with no explicit or implied warranties 
; in respect of its properties, including, but not limited to, correctness 
; and/or fitness for purpose. 
; --------------------------------------------------------------------------- 
; Issue 09/09/2006 
 
; I am grateful to Dag Arne Osvik for many discussions of the techniques that 
; can be used to optimise AES assembler code on AMD64/EM64T architectures. 
; Some of the techniques used in this implementation are the result of 
; suggestions made by him for which I am most grateful. 
 
; An AES implementation for AMD64 processors using the YASM assembler.  This 
; implemetation provides only encryption, decryption and hence requires key 
; scheduling support in C. It uses 8k bytes of tables but its encryption and 
; decryption performance is very close to that obtained using large tables. 
; It can use either Windows or Gnu/Linux calling conventions, which are as 
; follows: 
;               windows  gnu/linux 
; 
;   in_blk          rcx     rdi 
;   out_blk         rdx     rsi 
;   context (cx)     r8     rdx 
; 
;   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15 
;   registers       rdi      -      on both 
; 
;   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11 
;   registers        -      rdi     on both 
; 
; The default convention is that for windows, the gnu/linux convention being 
; used if __GNUC__ is defined. 
; 
; This code provides the standard AES block size (128 bits, 16 bytes) and the 
; three standard AES key sizes (128, 192 and 256 bits). It has the same call 
; interface as my C implementation.  It uses the Microsoft C AMD64 calling 
; conventions in which the three parameters are placed in  rcx, rdx and r8 
; respectively.  The rbx, rsi, rdi, rbp and r12..r15 registers are preserved. 
; 
;     AES_RETURN aes_encrypt(const unsigned char in_blk[], 
;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]); 
; 
;     AES_RETURN aes_decrypt(const unsigned char in_blk[], 
;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]); 
; 
;     AES_RETURN aes_encrypt_key(const unsigned char key[], 
;                                            const aes_encrypt_ctx cx[1]); 
; 
;     AES_RETURN aes_decrypt_key(const unsigned char key[], 
;                                            const aes_decrypt_ctx cx[1]); 
; 
;     AES_RETURN aes_encrypt_key(const unsigned char key[], 
;                           unsigned int len, const aes_decrypt_ctx cx[1]); 
; 
;     AES_RETURN aes_decrypt_key(const unsigned char key[], 
;                           unsigned int len, const aes_decrypt_ctx cx[1]); 
; 
; where  is 128, 102 or 256.  In the last two calls the length can be in 
; either bits or bytes. 
; 
; Comment in/out the following lines to obtain the desired subroutines. These 
; selections MUST match those in the C header file aes.h 
 
%define AES_128                 ; define if AES with 128 bit keys is needed 
%define AES_192                 ; define if AES with 192 bit keys is needed 
%define AES_256                 ; define if AES with 256 bit keys is needed 
%define AES_VAR                 ; define if a variable key size is needed 
%define ENCRYPTION              ; define if encryption is needed 
%define DECRYPTION              ; define if decryption is needed 
%define AES_REV_DKS             ; define if key decryption schedule is reversed 
%define LAST_ROUND_TABLES       ; define for the faster version using extra tables 
 
; The encryption key schedule has the following in memory layout where N is the 
; number of rounds (10, 12 or 14): 
; 
; lo: | input key (round 0)  |  ; each round is four 32-bit words 
;     | encryption round 1   | 
;     | encryption round 2   | 
;     .... 
;     | encryption round N-1 | 
; hi: | encryption round N   | 
; 
; The decryption key schedule is normally set up so that it has the same 
; layout as above by actually reversing the order of the encryption key 
; schedule in memory (this happens when AES_REV_DKS is set): 
; 
; lo: | decryption round 0   | =              | encryption round N   | 
;     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ] 
;     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ] 
;     ....                       .... 
;     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ] 
; hi: | decryption round N   | =              | input key (round 0)  | 
; 
; with rounds except the first and last modified using inv_mix_column() 
; But if AES_REV_DKS is NOT set the order of keys is left as it is for 
; encryption so that it has to be accessed in reverse when used for 
; decryption (although the inverse mix column modifications are done) 
; 
; lo: | decryption round 0   | =              | input key (round 0)  | 
;     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ] 
;     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ] 
;     ....                       .... 
;     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] 
; hi: | decryption round N   | =              | encryption round N   | 
; 
; This layout is faster when the assembler key scheduling provided here 
; is used. 
; 
; The DLL interface must use the _stdcall convention in which the number 
; of bytes of parameter space is added after an @ to the sutine's name. 
; We must also remove our parameters from the stack before return (see 
; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version. 
 
;%define DLL_EXPORT 
 
; End of user defines 
 
%ifdef AES_VAR 
%ifndef AES_128 
%define AES_128 
%endif 
%ifndef AES_192 
%define AES_192 
%endif 
%ifndef AES_256 
%define AES_256 
%endif 
%endif 
 
%ifdef AES_VAR 
%define KS_LENGTH       60 
%elifdef AES_256 
%define KS_LENGTH       60 
%elifdef AES_192 
%define KS_LENGTH       52 
%else 
%define KS_LENGTH       44 
%endif 
 
%define     r0  rax 
%define     r1  rdx 
%define     r2  rcx 
%define     r3  rbx 
%define     r4  rsi 
%define     r5  rdi 
%define     r6  rbp 
%define     r7  rsp 
 
%define     raxd    eax 
%define     rdxd    edx 
%define     rcxd    ecx 
%define     rbxd    ebx 
%define     rsid    esi 
%define     rdid    edi 
%define     rbpd    ebp 
%define     rspd    esp 
 
%define     raxb    al 
%define     rdxb    dl 
%define     rcxb    cl 
%define     rbxb    bl 
%define     rsib    sil 
%define     rdib    dil 
%define     rbpb    bpl 
%define     rspb    spl 
 
%define     r0h ah 
%define     r1h dh 
%define     r2h ch 
%define     r3h bh 
 
%define     r0d eax 
%define     r1d edx 
%define     r2d ecx 
%define     r3d ebx 
 
; finite field multiplies by {02}, {04} and {08} 
 
%define f2(x)   ((x<<1)^(((x>>7)&1)*0x11b)) 
%define f4(x)   ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) 
%define f8(x)   ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) 
 
; finite field multiplies required in table generation 
 
%define f3(x)   (f2(x) ^ x) 
%define f9(x)   (f8(x) ^ x) 
%define fb(x)   (f8(x) ^ f2(x) ^ x) 
%define fd(x)   (f8(x) ^ f4(x) ^ x) 
%define fe(x)   (f8(x) ^ f4(x) ^ f2(x)) 
 
; macro for expanding S-box data 
 
%macro enc_vals 1 
    db  %1(0x63),%1(0x7c),%1(0x77),%1(0x7b),%1(0xf2),%1(0x6b),%1(0x6f),%1(0xc5) 
    db  %1(0x30),%1(0x01),%1(0x67),%1(0x2b),%1(0xfe),%1(0xd7),%1(0xab),%1(0x76) 
    db  %1(0xca),%1(0x82),%1(0xc9),%1(0x7d),%1(0xfa),%1(0x59),%1(0x47),%1(0xf0) 
    db  %1(0xad),%1(0xd4),%1(0xa2),%1(0xaf),%1(0x9c),%1(0xa4),%1(0x72),%1(0xc0) 
    db  %1(0xb7),%1(0xfd),%1(0x93),%1(0x26),%1(0x36),%1(0x3f),%1(0xf7),%1(0xcc) 
    db  %1(0x34),%1(0xa5),%1(0xe5),%1(0xf1),%1(0x71),%1(0xd8),%1(0x31),%1(0x15) 
    db  %1(0x04),%1(0xc7),%1(0x23),%1(0xc3),%1(0x18),%1(0x96),%1(0x05),%1(0x9a) 
    db  %1(0x07),%1(0x12),%1(0x80),%1(0xe2),%1(0xeb),%1(0x27),%1(0xb2),%1(0x75) 
    db  %1(0x09),%1(0x83),%1(0x2c),%1(0x1a),%1(0x1b),%1(0x6e),%1(0x5a),%1(0xa0) 
    db  %1(0x52),%1(0x3b),%1(0xd6),%1(0xb3),%1(0x29),%1(0xe3),%1(0x2f),%1(0x84) 
    db  %1(0x53),%1(0xd1),%1(0x00),%1(0xed),%1(0x20),%1(0xfc),%1(0xb1),%1(0x5b) 
    db  %1(0x6a),%1(0xcb),%1(0xbe),%1(0x39),%1(0x4a),%1(0x4c),%1(0x58),%1(0xcf) 
    db  %1(0xd0),%1(0xef),%1(0xaa),%1(0xfb),%1(0x43),%1(0x4d),%1(0x33),%1(0x85) 
    db  %1(0x45),%1(0xf9),%1(0x02),%1(0x7f),%1(0x50),%1(0x3c),%1(0x9f),%1(0xa8) 
    db  %1(0x51),%1(0xa3),%1(0x40),%1(0x8f),%1(0x92),%1(0x9d),%1(0x38),%1(0xf5) 
    db  %1(0xbc),%1(0xb6),%1(0xda),%1(0x21),%1(0x10),%1(0xff),%1(0xf3),%1(0xd2) 
    db  %1(0xcd),%1(0x0c),%1(0x13),%1(0xec),%1(0x5f),%1(0x97),%1(0x44),%1(0x17) 
    db  %1(0xc4),%1(0xa7),%1(0x7e),%1(0x3d),%1(0x64),%1(0x5d),%1(0x19),%1(0x73) 
    db  %1(0x60),%1(0x81),%1(0x4f),%1(0xdc),%1(0x22),%1(0x2a),%1(0x90),%1(0x88) 
    db  %1(0x46),%1(0xee),%1(0xb8),%1(0x14),%1(0xde),%1(0x5e),%1(0x0b),%1(0xdb) 
    db  %1(0xe0),%1(0x32),%1(0x3a),%1(0x0a),%1(0x49),%1(0x06),%1(0x24),%1(0x5c) 
    db  %1(0xc2),%1(0xd3),%1(0xac),%1(0x62),%1(0x91),%1(0x95),%1(0xe4),%1(0x79) 
    db  %1(0xe7),%1(0xc8),%1(0x37),%1(0x6d),%1(0x8d),%1(0xd5),%1(0x4e),%1(0xa9) 
    db  %1(0x6c),%1(0x56),%1(0xf4),%1(0xea),%1(0x65),%1(0x7a),%1(0xae),%1(0x08) 
    db  %1(0xba),%1(0x78),%1(0x25),%1(0x2e),%1(0x1c),%1(0xa6),%1(0xb4),%1(0xc6) 
    db  %1(0xe8),%1(0xdd),%1(0x74),%1(0x1f),%1(0x4b),%1(0xbd),%1(0x8b),%1(0x8a) 
    db  %1(0x70),%1(0x3e),%1(0xb5),%1(0x66),%1(0x48),%1(0x03),%1(0xf6),%1(0x0e) 
    db  %1(0x61),%1(0x35),%1(0x57),%1(0xb9),%1(0x86),%1(0xc1),%1(0x1d),%1(0x9e) 
    db  %1(0xe1),%1(0xf8),%1(0x98),%1(0x11),%1(0x69),%1(0xd9),%1(0x8e),%1(0x94) 
    db  %1(0x9b),%1(0x1e),%1(0x87),%1(0xe9),%1(0xce),%1(0x55),%1(0x28),%1(0xdf) 
    db  %1(0x8c),%1(0xa1),%1(0x89),%1(0x0d),%1(0xbf),%1(0xe6),%1(0x42),%1(0x68) 
    db  %1(0x41),%1(0x99),%1(0x2d),%1(0x0f),%1(0xb0),%1(0x54),%1(0xbb),%1(0x16) 
%endmacro 
 
%macro dec_vals 1 
    db  %1(0x52),%1(0x09),%1(0x6a),%1(0xd5),%1(0x30),%1(0x36),%1(0xa5),%1(0x38) 
    db  %1(0xbf),%1(0x40),%1(0xa3),%1(0x9e),%1(0x81),%1(0xf3),%1(0xd7),%1(0xfb) 
    db  %1(0x7c),%1(0xe3),%1(0x39),%1(0x82),%1(0x9b),%1(0x2f),%1(0xff),%1(0x87) 
    db  %1(0x34),%1(0x8e),%1(0x43),%1(0x44),%1(0xc4),%1(0xde),%1(0xe9),%1(0xcb) 
    db  %1(0x54),%1(0x7b),%1(0x94),%1(0x32),%1(0xa6),%1(0xc2),%1(0x23),%1(0x3d) 
    db  %1(0xee),%1(0x4c),%1(0x95),%1(0x0b),%1(0x42),%1(0xfa),%1(0xc3),%1(0x4e) 
    db  %1(0x08),%1(0x2e),%1(0xa1),%1(0x66),%1(0x28),%1(0xd9),%1(0x24),%1(0xb2) 
    db  %1(0x76),%1(0x5b),%1(0xa2),%1(0x49),%1(0x6d),%1(0x8b),%1(0xd1),%1(0x25) 
    db  %1(0x72),%1(0xf8),%1(0xf6),%1(0x64),%1(0x86),%1(0x68),%1(0x98),%1(0x16) 
    db  %1(0xd4),%1(0xa4),%1(0x5c),%1(0xcc),%1(0x5d),%1(0x65),%1(0xb6),%1(0x92) 
    db  %1(0x6c),%1(0x70),%1(0x48),%1(0x50),%1(0xfd),%1(0xed),%1(0xb9),%1(0xda) 
    db  %1(0x5e),%1(0x15),%1(0x46),%1(0x57),%1(0xa7),%1(0x8d),%1(0x9d),%1(0x84) 
    db  %1(0x90),%1(0xd8),%1(0xab),%1(0x00),%1(0x8c),%1(0xbc),%1(0xd3),%1(0x0a) 
    db  %1(0xf7),%1(0xe4),%1(0x58),%1(0x05),%1(0xb8),%1(0xb3),%1(0x45),%1(0x06) 
    db  %1(0xd0),%1(0x2c),%1(0x1e),%1(0x8f),%1(0xca),%1(0x3f),%1(0x0f),%1(0x02) 
    db  %1(0xc1),%1(0xaf),%1(0xbd),%1(0x03),%1(0x01),%1(0x13),%1(0x8a),%1(0x6b) 
    db  %1(0x3a),%1(0x91),%1(0x11),%1(0x41),%1(0x4f),%1(0x67),%1(0xdc),%1(0xea) 
    db  %1(0x97),%1(0xf2),%1(0xcf),%1(0xce),%1(0xf0),%1(0xb4),%1(0xe6),%1(0x73) 
    db  %1(0x96),%1(0xac),%1(0x74),%1(0x22),%1(0xe7),%1(0xad),%1(0x35),%1(0x85) 
    db  %1(0xe2),%1(0xf9),%1(0x37),%1(0xe8),%1(0x1c),%1(0x75),%1(0xdf),%1(0x6e) 
    db  %1(0x47),%1(0xf1),%1(0x1a),%1(0x71),%1(0x1d),%1(0x29),%1(0xc5),%1(0x89) 
    db  %1(0x6f),%1(0xb7),%1(0x62),%1(0x0e),%1(0xaa),%1(0x18),%1(0xbe),%1(0x1b) 
    db  %1(0xfc),%1(0x56),%1(0x3e),%1(0x4b),%1(0xc6),%1(0xd2),%1(0x79),%1(0x20) 
    db  %1(0x9a),%1(0xdb),%1(0xc0),%1(0xfe),%1(0x78),%1(0xcd),%1(0x5a),%1(0xf4) 
    db  %1(0x1f),%1(0xdd),%1(0xa8),%1(0x33),%1(0x88),%1(0x07),%1(0xc7),%1(0x31) 
    db  %1(0xb1),%1(0x12),%1(0x10),%1(0x59),%1(0x27),%1(0x80),%1(0xec),%1(0x5f) 
    db  %1(0x60),%1(0x51),%1(0x7f),%1(0xa9),%1(0x19),%1(0xb5),%1(0x4a),%1(0x0d) 
    db  %1(0x2d),%1(0xe5),%1(0x7a),%1(0x9f),%1(0x93),%1(0xc9),%1(0x9c),%1(0xef) 
    db  %1(0xa0),%1(0xe0),%1(0x3b),%1(0x4d),%1(0xae),%1(0x2a),%1(0xf5),%1(0xb0) 
    db  %1(0xc8),%1(0xeb),%1(0xbb),%1(0x3c),%1(0x83),%1(0x53),%1(0x99),%1(0x61) 
    db  %1(0x17),%1(0x2b),%1(0x04),%1(0x7e),%1(0xba),%1(0x77),%1(0xd6),%1(0x26) 
    db  %1(0xe1),%1(0x69),%1(0x14),%1(0x63),%1(0x55),%1(0x21),%1(0x0c),%1(0x7d) 
%endmacro 
 
%define u8(x)   f2(x), x, x, f3(x), f2(x), x, x, f3(x) 
%define v8(x)   fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x 
%define w8(x)   x, 0, 0, 0, x, 0, 0, 0 
 
%define tptr    rbp     ; table pointer 
%define kptr    r8      ; key schedule pointer 
%define fofs    128     ; adjust offset in key schedule to keep |disp| < 128 
%define fk_ref(x,y) [kptr-16*x+fofs+4*y] 
%ifdef  AES_REV_DKS 
%define rofs    128 
%define ik_ref(x,y) [kptr-16*x+rofs+4*y] 
%else 
%define rofs    -128 
%define ik_ref(x,y) [kptr+16*x+rofs+4*y] 
%endif 
 
%define tab_0(x)   [tptr+8*x] 
%define tab_1(x)   [tptr+8*x+3] 
%define tab_2(x)   [tptr+8*x+2] 
%define tab_3(x)   [tptr+8*x+1] 
%define tab_f(x)   byte [tptr+8*x+1] 
%define tab_i(x)   byte [tptr+8*x+7] 
%define t_ref(x,r) tab_ %+ x(r) 
 
%macro ff_rnd 5                 ; normal forward round 
    mov     %1d, fk_ref(%5,0) 
    mov     %2d, fk_ref(%5,1) 
    mov     %3d, fk_ref(%5,2) 
    mov     %4d, fk_ref(%5,3) 
 
    movzx   esi, al 
    movzx   edi, ah 
    shr     eax, 16 
    xor     %1d, t_ref(0,rsi) 
    xor     %4d, t_ref(1,rdi) 
    movzx   esi, al 
    movzx   edi, ah 
    xor     %3d, t_ref(2,rsi) 
    xor     %2d, t_ref(3,rdi) 
 
    movzx   esi, bl 
    movzx   edi, bh 
    shr     ebx, 16 
    xor     %2d, t_ref(0,rsi) 
    xor     %1d, t_ref(1,rdi) 
    movzx   esi, bl 
    movzx   edi, bh 
    xor     %4d, t_ref(2,rsi) 
    xor     %3d, t_ref(3,rdi) 
 
    movzx   esi, cl 
    movzx   edi, ch 
    shr     ecx, 16 
    xor     %3d, t_ref(0,rsi) 
    xor     %2d, t_ref(1,rdi) 
    movzx   esi, cl 
    movzx   edi, ch 
    xor     %1d, t_ref(2,rsi) 
    xor     %4d, t_ref(3,rdi) 
 
    movzx   esi, dl 
    movzx   edi, dh 
    shr     edx, 16 
    xor     %4d, t_ref(0,rsi) 
    xor     %3d, t_ref(1,rdi) 
    movzx   esi, dl 
    movzx   edi, dh 
    xor     %2d, t_ref(2,rsi) 
    xor     %1d, t_ref(3,rdi) 
 
    mov     eax,%1d 
    mov     ebx,%2d 
    mov     ecx,%3d 
    mov     edx,%4d 
%endmacro 
 
%ifdef LAST_ROUND_TABLES 
 
%macro fl_rnd 5                 ; last forward round 
    add     tptr, 2048 
    mov     %1d, fk_ref(%5,0) 
    mov     %2d, fk_ref(%5,1) 
    mov     %3d, fk_ref(%5,2) 
    mov     %4d, fk_ref(%5,3) 
 
    movzx   esi, al 
    movzx   edi, ah 
    shr     eax, 16 
    xor     %1d, t_ref(0,rsi) 
    xor     %4d, t_ref(1,rdi) 
    movzx   esi, al 
    movzx   edi, ah 
    xor     %3d, t_ref(2,rsi) 
    xor     %2d, t_ref(3,rdi) 
 
    movzx   esi, bl 
    movzx   edi, bh 
    shr     ebx, 16 
    xor     %2d, t_ref(0,rsi) 
    xor     %1d, t_ref(1,rdi) 
    movzx   esi, bl 
    movzx   edi, bh 
    xor     %4d, t_ref(2,rsi) 
    xor     %3d, t_ref(3,rdi) 
 
    movzx   esi, cl 
    movzx   edi, ch 
    shr     ecx, 16 
    xor     %3d, t_ref(0,rsi) 
    xor     %2d, t_ref(1,rdi) 
    movzx   esi, cl 
    movzx   edi, ch 
    xor     %1d, t_ref(2,rsi) 
    xor     %4d, t_ref(3,rdi) 
 
    movzx   esi, dl 
    movzx   edi, dh 
    shr     edx, 16 
    xor     %4d, t_ref(0,rsi) 
    xor     %3d, t_ref(1,rdi) 
    movzx   esi, dl 
    movzx   edi, dh 
    xor     %2d, t_ref(2,rsi) 
    xor     %1d, t_ref(3,rdi) 
%endmacro 
 
%else 
 
%macro fl_rnd 5                 ; last forward round 
    mov     %1d, fk_ref(%5,0) 
    mov     %2d, fk_ref(%5,1) 
    mov     %3d, fk_ref(%5,2) 
    mov     %4d, fk_ref(%5,3) 
 
    movzx   esi, al 
    movzx   edi, ah 
    shr     eax, 16 
    movzx   esi, t_ref(f,rsi) 
    movzx   edi, t_ref(f,rdi) 
    xor     %1d, esi 
    rol     edi, 8 
    xor     %4d, edi 
    movzx   esi, al 
    movzx   edi, ah 
    movzx   esi, t_ref(f,rsi) 
    movzx   edi, t_ref(f,rdi) 
    rol     esi, 16 
    rol     edi, 24 
    xor     %3d, esi 
    xor     %2d, edi 
 
    movzx   esi, bl 
    movzx   edi, bh 
    shr     ebx, 16 
    movzx   esi, t_ref(f,rsi) 
    movzx   edi, t_ref(f,rdi) 
    xor     %2d, esi 
    rol     edi, 8 
    xor     %1d, edi 
    movzx   esi, bl 
    movzx   edi, bh 
    movzx   esi, t_ref(f,rsi) 
    movzx   edi, t_ref(f,rdi) 
    rol     esi, 16 
    rol     edi, 24 
    xor     %4d, esi 
    xor     %3d, edi 
 
    movzx   esi, cl 
    movzx   edi, ch 
    movzx   esi, t_ref(f,rsi) 
    movzx   edi, t_ref(f,rdi) 
    shr     ecx, 16 
    xor     %3d, esi 
    rol     edi, 8 
    xor     %2d, edi 
    movzx   esi, cl 
    movzx   edi, ch 
    movzx   esi, t_ref(f,rsi) 
    movzx   edi, t_ref(f,rdi) 
    rol     esi, 16 
    rol     edi, 24 
    xor     %1d, esi 
    xor     %4d, edi 
 
    movzx   esi, dl 
    movzx   edi, dh 
    movzx   esi, t_ref(f,rsi) 
    movzx   edi, t_ref(f,rdi) 
    shr     edx, 16 
    xor     %4d, esi 
    rol     edi, 8 
    xor     %3d, edi 
    movzx   esi, dl 
    movzx   edi, dh 
    movzx   esi, t_ref(f,rsi) 
    movzx   edi, t_ref(f,rdi) 
    rol     esi, 16 
    rol     edi, 24 
    xor     %2d, esi 
    xor     %1d, edi 
%endmacro 
 
%endif 
 
%macro ii_rnd 5                 ; normal inverse round 
    mov     %1d, ik_ref(%5,0) 
    mov     %2d, ik_ref(%5,1) 
    mov     %3d, ik_ref(%5,2) 
    mov     %4d, ik_ref(%5,3) 
 
    movzx   esi, al 
    movzx   edi, ah 
    shr     eax, 16 
    xor     %1d, t_ref(0,rsi) 
    xor     %2d, t_ref(1,rdi) 
    movzx   esi, al 
    movzx   edi, ah 
    xor     %3d, t_ref(2,rsi) 
    xor     %4d, t_ref(3,rdi) 
 
    movzx   esi, bl 
    movzx   edi, bh 
    shr     ebx, 16 
    xor     %2d, t_ref(0,rsi) 
    xor     %3d, t_ref(1,rdi) 
    movzx   esi, bl 
    movzx   edi, bh 
    xor     %4d, t_ref(2,rsi) 
    xor     %1d, t_ref(3,rdi) 
 
    movzx   esi, cl 
    movzx   edi, ch 
    shr     ecx, 16 
    xor     %3d, t_ref(0,rsi) 
    xor     %4d, t_ref(1,rdi) 
    movzx   esi, cl 
    movzx   edi, ch 
    xor     %1d, t_ref(2,rsi) 
    xor     %2d, t_ref(3,rdi) 
 
    movzx   esi, dl 
    movzx   edi, dh 
    shr     edx, 16 
    xor     %4d, t_ref(0,rsi) 
    xor     %1d, t_ref(1,rdi) 
    movzx   esi, dl 
    movzx   edi, dh 
    xor     %2d, t_ref(2,rsi) 
    xor     %3d, t_ref(3,rdi) 
 
    mov     eax,%1d 
    mov     ebx,%2d 
    mov     ecx,%3d 
    mov     edx,%4d 
%endmacro 
 
%ifdef LAST_ROUND_TABLES 
 
%macro il_rnd 5                 ; last inverse round 
    add     tptr, 2048 
    mov     %1d, ik_ref(%5,0) 
    mov     %2d, ik_ref(%5,1) 
    mov     %3d, ik_ref(%5,2) 
    mov     %4d, ik_ref(%5,3) 
 
    movzx   esi, al 
    movzx   edi, ah 
    shr     eax, 16 
    xor     %1d, t_ref(0,rsi) 
    xor     %2d, t_ref(1,rdi) 
    movzx   esi, al 
    movzx   edi, ah 
    xor     %3d, t_ref(2,rsi) 
    xor     %4d, t_ref(3,rdi) 
 
    movzx   esi, bl 
    movzx   edi, bh 
    shr     ebx, 16 
    xor     %2d, t_ref(0,rsi) 
    xor     %3d, t_ref(1,rdi) 
    movzx   esi, bl 
    movzx   edi, bh 
    xor     %4d, t_ref(2,rsi) 
    xor     %1d, t_ref(3,rdi) 
 
    movzx   esi, cl 
    movzx   edi, ch 
    shr     ecx, 16 
    xor     %3d, t_ref(0,rsi) 
    xor     %4d, t_ref(1,rdi) 
    movzx   esi, cl 
    movzx   edi, ch 
    xor     %1d, t_ref(2,rsi) 
    xor     %2d, t_ref(3,rdi) 
 
    movzx   esi, dl 
    movzx   edi, dh 
    shr     edx, 16 
    xor     %4d, t_ref(0,rsi) 
    xor     %1d, t_ref(1,rdi) 
    movzx   esi, dl 
    movzx   edi, dh 
    xor     %2d, t_ref(2,rsi) 
    xor     %3d, t_ref(3,rdi) 
%endmacro 
 
%else 
 
%macro il_rnd 5                 ; last inverse round 
    mov     %1d, ik_ref(%5,0) 
    mov     %2d, ik_ref(%5,1) 
    mov     %3d, ik_ref(%5,2) 
    mov     %4d, ik_ref(%5,3) 
 
    movzx   esi, al 
    movzx   edi, ah 
    movzx   esi, t_ref(i,rsi) 
    movzx   edi, t_ref(i,rdi) 
    shr     eax, 16 
    xor     %1d, esi 
    rol     edi, 8 
    xor     %2d, edi 
    movzx   esi, al 
    movzx   edi, ah 
    movzx   esi, t_ref(i,rsi) 
    movzx   edi, t_ref(i,rdi) 
    rol     esi, 16 
    rol     edi, 24 
    xor     %3d, esi 
    xor     %4d, edi 
 
    movzx   esi, bl 
    movzx   edi, bh 
    movzx   esi, t_ref(i,rsi) 
    movzx   edi, t_ref(i,rdi) 
    shr     ebx, 16 
    xor     %2d, esi 
    rol     edi, 8 
    xor     %3d, edi 
    movzx   esi, bl 
    movzx   edi, bh 
    movzx   esi, t_ref(i,rsi) 
    movzx   edi, t_ref(i,rdi) 
    rol     esi, 16 
    rol     edi, 24 
    xor     %4d, esi 
    xor     %1d, edi 
 
    movzx   esi, cl 
    movzx   edi, ch 
    movzx   esi, t_ref(i,rsi) 
    movzx   edi, t_ref(i,rdi) 
    shr     ecx, 16 
    xor     %3d, esi 
    rol     edi, 8 
    xor     %4d, edi 
    movzx   esi, cl 
    movzx   edi, ch 
    movzx   esi, t_ref(i,rsi) 
    movzx   edi, t_ref(i,rdi) 
    rol     esi, 16 
    rol     edi, 24 
    xor     %1d, esi 
    xor     %2d, edi 
 
    movzx   esi, dl 
    movzx   edi, dh 
    movzx   esi, t_ref(i,rsi) 
    movzx   edi, t_ref(i,rdi) 
    shr     edx, 16 
    xor     %4d, esi 
    rol     edi, 8 
    xor     %1d, edi 
    movzx   esi, dl 
    movzx   edi, dh 
    movzx   esi, t_ref(i,rsi) 
    movzx   edi, t_ref(i,rdi) 
    rol     esi, 16 
    rol     edi, 24 
    xor     %2d, esi 
    xor     %3d, edi 
%endmacro 
 
%endif 
 
%ifdef ENCRYPTION 
 
    global  aes_encrypt 
%ifdef DLL_EXPORT 
    export  aes_encrypt 
%endif 
 
    section .data align=64 
    align   64 
enc_tab: 
    enc_vals u8 
%ifdef LAST_ROUND_TABLES 
    enc_vals w8 
%endif 
 
    section .text align=16 
    align   16 
aes_encrypt: 
 
%ifdef __GNUC__ 
    sub     rsp, 4*8        ; gnu/linux binary interface 
    mov     [rsp+0*8], rsi  ; output pointer 
    mov     r8, rdx         ; context 
%else 
    sub     rsp, 6*8        ; windows binary interface 
    mov     [rsp+4*8], rsi 
    mov     [rsp+5*8], rdi 
    mov     [rsp+0*8], rdx  ; output pointer 
    mov     rdi, rcx        ; input pointer 
%endif 
    mov     [rsp+1*8], rbx  ; input pointer in rdi 
    mov     [rsp+2*8], rbp  ; output pointer in [rsp] 
    mov     [rsp+3*8], r12  ; context in r8 
 
    movzx   esi, byte [kptr+4*KS_LENGTH] 
    lea     tptr,[enc_tab wrt rip] 
    sub     kptr, fofs 
 
    mov     eax, [rdi+0*4] 
    mov     ebx, [rdi+1*4] 
    mov     ecx, [rdi+2*4] 
    mov     edx, [rdi+3*4] 
 
    xor     eax, [kptr+fofs] 
    xor     ebx, [kptr+fofs+4] 
    xor     ecx, [kptr+fofs+8] 
    xor     edx, [kptr+fofs+12] 
 
    lea     kptr,[kptr+rsi] 
    cmp     esi, 10*16 
    je      .3 
    cmp     esi, 12*16 
    je      .2 
    cmp     esi, 14*16 
    je      .1 
    mov     rax, -1 
    jmp     .4 
 
.1: ff_rnd  r9, r10, r11, r12, 13 
    ff_rnd  r9, r10, r11, r12, 12 
.2: ff_rnd  r9, r10, r11, r12, 11 
    ff_rnd  r9, r10, r11, r12, 10 
.3: ff_rnd  r9, r10, r11, r12, 9 
    ff_rnd  r9, r10, r11, r12, 8 
    ff_rnd  r9, r10, r11, r12, 7 
    ff_rnd  r9, r10, r11, r12, 6 
    ff_rnd  r9, r10, r11, r12, 5 
    ff_rnd  r9, r10, r11, r12, 4 
    ff_rnd  r9, r10, r11, r12, 3 
    ff_rnd  r9, r10, r11, r12, 2 
    ff_rnd  r9, r10, r11, r12, 1 
    fl_rnd  r9, r10, r11, r12, 0 
 
    mov     rbx, [rsp] 
    mov     [rbx], r9d 
    mov     [rbx+4], r10d 
    mov     [rbx+8], r11d 
    mov     [rbx+12], r12d 
    xor     rax, rax 
.4: 
    mov     rbx, [rsp+1*8] 
    mov     rbp, [rsp+2*8] 
    mov     r12, [rsp+3*8] 
%ifdef __GNUC__ 
    add     rsp, 4*8 
%else 
    mov     rsi, [rsp+4*8] 
    mov     rdi, [rsp+5*8] 
    add     rsp, 6*8 
%endif 
    ret 
 
%endif 
 
%ifdef DECRYPTION 
 
    global  aes_decrypt 
%ifdef DLL_EXPORT 
    export  aes_decrypt 
%endif 
 
    section .data align=64 
    align   64 
dec_tab: 
    dec_vals v8 
%ifdef LAST_ROUND_TABLES 
    dec_vals w8 
%endif 
 
    section .text align=16 
    align   16 
aes_decrypt: 
 
%ifdef __GNUC__ 
    sub     rsp, 4*8        ; gnu/linux binary interface 
    mov     [rsp+0*8], rsi  ; output pointer 
    mov     r8, rdx         ; context 
%else 
    sub     rsp, 6*8        ; windows binary interface 
    mov     [rsp+4*8], rsi 
    mov     [rsp+5*8], rdi 
    mov     [rsp+0*8], rdx  ; output pointer 
    mov     rdi, rcx        ; input pointer 
%endif 
    mov     [rsp+1*8], rbx  ; input pointer in rdi 
    mov     [rsp+2*8], rbp  ; output pointer in [rsp] 
    mov     [rsp+3*8], r12  ; context in r8 
 
    movzx   esi,byte[kptr+4*KS_LENGTH] 
    lea     tptr,[dec_tab wrt rip] 
    sub     kptr, rofs 
 
    mov     eax, [rdi+0*4] 
    mov     ebx, [rdi+1*4] 
    mov     ecx, [rdi+2*4] 
    mov     edx, [rdi+3*4] 
 
%ifdef      AES_REV_DKS 
    mov     rdi, kptr 
    lea     kptr,[kptr+rsi] 
%else 
    lea     rdi,[kptr+rsi] 
%endif 
 
    xor     eax, [rdi+rofs] 
    xor     ebx, [rdi+rofs+4] 
    xor     ecx, [rdi+rofs+8] 
    xor     edx, [rdi+rofs+12] 
 
    cmp     esi, 10*16 
    je      .3 
    cmp     esi, 12*16 
    je      .2 
    cmp     esi, 14*16 
    je      .1 
    mov     rax, -1 
    jmp     .4 
 
.1: ii_rnd  r9, r10, r11, r12, 13 
    ii_rnd  r9, r10, r11, r12, 12 
.2: ii_rnd  r9, r10, r11, r12, 11 
    ii_rnd  r9, r10, r11, r12, 10 
.3: ii_rnd  r9, r10, r11, r12, 9 
    ii_rnd  r9, r10, r11, r12, 8 
    ii_rnd  r9, r10, r11, r12, 7 
    ii_rnd  r9, r10, r11, r12, 6 
    ii_rnd  r9, r10, r11, r12, 5 
    ii_rnd  r9, r10, r11, r12, 4 
    ii_rnd  r9, r10, r11, r12, 3 
    ii_rnd  r9, r10, r11, r12, 2 
    ii_rnd  r9, r10, r11, r12, 1 
    il_rnd  r9, r10, r11, r12, 0 
 
    mov     rbx, [rsp] 
    mov     [rbx], r9d 
    mov     [rbx+4], r10d 
    mov     [rbx+8], r11d 
    mov     [rbx+12], r12d 
    xor     rax, rax 
.4: mov     rbx, [rsp+1*8] 
    mov     rbp, [rsp+2*8] 
    mov     r12, [rsp+3*8] 
%ifdef __GNUC__ 
    add     rsp, 4*8 
%else 
    mov     rsi, [rsp+4*8] 
    mov     rdi, [rsp+5*8] 
    add     rsp, 6*8 
%endif 
    ret 
 
%endif 
 
    end