/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// AES block cipher using AES-NI instructions
//
// Copyright 2026 Google LLC
//
// The code in this file supports 32-bit and 64-bit CPUs, and it doesn't require
// AVX. It does use up to SSE4.1, which all CPUs with AES-NI have.
#include <linux/linkage.h>
| |
.section .rodata

// On x86_64, static data must be addressed RIP-relative; on i386, plain
// absolute addressing is used instead.
#ifdef __x86_64__
#define RODATA(label)	label(%rip)
#else
#define RODATA(label)	label
#endif

// A mask for pshufb that extracts the last dword, rotates it right by 8
// bits, and copies the result to all four dwords.
.p2align 4
.Lmask:
	.byte	13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12

// The AES round constants, used during key expansion.  This directly follows
// the 16-byte, 16-byte-aligned .Lmask, so it is itself 16-byte aligned; this
// allows the 'pshufd' with a memory operand in the AES-256 path below, which
// requires an aligned m128 source.
.Lrcon:
	.long	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36

.text
| |
// Transform four dwords [a0, a1, a2, a3] in \a into their prefix XOR-sum
// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3].  \tmp is a temporary xmm register
// that is clobbered.
//
// Note: this could be done in four instructions, shufps + pxor + shufps + pxor,
// if the temporary register were zero-initialized ahead of time.  We instead do
// it in an easier-to-understand way that doesn't require zero-initialization
// and avoids the unusual shufps instruction.  movdqa is usually "free" anyway.
.macro _prefix_sum	a, tmp
	movdqa		\a, \tmp	// [a0, a1, a2, a3]
	pslldq		$4, \a		// [0, a0, a1, a2]
	pxor		\tmp, \a	// [a0, a0^a1, a1^a2, a2^a3]
	movdqa		\a, \tmp	// save the length-2 partial sums
	pslldq		$8, \a		// [0, 0, a0, a0^a1]
	pxor		\tmp, \a	// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]
.endm
| |
// Generate the next AES round key, where:
//   \a - the round key whose dwords get prefix-summed: the previous round key
//	  (AES-128) or the first key of the previous pair (AES-256)
//   \b - the round key whose last dword feeds the rotate+SubBytes+rcon step
//	  (same register as \a for AES-128)
//
// Implicit inputs: RCON must hold four copies of the current round constant,
// MASK the pshufb mask from .Lmask, and RNDKEYS the address at which to store
// the new round key.  The new round key is stored to (RNDKEYS) and is also
// left in \a.  Clobbers %xmm2 and %xmm3.
.macro _gen_round_key	a, b
	// Compute four copies of rcon[i] ^ SubBytes(ror32(w, 8)), where w is
	// the last dword of the previous round key (given in \b).
	//
	// 'aesenclast src, dst' does dst = src XOR SubBytes(ShiftRows(dst)).
	// It is used here solely for the SubBytes and the XOR.  The ShiftRows
	// is a no-op because all four columns are the same here.
	//
	// Don't use the 'aeskeygenassist' instruction, since:
	//    - On most Intel CPUs it is microcoded, making it have a much higher
	//      latency and use more execution ports than 'aesenclast'.
	//    - It cannot be used in a loop, since it requires an immediate.
	//    - It doesn't do much more than 'aesenclast' in the first place.
	movdqa		\b, %xmm2
	pshufb		MASK, %xmm2
	aesenclast	RCON, %xmm2

	// XOR in the prefix sum of the four dwords of \a, which is the
	// previous round key (AES-128) or the first round key in the previous
	// pair of round keys (AES-256).  The result is the next round key.
	_prefix_sum	\a, tmp=%xmm3
	pxor		%xmm2, \a

	// Store the next round key to memory.  Also leave it in \a.
	movdqu		\a, (RNDKEYS)
.endm
| |
// Expand a raw AES key into the standard round keys, and optionally also into
// the round keys for the Equivalent Inverse Cipher.
//
// \is_aes128 selects AES-128 (10 rounds => 11 round keys) vs. AES-256
// (14 rounds => 15 round keys).  AES-192 is not handled by this macro.
.macro _aes_expandkey_aesni	is_aes128
#ifdef __x86_64__
	// Arguments
	.set	RNDKEYS, %rdi		// u32 rndkeys[] (out)
	.set	INV_RNDKEYS, %rsi	// u32 *inv_rndkeys (out, may be NULL)
	.set	IN_KEY, %rdx		// const u8 *in_key (in)

	// Other local variables
	.set	RCON_PTR, %rcx
	.set	COUNTER, %eax
#else
	// Arguments, assuming -mregparm=3
	.set	RNDKEYS, %eax		// u32 rndkeys[] (out)
	.set	INV_RNDKEYS, %edx	// u32 *inv_rndkeys (out, may be NULL)
	.set	IN_KEY, %ecx		// const u8 *in_key (in)

	// Other local variables.  %ebx and %esi are callee-saved on i386,
	// hence the push/pop below.
	.set	RCON_PTR, %ebx
	.set	COUNTER, %esi
#endif
	.set	RCON, %xmm6		// four copies of current round constant
	.set	MASK, %xmm7		// pshufb mask from .Lmask

#ifdef __i386__
	push	%ebx
	push	%esi
#endif

.if \is_aes128
	// AES-128: the first round key is simply a copy of the raw key.
	movdqu	(IN_KEY), %xmm0
	movdqu	%xmm0, (RNDKEYS)
.else
	// AES-256: the first two round keys are simply a copy of the raw key.
	movdqu	(IN_KEY), %xmm0
	movdqu	%xmm0, (RNDKEYS)
	movdqu	16(IN_KEY), %xmm1
	movdqu	%xmm1, 16(RNDKEYS)
	add	$32, RNDKEYS
.endif

	// Generate the remaining round keys.
	movdqa	RODATA(.Lmask), MASK
.if \is_aes128
	// AES-128: generate 10 more round keys, loading each round constant
	// from .Lrcon and broadcasting it to all four dwords of RCON.
	lea	RODATA(.Lrcon), RCON_PTR
	mov	$10, COUNTER
.Lgen_next_aes128_round_key:
	add	$16, RNDKEYS
	movd	(RCON_PTR), RCON
	pshufd	$0x00, RCON, RCON	// broadcast rcon to all four dwords
	add	$4, RCON_PTR
	_gen_round_key	%xmm0, %xmm0
	dec	COUNTER
	jnz	.Lgen_next_aes128_round_key
.else
	// AES-256: only the first 7 round constants are needed, so instead of
	// loading each one from memory, just start by loading [1, 1, 1, 1] and
	// then generate the rest by doubling.
	pshufd	$0x00, RODATA(.Lrcon), RCON
	pxor	%xmm5, %xmm5	// All-zeroes
	mov	$7, COUNTER
.Lgen_next_aes256_round_key_pair:
	// Generate the next AES-256 round key: either the first of a pair of
	// two, or the last one.  %xmm0 and %xmm1 hold the previous pair.
	_gen_round_key	%xmm0, %xmm1

	dec	COUNTER
	jz	.Lgen_aes256_round_keys_done

	// Generate the second AES-256 round key of the pair.  Compared to the
	// first, there's no rotation and no XOR of a round constant.
	pshufd	$0xff, %xmm0, %xmm2	// Get four copies of last dword
	aesenclast	%xmm5, %xmm2	// Just does SubBytes
	_prefix_sum	%xmm1, tmp=%xmm3
	pxor	%xmm2, %xmm1
	movdqu	%xmm1, 16(RNDKEYS)
	add	$32, RNDKEYS
	paddd	RCON, RCON	// RCON <<= 1
	jmp	.Lgen_next_aes256_round_key_pair
.Lgen_aes256_round_keys_done:
.endif

	// At this point, RNDKEYS points to the *last* standard round key.
	//
	// If INV_RNDKEYS is non-NULL, write the round keys for the Equivalent
	// Inverse Cipher to it.  To do that, reverse the standard round keys,
	// and apply aesimc (InvMixColumn) to each except the first and last.
	test	INV_RNDKEYS, INV_RNDKEYS
	jz	.Ldone\@
	movdqu	(RNDKEYS), %xmm0	// Last standard round key
	movdqu	%xmm0, (INV_RNDKEYS)	// => First inverse round key
.if \is_aes128
	mov	$9, COUNTER		// 11 round keys minus first and last
.else
	mov	$13, COUNTER		// 15 round keys minus first and last
.endif
.Lgen_next_inv_round_key\@:
	sub	$16, RNDKEYS
	add	$16, INV_RNDKEYS
	movdqu	(RNDKEYS), %xmm0
	aesimc	%xmm0, %xmm0
	movdqu	%xmm0, (INV_RNDKEYS)
	dec	COUNTER
	jnz	.Lgen_next_inv_round_key\@
	movdqu	-16(RNDKEYS), %xmm0	// First standard round key
	movdqu	%xmm0, 16(INV_RNDKEYS)	// => Last inverse round key

.Ldone\@:
#ifdef __i386__
	pop	%esi
	pop	%ebx
#endif
	RET
.endm
| |
// void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
//			       const u8 in_key[AES_KEYSIZE_128]);
//
// Expand an AES-128 key.  rndkeys receives the 11 standard round keys
// (176 bytes).  If inv_rndkeys is non-NULL, it receives the 11 round keys
// for the Equivalent Inverse Cipher.
SYM_FUNC_START(aes128_expandkey_aesni)
	_aes_expandkey_aesni	1
SYM_FUNC_END(aes128_expandkey_aesni)
| |
// void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
//			       const u8 in_key[AES_KEYSIZE_256]);
//
// Expand an AES-256 key.  rndkeys receives the 15 standard round keys
// (240 bytes).  If inv_rndkeys is non-NULL, it receives the 15 round keys
// for the Equivalent Inverse Cipher.
SYM_FUNC_START(aes256_expandkey_aesni)
	_aes_expandkey_aesni	0
SYM_FUNC_END(aes256_expandkey_aesni)
| |
// Encrypt (\enc=1) or decrypt (\enc=0) a single 16-byte AES block.
//
// For decryption, the round keys must be those of the Equivalent Inverse
// Cipher, i.e. the inv_rndkeys produced by the expandkey functions above.
// Clobbers %xmm0 and %xmm1 in addition to the argument registers.
.macro _aes_crypt_aesni	enc
#ifdef __x86_64__
	.set	RNDKEYS, %rdi	// const u32 *: expanded round keys
	.set	NROUNDS, %esi	// int: e.g. 10 for AES-128, 14 for AES-256
	.set	OUT, %rdx	// u8 *: output block
	.set	IN, %rcx	// const u8 *: input block
#else
	// Assuming -mregparm=3
	.set	RNDKEYS, %eax
	.set	NROUNDS, %edx
	.set	OUT, %ecx
	.set	IN, %ebx	// Passed on stack
#endif

#ifdef __i386__
	// %ebx is callee-saved; save it, then load the stack-passed fourth
	// argument (above the return address and the just-pushed %ebx).
	push	%ebx
	mov	8(%esp), %ebx
#endif

	// Zero-th round: just XOR in the first round key.
	movdqu	(IN), %xmm0
	movdqu	(RNDKEYS), %xmm1
	pxor	%xmm1, %xmm0

	// Normal rounds: NROUNDS - 1 full rounds
	add	$16, RNDKEYS
	dec	NROUNDS
.Lnext_round\@:
	movdqu	(RNDKEYS), %xmm1
.if \enc
	aesenc	%xmm1, %xmm0
.else
	aesdec	%xmm1, %xmm0
.endif
	add	$16, RNDKEYS
	dec	NROUNDS
	jne	.Lnext_round\@

	// Last round, which omits the MixColumns step
	movdqu	(RNDKEYS), %xmm1
.if \enc
	aesenclast	%xmm1, %xmm0
.else
	aesdeclast	%xmm1, %xmm0
.endif
	movdqu	%xmm0, (OUT)

#ifdef __i386__
	pop	%ebx
#endif
	RET
.endm
| |
// void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
//			  u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
//
// Encrypt one block using the standard round keys from an expandkey function.
SYM_FUNC_START(aes_encrypt_aesni)
	_aes_crypt_aesni	1
SYM_FUNC_END(aes_encrypt_aesni)
| |
// void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
//			  u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
//
// Decrypt one block.  Note that this requires the Equivalent Inverse Cipher
// round keys (inv_rndkeys), not the standard round keys.
SYM_FUNC_START(aes_decrypt_aesni)
	_aes_crypt_aesni	0
SYM_FUNC_END(aes_decrypt_aesni)