/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// Copyright 2025 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
// of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------
//
// This file contains x86_64 assembly implementations of AES-CTR and AES-XCTR
// using the following sets of CPU features:
//	- AES-NI && AVX
//	- VAES && AVX2
//	- VAES && AVX512BW && AVX512VL && BMI2
//
// See the function definitions at the bottom of the file for more information.

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.section .rodata
.p2align 4

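// Shuffle mask for vpshufb that reverses the 16 bytes within a 128-bit lane,
// used to convert the little endian counter to big endian.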
.Lbswap_mask:
	.octa	0x000102030405060708090a0b0c0d0e0f

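// The 128-bit values {0, 1, 2, 3}.  .Lctr_pattern provides the per-lane
// counter offsets used to create the first vector of counters, while .Lone,
// .Ltwo, and .Lfour are also used individually as counter increments.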
.Lctr_pattern:
	.quad	0, 0
.Lone:
	.quad	1, 0
.Ltwo:
	.quad	2, 0
	.quad	3, 0

.Lfour:
	.quad	4, 0

.text

// Move a vector between memory and a register.
.macro _vmovdqu src, dst
.if VL < 64
	vmovdqu		\src, \dst
.else
	vmovdqu8	\src, \dst
.endif
.endm

// Move a vector between registers.
.macro _vmovdqa src, dst
.if VL < 64
	vmovdqa		\src, \dst
.else
	vmovdqa64	\src, \dst
.endif
.endm

// Broadcast a 128-bit value from memory to all 128-bit lanes of a vector
// register.
.macro _vbroadcast128 src, dst
.if VL == 16
	vmovdqu		\src, \dst
.elseif VL == 32
	vbroadcasti128	\src, \dst
.else
	vbroadcasti32x4	\src, \dst
.endif
.endm

// XOR two vectors together.
.macro _vpxor src1, src2, dst
.if VL < 64
	vpxor		\src1, \src2, \dst
.else
	vpxord		\src1, \src2, \dst
.endif
.endm

// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
.macro _load_partial_block src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jle		.Lle8\@

	// Load 9 <= LEN <= 15 bytes.
	vmovq		(\src), \dst		// Load first 8 bytes
	mov		(\src, %rcx), %rax	// Load last 8 bytes
	neg		%ecx
	shl		$3, %ecx
	shr		%cl, %rax		// Discard overlapping bytes
	vpinsrq		$1, %rax, \dst, \dst
	jmp		.Ldone\@

.Lle8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Load 4 <= LEN <= 8 bytes.
	mov		(\src), %eax		// Load first 4 bytes
	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
	jmp		.Lcombine\@

.Llt4\@:
	// Load 1 <= LEN <= 3 bytes.
	add		$2, %ecx		// LEN - 2
	movzbl		(\src), %eax		// Load first byte
	jl		.Lmovq\@
	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
.Lcombine\@:
	shl		$3, %ecx
	shl		%cl, \tmp64
	or		\tmp64, %rax		// Combine the two parts
.Lmovq\@:
	vmovq		%rax, \dst
.Ldone\@:
.endm
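
// For reference, a rough C model of what _load_partial_block computes
// (illustration only, not built; assumes 1 <= len <= 15, little endian byte
// order, and the usual kernel u8/u16/u32/u64 types).  It shows the
// overlapping-load trick: instead of copying byte by byte, two wider loads
// are done and the overlapping bytes are shifted out:
//
//	static void load_partial_block(const u8 *src, unsigned int len, u8 out[16])
//	{
//		u64 lo = 0, hi = 0;
//
//		if (len > 8) {
//			memcpy(&lo, src, 8);
//			memcpy(&hi, src + len - 8, 8);	/* overlaps the first load */
//			hi >>= 8 * (16 - len);		/* discard overlapping bytes */
//		} else if (len >= 4) {
//			u32 a, b;
//
//			memcpy(&a, src, 4);
//			memcpy(&b, src + len - 4, 4);	/* overlapping load */
//			lo = a | ((u64)b << (8 * (len - 4)));
//		} else {
//			lo = src[0];
//			if (len >= 2) {
//				u16 b;
//
//				memcpy(&b, src + len - 2, 2);	/* overlapping load */
//				lo |= (u64)b << (8 * (len - 2));
//			}
//		}
//		memcpy(out, &lo, 8);
//		memcpy(out + 8, &hi, 8);
//	}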

// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and \tmp{64,32}.
.macro _store_partial_block src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jl		.Llt8\@

	// Store 8 <= LEN <= 15 bytes.
	vpextrq		$1, \src, %rax
	mov		%ecx, \tmp32
	shl		$3, %ecx
	ror		%cl, %rax
	mov		%rax, (\dst, \tmp64)	// Store last LEN - 8 bytes
	vmovq		\src, (\dst)		// Store first 8 bytes
	jmp		.Ldone\@

.Llt8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Store 4 <= LEN <= 7 bytes.
	vpextrd		$1, \src, %eax
	mov		%ecx, \tmp32
	shl		$3, %ecx
	ror		%cl, %eax
	mov		%eax, (\dst, \tmp64)	// Store last LEN - 4 bytes
	vmovd		\src, (\dst)		// Store first 4 bytes
	jmp		.Ldone\@

.Llt4\@:
	// Store 1 <= LEN <= 3 bytes.
	vpextrb		$0, \src, 0(\dst)
	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
	jl		.Ldone\@
	vpextrb		$1, \src, 1(\dst)
	je		.Ldone\@
	vpextrb		$2, \src, 2(\dst)
.Ldone\@:
.endm
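
// _store_partial_block uses the same overlapping trick in reverse: the high
// part of the block is rotated so that a full-width store at \dst + LEN - 8
// (or \dst + LEN - 4) places the final bytes at their correct offsets, and
// the stale bytes that store writes below offset 8 (or 4) are then
// overwritten by the subsequent store of the low part at \dst.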

// Prepare the next two vectors of AES inputs in AESDATA\i0 and AESDATA\i1, and
// XOR each with the zero-th round key. Also update LE_CTR if !\final.
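// (When AVX512 is available, vpternlogd with truth-table immediate 0x96
// computes the three-way XOR a ^ b ^ c, so the counter, XCTR_IV, and RNDKEY0
// can be combined in a single instruction.)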
.macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0
.if \is_xctr
  .if USE_AVX512
	vmovdqa64	LE_CTR, AESDATA\i0
	vpternlogd	$0x96, XCTR_IV, RNDKEY0, AESDATA\i0
  .else
	vpxor		XCTR_IV, LE_CTR, AESDATA\i0
	vpxor		RNDKEY0, AESDATA\i0, AESDATA\i0
  .endif
	vpaddq		LE_CTR_INC1, LE_CTR, AESDATA\i1

  .if USE_AVX512
	vpternlogd	$0x96, XCTR_IV, RNDKEY0, AESDATA\i1
  .else
	vpxor		XCTR_IV, AESDATA\i1, AESDATA\i1
	vpxor		RNDKEY0, AESDATA\i1, AESDATA\i1
  .endif
.else
	vpshufb		BSWAP_MASK, LE_CTR, AESDATA\i0
	_vpxor		RNDKEY0, AESDATA\i0, AESDATA\i0
	vpaddq		LE_CTR_INC1, LE_CTR, AESDATA\i1
	vpshufb		BSWAP_MASK, AESDATA\i1, AESDATA\i1
	_vpxor		RNDKEY0, AESDATA\i1, AESDATA\i1
.endif
.if !\final
	vpaddq		LE_CTR_INC2, LE_CTR, LE_CTR
.endif
.endm

// Do all AES rounds on the data in the given AESDATA vectors, excluding the
// zero-th and last rounds.
.macro _aesenc_loop vecs:vararg
	mov		KEY, %rax
1:
	_vbroadcast128	(%rax), RNDKEY
.irp i, \vecs
	vaesenc		RNDKEY, AESDATA\i, AESDATA\i
.endr
	add		$16, %rax
	cmp		%rax, RNDKEYLAST_PTR
	jne		1b
.endm

// Finalize the keystream blocks in the given AESDATA vectors by doing the last
// AES round, then XOR those keystream blocks with the corresponding data.
// Reduce latency by doing the XOR before the vaesenclast, utilizing the
// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
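// (This identity holds because the final step of AESENCLAST is an XOR with
// the round key, so XORing the data into the round key operand beforehand
// yields the same result.)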
.macro _aesenclast_and_xor vecs:vararg
.irp i, \vecs
	_vpxor		\i*VL(SRC), RNDKEYLAST, RNDKEY
	vaesenclast	RNDKEY, AESDATA\i, AESDATA\i
.endr
.irp i, \vecs
	_vmovdqu	AESDATA\i, \i*VL(DST)
.endr
.endm

// XOR the keystream blocks in the specified AESDATA vectors with the
// corresponding data.
.macro _xor_data vecs:vararg
.irp i, \vecs
	_vpxor		\i*VL(SRC), AESDATA\i, AESDATA\i
.endr
.irp i, \vecs
	_vmovdqu	AESDATA\i, \i*VL(DST)
.endr
.endm

.macro _aes_ctr_crypt is_xctr

	// Define register aliases V0-V15 that map to the xmm, ymm, or zmm
	// registers according to the selected Vector Length (VL).
.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
	.if VL == 16
		.set	V\i, %xmm\i
	.elseif VL == 32
		.set	V\i, %ymm\i
	.elseif VL == 64
		.set	V\i, %zmm\i
	.else
		.error "Unsupported Vector Length (VL)"
	.endif
.endr

	// Function arguments
	.set	KEY,		%rdi	// Initially points to the start of the
					// crypto_aes_ctx, then is advanced to
					// point to the index 1 round key
	.set	KEY32,		%edi	// Available as temp register after all
					// keystream blocks have been generated
	.set	SRC,		%rsi	// Pointer to next source data
	.set	DST,		%rdx	// Pointer to next destination data
	.set	LEN,		%ecx	// Remaining length in bytes.
					// Note: _load_partial_block relies on
					// this being in %ecx.
	.set	LEN64,		%rcx	// Zero-extend LEN before using!
	.set	LEN8,		%cl
.if \is_xctr
	.set	XCTR_IV_PTR,	%r8	// const u8 iv[AES_BLOCK_SIZE];
	.set	XCTR_CTR,	%r9	// u64 ctr;
.else
	.set	LE_CTR_PTR,	%r8	// const u64 le_ctr[2];
.endif

	// Additional local variables
	.set	RNDKEYLAST_PTR,	%r10
	.set	AESDATA0,	V0
	.set	AESDATA0_XMM,	%xmm0
	.set	AESDATA1,	V1
	.set	AESDATA1_XMM,	%xmm1
	.set	AESDATA2,	V2
	.set	AESDATA3,	V3
	.set	AESDATA4,	V4
	.set	AESDATA5,	V5
	.set	AESDATA6,	V6
	.set	AESDATA7,	V7
.if \is_xctr
	.set	XCTR_IV,	V8
.else
	.set	BSWAP_MASK,	V8
.endif
	.set	LE_CTR,		V9
	.set	LE_CTR_XMM,	%xmm9
	.set	LE_CTR_INC1,	V10
	.set	LE_CTR_INC2,	V11
	.set	RNDKEY0,	V12
	.set	RNDKEYLAST,	V13
	.set	RNDKEY,		V14

	// Create the first vector of counters.
.if \is_xctr
  .if VL == 16
	vmovq		XCTR_CTR, LE_CTR
  .elseif VL == 32
	vmovq		XCTR_CTR, LE_CTR_XMM
	inc		XCTR_CTR
	vmovq		XCTR_CTR, AESDATA0_XMM
	vinserti128	$1, AESDATA0_XMM, LE_CTR, LE_CTR
  .else
	vpbroadcastq	XCTR_CTR, LE_CTR
	vpsrldq		$8, LE_CTR, LE_CTR
	vpaddq		.Lctr_pattern(%rip), LE_CTR, LE_CTR
  .endif
	_vbroadcast128	(XCTR_IV_PTR), XCTR_IV
.else
	_vbroadcast128	(LE_CTR_PTR), LE_CTR
  .if VL > 16
	vpaddq		.Lctr_pattern(%rip), LE_CTR, LE_CTR
  .endif
	_vbroadcast128	.Lbswap_mask(%rip), BSWAP_MASK
.endif

.if VL == 16
	_vbroadcast128	.Lone(%rip), LE_CTR_INC1
.elseif VL == 32
	_vbroadcast128	.Ltwo(%rip), LE_CTR_INC1
.else
	_vbroadcast128	.Lfour(%rip), LE_CTR_INC1
.endif
	vpsllq		$1, LE_CTR_INC1, LE_CTR_INC2

	// Load the AES key length in bytes: 16 (AES-128), 24 (AES-192), or
	// 32 (AES-256).
	movl		480(KEY), %eax

	// Compute the pointer to the last round key.
	lea		6*16(KEY, %rax, 4), RNDKEYLAST_PTR
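	// (AES uses key_length/4 + 6 rounds, so the last round key begins at
	// byte offset 16*(key_length/4 + 6) = 6*16 + 4*key_length from the
	// start of the key schedule, matching the addressing expression above.)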

	// Load the zero-th and last round keys.
	_vbroadcast128	(KEY), RNDKEY0
	_vbroadcast128	(RNDKEYLAST_PTR), RNDKEYLAST

	// Advance KEY to point to the round key with index 1 (the zero-th
	// round key was already loaded above).
	add		$16, KEY

	// This is the main loop, which encrypts 8 vectors of data at a time.
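	// Note: 'add $-8*VL' and 'sub $-8*VL' are used instead of 'sub $8*VL'
	// and 'add $8*VL' so that, when 8*VL == 128, the constant still fits
	// in a sign-extended 8-bit immediate (-128 does; +128 does not).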
	add		$-8*VL, LEN
	jl		.Lloop_8x_done\@
.Lloop_8x\@:
	_prepare_2_ctr_vecs	\is_xctr, 0, 1
	_prepare_2_ctr_vecs	\is_xctr, 2, 3
	_prepare_2_ctr_vecs	\is_xctr, 4, 5
	_prepare_2_ctr_vecs	\is_xctr, 6, 7
	_aesenc_loop	0,1,2,3,4,5,6,7
	_aesenclast_and_xor	0,1,2,3,4,5,6,7
	sub		$-8*VL, SRC
	sub		$-8*VL, DST
	add		$-8*VL, LEN
	jge		.Lloop_8x\@
.Lloop_8x_done\@:
	sub		$-8*VL, LEN
	jz		.Ldone\@

	// 1 <= LEN < 8*VL. Generate 2, 4, or 8 more vectors of keystream
	// blocks, depending on the remaining LEN.

	_prepare_2_ctr_vecs	\is_xctr, 0, 1
	_prepare_2_ctr_vecs	\is_xctr, 2, 3
	cmp		$4*VL, LEN
	jle		.Lenc_tail_atmost4vecs\@

	// 4*VL < LEN < 8*VL. Generate 8 vectors of keystream blocks. Use the
	// first 4 to XOR 4 full vectors of data. Then XOR the remaining data.
	_prepare_2_ctr_vecs	\is_xctr, 4, 5
	_prepare_2_ctr_vecs	\is_xctr, 6, 7, final=1
	_aesenc_loop	0,1,2,3,4,5,6,7
	_aesenclast_and_xor	0,1,2,3
	vaesenclast	RNDKEYLAST, AESDATA4, AESDATA0
	vaesenclast	RNDKEYLAST, AESDATA5, AESDATA1
	vaesenclast	RNDKEYLAST, AESDATA6, AESDATA2
	vaesenclast	RNDKEYLAST, AESDATA7, AESDATA3
	sub		$-4*VL, SRC
	sub		$-4*VL, DST
	add		$-4*VL, LEN
	cmp		$1*VL-1, LEN
	jle		.Lxor_tail_partial_vec_0\@
	_xor_data	0
	cmp		$2*VL-1, LEN
	jle		.Lxor_tail_partial_vec_1\@
	_xor_data	1
	cmp		$3*VL-1, LEN
	jle		.Lxor_tail_partial_vec_2\@
	_xor_data	2
	cmp		$4*VL-1, LEN
	jle		.Lxor_tail_partial_vec_3\@
	_xor_data	3
	jmp		.Ldone\@

.Lenc_tail_atmost4vecs\@:
	cmp		$2*VL, LEN
	jle		.Lenc_tail_atmost2vecs\@

	// 2*VL < LEN <= 4*VL. Generate 4 vectors of keystream blocks. Use the
	// first 2 to XOR 2 full vectors of data. Then XOR the remaining data.
	_aesenc_loop	0,1,2,3
	_aesenclast_and_xor	0,1
	vaesenclast	RNDKEYLAST, AESDATA2, AESDATA0
	vaesenclast	RNDKEYLAST, AESDATA3, AESDATA1
	sub		$-2*VL, SRC
	sub		$-2*VL, DST
	add		$-2*VL, LEN
	jmp		.Lxor_tail_upto2vecs\@

.Lenc_tail_atmost2vecs\@:
	// 1 <= LEN <= 2*VL. Generate 2 vectors of keystream blocks. Then XOR
	// the remaining data.
	_aesenc_loop	0,1
	vaesenclast	RNDKEYLAST, AESDATA0, AESDATA0
	vaesenclast	RNDKEYLAST, AESDATA1, AESDATA1

.Lxor_tail_upto2vecs\@:
	cmp		$1*VL-1, LEN
	jle		.Lxor_tail_partial_vec_0\@
	_xor_data	0
	cmp		$2*VL-1, LEN
	jle		.Lxor_tail_partial_vec_1\@
	_xor_data	1
	jmp		.Ldone\@

.Lxor_tail_partial_vec_1\@:
	add		$-1*VL, LEN
	jz		.Ldone\@
	sub		$-1*VL, SRC
	sub		$-1*VL, DST
	_vmovdqa	AESDATA1, AESDATA0
	jmp		.Lxor_tail_partial_vec_0\@

.Lxor_tail_partial_vec_2\@:
	add		$-2*VL, LEN
	jz		.Ldone\@
	sub		$-2*VL, SRC
	sub		$-2*VL, DST
	_vmovdqa	AESDATA2, AESDATA0
	jmp		.Lxor_tail_partial_vec_0\@

.Lxor_tail_partial_vec_3\@:
	add		$-3*VL, LEN
	jz		.Ldone\@
	sub		$-3*VL, SRC
	sub		$-3*VL, DST
	_vmovdqa	AESDATA3, AESDATA0

.Lxor_tail_partial_vec_0\@:
	// XOR the remaining 1 <= LEN < VL bytes. It's easy if masked
	// loads/stores are available; otherwise it's a bit harder...
.if USE_AVX512
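	// Build %rax = (1 << LEN) - 1, i.e. LEN one bits, which becomes the
	// byte mask %k1 for the masked load and store below.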
	mov		$-1, %rax
	bzhi		LEN64, %rax, %rax
	kmovq		%rax, %k1
	vmovdqu8	(SRC), AESDATA1{%k1}{z}
	vpxord		AESDATA1, AESDATA0, AESDATA0
	vmovdqu8	AESDATA0, (DST){%k1}
.else
  .if VL == 32
	cmp		$16, LEN
	jl		1f
	vpxor		(SRC), AESDATA0_XMM, AESDATA1_XMM
	vmovdqu		AESDATA1_XMM, (DST)
	add		$16, SRC
	add		$16, DST
	sub		$16, LEN
	jz		.Ldone\@
	vextracti128	$1, AESDATA0, AESDATA0_XMM
1:
  .endif
	mov		LEN, %r10d
	_load_partial_block	SRC, AESDATA1_XMM, KEY, KEY32
	vpxor		AESDATA1_XMM, AESDATA0_XMM, AESDATA0_XMM
	mov		%r10d, %ecx
	_store_partial_block	AESDATA0_XMM, DST, KEY, KEY32
.endif

.Ldone\@:
.if VL > 16
	vzeroupper
.endif
	RET
.endm

// Below are the definitions of the functions generated by the above macro.
// They have the following prototypes:
//
// void aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key,
//				 const u8 *src, u8 *dst, int len,
//				 const u64 le_ctr[2]);
//
// void aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key,
//				const u8 *src, u8 *dst, int len,
//				const u8 iv[AES_BLOCK_SIZE], u64 ctr);
//
// Both functions generate |len| bytes of keystream, XOR it with the data from
// |src|, and write the result to |dst|. On non-final calls, |len| must be a
// multiple of 16. On the final call, |len| can be any value.
//
// aes_ctr64_crypt_* implement "regular" CTR, where the keystream is generated
// from a 128-bit big endian counter that increments by 1 for each AES block.
// HOWEVER, to keep the assembly code simple, some of the counter management is
// left to the caller. aes_ctr64_crypt_* take the counter in little endian
// form, only increment the low 64 bits internally, do the conversion to big
// endian internally, and don't write the updated counter back to memory. The
// caller is responsible for converting the starting IV to the little endian
// le_ctr, detecting the (very rare) case of a carry out of the low 64 bits
// being needed and splitting at that point with a carry done in between, and
// updating le_ctr after each part if the message is multi-part.
//
// aes_xctr_crypt_* implement XCTR as specified in "Length-preserving encryption
// with HCTR2" (https://eprint.iacr.org/2021/1441.pdf). XCTR is an
// easier-to-implement variant of CTR that uses little endian byte order and
// eliminates carries. |ctr| is the per-message block counter starting at 1.
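//
// As a rough usage illustration for aes_ctr64_crypt_* (not part of this file;
// the surrounding glue code is only sketched and its variable names are
// hypothetical), the counter management described above might look like:
//
//	u64 le_ctr[2];
//	u64 nblocks = DIV_ROUND_UP(len, AES_BLOCK_SIZE);
//
//	/* Convert the big endian IV to the little endian le_ctr[] format. */
//	le_ctr[0] = get_unaligned_be64(&iv[8]);	/* low 64 counter bits */
//	le_ctr[1] = get_unaligned_be64(&iv[0]);	/* high 64 counter bits */
//
//	if (unlikely(le_ctr[0] + nblocks < nblocks)) {
//		/* The low 64 bits of the counter would overflow: split the
//		 * request into two calls, applying the carry into le_ctr[1]
//		 * in between (not shown). */
//	} else {
//		aes_ctr64_crypt_aesni_avx(key, src, dst, len, le_ctr);
//	}
//
// For XCTR, keystream block i of the message (with i starting at 1) is
// AES-encrypt(key, iv XOR le128(i)), where le128() is the 128-bit little
// endian encoding of i; the |ctr| argument is the value of i for the first
// block processed by the current call.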

.set	VL, 16
.set	USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx)
	_aes_ctr_crypt	0
SYM_FUNC_END(aes_ctr64_crypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xctr_crypt_aesni_avx)
	_aes_ctr_crypt	1
SYM_FUNC_END(aes_xctr_crypt_aesni_avx)

#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set	VL, 32
.set	USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2)
	_aes_ctr_crypt	0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2)
	_aes_ctr_crypt	1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx2)

.set	VL, 64
.set	USE_AVX512, 1
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx512)
	_aes_ctr_crypt	0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx512)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx512)
	_aes_ctr_crypt	1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx512)
#endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ