From 1e4cb1df466edd5ffb39b5a5c8887f46b54074eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=B0=E9=9D=99?= Date: Fri, 12 Dec 2025 14:07:48 +0800 Subject: [PATCH] Fixed the compilation failure of the vmv4r.v command Backport (riscv): Performance Optimization of SM4-CBC Encryption and Decryption with Assembly on RISC-V Architecture Performance Optimization of SM4-CBC Encryption and Decryption with Assembly on RISC-V Architecture Reviewed-by: Tomas Mraz Reviewed-by: Paul Dale (Merged from #29137) Added assembly implementation for SM4-CBC encryption and decryption. The code adopts a block processing approach, dynamically selecting different blocks for batch processing based on data length. The encryption function supports 4-block and single-block modes, while the decryption function supports 8-block, 4-block, and single-block modes. After optimization, the performance of both encryption and decryption has been enhanced. Test Verification ``` make test All tests successful. Files=343, Tests=3669, 2003 wallclock secs (321.33 usr 246.33 sys + 9286.09 cusr 4460.92 csys = 14314.67 CPU) Result: PASS ``` The average result was taken from three test runs performed on a RISC-V virtual machine featuring the Zvksed extensions. Performance Improvement ``` openssl speed -elapsed -seconds 3 -evp sm4-cbc openssl speed -decrypt -elapsed -seconds 3 -evp sm4-cbc ``` benchmark results run on qemu_Anolis OS 23.3: | Encrypt Test | Optimized | | ------------ | --------------- | | 16 bytes | 16644.32 k | | 64 bytes | 29947.79k | | 256 bytes | 37251.44k | | 1024 bytes | 39770.23k | | 8192 bytes | 40545.85k | | 16384 bytes | 40594.09k | | Decrypt Test | Optimized | | ------------ | --------------- | | 16 bytes | 12742.69k | | 64 bytes | 24635.21k | | 256 bytes | 31858.40k | | 1024 bytes | 34265.43k | | 8192 bytes | 35008.97k | | 16384 bytes | 35063.58k | Signed-off-by: lianjing --- ...ation-failure-of-the-vmv4r.v-command.patch | 37 + ...with-Assembly-on-RISC-V-Architecture.patch | 802 ++++++++++++++++++ openssl.spec | 10 +- 3 files changed, 847 insertions(+), 2 deletions(-) create mode 100644 3500-Fixed-the-compilation-failure-of-the-vmv4r.v-command.patch create mode 100644 3501-Backport-riscv-Performance-Optimization-of-SM4-CBC-Encryption-and-Decryption-with-Assembly-on-RISC-V-Architecture.patch diff --git a/3500-Fixed-the-compilation-failure-of-the-vmv4r.v-command.patch b/3500-Fixed-the-compilation-failure-of-the-vmv4r.v-command.patch new file mode 100644 index 0000000..2f9e404 --- /dev/null +++ b/3500-Fixed-the-compilation-failure-of-the-vmv4r.v-command.patch @@ -0,0 +1,37 @@ +From 325bef5a481f59ae8bf6dc03ef9552646330813f Mon Sep 17 00:00:00 2001 +From: zhoulu +Date: Fri, 12 Dec 2025 10:24:52 +0800 +Subject: [PATCH] Fixed the compilation failure of the vmv4r.v command + +Fixed the compilation failure of the vmv4r.v command in Anolis OS 23.3 system. 
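+
+A minimal sketch of the substitution, assuming the vmv_v_v helper in
+crypto/perlasm/riscv.pm expands to a plain vmv.v.v in the same way the
+other helpers used in this file expand to their namesake instructions:
+
+```
+# before: whole-register-group copy that fails to build on Anolis OS 23.3
+vmv4r.v v24, v20
+# after: element-wise copy under the vector configuration already in effect
+vmv.v.v v24, v20
+```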
+ +Signed-off-by: lianjing +--- + crypto/sm4/asm/sm4-riscv64-zvksed.pl | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/crypto/sm4/asm/sm4-riscv64-zvksed.pl b/crypto/sm4/asm/sm4-riscv64-zvksed.pl +index b0251bc..4e13d5d 100644 +--- a/crypto/sm4/asm/sm4-riscv64-zvksed.pl ++++ b/crypto/sm4/asm/sm4-riscv64-zvksed.pl +@@ -350,7 +350,7 @@ rv64i_zvkned_sm4_ecb_encrypt: + @{[vrev8_v $V24, $V24]} + @{[vrgather_vv $V28, $V24, $V16]} + @{[vse32_v $V28, $a1]} +- vmv4r.v $V24, $V20 ++ @{[vmv_v_v $V24, $V20]} + add $a1, $a1, $t6 + mv $t3, $t5 + sub $t5, $t5, $t2 +@@ -440,7 +440,7 @@ rv64i_zvkned_sm4_ecb_decrypt: + @{[vrev8_v $V24, $V24]} + @{[vrgather_vv $V28, $V24, $V16]} + @{[vse32_v $V28, $a1]} +- vmv4r.v $V24, $V20 ++ @{[vmv_v_v $V24, $V20]} + add $a1, $a1, $t6 + mv $t3, $t5 + sub $t5, $t5, $t2 +-- +2.27.0 + diff --git a/3501-Backport-riscv-Performance-Optimization-of-SM4-CBC-Encryption-and-Decryption-with-Assembly-on-RISC-V-Architecture.patch b/3501-Backport-riscv-Performance-Optimization-of-SM4-CBC-Encryption-and-Decryption-with-Assembly-on-RISC-V-Architecture.patch new file mode 100644 index 0000000..8722d9c --- /dev/null +++ b/3501-Backport-riscv-Performance-Optimization-of-SM4-CBC-Encryption-and-Decryption-with-Assembly-on-RISC-V-Architecture.patch @@ -0,0 +1,802 @@ +From b14e8cea6a5ea37815184dcb912285da3d5a6ae5 Mon Sep 17 00:00:00 2001 +From: zhoulu +Date: Fri, 12 Dec 2025 10:39:17 +0800 +Subject: [PATCH] Backport (riscv): Performance Optimization of SM4-CBC Encryption and Decryption with Assembly on RISC-V Architecture + +Performance Optimization of SM4-CBC Encryption and Decryption with Assembly on RISC-V Architecture +Reviewed-by: Tomas Mraz +Reviewed-by: Paul Dale +(Merged from #29137) + +Signed-off-by: lianjing +--- + crypto/sm4/asm/sm4-riscv64-zvksed.pl | 646 ++++++++++++++++-- + include/crypto/sm4_platform.h | 6 + + .../ciphers/cipher_sm4_hw_rv64i.inc | 8 + + test/recipes/30-test_evp_data/evpciph_sm4.txt | 6 + + 4 files changed, 591 insertions(+), 75 deletions(-) + +diff --git a/crypto/sm4/asm/sm4-riscv64-zvksed.pl b/crypto/sm4/asm/sm4-riscv64-zvksed.pl +index 4e13d5d..826b610 100644 +--- a/crypto/sm4/asm/sm4-riscv64-zvksed.pl ++++ b/crypto/sm4/asm/sm4-riscv64-zvksed.pl +@@ -59,6 +59,548 @@ my $code=<<___; + .text + ___ + ++my $BLOCK_SIZE = 16; ++my $STRIDE = -4; # Used for reversing word order ++my $FOUR_BLOCKS = 64; ++my $EIGHT_BLOCKS = 128; ++my ($vk0,$vk1,$vk2,$vk3,$vk4,$vk5,$vk6,$vk7)=("v16","v17","v18","v19","v20","v21","v22","v23"); ++my ($tmp_stride,$tmp_base)=("t1","t2"); ++# Loading with word order reversed ++sub reverse_order_L { ++ my $vreg = shift; ++ my $base_reg = shift; ++ ++ return <<___; ++ addi $tmp_base, $base_reg, 12 ++ @{[vlse32_v $vreg, $tmp_base, $tmp_stride]} ++___ ++} ++ ++# Storing with word order reversed ++sub reverse_order_S { ++ my $vreg = shift; ++ my $base_reg = shift; ++ ++ return <<___; ++ addi $tmp_base, $base_reg, 12 ++ @{[vsse32_v $vreg, $tmp_base, $tmp_stride]} ++___ ++} ++ ++# Load 32 round keys ++sub enc_load_key { ++ my $keys = shift; ++ ++ my $code=<<___; ++ # Order of elements was adjusted in set_encrypt_key() ++ @{[vle32_v $vk0, $keys]} # rk[0:3] ++ addi $keys, $keys, $BLOCK_SIZE ++ @{[vle32_v $vk1, $keys]} # rk[4:7] ++ addi $keys, $keys, $BLOCK_SIZE ++ @{[vle32_v $vk2, $keys]} # rk[8:11] ++ addi $keys, $keys, $BLOCK_SIZE ++ @{[vle32_v $vk3, $keys]} # rk[12:15] ++ addi $keys, $keys, $BLOCK_SIZE ++ @{[vle32_v $vk4, $keys]} # rk[16:19] ++ addi $keys, $keys, $BLOCK_SIZE ++ @{[vle32_v $vk5, $keys]} # rk[20:23] ++ addi 
$keys, $keys, $BLOCK_SIZE ++ @{[vle32_v $vk6, $keys]} # rk[24:27] ++ addi $keys, $keys, $BLOCK_SIZE ++ @{[vle32_v $vk7, $keys]} # rk[28:31] ++___ ++ ++ return $code; ++} ++ ++sub dec_load_key { ++ my $keys = shift; ++ ++ my $code=<<___; ++ # Order of elements was adjusted in set_decrypt_key() ++ @{[vle32_v $vk7, $keys]} # rk[31:28] ++ addi $keys, $keys, $BLOCK_SIZE ++ @{[vle32_v $vk6, $keys]} # rk[27:24] ++ addi $keys, $keys, $BLOCK_SIZE ++ @{[vle32_v $vk5, $keys]} # rk[23:20] ++ addi $keys, $keys, $BLOCK_SIZE ++ @{[vle32_v $vk4, $keys]} # rk[19:16] ++ addi $keys, $keys, $BLOCK_SIZE ++ @{[vle32_v $vk3, $keys]} # rk[15:12] ++ addi $keys, $keys, $BLOCK_SIZE ++ @{[vle32_v $vk2, $keys]} # rk[11:8] ++ addi $keys, $keys, $BLOCK_SIZE ++ @{[vle32_v $vk1, $keys]} # rk[7:4] ++ addi $keys, $keys, $BLOCK_SIZE ++ @{[vle32_v $vk0, $keys]} # rk[3:0] ++___ ++ ++ return $code; ++} ++ ++# Encrypt with all keys ++sub enc_blk { ++ my $data = shift; ++ ++ my $code=<<___; ++ @{[vsm4r_vs $data, $vk0]} ++ @{[vsm4r_vs $data, $vk1]} ++ @{[vsm4r_vs $data, $vk2]} ++ @{[vsm4r_vs $data, $vk3]} ++ @{[vsm4r_vs $data, $vk4]} ++ @{[vsm4r_vs $data, $vk5]} ++ @{[vsm4r_vs $data, $vk6]} ++ @{[vsm4r_vs $data, $vk7]} ++___ ++ ++ return $code; ++} ++ ++# Decrypt with all keys ++sub dec_blk { ++ my $data = shift; ++ ++ my $code=<<___; ++ @{[vsm4r_vs $data, $vk7]} ++ @{[vsm4r_vs $data, $vk6]} ++ @{[vsm4r_vs $data, $vk5]} ++ @{[vsm4r_vs $data, $vk4]} ++ @{[vsm4r_vs $data, $vk3]} ++ @{[vsm4r_vs $data, $vk2]} ++ @{[vsm4r_vs $data, $vk1]} ++ @{[vsm4r_vs $data, $vk0]} ++___ ++ ++ return $code; ++} ++ ++# Decrypt 4 blocks with all keys ++sub dec_4blks { ++ my $data0 = shift; ++ my $data1 = shift; ++ my $data2 = shift; ++ my $data3 = shift; ++ ++ my $code=<<___; ++ @{[vsm4r_vs $data0, $vk7]} ++ @{[vsm4r_vs $data1, $vk7]} ++ @{[vsm4r_vs $data2, $vk7]} ++ @{[vsm4r_vs $data3, $vk7]} ++ ++ @{[vsm4r_vs $data0, $vk6]} ++ @{[vsm4r_vs $data1, $vk6]} ++ @{[vsm4r_vs $data2, $vk6]} ++ @{[vsm4r_vs $data3, $vk6]} ++ ++ @{[vsm4r_vs $data0, $vk5]} ++ @{[vsm4r_vs $data1, $vk5]} ++ @{[vsm4r_vs $data2, $vk5]} ++ @{[vsm4r_vs $data3, $vk5]} ++ ++ @{[vsm4r_vs $data0, $vk4]} ++ @{[vsm4r_vs $data1, $vk4]} ++ @{[vsm4r_vs $data2, $vk4]} ++ @{[vsm4r_vs $data3, $vk4]} ++ ++ @{[vsm4r_vs $data0, $vk3]} ++ @{[vsm4r_vs $data1, $vk3]} ++ @{[vsm4r_vs $data2, $vk3]} ++ @{[vsm4r_vs $data3, $vk3]} ++ ++ @{[vsm4r_vs $data0, $vk2]} ++ @{[vsm4r_vs $data1, $vk2]} ++ @{[vsm4r_vs $data2, $vk2]} ++ @{[vsm4r_vs $data3, $vk2]} ++ ++ @{[vsm4r_vs $data0, $vk1]} ++ @{[vsm4r_vs $data1, $vk1]} ++ @{[vsm4r_vs $data2, $vk1]} ++ @{[vsm4r_vs $data3, $vk1]} ++ ++ @{[vsm4r_vs $data0, $vk0]} ++ @{[vsm4r_vs $data1, $vk0]} ++ @{[vsm4r_vs $data2, $vk0]} ++ @{[vsm4r_vs $data3, $vk0]} ++___ ++ ++ return $code; ++} ++ ++#### ++# void rv64i_zvksed_sm4_cbc_encrypt(const unsigned char *in, unsigned char *out, ++# size_t len, const SM4_KEY *key, ++# unsigned char *iv, int enc); ++# ++{ ++my ($in,$out,$len,$keys,$ivp)=("a0","a1","a2","a3","a4"); ++my ($tmp,$base)=("t0","t2"); ++my ($vdata0,$vdata1,$vdata2,$vdata3,$vdata4,$vdata5,$vdata6,$vdata7)=("v1","v2","v3","v4","v5","v6","v7","v24"); ++my ($vivec)=("v8"); ++ ++$code .= <<___; ++.p2align 3 ++.globl rv64i_zvksed_sm4_cbc_encrypt ++.type rv64i_zvksed_sm4_cbc_encrypt,\@function ++rv64i_zvksed_sm4_cbc_encrypt: ++ # check whether the length is a multiple of 16 and >= 16 ++ li $tmp, $BLOCK_SIZE ++ bltu $len, $tmp, .Lcbc_enc_end ++ andi $tmp, $len, 15 ++ bnez $tmp, .Lcbc_enc_end ++ ++ @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]} ++ # Load 32 round keys ++ 
@{[enc_load_key $keys]} ++ ++ # Load IV ++ @{[vle32_v $vivec, $ivp]} ++# ===================================================== ++# If data length ≥ 64 bytes, process 4 blocks in batch: ++# 4-block CBC encryption pipeline: ++# 1. Load 4 plaintext blocks ++# 2. Reverse bytes for SM4 endianness ++# 3. Perform XOR operation with IV or previous ciphertext block (CBC chain) ++# 4. Encrypt each data block using the enc_blk function ++# 5. Adjust the byte order and store the ciphertext block ++# 6. Update the initialization vector (IV) ++# If data length < 64 bytes, process it block by block using the Lcbc_enc_single function ++# ===================================================== ++.Lcbc_enc_loop: ++ li $tmp, $FOUR_BLOCKS ++ bltu $len, $tmp, .Lcbc_enc_single ++ # Load input data0-data3 ++ @{[vle32_v $vdata0, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ @{[vle32_v $vdata1, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ @{[vle32_v $vdata2, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ @{[vle32_v $vdata3, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ #XOR with IV ++ @{[vxor_vv $vdata0, $vdata0, $vivec]} ++ ++ @{[vrev8_v $vdata0, $vdata0]} ++ # Encrypt with all keys ++ @{[enc_blk $vdata0]} ++ @{[vrev8_v $vdata0, $vdata0]} ++ ++ # Save the ciphertext (in reverse element order) ++ li $tmp_stride, $STRIDE ++ @{[reverse_order_S $vdata0, $out]} ++ #Update IV to ciphertext block 0 ++ @{[vle32_v $vivec, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ @{[vxor_vv $vdata1, $vdata1, $vivec]} ++ ++ @{[vrev8_v $vdata1, $vdata1]} ++ @{[enc_blk $vdata1]} ++ @{[vrev8_v $vdata1, $vdata1]} ++ ++ @{[reverse_order_S $vdata1, $out]} ++ ++ #Update IV to ciphertext block 1 ++ @{[vle32_v $vivec, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ @{[vxor_vv $vdata2, $vdata2, $vivec]} ++ ++ @{[vrev8_v $vdata2, $vdata2]} ++ @{[enc_blk $vdata2]} ++ @{[vrev8_v $vdata2, $vdata2]} ++ ++ @{[reverse_order_S $vdata2, $out]} ++ #Update IV to ciphertext block 2 ++ @{[vle32_v $vivec, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ @{[vxor_vv $vdata3, $vdata3, $vivec]} ++ ++ @{[vrev8_v $vdata3, $vdata3]} ++ @{[enc_blk $vdata3]} ++ @{[vrev8_v $vdata3, $vdata3]} ++ ++ @{[reverse_order_S $vdata3, $out]} ++ #Update IV to ciphertext block 3 ++ @{[vle32_v $vivec, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ addi $len, $len, -$FOUR_BLOCKS ++ bnez $len, .Lcbc_enc_loop ++ #Save the final IV ++ @{[vse32_v $vivec, $ivp]} ++ ret ++ ++.Lcbc_enc_single: ++ # Load input data0 ++ @{[vle32_v $vdata0, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ #XOR with IV ++ @{[vxor_vv $vdata0, $vdata0, $vivec]} ++ ++ @{[vrev8_v $vdata0, $vdata0]} ++ # Encrypt with all keys ++ @{[enc_blk $vdata0]} ++ @{[vrev8_v $vdata0, $vdata0]} ++ ++ # Save the ciphertext (in reverse element order) ++ li $tmp_stride, $STRIDE ++ @{[reverse_order_S $vdata0, $out]} ++ ++ # Update IV to ciphertext block 0 ++ @{[vle32_v $vivec, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ addi $len, $len, -$BLOCK_SIZE ++ ++ li $tmp, $BLOCK_SIZE ++ bgeu $len, $tmp, .Lcbc_enc_single ++ # Save the final IV ++ @{[vse32_v $vivec, $ivp]} ++.Lcbc_enc_end: ++ ret ++.size rv64i_zvksed_sm4_cbc_encrypt,.-rv64i_zvksed_sm4_cbc_encrypt ++___ ++ ++#### ++# void rv64i_zvksed_sm4_cbc_decrypt(const unsigned char *in, unsigned char *out, ++# size_t len, const SM4_KEY *key, ++# unsigned char *iv, int enc); ++# ++$code .= <<___; ++.p2align 3 ++.globl rv64i_zvksed_sm4_cbc_decrypt ++.type rv64i_zvksed_sm4_cbc_decrypt,\@function ++rv64i_zvksed_sm4_cbc_decrypt: ++ # check whether the length is a multiple of 16 and >= 16 ++ li $tmp, $BLOCK_SIZE ++ bltu $len, $tmp, 
.Lcbc_dec_end ++ andi $tmp, $len, 15 ++ bnez $tmp, .Lcbc_dec_end ++ ++ @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]} ++ # Load IV (in reverse element order) ++ li $tmp_stride, $STRIDE ++ @{[reverse_order_L $vivec, $ivp]} ++ ++ # Load 32 round keys ++ @{[dec_load_key $keys]} ++# ===================================================== ++# If data length ≥ 128 bytes, process 8 blocks in batch: ++# 8-block CBC decryption pipeline: ++# 1. Load 8 ciphertext blocks ++# 2. Reverse bytes for SM4 endianness ++# 3. Use two calls to dec_4blks for decrypting each data block ++# 4. XOR with previous ciphertext block (CBC chain) ++# 5. Update IV and store plaintext with byte reversal ++# ===================================================== ++.Lcbc_dec_loop: ++ li $tmp, $EIGHT_BLOCKS ++ bltu $len, $tmp, .Lcbc_check_64 ++ # Load input data0-data7 ++ @{[vle32_v $vdata0, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ @{[vle32_v $vdata1, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ @{[vle32_v $vdata2, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ @{[vle32_v $vdata3, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ @{[vle32_v $vdata4, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ @{[vle32_v $vdata5, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ @{[vle32_v $vdata6, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ @{[vle32_v $vdata7, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ ++ @{[vrev8_v $vdata0, $vdata0]} ++ @{[vrev8_v $vdata1, $vdata1]} ++ @{[vrev8_v $vdata2, $vdata2]} ++ @{[vrev8_v $vdata3, $vdata3]} ++ @{[vrev8_v $vdata4, $vdata4]} ++ @{[vrev8_v $vdata5, $vdata5]} ++ @{[vrev8_v $vdata6, $vdata6]} ++ @{[vrev8_v $vdata7, $vdata7]} ++ # Decrypt 8 data blocks ++ @{[dec_4blks $vdata0,$vdata1,$vdata2,$vdata3]} ++ @{[dec_4blks $vdata4,$vdata5,$vdata6,$vdata7]} ++ @{[vrev8_v $vdata0, $vdata0]} ++ @{[vrev8_v $vdata1, $vdata1]} ++ @{[vrev8_v $vdata2, $vdata2]} ++ @{[vrev8_v $vdata3, $vdata3]} ++ @{[vrev8_v $vdata4, $vdata4]} ++ @{[vrev8_v $vdata5, $vdata5]} ++ @{[vrev8_v $vdata6, $vdata6]} ++ @{[vrev8_v $vdata7, $vdata7]} ++ ++ @{[vxor_vv $vdata0, $vdata0, $vivec]} ++ ++ # Update ciphertext to IV (in reverse element order) ++ addi $base, $in, -128 ++ @{[reverse_order_L $vivec, $base]} ++ ++ # Save the plaintext (in reverse element order) ++ @{[reverse_order_S $vdata0, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ @{[vxor_vv $vdata1, $vdata1, $vivec]} ++ ++ addi $base, $in, -112 ++ @{[reverse_order_L $vivec, $base]} ++ @{[reverse_order_S $vdata1, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ @{[vxor_vv $vdata2, $vdata2, $vivec]} ++ ++ addi $base, $in, -96 ++ @{[reverse_order_L $vivec, $base]} ++ @{[reverse_order_S $vdata2, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ @{[vxor_vv $vdata3, $vdata3, $vivec]} ++ ++ addi $base, $in, -80 ++ @{[reverse_order_L $vivec, $base]} ++ @{[reverse_order_S $vdata3, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ @{[vxor_vv $vdata4, $vdata4, $vivec]} ++ ++ addi $base, $in, -64 ++ @{[reverse_order_L $vivec, $base]} ++ @{[reverse_order_S $vdata4, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ @{[vxor_vv $vdata5, $vdata5, $vivec]} ++ ++ addi $base, $in, -48 ++ @{[reverse_order_L $vivec, $base]} ++ @{[reverse_order_S $vdata5, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ @{[vxor_vv $vdata6, $vdata6, $vivec]} ++ ++ addi $base, $in, -32 ++ @{[reverse_order_L $vivec, $base]} ++ @{[reverse_order_S $vdata6, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ @{[vxor_vv $vdata7, $vdata7, $vivec]} ++ ++ addi $base, $in, -16 ++ @{[reverse_order_L $vivec, $base]} ++ @{[reverse_order_S $vdata7, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ addi $len, $len, 
-$EIGHT_BLOCKS ++ bnez $len, .Lcbc_dec_loop ++ #Save the final IV (in reverse element order) ++ @{[reverse_order_S $vivec, $ivp]} ++ ret ++# ===================================================== ++# If data length ≥ 64 bytes, process in batches of 4 blocks: ++# 4-block CBC decryption process: ++# 1. Load 4 ciphertext blocks ++# 2. Reverse byte order to fit SM4 byte order ++# 3. Decrypt each data block using the dec_4blks function ++# 4. XOR with previous ciphertext block (CBC chain) ++# 5. Update IV and store plaintext with byte reversal ++# If the data length is less than 64 bytes, process it block by block using the Lcbc_dec_single function ++# ===================================================== ++.Lcbc_check_64: ++ li $tmp, $FOUR_BLOCKS ++ bltu $len, $tmp, .Lcbc_dec_single ++ # Load input data0-data3 ++ @{[vle32_v $vdata0, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ @{[vle32_v $vdata1, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ @{[vle32_v $vdata2, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ @{[vle32_v $vdata3, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ ++ @{[vrev8_v $vdata0, $vdata0]} ++ @{[vrev8_v $vdata1, $vdata1]} ++ @{[vrev8_v $vdata2, $vdata2]} ++ @{[vrev8_v $vdata3, $vdata3]} ++ # Decrypt 4 data blocks ++ @{[dec_4blks $vdata0,$vdata1,$vdata2,$vdata3]} ++ @{[vrev8_v $vdata0, $vdata0]} ++ @{[vrev8_v $vdata1, $vdata1]} ++ @{[vrev8_v $vdata2, $vdata2]} ++ @{[vrev8_v $vdata3, $vdata3]} ++ ++ @{[vxor_vv $vdata0, $vdata0, $vivec]} ++ ++ # Update ciphertext to IV (in reverse element order) ++ addi $base, $in, -64 ++ @{[reverse_order_L $vivec, $base]} ++ # Save the plaintext (in reverse element order) ++ @{[reverse_order_S $vdata0, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ @{[vxor_vv $vdata1, $vdata1, $vivec]} ++ ++ addi $base, $in, -48 ++ @{[reverse_order_L $vivec, $base]} ++ @{[reverse_order_S $vdata1, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ @{[vxor_vv $vdata2, $vdata2, $vivec]} ++ ++ addi $base, $in, -32 ++ @{[reverse_order_L $vivec, $base]} ++ @{[reverse_order_S $vdata2, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ @{[vxor_vv $vdata3, $vdata3, $vivec]} ++ ++ addi $base, $in, -16 ++ @{[reverse_order_L $vivec, $base]} ++ @{[reverse_order_S $vdata3, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ ++ addi $len, $len, -$FOUR_BLOCKS ++ bnez $len, .Lcbc_check_64 ++ #Save the final IV (in reverse element order) ++ @{[reverse_order_S $vivec, $ivp]} ++ ret ++ ++.Lcbc_dec_single: ++ # Load input data0 ++ @{[vle32_v $vdata0, $in]} ++ addi $in, $in, $BLOCK_SIZE ++ ++ @{[vrev8_v $vdata0, $vdata0]} ++ # Decrypt with all keys ++ @{[dec_blk $vdata0]} ++ @{[vrev8_v $vdata0, $vdata0]} ++ ++ #XOR with IV ++ @{[vxor_vv $vdata0, $vdata0, $vivec]} ++ ++ # Update ciphertext to IV (in reverse element order) ++ li $tmp_stride, $STRIDE ++ addi $base, $in, -$BLOCK_SIZE ++ @{[reverse_order_L $vivec, $base]} ++ # Save the plaintext (in reverse element order) ++ @{[reverse_order_S $vdata0, $out]} ++ addi $out, $out, $BLOCK_SIZE ++ addi $len, $len, -$BLOCK_SIZE ++ ++ li $tmp, $BLOCK_SIZE ++ bgeu $len, $tmp, .Lcbc_dec_single ++ #Save the final IV (in reverse element order) ++ @{[reverse_order_S $vivec, $ivp]} ++.Lcbc_dec_end: ++ ret ++.size rv64i_zvksed_sm4_cbc_decrypt,.-rv64i_zvksed_sm4_cbc_decrypt ++___ ++} ++ + #### + # int rv64i_zvksed_sm4_set_encrypt_key(const unsigned char *userKey, + # SM4_KEY *key); +@@ -95,19 +637,19 @@ rv64i_zvksed_sm4_set_encrypt_key: + + # Store round keys + @{[vse32_v $vk0, $keys]} # rk[0:3] +- addi $keys, $keys, 16 ++ addi $keys, $keys, $BLOCK_SIZE + @{[vse32_v $vk1, $keys]} # rk[4:7] +- addi $keys, 
$keys, 16 ++ addi $keys, $keys, $BLOCK_SIZE + @{[vse32_v $vk2, $keys]} # rk[8:11] +- addi $keys, $keys, 16 ++ addi $keys, $keys, $BLOCK_SIZE + @{[vse32_v $vk3, $keys]} # rk[12:15] +- addi $keys, $keys, 16 ++ addi $keys, $keys, $BLOCK_SIZE + @{[vse32_v $vk4, $keys]} # rk[16:19] +- addi $keys, $keys, 16 ++ addi $keys, $keys, $BLOCK_SIZE + @{[vse32_v $vk5, $keys]} # rk[20:23] +- addi $keys, $keys, 16 ++ addi $keys, $keys, $BLOCK_SIZE + @{[vse32_v $vk6, $keys]} # rk[24:27] +- addi $keys, $keys, 16 ++ addi $keys, $keys, $BLOCK_SIZE + @{[vse32_v $vk7, $keys]} # rk[28:31] + + li a0, 1 +@@ -151,21 +693,21 @@ rv64i_zvksed_sm4_set_decrypt_key: + + # Store round keys in reverse order + addi $keys, $keys, 12 +- li $stride, -4 ++ li $stride, $STRIDE + @{[vsse32_v $vk7, $keys, $stride]} # rk[31:28] +- addi $keys, $keys, 16 ++ addi $keys, $keys, $BLOCK_SIZE + @{[vsse32_v $vk6, $keys, $stride]} # rk[27:24] +- addi $keys, $keys, 16 ++ addi $keys, $keys, $BLOCK_SIZE + @{[vsse32_v $vk5, $keys, $stride]} # rk[23:20] +- addi $keys, $keys, 16 ++ addi $keys, $keys, $BLOCK_SIZE + @{[vsse32_v $vk4, $keys, $stride]} # rk[19:16] +- addi $keys, $keys, 16 ++ addi $keys, $keys, $BLOCK_SIZE + @{[vsse32_v $vk3, $keys, $stride]} # rk[15:12] +- addi $keys, $keys, 16 ++ addi $keys, $keys, $BLOCK_SIZE + @{[vsse32_v $vk2, $keys, $stride]} # rk[11:8] +- addi $keys, $keys, 16 ++ addi $keys, $keys, $BLOCK_SIZE + @{[vsse32_v $vk1, $keys, $stride]} # rk[7:4] +- addi $keys, $keys, 16 ++ addi $keys, $keys, $BLOCK_SIZE + @{[vsse32_v $vk0, $keys, $stride]} # rk[3:0] + + li a0, 1 +@@ -179,8 +721,8 @@ ___ + # const SM4_KEY *key); + # + { +-my ($in,$out,$keys,$stride)=("a0","a1","a2","t0"); +-my ($vdata,$vk0,$vk1,$vk2,$vk3,$vk4,$vk5,$vk6,$vk7,$vgen)=("v1","v2","v3","v4","v5","v6","v7","v8","v9","v10"); ++my ($in,$out,$keys)=("a0","a1","a2"); ++my ($vdata)=("v1"); + $code .= <<___; + .p2align 3 + .globl rv64i_zvksed_sm4_encrypt +@@ -188,42 +730,19 @@ $code .= <<___; + rv64i_zvksed_sm4_encrypt: + @{[vsetivli__x0_4_e32_m1_tu_mu]} + +- # Order of elements was adjusted in set_encrypt_key() +- @{[vle32_v $vk0, $keys]} # rk[0:3] +- addi $keys, $keys, 16 +- @{[vle32_v $vk1, $keys]} # rk[4:7] +- addi $keys, $keys, 16 +- @{[vle32_v $vk2, $keys]} # rk[8:11] +- addi $keys, $keys, 16 +- @{[vle32_v $vk3, $keys]} # rk[12:15] +- addi $keys, $keys, 16 +- @{[vle32_v $vk4, $keys]} # rk[16:19] +- addi $keys, $keys, 16 +- @{[vle32_v $vk5, $keys]} # rk[20:23] +- addi $keys, $keys, 16 +- @{[vle32_v $vk6, $keys]} # rk[24:27] +- addi $keys, $keys, 16 +- @{[vle32_v $vk7, $keys]} # rk[28:31] ++ @{[enc_load_key $keys]} + + # Load input data + @{[vle32_v $vdata, $in]} + @{[vrev8_v $vdata, $vdata]} + + # Encrypt with all keys +- @{[vsm4r_vs $vdata, $vk0]} +- @{[vsm4r_vs $vdata, $vk1]} +- @{[vsm4r_vs $vdata, $vk2]} +- @{[vsm4r_vs $vdata, $vk3]} +- @{[vsm4r_vs $vdata, $vk4]} +- @{[vsm4r_vs $vdata, $vk5]} +- @{[vsm4r_vs $vdata, $vk6]} +- @{[vsm4r_vs $vdata, $vk7]} ++ @{[enc_blk $vdata]} + + # Save the ciphertext (in reverse element order) + @{[vrev8_v $vdata, $vdata]} +- li $stride, -4 +- addi $out, $out, 12 +- @{[vsse32_v $vdata, $out, $stride]} ++ li $tmp_stride, $STRIDE ++ @{[reverse_order_S $vdata, $out]} + + ret + .size rv64i_zvksed_sm4_encrypt,.-rv64i_zvksed_sm4_encrypt +@@ -235,8 +754,8 @@ ___ + # const SM4_KEY *key); + # + { +-my ($in,$out,$keys,$stride)=("a0","a1","a2","t0"); +-my ($vdata,$vk0,$vk1,$vk2,$vk3,$vk4,$vk5,$vk6,$vk7,$vgen)=("v1","v2","v3","v4","v5","v6","v7","v8","v9","v10"); ++my ($in,$out,$keys)=("a0","a1","a2"); ++my ($vdata)=("v1"); + $code .= 
<<___; + .p2align 3 + .globl rv64i_zvksed_sm4_decrypt +@@ -244,42 +763,19 @@ $code .= <<___; + rv64i_zvksed_sm4_decrypt: + @{[vsetivli__x0_4_e32_m1_tu_mu]} + +- # Order of elements was adjusted in set_decrypt_key() +- @{[vle32_v $vk7, $keys]} # rk[31:28] +- addi $keys, $keys, 16 +- @{[vle32_v $vk6, $keys]} # rk[27:24] +- addi $keys, $keys, 16 +- @{[vle32_v $vk5, $keys]} # rk[23:20] +- addi $keys, $keys, 16 +- @{[vle32_v $vk4, $keys]} # rk[19:16] +- addi $keys, $keys, 16 +- @{[vle32_v $vk3, $keys]} # rk[15:11] +- addi $keys, $keys, 16 +- @{[vle32_v $vk2, $keys]} # rk[11:8] +- addi $keys, $keys, 16 +- @{[vle32_v $vk1, $keys]} # rk[7:4] +- addi $keys, $keys, 16 +- @{[vle32_v $vk0, $keys]} # rk[3:0] ++ @{[dec_load_key $keys]} + + # Load input data + @{[vle32_v $vdata, $in]} + @{[vrev8_v $vdata, $vdata]} + +- # Encrypt with all keys +- @{[vsm4r_vs $vdata, $vk7]} +- @{[vsm4r_vs $vdata, $vk6]} +- @{[vsm4r_vs $vdata, $vk5]} +- @{[vsm4r_vs $vdata, $vk4]} +- @{[vsm4r_vs $vdata, $vk3]} +- @{[vsm4r_vs $vdata, $vk2]} +- @{[vsm4r_vs $vdata, $vk1]} +- @{[vsm4r_vs $vdata, $vk0]} ++ # Decrypt with all keys ++ @{[dec_blk $vdata]} + +- # Save the ciphertext (in reverse element order) ++ # Save the plaintext (in reverse element order) + @{[vrev8_v $vdata, $vdata]} +- li $stride, -4 +- addi $out, $out, 12 +- @{[vsse32_v $vdata, $out, $stride]} ++ li $tmp_stride, $STRIDE ++ @{[reverse_order_S $vdata, $out]} + + ret + .size rv64i_zvksed_sm4_decrypt,.-rv64i_zvksed_sm4_decrypt +diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h +index 4a94711..edd5fcf 100644 +--- a/include/crypto/sm4_platform.h ++++ b/include/crypto/sm4_platform.h +@@ -56,6 +56,12 @@ void rv64i_zvkned_sm4_ecb_encrypt(const unsigned char *in, unsigned char *out, + void rv64i_zvkned_sm4_ecb_decrypt(const unsigned char *in, unsigned char *out, + size_t length, const SM4_KEY *key, + const int enc); ++void rv64i_zvksed_sm4_cbc_encrypt(const unsigned char *in, unsigned char *out, ++ size_t len, const SM4_KEY *key, ++ unsigned char *iv, int enc); ++void rv64i_zvksed_sm4_cbc_decrypt(const unsigned char *in, unsigned char *out, ++ size_t len, const SM4_KEY *key, ++ unsigned char *iv, int enc); + # endif /* RV64 */ + # endif /* OPENSSL_CPUID_OBJ */ + +diff --git a/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc b/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc +index 2b80002..7fe3ef2 100644 +--- a/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc ++++ b/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc +@@ -49,6 +49,14 @@ static int cipher_hw_rv64i_zvksed_sm4_initkey(PROV_CIPHER_CTX *ctx, + } + } + ++ if (ctx->mode == EVP_CIPH_CBC_MODE) { ++ if (ctx->enc) { ++ ctx->stream.cbc = (cbc128_f) rv64i_zvksed_sm4_cbc_encrypt; ++ } else { ++ ctx->stream.cbc = (cbc128_f) rv64i_zvksed_sm4_cbc_decrypt; ++ } ++ } ++ + return 1; + } + +diff --git a/test/recipes/30-test_evp_data/evpciph_sm4.txt b/test/recipes/30-test_evp_data/evpciph_sm4.txt +index 993cf7b..f23129c 100644 +--- a/test/recipes/30-test_evp_data/evpciph_sm4.txt ++++ b/test/recipes/30-test_evp_data/evpciph_sm4.txt +@@ -13,6 +13,12 @@ Key = 0123456789ABCDEFFEDCBA9876543210 + Plaintext = 0123456789ABCDEFFEDCBA9876543210 + Ciphertext = 681EDF34D206965E86B3E94F536E4246 + ++Cipher = SM4-CBC ++Key = 0123456789ABCDEFFEDCBA9876543210 ++IV = 0123456789ABCDEFFEDCBA9876543210 ++Plaintext = 0123456789ABCDEFFEDCBA9876543210 ++Ciphertext = 2677F46B09C122CC975533105BD4A22A ++ + Cipher = SM4-CBC + Key = 0123456789ABCDEFFEDCBA9876543210 + IV = 
0123456789ABCDEFFEDCBA9876543210 +-- +2.27.0 + diff --git a/openssl.spec b/openssl.spec index 0b2e633..a5c312b 100644 --- a/openssl.spec +++ b/openssl.spec @@ -1,4 +1,4 @@ -%define anolis_release 8 +%define anolis_release 9 %global soversion 3 %define srpmhash() %{lua: @@ -97,6 +97,8 @@ Patch: 3100-Anolis-SM3-performance-optimization.patch Patch: 3000-Anolis-SHA512-performance-optimization.patch Patch: 3200-Anolis-AES-128-192-256-ECB-CBC-performance-optimization.patch Patch: 3400-Anolis-SM4-ECB-performance-optimization.patch +Patch: 3500-Fixed-the-compilation-failure-of-the-vmv4r.v-command.patch +Patch: 3501-Backport-riscv-Performance-Optimization-of-SM4-CBC-Encryption-and-Decryption-with-Assembly-on-RISC-V-Architecture.patch BuildRequires: gcc git coreutils perl-interpreter sed zlib-devel /usr/bin/cmp BuildRequires: lksctp-tools-devel @@ -186,7 +188,7 @@ export HASHBANGPERL=/usr/bin/perl --system-ciphers-file=%{_sysconfdir}/crypto-policies/back-ends/openssl.config \ zlib enable-camellia enable-seed enable-rfc3779 enable-sctp \ enable-cms enable-md2 enable-rc5 enable-ktls enable-fips\ - no-mdc2 no-ec2m enable-sm2 no-sm4 no-atexit enable-buildtest-c++\ + no-mdc2 no-ec2m enable-sm2 enable-sm4 no-atexit enable-buildtest-c++\ shared ${sslarch} $RPM_OPT_FLAGS '-DDEVRANDOM="\"/dev/urandom\"" -DREDHAT_FIPS_VERSION="\"%{fips}\""'\ -Wl,--allow-multiple-definition @@ -300,6 +302,10 @@ rm -f $RPM_BUILD_ROOT%{_sysconfdir}/pki/tls/{openssl.cnf.dist,ct_log_list.cnf.di %doc NEWS.md README.md %changelog +* Wed Dec 12 2025 geliya 1:3.5.4-9 +- Fixed-the-compilation-failure-of-the-vmv4r.v-command +- Backport-riscv-Performance-Optimization-of-SM4-CBC-Encryption-and-Decryption-with-Assembly-on-RISC-V-Architecture + * Wed Dec 11 2025 geliya 1:3.5.4-8 - Backport: Add SM2 implementation in generic riscv64 asm -- Gitee