Zvk: Implement Zvksed, vector SM4 Block Cipher

Implement the Zvksed sub-extension, "ShangMi Suite: SM4 Block Cipher": - vsm4k.vi, vector SM4 key expansion, - vsm4r.{vs,vv}, vector SM4 rounds. This also introduces a header for common vector SM4 logic. Co-authored-by: Raghav Gupta <rgupta@rivosinc.com> Co-authored-by: Albert Jakieła <aja@semihalf.com> Signed-off-by: Eric Gouriou <ego@rivosinc.com>
riscv-software-src · Jun 19, 2023 · cbb2b1a · cbb2b1a
1 parent eadb0e1
commit cbb2b1a
Show file tree

Hide file tree

Showing 6 changed files with 206 additions and 1 deletion.
diff --git a/riscv/insns/sm4_common.h b/riscv/insns/sm4_common.h
@@ -24,4 +24,3 @@ static const uint8_t sm4_sbox[256] = {
 	0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E,
 	0xD7, 0xCB, 0x39, 0x48
 };
-
diff --git a/riscv/insns/vsm4k_vi.h b/riscv/insns/vsm4k_vi.h
@@ -0,0 +1,52 @@
+// vsm4k.vi vd, vs2, round#
+
+#include "zvksed_ext_macros.h"
+
+// SM4 Constant Key (CK), indexed below as [4 * round + i] - section 7.3.2. of the IETF draft.
+static constexpr uint32_t zvksed_ck[32] = {
+  0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269,
+  0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9,
+  0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249,
+  0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9,
+  0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229,
+  0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299,
+  0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209,
+  0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+};
+
+require_vsm4_constraints;
+
+VI_ZVK_VD_VS2_ZIMM5_EGU32x4_NOVM_LOOP(
+  {},
+  // The following statements will be executed before the first execution
+  // of the loop, and only if the loop is going to be entered.
+  // We cannot use a block ( { ... } ) since we want the 'round' variable
+  // declared and defined here to be visible in the loop block.
+  // Only consider the bottom 3 bits of the immediate, ensuring that
+  // 'round' is in the valid range [0, 7].
+  const reg_t round = zimm5 & 0x7;,
+  // Per Element Group body: derives rk4..rk7 from rk0..rk3 and four CK words.
+  {
+    // {rk0, rk1, rk2, rk3} <- vs2
+    EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3);
+
+    uint32_t B = rk1 ^ rk2 ^ rk3 ^ zvksed_ck[4 * round];
+    uint32_t S = ZVKSED_SUB_BYTES(B);
+    uint32_t rk4 = ZVKSED_ROUND_KEY(rk0, S);
+
+    B = rk2 ^ rk3 ^ rk4 ^ zvksed_ck[4 * round + 1];
+    S = ZVKSED_SUB_BYTES(B);
+    uint32_t rk5 = ZVKSED_ROUND_KEY(rk1, S);
+
+    B = rk3 ^ rk4 ^ rk5 ^ zvksed_ck[4 * round + 2];
+    S = ZVKSED_SUB_BYTES(B);
+    uint32_t rk6 = ZVKSED_ROUND_KEY(rk2, S);
+
+    B = rk4 ^ rk5 ^ rk6 ^ zvksed_ck[4 * round + 3];
+    S = ZVKSED_SUB_BYTES(B);
+    uint32_t rk7 = ZVKSED_ROUND_KEY(rk3, S);
+
+    // Update the destination register: vd <- {rk4, rk5, rk6, rk7}
+    SET_EGU32x4_LE(vd, rk4, rk5, rk6, rk7);
+  }
+);
diff --git a/riscv/insns/vsm4r_vs.h b/riscv/insns/vsm4r_vs.h
@@ -0,0 +1,51 @@
+// vsm4r.vs vd, vs2
+
+#include "zvksed_ext_macros.h"
+
+require_vsm4_constraints;
+// No overlap of vd and vs2. NOTE(review): only base register numbers are compared - confirm LMUL > 1 groups.
+require(insn.rd() != insn.rs2());
+
+VI_ZVK_VD_VS2_NOOPERANDS_PRELOOP_EGU32x4_NOVM_LOOP(
+  {},
+  // These statements will be executed before the first execution
+  // of the loop, and only if the loop is going to be entered.
+  // We cannot use a block ( { ... } ) since we want the variables declared
+  // here to be visible in the loop block.
+  // We capture the "scalar", vs2's first element group, by copy, even though
+  // the "no overlap" constraint means that vs2 should remain constant
+  // during the loop.
+  const EGU32x4_t scalar_key = P.VU.elt_group<EGU32x4_t>(vs2_num, 0);
+  const uint32_t rk0 = scalar_key[0];
+  const uint32_t rk1 = scalar_key[1];
+  const uint32_t rk2 = scalar_key[2];
+  const uint32_t rk3 = scalar_key[3];,
+  {
+    EGU32x4_t &state = P.VU.elt_group<EGU32x4_t>(vd_num, idx_eg, true);
+
+    // {x0, x1, x2, x3} <- vd
+    EXTRACT_EGU32x4_WORDS_LE(state, x0, x1, x2, x3);
+
+    uint32_t B;
+    uint32_t S;
+
+    B = x1 ^ x2 ^ x3 ^ rk0;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x4 = ZVKSED_ROUND(x0, S);
+
+    B = x2 ^ x3 ^ x4 ^ rk1;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x5 = ZVKSED_ROUND(x1, S);
+
+    B = x3 ^ x4 ^ x5 ^ rk2;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x6 = ZVKSED_ROUND(x2, S);
+
+    B = x4 ^ x5 ^ x6 ^ rk3;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x7 = ZVKSED_ROUND(x3, S);
+
+    // Update the destination register: vd <- {x4, x5, x6, x7}
+    SET_EGU32x4_LE(state, x4, x5, x6, x7);
+  }
+);
diff --git a/riscv/insns/vsm4r_vv.h b/riscv/insns/vsm4r_vv.h
@@ -0,0 +1,37 @@
+// vsm4r.vv vd, vs2
+
+#include "zvksed_ext_macros.h"
+
+require_vsm4_constraints;
+
+VI_ZVK_VD_VS2_EGU32x4_NOVM_LOOP(
+  {},
+  {
+    // {x0, x1, x2, x3} <- vd
+    EXTRACT_EGU32x4_WORDS_LE(vd, x0, x1, x2, x3);
+    // {rk0, rk1, rk2, rk3} <- vs2
+    EXTRACT_EGU32x4_WORDS_LE(vs2, rk0, rk1, rk2, rk3);
+
+    uint32_t B;
+    uint32_t S;
+
+    B = x1 ^ x2 ^ x3 ^ rk0;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x4 = ZVKSED_ROUND(x0, S);
+
+    B = x2 ^ x3 ^ x4 ^ rk1;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x5 = ZVKSED_ROUND(x1, S);
+
+    B = x3 ^ x4 ^ x5 ^ rk2;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x6 = ZVKSED_ROUND(x2, S);
+
+    B = x4 ^ x5 ^ x6 ^ rk3;
+    S = ZVKSED_SUB_BYTES(B);
+    const uint32_t x7 = ZVKSED_ROUND(x3, S);
+
+    // Update the destination register: vd <- {x4, x5, x6, x7}
+    SET_EGU32x4_LE(vd, x4, x5, x6, x7);
+  }
+);
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
@@ -1387,12 +1387,18 @@ riscv_insn_ext_zvknh = \
 	vsha2ch_vv \
 	vsha2ms_vv \
 
+riscv_insn_ext_zvksed = \
+	vsm4k_vi \
+	vsm4r_vs \
+	vsm4r_vv \
+
 riscv_insn_ext_zvk = \
 	$(riscv_insn_ext_zvbb) \
 	$(riscv_insn_ext_zvbc) \
 	$(riscv_insn_ext_zvkg) \
 	$(riscv_insn_ext_zvkned) \
 	$(riscv_insn_ext_zvknh) \
+	$(riscv_insn_ext_zvksed) \
 
 riscv_insn_list = \
 	$(if $(HAVE_INT128),$(riscv_insn_ext_v),) \

diff --git a/riscv/zvksed_ext_macros.h b/riscv/zvksed_ext_macros.h
@@ -0,0 +1,60 @@
+// Helper macros and functions to help implement instructions defined as part of
+// the RISC-V Zvksed extension (vectorized SM4).
+
+#include "insns/sm4_common.h"
+#include "zvk_ext_macros.h"
+
+#ifndef RISCV_ZVKSED_MACROS_H_
+#define RISCV_ZVKSED_MACROS_H_
+
+// Constraints shared by every vsm4* instruction (vsm4k.vi, vsm4r.vs, vsm4r.vv):
+//  - Zvksed is enabled
+//  - VSEW == 32
+//  - EGW (128) <= LMUL * VLEN
+//
+// The constraint that vstart and vl are both EGS (4) aligned
+// is checked in the VI_ZVK_..._EGU32x4_..._LOOP macros.
+#define require_vsm4_constraints \
+  do { \
+    require_zvksed; \
+    require(P.VU.vsew == 32); \
+    require_egw_fits(128); \
+  } while (false)
+
+// Returns a uint32_t value constructed from the 4 bytes (uint8_t) provided in
+// "Little Endian" (LE) order, i.e., from least significant (B0) to most
+// significant (B3). Each byte is widened to uint32_t before being shifted.
+#define ZVKSED_U32_FROM_U8_LE(B0, B1, B2, B3) \
+  (((uint32_t)(B0)) <<  0 | \
+   ((uint32_t)(B1)) <<  8 | \
+   ((uint32_t)(B2)) << 16 | \
+   ((uint32_t)(B3)) << 24)
+
+// Look up entry BYTE of the SM4 S-box table (sm4_sbox, from insns/sm4_common.h).
+#define ZVKSED_SBOX(BYTE)  (sm4_sbox[(BYTE)])
+
+// Given an unsigned integer value 'X' and a byte index (0 = least significant), returns
+// that byte as a uint8_t. Both arguments are parenthesized, so 'i + 1' indexes correctly.
+#define ZVKSED_EXTRACT_U8(X, BYTE_IDX) ((uint8_t)((X) >> ((BYTE_IDX) * 8)))
+
+// Apply the nonlinear transformation tau to a 32 bit word B - section 6.2.1. of
+// the IETF draft. B is expanded four times: pass a side-effect-free expression.
+#define ZVKSED_SUB_BYTES(B) \
+  ZVKSED_U32_FROM_U8_LE(ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 0)), \
+                        ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 1)), \
+                        ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 2)), \
+                        ZVKSED_SBOX(ZVKSED_EXTRACT_U8((B), 3)))
+
+// Perform the linear transformation L to a 32 bit word S and xor it with a 32 bit
+// word X - section 6.2.2. of the IETF draft. S is expanded five times, X once.
+#define ZVKSED_ROUND(X, S) \
+  ((X) ^ \
+   ((S) ^ ZVK_ROL32((S), 2) ^ ZVK_ROL32((S), 10) ^ \
+    ZVK_ROL32((S), 18) ^ ZVK_ROL32((S), 24)))
+
+// Perform the linear transformation L' (rotations by 13 and 23) to a 32 bit word S and
+// xor it with a 32 bit word X - section 6.2.2. of the IETF draft. S is expanded 3 times.
+#define ZVKSED_ROUND_KEY(X, S) \
+  ((X) ^ ((S) ^ ZVK_ROL32((S), 13) ^ ZVK_ROL32((S), 23)))
+
+#endif // RISCV_ZVKSED_MACROS_H_