[hardware] 🐛 Parametrize and fix conversion in VMFPU

pulp-platform · Aug 22, 2024 · 2ddce5e · 2ddce5e
1 parent 1fb435e
commit 2ddce5e
Show file tree

Hide file tree

Showing 3 changed files with 146 additions and 84 deletions.
diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv
@@ -229,6 +229,66 @@ package ara_pkg;
     logic [51:0] m;
   } fp64_t;
 
+  // Number of mantissa bits of the floating-point format selected by the element
+  // width `fp_dtype` and the alternative-format flag `is_alt`.
+  // Any {fp_dtype, is_alt} pair without an arm of its own ends up in the default
+  // arm, whose -1 is stored by the `int unsigned` result as 32'hFFFF_FFFF.
+  function automatic int unsigned fp_mantissa_bits(rvv_pkg::vew_e fp_dtype, logic is_alt);
+    automatic int unsigned n_bits;
+
+    unique case ({fp_dtype, is_alt})
+      // Widest formats first
+      {rvv_pkg::EW64, 1'b0}: n_bits = 52;
+      {rvv_pkg::EW32, 1'b0}: n_bits = 23;
+      {rvv_pkg::EW16, 1'b1}: n_bits = 7;
+      {rvv_pkg::EW16, 1'b0}: n_bits = 10;
+      {rvv_pkg::EW8,  1'b1}: n_bits = 3;
+      {rvv_pkg::EW8,  1'b0}: n_bits = 2;
+      default:               n_bits = -1;
+    endcase
+
+    fp_mantissa_bits = n_bits;
+  endfunction
+
+  // Widen an fp16 value to fp32.
+  // `fp16_m_lzc` is the leading-zero count of fp16.m. It only influences the
+  // result for a subnormal input (all-zero exponent, non-zero mantissa).
+  function automatic fp32_t fp32_from_fp16(fp16_t fp16, logic [$clog2(fp_mantissa_bits(rvv_pkg::EW16, 0)):0] fp16_m_lzc);
+    automatic fp32_t widened;
+    automatic fp16_t normalized;
+    automatic logic  exp_is_zero;
+    automatic logic  exp_is_ones;
+    automatic logic  mant_is_zero;
+
+    exp_is_zero  = (fp16.e == '0);
+    exp_is_ones  = (fp16.e == '1);
+    mant_is_zero = (fp16.m == '0);
+
+    // The sign carries over unchanged
+    widened.s = fp16.s;
+
+    // Exponent: re-bias by 127 - 15 = 112
+    if (exp_is_zero) begin
+      // Zero keeps an all-zero exponent. A subnormal gives up one exponent step
+      // per leading zero of its mantissa.
+      widened.e = mant_is_zero ? '0 : 8'd112 - {4'd0, fp16_m_lzc};
+    end else if (exp_is_ones) begin
+      // An all-ones exponent (Inf or NaN) stays all-ones
+      widened.e = '1;
+    end else begin
+      // Normal number
+      widened.e = 8'd112 + {3'd0, fp16.e};
+    end
+
+    // Mantissa: a subnormal is shifted left by (lzc + 1) positions, so its leading
+    // one drops out of the field. Every other input keeps its mantissa.
+    normalized.m = (exp_is_zero && !mant_is_zero) ? (fp16.m << 1) << fp16_m_lzc : fp16.m;
+    // A NaN input yields the quiet-NaN mantissa. Otherwise the mantissa is
+    // padded with trailing zeros.
+    widened.m = (exp_is_ones && !mant_is_zero) ? {1'b1, 22'b0} : {normalized.m, 13'b0};
+
+    return widened;
+  endfunction
+
+  // Widen an fp32 value to fp64.
+  // `fp32_m_lzc` is the leading-zero count of fp32.m. It only influences the
+  // result for a subnormal input (all-zero exponent, non-zero mantissa).
+  function automatic fp64_t fp64_from_fp32(fp32_t fp32, logic [$clog2(fp_mantissa_bits(rvv_pkg::EW32, 0)):0] fp32_m_lzc);
+    automatic fp64_t widened;
+    automatic fp32_t normalized;
+    automatic logic  exp_is_zero;
+    automatic logic  exp_is_ones;
+    automatic logic  mant_is_zero;
+
+    exp_is_zero  = (fp32.e == '0);
+    exp_is_ones  = (fp32.e == '1);
+    mant_is_zero = (fp32.m == '0);
+
+    // The sign carries over unchanged
+    widened.s = fp32.s;
+
+    // Exponent: re-bias by 1023 - 127 = 896
+    if (exp_is_zero) begin
+      // Zero keeps an all-zero exponent. A subnormal gives up one exponent step
+      // per leading zero of its mantissa.
+      widened.e = mant_is_zero ? '0 : 11'd896 - {6'd0, fp32_m_lzc};
+    end else if (exp_is_ones) begin
+      // An all-ones exponent (Inf or NaN) stays all-ones
+      widened.e = '1;
+    end else begin
+      // Normal number
+      widened.e = 11'd896 + {3'd0, fp32.e};
+    end
+
+    // Mantissa: a subnormal is shifted left by (lzc + 1) positions, so its leading
+    // one drops out of the field. Every other input keeps its mantissa.
+    normalized.m = (exp_is_zero && !mant_is_zero) ? (fp32.m << 1) << fp32_m_lzc : fp32.m;
+    // A NaN input yields the quiet-NaN mantissa. Otherwise the mantissa is
+    // padded with trailing zeros.
+    widened.m = (exp_is_ones && !mant_is_zero) ? {1'b1, 51'b0} : {normalized.m, 29'b0};
+
+    return widened;
+  endfunction
+
   /////////////////////////////
   //  Accelerator interface  //
   /////////////////////////////

diff --git a/hardware/src/lane/operand_queue.sv b/hardware/src/lane/operand_queue.sv
@@ -155,42 +155,46 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
   // Helper to fill with neutral values the last packet
   logic incomplete_packet, last_packet;
 
+  ////////////////////////////////
+  //  Floating-point conversion //
+  ////////////////////////////////
 
-  logic [3:0] lzc_count16[2];
-  logic [4:0] lzc_count32;
+  logic [$clog2(fp_mantissa_bits(EW16, 0))-1:0] fp16_m_lzc[2]; // $clog2(10) = 4 bits per entry
+  logic [$clog2(fp_mantissa_bits(EW32, 0))-1:0] fp32_m_lzc;    // $clog2(23) = 5 bits
 
   fp16_t fp16[2];
   fp32_t fp32;
 
-  if (FPUSupport != FPUSupportNone) begin
-   // To convert subnormal numbers to normalized form in floating-point numbers,
-   // it is necessary to determine the number of leading zeros in the mantissa.
-   // This is typically accomplished using a lzc (leading zero count) module,
-   // which can accurately count the number of leading zeros in a given number.
-   // By knowing the number of leading zeros in the mantissa, we can properly
-   // adjust the exponent and shift the binary point to achieve a normalized
-   // representation of the number.
-
+  // Widening a subnormal input requires normalizing it first, which needs the
+  // number of leading zeros in its mantissa. An lzc (leading zero count) module
+  // provides that count for each mantissa. The count tells how far the mantissa
+  // has to be shifted and by how much the exponent has to be lowered, and it is
+  // handed to fp32_from_fp16 / fp64_from_fp32 together with the value itself.
+  // The counters exist only when RVVH and RVVF (16-bit counters) or RVVF and
+  // RVVD (32-bit counter) both evaluate to 1 for FPUSupport.
+  if ({RVVH(FPUSupport), RVVF(FPUSupport)} == 2'b11) begin
     // sew: 16-bit
-    for (genvar i = 0; i < 2; i = i + 1) begin
+    for (genvar i = 0; i < 2; i++) begin
       lzc #(
-        .WIDTH(10),
-        .MODE (1 )
+        .WIDTH(fp_mantissa_bits(EW16, 0)),
+        .MODE (1)
       ) leading_zero_e16_i (
-         .in_i    ( fp16[i].m      ),
-         .cnt_o   ( lzc_count16[i] ),
-         .empty_o ( /*Unused*/     )
+        .in_i   (fp16[i].m    ),
+        .cnt_o  (fp16_m_lzc[i]),
+        .empty_o(/*Unused*/   )
       );
     end
+  end
 
+  if ({RVVF(FPUSupport), RVVD(FPUSupport)} == 2'b11) begin
     // sew: 32-bit
     lzc #(
-       .WIDTH (23),
-       .MODE  (1 )
-     ) leading_zero_e32(
-       .in_i    ( fp32.m      ),
-       .cnt_o   ( lzc_count32 ),
-       .empty_o ( /*Unused*/  )
+       .WIDTH(fp_mantissa_bits(EW32, 0)),
+       .MODE (1)
+     ) leading_zero_e32 (
+       .in_i   (fp32.m    ),
+       .cnt_o  (fp32_m_lzc),
+       .empty_o(/*Unused*/)
      );
   end
 
@@ -204,6 +208,9 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
     incomplete_packet = 1'b0;
     last_packet       = 1'b0;
 
+    // Default the conversion inputs before the conditional assignments further down.
+    // fp32 is a single packed fp32_t, so it is cleared as a whole: fp32[i] would
+    // bit-select the struct and clear bit i only, leaving the other bits without
+    // a default value.
+    for (int i = 0; i < 2; i++) fp16[i] = '0;
+    fp32 = '0;
+
     // Reductions need to mask away the inactive elements
     // A temporary solution is to send a neutral value directly
     // from the opqueues
@@ -376,56 +383,13 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
           unique casez ({cmd.eew, RVVH(FPUSupport), RVVF(FPUSupport), RVVD(FPUSupport)})
             {EW16, 1'b1, 1'b1, 1'b?}: begin
               for (int e = 0; e < 2; e++) begin
-                automatic fp32_t fp32_o;
-                automatic fp16_t fp16_temp;
-                automatic logic [7:0] fp32_exp;
-
                 fp16[e] = ibuf_operand[8*select + 32*e +: 16];
-
-                fp16_temp.m = ((fp16[e].e == '0) && (fp16[e].m != '0)) ? fp16[e].m << (5'd1 + {1'd0, lzc_count16[e]}) : fp16[e].m;
-
-                fp32_exp = (fp16[e].m == '0) ? '0 : 8'd112 - {4'd0, lzc_count16[e]};  //127 - 15 = 112
-
-                unique case(fp16[e].e)
-                  '0:      fp32_o.e = fp32_exp; // Zero or Subnormal
-                  '1:      fp32_o.e = '1; // NaN
-                  default: fp32_o.e = 8'd112 + {3'd0, fp16[e].e}; // Normal ,127 - 15 = 112
-                endcase
-
-                fp32_o.s = fp16[e].s;
-
-                // If the input is NaN, output a quiet NaN mantissa.
-                // Otherwise, append trailing zeros to the mantissa.
-                fp32_o.m = ((fp16[e].e == '1) && (fp16[e].m != '0) ) ? {1'b1, 22'b0} : {fp16_temp.m, 13'b0};
-
-                conv_operand[32*e +: 32] = fp32_o;
+                conv_operand[32*e +: 32] = fp32_from_fp16(fp16[e], fp16_m_lzc[e]);
               end
             end
             {EW32, 1'b?, 1'b1, 1'b1}: begin
-              automatic fp64_t fp64;
-              automatic fp32_t fp32_temp;
-
-              automatic logic [10:0] fp64_exp;
-
-              fp32  = ibuf_operand[8*select +: 32];
-
-              fp32_temp.m = ((fp32.e == '0) && (fp32.m != '0)) ? fp32.m << (8'd1 + {3'd0, lzc_count32}) : fp32.m;
-
-              fp64_exp = (fp32.m == '0) ? '0 : 11'd896 - {6'd0, lzc_count32}; //1023 - 127 = 896
-
-              unique case(fp32.e)
-                '0:      fp64.e = fp64_exp; // Zero or Subnormal
-                '1:      fp64.e = '1; // NaN
-                default: fp64.e = 11'd896 + {3'd0, fp32.e}; // Normal , 1023 - 127 = 896
-              endcase
-
-              fp64.s = fp32.s;
-
-              // If the input is NaN, output a quiet NaN mantissa.
-              // Otherwise, append trailing zeros to the mantissa.
-              fp64.m = ((fp32.e == '1) && (fp32.m != '0)) ? {1'b1, 51'b0} : {fp32_temp.m, 29'b0};
-
-              conv_operand = fp64;
+              fp32 = ibuf_operand[8*select +: 32];
+              conv_operand = fp64_from_fp32(fp32, fp32_m_lzc);
             end
             default:;
           endcase

diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv
@@ -724,6 +724,50 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
       15: reduction_rx_cnt_init = reduction_rx_cnt_t'(4);
     endcase
   endfunction: reduction_rx_cnt_init
+
+  ////////////////////////////////
+  //  Floating-point conversion //
+  ////////////////////////////////
+
+  logic [$clog2(fp_mantissa_bits(EW16, 0))-1:0] fp16_m_lzc[2]; // $clog2(10) = 4 bits per entry
+  logic [$clog2(fp_mantissa_bits(EW32, 0))-1:0] fp32_m_lzc;    // $clog2(23) = 5 bits
+
+  fp16_t fp16[2]; // fp16 inputs; their mantissas feed the two 16-bit counters
+  fp32_t fp32;    // fp32 input; its mantissa feeds the 32-bit counter
+
+  // Widening a subnormal input requires normalizing it first, which needs the
+  // number of leading zeros in its mantissa. An lzc (leading zero count) module
+  // provides that count for each mantissa. The count tells how far the mantissa
+  // has to be shifted and by how much the exponent has to be lowered, and it is
+  // handed to fp32_from_fp16 / fp64_from_fp32 together with the value itself.
+  // The counters exist only when RVVH and RVVF (16-bit counters) or RVVF and
+  // RVVD (32-bit counter) both evaluate to 1 for FPUSupport.
+  if ({RVVH(FPUSupport), RVVF(FPUSupport)} == 2'b11) begin
+    // 16-bit inputs: one counter per fp16 entry
+    for (genvar i = 0; i < 2; i++) begin
+      lzc #(
+        .WIDTH(fp_mantissa_bits(EW16, 0)),
+        .MODE (1)
+      ) leading_zero_e16_i (
+        .in_i   (fp16[i].m    ),
+        .cnt_o  (fp16_m_lzc[i]),
+        .empty_o(/*Unused*/   )
+      );
+    end
+  end
+
+  if ({RVVF(FPUSupport), RVVD(FPUSupport)} == 2'b11) begin
+    // 32-bit input: a single counter
+    lzc #(
+       .WIDTH(fp_mantissa_bits(EW32, 0)),
+       .MODE (1)
+     ) leading_zero_e32 (
+       .in_i   (fp32.m    ),
+       .cnt_o  (fp32_m_lzc),
+       .empty_o(/*Unused*/)
+     );
+  end
+
   ///////////
   //  FPU  //
   ///////////
@@ -1342,6 +1386,9 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
                    ? {vinsn_issue_q.use_vs2, vinsn_issue_q.use_vd_op, vinsn_issue_q.use_vs1}
                    : {vinsn_issue_q.use_vd_op, vinsn_issue_q.use_vs2, vinsn_issue_q.use_vs1};
 
+    // Default the conversion inputs before the conditional assignments further down.
+    // fp32 is a single packed fp32_t, so it is cleared as a whole: fp32[i] would
+    // bit-select the struct and clear bit i only, leaving the other bits without
+    // a default value.
+    for (int i = 0; i < 2; i++) fp16[i] = '0;
+    fp32 = '0;
+
     first_op_d              = first_op_q;
     simd_red_cnt_d          = simd_red_cnt_q;
     reduction_rx_cnt_d      = reduction_rx_cnt_q;
@@ -2165,24 +2212,15 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
             RVVD(FPUSupport)})
             {EW32, 1'b1, 1'b1, 1'b?}: begin
               for (int e = 0; e < 2; e++) begin
-                automatic fp16_t fp16 =
-                  vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[15:0];
-                automatic fp32_t fp32;
-                fp32.s = fp16.s;
-                fp32.e = (fp16.e - 15) + 127;
-                fp32.m = {fp16.m, 13'b0};
-
-                vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[32*e +: 32] = fp32;
+                // Both 32-bit halves must be derived from the original fp16 scalar.
+                // The e == 0 pass overwrites scalar_op[31:0], which includes the fp16
+                // source bits, so the e == 1 pass reuses the copy kept in fp16[0].
+                fp16[e] = (e == 0)
+                        ? vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[15:0]
+                        : fp16[0];
+                vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[32*e +: 32] =
+                  fp32_from_fp16(fp16[e], fp16_m_lzc[e]);
               end
             end
             {EW64, 1'b?, 1'b1, 1'b1}: begin
-              automatic fp32_t fp32 = vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[31:0];
-              automatic fp64_t fp64;
-              fp64.s = fp32.s;
-              fp64.e = (fp32.e - 127) + 1023;
-              fp64.m = {fp32.m, 29'b0};
-
-              vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op = fp64;
+              fp32 = vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[31:0];
+              vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op =
+                fp64_from_fp32(fp32, fp32_m_lzc);
             end
             default:;
           endcase