diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv
index 7a89f04b3..13b5b0424 100644
--- a/hardware/include/ara_pkg.sv
+++ b/hardware/include/ara_pkg.sv
@@ -229,6 +229,77 @@ package ara_pkg;
     logic [51:0] m;
   } fp64_t;
 
+  // Number of explicit mantissa (fraction) bits of the floating-point format selected by the
+  // element width `fp_dtype` and the alternative-encoding flag `is_alt`.
+  // Combinations that are not listed fall through to the default branch (see the note there).
+  function automatic int unsigned fp_mantissa_bits(rvv_pkg::vew_e fp_dtype, logic is_alt);
+    unique case ({fp_dtype, is_alt})
+      {rvv_pkg::EW8,  1'b0}: fp_mantissa_bits = 2;
+      {rvv_pkg::EW8,  1'b1}: fp_mantissa_bits = 3;
+      {rvv_pkg::EW16, 1'b0}: fp_mantissa_bits = 10;
+      {rvv_pkg::EW16, 1'b1}: fp_mantissa_bits = 7;
+      {rvv_pkg::EW32, 1'b0}: fp_mantissa_bits = 23;
+      {rvv_pkg::EW64, 1'b0}: fp_mantissa_bits = 52;
+      default: fp_mantissa_bits = -1; // NOTE(review): return type is unsigned, so this reads back as 32'hFFFF_FFFF
+    endcase
+  endfunction
+
+  // Widen an fp16 value to fp32.
+  // fp16_m_lzc is the leading-zero count of fp16.m (same width as the lzc output driven by the
+  // callers). It is only used for subnormal inputs, which are renormalized: the mantissa is
+  // shifted left past its leading one and the exponent is lowered by the same amount.
+  function automatic fp32_t fp32_from_fp16(fp16_t fp16, logic [$clog2(fp_mantissa_bits(rvv_pkg::EW16, 0))-1:0] fp16_m_lzc);
+    automatic fp16_t fp16_temp;
+    automatic fp32_t fp32;
+
+    // Wide sign
+    fp32.s = fp16.s;
+
+    // Wide exponent
+    // 127 - 15 = 112
+    unique case(fp16.e)
+      '0:      fp32.e = (fp16.m == '0) ? '0 : 8'd112 - {4'd0, fp16_m_lzc}; // Zero or Subnormal
+      '1:      fp32.e = '1;                                                // Inf or NaN
+      default: fp32.e = 8'd112 + {3'd0, fp16.e};                           // Normal
+    endcase
+
+    // Wide mantissa
+    // If the input is NaN, output a quiet NaN mantissa.
+    // Otherwise, append trailing zeros to the (renormalized, if subnormal) mantissa.
+    fp16_temp.m = ((fp16.e == '0) && (fp16.m != '0)) ? (fp16.m << 1) << fp16_m_lzc : fp16.m;
+    fp32.m      = ((fp16.e == '1) && (fp16.m != '0) ) ? {1'b1, 22'b0} : {fp16_temp.m, 13'b0};
+
+    fp32_from_fp16 = fp32;
+  endfunction
+
+  // Widen an fp32 value to fp64.
+  // fp32_m_lzc is the leading-zero count of fp32.m (same width as the lzc output driven by the
+  // callers). It is only used for subnormal inputs, which are renormalized: the mantissa is
+  // shifted left past its leading one and the exponent is lowered by the same amount.
+  function automatic fp64_t fp64_from_fp32(fp32_t fp32, logic [$clog2(fp_mantissa_bits(rvv_pkg::EW32, 0))-1:0] fp32_m_lzc);
+    automatic fp32_t fp32_temp;
+    automatic fp64_t fp64;
+
+    // Wide sign
+    fp64.s = fp32.s;
+
+    // Wide exponent
+    // 1023 - 127 = 896
+    unique case(fp32.e)
+      '0:      fp64.e = (fp32.m == '0) ? '0 : 11'd896 - {6'd0, fp32_m_lzc}; // Zero or Subnormal
+      '1:      fp64.e = '1;                                                 // Inf or NaN
+      default: fp64.e = 11'd896 + {3'd0, fp32.e};                           // Normal
+    endcase
+
+    // Wide mantissa
+    // If the input is NaN, output a quiet NaN mantissa.
+    // Otherwise, append trailing zeros to the (renormalized, if subnormal) mantissa.
+    fp32_temp.m = ((fp32.e == '0) && (fp32.m != '0)) ? (fp32.m << 1) << fp32_m_lzc : fp32.m;
+    fp64.m      = ((fp32.e == '1) && (fp32.m != '0)) ? {1'b1, 51'b0} : {fp32_temp.m, 29'b0};
+
+    fp64_from_fp32 = fp64;
+  endfunction
+
   /////////////////////////////
   // Accelerator interface //
   /////////////////////////////
diff --git a/hardware/src/lane/operand_queue.sv b/hardware/src/lane/operand_queue.sv
index 7c416acd5..640d77f14 100644
--- a/hardware/src/lane/operand_queue.sv
+++ b/hardware/src/lane/operand_queue.sv
@@ -155,42 +155,46 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
 
   // Helper to fill with neutral values the last packet
   logic incomplete_packet, last_packet;
+  ////////////////////////////////
+  // Floating-point conversion //
+  ////////////////////////////////
 
-  logic [3:0] lzc_count16[2];
-  logic [4:0] lzc_count32;
+  logic [$clog2(fp_mantissa_bits(EW16, 0))-1:0] fp16_m_lzc[2]; // 4 bits each
+  logic [$clog2(fp_mantissa_bits(EW32, 0))-1:0] fp32_m_lzc;    // 5 bits
 
   fp16_t fp16[2];
   fp32_t fp32;
 
-  if (FPUSupport != FPUSupportNone) begin
-    // To convert subnormal numbers to normalized form in floating-point numbers,
-    // it is necessary to determine the number of leading zeros in the mantissa.
-    // This is typically accomplished using a lzc (leading zero count) module,
-    // which can accurately count the number of leading zeros in a given number.
-    // By knowing the number of leading zeros in the mantissa, we can properly
-    // adjust the exponent and shift the binary point to achieve a normalized
-    // representation of the number.
-
+  // To convert subnormal numbers to normalized form in floating-point numbers,
+  // it is necessary to determine the number of leading zeros in the mantissa.
+  // This is typically accomplished using a lzc (leading zero count) module,
+  // which can accurately count the number of leading zeros in a given number.
+  // By knowing the number of leading zeros in the mantissa, we can properly
+  // adjust the exponent and shift the binary point to achieve a normalized
+  // representation of the number.
+  if ({RVVH(FPUSupport), RVVF(FPUSupport)} == 2'b11) begin
     // sew: 16-bit
-    for (genvar i = 0; i < 2; i = i + 1) begin
+    for (genvar i = 0; i < 2; i++) begin
       lzc #(
-        .WIDTH(10),
-        .MODE (1 )
+        .WIDTH(fp_mantissa_bits(EW16, 0)),
+        .MODE (1)
       ) leading_zero_e16_i (
-        .in_i    ( fp16[i].m      ),
-        .cnt_o   ( lzc_count16[i] ),
-        .empty_o ( /*Unused*/     )
+        .in_i   (fp16[i].m    ),
+        .cnt_o  (fp16_m_lzc[i]),
+        .empty_o(/*Unused*/   )
       );
     end
+  end
 
+  if ({RVVF(FPUSupport), RVVD(FPUSupport)} == 2'b11) begin
     // sew: 32-bit
     lzc #(
-      .WIDTH (23),
-      .MODE  (1 )
-    ) leading_zero_e32(
-      .in_i    ( fp32.m      ),
-      .cnt_o   ( lzc_count32 ),
-      .empty_o ( /*Unused*/  )
+      .WIDTH(fp_mantissa_bits(EW32, 0)),
+      .MODE (1)
+    ) leading_zero_e32 (
+      .in_i   (fp32.m    ),
+      .cnt_o  (fp32_m_lzc),
+      .empty_o(/*Unused*/)
     );
   end
 
@@ -204,6 +208,9 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
     incomplete_packet = 1'b0;
     last_packet       = 1'b0;
 
+    for (int i = 0; i < 2; i++) fp16[i] = '0; // default values for the lzc inputs
+    fp32 = '0; // fp32 is one packed struct: assign it whole (fp32[i] would select a single bit)
+
     // Reductions need to mask away the inactive elements
     // A temporary solution is to send a neutral value directly
     // from the opqueues
@@ -376,56 +383,13 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
         unique casez ({cmd.eew, RVVH(FPUSupport), RVVF(FPUSupport), RVVD(FPUSupport)})
           {EW16, 1'b1, 1'b1, 1'b?}: begin
             for (int e = 0; e < 2; e++) begin
-              automatic fp32_t fp32_o;
-              automatic fp16_t fp16_temp;
-              automatic logic [7:0] fp32_exp;
-
               fp16[e] = ibuf_operand[8*select + 32*e +: 16];
-
-              fp16_temp.m = ((fp16[e].e == '0) && (fp16[e].m != '0)) ? fp16[e].m << (5'd1 + {1'd0, lzc_count16[e]}) : fp16[e].m;
-
-              fp32_exp = (fp16[e].m == '0) ? '0 : 8'd112 - {4'd0, lzc_count16[e]}; //127 - 15 = 112
-
-              unique case(fp16[e].e)
-                '0: fp32_o.e = fp32_exp; // Zero or Subnormal
-                '1: fp32_o.e = '1; // NaN
-                default: fp32_o.e = 8'd112 + {3'd0, fp16[e].e}; // Normal ,127 - 15 = 112
-              endcase
-
-              fp32_o.s = fp16[e].s;
-
-              // If the input is NaN, output a quiet NaN mantissa.
-              // Otherwise, append trailing zeros to the mantissa.
-              fp32_o.m = ((fp16[e].e == '1) && (fp16[e].m != '0) ) ? {1'b1, 22'b0} : {fp16_temp.m, 13'b0};
-
-              conv_operand[32*e +: 32] = fp32_o;
+              conv_operand[32*e +: 32] = fp32_from_fp16(fp16[e], fp16_m_lzc[e]);
             end
           end
           {EW32, 1'b?, 1'b1, 1'b1}: begin
-            automatic fp64_t fp64;
-            automatic fp32_t fp32_temp;
-
-            automatic logic [10:0] fp64_exp;
-
-            fp32 = ibuf_operand[8*select +: 32];
-
-            fp32_temp.m = ((fp32.e == '0) && (fp32.m != '0)) ? fp32.m << (8'd1 + {3'd0, lzc_count32}) : fp32.m;
-
-            fp64_exp = (fp32.m == '0) ? '0 : 11'd896 - {6'd0, lzc_count32}; //1023 - 127 = 896
-
-            unique case(fp32.e)
-              '0: fp64.e = fp64_exp; // Zero or Subnormal
-              '1: fp64.e = '1; // NaN
-              default: fp64.e = 11'd896 + {3'd0, fp32.e}; // Normal , 1023 - 127 = 896
-            endcase
-
-            fp64.s = fp32.s;
-
-            // If the input is NaN, output a quiet NaN mantissa.
-            // Otherwise, append trailing zeros to the mantissa.
-            fp64.m = ((fp32.e == '1) && (fp32.m != '0)) ? {1'b1, 51'b0} : {fp32_temp.m, 29'b0};
-
-            conv_operand = fp64;
+            fp32         = ibuf_operand[8*select +: 32];
+            conv_operand = fp64_from_fp32(fp32, fp32_m_lzc);
           end
           default:;
         endcase
diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv
index c5c05a291..a0eb10f9d 100644
--- a/hardware/src/lane/vmfpu.sv
+++ b/hardware/src/lane/vmfpu.sv
@@ -724,6 +724,50 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
       15: reduction_rx_cnt_init = reduction_rx_cnt_t'(4);
     endcase
   endfunction: reduction_rx_cnt_init
+
+  ////////////////////////////////
+  // Floating-point conversion //
+  ////////////////////////////////
+
+  logic [$clog2(fp_mantissa_bits(EW16, 0))-1:0] fp16_m_lzc[2]; // 4 bits each
+  logic [$clog2(fp_mantissa_bits(EW32, 0))-1:0] fp32_m_lzc;    // 5 bits
+
+  fp16_t fp16[2];
+  fp32_t fp32;
+
+  // To convert subnormal numbers to normalized form in floating-point numbers,
+  // it is necessary to determine the number of leading zeros in the mantissa.
+  // This is typically accomplished using a lzc (leading zero count) module,
+  // which can accurately count the number of leading zeros in a given number.
+  // By knowing the number of leading zeros in the mantissa, we can properly
+  // adjust the exponent and shift the binary point to achieve a normalized
+  // representation of the number.
+  if ({RVVH(FPUSupport), RVVF(FPUSupport)} == 2'b11) begin
+    // sew: 16-bit
+    for (genvar i = 0; i < 2; i++) begin
+      lzc #(
+        .WIDTH(fp_mantissa_bits(EW16, 0)),
+        .MODE (1)
+      ) leading_zero_e16_i (
+        .in_i   (fp16[i].m    ),
+        .cnt_o  (fp16_m_lzc[i]),
+        .empty_o(/*Unused*/   )
+      );
+    end
+  end
+
+  if ({RVVF(FPUSupport), RVVD(FPUSupport)} == 2'b11) begin
+    // sew: 32-bit
+    lzc #(
+      .WIDTH(fp_mantissa_bits(EW32, 0)),
+      .MODE (1)
+    ) leading_zero_e32 (
+      .in_i   (fp32.m    ),
+      .cnt_o  (fp32_m_lzc),
+      .empty_o(/*Unused*/)
+    );
+  end
+
   ///////////
   // FPU //
   ///////////
@@ -1342,6 +1386,9 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
       ? {vinsn_issue_q.use_vs2, vinsn_issue_q.use_vd_op, vinsn_issue_q.use_vs1}
       : {vinsn_issue_q.use_vd_op, vinsn_issue_q.use_vs2, vinsn_issue_q.use_vs1};
 
+    for (int i = 0; i < 2; i++) fp16[i] = '0; // default values for the lzc inputs
+    fp32 = '0; // fp32 is one packed struct: assign it whole (fp32[i] would select a single bit)
+
     first_op_d         = first_op_q;
     simd_red_cnt_d     = simd_red_cnt_q;
     reduction_rx_cnt_d = reduction_rx_cnt_q;
@@ -2165,24 +2212,18 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
             RVVD(FPUSupport)})
           {EW32, 1'b1, 1'b1, 1'b?}: begin
+            // Sample the fp16 scalar for both lanes before converting: the loop below
+            // overwrites scalar_op[31:0] in its first iteration, so it must not be re-read.
+            for (int e = 0; e < 2; e++)
+              fp16[e] = vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[15:0];
             for (int e = 0; e < 2; e++) begin
-              automatic fp16_t fp16 =
-                vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[15:0];
-              automatic fp32_t fp32;
-              fp32.s = fp16.s;
-              fp32.e = (fp16.e - 15) + 127;
-              fp32.m = {fp16.m, 13'b0};
-
-              vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[32*e +: 32] = fp32;
+              vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[32*e +: 32] =
+                fp32_from_fp16(fp16[e], fp16_m_lzc[e]);
             end
           end
           {EW64, 1'b?, 1'b1, 1'b1}: begin
-            automatic fp32_t fp32 = vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[31:0];
-            automatic fp64_t fp64;
-            fp64.s = fp32.s;
-            fp64.e = (fp32.e - 127) + 1023;
-            fp64.m = {fp32.m, 29'b0};
-
-            vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op = fp64;
+            fp32 = vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op[31:0];
+            vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].scalar_op =
+              fp64_from_fp32(fp32, fp32_m_lzc);
           end
           default:;
         endcase