Xilinx · lstasytis · Sep 16, 2024 · Sep 18, 2024 · Sep 18, 2024
diff --git a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py
@@ -236,100 +236,90 @@ def docompute(self):
             raise ValueError("Invalid IODMA direction, please set to in or out")
         # define templates for instantiation
         dma_inst_template = func + "<DataWidth1, NumBytes1>(%s, %s, numReps);"
-        dwc_inst_template = dwc_func + "<%d, %d, %d>(%s, %s, numReps);"
+        dwc_inst_template = dwc_func + "<%d, %d, %d, %d, %d>(%s, %s, numReps);"
         # do stream infrastructure and instantiations
         intfw = self.get_nodeattr("intfWidth")
         strmw = self.get_nodeattr("streamWidth")
-        width_lcm = (strmw * intfw) // math.gcd(strmw, intfw)
+
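-        # we always need two streams: one of width_lcm, and one of intfw width
-        # because we use WidthAdjustedInputStream,
+        # a single generalized DWC converts between any two widths, so besides
+        # the DMA itself we need at most one extra stream, of intfw width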
         dtype_bits = self.get_input_datatype().bitwidth()
         total_bits = dtype_bits * np.prod(self.get_normal_input_shape())
 
         if direction == "in":
+            inWidth = intfw
+            outWidth = strmw
+
+            numInWords = total_bits // inWidth
+            numOutWords = total_bits // outWidth
+            totalIters = max(numInWords, numOutWords)
+
+            if outWidth > inWidth:
+                totalIters += int(np.floor(outWidth / inWidth) + 1) - 1
+
             # AXI MM -> IODMA -> (DWCs) -> out
             # DWCs depend on AXI MM and out interface width
             if strmw == intfw:
                 # case 0: AXI MM width = out width, no DWCs needed
                 self.code_gen_dict["$DOCOMPUTE$"] = [
                     dma_inst_template % ("in0_" + self.hls_sname(), "out_" + self.hls_sname())
                 ]
-            elif (strmw % intfw == 0) or (intfw % strmw == 0):
-                # case 1: AXI MM width divisible by out width or vice versa
-                # single DWC + single extra stream needed
+            else:
+                # case 1: Need to perform a data width conversion
+                # we use the HLS variant here
+                # TODO: use RTL variant if possible
                 self.code_gen_dict["$DOCOMPUTE$"] = [
                     "hls::stream<ap_uint<%d> > dma2dwc;" % intfw,
                     dma_inst_template % ("in0_" + self.hls_sname(), "dma2dwc"),
                     dwc_inst_template
                     % (
-                        intfw,
-                        strmw,
-                        total_bits // intfw,
+                        inWidth,
+                        outWidth,
+                        numInWords,
+                        numOutWords,
+                        totalIters,
                         "dma2dwc",
                         "out_" + self.hls_sname(),
                     ),
                 ]
-            else:
-                # case 2: AXI MM width not divisible by out width or vice versa
-                # need 2 DWCs (going through the least common multiple width)
-                # and 2 streams
-                self.code_gen_dict["$DOCOMPUTE$"] = [
-                    "hls::stream<ap_uint<%d> > dma2lcm;" % intfw,
-                    "hls::stream<ap_uint<%d> > lcm2out;" % width_lcm,
-                    dma_inst_template % ("in0_" + self.hls_sname(), "dma2lcm"),
-                    dwc_inst_template
-                    % (intfw, width_lcm, total_bits // intfw, "dma2lcm", "lcm2out"),
-                    dwc_inst_template
-                    % (
-                        width_lcm,
-                        strmw,
-                        total_bits // width_lcm,
-                        "lcm2out",
-                        "out_" + self.hls_sname(),
-                    ),
-                ]
+
         elif direction == "out":
+            inWidth = strmw
+            outWidth = intfw
+
+            numInWords = total_bits // inWidth
+            numOutWords = total_bits // outWidth
+            totalIters = max(numInWords, numOutWords)
+
+            if outWidth > inWidth:
+                totalIters += int(np.floor(outWidth / inWidth) + 1) - 1
+
             # in0 -> (DWCs) -> IODMA -> AXI MM
             # DWCs depend on AXI MM and out interface width
             if strmw == intfw:
                 # case 0: in width = AXI MM width, no DWCs needed
                 self.code_gen_dict["$DOCOMPUTE$"] = [
                     dma_inst_template % ("in0_" + self.hls_sname(), "out_" + self.hls_sname())
                 ]
-            elif (strmw % intfw == 0) or (intfw % strmw == 0):
-                # case 1: AXI MM width divisible by in width or vice versa
-                # single DWC + single extra stream needed
+            else:
+                # case 1: Need to perform a data width conversion
+                # we use the HLS variant here
+                # TODO: use RTL variant if possible
                 self.code_gen_dict["$DOCOMPUTE$"] = [
                     "hls::stream<ap_uint<%d> > dwc2dma;" % intfw,
                     dwc_inst_template
                     % (
-                        strmw,
-                        intfw,
-                        total_bits // strmw,
+                        inWidth,
+                        outWidth,
+                        numInWords,
+                        numOutWords,
+                        totalIters,
                         "in0_" + self.hls_sname(),
                         "dwc2dma",
                     ),
                     dma_inst_template % ("dwc2dma", "out_" + self.hls_sname()),
                 ]
-            else:
-                # case 2: AXI MM width not divisible by out width or vice versa
-                # need 2 DWCs (going through the least common multiple width)
-                # and 2 streams
-                self.code_gen_dict["$DOCOMPUTE$"] = [
-                    "hls::stream<ap_uint<%d> > in2lcm;" % width_lcm,
-                    "hls::stream<ap_uint<%d> > lcm2dma;" % intfw,
-                    dwc_inst_template
-                    % (
-                        strmw,
-                        width_lcm,
-                        total_bits // strmw,
-                        "in0_" + self.hls_sname(),
-                        "in2lcm",
-                    ),
-                    dwc_inst_template
-                    % (width_lcm, intfw, total_bits // width_lcm, "in2lcm", "lcm2dma"),
-                    dma_inst_template % ("lcm2dma", "out_" + self.hls_sname()),
-                ]
+
         else:
             raise Exception("Unknown IODMA direction: %s" % direction)
 

diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py
@@ -29,7 +29,7 @@
 import numpy as np
 import os
 from qonnx.core.datatype import DataType
-
+import math
 from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
 from finn.custom_op.fpgadataflow.streamingdatawidthconverter import (
     StreamingDataWidthConverter,
@@ -41,7 +41,7 @@
 
 
 class StreamingDataWidthConverter_hls(StreamingDataWidthConverter, HLSBackend):
-    """Class that corresponds to finn-hlslib StreamingDataWidthConverter_Batch
+    """Class that corresponds to finn-hlslib StreamingDataWidthConverterGeneralized_Batch
     function."""
 
     def get_nodeattr_types(self):
@@ -54,22 +54,38 @@ def global_includes(self):
         self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
 
     def defines(self, var):
-        numReps = 1
-        numInWords = int(np.prod(self.get_folded_input_shape()[:-1]))
+        """Emit the C++ macros consumed by the docompute() template call.
+
+        Sets $DEFINES$ to InWidth, OutWidth, NumInWords, NumOutWords,
+        totalIters and numReps; ``var`` is not used here.
+        """
+        # in cases of convolution input generator and downsampling,
+        # we have a 4D input and padding / cropping can only happen
+        # for the final 2 dimensions, so numReps covers all leading
+        # dimensions (+ batching if shape[0] != 1)
+        folded_ishape = self.get_folded_input_shape()
+        numReps = int(np.prod(folded_ishape[:-2]))
+
+        # assuming folded shapes are at least 2 dim-long
+        numInWords = int(np.prod(folded_ishape[-2:-1]))
+        numOutWords = int(np.prod(self.get_folded_output_shape()[-2:-1]))
+
         inWidth = self.get_nodeattr("inWidth")
         outWidth = self.get_nodeattr("outWidth")
+        totalIters = max(numInWords, numOutWords)
+        # if we are building up a word, the overall loop count is longer:
+        # one extra iteration per whole input word that fits in an output word
+        if outWidth > inWidth:
+            totalIters += outWidth // inWidth
+
         self.code_gen_dict["$DEFINES$"] = [
             "#define InWidth %d " % inWidth,
             "#define OutWidth %d " % outWidth,
             "#define NumInWords %d " % numInWords,
+            "#define NumOutWords %d " % numOutWords,
+            "#define totalIters %d " % totalIters,
             "#define numReps %d" % numReps,
         ]
-        if self.needs_lcm():
-            lcmWidth = self.get_iowidth_lcm()
-            assert numInWords % (lcmWidth / inWidth) == 0, "Error in DWC LCM calculation"
-            numLCMToOut = numInWords // (lcmWidth / inWidth)
-            self.code_gen_dict["$DEFINES$"].append("#define LCMWidth %d" % lcmWidth)
-            self.code_gen_dict["$DEFINES$"].append("#define NumLCMToOut %d" % (numLCMToOut))
 
     def strm_decl(self):
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
@@ -78,6 +94,7 @@ def strm_decl(self):
                 self.get_instream_width(), self.hls_sname(), self.hls_sname()
             )
         )
+
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
                 self.get_outstream_width(), self.hls_sname(), self.hls_sname()
@@ -86,22 +103,12 @@ def strm_decl(self):
 
     def docompute(self):
         # TODO continue with fxns below, they are copy-pasted
-        op = "StreamingDataWidthConverter_Batch"
-        if self.needs_lcm():
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                'hls::stream<ap_uint<{}>> intermediate ("intermediate");'.format(
-                    self.get_iowidth_lcm()
-                ),
-                "%s<InWidth, LCMWidth, NumInWords>(in0_%s, intermediate, numReps);"
-                % (op, self.hls_sname()),
-                "%s<LCMWidth, OutWidth, NumLCMToOut>(intermediate, out_%s, numReps);"
-                % (op, self.hls_sname()),
-            ]
-        else:
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                "%s<InWidth, OutWidth, NumInWords>(in0_%s, out_%s, numReps);"
-                % (op, self.hls_sname(), self.hls_sname())
-            ]
+        # single generalized-DWC call; its template macros come from defines()
+        sname = self.hls_sname()
+        hls_call = "StreamingDataWidthConverterGeneralized_Batch"
+        hls_call += "<InWidth, OutWidth, NumInWords,NumOutWords, totalIters>"
+        hls_call += "(in0_%s, out_%s, numReps);" % (sname, sname)
+        self.code_gen_dict["$DOCOMPUTE$"] = [hls_call]
 
     def blackboxfunction(self):
         in_packed_bits = self.get_instream_width()
@@ -127,8 +134,6 @@ def pragmas(self):
             "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
         )
         self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
-        if self.needs_lcm():
-            self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW disable_start_propagation")
 
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
@@ -160,14 +165,40 @@ def execute_node(self, context, graph):
         else:
             export_idt = self.get_input_datatype()
         # reshape input into folded shape
+
         reshaped_input = inp.reshape(folded_ishape)
-        # make copy before saving array
-        reshaped_input = reshaped_input.copy()
         np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
 
+        exp_shape = self.get_normal_output_shape()
+
         if mode == "cppsim":
-            output = inp
-            output = np.asarray([output], dtype=np.float32).reshape(*exp_shape)
+            # cppsim simply passes through the values because
+            # the DWC fails some test cases due to
+            # endianness differences in the cppsim flow
+            # of passing numpy arrays. TODO: Fix?
+            # Essentially need to fix cppsim to reverse
+            # endian and then back same as rtlsim
+            # for this particular (and maybe all) cases
+            # only shows up for the DWC, since when a word
+            # leftover appears when breaking down larger in
+            # words to smaller out words, the remainder should
+            # now be the LSB, but is the other way around on the
+            # cpp output.
+
+            in_shape = self.get_normal_input_shape()
+            out_shape = self.get_normal_output_shape()
+            inp = context[node.input[0]]
+            assert str(inp.dtype) == "float32", "Input datatype is not float32"
+            assert inp.shape == tuple(in_shape), "Input shape does not match expected shape."
+
+            # initialize as zeroes to introduce padding if needed
+            output = np.zeros((out_shape), dtype=np.float32)
+            if out_shape[-1] > in_shape[-1]:
+                output[..., : in_shape[-1]] = inp[..., : in_shape[-1]]
+            else:
+                output[..., : out_shape[-1]] = inp[..., : out_shape[-1]]
+
+            output = np.asarray([output], dtype=np.float32).reshape(*out_shape)
             context[node.output[0]] = output
 
         elif mode == "rtlsim":
@@ -182,15 +213,19 @@ def execute_node(self, context, graph):
             odt = export_idt
             target_bits = odt.bitwidth()
             packed_bits = self.get_outstream_width()
+
             out_npy_path = "{}/output.npy".format(code_gen_dir)
             out_shape = self.get_folded_output_shape()
+
             rtlsim_output_to_npy(
                 rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
             )
+
             # load and reshape output
-            output = np.load(out_npy_path)
-            output = np.asarray([output], dtype=np.float32).reshape(exp_shape)
+            output_pre_reshape = np.load(out_npy_path)
+            output = np.asarray([output_pre_reshape], dtype=np.float32).reshape(exp_shape)
             context[node.output[0]] = output
+
         else:
             raise Exception(
                 """Invalid value for attribute exec_mode! Is currently set to: {}
@@ -207,3 +242,34 @@ def execute_node(self, context, graph):
             exp_shape
         ), """Output
         shape doesn't match expected shape, should be same as input shape"""
+
+
+    def lut_estimation(self):
+        """Calculates resource estimations for LUTs
+
+        The estimate is the sum of a counter term, ceil(log2(intw / inw))
+        scaled by a temporary penalty factor of 8, and a control-set term
+        of intw + outw, where intw = inw + outw is the buffer width.
+        """
+
+        # TODO: This calculation does not currently take into account the extra
+        # tracking variables, nor the muxing of one of the stream ports to the buffer
+        # which shifts according to how many elements are in the buffer
+        # the true LUT cost is between 2*(inw+outw) and 10*(inw+outw)
+
+        inw = self.get_instream_width()
+        outw = self.get_outstream_width()
+
+        # we use an intermediate buffer of size inwidth+outwidth
+        intw = inw + outw
+
+        # we assume a shift-based implementation
+        # even if we don't use LUTs explicitly, we make some unavailable
+        # to other logic because they're tied into the DWC control sets
+        cset_luts = intw + outw
+
+        # intw / inw > 1 for positive widths, so no abs() is needed; log2 is
+        # more accurate than math.log(x, 2), whose rounding can inflate ceil()
+        # generalized DWC cost penalty (factor 8), this value is temporary
+        cnt_luts = 8 * math.ceil(math.log2(intw / inw))
+        return int(cnt_luts + cset_luts)