Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generalized DataWidthConverter #1186

Draft
wants to merge 3 commits into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 42 additions & 52 deletions src/finn/custom_op/fpgadataflow/hls/iodma_hls.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,100 +236,90 @@ def docompute(self):
raise ValueError("Invalid IODMA direction, please set to in or out")
# define templates for instantiation
dma_inst_template = func + "<DataWidth1, NumBytes1>(%s, %s, numReps);"
dwc_inst_template = dwc_func + "<%d, %d, %d>(%s, %s, numReps);"
dwc_inst_template = dwc_func + "<%d, %d, %d, %d, %d>(%s, %s, numReps);"
# do stream infrastructure and instantiations
intfw = self.get_nodeattr("intfWidth")
strmw = self.get_nodeattr("streamWidth")
width_lcm = (strmw * intfw) // math.gcd(strmw, intfw)

# we always need two streams: one of width_lcm, and one of intfw width
# because we use WidthAdjustedInputStream,
dtype_bits = self.get_input_datatype().bitwidth()
total_bits = dtype_bits * np.prod(self.get_normal_input_shape())

if direction == "in":
inWidth = intfw
outWidth = strmw

numInWords = total_bits // inWidth
numOutWords = total_bits // outWidth
totalIters = max(numInWords, numOutWords)

if outWidth > inWidth:
totalIters += int(np.floor(outWidth / inWidth) + 1) - 1

# AXI MM -> IODMA -> (DWCs) -> out
# DWCs depend on AXI MM and out interface width
if strmw == intfw:
# case 0: AXI MM width = out width, no DWCs needed
self.code_gen_dict["$DOCOMPUTE$"] = [
dma_inst_template % ("in0_" + self.hls_sname(), "out_" + self.hls_sname())
]
elif (strmw % intfw == 0) or (intfw % strmw == 0):
# case 1: AXI MM width divisible by out width or vice versa
# single DWC + single extra stream needed
else:
# case 1: Need to perform a data width conversion
# we use the HLS variant here
# TODO: use RTL variant if possible
self.code_gen_dict["$DOCOMPUTE$"] = [
"hls::stream<ap_uint<%d> > dma2dwc;" % intfw,
dma_inst_template % ("in0_" + self.hls_sname(), "dma2dwc"),
dwc_inst_template
% (
intfw,
strmw,
total_bits // intfw,
inWidth,
outWidth,
numInWords,
numOutWords,
totalIters,
"dma2dwc",
"out_" + self.hls_sname(),
),
]
else:
# case 2: AXI MM width not divisible by out width or vice versa
# need 2 DWCs (going through the least common multiple width)
# and 2 streams
self.code_gen_dict["$DOCOMPUTE$"] = [
"hls::stream<ap_uint<%d> > dma2lcm;" % intfw,
"hls::stream<ap_uint<%d> > lcm2out;" % width_lcm,
dma_inst_template % ("in0_" + self.hls_sname(), "dma2lcm"),
dwc_inst_template
% (intfw, width_lcm, total_bits // intfw, "dma2lcm", "lcm2out"),
dwc_inst_template
% (
width_lcm,
strmw,
total_bits // width_lcm,
"lcm2out",
"out_" + self.hls_sname(),
),
]

elif direction == "out":
inWidth = strmw
outWidth = intfw

numInWords = total_bits // inWidth
numOutWords = total_bits // outWidth
totalIters = max(numInWords, numOutWords)

if outWidth > inWidth:
totalIters += int(np.floor(outWidth / inWidth) + 1) - 1

# in0 -> (DWCs) -> IODMA -> AXI MM
# DWCs depend on AXI MM and out interface width
if strmw == intfw:
# case 0: in width = AXI MM width, no DWCs needed
self.code_gen_dict["$DOCOMPUTE$"] = [
dma_inst_template % ("in0_" + self.hls_sname(), "out_" + self.hls_sname())
]
elif (strmw % intfw == 0) or (intfw % strmw == 0):
# case 1: AXI MM width divisible by in width or vice versa
# single DWC + single extra stream needed
else:
# case 1: Need to perform a data width conversion
# we use the HLS variant here
# TODO: use RTL variant if possible
self.code_gen_dict["$DOCOMPUTE$"] = [
"hls::stream<ap_uint<%d> > dwc2dma;" % intfw,
dwc_inst_template
% (
strmw,
intfw,
total_bits // strmw,
inWidth,
outWidth,
numInWords,
numOutWords,
totalIters,
"in0_" + self.hls_sname(),
"dwc2dma",
),
dma_inst_template % ("dwc2dma", "out_" + self.hls_sname()),
]
else:
# case 2: AXI MM width not divisible by out width or vice versa
# need 2 DWCs (going through the least common multiple width)
# and 2 streams
self.code_gen_dict["$DOCOMPUTE$"] = [
"hls::stream<ap_uint<%d> > in2lcm;" % width_lcm,
"hls::stream<ap_uint<%d> > lcm2dma;" % intfw,
dwc_inst_template
% (
strmw,
width_lcm,
total_bits // strmw,
"in0_" + self.hls_sname(),
"in2lcm",
),
dwc_inst_template
% (width_lcm, intfw, total_bits // width_lcm, "in2lcm", "lcm2dma"),
dma_inst_template % ("lcm2dma", "out_" + self.hls_sname()),
]

else:
raise Exception("Unknown IODMA direction: %s" % direction)

Expand Down
134 changes: 100 additions & 34 deletions src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import numpy as np
import os
from qonnx.core.datatype import DataType

import math
from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
from finn.custom_op.fpgadataflow.streamingdatawidthconverter import (
StreamingDataWidthConverter,
Expand All @@ -41,7 +41,7 @@


class StreamingDataWidthConverter_hls(StreamingDataWidthConverter, HLSBackend):
"""Class that corresponds to finn-hlslib StreamingDataWidthConverter_Batch
"""Class that corresponds to finn-hlslib StreamingDataWidthConverterGeneralized_Batch
function."""

def get_nodeattr_types(self):
Expand All @@ -54,22 +54,38 @@ def global_includes(self):
self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']

def defines(self, var):
    """Set ``code_gen_dict["$DEFINES$"]`` for the generalized width converter.

    Emits the preprocessor defines InWidth, OutWidth, NumInWords,
    NumOutWords, totalIters and numReps, derived from the folded input and
    output shapes and the ``inWidth`` / ``outWidth`` node attributes.

    ``var`` is accepted for interface compatibility and is not used here.
    """
    folded_ishape = self.get_folded_input_shape()
    folded_oshape = self.get_folded_output_shape()

    # in cases of convolution input generator and downsampling,
    # we have a 4D input and padding / cropping can only happen
    # for the final 2 dimensions,
    # so we use numReps to represent the first 2 dimensions
    # + batching if shape[0] != 1
    numReps = int(np.prod(folded_ishape[:-2]))

    # assuming folded shapes are at least 2 dim-long: the word counts are
    # the second-to-last folded dimension of the input and of the output
    numInWords = int(np.prod(folded_ishape[-2:-1]))
    numOutWords = int(np.prod(folded_oshape[-2:-1]))

    inWidth = self.get_nodeattr("inWidth")
    outWidth = self.get_nodeattr("outWidth")
    totalIters = max(numInWords, numOutWords)

    # if we are building up a word, the overall loop count is longer
    # (floor(outWidth / inWidth) additional iterations)
    if outWidth > inWidth:
        totalIters += outWidth // inWidth

    self.code_gen_dict["$DEFINES$"] = [
        "#define InWidth %d " % inWidth,
        "#define OutWidth %d " % outWidth,
        "#define NumInWords %d " % numInWords,
        "#define NumOutWords %d " % numOutWords,
        "#define totalIters %d " % totalIters,
        "#define numReps %d" % numReps,
    ]

def strm_decl(self):
self.code_gen_dict["$STREAMDECLARATIONS$"] = []
Expand All @@ -78,6 +94,7 @@ def strm_decl(self):
self.get_instream_width(), self.hls_sname(), self.hls_sname()
)
)

self.code_gen_dict["$STREAMDECLARATIONS$"].append(
'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
self.get_outstream_width(), self.hls_sname(), self.hls_sname()
Expand All @@ -86,22 +103,12 @@ def strm_decl(self):

def docompute(self):
    """Set ``code_gen_dict["$DOCOMPUTE$"]`` to the converter instantiation.

    Emits a single call to StreamingDataWidthConverterGeneralized_Batch,
    templated on the InWidth, OutWidth, NumInWords, NumOutWords and
    totalIters defines, connecting the ``in0_<sname>`` stream to the
    ``out_<sname>`` stream for ``numReps`` repetitions.
    """
    # TODO continue with fxns below, they are copy-pasted
    op = "StreamingDataWidthConverterGeneralized_Batch"
    sname = self.hls_sname()

    self.code_gen_dict["$DOCOMPUTE$"] = [
        "%s<InWidth, OutWidth, NumInWords,NumOutWords," % op
        + " totalIters>(in0_%s, out_%s, numReps);" % (sname, sname)
    ]

def blackboxfunction(self):
in_packed_bits = self.get_instream_width()
Expand All @@ -127,8 +134,6 @@ def pragmas(self):
"#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
)
self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
if self.needs_lcm():
self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW disable_start_propagation")

def execute_node(self, context, graph):
mode = self.get_nodeattr("exec_mode")
Expand Down Expand Up @@ -160,14 +165,40 @@ def execute_node(self, context, graph):
else:
export_idt = self.get_input_datatype()
# reshape input into folded shape

reshaped_input = inp.reshape(folded_ishape)
# make copy before saving array
reshaped_input = reshaped_input.copy()
np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)

exp_shape = self.get_normal_output_shape()

if mode == "cppsim":
output = inp
output = np.asarray([output], dtype=np.float32).reshape(*exp_shape)
# cppsim simply passes through the values because
# the DWC fails some test cases due to
# endianness differences in the cppsim flow
# of passing numpy arrays. TODO: Fix?
# Essentially need to fix cppsim to reverse
# endian and then back same as rtlsim
# for this particular (and maybe all) cases
# only shows up for the DWC, since when a word
# leftover appears when breaking down larger in
# words to smaller out words, the remainder should
# now be the LSB, but is the other way around on the
# cpp output.

in_shape = self.get_normal_input_shape()
out_shape = self.get_normal_output_shape()
inp = context[node.input[0]]
assert str(inp.dtype) == "float32", "Input datatype is not float32"
assert inp.shape == tuple(in_shape), "Input shape does not match expected shape."

# initialize as zeroes to introduce padding if needed
output = np.zeros((out_shape), dtype=np.float32)
if out_shape[-1] > in_shape[-1]:
output[..., : in_shape[-1]] = inp[..., : in_shape[-1]]
else:
output[..., : out_shape[-1]] = inp[..., : out_shape[-1]]

output = np.asarray([output], dtype=np.float32).reshape(*out_shape)
context[node.output[0]] = output

elif mode == "rtlsim":
Expand All @@ -182,15 +213,19 @@ def execute_node(self, context, graph):
odt = export_idt
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()

out_npy_path = "{}/output.npy".format(code_gen_dir)
out_shape = self.get_folded_output_shape()

rtlsim_output_to_npy(
rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
)

# load and reshape output
output = np.load(out_npy_path)
output = np.asarray([output], dtype=np.float32).reshape(exp_shape)
output_pre_reshape = np.load(out_npy_path)
output = np.asarray([output_pre_reshape], dtype=np.float32).reshape(exp_shape)
context[node.output[0]] = output

else:
raise Exception(
"""Invalid value for attribute exec_mode! Is currently set to: {}
Expand All @@ -207,3 +242,34 @@ def execute_node(self, context, graph):
exp_shape
), """Output
shape doesn't match expected shape, should be same as input shape"""


def lut_estimation(self):
    """Calculates resource estimations for LUTs

    Returns an integer estimate built from the input and output stream
    widths: a counter term, ceil(log2((inw + outw) / inw)) scaled by a
    temporary penalty factor of 8, plus a control-set term of
    (inw + outw) + outw.
    """

    # TODO: This calculation does not currently take into account the extra
    # tracking variables, nor the muxing of one of the stream ports to the buffer
    # which shifts according to how many elements are in the buffer
    # the true LUT cost is between 2*(inw+outw) and 10*(inw+outw)

    inw = self.get_instream_width()
    outw = self.get_outstream_width()

    # we use an intermediate buffer of size inwidth+outwidth
    intw = inw + outw

    # we assume a shift-based implementation
    # even if we don't use LUTs explicitly, we make some unavailable
    # to other logic because they're tied into the DWC control sets

    # intw >= inw for non-negative widths, so the logarithm is never
    # negative; log2 avoids the rounding error of the two-argument log,
    # which could otherwise push ceil() one step too high
    cnt_luts = math.ceil(math.log2(intw / inw))

    # generalized DWC cost penalty, this value is temporary
    cnt_luts *= 8

    cset_luts = intw + outw

    return int(cnt_luts + cset_luts)
Loading
Loading