diff --git a/custom_hls/sm_utils.hpp b/custom_hls/sm_utils.hpp
new file mode 100644
index 0000000000..918f8879bf
--- /dev/null
+++ b/custom_hls/sm_utils.hpp
@@ -0,0 +1,164 @@
+// Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// This file is subject to the Xilinx Design License Agreement located
+// in the LICENSE.md file in the root directory of this repository.
+//
+// This file contains confidential and proprietary information of Xilinx, Inc.
+// and is protected under U.S. and international copyright and other
+// intellectual property laws.
+//
+// DISCLAIMER
+// This disclaimer is not a license and does not grant any rights to the materials
+// distributed herewith. Except as otherwise provided in a valid license issued to
+// you by Xilinx, and to the maximum extent permitted by applicable law: (1) THESE
+// MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX HEREBY
+// DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
+// INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR
+// FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
+// in contract or tort, including negligence, or under any other theory of
+// liability) for any loss or damage of any kind or nature related to, arising
+// under or in connection with these materials, including for any direct, or any
+// indirect, special, incidental, or consequential loss or damage (including loss
+// of data, profits, goodwill, or any type of loss or damage suffered as a result
+// of any action brought by a third party) even if such damage or loss was
+// reasonably foreseeable or Xilinx had been advised of the possibility of the
+// same.
+//
+// CRITICAL APPLICATIONS
+// Xilinx products are not designed or intended to be fail-safe, or for use in
+// any application requiring failsafe performance, such as life-support or safety
+// devices or systems, Class III medical devices, nuclear facilities, applications
+// related to the deployment of airbags, or any other applications that could lead
+// to death, personal injury, or severe property or environmental damage
+// (individually and collectively, "Critical Applications"). Customer assumes the
+// sole risk and liability of any use of Xilinx products in Critical Applications,
+// subject only to applicable laws and regulations governing limitations on product
+// liability.
+//
+// THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT ALL TIMES.
+#ifndef SM_UTIL_HPP
+#define SM_UTIL_HPP
+#include "hls_vector.h"
+
+//- Compile-Time Functions ---------------------------------------------------
+
+// ceil(log2(x))
+template<typename T>
+constexpr unsigned clog2(T x) {
+  return x<2? 0 : 1+clog2((x+1)/2);
+}
+
+//- Streaming Flit with `last` Marking ---------------------------------------
+template<typename T>
+struct flit_t {
+  bool last;
+  T data;
+
+public:
+  flit_t(bool last_, T const &data_) : last(last_), data(data_) {}
+  ~flit_t() {}
+};
+
+//- Streaming Copy -----------------------------------------------------------
+template<typename T>
+void move(hls::stream<T> &src, hls::stream<T> &dst) {
+#pragma HLS pipeline II=1 style=flp
+  if(!src.empty())  dst.write(src.read());
+}
+
+//- Tree Reduce --------------------------------------------------------------
+template< unsigned long N, typename TA, typename TR = TA, typename F >
+TR tree_reduce(hls::stream<TA> &v, F f) {
+#pragma HLS inline
+#pragma HLS function_instantiate variable=f
+  TR tree[2*N-1];
+#pragma HLS array_partition complete dim=1 variable=tree
+  for(unsigned i = N; i-- > 0;) {
+#pragma HLS unroll
+    tree[N-1 + i] = v.read();
+  }
+  for(unsigned i = N-1; i-- > 0;) {
+#pragma HLS unroll
+    tree[i] = f(tree[2*i+1], tree[2*i+2]);
+  }
+  return tree[0];
+}
+
+// Recursive comparison and count (of max)
+// Builds a tree to compute the max of a vector
+template<unsigned N, typename T>
+struct MaxReduction {
+
+  static T max(const hls::vector<T, N>& input) {
+#pragma HLS INLINE
+    constexpr unsigned M = (N + 1) / 2;
+    hls::vector<T, M> res;
+
+    for(unsigned i = 0; i < M; ++i) {
+#pragma HLS unroll
+      if (2*i + 1 < N)
+        res[i] = input[2*i] > input[2*i + 1] ? input[2*i] : input[2*i + 1];
+      else
+        res[i] = input[2*i]; // Handle the case where the input size is odd
+    }
+
+    return MaxReduction<M, T>::max(res);
+  }
+
+};
+
+template<typename T>
+struct MaxReduction<2, T> {
+  static T max(const hls::vector<T, 2>& input) {
+#pragma HLS INLINE
+    return (input[0] > input[1]) ? input[0] : input[1];
+  }
+};
+
+template<typename T>
+struct MaxReduction<1, T> {
+  static T max(const hls::vector<T, 1>& input) {
+#pragma HLS INLINE
+    return input[0];
+  }
+};
+
+// Recursive reduction tree for the total summation
+// Code for the Nth stage
+template<unsigned N>
+struct TreeReduction {
+  static float reduce(const hls::vector<float, N>& input) {
+#pragma HLS INLINE
+    constexpr unsigned M = (N + 1) / 2;
+    hls::vector<float, M> sum;
+
+    for(unsigned i = 0; i < M; ++i) {
+#pragma HLS unroll
+      if (2*i + 1 < N)
+        sum[i] = input[2*i] + input[2*i + 1];
+      else
+        sum[i] = input[2*i]; // Handle the case where the input size is odd
+    }
+
+    return TreeReduction<M>::reduce(sum);
+  }
+};
+
+template<>
+struct TreeReduction<2> {
+  static float reduce(const hls::vector<float, 2>& input) {
+#pragma HLS INLINE
+    return input[0] + input[1];
+  }
+};
+
+template<>
+struct TreeReduction<1> {
+  static float reduce(const hls::vector<float, 1>& input) {
+#pragma HLS INLINE
+    return input[0];
+  }
+};
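+
+// Illustrative usage (hypothetical host-side sketch, not part of the library):
+//   hls::vector<float, 8> v = {...};
+//   float total   = TreeReduction<8>::reduce(v);    // sum of all 8 lanes
+//   float biggest = MaxReduction<8, float>::max(v); // max of all 8 lanes
+//   constexpr unsigned bits = clog2(8);             // == 3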
+
+#endif
\ No newline at end of file
diff --git a/custom_hls/softmax.hpp b/custom_hls/softmax.hpp
new file mode 100644
index 0000000000..9452045a77
--- /dev/null
+++ b/custom_hls/softmax.hpp
@@ -0,0 +1,285 @@
+// Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// This file is subject to the Xilinx Design License Agreement located
+// in the LICENSE.md file in the root directory of this repository.
+//
+// This file contains confidential and proprietary information of Xilinx, Inc.
+// and is protected under U.S. and international copyright and other
+// intellectual property laws.
+//
+// DISCLAIMER
+// This disclaimer is not a license and does not grant any rights to the materials
+// distributed herewith. Except as otherwise provided in a valid license issued to
+// you by Xilinx, and to the maximum extent permitted by applicable law: (1) THESE
+// MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX HEREBY
+// DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
+// INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR
+// FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
+// in contract or tort, including negligence, or under any other theory of
+// liability) for any loss or damage of any kind or nature related to, arising
+// under or in connection with these materials, including for any direct, or any
+// indirect, special, incidental, or consequential loss or damage (including loss
+// of data, profits, goodwill, or any type of loss or damage suffered as a result
+// of any action brought by a third party) even if such damage or loss was
+// reasonably foreseeable or Xilinx had been advised of the possibility of the
+// same.
+//
+// CRITICAL APPLICATIONS
+// Xilinx products are not designed or intended to be fail-safe, or for use in
+// any application requiring failsafe performance, such as life-support or safety
+// devices or systems, Class III medical devices, nuclear facilities, applications
+// related to the deployment of airbags, or any other applications that could lead
+// to death, personal injury, or severe property or environmental damage
+// (individually and collectively, "Critical Applications"). Customer assumes the
+// sole risk and liability of any use of Xilinx products in Critical Applications,
+// subject only to applicable laws and regulations governing limitations on product
+// liability.
+//
+// THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT ALL TIMES.
+
+#include <ap_int.h>
+#include <ap_fixed.h>
+#include <hls_stream.h>
+#include <hls_vector.h>
+#include <hls_math.h>
+#include <cmath>
+#include <climits>
+#include <type_traits>
+#include "sm_utils.hpp"
+
+// First stage of the pipeline:
+//
+// Trigger: When a vector of SIMD elements is present in the stream
+//
+// Desc: Pass over the input N items and calc the max value
+template<unsigned N, unsigned SIMD, typename T>
+void max_calc_stage(
+  hls::stream<hls::vector<T, SIMD>> &ins,
+  hls::stream<hls::vector<T, SIMD>> &outs,
+  hls::stream<T> &maxs
+) {
+#pragma HLS pipeline II=1 style=flp
+  static ap_uint<clog2(N/SIMD) + 1> count = 0;
+  static T max = 0;
+#pragma HLS reset variable=count
+#pragma HLS reset variable=max
+
+  if(!ins.empty()){
+    hls::vector<T, SIMD> out;
+    hls::vector<T, SIMD> max_v;
+    hls::vector<T, SIMD> const in = ins.read();
+
+    for(unsigned i = 0; i < SIMD; i++) {
+#pragma HLS unroll
+      out[i] = in[i];
+      // fold the running max into each lane before the tree reduction
+      max_v[i] = in[i] > max ? in[i] : max;
+    }
+    outs.write(out);
+    max = MaxReduction<SIMD, T>::max(max_v);
+
+    count++;
+    if (count == (N/SIMD)-1) {
+      count = 0;
+      maxs.write(max);
+      max = 0;
+    }
+  }
+}
+
+// Second stage of the pipeline
+//
+// Trigger: When a max value is sent from the preceding stage
+//
+// Desc: For each item in a N item sequence calc the (exp - max) in float
+//       track the sum while processing the N items.
+template<unsigned N, unsigned SIMD, typename T>
+void exp_sum_calc(
+  hls::stream<hls::vector<T, SIMD>> &ins,
+  hls::stream<T> &maxs,
+  hls::stream<hls::vector<float, SIMD>> &outs,
+  hls::stream<float> &sums
+){
+#pragma HLS pipeline II=1 style=flp
+  static ap_uint<clog2(N/SIMD) + 1> count = 0;
+  static float sum = 0.0f;
+  static bool valid = false;
+  static float max = 0.0f;
+#pragma HLS reset variable=count
+#pragma HLS reset variable=sum
+#pragma HLS reset variable=valid
+#pragma HLS reset variable=max
+
+  if (count == (N/SIMD)) {
+    count = 0;
+    valid = false;
+    sums.write(sum);
+    sum = 0.0f;
+    return;
+  }
+
+  if(valid && !ins.empty()) {
+    hls::vector<T, SIMD> const in = ins.read();
+    hls::vector<float, SIMD> out;
+    for (unsigned i = 0; i < SIMD; i++) {
+#pragma HLS unroll
+      out[i] = hls::exp(float(in[i]) - max);
+    }
+    sum += TreeReduction<SIMD>::reduce(out);
+    outs.write(out);
+
+    count++;
+  }
+
+  if (!maxs.empty() && !valid) {
+    max = maxs.read();
+    valid = true;
+  }
+
+}
+
+// Third stage of the pipeline
+//
+// Trigger: When a sum value is sent from the preceding stage
+//
+// Desc: For the N items take the input and divide it by the sum
+template<unsigned N, unsigned SIMD>
+void div_calc(
+  hls::stream<hls::vector<float, SIMD>> &ins,
+  hls::stream<float> &sums,
+  hls::stream<hls::vector<float, SIMD>> &outs
+){
+#pragma HLS pipeline II=1 style=flp
+  static ap_uint<clog2(N/SIMD) + 1> count = 0;
+  static bool valid = false;
+  static float sum = 0.0f;
+#pragma HLS reset variable=count
+#pragma HLS reset variable=valid
+#pragma HLS reset variable=sum
+
+  if (count == (N/SIMD)) {
+    count = 0;
+    valid = false;
+    return;
+  }
+
+  if (valid && !ins.empty()) {
+    hls::vector<float, SIMD> const in = ins.read();
+    hls::vector<float, SIMD> out;
+    for(unsigned i = 0; i < SIMD; i++) {
+#pragma HLS unroll
+      out[i] = in[i] / sum;
+    }
+    outs.write(out);
+    count++;
+  }
+
+  if (!sums.empty() && !valid) {
+    sum = sums.read();
+    valid = true;
+  }
+}
+
+template<unsigned N, unsigned SIMD, typename T>
+void smax(
+  hls::stream<hls::vector<T, SIMD>> &src,
+  hls::stream<hls::vector<float, SIMD>> &dst
+) {
+#pragma HLS dataflow disable_start_propagation
+  static_assert(N%SIMD == 0, "N must be a multiple of SIMD");
+
+  static hls::stream<hls::vector<T, SIMD>> max_data_s;
+#pragma HLS stream variable=max_data_s depth=N
+  static hls::stream<T> max_s;
+#pragma HLS stream variable=max_s depth=2
+
+  static hls::stream<hls::vector<float, SIMD>> exp_data_s;
+#pragma HLS stream variable=exp_data_s depth=N
+  static hls::stream<float> sum_s;
+#pragma HLS stream variable=sum_s depth=2
+
+  max_calc_stage<N, SIMD, T>(src, max_data_s, max_s);
+  exp_sum_calc<N, SIMD, T>(max_data_s, max_s, exp_data_s, sum_s);
+  div_calc<N, SIMD>(exp_data_s, sum_s, dst);
+
+} // smax()
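+
+// Illustrative instantiation (hypothetical values, assuming an INT8 input
+// feature dimension of N=384 consumed SIMD=4 elements at a time):
+//   hls::stream<hls::vector<ap_int<8>, 4>> src;
+//   hls::stream<hls::vector<float, 4>>     dst;
+//   smax<384, 4, ap_int<8>>(src, dst);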
+
+// Threshold/quantisation at the output of the softmax
+template<
+  typename T,  // The quantised output type (Needs to be signed)
+  typename TF  // The float based input type
+>
+T quant_threshold(TF val) {
+#pragma HLS INLINE
+  constexpr unsigned numBits = sizeof(T)*CHAR_BIT;
+  if(val >= 1.0f){
+    T frac_val = ~T(0);
+    if(std::is_signed<T>::value) {
+      return frac_val;
+    } else {
+      T mask = ~(T(1) << (numBits - 1));
+      return frac_val & mask;
+    }
+  }
+
+  ap_fixed<numBits, 1> fixed_point_val = val;
+  T frac_val = fixed_point_val.range(numBits - 2, 0);
+  return frac_val;
+}
+
+// Quantisation pipeline stage
+//
+// Trigger: When a SIMD vector is received from the preceding stage
+//
+// Desc: Apply quantisation to the SIMD elements and write them into the
+//       SIMD width output stream.
+template<
+  unsigned N,
+  unsigned SIMD,
+  typename T
+>
+void quant_stage(
+  hls::stream<hls::vector<float, SIMD>> &in,
+  hls::stream<hls::vector<T, SIMD>> &out
+) {
+#pragma HLS pipeline II=1 style=flp
+  if(!in.empty()) {
+    hls::vector<float, SIMD> const x = in.read();
+    hls::vector<T, SIMD> y;
+    for(unsigned i = 0; i < SIMD; i++) {
+#pragma HLS unroll
+      y[i] = quant_threshold<T, float>(x[i]);
+    }
+    out.write(y);
+  }
+}
+
+// Quantised version of softmax
+// This is the same as the float softmax with an additional baked in quantisation stage at the end
+template<
+  unsigned N,     // The width of the input dimension
+  unsigned SIMD,  // Amount of parallelism (how many items consumed/produced at a time)
+  typename T
+>
+void smaxquant(
+  hls::stream<hls::vector<T, SIMD>> &src,
+  hls::stream<hls::vector<T, SIMD>> &dst
+) {
+#pragma HLS DATAFLOW disable_start_propagation
+  hls::stream<hls::vector<float, SIMD>> smax_out;
+#pragma HLS stream variable=smax_out depth=2
+  static_assert(N%SIMD == 0, "SIMD must be a factor of N");
+
+  smax<N, SIMD, T>(src, smax_out);
+  quant_stage<N, SIMD, T>(smax_out, dst);
+
+} // smaxquant()
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index c7500bcaa6..4004523bad 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -81,14 +81,14 @@ if [ -f "$VITIS_PATH/settings64.sh" ];then
   export XILINX_XRT=/opt/xilinx/xrt
   source $VITIS_PATH/settings64.sh
   gecho "Found Vitis at $VITIS_PATH"
-  if [ -f "$XILINX_XRT/setup.sh" ];then
-    # source XRT
-    source $XILINX_XRT/setup.sh
-    gecho "Found XRT at $XILINX_XRT"
-  else
-    recho "XRT not found on $XILINX_XRT, did you skip the download or did the installation fail?"
-    exit -1
-  fi
+  # if [ -f "$XILINX_XRT/setup.sh" ];then
+  #   # source XRT
+  #   source $XILINX_XRT/setup.sh
+  #   gecho "Found XRT at $XILINX_XRT"
+  # else
+  #   recho "XRT not found on $XILINX_XRT, did you skip the download or did the installation fail?"
+  #   exit -1
+  # fi
 else
   yecho "Unable to find $VITIS_PATH/settings64.sh"
   yecho "Functionality dependent on Vitis will not be available."
@@ -137,6 +137,15 @@ else
   echo "See https://docs.xilinx.com/r/en-US/ug835-vivado-tcl-commands/Tcl-Initialization-Scripts"
 fi
 
+# add hls library path to LD_LIBRARY_PATH
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/tools/fpo_v7_1"
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/tools/fft_v9_1"
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/tools/fir_v7_0"
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/tools/dds_v6_0"
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/tps/lnx64/gcc-8.3.0/lib"
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lib/lnx64.o/Rhel"
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/lib/csim"
+
 export PATH=$PATH:$HOME/.local/bin
 # execute the provided command(s) as root
 exec "$@"
diff --git a/fetch-repos.sh b/fetch-repos.sh
index 2033973f2a..847952e399 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -115,10 +115,10 @@ fetch_board_files() {
     cd $OLD_PWD
 }
 
-fetch_repo $QONNX_URL $QONNX_COMMIT $QONNX_DIR
-fetch_repo $FINN_EXP_URL $FINN_EXP_COMMIT $FINN_EXP_DIR
-fetch_repo $BREVITAS_URL $BREVITAS_COMMIT $BREVITAS_DIR
-fetch_repo $PYVERILATOR_URL $PYVERILATOR_COMMIT $PYVERILATOR_DIR
+# fetch_repo $QONNX_URL $QONNX_COMMIT $QONNX_DIR
+# fetch_repo $FINN_EXP_URL $FINN_EXP_COMMIT $FINN_EXP_DIR
+# fetch_repo $BREVITAS_URL $BREVITAS_COMMIT $BREVITAS_DIR
+# fetch_repo $PYVERILATOR_URL $PYVERILATOR_COMMIT $PYVERILATOR_DIR
 fetch_repo $CNPY_URL $CNPY_COMMIT $CNPY_DIR
 fetch_repo $HLSLIB_URL $HLSLIB_COMMIT $HLSLIB_DIR
 fetch_repo $OMX_URL $OMX_COMMIT $OMX_DIR
diff --git a/run-docker.sh b/run-docker.sh
index 88fabff2fa..b1fe44eb0c 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -265,6 +265,36 @@ if [ ! -z "$FINN_XILINX_PATH" ];then
-z "$FINN_XILINX_PATH" ];then DOCKER_EXEC+="-e ALVEO_TARGET_DIR=$ALVEO_TARGET_DIR " fi fi + +# This part is used for internal ci for finn-examples +# if using build verification for finn-examples ci, set up the necessary Docker variables +if [ "$VERIFICATION_EN" = 1 ]; then + if [ -z "$FINN_EXAMPLES_ROOT" ]; then + recho "FINN_EXAMPLES_ROOT path has not been set." + recho "Please set FINN_EXAMPLES_ROOT path to enable verification." + exit -1 + elif [ ! -d "${FINN_EXAMPLES_ROOT}/ci" ]; then + recho "ci folder not found in ${FINN_EXAMPLES_ROOT}." + recho "Please ensure the FINN-examples repo has been set up correctly, and FINN_EXAMPLES_ROOT path is set correctly, to enable verification." + exit -1 + elif [ -z "$VERIFICATION_IO" ]; then + recho "VERIFICATION_IO paths has not been set." + recho "Please ensure the path to the input and expected output files has been set correctly to eneable verification." + exit -1 + elif [ ! -d "$VERIFICATION_IO" ]; then + recho "${VERIFICATION_IO} is not a directory." + recho "Please ensure the VERIFICATION_IO path has been set to the directory containing the input and expected output files for verification." + exit -1 + else + DOCKER_EXEC+="-e VERIFICATION_EN=$VERIFICATION_EN " + DOCKER_EXEC+="-e FINN_EXAMPLES_ROOT=$FINN_EXAMPLES_ROOT " + DOCKER_EXEC+="-e VERIFICATION_IO=$VERIFICATION_IO " + FINN_DOCKER_EXTRA+="-v $FINN_EXAMPLES_ROOT/ci:$FINN_EXAMPLES_ROOT/ci " + FINN_DOCKER_EXTRA+="-v $VERIFICATION_IO:$VERIFICATION_IO " + fi +fi + + DOCKER_EXEC+="$FINN_DOCKER_EXTRA " if [ -z "$FINN_SINGULARITY" ];then diff --git a/setup.cfg b/setup.cfg index 4834011dea..511ce451dd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,6 +32,7 @@ [metadata] name = finn +version = 0.9.0 description = A Framework for Fast, Scalable Quantized Neural Network Inference author = Yaman Umuroglu author_email = yamanu@xilinx.com @@ -60,6 +61,60 @@ package_dir = # tests_require = pytest; pytest-cov # Require a specific Python version, e.g. 
# Python 2.7 or >= 3.4
 # python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
+install_requires =
+    qonnx @ git+https://github.com/fastmachinelearning/qonnx.git@fd61cfeebbdaba351abf7e9d54cd785d7776fa4f
+    pyverilator @ git+https://github.com/maltanar/pyverilator.git@766e457465f5c0dd315490d7b9cc5d74f9a76f4f
+    brevitas @ git+https://github.com/Xilinx/brevitas.git@84f42259ec869eb151af4cb8a8b23ad925f493db
+    finn-experimental @ git+https://github.com/Xilinx/finn-experimental.git@de99347e936d51715f5356a1b6c64e37b91c23c2
+    dataset_loading @ git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading
+    bitstring==3.1.7
+    clize==5.0.1
+    dataclasses-json==0.5.7
+    gspread==3.6.0
+    importlib-resources==6.1.0
+    ipython==8.12.2
+    numpy==1.24.1
+    onnx==1.13.0
+    onnxoptimizer
+    onnxruntime==1.16.1
+    pre-commit==3.3.2
+    protobuf==3.20.3
+    psutil==5.9.4
+    pyscaffold==4.4
+    scipy==1.10.1
+    setupext-janitor>=1.1.2
+    setuptools==68.2.2
+    sigtools==4.0.1
+    toposort==1.7.0
+    vcdvcd==1.0.5
+    wget==3.2
+    torch==1.13.1
+    torchvision==0.14.1
+    torchaudio==0.13.1
+    pygments==2.14.0
+    ipykernel==6.21.2
+    jupyter==1.0.0
+    markupsafe==2.0.1
+    matplotlib==3.7.0
+    pytest-dependency==0.5.1
+    pytest-xdist[setproctitle]==3.2.0
+    pytest-parallel==0.1.1
+    netron>=5.0.0
+    pandas==1.5.3
+    scikit-learn==1.2.1
+    tqdm==4.64.1
+    pytest==6.2.5
+    pytest-metadata==1.7.0
+    pytest-html==3.0.0
+    pytest-html-merger==0.0.8
+    pytest-cov==4.1.0
+    deap==1.3.1
+    mip==1.13.0
+    networkx==2.8
+    future-annotations==1.0.0
+    dependencies==2.0.1
+    tokenize-rt==4.2.1
+    tclwrapper==0.0.1
 
 [options.packages.find]
 where = src
@@ -164,14 +219,3 @@ exclude =
     dist
     .eggs
     docs/conf.py
-
-[pyscaffold]
-# PyScaffold's parameters when the project was created.
-# This will be used when updating. Do not change!
-version = 3.2.1
-package = finn
-extensions =
-    travis
-    pre_commit
-    namespace
-namespace = finn
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index aed2ab7fe1..9bcbb1e860 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -54,6 +54,8 @@ from finn.custom_op.fpgadataflow.thresholding import Thresholding
 from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour
 from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU
+from finn.custom_op.fpgadataflow.quantsoftmax import QuantSoftmax
+
 
 custom_op = dict()
 
@@ -81,3 +83,4 @@ custom_op["StreamingEltwise"] = StreamingEltwise
 custom_op["StreamingMaxPool"] = StreamingMaxPool
 custom_op["UpsampleNearestNeighbour"] = UpsampleNearestNeighbour
+custom_op["QuantSoftmax"] = QuantSoftmax
diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py
index 405c47a08d..8f5a0a7cc7 100644
--- a/src/finn/custom_op/fpgadataflow/hls/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -52,6 +52,7 @@ from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls
 from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls
 from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VVAU_hls
+from finn.custom_op.fpgadataflow.hls.quantsoftmax_hls import QuantSoftmax_hls
 
 custom_op = dict()
 
@@ -79,3 +80,4 @@ custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls
 custom_op["MVAU_hls"] = MVAU_hls
 custom_op["VVAU_hls"] = VVAU_hls
+custom_op["QuantSoftmax_hls"] = QuantSoftmax_hls
diff --git a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py
new file mode 100644
index 0000000000..19903866b3
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py
@@ -0,0 +1,184 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
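+
+"""HLS backend implementation of the QuantSoftmax custom op. Only cppsim
+execution is supported; rtlsim is not implemented for this node."""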
+
+import numpy as np
+import os
+
+from finn.custom_op.fpgadataflow import templates
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.quantsoftmax import QuantSoftmax
+from finn.util.basic import CppBuilder
+
+
+class QuantSoftmax_hls(QuantSoftmax, HLSBackend):
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(QuantSoftmax.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = [
+            '#include <hls_vector.h>',
+            '#include "softmax.hpp"',
+            '#include "utils.hpp"',
+        ]
+
+    def defines(self, var):
+        simd = self.get_nodeattr("simd")
+        dtype = self.get_input_datatype()
+        channels = self.get_nodeattr("channels")
+        self.code_gen_dict["$DEFINES$"] = [
+            f"""
+            constexpr unsigned SIMD = {simd};
+            constexpr unsigned W = {channels};
+            using T = {dtype.get_hls_datatype_str()};
+            using F = float;
+            """
+        ]
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            f'''
+            static hls::stream<hls::vector<T, SIMD>> src0;
+            static hls::stream<hls::vector<T, SIMD>> dst0;
+
+            move(in0_{self.hls_sname()}, src0);
+            smaxquant<W, SIMD, T>(src0, dst0);
+            move(dst0, out_{self.hls_sname()});
+            '''
+        ]
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            f'''
+            void {self.onnx_node.name}(
+                hls::stream<hls::vector<T, SIMD>> &in0_{self.hls_sname()},
+                hls::stream<hls::vector<T, SIMD>> &out_{self.hls_sname()}
+            )
+            '''
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = [
+            f'''
+            #pragma HLS interface AXIS port=in0_{self.hls_sname()}
+            #pragma HLS interface AXIS port=out_{self.hls_sname()}
+            #pragma HLS aggregate variable=in0_{self.hls_sname()} compact=bit
+            #pragma HLS aggregate variable=out_{self.hls_sname()} compact=bit
+
+            #pragma HLS interface ap_ctrl_none port=return
+            #pragma HLS dataflow disable_start_propagation
+            '''
+        ]
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        folded_ishape = self.get_folded_input_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+            inp = context[node.input[0]]
+            export_idt = self.get_input_datatype()
+            inp = inp.reshape(folded_ishape)
+            np.save(os.path.join(code_gen_dir, "input_0.npy"), inp)
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+        else:
+            raise Exception(f"Unsupported execution mode: {mode}")
+
+    def compile_singlenode_code(self):
+        """Builds the bash script for compilation using the CppBuilder from
+        finn.util.basic and executes the script to produce the executable."""
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        builder = CppBuilder()
+        # to enable additional debug features please uncomment the next line
+        # builder.append_includes("-DDEBUG")
+        builder.append_includes("-I$FINN_ROOT/src/finn/qnn-data/cpp")
+        builder.append_includes("-I$FINN_ROOT/deps/cnpy/")
+        builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib")
+        builder.append_includes("-I$FINN_ROOT/custom_hls")
+        builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"]))
+        builder.append_includes("--std=c++14")
+        builder.append_includes("-O3")
+        builder.append_sources(code_gen_dir + "/*.cpp")
+        builder.append_sources("$FINN_ROOT/deps/cnpy/cnpy.cpp")
+        builder.append_includes("-lz")
+        builder.append_includes(
+            '-fno-builtin -fno-inline -Wl,-rpath,"$HLS_PATH/lnx64/lib/csim" -L$HLS_PATH/lnx64/lib/csim -lhlsmc++-GCC46'
+        )
+        builder.append_includes(
+            "-L$HLS_PATH/lnx64/tools/fpo_v7_1 -lgmp -lmpfr -lIp_floating_point_v7_1_bitacc_cmodel"
+        )
+        builder.set_executable_path(code_gen_dir + "/node_model")
+        builder.build(code_gen_dir)
+        self.set_nodeattr("executable_path", builder.executable_path)
+
+    def code_generation_cppsim(self, model):
+        """Generates c++ code for simulation (cppsim)."""
+        self.code_gen_dict["$READNPYDATA$"] = [""]
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [""]
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = [""]
+        node = self.onnx_node
+        path = self.get_nodeattr("code_gen_dir_cppsim")
+        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
+        self.generate_params(model, path)
+        self.global_includes()
+        self.defines("cppsim")
+        self.pragmas()
+        oshape = self.get_folded_output_shape()
+        oshape_str = str(oshape).replace("(", "{").replace(")", "}")
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            f'''
+            static hls::stream<hls::vector<T, SIMD>> in0_V;
+            static hls::stream<hls::vector<T, SIMD>> out_V;
+
+            npy2vectorstream("{path}/input_0.npy", in0_V);
+            int stream_size = in0_V.size();
+
+            while(out_V.size() != stream_size){{
+                smaxquant<W, SIMD, T>(in0_V, out_V);
+            }}
+
+            vectorstream2npy(out_V, {oshape_str}, "{path}/output.npy");
+            '''
+        ]
+        self.save_as_npy()
+
+        template = templates.docompute_template
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + f"/execute_{node.op_type}.cpp"
+        with open(code_gen_dir, "w") as f:
+            for key in self.code_gen_dict:
+                # transform list into long string separated by '\n'
+                code_gen_line = "\n".join(self.code_gen_dict[key])
+                template = template.replace(key, code_gen_line)
+            f.write(template)
+
+    def prepare_rtlsim(self):
+        # this node currently does not support rtlsim
+        raise NotImplementedError("QuantSoftmax_hls does not support rtlsim")
\ No newline at end of file
diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py
index d8210fd684..5436aa31af 100644
--- a/src/finn/custom_op/fpgadataflow/hlsbackend.py
+++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py
@@ -307,16 +307,21 @@ def npy_to_dynamic_outputs(self, context, npy_list):
 
     def exec_precompiled_singlenode_model(self):
         """Executes precompiled executable."""
-        executable_path = self.get_nodeattr("executable_path")
-        if executable_path == "":
+        executable = self.get_nodeattr("executable_path")
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        if executable == "":
             raise Exception(
                 """
Found no executable for this node, did you run the codegen and
compilation transformations?
            """
             )
-        process_execute = subprocess.Popen(executable_path, stdout=subprocess.PIPE)
-        process_execute.communicate()
+        with open(code_gen_dir + "/sim.log", "w") as f:
+            try:
+                subprocess.check_output(executable, stderr=f)
+            except subprocess.CalledProcessError:
+                raise Exception(f"Error running the generated code. Check {f.name} for more details.")
+
 
     def hls_sname(self):
         """Get the naming convention used by Vitis HLS for stream signals
diff --git a/src/finn/custom_op/fpgadataflow/quantsoftmax.py b/src/finn/custom_op/fpgadataflow/quantsoftmax.py
new file mode 100644
index 0000000000..ac9c17fb63
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/quantsoftmax.py
@@ -0,0 +1,120 @@
+
+import warnings
+
+import numpy as np
+from onnx.helper import make_node
+from qonnx.core.datatype import DataType
+from scipy.special import softmax
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+class QuantSoftmax(HWCustomOp):
+    """Abstraction layer for HW implementation of QuantSoftmax layers."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "ifm_dim": ("ints", True, []),
+            "simd": ("i", False, 1),
+            "channels": ("i", True, 0),
+            # FINN DataTypes for inputs, weights, outputs
+            "data_type": ("s", True, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self, ind=0):
+        h, w = self.get_nodeattr("ifm_dim")
+        c = self.get_nodeattr("channels")
+        return (1, h, w, c)
+
+    def get_normal_output_shape(self, ind=0):
+        return self.get_normal_input_shape()
+
+    def get_number_output_values(self):
+        raise NotImplementedError("This function is not yet implemented.")
+
+    def quantise_to_int(self, arr, dtype):
+        max_val = np.iinfo(dtype).max
+        output = np.zeros_like(arr, dtype=dtype)
+        frac_part = arr - np.floor(arr)
+        scaled_frac = frac_part * max_val
+        output = scaled_frac.astype(dtype)
+        output[arr >= 1.0] = max_val
+        return output
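+
+    # Worked example (illustrative): with dtype=np.int8, max_val is 127, so an
+    # input of 0.5 maps to int(0.5 * 127) = 63, and any input >= 1.0 saturates
+    # to 127.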
+
+    def execute_node(self, context, graph):
+        node = self.onnx_node
+        input_data = context[node.input[0]]
+        output_data = softmax(input_data, axis=-1)
+        qsm_out = self.quantise_to_int(output_data, np.int8)
+        context[node.output[0]] = qsm_out
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        data_type = DataType[self.get_nodeattr("data_type")]
+        # the hlslib op always pads with zeros, so ensure that the DataType
+        # is able to represent zeros
+        assert data_type.allowed(0), "DataType must support zero"
+        return data_type
+
+    def make_shape_compatible_op(self, model):
+        shape = self.get_normal_input_shape()
+        # create an ONNX Softmax node with the same shape as this one
+        return make_node(
+            "Softmax",
+            inputs=[self.onnx_node.input[0]],
+            outputs=[self.onnx_node.output[0]],
+            shape=list(shape),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "data_type changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("data_type", idt.name)
+        model.set_tensor_datatype(node.output[0], idt)
+
+    def verify_node(self):
+        raise NotImplementedError
+
+    def get_instream_width(self, ind=0):
+        ibits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("simd")
+        return ibits * simd
+
+    def get_outstream_width(self, ind=0):
+        obits = self.get_output_datatype().bitwidth()
+        simd = self.get_nodeattr("simd")
+        return obits * simd
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output. (Same as input datatype)"""
+        return self.get_input_datatype()
+
+    def get_folded_output_shape(self, ind=0):
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("channels")
+        simd = self.get_nodeattr("simd")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
+
+    def get_folded_input_shape(self, ind=0):
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("channels")
+        simd = self.get_nodeattr("simd")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
\ No newline at end of file
diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py
index c31f90af0b..d1e9387b1b 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py
@@ -186,9 +186,23 @@ def prepare_codegen_rtl_values(self, model):
         n_thres_steps = self.get_nodeattr("numSteps")
         wdt = self.get_weight_datatype()
         if expected_thresholds != n_thres_steps:
-            min_val = wdt.min()
-            thresholds = np.insert(thresholds, 0, min_val, axis=1)
-            bias = bias - 1
+            if DataType[output_data_type].signed():
+                min_val = wdt.min()
+                thresholds = np.insert(thresholds, 0, min_val, axis=1)
+                bias = bias - 1
+            # TODO: temporary fix for unsigned narrow quantization
+            else:
+                max_val = wdt.max()
+                if max_val > DataType[input_data_type].max():
+                    thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1)
+                else:
+                    max_val = max_val + 1
+                    # increase wdt
+                    if not wdt.signed():
+                        wdt = DataType.get_smallest_possible(max_val)
+                    else:
+                        wdt = DataType.get_smallest_possible(-max_val - 1)
+                    thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1)
+                n_thres_steps += 1
 
         # add dummy dimension as final dimension (that's what gets packed with next call)
@@ -528,8 +542,22 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         n_thres_steps = self.get_nodeattr("numSteps")
         wdt = self.get_weight_datatype()
         if expected_thresholds != n_thres_steps:
-            min_val = wdt.min()
-            thresholds = np.insert(thresholds, 0, min_val, axis=1)
+            if DataType[output_data_type].signed():
+                min_val = wdt.min()
+                thresholds = np.insert(thresholds, 0, min_val, axis=1)
+            # TODO: temporary fix for unsigned narrow quantization
+            else:
+                max_val = wdt.max()
+                if max_val > self.get_input_datatype().max():
+                    thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1)
+                else:
+                    max_val = max_val + 1
+                    # increase wdt
+                    if not wdt.signed():
+                        wdt = DataType.get_smallest_possible(max_val)
+                    else:
+                        wdt = DataType.get_smallest_possible(-max_val - 1)
+                    thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1)
+                n_thres_steps += 1
 
         # If a single threshold value is found, broadcast the value
@@ -541,7 +569,6 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         thresh_padded = np.zeros((thresholds.shape[0], width_padded))
         thresh_padded[: thresholds.shape[0], :n_thres_steps] = thresholds
         thresh_stream = []
-        wdt = self.get_weight_datatype()
         bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 32)
         padding = np.zeros(width_padded, dtype=np.int32)
 
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 3d89a0ab23..8c9e99a578 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -32,6 +32,7 @@
 #define AP_INT_MAX_W $AP_INT_MAX_W$
 #include "cnpy.h"
 #include "npy2apintstream.hpp"
+#include "npy2vectorstream.hpp"
 #include <vector>
 #include "bnn-library.h"
 
@@ -42,18 +43,22 @@
 $DEFINES$
 
 int main(){
-$PRAGMAS$
-$STREAMDECLARATIONS$
+    $PRAGMAS$
 
-$READNPYDATA$
+    try {
+        $STREAMDECLARATIONS$
 
-$DOCOMPUTE$
+        $READNPYDATA$
 
-$DATAOUTSTREAM$
+        $DOCOMPUTE$
 
-$SAVEASCNPY$
+        $DATAOUTSTREAM$
 
+        $SAVEASCNPY$
+    } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << std::endl;
+    }
 }
 """
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index e14181b140..e400e4335f 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1697,3 +1697,55 @@ def apply(self, model):
         model = model.transform(InferShapes())
         model = model.transform(InferDataTypes())
         return (model, graph_modified)
+
+
+class InferQuantSoftmax(Transformation):
+    '''
+    Find Softmax layers that are followed by a MultiThreshold layer and
+    replace the pair with a single QuantSoftmax layer.
+    '''
+
+    def __init__(self):
+        super().__init__()
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            # check for a Softmax node that is directly followed by a MultiThreshold
+            consumer = model.find_consumer(n.output[0])
+            if n.op_type == "Softmax" and consumer is not None and consumer.op_type == "MultiThreshold":
+                # get the shape of the input/output tensor
+                input_shape = model.get_tensor_shape(n.input[0])
+                assert input_shape == model.get_tensor_shape(
+                    consumer.input[0]
+                ), "Softmax and MultiThreshold input shapes do not match"
+                h = int(input_shape[1])
+                w = int(input_shape[2])
+                c = int(input_shape[3])
+                idt0 = model.get_tensor_datatype(n.input[0])
+                # create node with no parallelization first
+                simd = 1
+                # create and insert new node
+                new_node = helper.make_node(
+                    "QuantSoftmax",
+                    [n.input[0]],  # input tensor(s)
+                    [consumer.output[0]],  # output tensor(s)
+                    domain="finn.custom_op.fpgadataflow",
+                    backend="fpgadataflow",
+                    ifm_dim=[h, w],
+                    channels=c,
+                    data_type=idt0.name,
+                    name="Quant" + n.name,
+                    simd=simd,
+                )
+                graph.node.insert(node_ind, new_node)
+                graph.node.remove(n)
+                # remove the MultiThreshold too
+                graph.node.remove(consumer)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 91c191962f..c2e2cbcd8a 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -192,8 +192,12 @@ def build(self, code_gen_dir):
             f.write("#!/bin/bash \n")
             f.write(bash_compile + "\n")
         bash_command = ["bash", self.compile_script]
-        process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
-        process_compile.communicate()
+
+        with open(str(self.code_gen_dir) + "/compile.log", "w") as f:
+            try:
+                subprocess.check_output(bash_command, stderr=f)
+            except subprocess.CalledProcessError:
+                raise Exception(f"Error in compiling the generated code. Check {f.name} for more details.")
 
 
 def launch_process_helper(args, proc_env=None, cwd=None):
diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py
new file mode 100644
index 0000000000..c813bc3ff9
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py
@@ -0,0 +1,242 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
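+
+"""Tests for the QuantSoftmax HW custom op: inference of the layer from a
+Softmax + MultiThreshold pair, cppsim execution, and stitched-IP generation."""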
+
+import pytest
+
+import numpy as np
+import torch
+import torch.nn as nn
+from onnx import TensorProto, helper
+
+import brevitas.nn as qnn
+from brevitas.export import export_qonnx
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
+from qonnx.util.cleanup import cleanup as qonnx_cleanup
+
+import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw
+import finn.transformation.streamline.absorb as absorb
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
+
+test_fpga_part = "xczu3eg-sbva484-1-e"
+target_clk_ns = 5
+export_onnx_path = "pytest_quantsoftmax_dut.onnx"
+
+
+class QuantSoftMaxSimple(nn.Module):
+    def __init__(self, bit_width=8, signed=True):
+        super(QuantSoftMaxSimple, self).__init__()
+        self.output_identity = qnn.QuantIdentity(
+            bit_width=bit_width, scaling_per_tensor=True, bias=False, signed=signed
+        )
+        self.softmax = nn.Softmax(dim=3)  # softmax along the last dimension
+
+    def get_quant_scale(self):
+        return self.output_identity.quant_act_scale()
+
+    def forward(self, x):
+        x = self.softmax(x)
+        x = self.output_identity(x)
+        return x
+
+
+def create_model(io_shape=(1, 12, 128, 128), idt=DataType["INT8"]):
+    '''
+    Create a quantized softmax model.
+    Input and output are quantized to Int8ActPerTensorFloat, this is to make sure
+    that the softmax layer is followed by a Quant node.
+    '''
+    dut = QuantSoftMaxSimple(idt.bitwidth(), idt.signed())
+    input = torch.rand(io_shape)
+    export_qonnx(dut, input, export_onnx_path, opset_version=11)
+    qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)
+    # set the model input to UINT8
+    model = ModelWrapper(export_onnx_path)
+    model.set_tensor_datatype(model.graph.input[0].name, idt)
+    return model, dut.get_quant_scale()
+
+
+def make_single_quantsoftmax_modelwrapper(
+    impl_style="hls", simd=1, idt=DataType["UINT8"], ifm_dim=(1, 128, 128, 12), channels=12
+):
+    '''
+    Create a single QuantSoftmax node with variable parameters.
+    This is the graph before the SpecializeLayers() transformation.
+    '''
+    h = ifm_dim[1]
+    w = ifm_dim[2]
+
+    inp = helper.make_tensor_value_info("global_in", TensorProto.FLOAT, [1, h, w, channels])
+    outp = helper.make_tensor_value_info("global_out", TensorProto.FLOAT, [1, h, w, channels])
+    new_node = helper.make_node(
+        "QuantSoftmax",
+        ["global_in"],
+        ["global_out"],
+        domain="finn.custom_op.fpgadataflow",
+        backend="fpgadataflow",
+        ifm_dim=[h, w],
+        channels=channels,
+        data_type=idt.name,
+        simd=simd,
+        preferred_impl_style=impl_style,
+    )
+    graph = helper.make_graph([new_node], "softmax_graph", inputs=[inp], outputs=[outp])
+    model = qonnx_make_model(graph)
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("global_in", idt)
+    model.set_tensor_datatype("global_out", idt)
+
+    return model
+
+
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim", "stitched_ip"])
+@pytest.mark.parametrize("simd", ["simd1", "simd2", "simd3", "simd4"])
+@pytest.mark.fpgadataflow
+def test_convert_to_hw_softmax_layer(exec_mode, simd):
+    '''
+    This test checks that the softmax layer can be converted to a HW layer.
+    '''
+    if (exec_mode == "stitched_ip" or exec_mode == "rtlsim") and simd != "simd1":
+        pytest.skip("Skipping this test to avoid long test times")
+    # Create the qonnx model
+    io_shape = (1, 12, 128, 128)
+    input = gen_finn_dt_tensor(DataType["UINT8"], io_shape)
+    input_t = {"global_in": input}
+
+    model, _ = create_model(io_shape)
+
+    simd = int(simd[-1])
+    folding_config = {
+        "Defaults": {},
+        "QuantSoftmax_0": {
+            "simd": simd,
+            "preferred_impl_style": "hls",
+        },
+    }
+    try:
+        model = model.transform(ConvertQONNXtoFINN())
+        model = model.transform(InferShapes())
+        model = model.transform(InferDataTypes())
+        model = model.transform(to_hw.InferQuantSoftmax())
+        model = model.transform(GiveUniqueNodeNames())
+        # isolate fpga dataflow layers
+        parent_model = model.transform(CreateDataflowPartition())
+        sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+        sdp_node_path = getCustomOp(sdp_node).get_nodeattr("model")
+        model = ModelWrapper(sdp_node_path)
+        model = model.transform(ApplyConfig(folding_config))
+        model = model.transform(SpecializeLayers(test_fpga_part))
+        model = model.transform(GiveUniqueNodeNames())
+        if exec_mode == "cppsim":
+            model = model.transform(SetExecMode("cppsim"))
+            model = model.transform(PrepareCppSim())
+            model = model.transform(CompileCppSim())
+        elif exec_mode == "rtlsim":
+            model = model.transform(SetExecMode("rtlsim"))
+            model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+            model = model.transform(HLSSynthIP())
+            try:
+                model = model.transform(PrepareRTLSim())
+                pytest.fail("PrepareRTLSim should have failed")
+            except Exception:
+                # expected to fail because this node does not support rtlsim
+                pass
+        elif exec_mode == "stitched_ip":
+            model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+            model = model.transform(HLSSynthIP())
+            model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+    except Exception as e:
+        pytest.fail(f"Failed to transform the model: {str(e)}")
+
+
+@pytest.mark.parametrize("impl_style", ["hls"])
+@pytest.mark.parametrize("simd", ["simd1", "simd2", "simd3", "simd4"])
+@pytest.mark.parametrize("idt", ["INT8"])
+@pytest.mark.parametrize("ifm_dim", [(1, 12, 12, 12), (1, 128, 128, 384)])
+@pytest.mark.fpgadataflow
+def test_fpga_dataflow_quantsoftmax(impl_style, simd, idt, ifm_dim):
+    idt = DataType[idt]
+    simd = int(simd[-1])
+    io_shape = (ifm_dim[0], ifm_dim[1], ifm_dim[2], ifm_dim[3])
+    tolerance = 2
+
+    if ifm_dim[3] % simd != 0:
+        pytest.skip("Skipping this test because the number of channels is not a multiple of simd")
+
+    model = make_single_quantsoftmax_modelwrapper(
+        impl_style=impl_style, simd=simd, idt=idt, ifm_dim=ifm_dim, channels=ifm_dim[3]
+    )
+
+    input = gen_finn_dt_tensor(idt, io_shape)
+    input_t = {"global_in": input}
+
+    # Create reference values using the qonnx model
+    ref_model, scale = create_model(io_shape, idt)
+    y_ref = oxe.execute_onnx(ref_model, input_t)["global_out"]
+    y_ref = y_ref / scale
+    y_ref = y_ref.numpy()
+
+    y_out = oxe.execute_onnx(model, input_t)["global_out"]
+    assert np.allclose(y_ref, y_out, atol=tolerance), "Model output does not match expected output"
+
+    try:
+        model = model.transform(SpecializeLayers(test_fpga_part))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(SetExecMode("cppsim"))
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+    except Exception as e:
+        pytest.fail(f"Failed to transform the model: {str(e)}")
+
+    # run the model
+    y_hw = oxe.execute_onnx(model, input_t)["global_out"]
+
+    # Debug prints to help identify the failing values
+    for i in range(len(y_ref)):
+        for j in range(len(y_ref[i])):
+            for k in range(len(y_ref[i][j])):
+                for l in range(len(y_ref[i][j][k])):
+                    if not np.allclose(y_ref[i][j][k][l], y_hw[i][j][k][l], atol=tolerance):
+                        print(f"| {i},{j},{k},{l:<2} | {y_ref[i][j][k][l]:<4.0f} | {y_hw[i][j][k][l]:<4.0f} | {y_ref[i][j][k][l] - y_hw[i][j][k][l]:<4.0f} |")
+
+    assert np.allclose(y_ref, y_hw, atol=tolerance), "Model output does not match expected output"
\ No newline at end of file
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index e4dd49fc7f..fe7ba3d9fb 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -129,14 +129,14 @@ def make_single_multithresholding_modelwrapper(
         [1, 2, 2],
     ],
 )
-@pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]])
+@pytest.mark.parametrize("activation", [DataType["UINT4"], DataType["INT4"], DataType["BIPOLAR"]])
 @pytest.mark.parametrize(
     "idt_tdt_cfg",
     [
         (DataType["INT8"], DataType["INT8"]),
         (DataType["INT8"], DataType["INT9"]),
-        (DataType["UINT8"], DataType["UINT8"]),
-        (DataType["UINT8"], DataType["UINT9"]),
+        (DataType["UINT5"], DataType["UINT5"]),
+        (DataType["UINT5"], DataType["UINT6"]),
     ],
 )
 @pytest.mark.parametrize("fold", [-1, 1, 2])
@@ -184,7 +184,7 @@ def test_fpgadataflow_thresholding(
         activation_bias = 0
     else:
         activation_bias = activation.min()
-        if narrow:
+        if narrow and activation.signed():
             activation_bias += 1
 
     # Generate random thresholds and sort in ascending order
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py
index 1ad695bb94..e6175ac58b 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py
@@ -122,13 +122,16 @@ def make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp
 
 
 @pytest.mark.parametrize("impl_style", ["rtl", "hls"])
+@pytest.mark.parametrize(
+    "idt_act_cfg", [(DataType["INT16"], DataType["INT4"]), (DataType["UINT8"], DataType["UINT4"])]
+)
 # configuration (ch, pe)
-@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3)])
+@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 6)])
@pytest.mark.parametrize("narrow", [True, False]) @pytest.mark.parametrize("per_tensor", [True, False]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_read(impl_style, cfg, narrow, per_tensor): +def test_runtime_thresholds_read(impl_style, idt_act_cfg, cfg, narrow, per_tensor): """Read back threshold weights during runtime 1. Create random initial weights T @@ -140,8 +143,8 @@ def test_runtime_thresholds_read(impl_style, cfg, narrow, per_tensor): pe = cfg[1] n_inp_vecs = [1, 2, 2] hls_mem_mode = "internal_decoupled" - act = DataType["INT4"] - idt = DataType["INT16"] + act = idt_act_cfg[1] + idt = idt_act_cfg[0] odt = act n_steps = act.get_num_possible_values() - 1 # Generate random thresholds and sort in ascending order @@ -151,7 +154,7 @@ def test_runtime_thresholds_read(impl_style, cfg, narrow, per_tensor): T = sort_thresholds_increasing(T) actval = act.min() - if narrow: + if narrow and act.signed(): actval += 1 model = make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs, ch) @@ -219,13 +222,16 @@ def read_weights(sim): @pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +@pytest.mark.parametrize( + "idt_act_cfg", [(DataType["INT16"], DataType["INT4"]), (DataType["UINT8"], DataType["UINT4"])] +) # configuration (ch, pe) -@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3)]) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 6)]) @pytest.mark.parametrize("narrow", [True, False]) @pytest.mark.parametrize("per_tensor", [True, False]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_write(impl_style, cfg, narrow, per_tensor): +def test_runtime_thresholds_write(impl_style, idt_act_cfg, cfg, narrow, per_tensor): """Write threshold weights during runtime 1. Create random initial weights T_init @@ -241,8 +247,8 @@ def test_runtime_thresholds_write(impl_style, cfg, narrow, per_tensor): n_inp_vecs = [1, 2, 2] hls_mem_mode = "internal_decoupled" - act = DataType["INT4"] - idt = DataType["INT16"] + act = idt_act_cfg[1] + idt = idt_act_cfg[0] odt = act n_steps = act.get_num_possible_values() - 1 @@ -253,7 +259,7 @@ def test_runtime_thresholds_write(impl_style, cfg, narrow, per_tensor): T_init = sort_thresholds_increasing(T_init) actval = act.min() - if narrow: + if narrow and act.signed(): actval += 1 model = make_single_thresholding_modelwrapper(